1 /*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (c) 2003 Peter Wemm.
5 * Copyright (c) 1992 Terrence R. Lambert.
6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * William Jolitz.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by the University of
23 * California, Berkeley and its contributors.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 */
40
41 #include "opt_atpic.h"
42 #include "opt_cpu.h"
43 #include "opt_ddb.h"
44 #include "opt_inet.h"
45 #include "opt_isa.h"
46 #include "opt_kstack_pages.h"
47 #include "opt_maxmem.h"
48 #include "opt_pci.h"
49 #include "opt_platform.h"
50 #include "opt_sched.h"
51
52 #include <sys/param.h>
53 #include <sys/proc.h>
54 #include <sys/systm.h>
55 #include <sys/asan.h>
56 #include <sys/bio.h>
57 #include <sys/buf.h>
58 #include <sys/bus.h>
59 #include <sys/callout.h>
60 #include <sys/cons.h>
61 #include <sys/cpu.h>
62 #include <sys/csan.h>
63 #include <sys/efi.h>
64 #include <sys/eventhandler.h>
65 #include <sys/exec.h>
66 #include <sys/imgact.h>
67 #include <sys/kdb.h>
68 #include <sys/kernel.h>
69 #include <sys/ktr.h>
70 #include <sys/linker.h>
71 #include <sys/lock.h>
72 #include <sys/malloc.h>
73 #include <sys/memrange.h>
74 #include <sys/msan.h>
75 #include <sys/msgbuf.h>
76 #include <sys/mutex.h>
77 #include <sys/pcpu.h>
78 #include <sys/ptrace.h>
79 #include <sys/reboot.h>
80 #include <sys/reg.h>
81 #include <sys/rwlock.h>
82 #include <sys/sched.h>
83 #include <sys/signalvar.h>
84 #include <sys/smp.h>
85 #include <sys/syscallsubr.h>
86 #include <sys/sysctl.h>
87 #include <sys/sysent.h>
88 #include <sys/sysproto.h>
89 #include <sys/ucontext.h>
90 #include <sys/vmmeter.h>
91
92 #include <vm/vm.h>
93 #include <vm/vm_param.h>
94 #include <vm/vm_extern.h>
95 #include <vm/vm_kern.h>
96 #include <vm/vm_page.h>
97 #include <vm/vm_map.h>
98 #include <vm/vm_object.h>
99 #include <vm/vm_pager.h>
100 #include <vm/vm_phys.h>
101 #include <vm/vm_dumpset.h>
102
103 #ifdef DDB
104 #ifndef KDB
105 #error KDB must be enabled in order for DDB to work!
106 #endif
107 #include <ddb/ddb.h>
108 #include <ddb/db_sym.h>
109 #endif
110
111 #include <net/netisr.h>
112
113 #include <dev/smbios/smbios.h>
114
115 #include <machine/clock.h>
116 #include <machine/cpu.h>
117 #include <machine/cputypes.h>
118 #include <machine/frame.h>
119 #include <machine/intr_machdep.h>
120 #include <x86/mca.h>
121 #include <machine/md_var.h>
122 #include <machine/metadata.h>
123 #include <machine/pc/bios.h>
124 #include <machine/pcb.h>
125 #include <machine/proc.h>
126 #include <machine/sigframe.h>
127 #include <machine/specialreg.h>
128 #include <machine/trap.h>
129 #include <machine/tss.h>
130 #include <x86/ucode.h>
131 #include <x86/ifunc.h>
132 #include <machine/smp.h>
133 #ifdef FDT
134 #include <x86/fdt.h>
135 #endif
136
137 #ifdef DEV_ATPIC
138 #include <x86/isa/icu.h>
139 #else
140 #include <x86/apicvar.h>
141 #endif
142
143 #include <isa/isareg.h>
144 #include <isa/rtc.h>
145 #include <x86/init.h>
146
147 #ifndef SMP
148 #error amd64 requires options SMP
149 #endif
150
151 /* Sanity check for __curthread() */
152 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
153
154 /*
155 * The PTI trampoline stack needs enough space for a hardware trapframe and a
156 * couple of scratch registers, as well as the trapframe left behind after an
157 * iret fault.
158 */
159 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
160 offsetof(struct pti_frame, pti_rip));
161
162 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
163
164 static void cpu_startup(void *);
165 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
166
167 /* Probe 8254 PIT and TSC. */
168 static void native_clock_source_init(void);
169
170 /* Preload data parse function */
171 static void native_parse_preload_data(u_int64_t);
172
173 /* Native function to fetch and parse the e820 map */
174 static void native_parse_memmap(vm_paddr_t *, int *);
175
/*
 * Default init_ops implementation, used when booting on bare metal (a
 * hypervisor port may substitute its own table).  Hooks:
 *   parse_preload_data      - walk loader-provided module metadata
 *   early_clock_source_init - probe the 8254 PIT and TSC
 *   early_delay             - busy-wait delay before timecounters exist
 *   parse_memmap            - fetch/parse the BIOS e820 or EFI memory map
 */
struct init_ops init_ops = {
	.parse_preload_data =	native_parse_preload_data,
	.early_clock_source_init =	native_clock_source_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
};
183
184 /*
185 * Physical address of the EFI System Table. Stashed from the metadata hints
186 * passed into the kernel and used by the EFI code to call runtime services.
187 */
188 vm_paddr_t efi_systbl_phys;
189
190 /*
191 * Bitmap of extra EFI memory region types that should be preserved and mapped
192 * during runtime services calls.
193 */
194 uint32_t efi_map_regs;
195
/*
 * Intel ICH registers: the SMI Control and Enable register lives at a
 * fixed offset from the power-management I/O base.  Parenthesize the
 * expansion so the macro stays correct inside larger expressions
 * (unparenthesized, e.g. "2 * ICH_SMI_EN" would bind as
 * "2 * ICH_PMBASE + 0x30").
 */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)
199
200 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
201
202 int cold = 1;
203
204 long Maxmem = 0;
205 long realmem = 0;
206 int late_console = 1;
207 int lass_enabled = 0;
208
209 struct kva_md_info kmi;
210
211 struct region_descriptor r_idt;
212
213 struct pcpu *__pcpu;
214 struct pcpu temp_bsp_pcpu;
215
216 struct mtx icu_lock;
217
218 struct mem_range_softc mem_range_softc;
219
220 struct mtx dt_lock; /* lock for GDT and LDT */
221
222 void (*vmm_suspend_p)(void);
223 void (*vmm_resume_p)(void);
224
225 bool efi_boot;
226
/*
 * First SI_SUB_CPU SYSINIT (see the SYSINIT registration following the
 * forward declaration above): quirk handling, RTC/CPU bring-up banner,
 * physical-memory reporting, kernel submap and buffer initialization.
 * 'dummy' is the unused SYSINIT argument.
 */
static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 * The SMBIOS figure is in KB, hence the << 10.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	/*
	 * Fall back to Maxmem if SMBIOS reported less memory than is
	 * currently free (i.e. the value is clearly bogus or absent).
	 */
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);	/* record in pages */

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	/* Carve the kernel submaps (buffer cache, exec, pipe KVA). */
	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef	DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}
318
/*
 * Re-resolve ifunc relocations late in SI_SUB_CPU (SI_ORDER_ANY, i.e.
 * after cpu_startup which runs at SI_ORDER_FIRST), once CPU feature
 * state used by the resolvers has settled.
 */
static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
325
326 void
cpu_setregs(void)327 cpu_setregs(void)
328 {
329 register_t cr0;
330
331 TSENTER();
332 cr0 = rcr0();
333 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
334 TSENTER2("load_cr0");
335 load_cr0(cr0);
336 TSEXIT2("load_cr0");
337 TSEXIT();
338 }
339
340 /*
341 * Initialize amd64 and configure to run kernel
342 */
343
344 /*
345 * Initialize segments & interrupt table
346 */
347 static struct gate_descriptor idt0[NIDT];
348 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
349
350 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
351 static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
352 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
353 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
354 CTASSERT(sizeof(struct nmi_pcpu) == 16);
355
356 /*
357 * Software prototypes -- in more palatable form.
358 *
359 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
360 * slots as corresponding segments for i386 kernel.
361 */
/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 *
 * System descriptors (the TSS and LDT slots) occupy two GDT entries in
 * long mode, hence the GPROC0_SEL + 1 / GUSERLDT_SEL + 1 placeholders.
 */
struct soft_segment_descriptor gdt_segs[] = {
[GNULL_SEL] = {	/* 0 Null Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GNULL2_SEL] = {	/* 1 Null Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUFS32_SEL] = {	/* 2 32 bit %gs Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUGS32_SEL] = {	/* 3 32 bit %fs Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GCODE_SEL] = {	/* 4 Code Descriptor for kernel */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GDATA_SEL] = {	/* 5 Data Descriptor for kernel */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GUCODE32_SEL] = {	/* 6 32 bit Code Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUDATA_SEL] = {	/* 7 32/64 bit Data Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUCODE_SEL] = {	/* 8 64 bit Code Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GPROC0_SEL] = {	/* 9 Proc 0 TSS Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GPROC0_SEL + 1] = { /* 10 Proc 0 TSS descriptor, double size */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUSERLDT_SEL] = {	/* 11 LDT Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUSERLDT_SEL + 1] = { /* 12 LDT Descriptor, double size */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
482
483 void
setidt(int idx,inthand_t * func,int typ,int dpl,int ist)484 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
485 {
486 struct gate_descriptor *ip;
487
488 ip = idt + idx;
489 ip->gd_looffset = (uintptr_t)func;
490 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
491 ip->gd_ist = ist;
492 ip->gd_xx = 0;
493 ip->gd_type = typ;
494 ip->gd_dpl = dpl;
495 ip->gd_p = 1;
496 ip->gd_hioffset = ((uintptr_t)func)>>16 ;
497 }
498
499 extern inthand_t
500 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
501 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
502 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
503 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
504 IDTVEC(xmm), IDTVEC(dblfault),
505 IDTVEC(div_pti), IDTVEC(bpt_pti),
506 IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
507 IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
508 IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
509 IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
510 IDTVEC(xmm_pti),
511 #ifdef KDTRACE_HOOKS
512 IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
513 #endif
514 #ifdef XENHVM
515 IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
516 #endif
517 IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
518 IDTVEC(fast_syscall_pti);
519
520 #ifdef DDB
521 /*
522 * Display the index and function name of any IDT entries that don't use
523 * the default 'rsvd' entry point.
524 */
DB_SHOW_COMMAND_FLAGS(idt,db_show_idt,DB_CMD_MEMSAFE)525 DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
526 {
527 struct gate_descriptor *ip;
528 int idx;
529 uintptr_t func;
530
531 ip = idt;
532 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
533 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
534 if (func != (uintptr_t)&IDTVEC(rsvd)) {
535 db_printf("%3d\t", idx);
536 db_printsym(func, DB_STGY_PROC);
537 db_printf("\n");
538 }
539 ip++;
540 }
541 }
542
/* Show privileged registers. */
DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
{
	/* SIDT/SGDT store a 16-bit limit followed by a 64-bit base. */
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	/* XCR0 is only accessible when CR4.OSXSAVE is set. */
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	/* The feature-control MSR exists only with VMX or SMX support. */
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}
576
/* Display the contents of the hardware debug registers dr0-dr3, dr6, dr7. */
DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
587 #endif
588
589 void
sdtossd(struct user_segment_descriptor * sd,struct soft_segment_descriptor * ssd)590 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
591 {
592
593 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
594 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
595 ssd->ssd_type = sd->sd_type;
596 ssd->ssd_dpl = sd->sd_dpl;
597 ssd->ssd_p = sd->sd_p;
598 ssd->ssd_long = sd->sd_long;
599 ssd->ssd_def32 = sd->sd_def32;
600 ssd->ssd_gran = sd->sd_gran;
601 }
602
603 void
ssdtosd(struct soft_segment_descriptor * ssd,struct user_segment_descriptor * sd)604 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
605 {
606
607 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
608 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
609 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
610 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
611 sd->sd_type = ssd->ssd_type;
612 sd->sd_dpl = ssd->ssd_dpl;
613 sd->sd_p = ssd->ssd_p;
614 sd->sd_long = ssd->ssd_long;
615 sd->sd_def32 = ssd->ssd_def32;
616 sd->sd_gran = ssd->ssd_gran;
617 }
618
619 void
ssdtosyssd(struct soft_segment_descriptor * ssd,struct system_segment_descriptor * sd)620 ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
621 {
622
623 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
624 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
625 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
626 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
627 sd->sd_type = ssd->ssd_type;
628 sd->sd_dpl = ssd->ssd_dpl;
629 sd->sd_p = ssd->ssd_p;
630 sd->sd_gran = ssd->ssd_gran;
631 }
632
633 u_int basemem;
634
635 static int
add_physmap_entry(uint64_t base,uint64_t length,vm_paddr_t * physmap,int * physmap_idxp)636 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
637 int *physmap_idxp)
638 {
639 int i, insert_idx, physmap_idx;
640
641 physmap_idx = *physmap_idxp;
642
643 if (length == 0)
644 return (1);
645
646 /*
647 * Find insertion point while checking for overlap. Start off by
648 * assuming the new entry will be added to the end.
649 *
650 * NB: physmap_idx points to the next free slot.
651 */
652 insert_idx = physmap_idx;
653 for (i = 0; i < physmap_idx; i += 2) {
654 if (base < physmap[i + 1]) {
655 if (base + length <= physmap[i]) {
656 insert_idx = i;
657 break;
658 }
659 if (boothowto & RB_VERBOSE)
660 printf(
661 "Overlapping memory regions, ignoring second region\n");
662 return (1);
663 }
664 }
665
666 /* See if we can prepend to the next entry. */
667 if (insert_idx < physmap_idx && base + length == physmap[insert_idx]) {
668 physmap[insert_idx] = base;
669 return (1);
670 }
671
672 /* See if we can append to the previous entry. */
673 if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
674 physmap[insert_idx - 1] += length;
675 return (1);
676 }
677
678 if (physmap_idx == PHYS_AVAIL_ENTRIES) {
679 printf(
680 "Too many segments in the physical address map, giving up\n");
681 return (0);
682 }
683
684 /*
685 * Move the last 'N' entries down to make room for the new
686 * entry if needed.
687 */
688 for (i = physmap_idx; i > insert_idx; i -= 2) {
689 physmap[i] = physmap[i - 2];
690 physmap[i + 1] = physmap[i - 1];
691 }
692
693 physmap_idx += 2;
694 *physmap_idxp = physmap_idx;
695
696 /* Insert the new entry. */
697 physmap[insert_idx] = base;
698 physmap[insert_idx + 1] = base + length;
699 return (1);
700 }
701
702 void
bios_add_smap_entries(struct bios_smap * smapbase,u_int32_t smapsize,vm_paddr_t * physmap,int * physmap_idx)703 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
704 vm_paddr_t *physmap, int *physmap_idx)
705 {
706 struct bios_smap *smap, *smapend;
707
708 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
709
710 for (smap = smapbase; smap < smapend; smap++) {
711 if (boothowto & RB_VERBOSE)
712 printf("SMAP type=%02x base=%016lx len=%016lx\n",
713 smap->type, smap->base, smap->length);
714
715 if (smap->type != SMAP_TYPE_MEMORY)
716 continue;
717
718 if (!add_physmap_entry(smap->base, smap->length, physmap,
719 physmap_idx))
720 break;
721 }
722 }
723
/*
 * Feed usable regions from a UEFI memory map into the physmap array.
 * 'efihdr' is the loader-provided copy of the GetMemoryMap() output;
 * descriptors follow the (16-byte aligned) header and are
 * 'descriptor_size' bytes apart.  Stops early if physmap fills up.
 */
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	/* Printable names indexed by EFI memory descriptor type. */
	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	/* Guard against a malformed header before dividing. */
	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	/* Extra region types the admin wants mapped for runtime services. */
	TUNABLE_INT_FETCH("machdep.efirt.regs", &efi_map_regs);
	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			/* Decode the attribute bitmask, one flag at a time. */
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
			/* Skip boot-services regions the admin reserved. */
			if (EFI_MAP_BOOTTYPE_ALLOWED(p->md_type))
				continue;
			/* FALLTHROUGH */
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
		    physmap, physmap_idx))
			break;
	}
}
825
826 static void
native_parse_memmap(vm_paddr_t * physmap,int * physmap_idx)827 native_parse_memmap(vm_paddr_t *physmap, int *physmap_idx)
828 {
829 struct bios_smap *smap;
830 struct efi_map_header *efihdr;
831
832 efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp,
833 MODINFO_METADATA | MODINFOMD_EFI_MAP);
834 smap = (struct bios_smap *)preload_search_info(preload_kmdp,
835 MODINFO_METADATA | MODINFOMD_SMAP);
836 if (efihdr == NULL && smap == NULL)
837 panic("No BIOS smap or EFI map info from loader!");
838
839 if (efihdr != NULL) {
840 add_efi_map_entries(efihdr, physmap, physmap_idx);
841 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
842 } else {
843 /*
844 * Memory map from INT 15:E820.
845 *
846 * subr_module.c says:
847 * "Consumer may safely assume that size value precedes data."
848 * ie: an int32_t immediately precedes smap.
849 */
850 u_int32_t size = *((u_int32_t *)smap - 1);
851
852 bios_add_smap_entries(smap, size, physmap, physmap_idx);
853 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
854 }
855 }
856
857 #define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE)
858
859 /*
860 * Populate the (physmap) array with base/bound pairs describing the
861 * available physical memory in the system, then test this memory and
862 * build the phys_avail array describing the actually-available memory.
863 *
864 * Total memory size may be set by the kernel environment variable
865 * hw.physmem or the compile-time define MAXMEM.
866 *
867 * XXX first should be vm_paddr_t.
868 */
static void
getmemsize(u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	TSENTER();
	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(physmap, &physmap_idx);
	/* Step back from "next free slot" to the index of the last pair. */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			/* basemem is kept in KB. */
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	/* MAXMEM is in KB; /4 converts to 4 KB pages (cf. "* 4" below). */
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	/* CMAP1/CADDR1: scratch PTE + VA used to probe each page below. */
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		/* Clamp each chunk to the Maxmem limit computed above. */
		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int *ptr = (int *)CADDR1;
			int tmp;
			bool full, page_bad;

			full = false;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = false;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			/* Save the page's contents so we can restore it. */
			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = true;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = true;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = true;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = true;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == true)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = true;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/* dump_avail also covers pages excluded above. */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	/* Tear down the scratch mapping used by the probe loop. */
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
	TSEXIT();
}
1138
/*
 * Parse the preloaded-module metadata passed in by the native loader.
 * modulep is the metadata's physical address; it becomes addressable
 * by adding KERNBASE.  Initializes boothowto, the static kernel
 * environment, the ddb symbol table (if DDB) and the EFI system table
 * address from the metadata.
 */
static void
native_parse_preload_data(u_int64_t modulep)
{
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	/* Locates the kernel's own module entry (preload_kmdp). */
	preload_initkmdp(true);
	boothowto = MD_FETCH(preload_kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(preload_kmdp, MODINFOMD_ENVP, char *);
	/* The loader stored a physical pointer; relocate it into the KVA. */
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(preload_kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(preload_kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(preload_kmdp, MODINFOMD_FW_HANDLE,
	    vm_paddr_t);
}
1164
/* Native (non-virtualized) early clock source: program the i8254 timer. */
static void
native_clock_source_init(void)
{
	i8254_init();
}
1170
/*
 * Initialize the kernel debugger framework and, when the boot flags
 * requested it (RB_KDB), enter the debugger immediately.
 */
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}
1180
1181 /* Set up the fast syscall stuff */
1182 void
amd64_conf_fast_syscall(void)1183 amd64_conf_fast_syscall(void)
1184 {
1185 uint64_t msr;
1186
1187 msr = rdmsr(MSR_EFER) | EFER_SCE;
1188 wrmsr(MSR_EFER, msr);
1189 wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1190 (u_int64_t)IDTVEC(fast_syscall));
1191 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
1192 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1193 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1194 wrmsr(MSR_STAR, msr);
1195 wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1196 }
1197
/*
 * First-stage BSP per-CPU initialization: link the pcpu area to
 * curthread and to the per-CPU GDT entries (TSS, LDT, 32-bit %fs/%gs
 * descriptors).  Caller has already pointed MSR_GSBASE at pc so the
 * PCPU accessors work.
 */
void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	/* The system descriptors live in this CPU's private GDT. */
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}
1214
/*
 * Second-stage BSP per-CPU initialization, once thread0's stack base
 * is known: record the ring-0 stack pointer, the aligned PTI
 * trampoline stack top, and thread0's pcb.
 */
void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, STACKALIGN((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)));
	PCPU_SET(curpcb, thread0.td_pcb);
}
1224
1225 void
amd64_bsp_ist_init(struct pcpu * pc)1226 amd64_bsp_ist_init(struct pcpu *pc)
1227 {
1228 struct nmi_pcpu *np;
1229 struct amd64tss *tssp;
1230
1231 tssp = &pc->pc_common_tss;
1232
1233 /* doublefault stack space, runs on ist1 */
1234 np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1235 np->np_pcpu = (register_t)pc;
1236 tssp->tss_ist1 = (long)np;
1237
1238 /*
1239 * NMI stack, runs on ist2. The pcpu pointer is stored just
1240 * above the start of the ist2 stack.
1241 */
1242 np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1243 np->np_pcpu = (register_t)pc;
1244 tssp->tss_ist2 = (long)np;
1245
1246 /*
1247 * MC# stack, runs on ist3. The pcpu pointer is stored just
1248 * above the start of the ist3 stack.
1249 */
1250 np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1251 np->np_pcpu = (register_t)pc;
1252 tssp->tss_ist3 = (long)np;
1253
1254 /*
1255 * DB# stack, runs on ist4.
1256 */
1257 np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1258 np->np_pcpu = (register_t)pc;
1259 tssp->tss_ist4 = (long)np;
1260 }
1261
1262 /*
1263 * Calculate the kernel load address by inspecting page table created by loader.
1264 * The assumptions:
1265 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
1266 * aligned at 2M, below 4G (the latter is important for AP startup)
1267 * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
1268 * - kernel is mapped with 2M superpages
1269 * - all participating memory, i.e. kernel, modules, metadata,
1270 * page table is accessible by pre-created 1:1 mapping
1271 * (right now loader creates 1:1 mapping for lower 4G, and all
1272 * memory is from there)
1273 * - there is a usable memory block right after the end of the
1274 * mapped kernel and all modules/metadata, pointed to by
1275 * physfree, for early allocations
1276 */
1277 vm_paddr_t __nosanitizeaddress __nosanitizememory
amd64_loadaddr(void)1278 amd64_loadaddr(void)
1279 {
1280 pml4_entry_t *pml4e;
1281 pdp_entry_t *pdpe;
1282 pd_entry_t *pde;
1283 uint64_t cr3;
1284
1285 cr3 = rcr3();
1286 pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
1287 pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
1288 pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
1289 return (*pde & PG_FRAME);
1290 }
1291
/*
 * amd64 early machine-dependent initialization, called from locore
 * with the loader's module metadata pointer and the first free
 * physical address (relative to the kernel).  Sets up CPU
 * identification, descriptor tables, per-CPU data, the console and
 * the physical memory map, and returns the top of thread0's kernel
 * stack for locore to switch onto.
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	int gsel_tss, x;
	struct pcpu *pc;
	uint64_t rsp0;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	/* Physical address the loader placed the kernel at. */
	kernphys = amd64_loadaddr();

	physfree += kernphys;

	/* Initializes preload_kmdp */
	init_ops.parse_preload_data(modulep);

	efi_boot = preload_search_info(preload_kmdp, MODINFO_METADATA |
	    MODINFOMD_EFI_MAP) != NULL;

	if (!efi_boot) {
		/* Tell the bios to warmboot next time */
		atomic_store_short((u_short *)0x472, 0x1234);
	}

	/* BSP microcode update, if any; it consumes early memory. */
	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
	physfree = roundup2(physfree, PAGE_SIZE);

	/* Early CPU identification, needed before tunables and ifuncs. */
	identify_cpu1();
	identify_hypervisor();
	identify_hypervisor_smbios();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) == 0)
		pmap_pcid_enabled = 0;
	invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) != 0;

	/*
	 * Now we can do small core initialization, after the PCID
	 * CPU features and user knobs are evaluated.
	 */
	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
	    &pmap_pcid_invlpg_workaround_uena);
	cpu_init_small_core();

	if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
		use_xsave = 1;
		TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
	}

	sched_instance_select();

	link_elf_ireloc();

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	/* Carve thread0's kernel stack out of early memory. */
	thread0.td_kstack = physfree - kernphys + KERNSTART;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early. Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		/* TSS and LDT system descriptors are built separately. */
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	      section, to set pcpu->ipending (etc...) properly, we
	 *	      must be able to get the icu lock, so it can't be
	 *	      under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions: first point every vector at "rsvd", then fill in. */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Fetch speculative-execution mitigation knobs; each accepts
	 * both its new machdep.mitigations name and the legacy name.
	 */
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_flush_l1d",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
	    &x86_rngds_mitg_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.zenbleed.enable",
	    &zenbleed_enable);
	zenbleed_sanitize_enable();

	finishidentcpu();	/* Final stage of CPU initialization */

	invlpgb_works = (amd_extended_feature_extensions &
	    AMDFEID_INVLPGB) != 0;
	TUNABLE_INT_FETCH("vm.pmap.invlpgb_works", &invlpgb_works);
	if (invlpgb_works)
		invlpgb_maxcnt = cpu_procinfo3 & AMDID_INVLPGB_MAXCNT;

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata. We have to wait for cninit() since console
	 * output is required. If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 = STACKALIGN(rsp0);
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif

	kasan_init();
	kmsan_init();

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}
1626
/*
 * MD per-CPU initialization hook: start the ACPI id out as an
 * "unknown" sentinel; it is assigned elsewhere once enumerated.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}
1633
1634 static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)1635 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1636 {
1637 struct bios_smap *smapbase;
1638 struct bios_smap_xattr smap;
1639 uint32_t *smapattr;
1640 int count, error, i;
1641
1642 /* Retrieve the system memory map from the loader. */
1643 smapbase = (struct bios_smap *)preload_search_info(preload_kmdp,
1644 MODINFO_METADATA | MODINFOMD_SMAP);
1645 if (smapbase == NULL)
1646 return (0);
1647 smapattr = (uint32_t *)preload_search_info(preload_kmdp,
1648 MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1649 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1650 error = 0;
1651 for (i = 0; i < count; i++) {
1652 smap.base = smapbase[i].base;
1653 smap.length = smapbase[i].length;
1654 smap.type = smapbase[i].type;
1655 if (smapattr != NULL)
1656 smap.xattr = smapattr[i];
1657 else
1658 smap.xattr = 0;
1659 error = SYSCTL_OUT(req, &smap, sizeof(smap));
1660 }
1661 return (error);
1662 }
1663 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1664 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1665 smap_sysctl_handler, "S,bios_smap_xattr",
1666 "Raw BIOS SMAP data");
1667
1668 static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)1669 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1670 {
1671 struct efi_map_header *efihdr;
1672 uint32_t efisize;
1673
1674 efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp,
1675 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1676 if (efihdr == NULL)
1677 return (0);
1678 efisize = *((uint32_t *)efihdr - 1);
1679 return (SYSCTL_OUT(req, efihdr, efisize));
1680 }
1681 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1682 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1683 efi_map_sysctl_handler, "S,efi_map_header",
1684 "Raw EFI Memory Map");
1685
1686 static int
efi_arch_sysctl_handler(SYSCTL_HANDLER_ARGS)1687 efi_arch_sysctl_handler(SYSCTL_HANDLER_ARGS)
1688 {
1689 char *arch;
1690
1691 arch = (char *)preload_search_info(preload_kmdp,
1692 MODINFO_METADATA | MODINFOMD_EFI_ARCH);
1693 if (arch == NULL)
1694 return (0);
1695
1696 return (SYSCTL_OUT_STR(req, arch));
1697 }
1698 SYSCTL_PROC(_machdep, OID_AUTO, efi_arch,
1699 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1700 efi_arch_sysctl_handler, "A", "EFI Firmware Architecture");
1701
1702 void
spinlock_enter(void)1703 spinlock_enter(void)
1704 {
1705 struct thread *td;
1706 register_t flags;
1707
1708 td = curthread;
1709 if (td->td_md.md_spinlock_count == 0) {
1710 flags = intr_disable();
1711 td->td_md.md_spinlock_count = 1;
1712 td->td_md.md_saved_flags = flags;
1713 critical_enter();
1714 } else
1715 td->td_md.md_spinlock_count++;
1716 }
1717
1718 void
spinlock_exit(void)1719 spinlock_exit(void)
1720 {
1721 struct thread *td;
1722 register_t flags;
1723
1724 td = curthread;
1725 flags = td->td_md.md_saved_flags;
1726 td->td_md.md_spinlock_count--;
1727 if (td->td_md.md_spinlock_count == 0) {
1728 critical_exit();
1729 intr_restore(flags);
1730 }
1731 }
1732
1733 /*
1734 * Construct a PCB from a trapframe. This is called from kdb_trap() where
1735 * we want to start a backtrace from the function that caused us to enter
1736 * the debugger. We have the context in the trapframe, but base the trace
1737 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1738 * enough for a backtrace.
1739 */
1740 void
makectx(struct trapframe * tf,struct pcb * pcb)1741 makectx(struct trapframe *tf, struct pcb *pcb)
1742 {
1743
1744 pcb->pcb_r12 = tf->tf_r12;
1745 pcb->pcb_r13 = tf->tf_r13;
1746 pcb->pcb_r14 = tf->tf_r14;
1747 pcb->pcb_r15 = tf->tf_r15;
1748 pcb->pcb_rbp = tf->tf_rbp;
1749 pcb->pcb_rbx = tf->tf_rbx;
1750 pcb->pcb_rip = tf->tf_rip;
1751 pcb->pcb_rsp = tf->tf_rsp;
1752 }
1753
1754 /*
1755 * The pcb_flags is only modified by current thread, or by other threads
1756 * when current thread is stopped. However, current thread may change it
1757 * from the interrupt context in cpu_switch(), or in the trap handler.
1758 * When we read-modify-write pcb_flags from C sources, compiler may generate
1759 * code that is not atomic regarding the interrupt handler. If a trap or
1760 * interrupt happens and any flag is modified from the handler, it can be
1761 * clobbered with the cached value later. Therefore, we implement setting
1762 * and clearing flags with single-instruction functions, which do not race
1763 * with possible modification of the flags from the trap or interrupt context,
1764 * because traps and interrupts are executed only on instruction boundary.
1765 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	/*
	 * A single "orl" makes the read-modify-write of pcb_flags
	 * atomic with respect to traps and interrupts on this CPU
	 * (see the comment above).
	 */
	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");

}
1775
1776 /*
1777 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
1778 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
1779 * pcb if user space modified the bases. We must save on the context
1780 * switch or if the return to usermode happens through the doreti.
1781 *
1782 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
1783 * which have a consequence that the base MSRs must be saved each time
1784 * the PCB_FULL_IRET flag is set. We disable interrupts to sync with
1785 * context switches.
1786 */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t r;

	/*
	 * When setting PCB_FULL_IRET on the current pcb for the first
	 * time, snapshot the %fs base and the user %gs base (in
	 * MSR_KGSBASE while in the kernel) into the pcb first.
	 * Interrupts are disabled to sync with context switches, and
	 * the flag is re-checked under disabled interrupts in case it
	 * was set in the window before intr_disable().
	 */
	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			pcb->pcb_fsbase = rdfsbase();
			pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}
1806
1807 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
1808 {
1809
1810 return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
1811 set_pcb_flags_fsgsbase : set_pcb_flags_raw);
1812 }
1813
void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	/*
	 * A single "andl" clears the flags atomically with respect to
	 * traps and interrupts on this CPU (see the comment above
	 * set_pcb_flags_raw()).
	 */
	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}
1822
1823 extern const char wrmsr_early_safe_gp_handler[];
1824 static struct region_descriptor wrmsr_early_safe_orig_efi_idt;
1825
/*
 * Install a minimal IDT whose #GP gate points at
 * wrmsr_early_safe_gp_handler, so an early wrmsr to an unsupported
 * MSR traps safely.  The previously active IDT is saved for
 * wrmsr_early_safe_end() to restore.
 */
void
wrmsr_early_safe_start(void)
{
	struct region_descriptor efi_idt;
	struct gate_descriptor *gpf_descr;

	sidt(&wrmsr_early_safe_orig_efi_idt);
	/* Cover only the architectural exception vectors (0-31). */
	efi_idt.rd_limit = 32 * sizeof(idt0[0]);
	efi_idt.rd_base = (uintptr_t)idt0;
	lidt(&efi_idt);

	/* Hand-assemble the #GP gate, using the current %cs selector. */
	gpf_descr = &idt0[IDT_GP];
	gpf_descr->gd_looffset = (uintptr_t)wrmsr_early_safe_gp_handler;
	gpf_descr->gd_hioffset = (uintptr_t)wrmsr_early_safe_gp_handler >> 16;
	gpf_descr->gd_selector = rcs();
	gpf_descr->gd_type = SDT_SYSTGT;
	gpf_descr->gd_p = 1;
}
1844
1845 void
wrmsr_early_safe_end(void)1846 wrmsr_early_safe_end(void)
1847 {
1848 struct gate_descriptor *gpf_descr;
1849
1850 lidt(&wrmsr_early_safe_orig_efi_idt);
1851
1852 gpf_descr = &idt0[IDT_GP];
1853 memset(gpf_descr, 0, sizeof(*gpf_descr));
1854 }
1855
1856 #ifdef KDB
1857
1858 /*
1859 * Provide inb() and outb() as functions. They are normally only available as
1860 * inline functions, thus cannot be called from the debugger.
1861 */
1862
1863 /* silence compiler warnings */
1864 u_char inb_(u_short);
1865 void outb_(u_short, u_char);
1866
1867 u_char
inb_(u_short port)1868 inb_(u_short port)
1869 {
1870 return inb(port);
1871 }
1872
void
outb_(u_short port, u_char data)
{
	/* Out-of-line, debugger-callable wrapper around the inline outb(). */
	outb(port, data);
}
1878
1879 #endif /* KDB */
1880
1881 #undef memset
1882 #undef memmove
1883 #undef memcpy
1884
1885 void *memset_std(void *buf, int c, size_t len);
1886 void *memset_erms(void *buf, int c, size_t len);
1887 void *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
1888 size_t len);
1889 void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
1890 size_t len);
1891 void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
1892 size_t len);
1893 void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
1894 size_t len);
1895
1896 #ifdef KCSAN
1897 /*
1898 * These fail to build as ifuncs when used with KCSAN.
1899 */
/* Plain-function memset; the ifunc form does not build under KCSAN. */
void *
memset(void *buf, int c, size_t len)
{

	return (memset_std(buf, c, len));
}
1906
/* Plain-function memmove; the ifunc form does not build under KCSAN. */
void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memmove_std(dst, src, len));
}
1913
/* Plain-function memcpy; the ifunc form does not build under KCSAN. */
void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memcpy_std(dst, src, len));
}
1920 #else
1921 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
1922 {
1923
1924 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1925 memset_erms : memset_std);
1926 }
1927
1928 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
1929 size_t))
1930 {
1931
1932 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1933 memmove_erms : memmove_std);
1934 }
1935
1936 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
1937 {
1938
1939 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1940 memcpy_erms : memcpy_std);
1941 }
1942 #endif
1943
1944 void pagezero_std(void *addr);
1945 void pagezero_erms(void *addr);
1946 DEFINE_IFUNC(, void , pagezero, (void *))
1947 {
1948
1949 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1950 pagezero_erms : pagezero_std);
1951 }
1952