1 /*-
2 * Copyright (c) 2015 Nathan Whitehorn
3 * Copyright (c) 2017-2018 Semihalf
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/kernel.h>
31 #include <sys/bus.h>
32 #include <sys/pcpu.h>
33 #include <sys/proc.h>
34 #include <sys/smp.h>
35 #include <vm/vm.h>
36 #include <vm/pmap.h>
37
38 #include <machine/bus.h>
39 #include <machine/cpu.h>
40 #include <machine/hid.h>
41 #include <machine/platformvar.h>
42 #include <machine/pmap.h>
43 #include <machine/rtas.h>
44 #include <machine/smp.h>
45 #include <machine/spr.h>
46 #include <machine/trap.h>
47
48 #include <dev/ofw/openfirm.h>
49 #include <dev/ofw/ofw_bus.h>
50 #include <dev/ofw/ofw_bus_subr.h>
51 #include <machine/ofw_machdep.h>
52 #include <powerpc/aim/mmu_oea64.h>
53
54 #include "platform_if.h"
55 #include "opal.h"
56
57 #ifdef SMP
58 extern void *ap_pcpu;
59 #endif
60
61 void (*powernv_smp_ap_extra_init)(void);
62
63 static int powernv_probe(platform_t);
64 static int powernv_attach(platform_t);
65 void powernv_mem_regions(platform_t, struct mem_region *phys, int *physsz,
66 struct mem_region *avail, int *availsz);
67 static void powernv_numa_mem_regions(platform_t plat, struct numa_mem_region *phys, int *physsz);
68 static u_long powernv_timebase_freq(platform_t, struct cpuref *cpuref);
69 static int powernv_smp_first_cpu(platform_t, struct cpuref *cpuref);
70 static int powernv_smp_next_cpu(platform_t, struct cpuref *cpuref);
71 static int powernv_smp_get_bsp(platform_t, struct cpuref *cpuref);
72 static void powernv_smp_ap_init(platform_t);
73 #ifdef SMP
74 static int powernv_smp_start_cpu(platform_t, struct pcpu *cpu);
75 static void powernv_smp_probe_threads(platform_t);
76 static struct cpu_group *powernv_smp_topo(platform_t plat);
77 #endif
78 static void powernv_reset(platform_t);
79 static void powernv_cpu_idle(sbintime_t sbt);
80 static int powernv_cpuref_init(void);
81 static int powernv_node_numa_domain(platform_t platform, phandle_t node);
82
/*
 * Dispatch table binding the generic platform_if methods to this file's
 * PowerNV handlers; SMP-only entries are compiled in conditionally.
 */
static platform_method_t powernv_methods[] = {
	PLATFORMMETHOD(platform_probe, powernv_probe),
	PLATFORMMETHOD(platform_attach, powernv_attach),
	PLATFORMMETHOD(platform_mem_regions, powernv_mem_regions),
	PLATFORMMETHOD(platform_numa_mem_regions, powernv_numa_mem_regions),
	PLATFORMMETHOD(platform_timebase_freq, powernv_timebase_freq),

	PLATFORMMETHOD(platform_smp_ap_init, powernv_smp_ap_init),
	PLATFORMMETHOD(platform_smp_first_cpu, powernv_smp_first_cpu),
	PLATFORMMETHOD(platform_smp_next_cpu, powernv_smp_next_cpu),
	PLATFORMMETHOD(platform_smp_get_bsp, powernv_smp_get_bsp),
#ifdef SMP
	PLATFORMMETHOD(platform_smp_start_cpu, powernv_smp_start_cpu),
	PLATFORMMETHOD(platform_smp_probe_threads, powernv_smp_probe_threads),
	PLATFORMMETHOD(platform_smp_topo, powernv_smp_topo),
#endif
	PLATFORMMETHOD(platform_node_numa_domain, powernv_node_numa_domain),

	PLATFORMMETHOD(platform_reset, powernv_reset),
	{ 0, 0 }	/* terminator */
};

/* Platform descriptor registered with the framework via PLATFORM_DEF below. */
static platform_def_t powernv_platform = {
	"powernv",
	powernv_methods,
	0
};

/* CPU enumeration table built once by powernv_cpuref_init(). */
static struct cpuref platform_cpuref[MAXCPU];
static int platform_cpuref_cnt;		/* number of valid entries */
static int platform_cpuref_valid;	/* nonzero once the table is built */
/* Associativity reference level identifying a NUMA domain (set in attach). */
static int platform_associativity;

PLATFORM_DEF(powernv_platform);

/* PIR (hardware thread ID) of the boot CPU, captured in powernv_attach(). */
static uint64_t powernv_boot_pir;
119
120 static int
powernv_probe(platform_t plat)121 powernv_probe(platform_t plat)
122 {
123 if (opal_check() == 0)
124 return (BUS_PROBE_SPECIFIC);
125
126 return (ENXIO);
127 }
128
/*
 * Attach: one-time PowerNV bring-up on the boot CPU.  Selects the OPAL
 * endianness, records NUMA associativity configuration, installs an idle
 * hook, programs LPID/LPCR/HFSCR, enumerates CPUs, and parses the PAPR
 * segment-page-size property to configure moea64 large-page support.
 * Always returns 0.
 */
static int
powernv_attach(platform_t plat)
{
	uint32_t nptlp, shift = 0, slb_encoding = 0;
	int32_t lp_size, lp_encoding;
	char buf[255];
	pcell_t refpoints[3];
	pcell_t prop;
	phandle_t cpu;
	phandle_t opal;
	int res, len, idx;
	register_t msr;
	register_t fscr;
	bool has_lp;

	/* Ping OPAL again just to make sure */
	opal_check();

	/* Tell firmware which endianness the kernel expects for OPAL calls. */
#if BYTE_ORDER == LITTLE_ENDIAN
	opal_call(OPAL_REINIT_CPUS, 2 /* Little endian */);
#else
	opal_call(OPAL_REINIT_CPUS, 1 /* Big endian */);
#endif
	opal = OF_finddevice("/ibm,opal");

	/*
	 * The first reference point names the associativity level that
	 * identifies a NUMA domain; default to skiboot's value of 4 when
	 * the property is absent.
	 */
	platform_associativity = 4; /* Skiboot default. */
	if (OF_getencprop(opal, "ibm,associativity-reference-points", refpoints,
	    sizeof(refpoints)) > 0) {
		platform_associativity = refpoints[0];
	}

	/* Install our (no-op) idle hook unless one is already configured. */
	if (cpu_idle_hook == NULL)
		cpu_idle_hook = powernv_cpu_idle;

	/* Remember the boot thread's hardware ID for cpuref renumbering. */
	powernv_boot_pir = mfspr(SPR_PIR);

	/* LPID must not be altered when PSL_DR or PSL_IR is set */
	msr = mfmsr();
	mtmsr(msr & ~(PSL_DR | PSL_IR));

	/* Direct interrupts to SRR instead of HSRR and reset LPCR otherwise */
	mtspr(SPR_LPID, 0);
	isync();

	/*
	 * NOTE(review): "lpcr" is not declared in this function; it appears
	 * to be a global provided via mmu_oea64.h -- confirm.
	 */
	if (cpu_features2 & PPC_FEATURE2_ARCH_3_00)
		lpcr |= LPCR_HVICE;

#if BYTE_ORDER == LITTLE_ENDIAN
	lpcr |= LPCR_ILE;
#endif

	mtspr(SPR_LPCR, lpcr);
	isync();

	/* Enable hypervisor facilities (TAR, EBB, BHRB, VSX, FP, DSCR, ...). */
	fscr = mfspr(SPR_HFSCR);
	fscr |= FSCR_TAR | FSCR_EBB | HFSCR_BHRB | HFSCR_PM |
	    HFSCR_VECVSX | HFSCR_FP | FSCR_MSGP | FSCR_DSCR;
	mtspr(SPR_HFSCR, fscr);

	/* Restore translation (PSL_DR/PSL_IR) now that LPID/LPCR are set. */
	mtmsr(msr);

	powernv_cpuref_init();

	/* Set SLB count from device tree */
	cpu = OF_peer(0);
	cpu = OF_child(cpu);
	while (cpu != 0) {
		res = OF_getprop(cpu, "name", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpus") == 0)
			break;
		cpu = OF_peer(cpu);
	}
	if (cpu == 0)
		goto out;

	/* The first CPU node's properties stand in for all CPUs. */
	cpu = OF_child(cpu);
	while (cpu != 0) {
		res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpu") == 0)
			break;
		cpu = OF_peer(cpu);
	}
	if (cpu == 0)
		goto out;

	res = OF_getencprop(cpu, "ibm,slb-size", &prop, sizeof(prop));
	if (res > 0)
		n_slbs = prop;

	/*
	 * Scan the large page size property for PAPR compatible machines.
	 * See PAPR D.5 Changes to Section 5.1.4, 'CPU Node Properties'
	 * for the encoding of the property.
	 */

	len = OF_getproplen(cpu, "ibm,segment-page-sizes");
	if (len > 0) {
		/*
		 * We have to use a variable length array on the stack
		 * since we have very limited stack space.
		 */
		pcell_t arr[len/sizeof(cell_t)];
		res = OF_getencprop(cpu, "ibm,segment-page-sizes", arr,
		    sizeof(arr));
		len /= 4;	/* Work in cells from here on. */
		idx = 0;
		has_lp = false;
		/*
		 * Each record: base page shift, SLB encoding, subrecord
		 * count, then (page size, PTE encoding) pairs.
		 */
		while (len > 0) {
			shift = arr[idx];
			slb_encoding = arr[idx + 1];
			nptlp = arr[idx + 2];
			idx += 3;
			len -= 3;
			while (len > 0 && nptlp) {
				lp_size = arr[idx];
				lp_encoding = arr[idx+1];
				/* SLB[L]=1 with PTE[LP]=0: standard large page. */
				if (slb_encoding == SLBV_L && lp_encoding == 0)
					has_lp = true;

				if (slb_encoding == SLB_PGSZ_4K_4K &&
				    lp_encoding == LP_4K_16M)
					moea64_has_lp_4k_16m = true;

				idx += 2;
				len -= 2;
				nptlp--;
			}
			/* Stop scanning once both facts are established. */
			if (has_lp && moea64_has_lp_4k_16m)
				break;
		}

		if (!has_lp)
			panic("Standard large pages (SLB[L] = 1, PTE[LP] = 0) "
			    "not supported by this system.");

		/* shift/lp_size describe the last record examined. */
		moea64_large_page_shift = shift;
		moea64_large_page_size = 1ULL << lp_size;
	}

out:
	return (0);
}
271
/*
 * Report physical and available memory ranges, straight from the Open
 * Firmware device tree.  Non-static (see the prototype above).
 */
void
powernv_mem_regions(platform_t plat, struct mem_region *phys, int *physsz,
    struct mem_region *avail, int *availsz)
{

	ofw_mem_regions(phys, physsz, avail, availsz);
}
279
/*
 * Report per-NUMA-domain memory ranges from the Open Firmware device tree.
 */
static void
powernv_numa_mem_regions(platform_t plat, struct numa_mem_region *phys, int *physsz)
{

	ofw_numa_mem_regions(phys, physsz);
}
286
287 static u_long
powernv_timebase_freq(platform_t plat,struct cpuref * cpuref)288 powernv_timebase_freq(platform_t plat, struct cpuref *cpuref)
289 {
290 char buf[8];
291 phandle_t cpu, dev, root;
292 int res;
293 int32_t ticks = -1;
294
295 root = OF_peer(0);
296 dev = OF_child(root);
297 while (dev != 0) {
298 res = OF_getprop(dev, "name", buf, sizeof(buf));
299 if (res > 0 && strcmp(buf, "cpus") == 0)
300 break;
301 dev = OF_peer(dev);
302 }
303
304 for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) {
305 res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
306 if (res > 0 && strcmp(buf, "cpu") == 0)
307 break;
308 }
309 if (cpu == 0)
310 return (512000000);
311
312 OF_getencprop(cpu, "timebase-frequency", &ticks, sizeof(ticks));
313
314 if (ticks <= 0)
315 panic("Unable to determine timebase frequency!");
316
317 return (ticks);
318
319 }
320
321 static int
powernv_cpuref_init(void)322 powernv_cpuref_init(void)
323 {
324 phandle_t cpu, dev;
325 char buf[32];
326 int a, res, tmp_cpuref_cnt;
327 static struct cpuref tmp_cpuref[MAXCPU];
328 cell_t interrupt_servers[32];
329 uint64_t bsp;
330
331 if (platform_cpuref_valid)
332 return (0);
333
334 dev = OF_peer(0);
335 dev = OF_child(dev);
336 while (dev != 0) {
337 res = OF_getprop(dev, "name", buf, sizeof(buf));
338 if (res > 0 && strcmp(buf, "cpus") == 0)
339 break;
340 dev = OF_peer(dev);
341 }
342
343 bsp = 0;
344 tmp_cpuref_cnt = 0;
345 for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) {
346 res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
347 if (res > 0 && strcmp(buf, "cpu") == 0) {
348 if (!ofw_bus_node_status_okay(cpu))
349 continue;
350 res = OF_getproplen(cpu, "ibm,ppc-interrupt-server#s");
351 if (res > 0) {
352 OF_getencprop(cpu, "ibm,ppc-interrupt-server#s",
353 interrupt_servers, res);
354
355 for (a = 0; a < res/sizeof(cell_t); a++) {
356 tmp_cpuref[tmp_cpuref_cnt].cr_hwref = interrupt_servers[a];
357 tmp_cpuref[tmp_cpuref_cnt].cr_cpuid = tmp_cpuref_cnt;
358 tmp_cpuref[tmp_cpuref_cnt].cr_domain =
359 powernv_node_numa_domain(NULL, cpu);
360 if (interrupt_servers[a] == (uint32_t)powernv_boot_pir)
361 bsp = tmp_cpuref_cnt;
362
363 tmp_cpuref_cnt++;
364 }
365 }
366 }
367 }
368
369 /* Map IDs, so BSP has CPUID 0 regardless of hwref */
370 for (a = bsp; a < tmp_cpuref_cnt; a++) {
371 platform_cpuref[platform_cpuref_cnt].cr_hwref = tmp_cpuref[a].cr_hwref;
372 platform_cpuref[platform_cpuref_cnt].cr_cpuid = platform_cpuref_cnt;
373 platform_cpuref[platform_cpuref_cnt].cr_domain = tmp_cpuref[a].cr_domain;
374 platform_cpuref_cnt++;
375 }
376 for (a = 0; a < bsp; a++) {
377 platform_cpuref[platform_cpuref_cnt].cr_hwref = tmp_cpuref[a].cr_hwref;
378 platform_cpuref[platform_cpuref_cnt].cr_cpuid = platform_cpuref_cnt;
379 platform_cpuref[platform_cpuref_cnt].cr_domain = tmp_cpuref[a].cr_domain;
380 platform_cpuref_cnt++;
381 }
382
383 platform_cpuref_valid = 1;
384
385 return (0);
386 }
387
388 static int
powernv_smp_first_cpu(platform_t plat,struct cpuref * cpuref)389 powernv_smp_first_cpu(platform_t plat, struct cpuref *cpuref)
390 {
391 if (platform_cpuref_valid == 0)
392 return (EINVAL);
393
394 cpuref->cr_cpuid = 0;
395 cpuref->cr_hwref = platform_cpuref[0].cr_hwref;
396 cpuref->cr_domain = platform_cpuref[0].cr_domain;
397
398 return (0);
399 }
400
401 static int
powernv_smp_next_cpu(platform_t plat,struct cpuref * cpuref)402 powernv_smp_next_cpu(platform_t plat, struct cpuref *cpuref)
403 {
404 int id;
405
406 if (platform_cpuref_valid == 0)
407 return (EINVAL);
408
409 id = cpuref->cr_cpuid + 1;
410 if (id >= platform_cpuref_cnt)
411 return (ENOENT);
412
413 cpuref->cr_cpuid = platform_cpuref[id].cr_cpuid;
414 cpuref->cr_hwref = platform_cpuref[id].cr_hwref;
415 cpuref->cr_domain = platform_cpuref[id].cr_domain;
416
417 return (0);
418 }
419
420 static int
powernv_smp_get_bsp(platform_t plat,struct cpuref * cpuref)421 powernv_smp_get_bsp(platform_t plat, struct cpuref *cpuref)
422 {
423
424 cpuref->cr_cpuid = platform_cpuref[0].cr_cpuid;
425 cpuref->cr_hwref = platform_cpuref[0].cr_hwref;
426 cpuref->cr_domain = platform_cpuref[0].cr_domain;
427 return (0);
428 }
429
430 #ifdef SMP
431 static int
powernv_smp_start_cpu(platform_t plat,struct pcpu * pc)432 powernv_smp_start_cpu(platform_t plat, struct pcpu *pc)
433 {
434 int result;
435
436 ap_pcpu = pc;
437 powerpc_sync();
438
439 result = opal_call(OPAL_START_CPU, pc->pc_hwref, EXC_RST);
440 if (result != OPAL_SUCCESS) {
441 printf("OPAL error (%d): unable to start AP %d\n",
442 result, (int)pc->pc_hwref);
443 return (ENXIO);
444 }
445
446 return (0);
447 }
448
449 static void
powernv_smp_probe_threads(platform_t plat)450 powernv_smp_probe_threads(platform_t plat)
451 {
452 char buf[8];
453 phandle_t cpu, dev, root;
454 int res, nthreads;
455
456 root = OF_peer(0);
457
458 dev = OF_child(root);
459 while (dev != 0) {
460 res = OF_getprop(dev, "name", buf, sizeof(buf));
461 if (res > 0 && strcmp(buf, "cpus") == 0)
462 break;
463 dev = OF_peer(dev);
464 }
465
466 nthreads = 1;
467 for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) {
468 res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
469 if (res <= 0 || strcmp(buf, "cpu") != 0)
470 continue;
471
472 res = OF_getproplen(cpu, "ibm,ppc-interrupt-server#s");
473
474 if (res >= 0)
475 nthreads = res / sizeof(cell_t);
476 else
477 nthreads = 1;
478 break;
479 }
480
481 smp_threads_per_core = nthreads;
482 if (mp_ncpus % nthreads == 0)
483 mp_ncores = mp_ncpus / nthreads;
484 }
485
486 static struct cpu_group *
cpu_group_init(struct cpu_group * group,struct cpu_group * parent,const cpuset_t * cpus,int children,int level,int flags)487 cpu_group_init(struct cpu_group *group, struct cpu_group *parent,
488 const cpuset_t *cpus, int children, int level, int flags)
489 {
490 struct cpu_group *child;
491
492 child = children != 0 ? smp_topo_alloc(children) : NULL;
493
494 group->cg_parent = parent;
495 group->cg_child = child;
496 CPU_COPY(cpus, &group->cg_mask);
497 group->cg_count = CPU_COUNT(cpus);
498 group->cg_children = children;
499 group->cg_level = level;
500 group->cg_flags = flags;
501
502 return (child);
503 }
504
/*
 * Build the scheduler topology: root -> NUMA domains (CG_SHARE_NONE) ->
 * cores (CG_SHARE_L3) -> SMT thread leaves (CG_SHARE_L1/CG_FLAG_SMT).
 * Falls back to a flat topology when the thread count is irregular.
 */
static struct cpu_group *
powernv_smp_topo(platform_t plat)
{
	struct cpu_group *core, *dom, *root;
	cpuset_t corecpus, domcpus;
	int cpuid, i, j, k, ncores;

	if (mp_ncpus % smp_threads_per_core != 0) {
		printf("%s: irregular SMP topology (%d threads, %d per core)\n",
		    __func__, mp_ncpus, smp_threads_per_core);
		return (smp_topo_none());
	}

	/* cpu_group_init() returns the child array of the group it fills. */
	root = smp_topo_alloc(1);
	dom = cpu_group_init(root, NULL, &all_cpus, vm_ndomains, CG_SHARE_NONE,
	    0);

	/*
	 * Redundant layers will be collapsed by the caller so we don't need a
	 * special case for a single domain.
	 */
	for (i = 0; i < vm_ndomains; i++, dom++) {
		CPU_COPY(&cpuset_domain[i], &domcpus);
		ncores = CPU_COUNT(&domcpus) / smp_threads_per_core;
		KASSERT(CPU_COUNT(&domcpus) % smp_threads_per_core == 0,
		    ("%s: domain %d core count not divisible by thread count",
		    __func__, i));

		core = cpu_group_init(dom, root, &domcpus, ncores, CG_SHARE_L3,
		    0);
		for (j = 0; j < ncores; j++, core++) {
			/*
			 * Assume that consecutive CPU IDs correspond to sibling
			 * threads.
			 */
			CPU_ZERO(&corecpus);
			/* Pull the next lowest-numbered CPUs out of domcpus. */
			for (k = 0; k < smp_threads_per_core; k++) {
				cpuid = CPU_FFS(&domcpus) - 1;
				CPU_CLR(cpuid, &domcpus);
				CPU_SET(cpuid, &corecpus);
			}
			(void)cpu_group_init(core, dom, &corecpus, 0,
			    CG_SHARE_L1, CG_FLAG_SMT);
		}
	}

	return (root);
}
553
554 #endif
555
/*
 * Ask OPAL firmware to reboot the machine.
 */
static void
powernv_reset(platform_t platform)
{

	opal_call(OPAL_CEC_REBOOT);
}
562
/*
 * Per-AP init hook: forward to the optional callback registered through
 * the powernv_smp_ap_extra_init pointer, if any.
 */
static void
powernv_smp_ap_init(platform_t platform)
{

	if (powernv_smp_ap_extra_init != NULL)
		powernv_smp_ap_extra_init();
}
570
/*
 * Idle hook installed by powernv_attach() when no other hook is
 * configured; intentionally empty.
 */
static void
powernv_cpu_idle(sbintime_t sbt)
{
}
575
/*
 * Map a device tree node to a NUMA domain index using cell number
 * platform_associativity of its "ibm,associativity" property, walking up
 * to the parent node when the property is missing or too short.  Each
 * newly seen associativity value is assigned the next free domain index.
 */
static int
powernv_node_numa_domain(platform_t platform, phandle_t node)
{
	/* XXX: Is locking necessary in here? */
	static int numa_domains[MAXMEMDOM];
	static int numa_max_domain;
	cell_t associativity[5];
	int i, res;

#ifndef NUMA
	/* Without NUMA support everything lives in domain 0. */
	return (0);
#endif
	/* Honor the vm.numa.disabled tunable. */
	i = 0;
	TUNABLE_INT_FETCH("vm.numa.disabled", &i);
	if (i)
		return (0);

	/*
	 * NOTE(review): associativity[] has 5 cells, so the indexing below
	 * assumes platform_associativity <= 4.  That holds for the skiboot
	 * default, but the device-tree-supplied value is not range-checked
	 * in powernv_attach -- confirm.
	 */
	res = OF_getencprop(node, "ibm,associativity",
	    associativity, sizeof(associativity));

	/*
	 * If this node doesn't have associativity, or if there are not
	 * enough elements in it, check its parent.
	 */
	if (res < (int)(sizeof(cell_t) * (platform_associativity + 1))) {
		node = OF_parent(node);
		/* If already at the root, use default domain. */
		if (node == 0)
			return (0);
		return (powernv_node_numa_domain(platform, node));
	}

	/* Known associativity value: return its existing domain index. */
	for (i = 0; i < numa_max_domain; i++) {
		if (numa_domains[i] == associativity[platform_associativity])
			return (i);
	}
	/* First sighting: allocate the next slot, or fall back to domain 0. */
	if (i < MAXMEMDOM)
		numa_domains[numa_max_domain++] =
		    associativity[platform_associativity];
	else
		i = 0;

	return (i);
}
620
/* Set up the Nest MMU on POWER9 relatively early, but after pmap is setup. */
static void
powernv_setup_nmmu(void *unused)
{
	/* Nothing to do on machines without OPAL firmware. */
	if (opal_check() != 0)
		return;
	/* Hand the CPU's partition table base (PTCR) to the Nest MMU. */
	opal_call(OPAL_NMMU_SET_PTCR, -1, mfspr(SPR_PTCR));
}

SYSINIT(powernv_setup_nmmu, SI_SUB_CPU, SI_ORDER_ANY, powernv_setup_nmmu, NULL);
631