xref: /qemu/hw/ppc/spapr.c (revision 735f9c878a3e97b1257f2345579734ea2877c46d)
1 /*
2  * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
3  *
4  * Copyright (c) 2004-2007 Fabrice Bellard
5  * Copyright (c) 2007 Jocelyn Mayer
6  * Copyright (c) 2010 David Gibson, IBM Corporation.
7  * Copyright (c) 2010-2024, IBM Corporation..
8  *
9  * SPDX-License-Identifier: GPL-2.0-or-later
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
27  * THE SOFTWARE.
28  */
29 
30 #include "qemu/osdep.h"
31 #include "qemu/datadir.h"
32 #include "qemu/memalign.h"
33 #include "qemu/guest-random.h"
34 #include "qapi/error.h"
35 #include "qapi/qapi-events-machine.h"
36 #include "qapi/qapi-events-qdev.h"
37 #include "qapi/visitor.h"
38 #include "system/system.h"
39 #include "system/hostmem.h"
40 #include "system/numa.h"
41 #include "system/tcg.h"
42 #include "system/qtest.h"
43 #include "system/reset.h"
44 #include "system/runstate.h"
45 #include "qemu/log.h"
46 #include "hw/fw-path-provider.h"
47 #include "elf.h"
48 #include "net/net.h"
49 #include "system/device_tree.h"
50 #include "system/cpus.h"
51 #include "system/hw_accel.h"
52 #include "kvm_ppc.h"
53 #include "migration/misc.h"
54 #include "migration/qemu-file-types.h"
55 #include "migration/global_state.h"
56 #include "migration/register.h"
57 #include "migration/blocker.h"
58 #include "mmu-hash64.h"
59 #include "mmu-book3s-v3.h"
60 #include "cpu-models.h"
61 #include "hw/core/cpu.h"
62 
63 #include "hw/ppc/ppc.h"
64 #include "hw/loader.h"
65 
66 #include "hw/ppc/fdt.h"
67 #include "hw/ppc/spapr.h"
68 #include "hw/ppc/spapr_nested.h"
69 #include "hw/ppc/spapr_vio.h"
70 #include "hw/ppc/vof.h"
71 #include "hw/qdev-properties.h"
72 #include "hw/pci-host/spapr.h"
73 #include "hw/pci/msi.h"
74 
75 #include "hw/pci/pci.h"
76 #include "hw/scsi/scsi.h"
77 #include "hw/virtio/virtio-scsi.h"
78 #include "hw/virtio/vhost-scsi-common.h"
79 
80 #include "exec/ram_addr.h"
81 #include "system/confidential-guest-support.h"
82 #include "hw/usb.h"
83 #include "qemu/config-file.h"
84 #include "qemu/error-report.h"
85 #include "trace.h"
86 #include "hw/nmi.h"
87 #include "hw/intc/intc.h"
88 
89 #include "hw/ppc/spapr_cpu_core.h"
90 #include "hw/mem/memory-device.h"
91 #include "hw/ppc/spapr_tpm_proxy.h"
92 #include "hw/ppc/spapr_nvdimm.h"
93 #include "hw/ppc/spapr_numa.h"
94 
95 #include <libfdt.h>
96 
/* SLOF memory layout:
 *
 * The SLOF raw image is loaded at 0, copies its romfs right below the flat
 * device-tree, then positions SLOF itself 31M below that.
 *
 * So we set FW_OVERHEAD to 40MB, which should account for all of that
 * and more.
 *
 * We load our kernel at 4M, leaving space for the SLOF initial image.
 */
107 #define FDT_MAX_ADDR            0x80000000 /* FDT must stay below that */
108 #define FW_MAX_SIZE             0x400000
109 #define FW_FILE_NAME            "slof.bin"
110 #define FW_FILE_NAME_VOF        "vof.bin"
111 #define FW_OVERHEAD             0x2800000
112 #define KERNEL_LOAD_ADDR        FW_MAX_SIZE
113 
114 #define MIN_RMA_SLOF            (128 * MiB)
115 
116 #define PHANDLE_INTC            0x00001111
117 
118 /* These two functions implement the VCPU id numbering: one to compute them
119  * all and one to identify thread 0 of a VCORE. Any change to the first one
120  * is likely to have an impact on the second one, so let's keep them close.
121  */
122 static int spapr_vcpu_id(SpaprMachineState *spapr, int cpu_index)
123 {
124     MachineState *ms = MACHINE(spapr);
125     unsigned int smp_threads = ms->smp.threads;
126 
127     assert(spapr->vsmt);
128     return
129         (cpu_index / smp_threads) * spapr->vsmt + cpu_index % smp_threads;
130 }
131 static bool spapr_is_thread0_in_vcore(SpaprMachineState *spapr,
132                                       PowerPCCPU *cpu)
133 {
134     assert(spapr->vsmt);
135     return spapr_get_vcpu_id(cpu) % spapr->vsmt == 0;
136 }
137 
138 int spapr_max_server_number(SpaprMachineState *spapr)
139 {
140     MachineState *ms = MACHINE(spapr);
141 
142     assert(spapr->vsmt);
143     return DIV_ROUND_UP(ms->smp.max_cpus * spapr->vsmt, ms->smp.threads);
144 }
145 
146 static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
147                                   int smt_threads)
148 {
149     int i, ret = 0;
150     g_autofree uint32_t *servers_prop = g_new(uint32_t, smt_threads);
151     g_autofree uint32_t *gservers_prop = g_new(uint32_t, smt_threads * 2);
152     int index = spapr_get_vcpu_id(cpu);
153 
154     if (cpu->compat_pvr) {
155         ret = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->compat_pvr);
156         if (ret < 0) {
157             return ret;
158         }
159     }
160 
161     /* Build interrupt servers and gservers properties */
162     for (i = 0; i < smt_threads; i++) {
163         servers_prop[i] = cpu_to_be32(index + i);
164         /* Hack, direct the group queues back to cpu 0 */
165         gservers_prop[i*2] = cpu_to_be32(index + i);
166         gservers_prop[i*2 + 1] = 0;
167     }
168     ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s",
169                       servers_prop, sizeof(*servers_prop) * smt_threads);
170     if (ret < 0) {
171         return ret;
172     }
173     ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-gserver#s",
174                       gservers_prop, sizeof(*gservers_prop) * smt_threads * 2);
175 
176     return ret;
177 }
178 
179 static void spapr_dt_pa_features(SpaprMachineState *spapr,
180                                  PowerPCCPU *cpu,
181                                  void *fdt, int offset)
182 {
183     /*
184      * SSO (SAO) ordering is supported on KVM and thread=single hosts,
185      * but not MTTCG, so disable it. To advertise it, a cap would have
186      * to be added, or support implemented for MTTCG.
187      *
188      * Copy/paste is not supported by TCG, so it is not advertised. KVM
189      * can execute them but it has no accelerator drivers which are usable,
190      * so there isn't much need for it anyway.
191      */
192 
193     /* These should be kept in sync with pnv */
194     uint8_t pa_features_206[] = { 6, 0,
195         0xf6, 0x1f, 0xc7, 0x00, 0x00, 0xc0 };
196     uint8_t pa_features_207[] = { 24, 0,
197         0xf6, 0x1f, 0xc7, 0xc0, 0x00, 0xf0,
198         0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
199         0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
200         0x80, 0x00, 0x80, 0x00, 0x00, 0x00 };
201     uint8_t pa_features_300[] = { 66, 0,
202         /* 0: MMU|FPU|SLB|RUN|DABR|NX, 1: fri[nzpm]|DABRX|SPRG3|SLB0|PP110 */
203         /* 2: VPM|DS205|PPR|DS202|DS206, 3: LSD|URG, 5: LE|CFAR|EB|LSQ */
204         0xf6, 0x1f, 0xc7, 0xc0, 0x00, 0xf0, /* 0 - 5 */
205         /* 6: DS207 */
206         0x80, 0x00, 0x00, 0x00, 0x00, 0x00, /* 6 - 11 */
207         /* 16: Vector */
208         0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 12 - 17 */
209         /* 18: Vec. Scalar, 20: Vec. XOR */
210         0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 18 - 23 */
211         /* 24: Ext. Dec, 26: 64 bit ftrs, 28: PM ftrs */
212         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 24 - 29 */
213         /* 32: LE atomic, 34: EBB + ext EBB */
214         0x00, 0x00, 0x80, 0x00, 0xC0, 0x00, /* 30 - 35 */
215         /* 40: Radix MMU */
216         0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 36 - 41 */
217         /* 42: PM, 44: PC RA, 46: SC vec'd */
218         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 42 - 47 */
219         /* 48: SIMD, 50: QP BFP, 52: String */
220         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
221         /* 54: DecFP, 56: DecI, 58: SHA */
222         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
223         /* 60: NM atomic, 62: RNG */
224         0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
225     };
226     /* 3.1 removes SAO, HTM support */
227     uint8_t pa_features_31[] = { 74, 0,
228         /* 0: MMU|FPU|SLB|RUN|DABR|NX, 1: fri[nzpm]|DABRX|SPRG3|SLB0|PP110 */
229         /* 2: VPM|DS205|PPR|DS202|DS206, 3: LSD|URG, 5: LE|CFAR|EB|LSQ */
230         0xf6, 0x1f, 0xc7, 0xc0, 0x00, 0xf0, /* 0 - 5 */
231         /* 6: DS207 */
232         0x80, 0x00, 0x00, 0x00, 0x00, 0x00, /* 6 - 11 */
233         /* 16: Vector */
234         0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 12 - 17 */
235         /* 18: Vec. Scalar, 20: Vec. XOR */
236         0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 18 - 23 */
237         /* 24: Ext. Dec, 26: 64 bit ftrs, 28: PM ftrs */
238         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 24 - 29 */
239         /* 32: LE atomic, 34: EBB + ext EBB */
240         0x00, 0x00, 0x80, 0x00, 0xC0, 0x00, /* 30 - 35 */
241         /* 40: Radix MMU */
242         0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 36 - 41 */
243         /* 42: PM, 44: PC RA, 46: SC vec'd */
244         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 42 - 47 */
245         /* 48: SIMD, 50: QP BFP, 52: String */
246         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
247         /* 54: DecFP, 56: DecI, 58: SHA */
248         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
249         /* 60: NM atomic, 62: RNG */
250         0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
251         /* 68: DEXCR[SBHE|IBRTPDUS|SRAPD|NPHIE|PHIE] */
252         0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 66 - 71 */
253         /* 72: [P]HASHST/[P]HASHCHK */
254         0x80, 0x00,                         /* 72 - 73 */
255     };
256     uint8_t *pa_features = NULL;
257     size_t pa_size;
258 
259     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_06, 0, cpu->compat_pvr)) {
260         pa_features = pa_features_206;
261         pa_size = sizeof(pa_features_206);
262     }
263     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_07, 0, cpu->compat_pvr)) {
264         pa_features = pa_features_207;
265         pa_size = sizeof(pa_features_207);
266     }
267     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0, cpu->compat_pvr)) {
268         pa_features = pa_features_300;
269         pa_size = sizeof(pa_features_300);
270     }
271     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_10, 0, cpu->compat_pvr)) {
272         pa_features = pa_features_31;
273         pa_size = sizeof(pa_features_31);
274     }
275     if (!pa_features) {
276         return;
277     }
278 
279     if (ppc_hash64_has(cpu, PPC_HASH64_CI_LARGEPAGE)) {
280         /*
281          * Note: we keep CI large pages off by default because a 64K capable
282          * guest provisioned with large pages might otherwise try to map a qemu
283          * framebuffer (or other kind of memory mapped PCI BAR) using 64K pages
284          * even if that qemu runs on a 4k host.
285          * We dd this bit back here if we are confident this is not an issue
286          */
287         pa_features[3] |= 0x20;
288     }
289     if ((spapr_get_cap(spapr, SPAPR_CAP_HTM) != 0) && pa_size > 24) {
290         pa_features[24] |= 0x80;    /* Transactional memory support */
291     }
292     if (spapr->cas_pre_isa3_guest && pa_size > 40) {
293         /* Workaround for broken kernels that attempt (guest) radix
294          * mode when they can't handle it, if they see the radix bit set
295          * in pa-features. So hide it from them. */
296         pa_features[40 + 2] &= ~0x80; /* Radix MMU */
297     }
298 
299     _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
300 }
301 
302 static void spapr_dt_pi_features(SpaprMachineState *spapr,
303                                  PowerPCCPU *cpu,
304                                  void *fdt, int offset)
305 {
306     uint8_t pi_features[] = { 1, 0,
307         0x00 };
308 
309     if (kvm_enabled() && ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00,
310                                           0, cpu->compat_pvr)) {
311         /*
312          * POWER9 and later CPUs with KVM run in LPAR-per-thread mode where
313          * all threads are essentially independent CPUs, and msgsndp does not
314          * work (because it is physically-addressed) and therefore is
315          * emulated by KVM, so disable it here to ensure XIVE will be used.
316          * This is both KVM and CPU implementation-specific behaviour so a KVM
317          * cap would be cleanest, but for now this works. If KVM ever permits
318          * native msgsndp execution by guests, a cap could be added at that
319          * time.
320          */
321         pi_features[2] |= 0x08; /* 4: No msgsndp */
322     }
323 
324     _FDT((fdt_setprop(fdt, offset, "ibm,pi-features", pi_features,
325                       sizeof(pi_features))));
326 }
327 
328 static hwaddr spapr_node0_size(MachineState *machine)
329 {
330     if (machine->numa_state->num_nodes) {
331         int i;
332         for (i = 0; i < machine->numa_state->num_nodes; ++i) {
333             if (machine->numa_state->nodes[i].node_mem) {
334                 return MIN(pow2floor(machine->numa_state->nodes[i].node_mem),
335                            machine->ram_size);
336             }
337         }
338     }
339     return machine->ram_size;
340 }
341 
342 static void add_str(GString *s, const gchar *s1)
343 {
344     g_string_append_len(s, s1, strlen(s1) + 1);
345 }
346 
347 static int spapr_dt_memory_node(SpaprMachineState *spapr, void *fdt, int nodeid,
348                                 hwaddr start, hwaddr size)
349 {
350     char mem_name[32];
351     uint64_t mem_reg_property[2];
352     int off;
353 
354     mem_reg_property[0] = cpu_to_be64(start);
355     mem_reg_property[1] = cpu_to_be64(size);
356 
357     sprintf(mem_name, "memory@%" HWADDR_PRIx, start);
358     off = fdt_add_subnode(fdt, 0, mem_name);
359     _FDT(off);
360     _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
361     _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
362                       sizeof(mem_reg_property))));
363     spapr_numa_write_associativity_dt(spapr, fdt, off, nodeid);
364     return off;
365 }
366 
367 static uint32_t spapr_pc_dimm_node(MemoryDeviceInfoList *list, ram_addr_t addr)
368 {
369     MemoryDeviceInfoList *info;
370 
371     for (info = list; info; info = info->next) {
372         MemoryDeviceInfo *value = info->value;
373 
374         if (value && value->type == MEMORY_DEVICE_INFO_KIND_DIMM) {
375             PCDIMMDeviceInfo *pcdimm_info = value->u.dimm.data;
376 
377             if (addr >= pcdimm_info->addr &&
378                 addr < (pcdimm_info->addr + pcdimm_info->size)) {
379                 return pcdimm_info->node;
380             }
381         }
382     }
383 
384     return -1;
385 }
386 
387 struct sPAPRDrconfCellV2 {
388      uint32_t seq_lmbs;
389      uint64_t base_addr;
390      uint32_t drc_index;
391      uint32_t aa_index;
392      uint32_t flags;
393 } QEMU_PACKED;
394 
395 typedef struct DrconfCellQueue {
396     struct sPAPRDrconfCellV2 cell;
397     QSIMPLEQ_ENTRY(DrconfCellQueue) entry;
398 } DrconfCellQueue;
399 
400 static DrconfCellQueue *
401 spapr_get_drconf_cell(uint32_t seq_lmbs, uint64_t base_addr,
402                       uint32_t drc_index, uint32_t aa_index,
403                       uint32_t flags)
404 {
405     DrconfCellQueue *elem;
406 
407     elem = g_malloc0(sizeof(*elem));
408     elem->cell.seq_lmbs = cpu_to_be32(seq_lmbs);
409     elem->cell.base_addr = cpu_to_be64(base_addr);
410     elem->cell.drc_index = cpu_to_be32(drc_index);
411     elem->cell.aa_index = cpu_to_be32(aa_index);
412     elem->cell.flags = cpu_to_be32(flags);
413 
414     return elem;
415 }
416 
417 static int spapr_dt_dynamic_memory_v2(SpaprMachineState *spapr, void *fdt,
418                                       int offset, MemoryDeviceInfoList *dimms)
419 {
420     MachineState *machine = MACHINE(spapr);
421     uint8_t *int_buf, *cur_index;
422     int ret;
423     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
424     uint64_t addr, cur_addr, size;
425     uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
426     uint64_t mem_end = machine->device_memory->base +
427                        memory_region_size(&machine->device_memory->mr);
428     uint32_t node, buf_len, nr_entries = 0;
429     SpaprDrc *drc;
430     DrconfCellQueue *elem, *next;
431     MemoryDeviceInfoList *info;
432     QSIMPLEQ_HEAD(, DrconfCellQueue) drconf_queue
433         = QSIMPLEQ_HEAD_INITIALIZER(drconf_queue);
434 
435     /* Entry to cover RAM and the gap area */
436     elem = spapr_get_drconf_cell(nr_boot_lmbs, 0, 0, -1,
437                                  SPAPR_LMB_FLAGS_RESERVED |
438                                  SPAPR_LMB_FLAGS_DRC_INVALID);
439     QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
440     nr_entries++;
441 
442     cur_addr = machine->device_memory->base;
443     for (info = dimms; info; info = info->next) {
444         PCDIMMDeviceInfo *di = info->value->u.dimm.data;
445 
446         addr = di->addr;
447         size = di->size;
448         node = di->node;
449 
450         /*
451          * The NVDIMM area is hotpluggable after the NVDIMM is unplugged. The
452          * area is marked hotpluggable in the next iteration for the bigger
453          * chunk including the NVDIMM occupied area.
454          */
455         if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM)
456             continue;
457 
458         /* Entry for hot-pluggable area */
459         if (cur_addr < addr) {
460             drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
461             g_assert(drc);
462             elem = spapr_get_drconf_cell((addr - cur_addr) / lmb_size,
463                                          cur_addr, spapr_drc_index(drc), -1, 0);
464             QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
465             nr_entries++;
466         }
467 
468         /* Entry for DIMM */
469         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
470         g_assert(drc);
471         elem = spapr_get_drconf_cell(size / lmb_size, addr,
472                                      spapr_drc_index(drc), node,
473                                      (SPAPR_LMB_FLAGS_ASSIGNED |
474                                       SPAPR_LMB_FLAGS_HOTREMOVABLE));
475         QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
476         nr_entries++;
477         cur_addr = addr + size;
478     }
479 
480     /* Entry for remaining hotpluggable area */
481     if (cur_addr < mem_end) {
482         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
483         g_assert(drc);
484         elem = spapr_get_drconf_cell((mem_end - cur_addr) / lmb_size,
485                                      cur_addr, spapr_drc_index(drc), -1, 0);
486         QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
487         nr_entries++;
488     }
489 
490     buf_len = nr_entries * sizeof(struct sPAPRDrconfCellV2) + sizeof(uint32_t);
491     int_buf = cur_index = g_malloc0(buf_len);
492     *(uint32_t *)int_buf = cpu_to_be32(nr_entries);
493     cur_index += sizeof(nr_entries);
494 
495     QSIMPLEQ_FOREACH_SAFE(elem, &drconf_queue, entry, next) {
496         memcpy(cur_index, &elem->cell, sizeof(elem->cell));
497         cur_index += sizeof(elem->cell);
498         QSIMPLEQ_REMOVE(&drconf_queue, elem, DrconfCellQueue, entry);
499         g_free(elem);
500     }
501 
502     ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory-v2", int_buf, buf_len);
503     g_free(int_buf);
504     if (ret < 0) {
505         return -1;
506     }
507     return 0;
508 }
509 
510 static int spapr_dt_dynamic_memory(SpaprMachineState *spapr, void *fdt,
511                                    int offset, MemoryDeviceInfoList *dimms)
512 {
513     MachineState *machine = MACHINE(spapr);
514     int i, ret;
515     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
516     uint32_t device_lmb_start = machine->device_memory->base / lmb_size;
517     uint32_t nr_lmbs = (machine->device_memory->base +
518                        memory_region_size(&machine->device_memory->mr)) /
519                        lmb_size;
520     uint32_t *int_buf, *cur_index, buf_len;
521 
522     /*
523      * Allocate enough buffer size to fit in ibm,dynamic-memory
524      */
525     buf_len = (nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1) * sizeof(uint32_t);
526     cur_index = int_buf = g_malloc0(buf_len);
527     int_buf[0] = cpu_to_be32(nr_lmbs);
528     cur_index++;
529     for (i = 0; i < nr_lmbs; i++) {
530         uint64_t addr = i * lmb_size;
531         uint32_t *dynamic_memory = cur_index;
532 
533         if (i >= device_lmb_start) {
534             SpaprDrc *drc;
535 
536             drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, i);
537             g_assert(drc);
538 
539             dynamic_memory[0] = cpu_to_be32(addr >> 32);
540             dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
541             dynamic_memory[2] = cpu_to_be32(spapr_drc_index(drc));
542             dynamic_memory[3] = cpu_to_be32(0); /* reserved */
543             dynamic_memory[4] = cpu_to_be32(spapr_pc_dimm_node(dimms, addr));
544             if (memory_region_present(get_system_memory(), addr)) {
545                 dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_ASSIGNED);
546             } else {
547                 dynamic_memory[5] = cpu_to_be32(0);
548             }
549         } else {
550             /*
551              * LMB information for RMA, boot time RAM and gap b/n RAM and
552              * device memory region -- all these are marked as reserved
553              * and as having no valid DRC.
554              */
555             dynamic_memory[0] = cpu_to_be32(addr >> 32);
556             dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
557             dynamic_memory[2] = cpu_to_be32(0);
558             dynamic_memory[3] = cpu_to_be32(0); /* reserved */
559             dynamic_memory[4] = cpu_to_be32(-1);
560             dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_RESERVED |
561                                             SPAPR_LMB_FLAGS_DRC_INVALID);
562         }
563 
564         cur_index += SPAPR_DR_LMB_LIST_ENTRY_SIZE;
565     }
566     ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_len);
567     g_free(int_buf);
568     if (ret < 0) {
569         return -1;
570     }
571     return 0;
572 }
573 
574 /*
575  * Adds ibm,dynamic-reconfiguration-memory node.
576  * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation
577  * of this device tree node.
578  */
579 static int spapr_dt_dynamic_reconfiguration_memory(SpaprMachineState *spapr,
580                                                    void *fdt)
581 {
582     MachineState *machine = MACHINE(spapr);
583     int ret, offset;
584     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
585     uint32_t prop_lmb_size[] = {cpu_to_be32(lmb_size >> 32),
586                                 cpu_to_be32(lmb_size & 0xffffffff)};
587     MemoryDeviceInfoList *dimms = NULL;
588 
589     /* Don't create the node if there is no device memory. */
590     if (!machine->device_memory) {
591         return 0;
592     }
593 
594     offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");
595 
596     ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
597                     sizeof(prop_lmb_size));
598     if (ret < 0) {
599         return ret;
600     }
601 
602     ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
603     if (ret < 0) {
604         return ret;
605     }
606 
607     ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
608     if (ret < 0) {
609         return ret;
610     }
611 
612     /* ibm,dynamic-memory or ibm,dynamic-memory-v2 */
613     dimms = qmp_memory_device_list();
614     if (spapr_ovec_test(spapr->ov5_cas, OV5_DRMEM_V2)) {
615         ret = spapr_dt_dynamic_memory_v2(spapr, fdt, offset, dimms);
616     } else {
617         ret = spapr_dt_dynamic_memory(spapr, fdt, offset, dimms);
618     }
619     qapi_free_MemoryDeviceInfoList(dimms);
620 
621     if (ret < 0) {
622         return ret;
623     }
624 
625     ret = spapr_numa_write_assoc_lookup_arrays(spapr, fdt, offset);
626 
627     return ret;
628 }
629 
630 static int spapr_dt_memory(SpaprMachineState *spapr, void *fdt)
631 {
632     MachineState *machine = MACHINE(spapr);
633     hwaddr mem_start, node_size;
634     int i, nb_nodes = machine->numa_state->num_nodes;
635     NodeInfo *nodes = machine->numa_state->nodes;
636 
637     for (i = 0, mem_start = 0; i < nb_nodes; ++i) {
638         if (!nodes[i].node_mem) {
639             continue;
640         }
641         if (mem_start >= machine->ram_size) {
642             node_size = 0;
643         } else {
644             node_size = nodes[i].node_mem;
645             if (node_size > machine->ram_size - mem_start) {
646                 node_size = machine->ram_size - mem_start;
647             }
648         }
649         if (!mem_start) {
650             /* spapr_machine_init() checks for rma_size <= node0_size
651              * already */
652             spapr_dt_memory_node(spapr, fdt, i, 0, spapr->rma_size);
653             mem_start += spapr->rma_size;
654             node_size -= spapr->rma_size;
655         }
656         for ( ; node_size; ) {
657             hwaddr sizetmp = pow2floor(node_size);
658 
659             /* mem_start != 0 here */
660             if (ctzl(mem_start) < ctzl(sizetmp)) {
661                 sizetmp = 1ULL << ctzl(mem_start);
662             }
663 
664             spapr_dt_memory_node(spapr, fdt, i, mem_start, sizetmp);
665             node_size -= sizetmp;
666             mem_start += sizetmp;
667         }
668     }
669 
670     /* Generate ibm,dynamic-reconfiguration-memory node if required */
671     if (spapr_ovec_test(spapr->ov5_cas, OV5_DRCONF_MEMORY)) {
672         int ret;
673 
674         ret = spapr_dt_dynamic_reconfiguration_memory(spapr, fdt);
675         if (ret) {
676             return ret;
677         }
678     }
679 
680     return 0;
681 }
682 
683 static void spapr_dt_cpu(CPUState *cs, void *fdt, int offset,
684                          SpaprMachineState *spapr)
685 {
686     MachineState *ms = MACHINE(spapr);
687     PowerPCCPU *cpu = POWERPC_CPU(cs);
688     CPUPPCState *env = &cpu->env;
689     PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
690     int index = spapr_get_vcpu_id(cpu);
691     uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
692                        0xffffffff, 0xffffffff};
693     uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq()
694         : SPAPR_TIMEBASE_FREQ;
695     uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
696     uint32_t page_sizes_prop[64];
697     size_t page_sizes_prop_size;
698     unsigned int smp_threads = ms->smp.threads;
699     uint32_t vcpus_per_socket = smp_threads * ms->smp.cores;
700     uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
701     int compat_smt = MIN(smp_threads, ppc_compat_max_vthreads(cpu));
702     SpaprDrc *drc;
703     int drc_index;
704     uint32_t radix_AP_encodings[PPC_PAGE_SIZES_MAX_SZ];
705     int i;
706 
707     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU, env->core_index);
708     if (drc) {
709         drc_index = spapr_drc_index(drc);
710         _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index)));
711     }
712 
713     _FDT((fdt_setprop_cell(fdt, offset, "reg", index)));
714     _FDT((fdt_setprop_string(fdt, offset, "device_type", "cpu")));
715 
716     _FDT((fdt_setprop_cell(fdt, offset, "cpu-version", env->spr[SPR_PVR])));
717     _FDT((fdt_setprop_cell(fdt, offset, "d-cache-block-size",
718                            env->dcache_line_size)));
719     _FDT((fdt_setprop_cell(fdt, offset, "d-cache-line-size",
720                            env->dcache_line_size)));
721     _FDT((fdt_setprop_cell(fdt, offset, "i-cache-block-size",
722                            env->icache_line_size)));
723     _FDT((fdt_setprop_cell(fdt, offset, "i-cache-line-size",
724                            env->icache_line_size)));
725 
726     if (pcc->l1_dcache_size) {
727         _FDT((fdt_setprop_cell(fdt, offset, "d-cache-size",
728                                pcc->l1_dcache_size)));
729     } else {
730         warn_report("Unknown L1 dcache size for cpu");
731     }
732     if (pcc->l1_icache_size) {
733         _FDT((fdt_setprop_cell(fdt, offset, "i-cache-size",
734                                pcc->l1_icache_size)));
735     } else {
736         warn_report("Unknown L1 icache size for cpu");
737     }
738 
739     _FDT((fdt_setprop_cell(fdt, offset, "timebase-frequency", tbfreq)));
740     _FDT((fdt_setprop_cell(fdt, offset, "clock-frequency", cpufreq)));
741     _FDT((fdt_setprop_cell(fdt, offset, "slb-size", cpu->hash64_opts->slb_size)));
742     _FDT((fdt_setprop_cell(fdt, offset, "ibm,slb-size", cpu->hash64_opts->slb_size)));
743     _FDT((fdt_setprop_string(fdt, offset, "status", "okay")));
744     _FDT((fdt_setprop(fdt, offset, "64-bit", NULL, 0)));
745 
746     if (ppc_has_spr(cpu, SPR_PURR)) {
747         _FDT((fdt_setprop_cell(fdt, offset, "ibm,purr", 1)));
748     }
749     if (ppc_has_spr(cpu, SPR_PURR)) {
750         _FDT((fdt_setprop_cell(fdt, offset, "ibm,spurr", 1)));
751     }
752 
753     if (ppc_hash64_has(cpu, PPC_HASH64_1TSEG)) {
754         _FDT((fdt_setprop(fdt, offset, "ibm,processor-segment-sizes",
755                           segs, sizeof(segs))));
756     }
757 
758     /* Advertise VSX (vector extensions) if available
759      *   1               == VMX / Altivec available
760      *   2               == VSX available
761      *
762      * Only CPUs for which we create core types in spapr_cpu_core.c
763      * are possible, and all of those have VMX */
764     if (env->insns_flags & PPC_ALTIVEC) {
765         if (spapr_get_cap(spapr, SPAPR_CAP_VSX) != 0) {
766             _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 2)));
767         } else {
768             _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 1)));
769         }
770     }
771 
772     /* Advertise DFP (Decimal Floating Point) if available
773      *   0 / no property == no DFP
774      *   1               == DFP available */
775     if (spapr_get_cap(spapr, SPAPR_CAP_DFP) != 0) {
776         _FDT((fdt_setprop_cell(fdt, offset, "ibm,dfp", 1)));
777     }
778 
779     page_sizes_prop_size = ppc_create_page_sizes_prop(cpu, page_sizes_prop,
780                                                       sizeof(page_sizes_prop));
781     if (page_sizes_prop_size) {
782         _FDT((fdt_setprop(fdt, offset, "ibm,segment-page-sizes",
783                           page_sizes_prop, page_sizes_prop_size)));
784     }
785 
786     spapr_dt_pa_features(spapr, cpu, fdt, offset);
787 
788     spapr_dt_pi_features(spapr, cpu, fdt, offset);
789 
790     _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id",
791                            cs->cpu_index / vcpus_per_socket)));
792 
793     _FDT((fdt_setprop(fdt, offset, "ibm,pft-size",
794                       pft_size_prop, sizeof(pft_size_prop))));
795 
796     if (ms->numa_state->num_nodes > 1) {
797         _FDT(spapr_numa_fixup_cpu_dt(spapr, fdt, offset, cpu));
798     }
799 
800     _FDT(spapr_fixup_cpu_smt_dt(fdt, offset, cpu, compat_smt));
801 
802     if (pcc->radix_page_info) {
803         for (i = 0; i < pcc->radix_page_info->count; i++) {
804             radix_AP_encodings[i] =
805                 cpu_to_be32(pcc->radix_page_info->entries[i]);
806         }
807         _FDT((fdt_setprop(fdt, offset, "ibm,processor-radix-AP-encodings",
808                           radix_AP_encodings,
809                           pcc->radix_page_info->count *
810                           sizeof(radix_AP_encodings[0]))));
811     }
812 
813     /*
814      * We set this property to let the guest know that it can use the large
815      * decrementer and its width in bits.
816      */
817     if (spapr_get_cap(spapr, SPAPR_CAP_LARGE_DECREMENTER) != SPAPR_CAP_OFF)
818         _FDT((fdt_setprop_u32(fdt, offset, "ibm,dec-bits",
819                               pcc->lrg_decr_bits)));
820 }
821 
822 static void spapr_dt_one_cpu(void *fdt, SpaprMachineState *spapr, CPUState *cs,
823                              int cpus_offset)
824 {
825     PowerPCCPU *cpu = POWERPC_CPU(cs);
826     int index = spapr_get_vcpu_id(cpu);
827     DeviceClass *dc = DEVICE_GET_CLASS(cs);
828     g_autofree char *nodename = NULL;
829     int offset;
830 
831     if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
832         return;
833     }
834 
835     nodename = g_strdup_printf("%s@%x", dc->fw_name, index);
836     offset = fdt_add_subnode(fdt, cpus_offset, nodename);
837     _FDT(offset);
838     spapr_dt_cpu(cs, fdt, offset, spapr);
839 }
840 
841 
842 static void spapr_dt_cpus(void *fdt, SpaprMachineState *spapr)
843 {
844     CPUState **rev;
845     CPUState *cs;
846     int n_cpus;
847     int cpus_offset;
848     int i;
849 
850     cpus_offset = fdt_add_subnode(fdt, 0, "cpus");
851     _FDT(cpus_offset);
852     _FDT((fdt_setprop_cell(fdt, cpus_offset, "#address-cells", 0x1)));
853     _FDT((fdt_setprop_cell(fdt, cpus_offset, "#size-cells", 0x0)));
854 
855     /*
856      * We walk the CPUs in reverse order to ensure that CPU DT nodes
857      * created by fdt_add_subnode() end up in the right order in FDT
858      * for the guest kernel the enumerate the CPUs correctly.
859      *
860      * The CPU list cannot be traversed in reverse order, so we need
861      * to do extra work.
862      */
863     n_cpus = 0;
864     rev = NULL;
865     CPU_FOREACH(cs) {
866         rev = g_renew(CPUState *, rev, n_cpus + 1);
867         rev[n_cpus++] = cs;
868     }
869 
870     for (i = n_cpus - 1; i >= 0; i--) {
871         spapr_dt_one_cpu(fdt, spapr, rev[i], cpus_offset);
872     }
873 
874     g_free(rev);
875 }
876 
/*
 * Add the "/ibm,platform-facilities" node and its "ibm,random-v1"
 * child to @fdt.
 *
 * Returns 0 on success, -1 if either node could not be created or if
 * any of the property writes reported an error.
 */
static int spapr_dt_rng(void *fdt)
{
    int facilities;
    int rng;
    int err = 0;

    facilities = qemu_fdt_add_subnode(fdt, "/ibm,platform-facilities");
    if (facilities <= 0) {
        return -1;
    }

    /* Accumulate the property results; any non-zero bit means failure */
    err |= fdt_setprop_string(fdt, facilities, "device_type",
                              "ibm,platform-facilities");
    err |= fdt_setprop_cell(fdt, facilities, "#address-cells", 0x1);
    err |= fdt_setprop_cell(fdt, facilities, "#size-cells", 0x0);

    rng = fdt_add_subnode(fdt, facilities, "ibm,random-v1");
    if (rng <= 0) {
        return -1;
    }
    err |= fdt_setprop_string(fdt, rng, "compatible", "ibm,random");

    if (err) {
        return -1;
    }
    return 0;
}
899 
900 static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
901 {
902     MachineState *ms = MACHINE(spapr);
903     int rtas;
904     GString *hypertas = g_string_sized_new(256);
905     GString *qemu_hypertas = g_string_sized_new(256);
906     uint32_t lrdr_capacity[] = {
907         0,
908         0,
909         cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE >> 32),
910         cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE & 0xffffffff),
911         cpu_to_be32(ms->smp.max_cpus / ms->smp.threads),
912     };
913 
914     /* Do we have device memory? */
915     if (MACHINE(spapr)->device_memory) {
916         uint64_t max_device_addr = MACHINE(spapr)->device_memory->base +
917             memory_region_size(&MACHINE(spapr)->device_memory->mr);
918 
919         lrdr_capacity[0] = cpu_to_be32(max_device_addr >> 32);
920         lrdr_capacity[1] = cpu_to_be32(max_device_addr & 0xffffffff);
921     }
922 
923     _FDT(rtas = fdt_add_subnode(fdt, 0, "rtas"));
924 
925     /* hypertas */
926     add_str(hypertas, "hcall-pft");
927     add_str(hypertas, "hcall-term");
928     add_str(hypertas, "hcall-dabr");
929     add_str(hypertas, "hcall-interrupt");
930     add_str(hypertas, "hcall-tce");
931     add_str(hypertas, "hcall-vio");
932     add_str(hypertas, "hcall-splpar");
933     add_str(hypertas, "hcall-join");
934     add_str(hypertas, "hcall-bulk");
935     add_str(hypertas, "hcall-set-mode");
936     add_str(hypertas, "hcall-sprg0");
937     add_str(hypertas, "hcall-copy");
938     add_str(hypertas, "hcall-debug");
939     add_str(hypertas, "hcall-vphn");
940     if (spapr_get_cap(spapr, SPAPR_CAP_RPT_INVALIDATE) == SPAPR_CAP_ON) {
941         add_str(hypertas, "hcall-rpt-invalidate");
942     }
943 
944     add_str(qemu_hypertas, "hcall-memop1");
945 
946     if (!kvm_enabled() || kvmppc_spapr_use_multitce()) {
947         add_str(hypertas, "hcall-multi-tce");
948     }
949 
950     if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
951         add_str(hypertas, "hcall-hpt-resize");
952     }
953 
954     add_str(hypertas, "hcall-watchdog");
955 
956     _FDT(fdt_setprop(fdt, rtas, "ibm,hypertas-functions",
957                      hypertas->str, hypertas->len));
958     g_string_free(hypertas, TRUE);
959     _FDT(fdt_setprop(fdt, rtas, "qemu,hypertas-functions",
960                      qemu_hypertas->str, qemu_hypertas->len));
961     g_string_free(qemu_hypertas, TRUE);
962 
963     spapr_numa_write_rtas_dt(spapr, fdt, rtas);
964 
965     /*
966      * FWNMI reserves RTAS_ERROR_LOG_MAX for the machine check error log,
967      * and 16 bytes per CPU for system reset error log plus an extra 8 bytes.
968      *
969      * The system reset requirements are driven by existing Linux and PowerVM
970      * implementation which (contrary to PAPR) saves r3 in the error log
971      * structure like machine check, so Linux expects to find the saved r3
972      * value at the address in r3 upon FWNMI-enabled sreset interrupt (and
973      * does not look at the error value).
974      *
975      * System reset interrupts are not subject to interlock like machine
976      * check, so this memory area could be corrupted if the sreset is
977      * interrupted by a machine check (or vice versa) if it was shared. To
978      * prevent this, system reset uses per-CPU areas for the sreset save
979      * area. A system reset that interrupts a system reset handler could
980      * still overwrite this area, but Linux doesn't try to recover in that
981      * case anyway.
982      *
983      * The extra 8 bytes is required because Linux's FWNMI error log check
984      * is off-by-one.
985      *
986      * RTAS_MIN_SIZE is required for the RTAS blob itself.
987      */
988     _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_MIN_SIZE +
989                           RTAS_ERROR_LOG_MAX +
990                           ms->smp.max_cpus * sizeof(uint64_t) * 2 +
991                           sizeof(uint64_t)));
992     _FDT(fdt_setprop_cell(fdt, rtas, "rtas-error-log-max",
993                           RTAS_ERROR_LOG_MAX));
994     _FDT(fdt_setprop_cell(fdt, rtas, "rtas-event-scan-rate",
995                           RTAS_EVENT_SCAN_RATE));
996 
997     g_assert(msi_nonbroken);
998     _FDT(fdt_setprop(fdt, rtas, "ibm,change-msix-capable", NULL, 0));
999 
1000     /*
1001      * According to PAPR, rtas ibm,os-term does not guarantee a return
1002      * back to the guest cpu.
1003      *
1004      * While an additional ibm,extended-os-term property indicates
1005      * that rtas call return will always occur. Set this property.
1006      */
1007     _FDT(fdt_setprop(fdt, rtas, "ibm,extended-os-term", NULL, 0));
1008 
1009     _FDT(fdt_setprop(fdt, rtas, "ibm,lrdr-capacity",
1010                      lrdr_capacity, sizeof(lrdr_capacity)));
1011 
1012     spapr_dt_rtas_tokens(fdt, rtas);
1013 }
1014 
1015 /*
1016  * Prepare ibm,arch-vec-5-platform-support, which indicates the MMU
1017  * and the XIVE features that the guest may request and thus the valid
1018  * values for bytes 23..26 of option vector 5:
1019  */
1020 static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
1021                                           int chosen)
1022 {
1023     PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
1024 
1025     char val[2 * 4] = {
1026         23, 0x00, /* XICS / XIVE mode */
1027         24, 0x00, /* Hash/Radix, filled in below. */
1028         25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
1029         26, 0x40, /* Radix options: GTSE == yes. */
1030     };
1031 
1032     if (spapr->irq->xics && spapr->irq->xive) {
1033         val[1] = SPAPR_OV5_XIVE_BOTH;
1034     } else if (spapr->irq->xive) {
1035         val[1] = SPAPR_OV5_XIVE_EXPLOIT;
1036     } else {
1037         assert(spapr->irq->xics);
1038         val[1] = SPAPR_OV5_XIVE_LEGACY;
1039     }
1040 
1041     if (!ppc_check_compat(first_ppc_cpu, CPU_POWERPC_LOGICAL_3_00, 0,
1042                           first_ppc_cpu->compat_pvr)) {
1043         /*
1044          * If we're in a pre POWER9 compat mode then the guest should
1045          * do hash and use the legacy interrupt mode
1046          */
1047         val[1] = SPAPR_OV5_XIVE_LEGACY; /* XICS */
1048         val[3] = 0x00; /* Hash */
1049         spapr_check_mmu_mode(false);
1050     } else if (kvm_enabled()) {
1051         if (kvmppc_has_cap_mmu_radix() && kvmppc_has_cap_mmu_hash_v3()) {
1052             val[3] = 0x80; /* OV5_MMU_BOTH */
1053         } else if (kvmppc_has_cap_mmu_radix()) {
1054             val[3] = 0x40; /* OV5_MMU_RADIX_300 */
1055         } else {
1056             val[3] = 0x00; /* Hash */
1057         }
1058     } else {
1059         /* V3 MMU supports both hash and radix in tcg (with dynamic switching) */
1060         val[3] = 0xC0;
1061     }
1062     _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support",
1063                      val, sizeof(val)));
1064 }
1065 
1066 static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset)
1067 {
1068     MachineState *machine = MACHINE(spapr);
1069     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
1070     int chosen;
1071 
1072     _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen"));
1073 
1074     if (reset) {
1075         const char *boot_device = spapr->boot_device;
1076         g_autofree char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
1077         size_t cb = 0;
1078         g_autofree char *bootlist = get_boot_devices_list(&cb);
1079 
1080         if (machine->kernel_cmdline && machine->kernel_cmdline[0]) {
1081             _FDT(fdt_setprop_string(fdt, chosen, "bootargs",
1082                                     machine->kernel_cmdline));
1083         }
1084 
1085         if (spapr->initrd_size) {
1086             _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-start",
1087                                   spapr->initrd_base));
1088             _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-end",
1089                                   spapr->initrd_base + spapr->initrd_size));
1090         }
1091 
1092         if (spapr->kernel_size) {
1093             uint64_t kprop[2] = { cpu_to_be64(spapr->kernel_addr),
1094                                   cpu_to_be64(spapr->kernel_size) };
1095 
1096             _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel",
1097                          &kprop, sizeof(kprop)));
1098             if (spapr->kernel_le) {
1099                 _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel-le", NULL, 0));
1100             }
1101         }
1102         if (machine->boot_config.has_menu && machine->boot_config.menu) {
1103             _FDT((fdt_setprop_cell(fdt, chosen, "qemu,boot-menu", true)));
1104         }
1105         _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-width", graphic_width));
1106         _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-height", graphic_height));
1107         _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-depth", graphic_depth));
1108 
1109         if (cb && bootlist) {
1110             int i;
1111 
1112             for (i = 0; i < cb; i++) {
1113                 if (bootlist[i] == '\n') {
1114                     bootlist[i] = ' ';
1115                 }
1116             }
1117             _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-list", bootlist));
1118         }
1119 
1120         if (boot_device && strlen(boot_device)) {
1121             _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-device", boot_device));
1122         }
1123 
1124         if (spapr->want_stdout_path && stdout_path) {
1125             /*
1126              * "linux,stdout-path" and "stdout" properties are
1127              * deprecated by linux kernel. New platforms should only
1128              * use the "stdout-path" property. Set the new property
1129              * and continue using older property to remain compatible
1130              * with the existing firmware.
1131              */
1132             _FDT(fdt_setprop_string(fdt, chosen, "linux,stdout-path", stdout_path));
1133             _FDT(fdt_setprop_string(fdt, chosen, "stdout-path", stdout_path));
1134         }
1135 
1136         /*
1137          * We can deal with BAR reallocation just fine, advertise it
1138          * to the guest
1139          */
1140         if (smc->linux_pci_probe) {
1141             _FDT(fdt_setprop_cell(fdt, chosen, "linux,pci-probe-only", 0));
1142         }
1143 
1144         spapr_dt_ov5_platform_support(spapr, fdt, chosen);
1145     }
1146 
1147     _FDT(fdt_setprop(fdt, chosen, "rng-seed", spapr->fdt_rng_seed, 32));
1148 
1149     _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5"));
1150 }
1151 
1152 static void spapr_dt_hypervisor(SpaprMachineState *spapr, void *fdt)
1153 {
1154     /* The /hypervisor node isn't in PAPR - this is a hack to allow PR
1155      * KVM to work under pHyp with some guest co-operation */
1156     int hypervisor;
1157     uint8_t hypercall[16];
1158 
1159     _FDT(hypervisor = fdt_add_subnode(fdt, 0, "hypervisor"));
1160     /* indicate KVM hypercall interface */
1161     _FDT(fdt_setprop_string(fdt, hypervisor, "compatible", "linux,kvm"));
1162     if (kvmppc_has_cap_fixup_hcalls()) {
1163         /*
1164          * Older KVM versions with older guest kernels were broken
1165          * with the magic page, don't allow the guest to map it.
1166          */
1167         if (!kvmppc_get_hypercall(cpu_env(first_cpu), hypercall,
1168                                   sizeof(hypercall))) {
1169             _FDT(fdt_setprop(fdt, hypervisor, "hcall-instructions",
1170                              hypercall, sizeof(hypercall)));
1171         }
1172     }
1173 }
1174 
/*
 * Build a complete flattened device tree for the machine.
 *
 * @spapr: machine whose state is described
 * @reset: when true, the boot-time parts are included as well: this
 *         flag is forwarded to spapr_dt_chosen() and enables the
 *         memory reserve map entries for the kernel and the initrd
 * @space: size in bytes of the buffer allocated for the tree
 *
 * Returns the zero-initialised g_malloc0() buffer holding the tree.
 * Nothing in this function frees it, so the caller presumably owns it
 * (NOTE(review): confirm against the callers).
 *
 * Failures do not return: the memory, rng, PCI and CPU-DR steps call
 * exit(1) after error_report(), the remaining steps are checked by
 * _FDT().
 */
void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, size_t space)
{
    MachineState *machine = MACHINE(spapr);
    MachineClass *mc = MACHINE_GET_CLASS(machine);
    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
    uint32_t root_drc_type_mask = 0;
    int ret;
    void *fdt;
    SpaprPhbState *phb;
    char *buf;

    fdt = g_malloc0(space);
    _FDT((fdt_create_empty_tree(fdt, space)));

    /* Root node */
    _FDT(fdt_setprop_string(fdt, 0, "device_type", "chrp"));
    _FDT(fdt_setprop_string(fdt, 0, "model", "IBM pSeries (emulated by qemu)"));
    _FDT(fdt_setprop_string(fdt, 0, "compatible", "qemu,pseries"));

    /* Guest UUID & Name*/
    buf = qemu_uuid_unparse_strdup(&qemu_uuid);
    _FDT(fdt_setprop_string(fdt, 0, "vm,uuid", buf));
    /* "system-id" is only published when qemu_uuid_set is true */
    if (qemu_uuid_set) {
        _FDT(fdt_setprop_string(fdt, 0, "system-id", buf));
    }
    g_free(buf);

    if (qemu_get_vm_name()) {
        _FDT(fdt_setprop_string(fdt, 0, "ibm,partition-name",
                                qemu_get_vm_name()));
    }

    /*
     * Host Model & Serial Number: a value configured on the machine
     * wins; otherwise the host's value is used, but only for machine
     * classes that set broken_host_serial_model.
     */
    if (spapr->host_model) {
        _FDT(fdt_setprop_string(fdt, 0, "host-model", spapr->host_model));
    } else if (smc->broken_host_serial_model && kvmppc_get_host_model(&buf)) {
        _FDT(fdt_setprop_string(fdt, 0, "host-model", buf));
        g_free(buf);
    }

    if (spapr->host_serial) {
        _FDT(fdt_setprop_string(fdt, 0, "host-serial", spapr->host_serial));
    } else if (smc->broken_host_serial_model && kvmppc_get_host_serial(&buf)) {
        _FDT(fdt_setprop_string(fdt, 0, "host-serial", buf));
        g_free(buf);
    }

    _FDT(fdt_setprop_cell(fdt, 0, "#address-cells", 2));
    _FDT(fdt_setprop_cell(fdt, 0, "#size-cells", 2));

    /* /interrupt controller */
    spapr_irq_dt(spapr, spapr_max_server_number(spapr), fdt, PHANDLE_INTC);

    ret = spapr_dt_memory(spapr, fdt);
    if (ret < 0) {
        error_report("couldn't setup memory nodes in fdt");
        exit(1);
    }

    /* /vdevice */
    spapr_dt_vdevice(spapr->vio_bus, fdt);

    /* The rng nodes are only added if a TYPE_SPAPR_RNG object exists */
    if (object_resolve_path_type("", TYPE_SPAPR_RNG, NULL)) {
        ret = spapr_dt_rng(fdt);
        if (ret < 0) {
            error_report("could not set up rng device in the fdt");
            exit(1);
        }
    }

    /* One spapr_dt_phb() call per PHB on the machine's list */
    QLIST_FOREACH(phb, &spapr->phbs, list) {
        ret = spapr_dt_phb(spapr, phb, PHANDLE_INTC, fdt, NULL);
        if (ret < 0) {
            error_report("couldn't setup PCI devices in fdt");
            exit(1);
        }
    }

    spapr_dt_cpus(fdt, spapr);

    /* ibm,drc-indexes and friends */
    root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_LMB;
    if (smc->dr_phb_enabled) {
        root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_PHB;
    }
    if (mc->nvdimm_supported) {
        root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_PMEM;
    }
    /* The LMB type is OR-ed in unconditionally just above */
    if (root_drc_type_mask) {
        _FDT(spapr_dt_drc(fdt, 0, NULL, root_drc_type_mask));
    }

    /* CPU connectors are attached to the /cpus node, not to the root */
    if (mc->has_hotpluggable_cpus) {
        int offset = fdt_path_offset(fdt, "/cpus");
        ret = spapr_dt_drc(fdt, offset, NULL, SPAPR_DR_CONNECTOR_TYPE_CPU);
        if (ret < 0) {
            error_report("Couldn't set up CPU DR device tree properties");
            exit(1);
        }
    }

    /* /event-sources */
    spapr_dt_events(spapr, fdt);

    /* /rtas */
    spapr_dt_rtas(spapr, fdt);

    /* /chosen */
    spapr_dt_chosen(spapr, fdt, reset);

    /* /hypervisor */
    if (kvm_enabled()) {
        spapr_dt_hypervisor(spapr, fdt);
    }

    /* Build memory reserve map */
    if (reset) {
        if (spapr->kernel_size) {
            _FDT((fdt_add_mem_rsv(fdt, spapr->kernel_addr,
                                  spapr->kernel_size)));
        }
        if (spapr->initrd_size) {
            _FDT((fdt_add_mem_rsv(fdt, spapr->initrd_base,
                                  spapr->initrd_size)));
        }
    }

    /* NVDIMM devices */
    if (mc->nvdimm_supported) {
        spapr_dt_persistent_memory(spapr, fdt);
    }

    return fdt;
}
1309 
1310 static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
1311 {
1312     SpaprMachineState *spapr = opaque;
1313 
1314     return (addr & 0x0fffffff) + spapr->kernel_addr;
1315 }
1316 
1317 static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp,
1318                                     PowerPCCPU *cpu)
1319 {
1320     CPUPPCState *env = &cpu->env;
1321 
1322     /* The TCG path should also be holding the BQL at this point */
1323     g_assert(bql_locked());
1324 
1325     g_assert(!vhyp_cpu_in_nested(cpu));
1326 
1327     if (FIELD_EX64(env->msr, MSR, PR)) {
1328         hcall_dprintf("Hypercall made with MSR[PR]=1\n");
1329         env->gpr[3] = H_PRIVILEGE;
1330     } else {
1331         env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
1332     }
1333 }
1334 
/*
 * LPCR update request handed to do_lpcr_sync(): the bits in @mask are
 * cleared from the register, then @value is OR-ed in.
 */
struct LPCRSyncState {
    target_ulong value; /* bits to set */
    target_ulong mask;  /* bits to clear before setting @value */
};
1339 
1340 static void do_lpcr_sync(CPUState *cs, run_on_cpu_data arg)
1341 {
1342     struct LPCRSyncState *s = arg.host_ptr;
1343     PowerPCCPU *cpu = POWERPC_CPU(cs);
1344     CPUPPCState *env = &cpu->env;
1345     target_ulong lpcr;
1346 
1347     cpu_synchronize_state(cs);
1348     lpcr = env->spr[SPR_LPCR];
1349     lpcr &= ~s->mask;
1350     lpcr |= s->value;
1351     ppc_store_lpcr(cpu, lpcr);
1352 }
1353 
1354 void spapr_set_all_lpcrs(target_ulong value, target_ulong mask)
1355 {
1356     CPUState *cs;
1357     struct LPCRSyncState s = {
1358         .value = value,
1359         .mask = mask
1360     };
1361     CPU_FOREACH(cs) {
1362         run_on_cpu(cs, do_lpcr_sync, RUN_ON_CPU_HOST_PTR(&s));
1363     }
1364 }
1365 
1366 /* May be used when the machine is not running */
1367 void spapr_init_all_lpcrs(target_ulong value, target_ulong mask)
1368 {
1369     CPUState *cs;
1370     CPU_FOREACH(cs) {
1371         PowerPCCPU *cpu = POWERPC_CPU(cs);
1372         CPUPPCState *env = &cpu->env;
1373         target_ulong lpcr;
1374 
1375         lpcr = env->spr[SPR_LPCR];
1376         lpcr &= ~(LPCR_HR | LPCR_UPRT);
1377         ppc_store_lpcr(cpu, lpcr);
1378     }
1379 }
1380 
1381 static bool spapr_get_pate(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu,
1382                            target_ulong lpid, ppc_v3_pate_t *entry)
1383 {
1384     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1385     SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
1386 
1387     if (!spapr_cpu->in_nested) {
1388         assert(lpid == 0);
1389 
1390         /* Copy PATE1:GR into PATE0:HR */
1391         entry->dw0 = spapr->patb_entry & PATE0_HR;
1392         entry->dw1 = spapr->patb_entry;
1393         return true;
1394     } else {
1395         if (spapr_nested_api(spapr) == NESTED_API_KVM_HV) {
1396             return spapr_get_pate_nested_hv(spapr, cpu, lpid, entry);
1397         } else if (spapr_nested_api(spapr) == NESTED_API_PAPR) {
1398             return spapr_get_pate_nested_papr(spapr, cpu, lpid, entry);
1399         } else {
1400             g_assert_not_reached();
1401         }
1402     }
1403 }
1404 
1405 static uint64_t *hpte_get_ptr(SpaprMachineState *s, unsigned index)
1406 {
1407     uint64_t *table = s->htab;
1408 
1409     return &table[2 * index];
1410 }
1411 
1412 static bool hpte_is_valid(SpaprMachineState *s, unsigned index)
1413 {
1414     return ldq_be_p(hpte_get_ptr(s, index)) & HPTE64_V_VALID;
1415 }
1416 
1417 static bool hpte_is_dirty(SpaprMachineState *s, unsigned index)
1418 {
1419     return ldq_be_p(hpte_get_ptr(s, index)) & HPTE64_V_HPTE_DIRTY;
1420 }
1421 
1422 static void hpte_set_clean(SpaprMachineState *s, unsigned index)
1423 {
1424     stq_be_p(hpte_get_ptr(s, index),
1425              ldq_be_p(hpte_get_ptr(s, index)) & ~HPTE64_V_HPTE_DIRTY);
1426 }
1427 
/*
 * Set the HPTE64_V_HPTE_DIRTY flag, converted with tswap64(), in the
 * 64-bit word that @_hpte points to.  The word is updated in place.
 */
#define DIRTY_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) |= tswap64(HPTE64_V_HPTE_DIRTY))
1429 
1430 /*
1431  * Get the fd to access the kernel htab, re-opening it if necessary
1432  */
1433 static int get_htab_fd(SpaprMachineState *spapr)
1434 {
1435     Error *local_err = NULL;
1436 
1437     if (spapr->htab_fd >= 0) {
1438         return spapr->htab_fd;
1439     }
1440 
1441     spapr->htab_fd = kvmppc_get_htab_fd(false, 0, &local_err);
1442     if (spapr->htab_fd < 0) {
1443         error_report_err(local_err);
1444     }
1445 
1446     return spapr->htab_fd;
1447 }
1448 
1449 void close_htab_fd(SpaprMachineState *spapr)
1450 {
1451     if (spapr->htab_fd >= 0) {
1452         close(spapr->htab_fd);
1453     }
1454     spapr->htab_fd = -1;
1455 }
1456 
1457 static hwaddr spapr_hpt_mask(PPCVirtualHypervisor *vhyp)
1458 {
1459     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1460 
1461     return HTAB_SIZE(spapr) / HASH_PTEG_SIZE_64 - 1;
1462 }
1463 
1464 static target_ulong spapr_encode_hpt_for_kvm_pr(PPCVirtualHypervisor *vhyp)
1465 {
1466     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1467 
1468     assert(kvm_enabled());
1469 
1470     if (!spapr->htab) {
1471         return 0;
1472     }
1473 
1474     return (target_ulong)(uintptr_t)spapr->htab | (spapr->htab_shift - 18);
1475 }
1476 
1477 static const ppc_hash_pte64_t *spapr_map_hptes(PPCVirtualHypervisor *vhyp,
1478                                                 hwaddr ptex, int n)
1479 {
1480     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1481     hwaddr pte_offset = ptex * HASH_PTE_SIZE_64;
1482 
1483     if (!spapr->htab) {
1484         /*
1485          * HTAB is controlled by KVM. Fetch into temporary buffer
1486          */
1487         ppc_hash_pte64_t *hptes = g_malloc(n * HASH_PTE_SIZE_64);
1488         kvmppc_read_hptes(hptes, ptex, n);
1489         return hptes;
1490     }
1491 
1492     /*
1493      * HTAB is controlled by QEMU. Just point to the internally
1494      * accessible PTEG.
1495      */
1496     return (const ppc_hash_pte64_t *)(spapr->htab + pte_offset);
1497 }
1498 
1499 static void spapr_unmap_hptes(PPCVirtualHypervisor *vhyp,
1500                               const ppc_hash_pte64_t *hptes,
1501                               hwaddr ptex, int n)
1502 {
1503     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1504 
1505     if (!spapr->htab) {
1506         g_free((void *)hptes);
1507     }
1508 
1509     /* Nothing to do for qemu managed HPT */
1510 }
1511 
1512 void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
1513                       uint64_t pte0, uint64_t pte1)
1514 {
1515     SpaprMachineState *spapr = SPAPR_MACHINE(cpu->vhyp);
1516     hwaddr offset = ptex * HASH_PTE_SIZE_64;
1517 
1518     if (!spapr->htab) {
1519         kvmppc_write_hpte(ptex, pte0, pte1);
1520     } else {
1521         if (pte0 & HPTE64_V_VALID) {
1522             stq_p(spapr->htab + offset + HPTE64_DW1, pte1);
1523             /*
1524              * When setting valid, we write PTE1 first. This ensures
1525              * proper synchronization with the reading code in
1526              * ppc_hash64_pteg_search()
1527              */
1528             smp_wmb();
1529             stq_p(spapr->htab + offset, pte0);
1530         } else {
1531             stq_p(spapr->htab + offset, pte0);
1532             /*
1533              * When clearing it we set PTE0 first. This ensures proper
1534              * synchronization with the reading code in
1535              * ppc_hash64_pteg_search()
1536              */
1537             smp_wmb();
1538             stq_p(spapr->htab + offset + HPTE64_DW1, pte1);
1539         }
1540     }
1541 }
1542 
1543 static void spapr_hpte_set_c(PPCVirtualHypervisor *vhyp, hwaddr ptex,
1544                              uint64_t pte1)
1545 {
1546     hwaddr offset = ptex * HASH_PTE_SIZE_64 + HPTE64_DW1_C;
1547     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1548 
1549     if (!spapr->htab) {
1550         /* There should always be a hash table when this is called */
1551         error_report("spapr_hpte_set_c called with no hash table !");
1552         return;
1553     }
1554 
1555     /* The HW performs a non-atomic byte update */
1556     stb_p(spapr->htab + offset, (pte1 & 0xff) | 0x80);
1557 }
1558 
1559 static void spapr_hpte_set_r(PPCVirtualHypervisor *vhyp, hwaddr ptex,
1560                              uint64_t pte1)
1561 {
1562     hwaddr offset = ptex * HASH_PTE_SIZE_64 + HPTE64_DW1_R;
1563     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1564 
1565     if (!spapr->htab) {
1566         /* There should always be a hash table when this is called */
1567         error_report("spapr_hpte_set_r called with no hash table !");
1568         return;
1569     }
1570 
1571     /* The HW performs a non-atomic byte update */
1572     stb_p(spapr->htab + offset, ((pte1 >> 8) & 0xff) | 0x01);
1573 }
1574 
/*
 * Pick the hash table order (log2 of its size in bytes) for a guest
 * with @ramsize bytes of RAM.
 *
 * The target is a table 1/128 the size of RAM, rounded up.  PAPR
 * actually recommends 1/64 of RAM size, but that is much more than
 * Linux guests need.  The result is clamped to the architected range
 * of 18..46.
 */
int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
{
    int shift = ctz64(pow2ceil(ramsize)) - 7;

    if (shift < 18) {
        /* Minimum architected size */
        shift = 18;
    } else if (shift > 46) {
        /* Maximum architected size */
        shift = 46;
    }
    return shift;
}
1587 
1588 void spapr_free_hpt(SpaprMachineState *spapr)
1589 {
1590     qemu_vfree(spapr->htab);
1591     spapr->htab = NULL;
1592     spapr->htab_shift = 0;
1593     close_htab_fd(spapr);
1594 }
1595 
1596 int spapr_reallocate_hpt(SpaprMachineState *spapr, int shift, Error **errp)
1597 {
1598     ERRP_GUARD();
1599     long rc;
1600 
1601     /* Clean up any HPT info from a previous boot */
1602     spapr_free_hpt(spapr);
1603 
1604     rc = kvmppc_reset_htab(shift);
1605 
1606     if (rc == -EOPNOTSUPP) {
1607         error_setg(errp, "HPT not supported in nested guests");
1608         return -EOPNOTSUPP;
1609     }
1610 
1611     if (rc < 0) {
1612         /* kernel-side HPT needed, but couldn't allocate one */
1613         error_setg_errno(errp, errno, "Failed to allocate KVM HPT of order %d",
1614                          shift);
1615         error_append_hint(errp, "Try smaller maxmem?\n");
1616         return -errno;
1617     } else if (rc > 0) {
1618         /* kernel-side HPT allocated */
1619         if (rc != shift) {
1620             error_setg(errp,
1621                        "Requested order %d HPT, but kernel allocated order %ld",
1622                        shift, rc);
1623             error_append_hint(errp, "Try smaller maxmem?\n");
1624             return -ENOSPC;
1625         }
1626 
1627         spapr->htab_shift = shift;
1628         spapr->htab = NULL;
1629     } else {
1630         /* kernel-side HPT not needed, allocate in userspace instead */
1631         size_t size = 1ULL << shift;
1632         int i;
1633 
1634         spapr->htab = qemu_memalign(size, size);
1635         memset(spapr->htab, 0, size);
1636         spapr->htab_shift = shift;
1637 
1638         for (i = 0; i < size / HASH_PTE_SIZE_64; i++) {
1639             DIRTY_HPTE(hpte_get_ptr(spapr, i));
1640         }
1641     }
1642     /* We're setting up a hash table, so that means we're not radix */
1643     spapr->patb_entry = 0;
1644     spapr_init_all_lpcrs(0, LPCR_HR | LPCR_UPRT);
1645     return 0;
1646 }
1647 
1648 void spapr_setup_hpt(SpaprMachineState *spapr)
1649 {
1650     int hpt_shift;
1651 
1652     if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) {
1653         hpt_shift = spapr_hpt_shift_for_ramsize(MACHINE(spapr)->maxram_size);
1654     } else {
1655         uint64_t current_ram_size;
1656 
1657         current_ram_size = MACHINE(spapr)->ram_size + get_plugged_memory_size();
1658         hpt_shift = spapr_hpt_shift_for_ramsize(current_ram_size);
1659     }
1660     spapr_reallocate_hpt(spapr, hpt_shift, &error_fatal);
1661 
1662     if (kvm_enabled()) {
1663         hwaddr vrma_limit = kvmppc_vrma_limit(spapr->htab_shift);
1664 
1665         /* Check our RMA fits in the possible VRMA */
1666         if (vrma_limit < spapr->rma_size) {
1667             error_report("Unable to create %" HWADDR_PRIu
1668                          "MiB RMA (VRMA only allows %" HWADDR_PRIu "MiB",
1669                          spapr->rma_size / MiB, vrma_limit / MiB);
1670             exit(EXIT_FAILURE);
1671         }
1672     }
1673 }
1674 
1675 void spapr_check_mmu_mode(bool guest_radix)
1676 {
1677     if (guest_radix) {
1678         if (kvm_enabled() && !kvmppc_has_cap_mmu_radix()) {
1679             error_report("Guest requested unavailable MMU mode (radix).");
1680             exit(EXIT_FAILURE);
1681         }
1682     } else {
1683         if (kvm_enabled() && kvmppc_has_cap_mmu_radix()
1684             && !kvmppc_has_cap_mmu_hash_v3()) {
1685             error_report("Guest requested unavailable MMU mode (hash).");
1686             exit(EXIT_FAILURE);
1687         }
1688     }
1689 }
1690 
/*
 * Machine-level reset handler.
 *
 * @machine: the machine being reset (an sPAPR machine).
 * @type:    the kind of reset; RESET_TYPE_SNAPSHOT_LOAD skips reseeding
 *           the FDT RNG seed.
 *
 * In order, this: refreshes the FDT RNG seed, resets confidential-guest
 * state if present, applies capabilities, resets nested state, selects
 * radix (KVM with radix available and an ISA 3.00 compatible CPU) or sets
 * up a hash page table, resets all devices, resets the CAS option vector,
 * CPU compat mode and interrupt controller, resets DRCs, rebuilds the
 * device tree and (unless VOF is used) loads it into guest memory, and
 * finally clears the FWNMI registration state.
 */
static void spapr_machine_reset(MachineState *machine, ResetType type)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(machine);
    PowerPCCPU *first_ppc_cpu;
    hwaddr fdt_addr;
    void *fdt;
    int rc;

    if (type != RESET_TYPE_SNAPSHOT_LOAD) {
        /*
         * Record-replay snapshot load must not consume random, this was
         * already replayed from initial machine reset.
         */
        qemu_guest_getrandom_nofail(spapr->fdt_rng_seed, 32);
    }

    if (machine->cgs) {
        confidential_guest_kvm_reset(machine->cgs, &error_fatal);
    }
    spapr_caps_apply(spapr);
    spapr_nested_reset(spapr);

    first_ppc_cpu = POWERPC_CPU(first_cpu);
    if (kvm_enabled() && kvmppc_has_cap_mmu_radix() &&
        ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
                              spapr->max_compat_pvr)) {
        /*
         * If using KVM with radix mode available, VCPUs can be started
         * without a HPT because KVM will start them in radix mode.
         * Set the GR bit in PATE so that we know there is no HPT.
         */
        spapr->patb_entry = PATE1_GR;
        spapr_set_all_lpcrs(LPCR_HR | LPCR_UPRT, LPCR_HR | LPCR_UPRT);
    } else {
        spapr_setup_hpt(spapr);
    }

    qemu_devices_reset(type);

    /* Start from an empty set of CAS-negotiated options. */
    spapr_ovec_cleanup(spapr->ov5_cas);
    spapr->ov5_cas = spapr_ovec_new();

    ppc_init_compat_all(spapr->max_compat_pvr, &error_fatal);

    /*
     * This is fixing some of the default configuration of the XIVE
     * devices. To be called after the reset of the machine devices.
     */
    spapr_irq_reset(spapr, &error_fatal);

    /*
     * There is no CAS under qtest. Simulate one to please the code that
     * depends on spapr->ov5_cas. This is especially needed to test device
     * unplug, so we do that before resetting the DRCs.
     */
    if (qtest_enabled()) {
        spapr_ovec_cleanup(spapr->ov5_cas);
        spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
    }

    spapr_nvdimm_finish_flushes();

    /* DRC reset may cause a device to be unplugged. This will cause troubles
     * if this device is used by another device (eg, a running vhost backend
     * will crash QEMU if the DIMM holding the vring goes away). To avoid such
     * situations, we reset DRCs after all devices have been reset.
     */
    spapr_drc_reset_all(spapr);

    spapr_clear_pending_events(spapr);

    /*
     * We place the device tree just below either the top of the RMA,
     * or just below 2GB, whichever is lower, so that it can be
     * processed with 32-bit real mode code if necessary
     */
    fdt_addr = MIN(spapr->rma_size, FDT_MAX_ADDR) - FDT_MAX_SIZE;

    fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE);
    if (spapr->vof) {
        spapr_vof_reset(spapr, fdt, &error_fatal);
        /*
         * Do not pack the FDT as the client may change properties.
         * VOF client does not expect the FDT so we do not load it to the VM.
         */
    } else {
        rc = fdt_pack(fdt);
        /* Should only fail if we've built a corrupted tree */
        assert(rc == 0);

        /* Point the boot CPU at the entry point and copy the FDT to RAM. */
        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
                                  0, fdt_addr, 0);
        cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
    }

    /* Replace the previously cached blob with the freshly built tree. */
    g_free(spapr->fdt_blob);
    spapr->fdt_size = fdt_totalsize(fdt);
    spapr->fdt_initial_size = spapr->fdt_size;
    spapr->fdt_blob = fdt;

    /* Set machine->fdt for 'dumpdtb' QMP/HMP command */
    machine->fdt = fdt;

    /* Set up the entry state */
    first_ppc_cpu->env.gpr[5] = 0;

    /* -1 marks the FWNMI handler addresses and interlock as unset. */
    spapr->fwnmi_system_reset_addr = -1;
    spapr->fwnmi_machine_check_addr = -1;
    spapr->fwnmi_machine_check_interlock = -1;

    /* Signal all vCPUs waiting on this condition */
    qemu_cond_broadcast(&spapr->fwnmi_machine_check_interlock_cond);

    migrate_del_blocker(&spapr->fwnmi_migration_blocker);
}
1806 
1807 static void spapr_create_nvram(SpaprMachineState *spapr)
1808 {
1809     DeviceState *dev = qdev_new("spapr-nvram");
1810     DriveInfo *dinfo = drive_get(IF_PFLASH, 0, 0);
1811 
1812     if (dinfo) {
1813         qdev_prop_set_drive_err(dev, "drive", blk_by_legacy_dinfo(dinfo),
1814                                 &error_fatal);
1815     }
1816 
1817     qdev_realize_and_unref(dev, &spapr->vio_bus->bus, &error_fatal);
1818 
1819     spapr->nvram = (struct SpaprNvram *)dev;
1820 }
1821 
1822 static void spapr_rtc_create(SpaprMachineState *spapr)
1823 {
1824     object_initialize_child_with_props(OBJECT(spapr), "rtc", &spapr->rtc,
1825                                        sizeof(spapr->rtc), TYPE_SPAPR_RTC,
1826                                        &error_fatal, NULL);
1827     qdev_realize(DEVICE(&spapr->rtc), NULL, &error_fatal);
1828     object_property_add_alias(OBJECT(spapr), "rtc-time", OBJECT(&spapr->rtc),
1829                               "date");
1830 }
1831 
1832 /* Returns whether we want to use VGA or not */
1833 static bool spapr_vga_init(PCIBus *pci_bus, Error **errp)
1834 {
1835     vga_interface_created = true;
1836     switch (vga_interface_type) {
1837     case VGA_NONE:
1838         return false;
1839     case VGA_DEVICE:
1840         return true;
1841     case VGA_STD:
1842     case VGA_VIRTIO:
1843     case VGA_CIRRUS:
1844         return pci_vga_init(pci_bus) != NULL;
1845     default:
1846         error_setg(errp,
1847                    "Unsupported VGA mode, only -vga std or -vga virtio is supported");
1848         return false;
1849     }
1850 }
1851 
/*
 * pre_load hook of the "spapr" VMState: only the capability state needs
 * preparing, so its status (0 on success) is returned unchanged.
 */
static int spapr_pre_load(void *opaque)
{
    return spapr_caps_pre_load(opaque);
}
1863 
1864 static int spapr_post_load(void *opaque, int version_id)
1865 {
1866     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1867     int err = 0;
1868 
1869     err = spapr_caps_post_migration(spapr);
1870     if (err) {
1871         return err;
1872     }
1873 
1874     /*
1875      * In earlier versions, there was no separate qdev for the PAPR
1876      * RTC, so the RTC offset was stored directly in sPAPREnvironment.
1877      * So when migrating from those versions, poke the incoming offset
1878      * value into the RTC device
1879      */
1880     if (version_id < 3) {
1881         err = spapr_rtc_import_offset(&spapr->rtc, spapr->rtc_offset);
1882         if (err) {
1883             return err;
1884         }
1885     }
1886 
1887     if (kvm_enabled() && spapr->patb_entry) {
1888         PowerPCCPU *cpu = POWERPC_CPU(first_cpu);
1889         bool radix = !!(spapr->patb_entry & PATE1_GR);
1890         bool gtse = !!(cpu->env.spr[SPR_LPCR] & LPCR_GTSE);
1891 
1892         /*
1893          * Update LPCR:HR and UPRT as they may not be set properly in
1894          * the stream
1895          */
1896         spapr_set_all_lpcrs(radix ? (LPCR_HR | LPCR_UPRT) : 0,
1897                             LPCR_HR | LPCR_UPRT);
1898 
1899         err = kvmppc_configure_v3_mmu(cpu, radix, gtse, spapr->patb_entry);
1900         if (err) {
1901             error_report("Process table config unsupported by the host");
1902             return -EINVAL;
1903         }
1904     }
1905 
1906     err = spapr_irq_post_load(spapr, version_id);
1907     if (err) {
1908         return err;
1909     }
1910 
1911     return err;
1912 }
1913 
/*
 * pre_save hook of the "spapr" VMState: delegates to the capability code
 * and returns its status (0 on success) unchanged.
 */
static int spapr_pre_save(void *opaque)
{
    return spapr_caps_pre_save(opaque);
}
1925 
/*
 * VMState field test: true for stream versions older than 3 (i.e. 2 or
 * below).  @opaque is not used.
 */
static bool version_before_3(void *opaque, int version_id)
{
    return version_id <= 2;
}
1930 
1931 static bool spapr_pending_events_needed(void *opaque)
1932 {
1933     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1934     return !QTAILQ_EMPTY(&spapr->pending_events);
1935 }
1936 
/*
 * Migration layout of one SpaprEventLogEntry: the summary word, the
 * length of the extended log, then the extended log buffer itself, whose
 * size is taken from extended_length.  Field order is the wire format.
 */
static const VMStateDescription vmstate_spapr_event_entry = {
    .name = "spapr_event_log_entry",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (const VMStateField[]) {
        VMSTATE_UINT32(summary, SpaprEventLogEntry),
        VMSTATE_UINT32(extended_length, SpaprEventLogEntry),
        /* Must follow extended_length, which gives this buffer's size. */
        VMSTATE_VBUFFER_ALLOC_UINT32(extended_log, SpaprEventLogEntry, 0,
                                     NULL, extended_length),
        VMSTATE_END_OF_LIST()
    },
};
1949 
/*
 * Subsection carrying the machine's pending_events queue, one
 * vmstate_spapr_event_entry per element.  Only sent when
 * spapr_pending_events_needed() returns true.
 */
static const VMStateDescription vmstate_spapr_pending_events = {
    .name = "spapr_pending_events",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_pending_events_needed,
    .fields = (const VMStateField[]) {
        VMSTATE_QTAILQ_V(pending_events, SpaprMachineState, 1,
                         vmstate_spapr_event_entry, SpaprEventLogEntry, next),
        VMSTATE_END_OF_LIST()
    },
};
1961 
1962 static bool spapr_ov5_cas_needed(void *opaque)
1963 {
1964     SpaprMachineState *spapr = opaque;
1965     SpaprOptionVector *ov5_mask = spapr_ovec_new();
1966     bool cas_needed;
1967 
1968     /* Prior to the introduction of SpaprOptionVector, we had two option
1969      * vectors we dealt with: OV5_FORM1_AFFINITY, and OV5_DRCONF_MEMORY.
1970      * Both of these options encode machine topology into the device-tree
1971      * in such a way that the now-booted OS should still be able to interact
1972      * appropriately with QEMU regardless of what options were actually
1973      * negotiatied on the source side.
1974      *
1975      * As such, we can avoid migrating the CAS-negotiated options if these
1976      * are the only options available on the current machine/platform.
1977      * Since these are the only options available for pseries-2.7 and
1978      * earlier, this allows us to maintain old->new/new->old migration
1979      * compatibility.
1980      *
1981      * For QEMU 2.8+, there are additional CAS-negotiatable options available
1982      * via default pseries-2.8 machines and explicit command-line parameters.
1983      * Some of these options, like OV5_HP_EVT, *do* require QEMU to be aware
1984      * of the actual CAS-negotiated values to continue working properly. For
1985      * example, availability of memory unplug depends on knowing whether
1986      * OV5_HP_EVT was negotiated via CAS.
1987      *
1988      * Thus, for any cases where the set of available CAS-negotiatable
1989      * options extends beyond OV5_FORM1_AFFINITY and OV5_DRCONF_MEMORY, we
1990      * include the CAS-negotiated options in the migration stream, unless
1991      * if they affect boot time behaviour only.
1992      */
1993     spapr_ovec_set(ov5_mask, OV5_FORM1_AFFINITY);
1994     spapr_ovec_set(ov5_mask, OV5_DRCONF_MEMORY);
1995     spapr_ovec_set(ov5_mask, OV5_DRMEM_V2);
1996 
1997     /* We need extra information if we have any bits outside the mask
1998      * defined above */
1999     cas_needed = !spapr_ovec_subset(spapr->ov5, ov5_mask);
2000 
2001     spapr_ovec_cleanup(ov5_mask);
2002 
2003     return cas_needed;
2004 }
2005 
/*
 * Subsection carrying the CAS-negotiated option vector (ov5_cas), laid
 * out per vmstate_spapr_ovec.  Only sent when spapr_ov5_cas_needed()
 * returns true.
 */
static const VMStateDescription vmstate_spapr_ov5_cas = {
    .name = "spapr_option_vector_ov5_cas",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_ov5_cas_needed,
    .fields = (const VMStateField[]) {
        VMSTATE_STRUCT_POINTER_V(ov5_cas, SpaprMachineState, 1,
                                 vmstate_spapr_ovec, SpaprOptionVector),
        VMSTATE_END_OF_LIST()
    },
};
2017 
2018 static bool spapr_patb_entry_needed(void *opaque)
2019 {
2020     SpaprMachineState *spapr = opaque;
2021 
2022     return !!spapr->patb_entry;
2023 }
2024 
/*
 * Subsection carrying the 64-bit patb_entry value.  Only sent when
 * spapr_patb_entry_needed() returns true, i.e. when it is non-zero.
 */
static const VMStateDescription vmstate_spapr_patb_entry = {
    .name = "spapr_patb_entry",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_patb_entry_needed,
    .fields = (const VMStateField[]) {
        VMSTATE_UINT64(patb_entry, SpaprMachineState),
        VMSTATE_END_OF_LIST()
    },
};
2035 
2036 static bool spapr_irq_map_needed(void *opaque)
2037 {
2038     SpaprMachineState *spapr = opaque;
2039 
2040     return spapr->irq_map && !bitmap_empty(spapr->irq_map, spapr->irq_map_nr);
2041 }
2042 
/*
 * Subsection carrying the irq_map bitmap, irq_map_nr bits long.  Only
 * sent when spapr_irq_map_needed() returns true.
 */
static const VMStateDescription vmstate_spapr_irq_map = {
    .name = "spapr_irq_map",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_irq_map_needed,
    .fields = (const VMStateField[]) {
        VMSTATE_BITMAP(irq_map, SpaprMachineState, 0, irq_map_nr),
        VMSTATE_END_OF_LIST()
    },
};
2053 
2054 static bool spapr_dtb_needed(void *opaque)
2055 {
2056     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(opaque);
2057 
2058     return smc->update_dt_enabled;
2059 }
2060 
2061 static int spapr_dtb_pre_load(void *opaque)
2062 {
2063     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
2064 
2065     g_free(spapr->fdt_blob);
2066     spapr->fdt_blob = NULL;
2067     spapr->fdt_size = 0;
2068 
2069     return 0;
2070 }
2071 
/*
 * Subsection carrying the machine's device tree: its initial size, its
 * current size, and the blob itself (fdt_size bytes).  Only sent when
 * spapr_dtb_needed() returns true; spapr_dtb_pre_load() drops the local
 * blob before the incoming one is read.
 */
static const VMStateDescription vmstate_spapr_dtb = {
    .name = "spapr_dtb",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_dtb_needed,
    .pre_load = spapr_dtb_pre_load,
    .fields = (const VMStateField[]) {
        VMSTATE_UINT32(fdt_initial_size, SpaprMachineState),
        VMSTATE_UINT32(fdt_size, SpaprMachineState),
        /* Must follow fdt_size, which gives this buffer's size. */
        VMSTATE_VBUFFER_ALLOC_UINT32(fdt_blob, SpaprMachineState, 0, NULL,
                                     fdt_size),
        VMSTATE_END_OF_LIST()
    },
};
2086 
2087 static bool spapr_fwnmi_needed(void *opaque)
2088 {
2089     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
2090 
2091     return spapr->fwnmi_machine_check_addr != -1;
2092 }
2093 
2094 static int spapr_fwnmi_pre_save(void *opaque)
2095 {
2096     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
2097 
2098     /*
2099      * Check if machine check handling is in progress and print a
2100      * warning message.
2101      */
2102     if (spapr->fwnmi_machine_check_interlock != -1) {
2103         warn_report("A machine check is being handled during migration. The"
2104                 "handler may run and log hardware error on the destination");
2105     }
2106 
2107     return 0;
2108 }
2109 
/*
 * Subsection carrying the FWNMI state: the system reset and machine
 * check handler addresses and the machine check interlock.  Only sent
 * when spapr_fwnmi_needed() returns true; spapr_fwnmi_pre_save() warns
 * if a machine check is in flight.
 */
static const VMStateDescription vmstate_spapr_fwnmi = {
    .name = "spapr_fwnmi",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_fwnmi_needed,
    .pre_save = spapr_fwnmi_pre_save,
    .fields = (const VMStateField[]) {
        VMSTATE_UINT64(fwnmi_system_reset_addr, SpaprMachineState),
        VMSTATE_UINT64(fwnmi_machine_check_addr, SpaprMachineState),
        VMSTATE_INT32(fwnmi_machine_check_interlock, SpaprMachineState),
        VMSTATE_END_OF_LIST()
    },
};
2123 
/*
 * Top-level migration description of the sPAPR machine state.
 *
 * The main section is at version 3 and still accepts versions back to 1:
 * fields guarded by version_before_3 exist only in older streams, and the
 * timebase is present from version 2 on.  Everything else travels in the
 * optional subsections listed below.
 */
static const VMStateDescription vmstate_spapr = {
    .name = "spapr",
    .version_id = 3,
    .minimum_version_id = 1,
    .pre_load = spapr_pre_load,
    .post_load = spapr_post_load,
    .pre_save = spapr_pre_save,
    .fields = (const VMStateField[]) {
        /* used to be @next_irq */
        VMSTATE_UNUSED_BUFFER(version_before_3, 0, 4),

        /* RTC offset */
        VMSTATE_UINT64_TEST(rtc_offset, SpaprMachineState, version_before_3),

        VMSTATE_PPC_TIMEBASE_V(tb, SpaprMachineState, 2),
        VMSTATE_END_OF_LIST()
    },
    /* NULL-terminated list of optional subsections. */
    .subsections = (const VMStateDescription * const []) {
        &vmstate_spapr_ov5_cas,
        &vmstate_spapr_patb_entry,
        &vmstate_spapr_pending_events,
        &vmstate_spapr_cap_htm,
        &vmstate_spapr_cap_vsx,
        &vmstate_spapr_cap_dfp,
        &vmstate_spapr_cap_cfpc,
        &vmstate_spapr_cap_sbbc,
        &vmstate_spapr_cap_ibs,
        &vmstate_spapr_cap_hpt_maxpagesize,
        &vmstate_spapr_irq_map,
        &vmstate_spapr_cap_nested_kvm_hv,
        &vmstate_spapr_dtb,
        &vmstate_spapr_cap_large_decr,
        &vmstate_spapr_cap_ccf_assist,
        &vmstate_spapr_cap_fwnmi,
        &vmstate_spapr_fwnmi,
        &vmstate_spapr_cap_rpt_invalidate,
        &vmstate_spapr_cap_ail_mode_3,
        &vmstate_spapr_cap_nested_papr,
        NULL
    }
};
2165 
2166 static int htab_save_setup(QEMUFile *f, void *opaque, Error **errp)
2167 {
2168     SpaprMachineState *spapr = opaque;
2169 
2170     /* "Iteration" header */
2171     if (!spapr->htab_shift) {
2172         qemu_put_be32(f, -1);
2173     } else {
2174         qemu_put_be32(f, spapr->htab_shift);
2175     }
2176 
2177     if (spapr->htab) {
2178         spapr->htab_save_index = 0;
2179         spapr->htab_first_pass = true;
2180     } else {
2181         if (spapr->htab_shift) {
2182             assert(kvm_enabled());
2183         }
2184     }
2185 
2186 
2187     return 0;
2188 }
2189 
2190 static void htab_save_chunk(QEMUFile *f, SpaprMachineState *spapr,
2191                             int chunkstart, int n_valid, int n_invalid)
2192 {
2193     qemu_put_be32(f, chunkstart);
2194     qemu_put_be16(f, n_valid);
2195     qemu_put_be16(f, n_invalid);
2196     qemu_put_buffer(f, (void *)hpte_get_ptr(spapr, chunkstart),
2197                     HASH_PTE_SIZE_64 * n_valid);
2198 }
2199 
2200 static void htab_save_end_marker(QEMUFile *f)
2201 {
2202     qemu_put_be32(f, 0);
2203     qemu_put_be16(f, 0);
2204     qemu_put_be16(f, 0);
2205 }
2206 
/*
 * First pass over a QEMU-side hash page table: send every valid HPTE.
 *
 * @f:      migration stream.
 * @spapr:  machine whose table is saved; htab_first_pass must be set.
 * @max_ns: time budget in nanoseconds, or -1 for no time limit.
 *
 * Resumes at spapr->htab_save_index.  Invalid entries are skipped, runs
 * of valid entries (at most USHRT_MAX long) are sent as chunks, and every
 * entry visited is marked clean.  The loop stops at the end of the table,
 * when the time budget is exceeded after sending a chunk, or when
 * migration_rate_exceeded() reports true.  Reaching the end of the table
 * clears htab_first_pass and rewinds the cursor to 0.
 */
static void htab_save_first_pass(QEMUFile *f, SpaprMachineState *spapr,
                                 int64_t max_ns)
{
    bool has_timeout = max_ns != -1;
    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
    int index = spapr->htab_save_index;
    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    assert(spapr->htab_first_pass);

    do {
        int chunkstart;

        /* Consume invalid HPTEs */
        while ((index < htabslots)
               && !hpte_is_valid(spapr, index)) {
            hpte_set_clean(spapr, index);
            index++;
        }

        /* Consume valid HPTEs */
        chunkstart = index;
        /* USHRT_MAX cap: the chunk header stores counts in 16 bits. */
        while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
               && hpte_is_valid(spapr, index)) {
            hpte_set_clean(spapr, index);
            index++;
        }

        if (index > chunkstart) {
            int n_valid = index - chunkstart;

            htab_save_chunk(f, spapr, chunkstart, n_valid, 0);

            /* The time budget is only checked after a chunk was sent. */
            if (has_timeout &&
                (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
                break;
            }
        }
    } while ((index < htabslots) && !migration_rate_exceeded(f));

    if (index >= htabslots) {
        /* Whole table visited: later calls use the dirty-tracking pass. */
        assert(index == htabslots);
        index = 0;
        spapr->htab_first_pass = false;
    }
    spapr->htab_save_index = index;
}
2254 
/*
 * Later pass over a QEMU-side hash page table: send only dirty HPTEs.
 *
 * @f:      migration stream.
 * @spapr:  machine whose table is saved; htab_first_pass must be clear.
 * @max_ns: time budget in nanoseconds; a negative value marks the final
 *          pass, which ignores both the time budget and the rate limit.
 *
 * Resumes at spapr->htab_save_index and wraps around the table until
 * htabslots entries have been examined.  Each chunk is a run of dirty
 * valid entries followed by a run of dirty invalid entries (each at most
 * USHRT_MAX long); entries sent are marked clean.
 *
 * Returns 1 when the whole table was examined and nothing had to be
 * sent, 0 otherwise.
 */
static int htab_save_later_pass(QEMUFile *f, SpaprMachineState *spapr,
                                int64_t max_ns)
{
    bool final = max_ns < 0;
    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
    int examined = 0, sent = 0;
    int index = spapr->htab_save_index;
    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    assert(!spapr->htab_first_pass);

    do {
        int chunkstart, invalidstart;

        /* Consume non-dirty HPTEs */
        while ((index < htabslots)
               && !hpte_is_dirty(spapr, index)) {
            index++;
            examined++;
        }

        chunkstart = index;
        /* Consume valid dirty HPTEs */
        while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
               && hpte_is_dirty(spapr, index)
               && hpte_is_valid(spapr, index)) {
            hpte_set_clean(spapr, index);
            index++;
            examined++;
        }

        invalidstart = index;
        /* Consume invalid dirty HPTEs */
        while ((index < htabslots) && (index - invalidstart < USHRT_MAX)
               && hpte_is_dirty(spapr, index)
               && !hpte_is_valid(spapr, index)) {
            hpte_set_clean(spapr, index);
            index++;
            examined++;
        }

        if (index > chunkstart) {
            /* [chunkstart, invalidstart) is valid, [invalidstart, index) not */
            int n_valid = invalidstart - chunkstart;
            int n_invalid = index - invalidstart;

            htab_save_chunk(f, spapr, chunkstart, n_valid, n_invalid);
            sent += index - chunkstart;

            if (!final && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
                break;
            }
        }

        if (examined >= htabslots) {
            break;
        }

        if (index >= htabslots) {
            /* Wrap around to cover entries before the starting cursor. */
            assert(index == htabslots);
            index = 0;
        }
    } while ((examined < htabslots) && (!migration_rate_exceeded(f) || final));

    if (index >= htabslots) {
        assert(index == htabslots);
        index = 0;
    }

    spapr->htab_save_index = index;

    return (examined >= htabslots) && (sent == 0) ? 1 : 0;
}
2327 
/* Time budget handed to the HPT save passes for one live iteration. */
#define MAX_ITERATION_NS    5000000 /* 5 ms */
/*
 * Passed as the third argument of kvmppc_save_htab(); presumably the
 * size of its transfer buffer - TODO confirm against that function.
 */
#define MAX_KVM_BUF_SIZE    2048
2330 
2331 static int htab_save_iterate(QEMUFile *f, void *opaque)
2332 {
2333     SpaprMachineState *spapr = opaque;
2334     int fd;
2335     int rc = 0;
2336 
2337     /* Iteration header */
2338     if (!spapr->htab_shift) {
2339         qemu_put_be32(f, -1);
2340         return 1;
2341     } else {
2342         qemu_put_be32(f, 0);
2343     }
2344 
2345     if (!spapr->htab) {
2346         assert(kvm_enabled());
2347 
2348         fd = get_htab_fd(spapr);
2349         if (fd < 0) {
2350             return fd;
2351         }
2352 
2353         rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, MAX_ITERATION_NS);
2354         if (rc < 0) {
2355             return rc;
2356         }
2357     } else  if (spapr->htab_first_pass) {
2358         htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
2359     } else {
2360         rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
2361     }
2362 
2363     htab_save_end_marker(f);
2364 
2365     return rc;
2366 }
2367 
2368 static int htab_save_complete(QEMUFile *f, void *opaque)
2369 {
2370     SpaprMachineState *spapr = opaque;
2371     int fd;
2372 
2373     /* Iteration header */
2374     if (!spapr->htab_shift) {
2375         qemu_put_be32(f, -1);
2376         return 0;
2377     } else {
2378         qemu_put_be32(f, 0);
2379     }
2380 
2381     if (!spapr->htab) {
2382         int rc;
2383 
2384         assert(kvm_enabled());
2385 
2386         fd = get_htab_fd(spapr);
2387         if (fd < 0) {
2388             return fd;
2389         }
2390 
2391         rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, -1);
2392         if (rc < 0) {
2393             return rc;
2394         }
2395     } else {
2396         if (spapr->htab_first_pass) {
2397             htab_save_first_pass(f, spapr, -1);
2398         }
2399         htab_save_later_pass(f, spapr, -1);
2400     }
2401 
2402     /* End marker */
2403     htab_save_end_marker(f);
2404 
2405     return 0;
2406 }
2407 
2408 static int htab_load(QEMUFile *f, void *opaque, int version_id)
2409 {
2410     SpaprMachineState *spapr = opaque;
2411     uint32_t section_hdr;
2412     int fd = -1;
2413     Error *local_err = NULL;
2414 
2415     if (version_id < 1 || version_id > 1) {
2416         error_report("htab_load() bad version");
2417         return -EINVAL;
2418     }
2419 
2420     section_hdr = qemu_get_be32(f);
2421 
2422     if (section_hdr == -1) {
2423         spapr_free_hpt(spapr);
2424         return 0;
2425     }
2426 
2427     if (section_hdr) {
2428         int ret;
2429 
2430         /* First section gives the htab size */
2431         ret = spapr_reallocate_hpt(spapr, section_hdr, &local_err);
2432         if (ret < 0) {
2433             error_report_err(local_err);
2434             return ret;
2435         }
2436         return 0;
2437     }
2438 
2439     if (!spapr->htab) {
2440         assert(kvm_enabled());
2441 
2442         fd = kvmppc_get_htab_fd(true, 0, &local_err);
2443         if (fd < 0) {
2444             error_report_err(local_err);
2445             return fd;
2446         }
2447     }
2448 
2449     while (true) {
2450         uint32_t index;
2451         uint16_t n_valid, n_invalid;
2452 
2453         index = qemu_get_be32(f);
2454         n_valid = qemu_get_be16(f);
2455         n_invalid = qemu_get_be16(f);
2456 
2457         if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
2458             /* End of Stream */
2459             break;
2460         }
2461 
2462         if ((index + n_valid + n_invalid) >
2463             (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
2464             /* Bad index in stream */
2465             error_report(
2466                 "htab_load() bad index %d (%hd+%hd entries) in htab stream (htab_shift=%d)",
2467                 index, n_valid, n_invalid, spapr->htab_shift);
2468             return -EINVAL;
2469         }
2470 
2471         if (spapr->htab) {
2472             if (n_valid) {
2473                 qemu_get_buffer(f, (void *)hpte_get_ptr(spapr, index),
2474                                 HASH_PTE_SIZE_64 * n_valid);
2475             }
2476             if (n_invalid) {
2477                 memset(hpte_get_ptr(spapr, index + n_valid), 0,
2478                        HASH_PTE_SIZE_64 * n_invalid);
2479             }
2480         } else {
2481             int rc;
2482 
2483             assert(fd >= 0);
2484 
2485             rc = kvmppc_load_htab_chunk(f, fd, index, n_valid, n_invalid,
2486                                         &local_err);
2487             if (rc < 0) {
2488                 error_report_err(local_err);
2489                 return rc;
2490             }
2491         }
2492     }
2493 
2494     if (!spapr->htab) {
2495         assert(fd >= 0);
2496         close(fd);
2497     }
2498 
2499     return 0;
2500 }
2501 
2502 static void htab_save_cleanup(void *opaque)
2503 {
2504     SpaprMachineState *spapr = opaque;
2505 
2506     close_htab_fd(spapr);
2507 }
2508 
/*
 * Live-migration handlers for the hash page table: setup writes the
 * initial header, iterate sends time-bounded batches, complete sends
 * whatever is left, cleanup closes the HTAB fd, and load_state consumes
 * the stream on the destination.
 */
static SaveVMHandlers savevm_htab_handlers = {
    .save_setup = htab_save_setup,
    .save_live_iterate = htab_save_iterate,
    .save_live_complete_precopy = htab_save_complete,
    .save_cleanup = htab_save_cleanup,
    .load_state = htab_load,
};
2516 
2517 static void spapr_boot_set(void *opaque, const char *boot_device,
2518                            Error **errp)
2519 {
2520     SpaprMachineState *spapr = SPAPR_MACHINE(opaque);
2521 
2522     g_free(spapr->boot_device);
2523     spapr->boot_device = g_strdup(boot_device);
2524 }
2525 
2526 static void spapr_create_lmb_dr_connectors(SpaprMachineState *spapr)
2527 {
2528     MachineState *machine = MACHINE(spapr);
2529     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
2530     uint32_t nr_lmbs = (machine->maxram_size - machine->ram_size)/lmb_size;
2531     int i;
2532 
2533     g_assert(!nr_lmbs || machine->device_memory);
2534     for (i = 0; i < nr_lmbs; i++) {
2535         uint64_t addr;
2536 
2537         addr = i * lmb_size + machine->device_memory->base;
2538         spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_LMB,
2539                                addr / lmb_size);
2540     }
2541 }
2542 
2543 /*
2544  * If RAM size, maxmem size and individual node mem sizes aren't aligned
2545  * to SPAPR_MEMORY_BLOCK_SIZE(256MB), then refuse to start the guest
2546  * since we can't support such unaligned sizes with DRCONF_MEMORY.
2547  */
2548 static void spapr_validate_node_memory(MachineState *machine, Error **errp)
2549 {
2550     int i;
2551 
2552     if (machine->ram_size % SPAPR_MEMORY_BLOCK_SIZE) {
2553         error_setg(errp, "Memory size 0x" RAM_ADDR_FMT
2554                    " is not aligned to %" PRIu64 " MiB",
2555                    machine->ram_size,
2556                    SPAPR_MEMORY_BLOCK_SIZE / MiB);
2557         return;
2558     }
2559 
2560     if (machine->maxram_size % SPAPR_MEMORY_BLOCK_SIZE) {
2561         error_setg(errp, "Maximum memory size 0x" RAM_ADDR_FMT
2562                    " is not aligned to %" PRIu64 " MiB",
2563                    machine->ram_size,
2564                    SPAPR_MEMORY_BLOCK_SIZE / MiB);
2565         return;
2566     }
2567 
2568     for (i = 0; i < machine->numa_state->num_nodes; i++) {
2569         if (machine->numa_state->nodes[i].node_mem % SPAPR_MEMORY_BLOCK_SIZE) {
2570             error_setg(errp,
2571                        "Node %d memory size 0x%" PRIx64
2572                        " is not aligned to %" PRIu64 " MiB",
2573                        i, machine->numa_state->nodes[i].node_mem,
2574                        SPAPR_MEMORY_BLOCK_SIZE / MiB);
2575             return;
2576         }
2577     }
2578 }
2579 
2580 /* find cpu slot in machine->possible_cpus by core_id */
2581 static CPUArchId *spapr_find_cpu_slot(MachineState *ms, uint32_t id, int *idx)
2582 {
2583     int index = id / ms->smp.threads;
2584 
2585     if (index >= ms->possible_cpus->len) {
2586         return NULL;
2587     }
2588     if (idx) {
2589         *idx = index;
2590     }
2591     return &ms->possible_cpus->cpus[index];
2592 }
2593 
2594 static void spapr_set_vsmt_mode(SpaprMachineState *spapr, Error **errp)
2595 {
2596     MachineState *ms = MACHINE(spapr);
2597     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
2598     Error *local_err = NULL;
2599     bool vsmt_user = !!spapr->vsmt;
2600     int kvm_smt = kvmppc_smt_threads();
2601     int ret;
2602     unsigned int smp_threads = ms->smp.threads;
2603 
2604     if (tcg_enabled()) {
2605         if (smp_threads > 1 &&
2606             !ppc_type_check_compat(ms->cpu_type, CPU_POWERPC_LOGICAL_2_07, 0,
2607                                    spapr->max_compat_pvr)) {
2608             error_setg(errp, "TCG only supports SMT on POWER8 or newer CPUs");
2609             return;
2610         }
2611 
2612         if (smp_threads > 8) {
2613             error_setg(errp, "TCG cannot support more than 8 threads/core "
2614                        "on a pseries machine");
2615             return;
2616         }
2617     }
2618     if (!is_power_of_2(smp_threads)) {
2619         error_setg(errp, "Cannot support %d threads/core on a pseries "
2620                    "machine because it must be a power of 2", smp_threads);
2621         return;
2622     }
2623 
2624     /* Determine the VSMT mode to use: */
2625     if (vsmt_user) {
2626         if (spapr->vsmt < smp_threads) {
2627             error_setg(errp, "Cannot support VSMT mode %d"
2628                        " because it must be >= threads/core (%d)",
2629                        spapr->vsmt, smp_threads);
2630             return;
2631         }
2632         /* In this case, spapr->vsmt has been set by the command line */
2633     } else if (!smc->smp_threads_vsmt) {
2634         /*
2635          * Default VSMT value is tricky, because we need it to be as
2636          * consistent as possible (for migration), but this requires
2637          * changing it for at least some existing cases.  We pick 8 as
2638          * the value that we'd get with KVM on POWER8, the
2639          * overwhelmingly common case in production systems.
2640          */
2641         spapr->vsmt = MAX(8, smp_threads);
2642     } else {
2643         spapr->vsmt = smp_threads;
2644     }
2645 
2646     /* KVM: If necessary, set the SMT mode: */
2647     if (kvm_enabled() && (spapr->vsmt != kvm_smt)) {
2648         ret = kvmppc_set_smt_threads(spapr->vsmt);
2649         if (ret) {
2650             /* Looks like KVM isn't able to change VSMT mode */
2651             error_setg(&local_err,
2652                        "Failed to set KVM's VSMT mode to %d (errno %d)",
2653                        spapr->vsmt, ret);
2654             /* We can live with that if the default one is big enough
2655              * for the number of threads, and a submultiple of the one
2656              * we want.  In this case we'll waste some vcpu ids, but
2657              * behaviour will be correct */
2658             if ((kvm_smt >= smp_threads) && ((spapr->vsmt % kvm_smt) == 0)) {
2659                 warn_report_err(local_err);
2660             } else {
2661                 if (!vsmt_user) {
2662                     error_append_hint(&local_err,
2663                                       "On PPC, a VM with %d threads/core"
2664                                       " on a host with %d threads/core"
2665                                       " requires the use of VSMT mode %d.\n",
2666                                       smp_threads, kvm_smt, spapr->vsmt);
2667                 }
2668                 kvmppc_error_append_smt_possible_hint(&local_err);
2669                 error_propagate(errp, local_err);
2670             }
2671         }
2672     }
2673     /* else TCG: nothing to do currently */
2674 }
2675 
2676 static void spapr_init_cpus(SpaprMachineState *spapr)
2677 {
2678     MachineState *machine = MACHINE(spapr);
2679     MachineClass *mc = MACHINE_GET_CLASS(machine);
2680     const char *type = spapr_get_cpu_core_type(machine->cpu_type);
2681     const CPUArchIdList *possible_cpus;
2682     unsigned int smp_cpus = machine->smp.cpus;
2683     unsigned int smp_threads = machine->smp.threads;
2684     unsigned int max_cpus = machine->smp.max_cpus;
2685     int boot_cores_nr = smp_cpus / smp_threads;
2686     int i;
2687 
2688     possible_cpus = mc->possible_cpu_arch_ids(machine);
2689     if (mc->has_hotpluggable_cpus) {
2690         if (smp_cpus % smp_threads) {
2691             error_report("smp_cpus (%u) must be multiple of threads (%u)",
2692                          smp_cpus, smp_threads);
2693             exit(1);
2694         }
2695         if (max_cpus % smp_threads) {
2696             error_report("max_cpus (%u) must be multiple of threads (%u)",
2697                          max_cpus, smp_threads);
2698             exit(1);
2699         }
2700     } else {
2701         if (max_cpus != smp_cpus) {
2702             error_report("This machine version does not support CPU hotplug");
2703             exit(1);
2704         }
2705         boot_cores_nr = possible_cpus->len;
2706     }
2707 
2708     for (i = 0; i < possible_cpus->len; i++) {
2709         int core_id = i * smp_threads;
2710 
2711         if (mc->has_hotpluggable_cpus) {
2712             spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_CPU,
2713                                    spapr_vcpu_id(spapr, core_id));
2714         }
2715 
2716         if (i < boot_cores_nr) {
2717             Object *core  = object_new(type);
2718             int nr_threads = smp_threads;
2719 
2720             /* Handle the partially filled core for older machine types */
2721             if ((i + 1) * smp_threads >= smp_cpus) {
2722                 nr_threads = smp_cpus - i * smp_threads;
2723             }
2724 
2725             object_property_set_int(core, "nr-threads", nr_threads,
2726                                     &error_fatal);
2727             object_property_set_int(core, CPU_CORE_PROP_CORE_ID, core_id,
2728                                     &error_fatal);
2729             qdev_realize(DEVICE(core), NULL, &error_fatal);
2730 
2731             object_unref(core);
2732         }
2733     }
2734 }
2735 
2736 static PCIHostState *spapr_create_default_phb(void)
2737 {
2738     DeviceState *dev;
2739 
2740     dev = qdev_new(TYPE_SPAPR_PCI_HOST_BRIDGE);
2741     qdev_prop_set_uint32(dev, "index", 0);
2742     sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
2743 
2744     return PCI_HOST_BRIDGE(dev);
2745 }
2746 
2747 static hwaddr spapr_rma_size(SpaprMachineState *spapr, Error **errp)
2748 {
2749     MachineState *machine = MACHINE(spapr);
2750     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
2751     hwaddr rma_size = machine->ram_size;
2752     hwaddr node0_size = spapr_node0_size(machine);
2753 
2754     /* RMA has to fit in the first NUMA node */
2755     rma_size = MIN(rma_size, node0_size);
2756 
2757     /*
2758      * VRMA access is via a special 1TiB SLB mapping, so the RMA can
2759      * never exceed that
2760      */
2761     rma_size = MIN(rma_size, 1 * TiB);
2762 
2763     /*
2764      * Clamp the RMA size based on machine type.  This is for
2765      * migration compatibility with older qemu versions, which limited
2766      * the RMA size for complicated and mostly bad reasons.
2767      */
2768     if (smc->rma_limit) {
2769         rma_size = MIN(rma_size, smc->rma_limit);
2770     }
2771 
2772     if (rma_size < MIN_RMA_SLOF) {
2773         error_setg(errp,
2774                    "pSeries SLOF firmware requires >= %" HWADDR_PRIx
2775                    "ldMiB guest RMA (Real Mode Area memory)",
2776                    MIN_RMA_SLOF / MiB);
2777         return 0;
2778     }
2779 
2780     return rma_size;
2781 }
2782 
2783 static void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr)
2784 {
2785     MachineState *machine = MACHINE(spapr);
2786     int i;
2787 
2788     for (i = 0; i < machine->ram_slots; i++) {
2789         spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM, i);
2790     }
2791 }
2792 
/*
 * pSeries LPAR / sPAPR hardware init
 *
 * Loads the firmware image, settles capabilities, HPT resizing, RMA size
 * and VSMT mode, then sets up the interrupt controller, the CPUs, the
 * memory layout, DR connectors, VIO and PCI devices, optionally loads a
 * kernel and initrd, and finally registers the vmstate/savevm handlers
 * and assorted callbacks.
 *
 * Failures in this function are fatal: they are either passed to
 * &error_fatal or reported followed by exit(1).
 */
static void spapr_machine_init(MachineState *machine)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(machine);
    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
    MachineClass *mc = MACHINE_GET_CLASS(machine);
    /* A user-supplied firmware name overrides the VOF/non-VOF default */
    const char *bios_default = spapr->vof ? FW_FILE_NAME_VOF : FW_FILE_NAME;
    const char *bios_name = machine->firmware ?: bios_default;
    g_autofree char *filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
    const char *kernel_filename = machine->kernel_filename;
    const char *initrd_filename = machine->initrd_filename;
    PCIHostState *phb;
    bool has_vga;
    int i;
    MemoryRegion *sysmem = get_system_memory();
    long load_limit, fw_size;
    Error *resize_hpt_err = NULL;
    NICInfo *nd;

    if (!filename) {
        error_report("Could not find LPAR firmware '%s'", bios_name);
        exit(1);
    }
    /* The firmware image is loaded at guest physical address 0 */
    fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
    if (fw_size <= 0) {
        error_report("Could not load LPAR firmware '%s'", filename);
        exit(1);
    }

    /*
     * if Secure VM (PEF) support is configured, then initialize it
     */
    if (machine->cgs) {
        confidential_guest_kvm_init(machine->cgs, &error_fatal);
    }

    msi_nonbroken = true;

    QLIST_INIT(&spapr->phbs);
    QTAILQ_INIT(&spapr->pending_dimm_unplugs);

    /* Determine capabilities to run with */
    spapr_caps_init(spapr);

    kvmppc_check_papr_resize_hpt(&resize_hpt_err);
    if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DEFAULT) {
        /*
         * If the user explicitly requested a mode we should either
         * supply it, or fail completely (which we do below).  But if
         * it's not set explicitly, we reset our mode to something
         * that works
         */
        if (resize_hpt_err) {
            spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
            error_free(resize_hpt_err);
            resize_hpt_err = NULL;
        } else {
            spapr->resize_hpt = smc->resize_hpt_default;
        }
    }

    assert(spapr->resize_hpt != SPAPR_RESIZE_HPT_DEFAULT);

    if ((spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) && resize_hpt_err) {
        /*
         * User requested HPT resize, but this host can't supply it.  Bail out
         */
        error_report_err(resize_hpt_err);
        exit(1);
    }
    /* Covers the "explicitly disabled, host cannot resize" case */
    error_free(resize_hpt_err);

    spapr->rma_size = spapr_rma_size(spapr, &error_fatal);

    /* Setup a load limit for the ramdisk leaving room for SLOF and FDT */
    load_limit = MIN(spapr->rma_size, FDT_MAX_ADDR) - FW_OVERHEAD;

    /*
     * VSMT must be set in order to be able to compute VCPU ids, ie to
     * call spapr_max_server_number() or spapr_vcpu_id().
     */
    spapr_set_vsmt_mode(spapr, &error_fatal);

    /* Set up Interrupt Controller before we create the VCPUs */
    spapr_irq_init(spapr, &error_fatal);

    /*
     * Set up containers for ibm,client-architecture-support negotiated
     * options
     */
    spapr->ov5 = spapr_ovec_new();
    spapr->ov5_cas = spapr_ovec_new();

    spapr_ovec_set(spapr->ov5, OV5_DRCONF_MEMORY);
    spapr_validate_node_memory(machine, &error_fatal);

    spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);

    /* Do not advertise FORM2 NUMA support for pseries-6.1 and older */
    if (!smc->pre_6_2_numa_affinity) {
        spapr_ovec_set(spapr->ov5, OV5_FORM2_AFFINITY);
    }

    /* advertise support for dedicated HP event source to guests */
    if (spapr->use_hotplug_event_source) {
        spapr_ovec_set(spapr->ov5, OV5_HP_EVT);
    }

    /* advertise support for HPT resizing */
    if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
        spapr_ovec_set(spapr->ov5, OV5_HPT_RESIZE);
    }

    /* advertise support for ibm,dynamic-memory-v2 */
    spapr_ovec_set(spapr->ov5, OV5_DRMEM_V2);

    /* advertise XIVE on POWER9 machines */
    if (spapr->irq->xive) {
        spapr_ovec_set(spapr->ov5, OV5_XIVE_EXPLOIT);
    }

    /* init CPUs */
    spapr_init_cpus(spapr);

    /* Init numa_assoc_array */
    spapr_numa_associativity_init(spapr, machine);

    if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
        ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
                              spapr->max_compat_pvr)) {
        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
        /* KVM and TCG always allow GTSE with radix... */
        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
    }
    /* ... but not with hash (currently). */

    if (kvm_enabled()) {
        /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */
        kvmppc_enable_logical_ci_hcalls();
        kvmppc_enable_set_mode_hcall();

        /* H_CLEAR_MOD/_REF are mandatory in PAPR, but off by default */
        kvmppc_enable_clear_ref_mod_hcalls();

        /* Enable H_PAGE_INIT */
        kvmppc_enable_h_page_init();
    }

    /* map RAM */
    memory_region_add_subregion(sysmem, 0, machine->ram);

    /* initialize hotplug memory address space */
    if (machine->ram_size < machine->maxram_size) {
        ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size;
        hwaddr device_mem_base;

        /*
         * Limit the number of hotpluggable memory slots to half the number
         * of slots that KVM supports, leaving the other half for PCI and
         * other devices. However ensure that number of slots doesn't drop
         * below SPAPR_MAX_RAM_SLOTS.
         */
        int max_memslots = kvm_enabled() ? kvm_get_max_memslots() / 2 :
                           SPAPR_MAX_RAM_SLOTS;

        if (max_memslots < SPAPR_MAX_RAM_SLOTS) {
            max_memslots = SPAPR_MAX_RAM_SLOTS;
        }
        if (machine->ram_slots > max_memslots) {
            error_report("Specified number of memory slots %"
                         PRIu64" exceeds max supported %d",
                         machine->ram_slots, max_memslots);
            exit(1);
        }

        /* Device memory starts at the first aligned address above RAM */
        device_mem_base = ROUND_UP(machine->ram_size, SPAPR_DEVICE_MEM_ALIGN);
        machine_memory_devices_init(machine, device_mem_base, device_mem_size);
    }

    spapr_create_lmb_dr_connectors(spapr);

    if (mc->nvdimm_supported) {
        spapr_create_nvdimm_dr_connectors(spapr);
    }

    /* Set up RTAS event infrastructure */
    spapr_events_init(spapr);

    /* Set up the RTC RTAS interfaces */
    spapr_rtc_create(spapr);

    /* Set up VIO bus */
    spapr->vio_bus = spapr_vio_bus_init();

    /* One VTY per configured serial backend, stopping at the first gap */
    for (i = 0; serial_hd(i); i++) {
        spapr_vty_create(spapr->vio_bus, serial_hd(i));
    }

    /* We always have at least the nvram device on VIO */
    spapr_create_nvram(spapr);

    /*
     * Setup hotplug / dynamic-reconfiguration connectors. top-level
     * connectors (described in root DT node's "ibm,drc-types" property)
     * are pre-initialized here. additional child connectors (such as
     * connectors for a PHBs PCI slots) are added as needed during their
     * parent's realization.
     */
    if (smc->dr_phb_enabled) {
        for (i = 0; i < SPAPR_MAX_PHBS; i++) {
            spapr_dr_connector_new(OBJECT(machine), TYPE_SPAPR_DRC_PHB, i);
        }
    }

    /* Set up PCI */
    spapr_pci_rtas_init();

    phb = spapr_create_default_phb();

    /* NICs configured as "spapr-vlan" (alias "ibmveth") go on the VIO bus */
    while ((nd = qemu_find_nic_info("spapr-vlan", true, "ibmveth"))) {
        spapr_vlan_create(spapr->vio_bus, nd);
    }

    /* Remaining NICs are created on the default PHB's PCI bus */
    pci_init_nic_devices(phb->bus, NULL);

    for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) {
        spapr_vscsi_create(spapr->vio_bus);
    }

    /* Graphics */
    has_vga = spapr_vga_init(phb->bus, &error_fatal);
    if (has_vga) {
        spapr->want_stdout_path = !machine->enable_graphics;
        machine->usb |= defaults_enabled() && !machine->usb_disabled;
    } else {
        spapr->want_stdout_path = true;
    }

    if (machine->usb) {
        pci_create_simple(phb->bus, -1, "nec-usb-xhci");

        /* With a VGA device present, also add a USB keyboard and mouse */
        if (has_vga) {
            USBBus *usb_bus;

            usb_bus = USB_BUS(object_resolve_type_unambiguous(TYPE_USB_BUS,
                                                              &error_abort));
            usb_create_simple(usb_bus, "usb-kbd");
            usb_create_simple(usb_bus, "usb-mouse");
        }
    }

    if (kernel_filename) {
        uint64_t loaded_addr = 0;

        /* Try a big-endian ELF first, then retry as little-endian */
        spapr->kernel_size = load_elf(kernel_filename, NULL,
                                      translate_kernel_address, spapr,
                                      NULL, &loaded_addr, NULL, NULL,
                                      ELFDATA2MSB, PPC_ELF_MACHINE, 0, 0);
        if (spapr->kernel_size == ELF_LOAD_WRONG_ENDIAN) {
            spapr->kernel_size = load_elf(kernel_filename, NULL,
                                          translate_kernel_address, spapr,
                                          NULL, &loaded_addr, NULL, NULL,
                                          ELFDATA2LSB, PPC_ELF_MACHINE, 0, 0);
            spapr->kernel_le = spapr->kernel_size > 0;
        }
        if (spapr->kernel_size < 0) {
            error_report("error loading %s: %s", kernel_filename,
                         load_elf_strerror(spapr->kernel_size));
            exit(1);
        }

        /* Record where the loader reported the kernel, if it differs */
        if (spapr->kernel_addr != loaded_addr) {
            warn_report("spapr: kernel_addr changed from 0x%"PRIx64
                        " to 0x%"PRIx64,
                        spapr->kernel_addr, loaded_addr);
            spapr->kernel_addr = loaded_addr;
        }

        /* load initrd */
        if (initrd_filename) {
            /*
             * Try to locate the initrd in the gap between the kernel
             * and the firmware. Add a bit of space just in case
             */
            spapr->initrd_base = (spapr->kernel_addr + spapr->kernel_size
                                  + 0x1ffff) & ~0xffff;
            /*
             * NOTE(review): the size limit passed below is
             * load_limit - initrd_base; nothing here checks that
             * initrd_base is below load_limit - confirm how
             * load_image_targphys() treats that case.
             */
            spapr->initrd_size = load_image_targphys(initrd_filename,
                                                     spapr->initrd_base,
                                                     load_limit
                                                     - spapr->initrd_base);
            if (spapr->initrd_size < 0) {
                error_report("could not load initial ram disk '%s'",
                             initrd_filename);
                exit(1);
            }
        }
    }

    /*
     * FIXME: Should register things through the MachineState's qdev
     * interface, this is a legacy from the sPAPREnvironment structure
     * which predated MachineState but had a similar function
     */
    vmstate_register(NULL, 0, &vmstate_spapr, spapr);
    register_savevm_live("spapr/htab", VMSTATE_INSTANCE_ID_ANY, 1,
                         &savevm_htab_handlers, spapr);

    /* The machine object acts as hotplug handler for the default sysbus */
    qbus_set_hotplug_handler(sysbus_get_default(), OBJECT(machine));

    qemu_register_boot_set(spapr_boot_set, spapr);

    /*
     * Nothing needs to be done to resume a suspended guest because
     * suspending does not change the machine state, so no need for
     * a ->wakeup method.
     */
    qemu_register_wakeup_support();

    if (kvm_enabled()) {
        /* to stop and start vmclock */
        qemu_add_vm_change_state_handler(cpu_ppc_clock_vm_state_change,
                                         &spapr->tb);

        kvmppc_spapr_enable_inkernel_multitce();
    }

    qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
    if (spapr->vof) {
        spapr->vof->fw_size = fw_size; /* for claim() on itself */
        spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, spapr_h_vof_client);
    }

    spapr_watchdog_init(spapr);
}
3121 
3122 #define DEFAULT_KVM_TYPE "auto"
3123 static int spapr_kvm_type(MachineState *machine, const char *vm_type)
3124 {
3125     /*
3126      * The use of g_ascii_strcasecmp() for 'hv' and 'pr' is to
3127      * accommodate the 'HV' and 'PV' formats that exists in the
3128      * wild. The 'auto' mode is being introduced already as
3129      * lower-case, thus we don't need to bother checking for
3130      * "AUTO".
3131      */
3132     if (!vm_type || !strcmp(vm_type, DEFAULT_KVM_TYPE)) {
3133         return 0;
3134     }
3135 
3136     if (!g_ascii_strcasecmp(vm_type, "hv")) {
3137         return 1;
3138     }
3139 
3140     if (!g_ascii_strcasecmp(vm_type, "pr")) {
3141         return 2;
3142     }
3143 
3144     error_report("Unknown kvm-type specified '%s'", vm_type);
3145     return -1;
3146 }
3147 
/*
 * Implementation of an interface to adjust firmware path
 * for the bootindex property handling.
 *
 * @p:   the FWPathProvider (not used in this function)
 * @bus: bus the device sits on; only consulted for SCSI devices
 * @dev: device whose firmware path component is requested
 *
 * Returns a newly allocated path component (built with
 * g_strdup_printf() or returned by spapr_pci_fw_dev_name()), or NULL
 * when none of the cases below applies.  The checks are tried in order
 * and the first match wins.
 */
static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus,
                                   DeviceState *dev)
{
/* CAST yields NULL when obj is not an instance of the named type */
#define CAST(type, obj, name) \
    ((type *)object_dynamic_cast(OBJECT(obj), (name)))
    SCSIDevice *d = CAST(SCSIDevice,  dev, TYPE_SCSI_DEVICE);
    SpaprPhbState *phb = CAST(SpaprPhbState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE);
    VHostSCSICommon *vsc = CAST(VHostSCSICommon, dev, TYPE_VHOST_SCSI_COMMON);
    PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE);

    /* SCSI disks: the encoding depends on the kind of parent controller */
    if (d && bus) {
        void *spapr = CAST(void, bus->parent, "spapr-vscsi");
        VirtIOSCSI *virtio = CAST(VirtIOSCSI, bus->parent, TYPE_VIRTIO_SCSI);
        USBDevice *usb = CAST(USBDevice, bus->parent, TYPE_USB_DEVICE);

        if (spapr) {
            /*
             * Replace "channel@0/disk@0,0" with "disk@8000000000000000":
             * In the top 16 bits of the 64-bit LUN, we use SRP luns of the form
             * 0x8000 | (target << 8) | (bus << 5) | lun
             * (see the "Logical unit addressing format" table in SAM5)
             */
            unsigned id = 0x8000 | (d->id << 8) | (d->channel << 5) | d->lun;
            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
                                   (uint64_t)id << 48);
        } else if (virtio) {
            /*
             * We use SRP luns of the form 01000000 | (target << 8) | lun
             * in the top 32 bits of the 64-bit LUN
             * Note: the quote above is from SLOF and it is wrong,
             * the actual binding is:
             * swap 0100 or 10 << or 20 << ( target lun-id -- srplun )
             */
            unsigned id = 0x1000000 | (d->id << 16) | d->lun;
            if (d->lun >= 256) {
                /* Use the LUN "flat space addressing method" */
                id |= 0x4000;
            }
            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
                                   (uint64_t)id << 32);
        } else if (usb) {
            /*
             * We use SRP luns of the form 01000000 | (usb-port << 16) | lun
             * in the top 32 bits of the 64-bit LUN
             */
            unsigned usb_port = atoi(usb->port->path);
            unsigned id = 0x1000000 | (usb_port << 16) | d->lun;
            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
                                   (uint64_t)id << 32);
        }
        /* Any other parent: fall through to the generic checks below */
    }

    /*
     * SLOF probes the USB devices, and if it recognizes that the device is a
     * storage device, it changes its name to "storage" instead of "usb-host",
     * and additionally adds a child node for the SCSI LUN, so the correct
     * boot path in SLOF is something like ".../storage@1/disk@xxx" instead.
     */
    if (strcmp("usb-host", qdev_fw_name(dev)) == 0) {
        USBDevice *usbdev = CAST(USBDevice, dev, TYPE_USB_DEVICE);
        if (usb_device_is_scsi_storage(usbdev)) {
            return g_strdup_printf("storage@%s/disk", usbdev->port->path);
        }
    }

    if (phb) {
        /* Replace "pci" with "pci@800000020000000" */
        return g_strdup_printf("pci@%"PRIX64, phb->buid);
    }

    if (vsc) {
        /* Same logic as virtio above */
        unsigned id = 0x1000000 | (vsc->target << 16) | vsc->lun;
        return g_strdup_printf("disk@%"PRIX64, (uint64_t)id << 32);
    }

    if (g_str_equal("pci-bridge", qdev_fw_name(dev))) {
        /* SLOF uses "pci" instead of "pci-bridge" for PCI bridges */
        PCIDevice *pdev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE);
        return g_strdup_printf("pci@%x", PCI_SLOT(pdev->devfn));
    }

    /* Any other PCI device gets its name from spapr_pci_fw_dev_name() */
    if (pcidev) {
        return spapr_pci_fw_dev_name(pcidev);
    }

    return NULL;
}
3240 
3241 static char *spapr_get_kvm_type(Object *obj, Error **errp)
3242 {
3243     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3244 
3245     return g_strdup(spapr->kvm_type);
3246 }
3247 
3248 static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp)
3249 {
3250     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3251 
3252     g_free(spapr->kvm_type);
3253     spapr->kvm_type = g_strdup(value);
3254 }
3255 
3256 static bool spapr_get_modern_hotplug_events(Object *obj, Error **errp)
3257 {
3258     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3259 
3260     return spapr->use_hotplug_event_source;
3261 }
3262 
3263 static void spapr_set_modern_hotplug_events(Object *obj, bool value,
3264                                             Error **errp)
3265 {
3266     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3267 
3268     spapr->use_hotplug_event_source = value;
3269 }
3270 
/*
 * Boolean property getter that always reports true; neither @obj nor
 * @errp is used.
 */
static bool spapr_get_msix_emulation(Object *obj, Error **errp)
{
    return true;
}
3275 
3276 static char *spapr_get_resize_hpt(Object *obj, Error **errp)
3277 {
3278     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3279 
3280     switch (spapr->resize_hpt) {
3281     case SPAPR_RESIZE_HPT_DEFAULT:
3282         return g_strdup("default");
3283     case SPAPR_RESIZE_HPT_DISABLED:
3284         return g_strdup("disabled");
3285     case SPAPR_RESIZE_HPT_ENABLED:
3286         return g_strdup("enabled");
3287     case SPAPR_RESIZE_HPT_REQUIRED:
3288         return g_strdup("required");
3289     }
3290     g_assert_not_reached();
3291 }
3292 
3293 static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp)
3294 {
3295     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3296 
3297     if (strcmp(value, "default") == 0) {
3298         spapr->resize_hpt = SPAPR_RESIZE_HPT_DEFAULT;
3299     } else if (strcmp(value, "disabled") == 0) {
3300         spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
3301     } else if (strcmp(value, "enabled") == 0) {
3302         spapr->resize_hpt = SPAPR_RESIZE_HPT_ENABLED;
3303     } else if (strcmp(value, "required") == 0) {
3304         spapr->resize_hpt = SPAPR_RESIZE_HPT_REQUIRED;
3305     } else {
3306         error_setg(errp, "Bad value for \"resize-hpt\" property");
3307     }
3308 }
3309 
3310 static bool spapr_get_vof(Object *obj, Error **errp)
3311 {
3312     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3313 
3314     return spapr->vof != NULL;
3315 }
3316 
3317 static void spapr_set_vof(Object *obj, bool value, Error **errp)
3318 {
3319     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3320 
3321     if (spapr->vof) {
3322         vof_cleanup(spapr->vof);
3323         g_free(spapr->vof);
3324         spapr->vof = NULL;
3325     }
3326     if (!value) {
3327         return;
3328     }
3329     spapr->vof = g_malloc0(sizeof(*spapr->vof));
3330 }
3331 
3332 static char *spapr_get_ic_mode(Object *obj, Error **errp)
3333 {
3334     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3335 
3336     if (spapr->irq == &spapr_irq_xics_legacy) {
3337         return g_strdup("legacy");
3338     } else if (spapr->irq == &spapr_irq_xics) {
3339         return g_strdup("xics");
3340     } else if (spapr->irq == &spapr_irq_xive) {
3341         return g_strdup("xive");
3342     } else if (spapr->irq == &spapr_irq_dual) {
3343         return g_strdup("dual");
3344     }
3345     g_assert_not_reached();
3346 }
3347 
3348 static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
3349 {
3350     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3351 
3352     if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
3353         error_setg(errp, "This machine only uses the legacy XICS backend, don't pass ic-mode");
3354         return;
3355     }
3356 
3357     /* The legacy IRQ backend can not be set */
3358     if (strcmp(value, "xics") == 0) {
3359         spapr->irq = &spapr_irq_xics;
3360     } else if (strcmp(value, "xive") == 0) {
3361         spapr->irq = &spapr_irq_xive;
3362     } else if (strcmp(value, "dual") == 0) {
3363         spapr->irq = &spapr_irq_dual;
3364     } else {
3365         error_setg(errp, "Bad value for \"ic-mode\" property");
3366     }
3367 }
3368 
3369 static char *spapr_get_host_model(Object *obj, Error **errp)
3370 {
3371     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3372 
3373     return g_strdup(spapr->host_model);
3374 }
3375 
3376 static void spapr_set_host_model(Object *obj, const char *value, Error **errp)
3377 {
3378     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3379 
3380     g_free(spapr->host_model);
3381     spapr->host_model = g_strdup(value);
3382 }
3383 
3384 static char *spapr_get_host_serial(Object *obj, Error **errp)
3385 {
3386     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3387 
3388     return g_strdup(spapr->host_serial);
3389 }
3390 
3391 static void spapr_set_host_serial(Object *obj, const char *value, Error **errp)
3392 {
3393     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3394 
3395     g_free(spapr->host_serial);
3396     spapr->host_serial = g_strdup(value);
3397 }
3398 
3399 static void spapr_instance_init(Object *obj)
3400 {
3401     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3402     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
3403     MachineState *ms = MACHINE(spapr);
3404     MachineClass *mc = MACHINE_GET_CLASS(ms);
3405 
3406     /*
3407      * NVDIMM support went live in 5.1 without considering that, in
3408      * other archs, the user needs to enable NVDIMM support with the
3409      * 'nvdimm' machine option and the default behavior is NVDIMM
3410      * support disabled. It is too late to roll back to the standard
3411      * behavior without breaking 5.1 guests.
3412      */
3413     if (mc->nvdimm_supported) {
3414         ms->nvdimms_state->is_enabled = true;
3415     }
3416 
3417     spapr->htab_fd = -1;
3418     spapr->use_hotplug_event_source = true;
3419     spapr->kvm_type = g_strdup(DEFAULT_KVM_TYPE);
3420     object_property_add_str(obj, "kvm-type",
3421                             spapr_get_kvm_type, spapr_set_kvm_type);
3422     object_property_set_description(obj, "kvm-type",
3423                                     "Specifies the KVM virtualization mode (auto,"
3424                                     " hv, pr). Defaults to 'auto'. This mode will use"
3425                                     " any available KVM module loaded in the host,"
3426                                     " where kvm_hv takes precedence if both kvm_hv and"
3427                                     " kvm_pr are loaded.");
3428     object_property_add_bool(obj, "modern-hotplug-events",
3429                             spapr_get_modern_hotplug_events,
3430                             spapr_set_modern_hotplug_events);
3431     object_property_set_description(obj, "modern-hotplug-events",
3432                                     "Use dedicated hotplug event mechanism in"
3433                                     " place of standard EPOW events when possible"
3434                                     " (required for memory hot-unplug support)");
3435     ppc_compat_add_property(obj, "max-cpu-compat", &spapr->max_compat_pvr,
3436                             "Maximum permitted CPU compatibility mode");
3437 
3438     object_property_add_str(obj, "resize-hpt",
3439                             spapr_get_resize_hpt, spapr_set_resize_hpt);
3440     object_property_set_description(obj, "resize-hpt",
3441                                     "Resizing of the Hash Page Table (enabled, disabled, required)");
3442     object_property_add_uint32_ptr(obj, "vsmt",
3443                                    &spapr->vsmt, OBJ_PROP_FLAG_READWRITE);
3444     object_property_set_description(obj, "vsmt",
3445                                     "Virtual SMT: KVM behaves as if this were"
3446                                     " the host's SMT mode");
3447 
3448     object_property_add_bool(obj, "vfio-no-msix-emulation",
3449                              spapr_get_msix_emulation, NULL);
3450 
3451     object_property_add_uint64_ptr(obj, "kernel-addr",
3452                                    &spapr->kernel_addr, OBJ_PROP_FLAG_READWRITE);
3453     object_property_set_description(obj, "kernel-addr",
3454                                     stringify(KERNEL_LOAD_ADDR)
3455                                     " for -kernel is the default");
3456     spapr->kernel_addr = KERNEL_LOAD_ADDR;
3457 
3458     object_property_add_bool(obj, "x-vof", spapr_get_vof, spapr_set_vof);
3459     object_property_set_description(obj, "x-vof",
3460                                     "Enable Virtual Open Firmware (experimental)");
3461 
3462     /* The machine class defines the default interrupt controller mode */
3463     spapr->irq = smc->irq;
3464     object_property_add_str(obj, "ic-mode", spapr_get_ic_mode,
3465                             spapr_set_ic_mode);
3466     object_property_set_description(obj, "ic-mode",
3467                  "Specifies the interrupt controller mode (xics, xive, dual)");
3468 
3469     object_property_add_str(obj, "host-model",
3470         spapr_get_host_model, spapr_set_host_model);
3471     object_property_set_description(obj, "host-model",
3472         "Host model to advertise in guest device tree");
3473     object_property_add_str(obj, "host-serial",
3474         spapr_get_host_serial, spapr_set_host_serial);
3475     object_property_set_description(obj, "host-serial",
3476         "Host serial number to advertise in guest device tree");
3477 }
3478 
3479 static void spapr_machine_finalizefn(Object *obj)
3480 {
3481     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3482 
3483     g_free(spapr->kvm_type);
3484 }
3485 
3486 void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg)
3487 {
3488     SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
3489     CPUPPCState *env = cpu_env(cs);
3490 
3491     cpu_synchronize_state(cs);
3492     /* If FWNMI is inactive, addr will be -1, which will deliver to 0x100 */
3493     if (spapr->fwnmi_system_reset_addr != -1) {
3494         uint64_t rtas_addr, addr;
3495 
3496         /* get rtas addr from fdt */
3497         rtas_addr = spapr_get_rtas_addr();
3498         if (!rtas_addr) {
3499             qemu_system_guest_panicked(NULL);
3500             return;
3501         }
3502 
3503         addr = rtas_addr + RTAS_ERROR_LOG_MAX + cs->cpu_index * sizeof(uint64_t)*2;
3504         stq_be_phys(&address_space_memory, addr, env->gpr[3]);
3505         stq_be_phys(&address_space_memory, addr + sizeof(uint64_t), 0);
3506         env->gpr[3] = addr;
3507     }
3508     ppc_cpu_do_system_reset(cs);
3509     if (spapr->fwnmi_system_reset_addr != -1) {
3510         env->nip = spapr->fwnmi_system_reset_addr;
3511     }
3512 }
3513 
3514 static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
3515 {
3516     CPUState *cs;
3517 
3518     CPU_FOREACH(cs) {
3519         async_run_on_cpu(cs, spapr_do_system_reset_on_cpu, RUN_ON_CPU_NULL);
3520     }
3521 }
3522 
3523 int spapr_lmb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3524                           void *fdt, int *fdt_start_offset, Error **errp)
3525 {
3526     uint64_t addr;
3527     uint32_t node;
3528 
3529     addr = spapr_drc_index(drc) * SPAPR_MEMORY_BLOCK_SIZE;
3530     node = object_property_get_uint(OBJECT(drc->dev), PC_DIMM_NODE_PROP,
3531                                     &error_abort);
3532     *fdt_start_offset = spapr_dt_memory_node(spapr, fdt, node, addr,
3533                                              SPAPR_MEMORY_BLOCK_SIZE);
3534     return 0;
3535 }
3536 
3537 static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
3538                            bool dedicated_hp_event_source)
3539 {
3540     SpaprDrc *drc;
3541     uint32_t nr_lmbs = size/SPAPR_MEMORY_BLOCK_SIZE;
3542     int i;
3543     uint64_t addr = addr_start;
3544     bool hotplugged = spapr_drc_hotplugged(dev);
3545 
3546     for (i = 0; i < nr_lmbs; i++) {
3547         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3548                               addr / SPAPR_MEMORY_BLOCK_SIZE);
3549         g_assert(drc);
3550 
3551         /*
3552          * memory_device_get_free_addr() provided a range of free addresses
3553          * that doesn't overlap with any existing mapping at pre-plug. The
3554          * corresponding LMB DRCs are thus assumed to be all attachable.
3555          */
3556         spapr_drc_attach(drc, dev);
3557         if (!hotplugged) {
3558             spapr_drc_reset(drc);
3559         }
3560         addr += SPAPR_MEMORY_BLOCK_SIZE;
3561     }
3562     /* send hotplug notification to the
3563      * guest only in case of hotplugged memory
3564      */
3565     if (hotplugged) {
3566         if (dedicated_hp_event_source) {
3567             drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3568                                   addr_start / SPAPR_MEMORY_BLOCK_SIZE);
3569             g_assert(drc);
3570             spapr_hotplug_req_add_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
3571                                                    nr_lmbs,
3572                                                    spapr_drc_index(drc));
3573         } else {
3574             spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB,
3575                                            nr_lmbs);
3576         }
3577     }
3578 }
3579 
3580 static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
3581 {
3582     SpaprMachineState *ms = SPAPR_MACHINE(hotplug_dev);
3583     PCDIMMDevice *dimm = PC_DIMM(dev);
3584     uint64_t size, addr;
3585     int64_t slot;
3586     bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
3587 
3588     size = memory_device_get_region_size(MEMORY_DEVICE(dev), &error_abort);
3589 
3590     pc_dimm_plug(dimm, MACHINE(ms));
3591 
3592     if (!is_nvdimm) {
3593         addr = object_property_get_uint(OBJECT(dimm),
3594                                         PC_DIMM_ADDR_PROP, &error_abort);
3595         spapr_add_lmbs(dev, addr, size,
3596                        spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT));
3597     } else {
3598         slot = object_property_get_int(OBJECT(dimm),
3599                                        PC_DIMM_SLOT_PROP, &error_abort);
3600         /* We should have valid slot number at this point */
3601         g_assert(slot >= 0);
3602         spapr_add_nvdimm(dev, slot);
3603     }
3604 }
3605 
3606 static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3607                                   Error **errp)
3608 {
3609     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3610     bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
3611     PCDIMMDevice *dimm = PC_DIMM(dev);
3612     Error *local_err = NULL;
3613     uint64_t size;
3614     Object *memdev;
3615     hwaddr pagesize;
3616 
3617     size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &local_err);
3618     if (local_err) {
3619         error_propagate(errp, local_err);
3620         return;
3621     }
3622 
3623     if (is_nvdimm) {
3624         if (!spapr_nvdimm_validate(hotplug_dev, NVDIMM(dev), size, errp)) {
3625             return;
3626         }
3627     } else if (size % SPAPR_MEMORY_BLOCK_SIZE) {
3628         error_setg(errp, "Hotplugged memory size must be a multiple of "
3629                    "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
3630         return;
3631     }
3632 
3633     memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
3634                                       &error_abort);
3635     pagesize = host_memory_backend_pagesize(MEMORY_BACKEND(memdev));
3636     if (!spapr_check_pagesize(spapr, pagesize, errp)) {
3637         return;
3638     }
3639 
3640     pc_dimm_pre_plug(dimm, MACHINE(hotplug_dev), errp);
3641 }
3642 
/*
 * Bookkeeping for one DIMM that has an unplug in flight. Entries are linked
 * on the machine's pending_dimm_unplugs list.
 */
3643 struct SpaprDimmState {
3644     PCDIMMDevice *dimm; /* DIMM being unplugged; used as the lookup key */
3645     uint32_t nr_lmbs; /* LMB counter, decremented by spapr_lmb_release() */
3646     QTAILQ_ENTRY(SpaprDimmState) next; /* pending_dimm_unplugs linkage */
3647 };
3648 
3649 static SpaprDimmState *spapr_pending_dimm_unplugs_find(SpaprMachineState *s,
3650                                                        PCDIMMDevice *dimm)
3651 {
3652     SpaprDimmState *dimm_state = NULL;
3653 
3654     QTAILQ_FOREACH(dimm_state, &s->pending_dimm_unplugs, next) {
3655         if (dimm_state->dimm == dimm) {
3656             break;
3657         }
3658     }
3659     return dimm_state;
3660 }
3661 
3662 static SpaprDimmState *spapr_pending_dimm_unplugs_add(SpaprMachineState *spapr,
3663                                                       uint32_t nr_lmbs,
3664                                                       PCDIMMDevice *dimm)
3665 {
3666     SpaprDimmState *ds = NULL;
3667 
3668     /*
3669      * If this request is for a DIMM whose removal had failed earlier
3670      * (due to guest's refusal to remove the LMBs), we would have this
3671      * dimm already in the pending_dimm_unplugs list. In that
3672      * case don't add again.
3673      */
3674     ds = spapr_pending_dimm_unplugs_find(spapr, dimm);
3675     if (!ds) {
3676         ds = g_new0(SpaprDimmState, 1);
3677         ds->nr_lmbs = nr_lmbs;
3678         ds->dimm = dimm;
3679         QTAILQ_INSERT_HEAD(&spapr->pending_dimm_unplugs, ds, next);
3680     }
3681     return ds;
3682 }
3683 
3684 static void spapr_pending_dimm_unplugs_remove(SpaprMachineState *spapr,
3685                                               SpaprDimmState *dimm_state)
3686 {
3687     QTAILQ_REMOVE(&spapr->pending_dimm_unplugs, dimm_state, next);
3688     g_free(dimm_state);
3689 }
3690 
3691 static SpaprDimmState *spapr_recover_pending_dimm_state(SpaprMachineState *ms,
3692                                                         PCDIMMDevice *dimm)
3693 {
3694     SpaprDrc *drc;
3695     uint64_t size = memory_device_get_region_size(MEMORY_DEVICE(dimm),
3696                                                   &error_abort);
3697     uint32_t nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3698     uint32_t avail_lmbs = 0;
3699     uint64_t addr_start, addr;
3700     int i;
3701 
3702     addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3703                                           &error_abort);
3704 
3705     addr = addr_start;
3706     for (i = 0; i < nr_lmbs; i++) {
3707         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3708                               addr / SPAPR_MEMORY_BLOCK_SIZE);
3709         g_assert(drc);
3710         if (drc->dev) {
3711             avail_lmbs++;
3712         }
3713         addr += SPAPR_MEMORY_BLOCK_SIZE;
3714     }
3715 
3716     return spapr_pending_dimm_unplugs_add(ms, avail_lmbs, dimm);
3717 }
3718 
3719 void spapr_memory_unplug_rollback(SpaprMachineState *spapr, DeviceState *dev)
3720 {
3721     SpaprDimmState *ds;
3722     PCDIMMDevice *dimm;
3723     SpaprDrc *drc;
3724     uint32_t nr_lmbs;
3725     uint64_t size, addr_start, addr;
3726     int i;
3727 
3728     if (!dev) {
3729         return;
3730     }
3731 
3732     dimm = PC_DIMM(dev);
3733     ds = spapr_pending_dimm_unplugs_find(spapr, dimm);
3734 
3735     /*
3736      * 'ds == NULL' would mean that the DIMM doesn't have a pending
3737      * unplug state, but one of its DRC is marked as unplug_requested.
3738      * This is bad and weird enough to g_assert() out.
3739      */
3740     g_assert(ds);
3741 
3742     spapr_pending_dimm_unplugs_remove(spapr, ds);
3743 
3744     size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort);
3745     nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3746 
3747     addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3748                                           &error_abort);
3749 
3750     addr = addr_start;
3751     for (i = 0; i < nr_lmbs; i++) {
3752         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3753                               addr / SPAPR_MEMORY_BLOCK_SIZE);
3754         g_assert(drc);
3755 
3756         drc->unplug_requested = false;
3757         addr += SPAPR_MEMORY_BLOCK_SIZE;
3758     }
3759 
3760     /*
3761      * Tell QAPI that something happened and the memory
3762      * hotunplug wasn't successful.
3763      */
3764     qapi_event_send_device_unplug_guest_error(dev->id,
3765                                               dev->canonical_path);
3766 }
3767 
3768 /* Callback to be called during DRC release. */
3769 void spapr_lmb_release(DeviceState *dev)
3770 {
3771     HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
3772     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_ctrl);
3773     SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
3774 
3775     /* This information will get lost if a migration occurs
3776      * during the unplug process. In this case recover it. */
3777     if (ds == NULL) {
3778         ds = spapr_recover_pending_dimm_state(spapr, PC_DIMM(dev));
3779         g_assert(ds);
3780         /* The DRC being examined by the caller at least must be counted */
3781         g_assert(ds->nr_lmbs);
3782     }
3783 
3784     if (--ds->nr_lmbs) {
3785         return;
3786     }
3787 
3788     /*
3789      * Now that all the LMBs have been removed by the guest, call the
3790      * unplug handler chain. This can never fail.
3791      */
3792     hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
3793     object_unparent(OBJECT(dev));
3794 }
3795 
3796 static void spapr_memory_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
3797 {
3798     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3799     SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
3800 
3801     /* We really shouldn't get this far without anything to unplug */
3802     g_assert(ds);
3803 
3804     pc_dimm_unplug(PC_DIMM(dev), MACHINE(hotplug_dev));
3805     qdev_unrealize(dev);
3806     spapr_pending_dimm_unplugs_remove(spapr, ds);
3807 }
3808 
3809 static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev,
3810                                         DeviceState *dev, Error **errp)
3811 {
3812     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3813     PCDIMMDevice *dimm = PC_DIMM(dev);
3814     uint32_t nr_lmbs;
3815     uint64_t size, addr_start, addr;
3816     int i;
3817     SpaprDrc *drc;
3818 
3819     if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) {
3820         error_setg(errp, "nvdimm device hot unplug is not supported yet.");
3821         return;
3822     }
3823 
3824     size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort);
3825     nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3826 
3827     addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3828                                           &error_abort);
3829 
3830     /*
3831      * An existing pending dimm state for this DIMM means that there is an
3832      * unplug operation in progress, waiting for the spapr_lmb_release
3833      * callback to complete the job (BQL can't cover that far). In this case,
3834      * bail out to avoid detaching DRCs that were already released.
3835      */
3836     if (spapr_pending_dimm_unplugs_find(spapr, dimm)) {
3837         error_setg(errp, "Memory unplug already in progress for device %s",
3838                    dev->id);
3839         return;
3840     }
3841 
3842     spapr_pending_dimm_unplugs_add(spapr, nr_lmbs, dimm);
3843 
3844     addr = addr_start;
3845     for (i = 0; i < nr_lmbs; i++) {
3846         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3847                               addr / SPAPR_MEMORY_BLOCK_SIZE);
3848         g_assert(drc);
3849 
3850         spapr_drc_unplug_request(drc);
3851         addr += SPAPR_MEMORY_BLOCK_SIZE;
3852     }
3853 
3854     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3855                           addr_start / SPAPR_MEMORY_BLOCK_SIZE);
3856     spapr_hotplug_req_remove_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
3857                                               nr_lmbs, spapr_drc_index(drc));
3858 }
3859 
3860 /* Callback to be called during DRC release. */
3861 void spapr_core_release(DeviceState *dev)
3862 {
3863     HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
3864 
3865     /* Call the unplug handler chain. This can never fail. */
3866     hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
3867     object_unparent(OBJECT(dev));
3868 }
3869 
3870 static void spapr_core_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
3871 {
3872     MachineState *ms = MACHINE(hotplug_dev);
3873     CPUCore *cc = CPU_CORE(dev);
3874     CPUArchId *core_slot = spapr_find_cpu_slot(ms, cc->core_id, NULL);
3875 
3876     assert(core_slot);
3877     core_slot->cpu = NULL;
3878     qdev_unrealize(dev);
3879 }
3880 
3881 static
3882 void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev,
3883                                Error **errp)
3884 {
3885     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3886     int index;
3887     SpaprDrc *drc;
3888     CPUCore *cc = CPU_CORE(dev);
3889 
3890     if (!spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index)) {
3891         error_setg(errp, "Unable to find CPU core with core-id: %d",
3892                    cc->core_id);
3893         return;
3894     }
3895     if (index == 0) {
3896         error_setg(errp, "Boot CPU core may not be unplugged");
3897         return;
3898     }
3899 
3900     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
3901                           spapr_vcpu_id(spapr, cc->core_id));
3902     g_assert(drc);
3903 
3904     if (!spapr_drc_unplug_requested(drc)) {
3905         spapr_drc_unplug_request(drc);
3906     }
3907 
3908     /*
3909      * spapr_hotplug_req_remove_by_index is left unguarded, out of the
3910      * "!spapr_drc_unplug_requested" check, to allow for multiple IRQ
3911      * pulses removing the same CPU. Otherwise, in an failed hotunplug
3912      * attempt (e.g. the kernel will refuse to remove the last online
3913      * CPU), we will never attempt it again because unplug_requested
3914      * will still be 'true' in that case.
3915      */
3916     spapr_hotplug_req_remove_by_index(drc);
3917 }
3918 
3919 int spapr_core_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3920                            void *fdt, int *fdt_start_offset, Error **errp)
3921 {
3922     SpaprCpuCore *core = SPAPR_CPU_CORE(drc->dev);
3923     CPUState *cs = CPU(core->threads[0]);
3924     PowerPCCPU *cpu = POWERPC_CPU(cs);
3925     DeviceClass *dc = DEVICE_GET_CLASS(cs);
3926     int id = spapr_get_vcpu_id(cpu);
3927     g_autofree char *nodename = NULL;
3928     int offset;
3929 
3930     nodename = g_strdup_printf("%s@%x", dc->fw_name, id);
3931     offset = fdt_add_subnode(fdt, 0, nodename);
3932 
3933     spapr_dt_cpu(cs, fdt, offset, spapr);
3934 
3935     /*
3936      * spapr_dt_cpu() does not fill the 'name' property in the
3937      * CPU node. The function is called during boot process, before
3938      * and after CAS, and overwriting the 'name' property written
3939      * by SLOF is not allowed.
3940      *
3941      * Write it manually after spapr_dt_cpu(). This makes the hotplug
3942      * CPUs more compatible with the coldplugged ones, which have
3943      * the 'name' property. Linux Kernel also relies on this
3944      * property to identify CPU nodes.
3945      */
3946     _FDT((fdt_setprop_string(fdt, offset, "name", nodename)));
3947 
3948     *fdt_start_offset = offset;
3949     return 0;
3950 }
3951 
3952 static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
3953 {
3954     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3955     MachineClass *mc = MACHINE_GET_CLASS(spapr);
3956     SpaprCpuCore *core = SPAPR_CPU_CORE(OBJECT(dev));
3957     CPUCore *cc = CPU_CORE(dev);
3958     SpaprDrc *drc;
3959     CPUArchId *core_slot;
3960     int index;
3961     bool hotplugged = spapr_drc_hotplugged(dev);
3962     int i;
3963 
3964     core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index);
3965     g_assert(core_slot); /* Already checked in spapr_core_pre_plug() */
3966 
3967     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
3968                           spapr_vcpu_id(spapr, cc->core_id));
3969 
3970     g_assert(drc || !mc->has_hotpluggable_cpus);
3971 
3972     if (drc) {
3973         /*
3974          * spapr_core_pre_plug() already buys us this is a brand new
3975          * core being plugged into a free slot. Nothing should already
3976          * be attached to the corresponding DRC.
3977          */
3978         spapr_drc_attach(drc, dev);
3979 
3980         if (hotplugged) {
3981             /*
3982              * Send hotplug notification interrupt to the guest only
3983              * in case of hotplugged CPUs.
3984              */
3985             spapr_hotplug_req_add_by_index(drc);
3986         } else {
3987             spapr_drc_reset(drc);
3988         }
3989     }
3990 
3991     core_slot->cpu = CPU(dev);
3992 
3993     /*
3994      * Set compatibility mode to match the boot CPU, which was either set
3995      * by the machine reset code or by CAS. This really shouldn't fail at
3996      * this point.
3997      */
3998     if (hotplugged) {
3999         for (i = 0; i < cc->nr_threads; i++) {
4000             ppc_set_compat(core->threads[i], POWERPC_CPU(first_cpu)->compat_pvr,
4001                            &error_abort);
4002         }
4003     }
4004 
4005 }
4006 
4007 static void spapr_core_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
4008                                 Error **errp)
4009 {
4010     MachineState *machine = MACHINE(OBJECT(hotplug_dev));
4011     MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
4012     CPUCore *cc = CPU_CORE(dev);
4013     const char *base_core_type = spapr_get_cpu_core_type(machine->cpu_type);
4014     const char *type = object_get_typename(OBJECT(dev));
4015     CPUArchId *core_slot;
4016     int index;
4017     unsigned int smp_threads = machine->smp.threads;
4018 
4019     if (dev->hotplugged && !mc->has_hotpluggable_cpus) {
4020         error_setg(errp, "CPU hotplug not supported for this machine");
4021         return;
4022     }
4023 
4024     if (strcmp(base_core_type, type)) {
4025         error_setg(errp, "CPU core type should be %s", base_core_type);
4026         return;
4027     }
4028 
4029     if (cc->core_id % smp_threads) {
4030         error_setg(errp, "invalid core id %d", cc->core_id);
4031         return;
4032     }
4033 
4034     /*
4035      * In general we should have homogeneous threads-per-core, but old
4036      * (pre hotplug support) machine types allow the last core to have
4037      * reduced threads as a compatibility hack for when we allowed
4038      * total vcpus not a multiple of threads-per-core.
4039      */
4040     if (mc->has_hotpluggable_cpus && (cc->nr_threads != smp_threads)) {
4041         error_setg(errp, "invalid nr-threads %d, must be %d", cc->nr_threads,
4042                    smp_threads);
4043         return;
4044     }
4045 
4046     core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index);
4047     if (!core_slot) {
4048         error_setg(errp, "core id %d out of range", cc->core_id);
4049         return;
4050     }
4051 
4052     if (core_slot->cpu) {
4053         error_setg(errp, "core %d already populated", cc->core_id);
4054         return;
4055     }
4056 
4057     numa_cpu_pre_plug(core_slot, dev, errp);
4058 }
4059 
4060 int spapr_phb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
4061                           void *fdt, int *fdt_start_offset, Error **errp)
4062 {
4063     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(drc->dev);
4064     int intc_phandle;
4065 
4066     intc_phandle = spapr_irq_get_phandle(spapr, spapr->fdt_blob, errp);
4067     if (intc_phandle <= 0) {
4068         return -1;
4069     }
4070 
4071     if (spapr_dt_phb(spapr, sphb, intc_phandle, fdt, fdt_start_offset)) {
4072         error_setg(errp, "unable to create FDT node for PHB %d", sphb->index);
4073         return -1;
4074     }
4075 
4076     /* generally SLOF creates these, for hotplug it's up to QEMU */
4077     _FDT(fdt_setprop_string(fdt, *fdt_start_offset, "name", "pci"));
4078 
4079     return 0;
4080 }
4081 
4082 static bool spapr_phb_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
4083                                Error **errp)
4084 {
4085     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4086     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4087     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
4088     const unsigned windows_supported = spapr_phb_windows_supported(sphb);
4089     SpaprDrc *drc;
4090 
4091     if (dev->hotplugged && !smc->dr_phb_enabled) {
4092         error_setg(errp, "PHB hotplug not supported for this machine");
4093         return false;
4094     }
4095 
4096     if (sphb->index == (uint32_t)-1) {
4097         error_setg(errp, "\"index\" for PAPR PHB is mandatory");
4098         return false;
4099     }
4100 
4101     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4102     if (drc && drc->dev) {
4103         error_setg(errp, "PHB %d already attached", sphb->index);
4104         return false;
4105     }
4106 
4107     /*
4108      * This will check that sphb->index doesn't exceed the maximum number of
4109      * PHBs for the current machine type.
4110      */
4111     return
4112         smc->phb_placement(spapr, sphb->index,
4113                            &sphb->buid, &sphb->io_win_addr,
4114                            &sphb->mem_win_addr, &sphb->mem64_win_addr,
4115                            windows_supported, sphb->dma_liobn,
4116                            errp);
4117 }
4118 
4119 static void spapr_phb_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
4120 {
4121     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4122     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
4123     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4124     SpaprDrc *drc;
4125     bool hotplugged = spapr_drc_hotplugged(dev);
4126 
4127     if (!smc->dr_phb_enabled) {
4128         return;
4129     }
4130 
4131     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4132     /* hotplug hooks should check it's enabled before getting this far */
4133     assert(drc);
4134 
4135     /* spapr_phb_pre_plug() already checked the DRC is attachable */
4136     spapr_drc_attach(drc, dev);
4137 
4138     if (hotplugged) {
4139         spapr_hotplug_req_add_by_index(drc);
4140     } else {
4141         spapr_drc_reset(drc);
4142     }
4143 }
4144 
4145 void spapr_phb_release(DeviceState *dev)
4146 {
4147     HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
4148 
4149     hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
4150     object_unparent(OBJECT(dev));
4151 }
4152 
4153 static void spapr_phb_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
4154 {
4155     qdev_unrealize(dev);
4156 }
4157 
4158 static void spapr_phb_unplug_request(HotplugHandler *hotplug_dev,
4159                                      DeviceState *dev, Error **errp)
4160 {
4161     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4162     SpaprDrc *drc;
4163 
4164     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4165     assert(drc);
4166 
4167     if (!spapr_drc_unplug_requested(drc)) {
4168         spapr_drc_unplug_request(drc);
4169         spapr_hotplug_req_remove_by_index(drc);
4170     } else {
4171         error_setg(errp,
4172                    "PCI Host Bridge unplug already in progress for device %s",
4173                    dev->id);
4174     }
4175 }
4176 
4177 static
4178 bool spapr_tpm_proxy_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
4179                               Error **errp)
4180 {
4181     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4182 
4183     if (spapr->tpm_proxy != NULL) {
4184         error_setg(errp, "Only one TPM proxy can be specified for this machine");
4185         return false;
4186     }
4187 
4188     return true;
4189 }
4190 
4191 static void spapr_tpm_proxy_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
4192 {
4193     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4194     SpaprTpmProxy *tpm_proxy = SPAPR_TPM_PROXY(dev);
4195 
4196     /* Already checked in spapr_tpm_proxy_pre_plug() */
4197     g_assert(spapr->tpm_proxy == NULL);
4198 
4199     spapr->tpm_proxy = tpm_proxy;
4200 }
4201 
4202 static void spapr_tpm_proxy_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
4203 {
4204     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4205 
4206     qdev_unrealize(dev);
4207     object_unparent(OBJECT(dev));
4208     spapr->tpm_proxy = NULL;
4209 }
4210 
4211 static void spapr_machine_device_plug(HotplugHandler *hotplug_dev,
4212                                       DeviceState *dev, Error **errp)
4213 {
4214     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4215         spapr_memory_plug(hotplug_dev, dev);
4216     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4217         spapr_core_plug(hotplug_dev, dev);
4218     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4219         spapr_phb_plug(hotplug_dev, dev);
4220     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4221         spapr_tpm_proxy_plug(hotplug_dev, dev);
4222     }
4223 }
4224 
4225 static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev,
4226                                         DeviceState *dev, Error **errp)
4227 {
4228     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4229         spapr_memory_unplug(hotplug_dev, dev);
4230     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4231         spapr_core_unplug(hotplug_dev, dev);
4232     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4233         spapr_phb_unplug(hotplug_dev, dev);
4234     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4235         spapr_tpm_proxy_unplug(hotplug_dev, dev);
4236     }
4237 }
4238 
4239 bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr)
4240 {
4241     return spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT) ||
4242         /*
4243          * CAS will process all pending unplug requests.
4244          *
4245          * HACK: a guest could theoretically have cleared all bits in OV5,
4246          * but none of the guests we care for do.
4247          */
4248         spapr_ovec_empty(spapr->ov5_cas);
4249 }
4250 
4251 static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev,
4252                                                 DeviceState *dev, Error **errp)
4253 {
4254     SpaprMachineState *sms = SPAPR_MACHINE(OBJECT(hotplug_dev));
4255     MachineClass *mc = MACHINE_GET_CLASS(sms);
4256     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4257 
4258     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4259         if (spapr_memory_hot_unplug_supported(sms)) {
4260             spapr_memory_unplug_request(hotplug_dev, dev, errp);
4261         } else {
4262             error_setg(errp, "Memory hot unplug not supported for this guest");
4263         }
4264     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4265         if (!mc->has_hotpluggable_cpus) {
4266             error_setg(errp, "CPU hot unplug not supported on this machine");
4267             return;
4268         }
4269         spapr_core_unplug_request(hotplug_dev, dev, errp);
4270     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4271         if (!smc->dr_phb_enabled) {
4272             error_setg(errp, "PHB hot unplug not supported on this machine");
4273             return;
4274         }
4275         spapr_phb_unplug_request(hotplug_dev, dev, errp);
4276     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4277         spapr_tpm_proxy_unplug(hotplug_dev, dev);
4278     }
4279 }
4280 
4281 static void spapr_machine_device_pre_plug(HotplugHandler *hotplug_dev,
4282                                           DeviceState *dev, Error **errp)
4283 {
4284     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4285         spapr_memory_pre_plug(hotplug_dev, dev, errp);
4286     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4287         spapr_core_pre_plug(hotplug_dev, dev, errp);
4288     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4289         spapr_phb_pre_plug(hotplug_dev, dev, errp);
4290     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4291         spapr_tpm_proxy_pre_plug(hotplug_dev, dev, errp);
4292     }
4293 }
4294 
4295 static HotplugHandler *spapr_get_hotplug_handler(MachineState *machine,
4296                                                  DeviceState *dev)
4297 {
4298     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) ||
4299         object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE) ||
4300         object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE) ||
4301         object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4302         return HOTPLUG_HANDLER(machine);
4303     }
4304     if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
4305         PCIDevice *pcidev = PCI_DEVICE(dev);
4306         PCIBus *root = pci_device_root_bus(pcidev);
4307         SpaprPhbState *phb =
4308             (SpaprPhbState *)object_dynamic_cast(OBJECT(BUS(root)->parent),
4309                                                  TYPE_SPAPR_PCI_HOST_BRIDGE);
4310 
4311         if (phb) {
4312             return HOTPLUG_HANDLER(phb);
4313         }
4314     }
4315     return NULL;
4316 }
4317 
4318 static CpuInstanceProperties
4319 spapr_cpu_index_to_props(MachineState *machine, unsigned cpu_index)
4320 {
4321     CPUArchId *core_slot;
4322     MachineClass *mc = MACHINE_GET_CLASS(machine);
4323 
4324     /* make sure possible_cpu are initialized */
4325     mc->possible_cpu_arch_ids(machine);
4326     /* get CPU core slot containing thread that matches cpu_index */
4327     core_slot = spapr_find_cpu_slot(machine, cpu_index, NULL);
4328     assert(core_slot);
4329     return core_slot->props;
4330 }
4331 
4332 static int64_t spapr_get_default_cpu_node_id(const MachineState *ms, int idx)
4333 {
4334     return idx / ms->smp.cores % ms->numa_state->num_nodes;
4335 }
4336 
4337 static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine)
4338 {
4339     int i;
4340     unsigned int smp_threads = machine->smp.threads;
4341     unsigned int smp_cpus = machine->smp.cpus;
4342     const char *core_type;
4343     int spapr_max_cores = machine->smp.max_cpus / smp_threads;
4344     MachineClass *mc = MACHINE_GET_CLASS(machine);
4345 
4346     if (!mc->has_hotpluggable_cpus) {
4347         spapr_max_cores = QEMU_ALIGN_UP(smp_cpus, smp_threads) / smp_threads;
4348     }
4349     if (machine->possible_cpus) {
4350         assert(machine->possible_cpus->len == spapr_max_cores);
4351         return machine->possible_cpus;
4352     }
4353 
4354     core_type = spapr_get_cpu_core_type(machine->cpu_type);
4355     if (!core_type) {
4356         error_report("Unable to find sPAPR CPU Core definition");
4357         exit(1);
4358     }
4359 
4360     machine->possible_cpus = g_malloc0(sizeof(CPUArchIdList) +
4361                              sizeof(CPUArchId) * spapr_max_cores);
4362     machine->possible_cpus->len = spapr_max_cores;
4363     for (i = 0; i < machine->possible_cpus->len; i++) {
4364         int core_id = i * smp_threads;
4365 
4366         machine->possible_cpus->cpus[i].type = core_type;
4367         machine->possible_cpus->cpus[i].vcpus_count = smp_threads;
4368         machine->possible_cpus->cpus[i].arch_id = core_id;
4369         machine->possible_cpus->cpus[i].props.has_core_id = true;
4370         machine->possible_cpus->cpus[i].props.core_id = core_id;
4371     }
4372     return machine->possible_cpus;
4373 }
4374 
4375 static bool spapr_phb_placement(SpaprMachineState *spapr, uint32_t index,
4376                                 uint64_t *buid, hwaddr *pio,
4377                                 hwaddr *mmio32, hwaddr *mmio64,
4378                                 unsigned n_dma, uint32_t *liobns, Error **errp)
4379 {
4380     /*
4381      * New-style PHB window placement.
4382      *
4383      * Goals: Gives large (1TiB), naturally aligned 64-bit MMIO window
4384      * for each PHB, in addition to 2GiB 32-bit MMIO and 64kiB PIO
4385      * windows.
4386      *
4387      * Some guest kernels can't work with MMIO windows above 1<<46
4388      * (64TiB), so we place up to 31 PHBs in the area 32TiB..64TiB
4389      *
4390      * 32TiB..(33TiB+1984kiB) contains the 64kiB PIO windows for each
4391      * PHB stacked together.  (32TiB+2GiB)..(32TiB+64GiB) contains the
4392      * 2GiB 32-bit MMIO windows for each PHB.  Then 33..64TiB has the
4393      * 1TiB 64-bit MMIO windows for each PHB.
4394      */
4395     const uint64_t base_buid = 0x800000020000000ULL;
4396     int i;
4397 
4398     /* Sanity check natural alignments */
4399     QEMU_BUILD_BUG_ON((SPAPR_PCI_BASE % SPAPR_PCI_MEM64_WIN_SIZE) != 0);
4400     QEMU_BUILD_BUG_ON((SPAPR_PCI_LIMIT % SPAPR_PCI_MEM64_WIN_SIZE) != 0);
4401     QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM64_WIN_SIZE % SPAPR_PCI_MEM32_WIN_SIZE) != 0);
4402     QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM32_WIN_SIZE % SPAPR_PCI_IO_WIN_SIZE) != 0);
4403     /* Sanity check bounds */
4404     QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_IO_WIN_SIZE) >
4405                       SPAPR_PCI_MEM32_WIN_SIZE);
4406     QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_MEM32_WIN_SIZE) >
4407                       SPAPR_PCI_MEM64_WIN_SIZE);
4408 
4409     if (index >= SPAPR_MAX_PHBS) {
4410         error_setg(errp, "\"index\" for PAPR PHB is too large (max %llu)",
4411                    SPAPR_MAX_PHBS - 1);
4412         return false;
4413     }
4414 
4415     *buid = base_buid + index;
4416     for (i = 0; i < n_dma; ++i) {
4417         liobns[i] = SPAPR_PCI_LIOBN(index, i);
4418     }
4419 
4420     *pio = SPAPR_PCI_BASE + index * SPAPR_PCI_IO_WIN_SIZE;
4421     *mmio32 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM32_WIN_SIZE;
4422     *mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE;
4423     return true;
4424 }
4425 
4426 static ICSState *spapr_ics_get(XICSFabric *dev, int irq)
4427 {
4428     SpaprMachineState *spapr = SPAPR_MACHINE(dev);
4429 
4430     return ics_valid_irq(spapr->ics, irq) ? spapr->ics : NULL;
4431 }
4432 
4433 static void spapr_ics_resend(XICSFabric *dev)
4434 {
4435     SpaprMachineState *spapr = SPAPR_MACHINE(dev);
4436 
4437     ics_resend(spapr->ics);
4438 }
4439 
4440 static ICPState *spapr_icp_get(XICSFabric *xi, int vcpu_id)
4441 {
4442     PowerPCCPU *cpu = spapr_find_cpu(vcpu_id);
4443 
4444     return cpu ? spapr_cpu_state(cpu)->icp : NULL;
4445 }
4446 
4447 static void spapr_pic_print_info(InterruptStatsProvider *obj, GString *buf)
4448 {
4449     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
4450 
4451     spapr_irq_print_info(spapr, buf);
4452     g_string_append_printf(buf, "irqchip: %s\n",
4453                            kvm_irqchip_in_kernel() ? "in-kernel" : "emulated");
4454 }
4455 
4456 /*
4457  * This is a XIVE only operation
4458  */
4459 static int spapr_match_nvt(XiveFabric *xfb, uint8_t format,
4460                            uint8_t nvt_blk, uint32_t nvt_idx,
4461                            bool crowd, bool cam_ignore, uint8_t priority,
4462                            uint32_t logic_serv, XiveTCTXMatch *match)
4463 {
4464     SpaprMachineState *spapr = SPAPR_MACHINE(xfb);
4465     XivePresenter *xptr = XIVE_PRESENTER(spapr->active_intc);
4466     XivePresenterClass *xpc = XIVE_PRESENTER_GET_CLASS(xptr);
4467     int count;
4468 
4469     count = xpc->match_nvt(xptr, format, nvt_blk, nvt_idx, crowd, cam_ignore,
4470                            priority, logic_serv, match);
4471     if (count < 0) {
4472         return count;
4473     }
4474 
4475     /*
4476      * When we implement the save and restore of the thread interrupt
4477      * contexts in the enter/exit CPU handlers of the machine and the
4478      * escalations in QEMU, we should be able to handle non dispatched
4479      * vCPUs.
4480      *
4481      * Until this is done, the sPAPR machine should find at least one
4482      * matching context always.
4483      */
4484     if (count == 0) {
4485         qemu_log_mask(LOG_GUEST_ERROR, "XIVE: NVT %x/%x is not dispatched\n",
4486                       nvt_blk, nvt_idx);
4487     }
4488 
4489     return count;
4490 }
4491 
/* Returns the vcpu id stored in @cpu (see spapr_set_vcpu_id()). */
int spapr_get_vcpu_id(PowerPCCPU *cpu)
{
    return cpu->vcpu_id;
}
4496 
4497 bool spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp)
4498 {
4499     SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
4500     MachineState *ms = MACHINE(spapr);
4501     int vcpu_id;
4502 
4503     vcpu_id = spapr_vcpu_id(spapr, cpu_index);
4504 
4505     if (kvm_enabled() && !kvm_vcpu_id_is_valid(vcpu_id)) {
4506         error_setg(errp, "Can't create CPU with id %d in KVM", vcpu_id);
4507         error_append_hint(errp, "Adjust the number of cpus to %d "
4508                           "or try to raise the number of threads per core\n",
4509                           vcpu_id * ms->smp.threads / spapr->vsmt);
4510         return false;
4511     }
4512 
4513     cpu->vcpu_id = vcpu_id;
4514     return true;
4515 }
4516 
4517 PowerPCCPU *spapr_find_cpu(int vcpu_id)
4518 {
4519     CPUState *cs;
4520 
4521     CPU_FOREACH(cs) {
4522         PowerPCCPU *cpu = POWERPC_CPU(cs);
4523 
4524         if (spapr_get_vcpu_id(cpu) == vcpu_id) {
4525             return cpu;
4526         }
4527     }
4528 
4529     return NULL;
4530 }
4531 
4532 static bool spapr_cpu_in_nested(PowerPCCPU *cpu)
4533 {
4534     SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
4535 
4536     return spapr_cpu->in_nested;
4537 }
4538 
4539 static void spapr_cpu_exec_enter(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
4540 {
4541     SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
4542 
4543     /* These are only called by TCG, KVM maintains dispatch state */
4544 
4545     spapr_cpu->prod = false;
4546     if (spapr_cpu->vpa_addr) {
4547         CPUState *cs = CPU(cpu);
4548         uint32_t dispatch;
4549 
4550         dispatch = ldl_be_phys(cs->as,
4551                                spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER);
4552         dispatch++;
4553         if ((dispatch & 1) != 0) {
4554             qemu_log_mask(LOG_GUEST_ERROR,
4555                           "VPA: incorrect dispatch counter value for "
4556                           "dispatched partition %u, correcting.\n", dispatch);
4557             dispatch++;
4558         }
4559         stl_be_phys(cs->as,
4560                     spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER, dispatch);
4561     }
4562 }
4563 
4564 static void spapr_cpu_exec_exit(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
4565 {
4566     SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
4567 
4568     if (spapr_cpu->vpa_addr) {
4569         CPUState *cs = CPU(cpu);
4570         uint32_t dispatch;
4571 
4572         dispatch = ldl_be_phys(cs->as,
4573                                spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER);
4574         dispatch++;
4575         if ((dispatch & 1) != 1) {
4576             qemu_log_mask(LOG_GUEST_ERROR,
4577                           "VPA: incorrect dispatch counter value for "
4578                           "preempted partition %u, correcting.\n", dispatch);
4579             dispatch++;
4580         }
4581         stl_be_phys(cs->as,
4582                     spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER, dispatch);
4583     }
4584 }
4585 
4586 static void spapr_machine_class_init(ObjectClass *oc, void *data)
4587 {
4588     MachineClass *mc = MACHINE_CLASS(oc);
4589     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(oc);
4590     FWPathProviderClass *fwc = FW_PATH_PROVIDER_CLASS(oc);
4591     NMIClass *nc = NMI_CLASS(oc);
4592     HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
4593     PPCVirtualHypervisorClass *vhc = PPC_VIRTUAL_HYPERVISOR_CLASS(oc);
4594     XICSFabricClass *xic = XICS_FABRIC_CLASS(oc);
4595     InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc);
4596     XiveFabricClass *xfc = XIVE_FABRIC_CLASS(oc);
4597     VofMachineIfClass *vmc = VOF_MACHINE_CLASS(oc);
4598 
4599     mc->desc = "pSeries Logical Partition (PAPR compliant)";
4600     mc->ignore_boot_device_suffixes = true;
4601 
4602     /*
4603      * We set up the default / latest behaviour here.  The class_init
4604      * functions for the specific versioned machine types can override
4605      * these details for backwards compatibility
4606      */
4607     mc->init = spapr_machine_init;
4608     mc->reset = spapr_machine_reset;
4609     mc->block_default_type = IF_SCSI;
4610 
4611     /*
4612      * While KVM determines max cpus in kvm_init() using kvm_max_vcpus(),
4613      * In TCG the limit is restricted by the range of CPU IPIs available.
4614      */
4615     mc->max_cpus = SPAPR_IRQ_NR_IPIS;
4616 
4617     mc->no_parallel = 1;
4618     mc->default_boot_order = "";
4619     mc->default_ram_size = 512 * MiB;
4620     mc->default_ram_id = "ppc_spapr.ram";
4621     mc->default_display = "std";
4622     mc->kvm_type = spapr_kvm_type;
4623     machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SPAPR_PCI_HOST_BRIDGE);
4624     mc->pci_allow_0_address = true;
4625     assert(!mc->get_hotplug_handler);
4626     mc->get_hotplug_handler = spapr_get_hotplug_handler;
4627     hc->pre_plug = spapr_machine_device_pre_plug;
4628     hc->plug = spapr_machine_device_plug;
4629     mc->cpu_index_to_instance_props = spapr_cpu_index_to_props;
4630     mc->get_default_cpu_node_id = spapr_get_default_cpu_node_id;
4631     mc->possible_cpu_arch_ids = spapr_possible_cpu_arch_ids;
4632     hc->unplug_request = spapr_machine_device_unplug_request;
4633     hc->unplug = spapr_machine_device_unplug;
4634 
4635     smc->update_dt_enabled = true;
4636     mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power10_v2.0");
4637     mc->has_hotpluggable_cpus = true;
4638     mc->nvdimm_supported = true;
4639     smc->resize_hpt_default = SPAPR_RESIZE_HPT_ENABLED;
4640     fwc->get_dev_path = spapr_get_fw_dev_path;
4641     nc->nmi_monitor_handler = spapr_nmi;
4642     smc->phb_placement = spapr_phb_placement;
4643     vhc->cpu_in_nested = spapr_cpu_in_nested;
4644     vhc->deliver_hv_excp = spapr_exit_nested;
4645     vhc->hypercall = emulate_spapr_hypercall;
4646     vhc->hpt_mask = spapr_hpt_mask;
4647     vhc->map_hptes = spapr_map_hptes;
4648     vhc->unmap_hptes = spapr_unmap_hptes;
4649     vhc->hpte_set_c = spapr_hpte_set_c;
4650     vhc->hpte_set_r = spapr_hpte_set_r;
4651     vhc->get_pate = spapr_get_pate;
4652     vhc->encode_hpt_for_kvm_pr = spapr_encode_hpt_for_kvm_pr;
4653     vhc->cpu_exec_enter = spapr_cpu_exec_enter;
4654     vhc->cpu_exec_exit = spapr_cpu_exec_exit;
4655     xic->ics_get = spapr_ics_get;
4656     xic->ics_resend = spapr_ics_resend;
4657     xic->icp_get = spapr_icp_get;
4658     ispc->print_info = spapr_pic_print_info;
4659     /* Force NUMA node memory size to be a multiple of
4660      * SPAPR_MEMORY_BLOCK_SIZE (256M) since that's the granularity
4661      * in which LMBs are represented and hot-added
4662      */
4663     mc->numa_mem_align_shift = 28;
4664     mc->auto_enable_numa = true;
4665 
4666     smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_OFF;
4667     smc->default_caps.caps[SPAPR_CAP_VSX] = SPAPR_CAP_ON;
4668     smc->default_caps.caps[SPAPR_CAP_DFP] = SPAPR_CAP_ON;
4669     smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND;
4670     smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND;
4671     smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_WORKAROUND;
4672     smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 16; /* 64kiB */
4673     smc->default_caps.caps[SPAPR_CAP_NESTED_KVM_HV] = SPAPR_CAP_OFF;
4674     smc->default_caps.caps[SPAPR_CAP_NESTED_PAPR] = SPAPR_CAP_OFF;
4675     smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_ON;
4676     smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON;
4677     smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON;
4678     smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF;
4679 
4680     /*
4681      * This cap specifies whether the AIL 3 mode for
4682      * H_SET_RESOURCE is supported. The default is modified
4683      * by default_caps_with_cpu().
4684      */
4685     smc->default_caps.caps[SPAPR_CAP_AIL_MODE_3] = SPAPR_CAP_ON;
4686     spapr_caps_add_properties(smc);
4687     smc->irq = &spapr_irq_dual;
4688     smc->dr_phb_enabled = true;
4689     smc->linux_pci_probe = true;
4690     smc->smp_threads_vsmt = true;
4691     smc->nr_xirqs = SPAPR_NR_XIRQS;
4692     xfc->match_nvt = spapr_match_nvt;
4693     vmc->client_architecture_support = spapr_vof_client_architecture_support;
4694     vmc->quiesce = spapr_vof_quiesce;
4695     vmc->setprop = spapr_vof_setprop;
4696 }
4697 
4698 static const TypeInfo spapr_machine_info = {
4699     .name          = TYPE_SPAPR_MACHINE,
4700     .parent        = TYPE_MACHINE,
4701     .abstract      = true,
4702     .instance_size = sizeof(SpaprMachineState),
4703     .instance_init = spapr_instance_init,
4704     .instance_finalize = spapr_machine_finalizefn,
4705     .class_size    = sizeof(SpaprMachineClass),
4706     .class_init    = spapr_machine_class_init,
4707     .interfaces = (InterfaceInfo[]) {
4708         { TYPE_FW_PATH_PROVIDER },
4709         { TYPE_NMI },
4710         { TYPE_HOTPLUG_HANDLER },
4711         { TYPE_PPC_VIRTUAL_HYPERVISOR },
4712         { TYPE_XICS_FABRIC },
4713         { TYPE_INTERRUPT_STATS_PROVIDER },
4714         { TYPE_XIVE_FABRIC },
4715         { TYPE_VOF_MACHINE_IF },
4716         { }
4717     },
4718 };
4719 
4720 static void spapr_machine_latest_class_options(MachineClass *mc)
4721 {
4722     mc->alias = "pseries";
4723     mc->is_default = true;
4724 }
4725 
/*
 * Define and register one versioned pseries machine type.
 *
 * @latest: when true, spapr_machine_latest_class_options() is applied on
 *          top of the version's own class options
 * @...:    version components, handed unchanged to the MACHINE_VER_*
 *          helper macros
 *
 * The expansion consists of:
 *  - a class_init function that calls the version's class_options
 *    function (which must already be defined, under the name that
 *    MACHINE_VER_SYM(class_options, spapr, ...) produces), then
 *    MACHINE_VER_DEPRECATION(), then the "latest" options if requested;
 *  - a TypeInfo whose parent is TYPE_SPAPR_MACHINE and whose name is
 *    built by MACHINE_VER_TYPE_NAME("pseries", ...);
 *  - a register function that invokes MACHINE_VER_DELETION() before
 *    registering the type, hooked up with type_init().
 *
 * NOTE(review): the exact effect of MACHINE_VER_DEPRECATION() and
 * MACHINE_VER_DELETION() is defined outside this file - confirm there.
 *
 * The expansion ends without a semicolon; the user supplies it.
 */
#define DEFINE_SPAPR_MACHINE_IMPL(latest, ...)                       \
    static void MACHINE_VER_SYM(class_init, spapr, __VA_ARGS__)(     \
        ObjectClass *oc,                                             \
        void *data)                                                  \
    {                                                                \
        MachineClass *mc = MACHINE_CLASS(oc);                        \
        MACHINE_VER_SYM(class_options, spapr, __VA_ARGS__)(mc);      \
        MACHINE_VER_DEPRECATION(__VA_ARGS__);                        \
        if (latest) {                                                \
            spapr_machine_latest_class_options(mc);                  \
        }                                                            \
    }                                                                \
    static const TypeInfo MACHINE_VER_SYM(info, spapr, __VA_ARGS__) = \
    {                                                                \
        .name = MACHINE_VER_TYPE_NAME("pseries", __VA_ARGS__),       \
        .parent = TYPE_SPAPR_MACHINE,                                \
        .class_init = MACHINE_VER_SYM(class_init, spapr, __VA_ARGS__), \
    };                                                               \
    static void MACHINE_VER_SYM(register, spapr, __VA_ARGS__)(void)  \
    {                                                                \
        MACHINE_VER_DELETION(__VA_ARGS__);                           \
        type_register_static(&MACHINE_VER_SYM(info, spapr, __VA_ARGS__));   \
    }                                                                \
    type_init(MACHINE_VER_SYM(register, spapr, __VA_ARGS__))
4750 
/*
 * Convenience wrappers around DEFINE_SPAPR_MACHINE_IMPL() for a
 * major.minor version: the _AS_LATEST form also applies
 * spapr_machine_latest_class_options(), the plain form does not.
 */
#define DEFINE_SPAPR_MACHINE_AS_LATEST(major, minor) \
    DEFINE_SPAPR_MACHINE_IMPL(true, major, minor)
#define DEFINE_SPAPR_MACHINE(major, minor) \
    DEFINE_SPAPR_MACHINE_IMPL(false, major, minor)
4755 
/*
 * pseries-10.0
 *
 * Adds no overrides of its own.  It is defined as the latest version, so
 * spapr_machine_latest_class_options() is applied to it as well.
 */
static void spapr_machine_10_0_class_options(MachineClass *mc)
{
    /* Defaults for the latest behaviour inherited from the base class */
}

DEFINE_SPAPR_MACHINE_AS_LATEST(10, 0);
4765 
/*
 * pseries-9.2
 *
 * Applies the pseries-10.0 options first, then appends the hw_compat_9_2
 * properties to the machine's compat property list.
 */
static void spapr_machine_9_2_class_options(MachineClass *mc)
{
    spapr_machine_10_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_9_2, hw_compat_9_2_len);
}

DEFINE_SPAPR_MACHINE(9, 2);
4776 
/*
 * pseries-9.1
 *
 * Applies the pseries-9.2 options first, then appends the hw_compat_9_1
 * properties to the machine's compat property list.
 */
static void spapr_machine_9_1_class_options(MachineClass *mc)
{
    spapr_machine_9_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_9_1, hw_compat_9_1_len);
}

DEFINE_SPAPR_MACHINE(9, 1);
4787 
/*
 * pseries-9.0
 *
 * Applies the pseries-9.1 options first, then appends the hw_compat_9_0
 * properties to the machine's compat property list.
 */
static void spapr_machine_9_0_class_options(MachineClass *mc)
{
    spapr_machine_9_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_9_0, hw_compat_9_0_len);
}

DEFINE_SPAPR_MACHINE(9, 0);
4798 
/*
 * pseries-8.2
 *
 * Applies the pseries-9.0 options first, then appends the hw_compat_8_2
 * properties to the machine's compat property list.
 */
static void spapr_machine_8_2_class_options(MachineClass *mc)
{
    spapr_machine_9_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_8_2, hw_compat_8_2_len);
}

DEFINE_SPAPR_MACHINE(8, 2);
4809 
/*
 * pseries-8.1
 *
 * Applies the pseries-8.2 options first, then appends the hw_compat_8_1
 * properties to the machine's compat property list.
 */
static void spapr_machine_8_1_class_options(MachineClass *mc)
{
    spapr_machine_8_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_8_1, hw_compat_8_1_len);
}

DEFINE_SPAPR_MACHINE(8, 1);
4820 
/*
 * pseries-8.0
 *
 * Applies the pseries-8.1 options first, then appends the hw_compat_8_0
 * properties to the machine's compat property list.
 */
static void spapr_machine_8_0_class_options(MachineClass *mc)
{
    spapr_machine_8_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_8_0, hw_compat_8_0_len);
}

DEFINE_SPAPR_MACHINE(8, 0);
4831 
/*
 * pseries-7.2
 *
 * Applies the pseries-8.0 options first, then appends the hw_compat_7_2
 * properties to the machine's compat property list.
 */
static void spapr_machine_7_2_class_options(MachineClass *mc)
{
    spapr_machine_8_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_7_2, hw_compat_7_2_len);
}

DEFINE_SPAPR_MACHINE(7, 2);
4842 
/*
 * pseries-7.1
 *
 * Applies the pseries-7.2 options first, then appends the hw_compat_7_1
 * properties to the machine's compat property list.
 */
static void spapr_machine_7_1_class_options(MachineClass *mc)
{
    spapr_machine_7_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_7_1, hw_compat_7_1_len);
}

DEFINE_SPAPR_MACHINE(7, 1);
4853 
/*
 * pseries-7.0
 *
 * Applies the pseries-7.1 options first, then appends the hw_compat_7_0
 * properties to the machine's compat property list.
 */
static void spapr_machine_7_0_class_options(MachineClass *mc)
{
    spapr_machine_7_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_7_0, hw_compat_7_0_len);
}

DEFINE_SPAPR_MACHINE(7, 0);
4864 
/*
 * pseries-6.2
 *
 * Applies the pseries-7.0 options first, then appends the hw_compat_6_2
 * properties to the machine's compat property list.
 */
static void spapr_machine_6_2_class_options(MachineClass *mc)
{
    spapr_machine_7_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_6_2, hw_compat_6_2_len);
}

DEFINE_SPAPR_MACHINE(6, 2);
4875 
4876 /*
4877  * pseries-6.1
4878  */
4879 static void spapr_machine_6_1_class_options(MachineClass *mc)
4880 {
4881     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4882 
4883     spapr_machine_6_2_class_options(mc);
4884     compat_props_add(mc->compat_props, hw_compat_6_1, hw_compat_6_1_len);
4885     smc->pre_6_2_numa_affinity = true;
4886     mc->smp_props.prefer_sockets = true;
4887 }
4888 
4889 DEFINE_SPAPR_MACHINE(6, 1);
4890 
/*
 * pseries-6.0
 *
 * Applies the pseries-6.1 options first, then appends the hw_compat_6_0
 * properties to the machine's compat property list.
 */
static void spapr_machine_6_0_class_options(MachineClass *mc)
{
    spapr_machine_6_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_6_0, hw_compat_6_0_len);
}

DEFINE_SPAPR_MACHINE(6, 0);
4901 
/*
 * pseries-5.2
 *
 * Applies the pseries-6.0 options first, then appends the hw_compat_5_2
 * properties to the machine's compat property list.
 */
static void spapr_machine_5_2_class_options(MachineClass *mc)
{
    spapr_machine_6_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_5_2, hw_compat_5_2_len);
}

DEFINE_SPAPR_MACHINE(5, 2);
4912 
4913 /*
4914  * pseries-5.1
4915  */
4916 static void spapr_machine_5_1_class_options(MachineClass *mc)
4917 {
4918     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4919 
4920     spapr_machine_5_2_class_options(mc);
4921     compat_props_add(mc->compat_props, hw_compat_5_1, hw_compat_5_1_len);
4922     smc->pre_5_2_numa_associativity = true;
4923 }
4924 
4925 DEFINE_SPAPR_MACHINE(5, 1);
4926 
4927 /*
4928  * pseries-5.0
4929  */
4930 static void spapr_machine_5_0_class_options(MachineClass *mc)
4931 {
4932     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4933     static GlobalProperty compat[] = {
4934         { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-5.1-associativity", "on" },
4935     };
4936 
4937     spapr_machine_5_1_class_options(mc);
4938     compat_props_add(mc->compat_props, hw_compat_5_0, hw_compat_5_0_len);
4939     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
4940     mc->numa_mem_supported = true;
4941     smc->pre_5_1_assoc_refpoints = true;
4942 }
4943 
4944 DEFINE_SPAPR_MACHINE(5, 0);
4945 
4946 /*
4947  * pseries-4.2
4948  */
4949 static void spapr_machine_4_2_class_options(MachineClass *mc)
4950 {
4951     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4952 
4953     spapr_machine_5_0_class_options(mc);
4954     compat_props_add(mc->compat_props, hw_compat_4_2, hw_compat_4_2_len);
4955     smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_OFF;
4956     smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_OFF;
4957     smc->rma_limit = 16 * GiB;
4958     mc->nvdimm_supported = false;
4959 }
4960 
4961 DEFINE_SPAPR_MACHINE(4, 2);
4962 
4963 /*
4964  * pseries-4.1
4965  */
4966 static void spapr_machine_4_1_class_options(MachineClass *mc)
4967 {
4968     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4969     static GlobalProperty compat[] = {
4970         /* Only allow 4kiB and 64kiB IOMMU pagesizes */
4971         { TYPE_SPAPR_PCI_HOST_BRIDGE, "pgsz", "0x11000" },
4972     };
4973 
4974     spapr_machine_4_2_class_options(mc);
4975     smc->linux_pci_probe = false;
4976     smc->smp_threads_vsmt = false;
4977     compat_props_add(mc->compat_props, hw_compat_4_1, hw_compat_4_1_len);
4978     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
4979 }
4980 
4981 DEFINE_SPAPR_MACHINE(4, 1);
4982 
4983 /*
4984  * pseries-4.0
4985  */
4986 static bool phb_placement_4_0(SpaprMachineState *spapr, uint32_t index,
4987                               uint64_t *buid, hwaddr *pio,
4988                               hwaddr *mmio32, hwaddr *mmio64,
4989                               unsigned n_dma, uint32_t *liobns, Error **errp)
4990 {
4991     if (!spapr_phb_placement(spapr, index, buid, pio, mmio32, mmio64, n_dma,
4992                              liobns, errp)) {
4993         return false;
4994     }
4995     return true;
4996 }
4997 static void spapr_machine_4_0_class_options(MachineClass *mc)
4998 {
4999     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
5000 
5001     spapr_machine_4_1_class_options(mc);
5002     compat_props_add(mc->compat_props, hw_compat_4_0, hw_compat_4_0_len);
5003     smc->phb_placement = phb_placement_4_0;
5004     smc->irq = &spapr_irq_xics;
5005     smc->pre_4_1_migration = true;
5006 }
5007 
5008 DEFINE_SPAPR_MACHINE(4, 0);
5009 
5010 /*
5011  * pseries-3.1
5012  */
5013 static void spapr_machine_3_1_class_options(MachineClass *mc)
5014 {
5015     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
5016 
5017     spapr_machine_4_0_class_options(mc);
5018     compat_props_add(mc->compat_props, hw_compat_3_1, hw_compat_3_1_len);
5019 
5020     mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0");
5021     smc->update_dt_enabled = false;
5022     smc->dr_phb_enabled = false;
5023     smc->broken_host_serial_model = true;
5024     smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_BROKEN;
5025     smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_BROKEN;
5026     smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_BROKEN;
5027     smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_OFF;
5028 }
5029 
5030 DEFINE_SPAPR_MACHINE(3, 1);
5031 
5032 /*
5033  * pseries-3.0
5034  */
5035 
5036 static void spapr_machine_3_0_class_options(MachineClass *mc)
5037 {
5038     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
5039 
5040     spapr_machine_3_1_class_options(mc);
5041     compat_props_add(mc->compat_props, hw_compat_3_0, hw_compat_3_0_len);
5042 
5043     smc->legacy_irq_allocation = true;
5044     smc->nr_xirqs = 0x400;
5045     smc->irq = &spapr_irq_xics_legacy;
5046 }
5047 
5048 DEFINE_SPAPR_MACHINE(3, 0);
5049 
/*
 * Registers the abstract sPAPR machine type described by
 * spapr_machine_info.  The versioned pseries types are registered by the
 * functions that DEFINE_SPAPR_MACHINE_IMPL() generates.
 */
static void spapr_machine_register_types(void)
{
    type_register_static(&spapr_machine_info);
}

type_init(spapr_machine_register_types)
5056