xref: /qemu/target/ppc/kvm.c (revision 2cbd158131e5a5e392417fb4511075a5d2af1bdd)
1 /*
2  * PowerPC implementation of KVM hooks
3  *
4  * Copyright IBM Corp. 2007
5  * Copyright (C) 2011 Freescale Semiconductor, Inc.
6  *
7  * Authors:
8  *  Jerone Young <jyoung5@us.ibm.com>
9  *  Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
10  *  Hollis Blanchard <hollisb@us.ibm.com>
11  *
12  * This work is licensed under the terms of the GNU GPL, version 2 or later.
13  * See the COPYING file in the top-level directory.
14  *
15  */
16 
17 #include "qemu/osdep.h"
18 #include <dirent.h>
19 #include <sys/ioctl.h>
20 #include <sys/vfs.h>
21 
22 #include <linux/kvm.h>
23 
24 #include "qemu-common.h"
25 #include "qapi/error.h"
26 #include "qemu/error-report.h"
27 #include "cpu.h"
28 #include "cpu-models.h"
29 #include "qemu/timer.h"
30 #include "sysemu/sysemu.h"
31 #include "sysemu/hw_accel.h"
32 #include "kvm_ppc.h"
33 #include "sysemu/cpus.h"
34 #include "sysemu/device_tree.h"
35 #include "mmu-hash64.h"
36 
37 #include "hw/sysbus.h"
38 #include "hw/ppc/spapr.h"
39 #include "hw/ppc/spapr_cpu_core.h"
40 #include "hw/ppc/ppc.h"
41 #include "sysemu/watchdog.h"
42 #include "trace.h"
43 #include "exec/gdbstub.h"
44 #include "exec/memattrs.h"
45 #include "exec/ram_addr.h"
46 #include "sysemu/hostmem.h"
47 #include "qemu/cutils.h"
48 #include "qemu/mmap-alloc.h"
49 #include "elf.h"
50 #include "sysemu/kvm_int.h"
51 
52 //#define DEBUG_KVM
53 
54 #ifdef DEBUG_KVM
55 #define DPRINTF(fmt, ...) \
56     do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
57 #else
58 #define DPRINTF(fmt, ...) \
59     do { } while (0)
60 #endif
61 
62 #define PROC_DEVTREE_CPU      "/proc/device-tree/cpus/"
63 
64 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
65     KVM_CAP_LAST_INFO
66 };
67 
68 static int cap_interrupt_unset = false;
69 static int cap_interrupt_level = false;
70 static int cap_segstate;
71 static int cap_booke_sregs;
72 static int cap_ppc_smt;
73 static int cap_ppc_smt_possible;
74 static int cap_spapr_tce;
75 static int cap_spapr_tce_64;
76 static int cap_spapr_multitce;
77 static int cap_spapr_vfio;
78 static int cap_hior;
79 static int cap_one_reg;
80 static int cap_epr;
81 static int cap_ppc_watchdog;
82 static int cap_papr;
83 static int cap_htab_fd;
84 static int cap_fixup_hcalls;
85 static int cap_htm;             /* Hardware transactional memory support */
86 static int cap_mmu_radix;
87 static int cap_mmu_hash_v3;
88 static int cap_resize_hpt;
89 static int cap_ppc_pvr_compat;
90 static int cap_ppc_safe_cache;
91 static int cap_ppc_safe_bounds_check;
92 static int cap_ppc_safe_indirect_branch;
93 static int cap_ppc_count_cache_flush_assist;
94 static int cap_ppc_nested_kvm_hv;
95 static int cap_large_decr;
96 
97 static uint32_t debug_inst_opcode;
98 
99 /* XXX We have a race condition where we actually have a level triggered
100  *     interrupt, but the infrastructure can't expose that yet, so the guest
101  *     takes but ignores it, goes to sleep and never gets notified that there's
102  *     still an interrupt pending.
103  *
104  *     As a quick workaround, let's just wake up again 20 ms after we injected
105  *     an interrupt. That way we can assure that we're always reinjecting
106  *     interrupts in case the guest swallowed them.
107  */
108 static QEMUTimer *idle_timer;
109 
110 static void kvm_kick_cpu(void *opaque)
111 {
112     PowerPCCPU *cpu = opaque;
113 
114     qemu_cpu_kick(CPU(cpu));
115 }
116 
117 /* Check whether we are running with KVM-PR (instead of KVM-HV).  This
118  * should only be used for fallback tests - generally we should use
119  * explicit capabilities for the features we want, rather than
120  * assuming what is/isn't available depending on the KVM variant. */
121 static bool kvmppc_is_pr(KVMState *ks)
122 {
123     /* Assume KVM-PR if the GET_PVINFO capability is available */
124     return kvm_vm_check_extension(ks, KVM_CAP_PPC_GET_PVINFO) != 0;
125 }
126 
127 static int kvm_ppc_register_host_cpu_type(MachineState *ms);
128 static void kvmppc_get_cpu_characteristics(KVMState *s);
129 static int kvmppc_get_dec_bits(void);
130 
131 int kvm_arch_init(MachineState *ms, KVMState *s)
132 {
133     cap_interrupt_unset = kvm_check_extension(s, KVM_CAP_PPC_UNSET_IRQ);
134     cap_interrupt_level = kvm_check_extension(s, KVM_CAP_PPC_IRQ_LEVEL);
135     cap_segstate = kvm_check_extension(s, KVM_CAP_PPC_SEGSTATE);
136     cap_booke_sregs = kvm_check_extension(s, KVM_CAP_PPC_BOOKE_SREGS);
137     cap_ppc_smt_possible = kvm_vm_check_extension(s, KVM_CAP_PPC_SMT_POSSIBLE);
138     cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE);
139     cap_spapr_tce_64 = kvm_check_extension(s, KVM_CAP_SPAPR_TCE_64);
140     cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE);
141     cap_spapr_vfio = kvm_vm_check_extension(s, KVM_CAP_SPAPR_TCE_VFIO);
142     cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG);
143     cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR);
144     cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR);
145     cap_ppc_watchdog = kvm_check_extension(s, KVM_CAP_PPC_BOOKE_WATCHDOG);
146     /* Note: we don't set cap_papr here, because this capability is
147      * only activated after this by kvmppc_set_papr() */
148     cap_htab_fd = kvm_vm_check_extension(s, KVM_CAP_PPC_HTAB_FD);
149     cap_fixup_hcalls = kvm_check_extension(s, KVM_CAP_PPC_FIXUP_HCALL);
150     cap_ppc_smt = kvm_vm_check_extension(s, KVM_CAP_PPC_SMT);
151     cap_htm = kvm_vm_check_extension(s, KVM_CAP_PPC_HTM);
152     cap_mmu_radix = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_RADIX);
153     cap_mmu_hash_v3 = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_HASH_V3);
154     cap_resize_hpt = kvm_vm_check_extension(s, KVM_CAP_SPAPR_RESIZE_HPT);
155     kvmppc_get_cpu_characteristics(s);
156     cap_ppc_nested_kvm_hv = kvm_vm_check_extension(s, KVM_CAP_PPC_NESTED_HV);
157     cap_large_decr = kvmppc_get_dec_bits();
158     /*
159      * Note: setting it to false because there is not such capability
160      * in KVM at this moment.
161      *
162      * TODO: call kvm_vm_check_extension() with the right capability
163      * after the kernel starts implementing it.*/
164     cap_ppc_pvr_compat = false;
165 
166     if (!cap_interrupt_level) {
167         fprintf(stderr, "KVM: Couldn't find level irq capability. Expect the "
168                         "VM to stall at times!\n");
169     }
170 
171     kvm_ppc_register_host_cpu_type(ms);
172 
173     return 0;
174 }
175 
/*
 * PowerPC implementation of the irqchip creation hook: both arguments are
 * ignored and 0 is returned unconditionally.
 */
int kvm_arch_irqchip_create(MachineState *ms, KVMState *s)
{
    return 0;
}
180 
181 static int kvm_arch_sync_sregs(PowerPCCPU *cpu)
182 {
183     CPUPPCState *cenv = &cpu->env;
184     CPUState *cs = CPU(cpu);
185     struct kvm_sregs sregs;
186     int ret;
187 
188     if (cenv->excp_model == POWERPC_EXCP_BOOKE) {
189         /* What we're really trying to say is "if we're on BookE, we use
190            the native PVR for now". This is the only sane way to check
191            it though, so we potentially confuse users that they can run
192            BookE guests on BookS. Let's hope nobody dares enough :) */
193         return 0;
194     } else {
195         if (!cap_segstate) {
196             fprintf(stderr, "kvm error: missing PVR setting capability\n");
197             return -ENOSYS;
198         }
199     }
200 
201     ret = kvm_vcpu_ioctl(cs, KVM_GET_SREGS, &sregs);
202     if (ret) {
203         return ret;
204     }
205 
206     sregs.pvr = cenv->spr[SPR_PVR];
207     return kvm_vcpu_ioctl(cs, KVM_SET_SREGS, &sregs);
208 }
209 
210 /* Set up a shared TLB array with KVM */
211 static int kvm_booke206_tlb_init(PowerPCCPU *cpu)
212 {
213     CPUPPCState *env = &cpu->env;
214     CPUState *cs = CPU(cpu);
215     struct kvm_book3e_206_tlb_params params = {};
216     struct kvm_config_tlb cfg = {};
217     unsigned int entries = 0;
218     int ret, i;
219 
220     if (!kvm_enabled() ||
221         !kvm_check_extension(cs->kvm_state, KVM_CAP_SW_TLB)) {
222         return 0;
223     }
224 
225     assert(ARRAY_SIZE(params.tlb_sizes) == BOOKE206_MAX_TLBN);
226 
227     for (i = 0; i < BOOKE206_MAX_TLBN; i++) {
228         params.tlb_sizes[i] = booke206_tlb_size(env, i);
229         params.tlb_ways[i] = booke206_tlb_ways(env, i);
230         entries += params.tlb_sizes[i];
231     }
232 
233     assert(entries == env->nb_tlb);
234     assert(sizeof(struct kvm_book3e_206_tlb_entry) == sizeof(ppcmas_tlb_t));
235 
236     env->tlb_dirty = true;
237 
238     cfg.array = (uintptr_t)env->tlb.tlbm;
239     cfg.array_len = sizeof(ppcmas_tlb_t) * entries;
240     cfg.params = (uintptr_t)&params;
241     cfg.mmu_type = KVM_MMU_FSL_BOOKE_NOHV;
242 
243     ret = kvm_vcpu_enable_cap(cs, KVM_CAP_SW_TLB, 0, (uintptr_t)&cfg);
244     if (ret < 0) {
245         fprintf(stderr, "%s: couldn't enable KVM_CAP_SW_TLB: %s\n",
246                 __func__, strerror(-ret));
247         return ret;
248     }
249 
250     env->kvm_sw_tlb = true;
251     return 0;
252 }
253 
254 
255 #if defined(TARGET_PPC64)
256 static void kvm_get_smmu_info(struct kvm_ppc_smmu_info *info, Error **errp)
257 {
258     int ret;
259 
260     assert(kvm_state != NULL);
261 
262     if (!kvm_check_extension(kvm_state, KVM_CAP_PPC_GET_SMMU_INFO)) {
263         error_setg(errp, "KVM doesn't expose the MMU features it supports");
264         error_append_hint(errp, "Consider switching to a newer KVM\n");
265         return;
266     }
267 
268     ret = kvm_vm_ioctl(kvm_state, KVM_PPC_GET_SMMU_INFO, info);
269     if (ret == 0) {
270         return;
271     }
272 
273     error_setg_errno(errp, -ret,
274                      "KVM failed to provide the MMU features it supports");
275 }
276 
277 struct ppc_radix_page_info *kvm_get_radix_page_info(void)
278 {
279     KVMState *s = KVM_STATE(current_machine->accelerator);
280     struct ppc_radix_page_info *radix_page_info;
281     struct kvm_ppc_rmmu_info rmmu_info;
282     int i;
283 
284     if (!kvm_check_extension(s, KVM_CAP_PPC_MMU_RADIX)) {
285         return NULL;
286     }
287     if (kvm_vm_ioctl(s, KVM_PPC_GET_RMMU_INFO, &rmmu_info)) {
288         return NULL;
289     }
290     radix_page_info = g_malloc0(sizeof(*radix_page_info));
291     radix_page_info->count = 0;
292     for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
293         if (rmmu_info.ap_encodings[i]) {
294             radix_page_info->entries[i] = rmmu_info.ap_encodings[i];
295             radix_page_info->count++;
296         }
297     }
298     return radix_page_info;
299 }
300 
301 target_ulong kvmppc_configure_v3_mmu(PowerPCCPU *cpu,
302                                      bool radix, bool gtse,
303                                      uint64_t proc_tbl)
304 {
305     CPUState *cs = CPU(cpu);
306     int ret;
307     uint64_t flags = 0;
308     struct kvm_ppc_mmuv3_cfg cfg = {
309         .process_table = proc_tbl,
310     };
311 
312     if (radix) {
313         flags |= KVM_PPC_MMUV3_RADIX;
314     }
315     if (gtse) {
316         flags |= KVM_PPC_MMUV3_GTSE;
317     }
318     cfg.flags = flags;
319     ret = kvm_vm_ioctl(cs->kvm_state, KVM_PPC_CONFIGURE_V3_MMU, &cfg);
320     switch (ret) {
321     case 0:
322         return H_SUCCESS;
323     case -EINVAL:
324         return H_PARAMETER;
325     case -ENODEV:
326         return H_NOT_AVAILABLE;
327     default:
328         return H_HARDWARE;
329     }
330 }
331 
332 bool kvmppc_hpt_needs_host_contiguous_pages(void)
333 {
334     static struct kvm_ppc_smmu_info smmu_info;
335 
336     if (!kvm_enabled()) {
337         return false;
338     }
339 
340     kvm_get_smmu_info(&smmu_info, &error_fatal);
341     return !!(smmu_info.flags & KVM_PPC_PAGE_SIZES_REAL);
342 }
343 
344 void kvm_check_mmu(PowerPCCPU *cpu, Error **errp)
345 {
346     struct kvm_ppc_smmu_info smmu_info;
347     int iq, ik, jq, jk;
348     Error *local_err = NULL;
349 
350     /* For now, we only have anything to check on hash64 MMUs */
351     if (!cpu->hash64_opts || !kvm_enabled()) {
352         return;
353     }
354 
355     kvm_get_smmu_info(&smmu_info, &local_err);
356     if (local_err) {
357         error_propagate(errp, local_err);
358         return;
359     }
360 
361     if (ppc_hash64_has(cpu, PPC_HASH64_1TSEG)
362         && !(smmu_info.flags & KVM_PPC_1T_SEGMENTS)) {
363         error_setg(errp,
364                    "KVM does not support 1TiB segments which guest expects");
365         return;
366     }
367 
368     if (smmu_info.slb_size < cpu->hash64_opts->slb_size) {
369         error_setg(errp, "KVM only supports %u SLB entries, but guest needs %u",
370                    smmu_info.slb_size, cpu->hash64_opts->slb_size);
371         return;
372     }
373 
374     /*
375      * Verify that every pagesize supported by the cpu model is
376      * supported by KVM with the same encodings
377      */
378     for (iq = 0; iq < ARRAY_SIZE(cpu->hash64_opts->sps); iq++) {
379         PPCHash64SegmentPageSizes *qsps = &cpu->hash64_opts->sps[iq];
380         struct kvm_ppc_one_seg_page_size *ksps;
381 
382         for (ik = 0; ik < ARRAY_SIZE(smmu_info.sps); ik++) {
383             if (qsps->page_shift == smmu_info.sps[ik].page_shift) {
384                 break;
385             }
386         }
387         if (ik >= ARRAY_SIZE(smmu_info.sps)) {
388             error_setg(errp, "KVM doesn't support for base page shift %u",
389                        qsps->page_shift);
390             return;
391         }
392 
393         ksps = &smmu_info.sps[ik];
394         if (ksps->slb_enc != qsps->slb_enc) {
395             error_setg(errp,
396 "KVM uses SLB encoding 0x%x for page shift %u, but guest expects 0x%x",
397                        ksps->slb_enc, ksps->page_shift, qsps->slb_enc);
398             return;
399         }
400 
401         for (jq = 0; jq < ARRAY_SIZE(qsps->enc); jq++) {
402             for (jk = 0; jk < ARRAY_SIZE(ksps->enc); jk++) {
403                 if (qsps->enc[jq].page_shift == ksps->enc[jk].page_shift) {
404                     break;
405                 }
406             }
407 
408             if (jk >= ARRAY_SIZE(ksps->enc)) {
409                 error_setg(errp, "KVM doesn't support page shift %u/%u",
410                            qsps->enc[jq].page_shift, qsps->page_shift);
411                 return;
412             }
413             if (qsps->enc[jq].pte_enc != ksps->enc[jk].pte_enc) {
414                 error_setg(errp,
415 "KVM uses PTE encoding 0x%x for page shift %u/%u, but guest expects 0x%x",
416                            ksps->enc[jk].pte_enc, qsps->enc[jq].page_shift,
417                            qsps->page_shift, qsps->enc[jq].pte_enc);
418                 return;
419             }
420         }
421     }
422 
423     if (ppc_hash64_has(cpu, PPC_HASH64_CI_LARGEPAGE)) {
424         /* Mostly what guest pagesizes we can use are related to the
425          * host pages used to map guest RAM, which is handled in the
426          * platform code. Cache-Inhibited largepages (64k) however are
427          * used for I/O, so if they're mapped to the host at all it
428          * will be a normal mapping, not a special hugepage one used
429          * for RAM. */
430         if (getpagesize() < 0x10000) {
431             error_setg(errp,
432                        "KVM can't supply 64kiB CI pages, which guest expects");
433         }
434     }
435 }
436 #endif /* defined(TARGET_PPC64) */
437 
438 unsigned long kvm_arch_vcpu_id(CPUState *cpu)
439 {
440     return POWERPC_CPU(cpu)->vcpu_id;
441 }
442 
443 /* e500 supports 2 h/w breakpoints and 2 watchpoints.
444  * book3s supports only 1 watchpoint, so an array size
445  * of 4 is sufficient for now.
446  */
447 #define MAX_HW_BKPTS 4
448 
/*
 * File-scope table of hardware debug points.  It has room for
 * MAX_HW_BKPTS entries in total.
 */
static struct HWBreakpoint {
    target_ulong addr;  /* guest address of the debug point */
    int type;           /* NOTE(review): set of valid values not visible here */
} hw_debug_points[MAX_HW_BKPTS];
453 
454 static CPUWatchpoint hw_watchpoint;
455 
456 /* By default, no hardware breakpoints or watchpoints are supported */
457 static int max_hw_breakpoint;
458 static int max_hw_watchpoint;
459 static int nb_hw_breakpoint;
460 static int nb_hw_watchpoint;
461 
462 static void kvmppc_hw_debug_points_init(CPUPPCState *cenv)
463 {
464     if (cenv->excp_model == POWERPC_EXCP_BOOKE) {
465         max_hw_breakpoint = 2;
466         max_hw_watchpoint = 2;
467     }
468 
469     if ((max_hw_breakpoint + max_hw_watchpoint) > MAX_HW_BKPTS) {
470         fprintf(stderr, "Error initializing h/w breakpoints\n");
471         return;
472     }
473 }
474 
475 int kvm_arch_init_vcpu(CPUState *cs)
476 {
477     PowerPCCPU *cpu = POWERPC_CPU(cs);
478     CPUPPCState *cenv = &cpu->env;
479     int ret;
480 
481     /* Synchronize sregs with kvm */
482     ret = kvm_arch_sync_sregs(cpu);
483     if (ret) {
484         if (ret == -EINVAL) {
485             error_report("Register sync failed... If you're using kvm-hv.ko,"
486                          " only \"-cpu host\" is possible");
487         }
488         return ret;
489     }
490 
491     idle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, kvm_kick_cpu, cpu);
492 
493     switch (cenv->mmu_model) {
494     case POWERPC_MMU_BOOKE206:
495         /* This target supports access to KVM's guest TLB */
496         ret = kvm_booke206_tlb_init(cpu);
497         break;
498     case POWERPC_MMU_2_07:
499         if (!cap_htm && !kvmppc_is_pr(cs->kvm_state)) {
500             /* KVM-HV has transactional memory on POWER8 also without the
501              * KVM_CAP_PPC_HTM extension, so enable it here instead as
502              * long as it's availble to userspace on the host. */
503             if (qemu_getauxval(AT_HWCAP2) & PPC_FEATURE2_HAS_HTM) {
504                 cap_htm = true;
505             }
506         }
507         break;
508     default:
509         break;
510     }
511 
512     kvm_get_one_reg(cs, KVM_REG_PPC_DEBUG_INST, &debug_inst_opcode);
513     kvmppc_hw_debug_points_init(cenv);
514 
515     return ret;
516 }
517 
518 static void kvm_sw_tlb_put(PowerPCCPU *cpu)
519 {
520     CPUPPCState *env = &cpu->env;
521     CPUState *cs = CPU(cpu);
522     struct kvm_dirty_tlb dirty_tlb;
523     unsigned char *bitmap;
524     int ret;
525 
526     if (!env->kvm_sw_tlb) {
527         return;
528     }
529 
530     bitmap = g_malloc((env->nb_tlb + 7) / 8);
531     memset(bitmap, 0xFF, (env->nb_tlb + 7) / 8);
532 
533     dirty_tlb.bitmap = (uintptr_t)bitmap;
534     dirty_tlb.num_dirty = env->nb_tlb;
535 
536     ret = kvm_vcpu_ioctl(cs, KVM_DIRTY_TLB, &dirty_tlb);
537     if (ret) {
538         fprintf(stderr, "%s: KVM_DIRTY_TLB: %s\n",
539                 __func__, strerror(-ret));
540     }
541 
542     g_free(bitmap);
543 }
544 
545 static void kvm_get_one_spr(CPUState *cs, uint64_t id, int spr)
546 {
547     PowerPCCPU *cpu = POWERPC_CPU(cs);
548     CPUPPCState *env = &cpu->env;
549     union {
550         uint32_t u32;
551         uint64_t u64;
552     } val;
553     struct kvm_one_reg reg = {
554         .id = id,
555         .addr = (uintptr_t) &val,
556     };
557     int ret;
558 
559     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
560     if (ret != 0) {
561         trace_kvm_failed_spr_get(spr, strerror(errno));
562     } else {
563         switch (id & KVM_REG_SIZE_MASK) {
564         case KVM_REG_SIZE_U32:
565             env->spr[spr] = val.u32;
566             break;
567 
568         case KVM_REG_SIZE_U64:
569             env->spr[spr] = val.u64;
570             break;
571 
572         default:
573             /* Don't handle this size yet */
574             abort();
575         }
576     }
577 }
578 
579 static void kvm_put_one_spr(CPUState *cs, uint64_t id, int spr)
580 {
581     PowerPCCPU *cpu = POWERPC_CPU(cs);
582     CPUPPCState *env = &cpu->env;
583     union {
584         uint32_t u32;
585         uint64_t u64;
586     } val;
587     struct kvm_one_reg reg = {
588         .id = id,
589         .addr = (uintptr_t) &val,
590     };
591     int ret;
592 
593     switch (id & KVM_REG_SIZE_MASK) {
594     case KVM_REG_SIZE_U32:
595         val.u32 = env->spr[spr];
596         break;
597 
598     case KVM_REG_SIZE_U64:
599         val.u64 = env->spr[spr];
600         break;
601 
602     default:
603         /* Don't handle this size yet */
604         abort();
605     }
606 
607     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
608     if (ret != 0) {
609         trace_kvm_failed_spr_set(spr, strerror(errno));
610     }
611 }
612 
613 static int kvm_put_fp(CPUState *cs)
614 {
615     PowerPCCPU *cpu = POWERPC_CPU(cs);
616     CPUPPCState *env = &cpu->env;
617     struct kvm_one_reg reg;
618     int i;
619     int ret;
620 
621     if (env->insns_flags & PPC_FLOAT) {
622         uint64_t fpscr = env->fpscr;
623         bool vsx = !!(env->insns_flags2 & PPC2_VSX);
624 
625         reg.id = KVM_REG_PPC_FPSCR;
626         reg.addr = (uintptr_t)&fpscr;
627         ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
628         if (ret < 0) {
629             DPRINTF("Unable to set FPSCR to KVM: %s\n", strerror(errno));
630             return ret;
631         }
632 
633         for (i = 0; i < 32; i++) {
634             uint64_t vsr[2];
635             uint64_t *fpr = cpu_fpr_ptr(&cpu->env, i);
636             uint64_t *vsrl = cpu_vsrl_ptr(&cpu->env, i);
637 
638 #ifdef HOST_WORDS_BIGENDIAN
639             vsr[0] = float64_val(*fpr);
640             vsr[1] = *vsrl;
641 #else
642             vsr[0] = *vsrl;
643             vsr[1] = float64_val(*fpr);
644 #endif
645             reg.addr = (uintptr_t) &vsr;
646             reg.id = vsx ? KVM_REG_PPC_VSR(i) : KVM_REG_PPC_FPR(i);
647 
648             ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
649             if (ret < 0) {
650                 DPRINTF("Unable to set %s%d to KVM: %s\n", vsx ? "VSR" : "FPR",
651                         i, strerror(errno));
652                 return ret;
653             }
654         }
655     }
656 
657     if (env->insns_flags & PPC_ALTIVEC) {
658         reg.id = KVM_REG_PPC_VSCR;
659         reg.addr = (uintptr_t)&env->vscr;
660         ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
661         if (ret < 0) {
662             DPRINTF("Unable to set VSCR to KVM: %s\n", strerror(errno));
663             return ret;
664         }
665 
666         for (i = 0; i < 32; i++) {
667             reg.id = KVM_REG_PPC_VR(i);
668             reg.addr = (uintptr_t)cpu_avr_ptr(env, i);
669             ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
670             if (ret < 0) {
671                 DPRINTF("Unable to set VR%d to KVM: %s\n", i, strerror(errno));
672                 return ret;
673             }
674         }
675     }
676 
677     return 0;
678 }
679 
680 static int kvm_get_fp(CPUState *cs)
681 {
682     PowerPCCPU *cpu = POWERPC_CPU(cs);
683     CPUPPCState *env = &cpu->env;
684     struct kvm_one_reg reg;
685     int i;
686     int ret;
687 
688     if (env->insns_flags & PPC_FLOAT) {
689         uint64_t fpscr;
690         bool vsx = !!(env->insns_flags2 & PPC2_VSX);
691 
692         reg.id = KVM_REG_PPC_FPSCR;
693         reg.addr = (uintptr_t)&fpscr;
694         ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
695         if (ret < 0) {
696             DPRINTF("Unable to get FPSCR from KVM: %s\n", strerror(errno));
697             return ret;
698         } else {
699             env->fpscr = fpscr;
700         }
701 
702         for (i = 0; i < 32; i++) {
703             uint64_t vsr[2];
704             uint64_t *fpr = cpu_fpr_ptr(&cpu->env, i);
705             uint64_t *vsrl = cpu_vsrl_ptr(&cpu->env, i);
706 
707             reg.addr = (uintptr_t) &vsr;
708             reg.id = vsx ? KVM_REG_PPC_VSR(i) : KVM_REG_PPC_FPR(i);
709 
710             ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
711             if (ret < 0) {
712                 DPRINTF("Unable to get %s%d from KVM: %s\n",
713                         vsx ? "VSR" : "FPR", i, strerror(errno));
714                 return ret;
715             } else {
716 #ifdef HOST_WORDS_BIGENDIAN
717                 *fpr = vsr[0];
718                 if (vsx) {
719                     *vsrl = vsr[1];
720                 }
721 #else
722                 *fpr = vsr[1];
723                 if (vsx) {
724                     *vsrl = vsr[0];
725                 }
726 #endif
727             }
728         }
729     }
730 
731     if (env->insns_flags & PPC_ALTIVEC) {
732         reg.id = KVM_REG_PPC_VSCR;
733         reg.addr = (uintptr_t)&env->vscr;
734         ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
735         if (ret < 0) {
736             DPRINTF("Unable to get VSCR from KVM: %s\n", strerror(errno));
737             return ret;
738         }
739 
740         for (i = 0; i < 32; i++) {
741             reg.id = KVM_REG_PPC_VR(i);
742             reg.addr = (uintptr_t)cpu_avr_ptr(env, i);
743             ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
744             if (ret < 0) {
745                 DPRINTF("Unable to get VR%d from KVM: %s\n",
746                         i, strerror(errno));
747                 return ret;
748             }
749         }
750     }
751 
752     return 0;
753 }
754 
755 #if defined(TARGET_PPC64)
756 static int kvm_get_vpa(CPUState *cs)
757 {
758     PowerPCCPU *cpu = POWERPC_CPU(cs);
759     sPAPRCPUState *spapr_cpu = spapr_cpu_state(cpu);
760     struct kvm_one_reg reg;
761     int ret;
762 
763     reg.id = KVM_REG_PPC_VPA_ADDR;
764     reg.addr = (uintptr_t)&spapr_cpu->vpa_addr;
765     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
766     if (ret < 0) {
767         DPRINTF("Unable to get VPA address from KVM: %s\n", strerror(errno));
768         return ret;
769     }
770 
771     assert((uintptr_t)&spapr_cpu->slb_shadow_size
772            == ((uintptr_t)&spapr_cpu->slb_shadow_addr + 8));
773     reg.id = KVM_REG_PPC_VPA_SLB;
774     reg.addr = (uintptr_t)&spapr_cpu->slb_shadow_addr;
775     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
776     if (ret < 0) {
777         DPRINTF("Unable to get SLB shadow state from KVM: %s\n",
778                 strerror(errno));
779         return ret;
780     }
781 
782     assert((uintptr_t)&spapr_cpu->dtl_size
783            == ((uintptr_t)&spapr_cpu->dtl_addr + 8));
784     reg.id = KVM_REG_PPC_VPA_DTL;
785     reg.addr = (uintptr_t)&spapr_cpu->dtl_addr;
786     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
787     if (ret < 0) {
788         DPRINTF("Unable to get dispatch trace log state from KVM: %s\n",
789                 strerror(errno));
790         return ret;
791     }
792 
793     return 0;
794 }
795 
796 static int kvm_put_vpa(CPUState *cs)
797 {
798     PowerPCCPU *cpu = POWERPC_CPU(cs);
799     sPAPRCPUState *spapr_cpu = spapr_cpu_state(cpu);
800     struct kvm_one_reg reg;
801     int ret;
802 
803     /* SLB shadow or DTL can't be registered unless a master VPA is
804      * registered.  That means when restoring state, if a VPA *is*
805      * registered, we need to set that up first.  If not, we need to
806      * deregister the others before deregistering the master VPA */
807     assert(spapr_cpu->vpa_addr
808            || !(spapr_cpu->slb_shadow_addr || spapr_cpu->dtl_addr));
809 
810     if (spapr_cpu->vpa_addr) {
811         reg.id = KVM_REG_PPC_VPA_ADDR;
812         reg.addr = (uintptr_t)&spapr_cpu->vpa_addr;
813         ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
814         if (ret < 0) {
815             DPRINTF("Unable to set VPA address to KVM: %s\n", strerror(errno));
816             return ret;
817         }
818     }
819 
820     assert((uintptr_t)&spapr_cpu->slb_shadow_size
821            == ((uintptr_t)&spapr_cpu->slb_shadow_addr + 8));
822     reg.id = KVM_REG_PPC_VPA_SLB;
823     reg.addr = (uintptr_t)&spapr_cpu->slb_shadow_addr;
824     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
825     if (ret < 0) {
826         DPRINTF("Unable to set SLB shadow state to KVM: %s\n", strerror(errno));
827         return ret;
828     }
829 
830     assert((uintptr_t)&spapr_cpu->dtl_size
831            == ((uintptr_t)&spapr_cpu->dtl_addr + 8));
832     reg.id = KVM_REG_PPC_VPA_DTL;
833     reg.addr = (uintptr_t)&spapr_cpu->dtl_addr;
834     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
835     if (ret < 0) {
836         DPRINTF("Unable to set dispatch trace log state to KVM: %s\n",
837                 strerror(errno));
838         return ret;
839     }
840 
841     if (!spapr_cpu->vpa_addr) {
842         reg.id = KVM_REG_PPC_VPA_ADDR;
843         reg.addr = (uintptr_t)&spapr_cpu->vpa_addr;
844         ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
845         if (ret < 0) {
846             DPRINTF("Unable to set VPA address to KVM: %s\n", strerror(errno));
847             return ret;
848         }
849     }
850 
851     return 0;
852 }
853 #endif /* TARGET_PPC64 */
854 
855 int kvmppc_put_books_sregs(PowerPCCPU *cpu)
856 {
857     CPUPPCState *env = &cpu->env;
858     struct kvm_sregs sregs;
859     int i;
860 
861     sregs.pvr = env->spr[SPR_PVR];
862 
863     if (cpu->vhyp) {
864         PPCVirtualHypervisorClass *vhc =
865             PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
866         sregs.u.s.sdr1 = vhc->encode_hpt_for_kvm_pr(cpu->vhyp);
867     } else {
868         sregs.u.s.sdr1 = env->spr[SPR_SDR1];
869     }
870 
871     /* Sync SLB */
872 #ifdef TARGET_PPC64
873     for (i = 0; i < ARRAY_SIZE(env->slb); i++) {
874         sregs.u.s.ppc64.slb[i].slbe = env->slb[i].esid;
875         if (env->slb[i].esid & SLB_ESID_V) {
876             sregs.u.s.ppc64.slb[i].slbe |= i;
877         }
878         sregs.u.s.ppc64.slb[i].slbv = env->slb[i].vsid;
879     }
880 #endif
881 
882     /* Sync SRs */
883     for (i = 0; i < 16; i++) {
884         sregs.u.s.ppc32.sr[i] = env->sr[i];
885     }
886 
887     /* Sync BATs */
888     for (i = 0; i < 8; i++) {
889         /* Beware. We have to swap upper and lower bits here */
890         sregs.u.s.ppc32.dbat[i] = ((uint64_t)env->DBAT[0][i] << 32)
891             | env->DBAT[1][i];
892         sregs.u.s.ppc32.ibat[i] = ((uint64_t)env->IBAT[0][i] << 32)
893             | env->IBAT[1][i];
894     }
895 
896     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
897 }
898 
899 int kvm_arch_put_registers(CPUState *cs, int level)
900 {
901     PowerPCCPU *cpu = POWERPC_CPU(cs);
902     CPUPPCState *env = &cpu->env;
903     struct kvm_regs regs;
904     int ret;
905     int i;
906 
907     ret = kvm_vcpu_ioctl(cs, KVM_GET_REGS, &regs);
908     if (ret < 0) {
909         return ret;
910     }
911 
912     regs.ctr = env->ctr;
913     regs.lr  = env->lr;
914     regs.xer = cpu_read_xer(env);
915     regs.msr = env->msr;
916     regs.pc = env->nip;
917 
918     regs.srr0 = env->spr[SPR_SRR0];
919     regs.srr1 = env->spr[SPR_SRR1];
920 
921     regs.sprg0 = env->spr[SPR_SPRG0];
922     regs.sprg1 = env->spr[SPR_SPRG1];
923     regs.sprg2 = env->spr[SPR_SPRG2];
924     regs.sprg3 = env->spr[SPR_SPRG3];
925     regs.sprg4 = env->spr[SPR_SPRG4];
926     regs.sprg5 = env->spr[SPR_SPRG5];
927     regs.sprg6 = env->spr[SPR_SPRG6];
928     regs.sprg7 = env->spr[SPR_SPRG7];
929 
930     regs.pid = env->spr[SPR_BOOKE_PID];
931 
932     for (i = 0;i < 32; i++)
933         regs.gpr[i] = env->gpr[i];
934 
935     regs.cr = 0;
936     for (i = 0; i < 8; i++) {
937         regs.cr |= (env->crf[i] & 15) << (4 * (7 - i));
938     }
939 
940     ret = kvm_vcpu_ioctl(cs, KVM_SET_REGS, &regs);
941     if (ret < 0)
942         return ret;
943 
944     kvm_put_fp(cs);
945 
946     if (env->tlb_dirty) {
947         kvm_sw_tlb_put(cpu);
948         env->tlb_dirty = false;
949     }
950 
951     if (cap_segstate && (level >= KVM_PUT_RESET_STATE)) {
952         ret = kvmppc_put_books_sregs(cpu);
953         if (ret < 0) {
954             return ret;
955         }
956     }
957 
958     if (cap_hior && (level >= KVM_PUT_RESET_STATE)) {
959         kvm_put_one_spr(cs, KVM_REG_PPC_HIOR, SPR_HIOR);
960     }
961 
962     if (cap_one_reg) {
963         int i;
964 
965         /* We deliberately ignore errors here, for kernels which have
966          * the ONE_REG calls, but don't support the specific
967          * registers, there's a reasonable chance things will still
968          * work, at least until we try to migrate. */
969         for (i = 0; i < 1024; i++) {
970             uint64_t id = env->spr_cb[i].one_reg_id;
971 
972             if (id != 0) {
973                 kvm_put_one_spr(cs, id, i);
974             }
975         }
976 
977 #ifdef TARGET_PPC64
978         if (msr_ts) {
979             for (i = 0; i < ARRAY_SIZE(env->tm_gpr); i++) {
980                 kvm_set_one_reg(cs, KVM_REG_PPC_TM_GPR(i), &env->tm_gpr[i]);
981             }
982             for (i = 0; i < ARRAY_SIZE(env->tm_vsr); i++) {
983                 kvm_set_one_reg(cs, KVM_REG_PPC_TM_VSR(i), &env->tm_vsr[i]);
984             }
985             kvm_set_one_reg(cs, KVM_REG_PPC_TM_CR, &env->tm_cr);
986             kvm_set_one_reg(cs, KVM_REG_PPC_TM_LR, &env->tm_lr);
987             kvm_set_one_reg(cs, KVM_REG_PPC_TM_CTR, &env->tm_ctr);
988             kvm_set_one_reg(cs, KVM_REG_PPC_TM_FPSCR, &env->tm_fpscr);
989             kvm_set_one_reg(cs, KVM_REG_PPC_TM_AMR, &env->tm_amr);
990             kvm_set_one_reg(cs, KVM_REG_PPC_TM_PPR, &env->tm_ppr);
991             kvm_set_one_reg(cs, KVM_REG_PPC_TM_VRSAVE, &env->tm_vrsave);
992             kvm_set_one_reg(cs, KVM_REG_PPC_TM_VSCR, &env->tm_vscr);
993             kvm_set_one_reg(cs, KVM_REG_PPC_TM_DSCR, &env->tm_dscr);
994             kvm_set_one_reg(cs, KVM_REG_PPC_TM_TAR, &env->tm_tar);
995         }
996 
997         if (cap_papr) {
998             if (kvm_put_vpa(cs) < 0) {
999                 DPRINTF("Warning: Unable to set VPA information to KVM\n");
1000             }
1001         }
1002 
1003         kvm_set_one_reg(cs, KVM_REG_PPC_TB_OFFSET, &env->tb_env->tb_offset);
1004 #endif /* TARGET_PPC64 */
1005     }
1006 
1007     return ret;
1008 }
1009 
1010 static void kvm_sync_excp(CPUPPCState *env, int vector, int ivor)
1011 {
1012      env->excp_vectors[vector] = env->spr[ivor] + env->spr[SPR_BOOKE_IVPR];
1013 }
1014 
1015 static int kvmppc_get_booke_sregs(PowerPCCPU *cpu)
1016 {
1017     CPUPPCState *env = &cpu->env;
1018     struct kvm_sregs sregs;
1019     int ret;
1020 
1021     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
1022     if (ret < 0) {
1023         return ret;
1024     }
1025 
1026     if (sregs.u.e.features & KVM_SREGS_E_BASE) {
1027         env->spr[SPR_BOOKE_CSRR0] = sregs.u.e.csrr0;
1028         env->spr[SPR_BOOKE_CSRR1] = sregs.u.e.csrr1;
1029         env->spr[SPR_BOOKE_ESR] = sregs.u.e.esr;
1030         env->spr[SPR_BOOKE_DEAR] = sregs.u.e.dear;
1031         env->spr[SPR_BOOKE_MCSR] = sregs.u.e.mcsr;
1032         env->spr[SPR_BOOKE_TSR] = sregs.u.e.tsr;
1033         env->spr[SPR_BOOKE_TCR] = sregs.u.e.tcr;
1034         env->spr[SPR_DECR] = sregs.u.e.dec;
1035         env->spr[SPR_TBL] = sregs.u.e.tb & 0xffffffff;
1036         env->spr[SPR_TBU] = sregs.u.e.tb >> 32;
1037         env->spr[SPR_VRSAVE] = sregs.u.e.vrsave;
1038     }
1039 
1040     if (sregs.u.e.features & KVM_SREGS_E_ARCH206) {
1041         env->spr[SPR_BOOKE_PIR] = sregs.u.e.pir;
1042         env->spr[SPR_BOOKE_MCSRR0] = sregs.u.e.mcsrr0;
1043         env->spr[SPR_BOOKE_MCSRR1] = sregs.u.e.mcsrr1;
1044         env->spr[SPR_BOOKE_DECAR] = sregs.u.e.decar;
1045         env->spr[SPR_BOOKE_IVPR] = sregs.u.e.ivpr;
1046     }
1047 
1048     if (sregs.u.e.features & KVM_SREGS_E_64) {
1049         env->spr[SPR_BOOKE_EPCR] = sregs.u.e.epcr;
1050     }
1051 
1052     if (sregs.u.e.features & KVM_SREGS_E_SPRG8) {
1053         env->spr[SPR_BOOKE_SPRG8] = sregs.u.e.sprg8;
1054     }
1055 
1056     if (sregs.u.e.features & KVM_SREGS_E_IVOR) {
1057         env->spr[SPR_BOOKE_IVOR0] = sregs.u.e.ivor_low[0];
1058         kvm_sync_excp(env, POWERPC_EXCP_CRITICAL,  SPR_BOOKE_IVOR0);
1059         env->spr[SPR_BOOKE_IVOR1] = sregs.u.e.ivor_low[1];
1060         kvm_sync_excp(env, POWERPC_EXCP_MCHECK,  SPR_BOOKE_IVOR1);
1061         env->spr[SPR_BOOKE_IVOR2] = sregs.u.e.ivor_low[2];
1062         kvm_sync_excp(env, POWERPC_EXCP_DSI,  SPR_BOOKE_IVOR2);
1063         env->spr[SPR_BOOKE_IVOR3] = sregs.u.e.ivor_low[3];
1064         kvm_sync_excp(env, POWERPC_EXCP_ISI,  SPR_BOOKE_IVOR3);
1065         env->spr[SPR_BOOKE_IVOR4] = sregs.u.e.ivor_low[4];
1066         kvm_sync_excp(env, POWERPC_EXCP_EXTERNAL,  SPR_BOOKE_IVOR4);
1067         env->spr[SPR_BOOKE_IVOR5] = sregs.u.e.ivor_low[5];
1068         kvm_sync_excp(env, POWERPC_EXCP_ALIGN,  SPR_BOOKE_IVOR5);
1069         env->spr[SPR_BOOKE_IVOR6] = sregs.u.e.ivor_low[6];
1070         kvm_sync_excp(env, POWERPC_EXCP_PROGRAM,  SPR_BOOKE_IVOR6);
1071         env->spr[SPR_BOOKE_IVOR7] = sregs.u.e.ivor_low[7];
1072         kvm_sync_excp(env, POWERPC_EXCP_FPU,  SPR_BOOKE_IVOR7);
1073         env->spr[SPR_BOOKE_IVOR8] = sregs.u.e.ivor_low[8];
1074         kvm_sync_excp(env, POWERPC_EXCP_SYSCALL,  SPR_BOOKE_IVOR8);
1075         env->spr[SPR_BOOKE_IVOR9] = sregs.u.e.ivor_low[9];
1076         kvm_sync_excp(env, POWERPC_EXCP_APU,  SPR_BOOKE_IVOR9);
1077         env->spr[SPR_BOOKE_IVOR10] = sregs.u.e.ivor_low[10];
1078         kvm_sync_excp(env, POWERPC_EXCP_DECR,  SPR_BOOKE_IVOR10);
1079         env->spr[SPR_BOOKE_IVOR11] = sregs.u.e.ivor_low[11];
1080         kvm_sync_excp(env, POWERPC_EXCP_FIT,  SPR_BOOKE_IVOR11);
1081         env->spr[SPR_BOOKE_IVOR12] = sregs.u.e.ivor_low[12];
1082         kvm_sync_excp(env, POWERPC_EXCP_WDT,  SPR_BOOKE_IVOR12);
1083         env->spr[SPR_BOOKE_IVOR13] = sregs.u.e.ivor_low[13];
1084         kvm_sync_excp(env, POWERPC_EXCP_DTLB,  SPR_BOOKE_IVOR13);
1085         env->spr[SPR_BOOKE_IVOR14] = sregs.u.e.ivor_low[14];
1086         kvm_sync_excp(env, POWERPC_EXCP_ITLB,  SPR_BOOKE_IVOR14);
1087         env->spr[SPR_BOOKE_IVOR15] = sregs.u.e.ivor_low[15];
1088         kvm_sync_excp(env, POWERPC_EXCP_DEBUG,  SPR_BOOKE_IVOR15);
1089 
1090         if (sregs.u.e.features & KVM_SREGS_E_SPE) {
1091             env->spr[SPR_BOOKE_IVOR32] = sregs.u.e.ivor_high[0];
1092             kvm_sync_excp(env, POWERPC_EXCP_SPEU,  SPR_BOOKE_IVOR32);
1093             env->spr[SPR_BOOKE_IVOR33] = sregs.u.e.ivor_high[1];
1094             kvm_sync_excp(env, POWERPC_EXCP_EFPDI,  SPR_BOOKE_IVOR33);
1095             env->spr[SPR_BOOKE_IVOR34] = sregs.u.e.ivor_high[2];
1096             kvm_sync_excp(env, POWERPC_EXCP_EFPRI,  SPR_BOOKE_IVOR34);
1097         }
1098 
1099         if (sregs.u.e.features & KVM_SREGS_E_PM) {
1100             env->spr[SPR_BOOKE_IVOR35] = sregs.u.e.ivor_high[3];
1101             kvm_sync_excp(env, POWERPC_EXCP_EPERFM,  SPR_BOOKE_IVOR35);
1102         }
1103 
1104         if (sregs.u.e.features & KVM_SREGS_E_PC) {
1105             env->spr[SPR_BOOKE_IVOR36] = sregs.u.e.ivor_high[4];
1106             kvm_sync_excp(env, POWERPC_EXCP_DOORI,  SPR_BOOKE_IVOR36);
1107             env->spr[SPR_BOOKE_IVOR37] = sregs.u.e.ivor_high[5];
1108             kvm_sync_excp(env, POWERPC_EXCP_DOORCI, SPR_BOOKE_IVOR37);
1109         }
1110     }
1111 
1112     if (sregs.u.e.features & KVM_SREGS_E_ARCH206_MMU) {
1113         env->spr[SPR_BOOKE_MAS0] = sregs.u.e.mas0;
1114         env->spr[SPR_BOOKE_MAS1] = sregs.u.e.mas1;
1115         env->spr[SPR_BOOKE_MAS2] = sregs.u.e.mas2;
1116         env->spr[SPR_BOOKE_MAS3] = sregs.u.e.mas7_3 & 0xffffffff;
1117         env->spr[SPR_BOOKE_MAS4] = sregs.u.e.mas4;
1118         env->spr[SPR_BOOKE_MAS6] = sregs.u.e.mas6;
1119         env->spr[SPR_BOOKE_MAS7] = sregs.u.e.mas7_3 >> 32;
1120         env->spr[SPR_MMUCFG] = sregs.u.e.mmucfg;
1121         env->spr[SPR_BOOKE_TLB0CFG] = sregs.u.e.tlbcfg[0];
1122         env->spr[SPR_BOOKE_TLB1CFG] = sregs.u.e.tlbcfg[1];
1123     }
1124 
1125     if (sregs.u.e.features & KVM_SREGS_EXP) {
1126         env->spr[SPR_BOOKE_EPR] = sregs.u.e.epr;
1127     }
1128 
1129     if (sregs.u.e.features & KVM_SREGS_E_PD) {
1130         env->spr[SPR_BOOKE_EPLC] = sregs.u.e.eplc;
1131         env->spr[SPR_BOOKE_EPSC] = sregs.u.e.epsc;
1132     }
1133 
1134     if (sregs.u.e.impl_id == KVM_SREGS_E_IMPL_FSL) {
1135         env->spr[SPR_E500_SVR] = sregs.u.e.impl.fsl.svr;
1136         env->spr[SPR_Exxx_MCAR] = sregs.u.e.impl.fsl.mcar;
1137         env->spr[SPR_HID0] = sregs.u.e.impl.fsl.hid0;
1138 
1139         if (sregs.u.e.impl.fsl.features & KVM_SREGS_E_FSL_PIDn) {
1140             env->spr[SPR_BOOKE_PID1] = sregs.u.e.impl.fsl.pid1;
1141             env->spr[SPR_BOOKE_PID2] = sregs.u.e.impl.fsl.pid2;
1142         }
1143     }
1144 
1145     return 0;
1146 }
1147 
1148 static int kvmppc_get_books_sregs(PowerPCCPU *cpu)
1149 {
1150     CPUPPCState *env = &cpu->env;
1151     struct kvm_sregs sregs;
1152     int ret;
1153     int i;
1154 
1155     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
1156     if (ret < 0) {
1157         return ret;
1158     }
1159 
1160     if (!cpu->vhyp) {
1161         ppc_store_sdr1(env, sregs.u.s.sdr1);
1162     }
1163 
1164     /* Sync SLB */
1165 #ifdef TARGET_PPC64
1166     /*
1167      * The packed SLB array we get from KVM_GET_SREGS only contains
1168      * information about valid entries. So we flush our internal copy
1169      * to get rid of stale ones, then put all valid SLB entries back
1170      * in.
1171      */
1172     memset(env->slb, 0, sizeof(env->slb));
1173     for (i = 0; i < ARRAY_SIZE(env->slb); i++) {
1174         target_ulong rb = sregs.u.s.ppc64.slb[i].slbe;
1175         target_ulong rs = sregs.u.s.ppc64.slb[i].slbv;
1176         /*
1177          * Only restore valid entries
1178          */
1179         if (rb & SLB_ESID_V) {
1180             ppc_store_slb(cpu, rb & 0xfff, rb & ~0xfffULL, rs);
1181         }
1182     }
1183 #endif
1184 
1185     /* Sync SRs */
1186     for (i = 0; i < 16; i++) {
1187         env->sr[i] = sregs.u.s.ppc32.sr[i];
1188     }
1189 
1190     /* Sync BATs */
1191     for (i = 0; i < 8; i++) {
1192         env->DBAT[0][i] = sregs.u.s.ppc32.dbat[i] & 0xffffffff;
1193         env->DBAT[1][i] = sregs.u.s.ppc32.dbat[i] >> 32;
1194         env->IBAT[0][i] = sregs.u.s.ppc32.ibat[i] & 0xffffffff;
1195         env->IBAT[1][i] = sregs.u.s.ppc32.ibat[i] >> 32;
1196     }
1197 
1198     return 0;
1199 }
1200 
1201 int kvm_arch_get_registers(CPUState *cs)
1202 {
1203     PowerPCCPU *cpu = POWERPC_CPU(cs);
1204     CPUPPCState *env = &cpu->env;
1205     struct kvm_regs regs;
1206     uint32_t cr;
1207     int i, ret;
1208 
1209     ret = kvm_vcpu_ioctl(cs, KVM_GET_REGS, &regs);
1210     if (ret < 0)
1211         return ret;
1212 
1213     cr = regs.cr;
1214     for (i = 7; i >= 0; i--) {
1215         env->crf[i] = cr & 15;
1216         cr >>= 4;
1217     }
1218 
1219     env->ctr = regs.ctr;
1220     env->lr = regs.lr;
1221     cpu_write_xer(env, regs.xer);
1222     env->msr = regs.msr;
1223     env->nip = regs.pc;
1224 
1225     env->spr[SPR_SRR0] = regs.srr0;
1226     env->spr[SPR_SRR1] = regs.srr1;
1227 
1228     env->spr[SPR_SPRG0] = regs.sprg0;
1229     env->spr[SPR_SPRG1] = regs.sprg1;
1230     env->spr[SPR_SPRG2] = regs.sprg2;
1231     env->spr[SPR_SPRG3] = regs.sprg3;
1232     env->spr[SPR_SPRG4] = regs.sprg4;
1233     env->spr[SPR_SPRG5] = regs.sprg5;
1234     env->spr[SPR_SPRG6] = regs.sprg6;
1235     env->spr[SPR_SPRG7] = regs.sprg7;
1236 
1237     env->spr[SPR_BOOKE_PID] = regs.pid;
1238 
1239     for (i = 0;i < 32; i++)
1240         env->gpr[i] = regs.gpr[i];
1241 
1242     kvm_get_fp(cs);
1243 
1244     if (cap_booke_sregs) {
1245         ret = kvmppc_get_booke_sregs(cpu);
1246         if (ret < 0) {
1247             return ret;
1248         }
1249     }
1250 
1251     if (cap_segstate) {
1252         ret = kvmppc_get_books_sregs(cpu);
1253         if (ret < 0) {
1254             return ret;
1255         }
1256     }
1257 
1258     if (cap_hior) {
1259         kvm_get_one_spr(cs, KVM_REG_PPC_HIOR, SPR_HIOR);
1260     }
1261 
1262     if (cap_one_reg) {
1263         int i;
1264 
1265         /* We deliberately ignore errors here, for kernels which have
1266          * the ONE_REG calls, but don't support the specific
1267          * registers, there's a reasonable chance things will still
1268          * work, at least until we try to migrate. */
1269         for (i = 0; i < 1024; i++) {
1270             uint64_t id = env->spr_cb[i].one_reg_id;
1271 
1272             if (id != 0) {
1273                 kvm_get_one_spr(cs, id, i);
1274             }
1275         }
1276 
1277 #ifdef TARGET_PPC64
1278         if (msr_ts) {
1279             for (i = 0; i < ARRAY_SIZE(env->tm_gpr); i++) {
1280                 kvm_get_one_reg(cs, KVM_REG_PPC_TM_GPR(i), &env->tm_gpr[i]);
1281             }
1282             for (i = 0; i < ARRAY_SIZE(env->tm_vsr); i++) {
1283                 kvm_get_one_reg(cs, KVM_REG_PPC_TM_VSR(i), &env->tm_vsr[i]);
1284             }
1285             kvm_get_one_reg(cs, KVM_REG_PPC_TM_CR, &env->tm_cr);
1286             kvm_get_one_reg(cs, KVM_REG_PPC_TM_LR, &env->tm_lr);
1287             kvm_get_one_reg(cs, KVM_REG_PPC_TM_CTR, &env->tm_ctr);
1288             kvm_get_one_reg(cs, KVM_REG_PPC_TM_FPSCR, &env->tm_fpscr);
1289             kvm_get_one_reg(cs, KVM_REG_PPC_TM_AMR, &env->tm_amr);
1290             kvm_get_one_reg(cs, KVM_REG_PPC_TM_PPR, &env->tm_ppr);
1291             kvm_get_one_reg(cs, KVM_REG_PPC_TM_VRSAVE, &env->tm_vrsave);
1292             kvm_get_one_reg(cs, KVM_REG_PPC_TM_VSCR, &env->tm_vscr);
1293             kvm_get_one_reg(cs, KVM_REG_PPC_TM_DSCR, &env->tm_dscr);
1294             kvm_get_one_reg(cs, KVM_REG_PPC_TM_TAR, &env->tm_tar);
1295         }
1296 
1297         if (cap_papr) {
1298             if (kvm_get_vpa(cs) < 0) {
1299                 DPRINTF("Warning: Unable to get VPA information from KVM\n");
1300             }
1301         }
1302 
1303         kvm_get_one_reg(cs, KVM_REG_PPC_TB_OFFSET, &env->tb_env->tb_offset);
1304 #endif
1305     }
1306 
1307     return 0;
1308 }
1309 
1310 int kvmppc_set_interrupt(PowerPCCPU *cpu, int irq, int level)
1311 {
1312     unsigned virq = level ? KVM_INTERRUPT_SET_LEVEL : KVM_INTERRUPT_UNSET;
1313 
1314     if (irq != PPC_INTERRUPT_EXT) {
1315         return 0;
1316     }
1317 
1318     if (!kvm_enabled() || !cap_interrupt_unset || !cap_interrupt_level) {
1319         return 0;
1320     }
1321 
1322     kvm_vcpu_ioctl(CPU(cpu), KVM_INTERRUPT, &virq);
1323 
1324     return 0;
1325 }
1326 
/* Input pin whose bit in env->irq_input_state kvm_arch_pre_run() tests */
#ifdef TARGET_PPC64
#define PPC_INPUT_INT PPC970_INPUT_INT
#else
#define PPC_INPUT_INT PPC6xx_INPUT_INT
#endif
1332 
1333 void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run)
1334 {
1335     PowerPCCPU *cpu = POWERPC_CPU(cs);
1336     CPUPPCState *env = &cpu->env;
1337     int r;
1338     unsigned irq;
1339 
1340     qemu_mutex_lock_iothread();
1341 
1342     /* PowerPC QEMU tracks the various core input pins (interrupt, critical
1343      * interrupt, reset, etc) in PPC-specific env->irq_input_state. */
1344     if (!cap_interrupt_level &&
1345         run->ready_for_interrupt_injection &&
1346         (cs->interrupt_request & CPU_INTERRUPT_HARD) &&
1347         (env->irq_input_state & (1<<PPC_INPUT_INT)))
1348     {
1349         /* For now KVM disregards the 'irq' argument. However, in the
1350          * future KVM could cache it in-kernel to avoid a heavyweight exit
1351          * when reading the UIC.
1352          */
1353         irq = KVM_INTERRUPT_SET;
1354 
1355         DPRINTF("injected interrupt %d\n", irq);
1356         r = kvm_vcpu_ioctl(cs, KVM_INTERRUPT, &irq);
1357         if (r < 0) {
1358             printf("cpu %d fail inject %x\n", cs->cpu_index, irq);
1359         }
1360 
1361         /* Always wake up soon in case the interrupt was level based */
1362         timer_mod(idle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
1363                        (NANOSECONDS_PER_SECOND / 50));
1364     }
1365 
1366     /* We don't know if there are more interrupts pending after this. However,
1367      * the guest will return to userspace in the course of handling this one
1368      * anyways, so we will get a chance to deliver the rest. */
1369 
1370     qemu_mutex_unlock_iothread();
1371 }
1372 
1373 MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run)
1374 {
1375     return MEMTXATTRS_UNSPECIFIED;
1376 }
1377 
1378 int kvm_arch_process_async_events(CPUState *cs)
1379 {
1380     return cs->halted;
1381 }
1382 
1383 static int kvmppc_handle_halt(PowerPCCPU *cpu)
1384 {
1385     CPUState *cs = CPU(cpu);
1386     CPUPPCState *env = &cpu->env;
1387 
1388     if (!(cs->interrupt_request & CPU_INTERRUPT_HARD) && (msr_ee)) {
1389         cs->halted = 1;
1390         cs->exception_index = EXCP_HLT;
1391     }
1392 
1393     return 0;
1394 }
1395 
1396 /* map dcr access to existing qemu dcr emulation */
1397 static int kvmppc_handle_dcr_read(CPUPPCState *env, uint32_t dcrn, uint32_t *data)
1398 {
1399     if (ppc_dcr_read(env->dcr_env, dcrn, data) < 0)
1400         fprintf(stderr, "Read to unhandled DCR (0x%x)\n", dcrn);
1401 
1402     return 0;
1403 }
1404 
1405 static int kvmppc_handle_dcr_write(CPUPPCState *env, uint32_t dcrn, uint32_t data)
1406 {
1407     if (ppc_dcr_write(env->dcr_env, dcrn, data) < 0)
1408         fprintf(stderr, "Write to unhandled DCR (0x%x)\n", dcrn);
1409 
1410     return 0;
1411 }
1412 
1413 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
1414 {
1415     /* Mixed endian case is not handled */
1416     uint32_t sc = debug_inst_opcode;
1417 
1418     if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn,
1419                             sizeof(sc), 0) ||
1420         cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&sc, sizeof(sc), 1)) {
1421         return -EINVAL;
1422     }
1423 
1424     return 0;
1425 }
1426 
1427 int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
1428 {
1429     uint32_t sc;
1430 
1431     if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&sc, sizeof(sc), 0) ||
1432         sc != debug_inst_opcode ||
1433         cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn,
1434                             sizeof(sc), 1)) {
1435         return -EINVAL;
1436     }
1437 
1438     return 0;
1439 }
1440 
1441 static int find_hw_breakpoint(target_ulong addr, int type)
1442 {
1443     int n;
1444 
1445     assert((nb_hw_breakpoint + nb_hw_watchpoint)
1446            <= ARRAY_SIZE(hw_debug_points));
1447 
1448     for (n = 0; n < nb_hw_breakpoint + nb_hw_watchpoint; n++) {
1449         if (hw_debug_points[n].addr == addr &&
1450              hw_debug_points[n].type == type) {
1451             return n;
1452         }
1453     }
1454 
1455     return -1;
1456 }
1457 
1458 static int find_hw_watchpoint(target_ulong addr, int *flag)
1459 {
1460     int n;
1461 
1462     n = find_hw_breakpoint(addr, GDB_WATCHPOINT_ACCESS);
1463     if (n >= 0) {
1464         *flag = BP_MEM_ACCESS;
1465         return n;
1466     }
1467 
1468     n = find_hw_breakpoint(addr, GDB_WATCHPOINT_WRITE);
1469     if (n >= 0) {
1470         *flag = BP_MEM_WRITE;
1471         return n;
1472     }
1473 
1474     n = find_hw_breakpoint(addr, GDB_WATCHPOINT_READ);
1475     if (n >= 0) {
1476         *flag = BP_MEM_READ;
1477         return n;
1478     }
1479 
1480     return -1;
1481 }
1482 
1483 int kvm_arch_insert_hw_breakpoint(target_ulong addr,
1484                                   target_ulong len, int type)
1485 {
1486     if ((nb_hw_breakpoint + nb_hw_watchpoint) >= ARRAY_SIZE(hw_debug_points)) {
1487         return -ENOBUFS;
1488     }
1489 
1490     hw_debug_points[nb_hw_breakpoint + nb_hw_watchpoint].addr = addr;
1491     hw_debug_points[nb_hw_breakpoint + nb_hw_watchpoint].type = type;
1492 
1493     switch (type) {
1494     case GDB_BREAKPOINT_HW:
1495         if (nb_hw_breakpoint >= max_hw_breakpoint) {
1496             return -ENOBUFS;
1497         }
1498 
1499         if (find_hw_breakpoint(addr, type) >= 0) {
1500             return -EEXIST;
1501         }
1502 
1503         nb_hw_breakpoint++;
1504         break;
1505 
1506     case GDB_WATCHPOINT_WRITE:
1507     case GDB_WATCHPOINT_READ:
1508     case GDB_WATCHPOINT_ACCESS:
1509         if (nb_hw_watchpoint >= max_hw_watchpoint) {
1510             return -ENOBUFS;
1511         }
1512 
1513         if (find_hw_breakpoint(addr, type) >= 0) {
1514             return -EEXIST;
1515         }
1516 
1517         nb_hw_watchpoint++;
1518         break;
1519 
1520     default:
1521         return -ENOSYS;
1522     }
1523 
1524     return 0;
1525 }
1526 
1527 int kvm_arch_remove_hw_breakpoint(target_ulong addr,
1528                                   target_ulong len, int type)
1529 {
1530     int n;
1531 
1532     n = find_hw_breakpoint(addr, type);
1533     if (n < 0) {
1534         return -ENOENT;
1535     }
1536 
1537     switch (type) {
1538     case GDB_BREAKPOINT_HW:
1539         nb_hw_breakpoint--;
1540         break;
1541 
1542     case GDB_WATCHPOINT_WRITE:
1543     case GDB_WATCHPOINT_READ:
1544     case GDB_WATCHPOINT_ACCESS:
1545         nb_hw_watchpoint--;
1546         break;
1547 
1548     default:
1549         return -ENOSYS;
1550     }
1551     hw_debug_points[n] = hw_debug_points[nb_hw_breakpoint + nb_hw_watchpoint];
1552 
1553     return 0;
1554 }
1555 
1556 void kvm_arch_remove_all_hw_breakpoints(void)
1557 {
1558     nb_hw_breakpoint = nb_hw_watchpoint = 0;
1559 }
1560 
1561 void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
1562 {
1563     int n;
1564 
1565     /* Software Breakpoint updates */
1566     if (kvm_sw_breakpoints_active(cs)) {
1567         dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
1568     }
1569 
1570     assert((nb_hw_breakpoint + nb_hw_watchpoint)
1571            <= ARRAY_SIZE(hw_debug_points));
1572     assert((nb_hw_breakpoint + nb_hw_watchpoint) <= ARRAY_SIZE(dbg->arch.bp));
1573 
1574     if (nb_hw_breakpoint + nb_hw_watchpoint > 0) {
1575         dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
1576         memset(dbg->arch.bp, 0, sizeof(dbg->arch.bp));
1577         for (n = 0; n < nb_hw_breakpoint + nb_hw_watchpoint; n++) {
1578             switch (hw_debug_points[n].type) {
1579             case GDB_BREAKPOINT_HW:
1580                 dbg->arch.bp[n].type = KVMPPC_DEBUG_BREAKPOINT;
1581                 break;
1582             case GDB_WATCHPOINT_WRITE:
1583                 dbg->arch.bp[n].type = KVMPPC_DEBUG_WATCH_WRITE;
1584                 break;
1585             case GDB_WATCHPOINT_READ:
1586                 dbg->arch.bp[n].type = KVMPPC_DEBUG_WATCH_READ;
1587                 break;
1588             case GDB_WATCHPOINT_ACCESS:
1589                 dbg->arch.bp[n].type = KVMPPC_DEBUG_WATCH_WRITE |
1590                                         KVMPPC_DEBUG_WATCH_READ;
1591                 break;
1592             default:
1593                 cpu_abort(cs, "Unsupported breakpoint type\n");
1594             }
1595             dbg->arch.bp[n].addr = hw_debug_points[n].addr;
1596         }
1597     }
1598 }
1599 
1600 static int kvm_handle_hw_breakpoint(CPUState *cs,
1601                                     struct kvm_debug_exit_arch *arch_info)
1602 {
1603     int handle = 0;
1604     int n;
1605     int flag = 0;
1606 
1607     if (nb_hw_breakpoint + nb_hw_watchpoint > 0) {
1608         if (arch_info->status & KVMPPC_DEBUG_BREAKPOINT) {
1609             n = find_hw_breakpoint(arch_info->address, GDB_BREAKPOINT_HW);
1610             if (n >= 0) {
1611                 handle = 1;
1612             }
1613         } else if (arch_info->status & (KVMPPC_DEBUG_WATCH_READ |
1614                                         KVMPPC_DEBUG_WATCH_WRITE)) {
1615             n = find_hw_watchpoint(arch_info->address,  &flag);
1616             if (n >= 0) {
1617                 handle = 1;
1618                 cs->watchpoint_hit = &hw_watchpoint;
1619                 hw_watchpoint.vaddr = hw_debug_points[n].addr;
1620                 hw_watchpoint.flags = flag;
1621             }
1622         }
1623     }
1624     return handle;
1625 }
1626 
1627 static int kvm_handle_debug(PowerPCCPU *cpu, struct kvm_run *run)
1628 {
1629     CPUState *cs = CPU(cpu);
1630     CPUPPCState *env = &cpu->env;
1631     struct kvm_debug_exit_arch *arch_info = &run->debug.arch;
1632     int handle = 0;
1633 
1634     if (cs->singlestep_enabled) {
1635         handle = 1;
1636     } else if (arch_info->status) {
1637         handle = kvm_handle_hw_breakpoint(cs, arch_info);
1638     } else if (kvm_find_sw_breakpoint(cs, arch_info->address)) {
1639         handle = 1;
1640     } else {
1641         /* QEMU is not able to handle debug exception, so inject
1642          * program exception to guest;
1643          * Yes program exception NOT debug exception !!
1644          * When QEMU is using debug resources then debug exception must
1645          * be always set. To achieve this we set MSR_DE and also set
1646          * MSRP_DEP so guest cannot change MSR_DE.
1647          * When emulating debug resource for guest we want guest
1648          * to control MSR_DE (enable/disable debug interrupt on need).
1649          * Supporting both configurations are NOT possible.
1650          * So the result is that we cannot share debug resources
1651          * between QEMU and Guest on BOOKE architecture.
1652          * In the current design QEMU gets the priority over guest,
1653          * this means that if QEMU is using debug resources then guest
1654          * cannot use them;
1655          * For software breakpoint QEMU uses a privileged instruction;
1656          * So there cannot be any reason that we are here for guest
1657          * set debug exception, only possibility is guest executed a
1658          * privileged / illegal instruction and that's why we are
1659          * injecting a program interrupt.
1660          */
1661 
1662         cpu_synchronize_state(cs);
1663         /* env->nip is PC, so increment this by 4 to use
1664          * ppc_cpu_do_interrupt(), which set srr0 = env->nip - 4.
1665          */
1666         env->nip += 4;
1667         cs->exception_index = POWERPC_EXCP_PROGRAM;
1668         env->error_code = POWERPC_EXCP_INVAL;
1669         ppc_cpu_do_interrupt(cs);
1670     }
1671 
1672     return handle;
1673 }
1674 
1675 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
1676 {
1677     PowerPCCPU *cpu = POWERPC_CPU(cs);
1678     CPUPPCState *env = &cpu->env;
1679     int ret;
1680 
1681     qemu_mutex_lock_iothread();
1682 
1683     switch (run->exit_reason) {
1684     case KVM_EXIT_DCR:
1685         if (run->dcr.is_write) {
1686             DPRINTF("handle dcr write\n");
1687             ret = kvmppc_handle_dcr_write(env, run->dcr.dcrn, run->dcr.data);
1688         } else {
1689             DPRINTF("handle dcr read\n");
1690             ret = kvmppc_handle_dcr_read(env, run->dcr.dcrn, &run->dcr.data);
1691         }
1692         break;
1693     case KVM_EXIT_HLT:
1694         DPRINTF("handle halt\n");
1695         ret = kvmppc_handle_halt(cpu);
1696         break;
1697 #if defined(TARGET_PPC64)
1698     case KVM_EXIT_PAPR_HCALL:
1699         DPRINTF("handle PAPR hypercall\n");
1700         run->papr_hcall.ret = spapr_hypercall(cpu,
1701                                               run->papr_hcall.nr,
1702                                               run->papr_hcall.args);
1703         ret = 0;
1704         break;
1705 #endif
1706     case KVM_EXIT_EPR:
1707         DPRINTF("handle epr\n");
1708         run->epr.epr = ldl_phys(cs->as, env->mpic_iack);
1709         ret = 0;
1710         break;
1711     case KVM_EXIT_WATCHDOG:
1712         DPRINTF("handle watchdog expiry\n");
1713         watchdog_perform_action();
1714         ret = 0;
1715         break;
1716 
1717     case KVM_EXIT_DEBUG:
1718         DPRINTF("handle debug exception\n");
1719         if (kvm_handle_debug(cpu, run)) {
1720             ret = EXCP_DEBUG;
1721             break;
1722         }
1723         /* re-enter, this exception was guest-internal */
1724         ret = 0;
1725         break;
1726 
1727     default:
1728         fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
1729         ret = -1;
1730         break;
1731     }
1732 
1733     qemu_mutex_unlock_iothread();
1734     return ret;
1735 }
1736 
1737 int kvmppc_or_tsr_bits(PowerPCCPU *cpu, uint32_t tsr_bits)
1738 {
1739     CPUState *cs = CPU(cpu);
1740     uint32_t bits = tsr_bits;
1741     struct kvm_one_reg reg = {
1742         .id = KVM_REG_PPC_OR_TSR,
1743         .addr = (uintptr_t) &bits,
1744     };
1745 
1746     return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1747 }
1748 
1749 int kvmppc_clear_tsr_bits(PowerPCCPU *cpu, uint32_t tsr_bits)
1750 {
1751 
1752     CPUState *cs = CPU(cpu);
1753     uint32_t bits = tsr_bits;
1754     struct kvm_one_reg reg = {
1755         .id = KVM_REG_PPC_CLEAR_TSR,
1756         .addr = (uintptr_t) &bits,
1757     };
1758 
1759     return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1760 }
1761 
1762 int kvmppc_set_tcr(PowerPCCPU *cpu)
1763 {
1764     CPUState *cs = CPU(cpu);
1765     CPUPPCState *env = &cpu->env;
1766     uint32_t tcr = env->spr[SPR_BOOKE_TCR];
1767 
1768     struct kvm_one_reg reg = {
1769         .id = KVM_REG_PPC_TCR,
1770         .addr = (uintptr_t) &tcr,
1771     };
1772 
1773     return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1774 }
1775 
1776 int kvmppc_booke_watchdog_enable(PowerPCCPU *cpu)
1777 {
1778     CPUState *cs = CPU(cpu);
1779     int ret;
1780 
1781     if (!kvm_enabled()) {
1782         return -1;
1783     }
1784 
1785     if (!cap_ppc_watchdog) {
1786         printf("warning: KVM does not support watchdog");
1787         return -1;
1788     }
1789 
1790     ret = kvm_vcpu_enable_cap(cs, KVM_CAP_PPC_BOOKE_WATCHDOG, 0);
1791     if (ret < 0) {
1792         fprintf(stderr, "%s: couldn't enable KVM_CAP_PPC_BOOKE_WATCHDOG: %s\n",
1793                 __func__, strerror(-ret));
1794         return ret;
1795     }
1796 
1797     return ret;
1798 }
1799 
/*
 * Find the first line of /proc/cpuinfo that starts with @field and copy
 * the whole line into @value (a buffer of @len bytes).
 *
 * Returns 0 when a line was found, -1 when the file cannot be opened or
 * no line matches.
 */
static int read_cpuinfo(const char *field, char *value, int len)
{
    int field_len = strlen(field);
    char line[512];
    int ret = -1;
    FILE *f = fopen("/proc/cpuinfo", "r");

    if (!f) {
        return -1;
    }

    while (fgets(line, sizeof(line), f)) {
        if (strncmp(line, field, field_len) == 0) {
            pstrcpy(value, len, line);
            ret = 0;
            break;
        }
        /* Stop scanning if fgets() handed back an empty string */
        if (!*line) {
            break;
        }
    }

    fclose(f);

    return ret;
}
1827 
1828 uint32_t kvmppc_get_tbfreq(void)
1829 {
1830     char line[512];
1831     char *ns;
1832     uint32_t retval = NANOSECONDS_PER_SECOND;
1833 
1834     if (read_cpuinfo("timebase", line, sizeof(line))) {
1835         return retval;
1836     }
1837 
1838     if (!(ns = strchr(line, ':'))) {
1839         return retval;
1840     }
1841 
1842     ns++;
1843 
1844     return atoi(ns);
1845 }
1846 
1847 bool kvmppc_get_host_serial(char **value)
1848 {
1849     return g_file_get_contents("/proc/device-tree/system-id", value, NULL,
1850                                NULL);
1851 }
1852 
1853 bool kvmppc_get_host_model(char **value)
1854 {
1855     return g_file_get_contents("/proc/device-tree/model", value, NULL, NULL);
1856 }
1857 
1858 /* Try to find a device tree node for a CPU with clock-frequency property */
1859 static int kvmppc_find_cpu_dt(char *buf, int buf_len)
1860 {
1861     struct dirent *dirp;
1862     DIR *dp;
1863 
1864     if ((dp = opendir(PROC_DEVTREE_CPU)) == NULL) {
1865         printf("Can't open directory " PROC_DEVTREE_CPU "\n");
1866         return -1;
1867     }
1868 
1869     buf[0] = '\0';
1870     while ((dirp = readdir(dp)) != NULL) {
1871         FILE *f;
1872         snprintf(buf, buf_len, "%s%s/clock-frequency", PROC_DEVTREE_CPU,
1873                  dirp->d_name);
1874         f = fopen(buf, "r");
1875         if (f) {
1876             snprintf(buf, buf_len, "%s%s", PROC_DEVTREE_CPU, dirp->d_name);
1877             fclose(f);
1878             break;
1879         }
1880         buf[0] = '\0';
1881     }
1882     closedir(dp);
1883     if (buf[0] == '\0') {
1884         printf("Unknown host!\n");
1885         return -1;
1886     }
1887 
1888     return 0;
1889 }
1890 
/*
 * Read a big-endian integer from the file @filename.  At most 8 bytes
 * are read: exactly 4 bytes are decoded as a 32-bit value, 8 bytes as a
 * 64-bit value.  Returns (uint64_t)-1 if the file cannot be opened and
 * 0 if any other number of bytes was read.
 */
static uint64_t kvmppc_read_int_dt(const char *filename)
{
    union {
        uint32_t v32;
        uint64_t v64;
    } raw;
    FILE *fp;
    int nread;

    fp = fopen(filename, "rb");
    if (fp == NULL) {
        return -1;
    }

    nread = fread(&raw, 1, sizeof(raw), fp);
    fclose(fp);

    if (nread == 4) {
        /* property is a 32-bit quantity */
        return be32_to_cpu(raw.v32);
    }
    if (nread == 8) {
        /* property is a 64-bit quantity */
        return be64_to_cpu(raw.v64);
    }

    return 0;
}
1917 
1918 /* Read a CPU node property from the host device tree that's a single
1919  * integer (32-bit or 64-bit).  Returns 0 if anything goes wrong
1920  * (can't find or open the property, or doesn't understand the
1921  * format) */
1922 static uint64_t kvmppc_read_int_cpu_dt(const char *propname)
1923 {
1924     char buf[PATH_MAX], *tmp;
1925     uint64_t val;
1926 
1927     if (kvmppc_find_cpu_dt(buf, sizeof(buf))) {
1928         return -1;
1929     }
1930 
1931     tmp = g_strdup_printf("%s/%s", buf, propname);
1932     val = kvmppc_read_int_dt(tmp);
1933     g_free(tmp);
1934 
1935     return val;
1936 }
1937 
/*
 * Return the "clock-frequency" property of the host CPU device tree
 * node, as read by kvmppc_read_int_cpu_dt().
 */
uint64_t kvmppc_get_clockfreq(void)
{
    uint64_t freq = kvmppc_read_int_cpu_dt("clock-frequency");

    return freq;
}
1942 
/*
 * Return the "ibm,dec-bits" property of the host CPU device tree node,
 * or 0 when the value (truncated to int) is not positive.
 */
static int kvmppc_get_dec_bits(void)
{
    int dec_bits = kvmppc_read_int_cpu_dt("ibm,dec-bits");

    return dec_bits > 0 ? dec_bits : 0;
}
1952 
1953 static int kvmppc_get_pvinfo(CPUPPCState *env, struct kvm_ppc_pvinfo *pvinfo)
1954  {
1955      PowerPCCPU *cpu = ppc_env_get_cpu(env);
1956      CPUState *cs = CPU(cpu);
1957 
1958     if (kvm_vm_check_extension(cs->kvm_state, KVM_CAP_PPC_GET_PVINFO) &&
1959         !kvm_vm_ioctl(cs->kvm_state, KVM_PPC_GET_PVINFO, pvinfo)) {
1960         return 0;
1961     }
1962 
1963     return 1;
1964 }
1965 
1966 int kvmppc_get_hasidle(CPUPPCState *env)
1967 {
1968     struct kvm_ppc_pvinfo pvinfo;
1969 
1970     if (!kvmppc_get_pvinfo(env, &pvinfo) &&
1971         (pvinfo.flags & KVM_PPC_PVINFO_FLAGS_EV_IDLE)) {
1972         return 1;
1973     }
1974 
1975     return 0;
1976 }
1977 
1978 int kvmppc_get_hypercall(CPUPPCState *env, uint8_t *buf, int buf_len)
1979 {
1980     uint32_t *hc = (uint32_t*)buf;
1981     struct kvm_ppc_pvinfo pvinfo;
1982 
1983     if (!kvmppc_get_pvinfo(env, &pvinfo)) {
1984         memcpy(buf, pvinfo.hcall, buf_len);
1985         return 0;
1986     }
1987 
1988     /*
1989      * Fallback to always fail hypercalls regardless of endianness:
1990      *
1991      *     tdi 0,r0,72 (becomes b .+8 in wrong endian, nop in good endian)
1992      *     li r3, -1
1993      *     b .+8       (becomes nop in wrong endian)
1994      *     bswap32(li r3, -1)
1995      */
1996 
1997     hc[0] = cpu_to_be32(0x08000048);
1998     hc[1] = cpu_to_be32(0x3860ffff);
1999     hc[2] = cpu_to_be32(0x48000008);
2000     hc[3] = cpu_to_be32(bswap32(0x3860ffff));
2001 
2002     return 1;
2003 }
2004 
2005 static inline int kvmppc_enable_hcall(KVMState *s, target_ulong hcall)
2006 {
2007     return kvm_vm_enable_cap(s, KVM_CAP_PPC_ENABLE_HCALL, 0, hcall, 1);
2008 }
2009 
2010 void kvmppc_enable_logical_ci_hcalls(void)
2011 {
2012     /*
2013      * FIXME: it would be nice if we could detect the cases where
2014      * we're using a device which requires the in kernel
2015      * implementation of these hcalls, but the kernel lacks them and
2016      * produce a warning.
2017      */
2018     kvmppc_enable_hcall(kvm_state, H_LOGICAL_CI_LOAD);
2019     kvmppc_enable_hcall(kvm_state, H_LOGICAL_CI_STORE);
2020 }
2021 
2022 void kvmppc_enable_set_mode_hcall(void)
2023 {
2024     kvmppc_enable_hcall(kvm_state, H_SET_MODE);
2025 }
2026 
2027 void kvmppc_enable_clear_ref_mod_hcalls(void)
2028 {
2029     kvmppc_enable_hcall(kvm_state, H_CLEAR_REF);
2030     kvmppc_enable_hcall(kvm_state, H_CLEAR_MOD);
2031 }
2032 
2033 void kvmppc_set_papr(PowerPCCPU *cpu)
2034 {
2035     CPUState *cs = CPU(cpu);
2036     int ret;
2037 
2038     if (!kvm_enabled()) {
2039         return;
2040     }
2041 
2042     ret = kvm_vcpu_enable_cap(cs, KVM_CAP_PPC_PAPR, 0);
2043     if (ret) {
2044         error_report("This vCPU type or KVM version does not support PAPR");
2045         exit(1);
2046     }
2047 
2048     /* Update the capability flag so we sync the right information
2049      * with kvm */
2050     cap_papr = 1;
2051 }
2052 
2053 int kvmppc_set_compat(PowerPCCPU *cpu, uint32_t compat_pvr)
2054 {
2055     return kvm_set_one_reg(CPU(cpu), KVM_REG_PPC_ARCH_COMPAT, &compat_pvr);
2056 }
2057 
2058 void kvmppc_set_mpic_proxy(PowerPCCPU *cpu, int mpic_proxy)
2059 {
2060     CPUState *cs = CPU(cpu);
2061     int ret;
2062 
2063     ret = kvm_vcpu_enable_cap(cs, KVM_CAP_PPC_EPR, 0, mpic_proxy);
2064     if (ret && mpic_proxy) {
2065         error_report("This KVM version does not support EPR");
2066         exit(1);
2067     }
2068 }
2069 
2070 int kvmppc_smt_threads(void)
2071 {
2072     return cap_ppc_smt ? cap_ppc_smt : 1;
2073 }
2074 
2075 int kvmppc_set_smt_threads(int smt)
2076 {
2077     int ret;
2078 
2079     ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PPC_SMT, 0, smt, 0);
2080     if (!ret) {
2081         cap_ppc_smt = smt;
2082     }
2083     return ret;
2084 }
2085 
2086 void kvmppc_hint_smt_possible(Error **errp)
2087 {
2088     int i;
2089     GString *g;
2090     char *s;
2091 
2092     assert(kvm_enabled());
2093     if (cap_ppc_smt_possible) {
2094         g = g_string_new("Available VSMT modes:");
2095         for (i = 63; i >= 0; i--) {
2096             if ((1UL << i) & cap_ppc_smt_possible) {
2097                 g_string_append_printf(g, " %lu", (1UL << i));
2098             }
2099         }
2100         s = g_string_free(g, false);
2101         error_append_hint(errp, "%s.\n", s);
2102         g_free(s);
2103     } else {
2104         error_append_hint(errp,
2105                           "This KVM seems to be too old to support VSMT.\n");
2106     }
2107 }
2108 
2109 
2110 #ifdef TARGET_PPC64
2111 uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift)
2112 {
2113     struct kvm_ppc_smmu_info info;
2114     long rampagesize, best_page_shift;
2115     int i;
2116 
2117     /* Find the largest hardware supported page size that's less than
2118      * or equal to the (logical) backing page size of guest RAM */
2119     kvm_get_smmu_info(&info, &error_fatal);
2120     rampagesize = qemu_getrampagesize();
2121     best_page_shift = 0;
2122 
2123     for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
2124         struct kvm_ppc_one_seg_page_size *sps = &info.sps[i];
2125 
2126         if (!sps->page_shift) {
2127             continue;
2128         }
2129 
2130         if ((sps->page_shift > best_page_shift)
2131             && ((1UL << sps->page_shift) <= rampagesize)) {
2132             best_page_shift = sps->page_shift;
2133         }
2134     }
2135 
2136     return MIN(current_size,
2137                1ULL << (best_page_shift + hash_shift - 7));
2138 }
2139 #endif
2140 
2141 bool kvmppc_spapr_use_multitce(void)
2142 {
2143     return cap_spapr_multitce;
2144 }
2145 
2146 int kvmppc_spapr_enable_inkernel_multitce(void)
2147 {
2148     int ret;
2149 
2150     ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PPC_ENABLE_HCALL, 0,
2151                             H_PUT_TCE_INDIRECT, 1);
2152     if (!ret) {
2153         ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PPC_ENABLE_HCALL, 0,
2154                                 H_STUFF_TCE, 1);
2155     }
2156 
2157     return ret;
2158 }
2159 
2160 void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t page_shift,
2161                               uint64_t bus_offset, uint32_t nb_table,
2162                               int *pfd, bool need_vfio)
2163 {
2164     long len;
2165     int fd;
2166     void *table;
2167 
2168     /* Must set fd to -1 so we don't try to munmap when called for
2169      * destroying the table, which the upper layers -will- do
2170      */
2171     *pfd = -1;
2172     if (!cap_spapr_tce || (need_vfio && !cap_spapr_vfio)) {
2173         return NULL;
2174     }
2175 
2176     if (cap_spapr_tce_64) {
2177         struct kvm_create_spapr_tce_64 args = {
2178             .liobn = liobn,
2179             .page_shift = page_shift,
2180             .offset = bus_offset >> page_shift,
2181             .size = nb_table,
2182             .flags = 0
2183         };
2184         fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE_64, &args);
2185         if (fd < 0) {
2186             fprintf(stderr,
2187                     "KVM: Failed to create TCE64 table for liobn 0x%x\n",
2188                     liobn);
2189             return NULL;
2190         }
2191     } else if (cap_spapr_tce) {
2192         uint64_t window_size = (uint64_t) nb_table << page_shift;
2193         struct kvm_create_spapr_tce args = {
2194             .liobn = liobn,
2195             .window_size = window_size,
2196         };
2197         if ((window_size != args.window_size) || bus_offset) {
2198             return NULL;
2199         }
2200         fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE, &args);
2201         if (fd < 0) {
2202             fprintf(stderr, "KVM: Failed to create TCE table for liobn 0x%x\n",
2203                     liobn);
2204             return NULL;
2205         }
2206     } else {
2207         return NULL;
2208     }
2209 
2210     len = nb_table * sizeof(uint64_t);
2211     /* FIXME: round this up to page size */
2212 
2213     table = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
2214     if (table == MAP_FAILED) {
2215         fprintf(stderr, "KVM: Failed to map TCE table for liobn 0x%x\n",
2216                 liobn);
2217         close(fd);
2218         return NULL;
2219     }
2220 
2221     *pfd = fd;
2222     return table;
2223 }
2224 
/*
 * Unmap a TCE table of @nb_table 64-bit entries at @table and close
 * its @fd.
 *
 * Returns -1 when @fd is negative (nothing to remove), 0 otherwise.
 * A failing munmap() or close() is reported on stderr but still yields
 * 0; in that case the table is deliberately leaked.
 */
int kvmppc_remove_spapr_tce(void *table, int fd, uint32_t nb_table)
{
    long len;

    if (fd < 0) {
        return -1;
    }

    len = nb_table * sizeof(uint64_t);
    if ((munmap(table, len) < 0) ||
        (close(fd) < 0)) {
        /* Terminate the message with a newline like the other diagnostics */
        fprintf(stderr, "KVM: Unexpected error removing TCE table: %s\n",
                strerror(errno));
        /* Leak the table */
    }

    return 0;
}
2243 
2244 int kvmppc_reset_htab(int shift_hint)
2245 {
2246     uint32_t shift = shift_hint;
2247 
2248     if (!kvm_enabled()) {
2249         /* Full emulation, tell caller to allocate htab itself */
2250         return 0;
2251     }
2252     if (kvm_vm_check_extension(kvm_state, KVM_CAP_PPC_ALLOC_HTAB)) {
2253         int ret;
2254         ret = kvm_vm_ioctl(kvm_state, KVM_PPC_ALLOCATE_HTAB, &shift);
2255         if (ret == -ENOTTY) {
2256             /* At least some versions of PR KVM advertise the
2257              * capability, but don't implement the ioctl().  Oops.
2258              * Return 0 so that we allocate the htab in qemu, as is
2259              * correct for PR. */
2260             return 0;
2261         } else if (ret < 0) {
2262             return ret;
2263         }
2264         return shift;
2265     }
2266 
2267     /* We have a kernel that predates the htab reset calls.  For PR
2268      * KVM, we need to allocate the htab ourselves, for an HV KVM of
2269      * this era, it has allocated a 16MB fixed size hash table already. */
2270     if (kvmppc_is_pr(kvm_state)) {
2271         /* PR - tell caller to allocate htab */
2272         return 0;
2273     } else {
2274         /* HV - assume 16MB kernel allocated htab */
2275         return 24;
2276     }
2277 }
2278 
2279 static inline uint32_t mfpvr(void)
2280 {
2281     uint32_t pvr;
2282 
2283     asm ("mfpvr %0"
2284          : "=r"(pvr));
2285     return pvr;
2286 }
2287 
/*
 * Set (@on true) or clear (@on false) every bit of @flags in *@word.
 */
static void alter_insns(uint64_t *word, uint64_t flags, bool on)
{
    *word = on ? (*word | flags) : (*word & ~flags);
}
2296 
2297 static void kvmppc_host_cpu_class_init(ObjectClass *oc, void *data)
2298 {
2299     PowerPCCPUClass *pcc = POWERPC_CPU_CLASS(oc);
2300     uint32_t dcache_size = kvmppc_read_int_cpu_dt("d-cache-size");
2301     uint32_t icache_size = kvmppc_read_int_cpu_dt("i-cache-size");
2302 
2303     /* Now fix up the class with information we can query from the host */
2304     pcc->pvr = mfpvr();
2305 
2306     alter_insns(&pcc->insns_flags, PPC_ALTIVEC,
2307                 qemu_getauxval(AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC);
2308     alter_insns(&pcc->insns_flags2, PPC2_VSX,
2309                 qemu_getauxval(AT_HWCAP) & PPC_FEATURE_HAS_VSX);
2310     alter_insns(&pcc->insns_flags2, PPC2_DFP,
2311                 qemu_getauxval(AT_HWCAP) & PPC_FEATURE_HAS_DFP);
2312 
2313     if (dcache_size != -1) {
2314         pcc->l1_dcache_size = dcache_size;
2315     }
2316 
2317     if (icache_size != -1) {
2318         pcc->l1_icache_size = icache_size;
2319     }
2320 
2321 #if defined(TARGET_PPC64)
2322     pcc->radix_page_info = kvm_get_radix_page_info();
2323 
2324     if ((pcc->pvr & 0xffffff00) == CPU_POWERPC_POWER9_DD1) {
2325         /*
2326          * POWER9 DD1 has some bugs which make it not really ISA 3.00
2327          * compliant.  More importantly, advertising ISA 3.00
2328          * architected mode may prevent guests from activating
2329          * necessary DD1 workarounds.
2330          */
2331         pcc->pcr_supported &= ~(PCR_COMPAT_3_00 | PCR_COMPAT_2_07
2332                                 | PCR_COMPAT_2_06 | PCR_COMPAT_2_05);
2333     }
2334 #endif /* defined(TARGET_PPC64) */
2335 }
2336 
2337 bool kvmppc_has_cap_epr(void)
2338 {
2339     return cap_epr;
2340 }
2341 
2342 bool kvmppc_has_cap_fixup_hcalls(void)
2343 {
2344     return cap_fixup_hcalls;
2345 }
2346 
2347 bool kvmppc_has_cap_htm(void)
2348 {
2349     return cap_htm;
2350 }
2351 
2352 bool kvmppc_has_cap_mmu_radix(void)
2353 {
2354     return cap_mmu_radix;
2355 }
2356 
2357 bool kvmppc_has_cap_mmu_hash_v3(void)
2358 {
2359     return cap_mmu_hash_v3;
2360 }
2361 
/*
 * Return true when, on a PPC64 target, the host PVR masked with
 * CPU_POWERPC_POWER_SERVER_MASK matches one of the POWER8E, POWER8NVL
 * or POWER8 base PVRs.  Always false on other targets.
 */
static bool kvmppc_power8_host(void)
{
#ifdef TARGET_PPC64
    uint32_t base_pvr = CPU_POWERPC_POWER_SERVER_MASK & mfpvr();

    if (base_pvr == CPU_POWERPC_POWER8E_BASE) {
        return true;
    }
    if (base_pvr == CPU_POWERPC_POWER8NVL_BASE) {
        return true;
    }
    if (base_pvr == CPU_POWERPC_POWER8_BASE) {
        return true;
    }
#endif /* TARGET_PPC64 */
    return false;
}
2375 
2376 static int parse_cap_ppc_safe_cache(struct kvm_ppc_cpu_char c)
2377 {
2378     bool l1d_thread_priv_req = !kvmppc_power8_host();
2379 
2380     if (~c.behaviour & c.behaviour_mask & H_CPU_BEHAV_L1D_FLUSH_PR) {
2381         return 2;
2382     } else if ((!l1d_thread_priv_req ||
2383                 c.character & c.character_mask & H_CPU_CHAR_L1D_THREAD_PRIV) &&
2384                (c.character & c.character_mask
2385                 & (H_CPU_CHAR_L1D_FLUSH_ORI30 | H_CPU_CHAR_L1D_FLUSH_TRIG2))) {
2386         return 1;
2387     }
2388 
2389     return 0;
2390 }
2391 
2392 static int parse_cap_ppc_safe_bounds_check(struct kvm_ppc_cpu_char c)
2393 {
2394     if (~c.behaviour & c.behaviour_mask & H_CPU_BEHAV_BNDS_CHK_SPEC_BAR) {
2395         return 2;
2396     } else if (c.character & c.character_mask & H_CPU_CHAR_SPEC_BAR_ORI31) {
2397         return 1;
2398     }
2399 
2400     return 0;
2401 }
2402 
2403 static int parse_cap_ppc_safe_indirect_branch(struct kvm_ppc_cpu_char c)
2404 {
2405     if ((~c.behaviour & c.behaviour_mask & H_CPU_BEHAV_FLUSH_COUNT_CACHE) &&
2406         (~c.character & c.character_mask & H_CPU_CHAR_CACHE_COUNT_DIS) &&
2407         (~c.character & c.character_mask & H_CPU_CHAR_BCCTRL_SERIALISED)) {
2408         return SPAPR_CAP_FIXED_NA;
2409     } else if (c.behaviour & c.behaviour_mask & H_CPU_BEHAV_FLUSH_COUNT_CACHE) {
2410         return SPAPR_CAP_WORKAROUND;
2411     } else if (c.character & c.character_mask & H_CPU_CHAR_CACHE_COUNT_DIS) {
2412         return  SPAPR_CAP_FIXED_CCD;
2413     } else if (c.character & c.character_mask & H_CPU_CHAR_BCCTRL_SERIALISED) {
2414         return SPAPR_CAP_FIXED_IBS;
2415     }
2416 
2417     return 0;
2418 }
2419 
2420 static int parse_cap_ppc_count_cache_flush_assist(struct kvm_ppc_cpu_char c)
2421 {
2422     if (c.character & c.character_mask & H_CPU_CHAR_BCCTR_FLUSH_ASSIST) {
2423         return 1;
2424     }
2425     return 0;
2426 }
2427 
2428 static void kvmppc_get_cpu_characteristics(KVMState *s)
2429 {
2430     struct kvm_ppc_cpu_char c;
2431     int ret;
2432 
2433     /* Assume broken */
2434     cap_ppc_safe_cache = 0;
2435     cap_ppc_safe_bounds_check = 0;
2436     cap_ppc_safe_indirect_branch = 0;
2437 
2438     ret = kvm_vm_check_extension(s, KVM_CAP_PPC_GET_CPU_CHAR);
2439     if (!ret) {
2440         return;
2441     }
2442     ret = kvm_vm_ioctl(s, KVM_PPC_GET_CPU_CHAR, &c);
2443     if (ret < 0) {
2444         return;
2445     }
2446 
2447     cap_ppc_safe_cache = parse_cap_ppc_safe_cache(c);
2448     cap_ppc_safe_bounds_check = parse_cap_ppc_safe_bounds_check(c);
2449     cap_ppc_safe_indirect_branch = parse_cap_ppc_safe_indirect_branch(c);
2450     cap_ppc_count_cache_flush_assist =
2451         parse_cap_ppc_count_cache_flush_assist(c);
2452 }
2453 
2454 int kvmppc_get_cap_safe_cache(void)
2455 {
2456     return cap_ppc_safe_cache;
2457 }
2458 
2459 int kvmppc_get_cap_safe_bounds_check(void)
2460 {
2461     return cap_ppc_safe_bounds_check;
2462 }
2463 
2464 int kvmppc_get_cap_safe_indirect_branch(void)
2465 {
2466     return cap_ppc_safe_indirect_branch;
2467 }
2468 
2469 int kvmppc_get_cap_count_cache_flush_assist(void)
2470 {
2471     return cap_ppc_count_cache_flush_assist;
2472 }
2473 
2474 bool kvmppc_has_cap_nested_kvm_hv(void)
2475 {
2476     return !!cap_ppc_nested_kvm_hv;
2477 }
2478 
2479 int kvmppc_set_cap_nested_kvm_hv(int enable)
2480 {
2481     return kvm_vm_enable_cap(kvm_state, KVM_CAP_PPC_NESTED_HV, 0, enable);
2482 }
2483 
2484 bool kvmppc_has_cap_spapr_vfio(void)
2485 {
2486     return cap_spapr_vfio;
2487 }
2488 
2489 int kvmppc_get_cap_large_decr(void)
2490 {
2491     return cap_large_decr;
2492 }
2493 
2494 int kvmppc_enable_cap_large_decr(PowerPCCPU *cpu, int enable)
2495 {
2496     CPUState *cs = CPU(cpu);
2497     uint64_t lpcr;
2498 
2499     kvm_get_one_reg(cs, KVM_REG_PPC_LPCR_64, &lpcr);
2500     /* Do we need to modify the LPCR? */
2501     if (!!(lpcr & LPCR_LD) != !!enable) {
2502         if (enable) {
2503             lpcr |= LPCR_LD;
2504         } else {
2505             lpcr &= ~LPCR_LD;
2506         }
2507         kvm_set_one_reg(cs, KVM_REG_PPC_LPCR_64, &lpcr);
2508         kvm_get_one_reg(cs, KVM_REG_PPC_LPCR_64, &lpcr);
2509 
2510         if (!!(lpcr & LPCR_LD) != !!enable) {
2511             return -1;
2512         }
2513     }
2514 
2515     return 0;
2516 }
2517 
2518 PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void)
2519 {
2520     uint32_t host_pvr = mfpvr();
2521     PowerPCCPUClass *pvr_pcc;
2522 
2523     pvr_pcc = ppc_cpu_class_by_pvr(host_pvr);
2524     if (pvr_pcc == NULL) {
2525         pvr_pcc = ppc_cpu_class_by_pvr_mask(host_pvr);
2526     }
2527 
2528     return pvr_pcc;
2529 }
2530 
2531 static int kvm_ppc_register_host_cpu_type(MachineState *ms)
2532 {
2533     TypeInfo type_info = {
2534         .name = TYPE_HOST_POWERPC_CPU,
2535         .class_init = kvmppc_host_cpu_class_init,
2536     };
2537     MachineClass *mc = MACHINE_GET_CLASS(ms);
2538     PowerPCCPUClass *pvr_pcc;
2539     ObjectClass *oc;
2540     DeviceClass *dc;
2541     int i;
2542 
2543     pvr_pcc = kvm_ppc_get_host_cpu_class();
2544     if (pvr_pcc == NULL) {
2545         return -1;
2546     }
2547     type_info.parent = object_class_get_name(OBJECT_CLASS(pvr_pcc));
2548     type_register(&type_info);
2549     if (object_dynamic_cast(OBJECT(ms), TYPE_SPAPR_MACHINE)) {
2550         /* override TCG default cpu type with 'host' cpu model */
2551         mc->default_cpu_type = TYPE_HOST_POWERPC_CPU;
2552     }
2553 
2554     oc = object_class_by_name(type_info.name);
2555     g_assert(oc);
2556 
2557     /*
2558      * Update generic CPU family class alias (e.g. on a POWER8NVL host,
2559      * we want "POWER8" to be a "family" alias that points to the current
2560      * host CPU type, too)
2561      */
2562     dc = DEVICE_CLASS(ppc_cpu_get_family_class(pvr_pcc));
2563     for (i = 0; ppc_cpu_aliases[i].alias != NULL; i++) {
2564         if (strcasecmp(ppc_cpu_aliases[i].alias, dc->desc) == 0) {
2565             char *suffix;
2566 
2567             ppc_cpu_aliases[i].model = g_strdup(object_class_get_name(oc));
2568             suffix = strstr(ppc_cpu_aliases[i].model, POWERPC_CPU_TYPE_SUFFIX);
2569             if (suffix) {
2570                 *suffix = 0;
2571             }
2572             break;
2573         }
2574     }
2575 
2576     return 0;
2577 }
2578 
2579 int kvmppc_define_rtas_kernel_token(uint32_t token, const char *function)
2580 {
2581     struct kvm_rtas_token_args args = {
2582         .token = token,
2583     };
2584 
2585     if (!kvm_check_extension(kvm_state, KVM_CAP_PPC_RTAS)) {
2586         return -ENOENT;
2587     }
2588 
2589     strncpy(args.name, function, sizeof(args.name));
2590 
2591     return kvm_vm_ioctl(kvm_state, KVM_PPC_RTAS_DEFINE_TOKEN, &args);
2592 }
2593 
2594 int kvmppc_get_htab_fd(bool write, uint64_t index, Error **errp)
2595 {
2596     struct kvm_get_htab_fd s = {
2597         .flags = write ? KVM_GET_HTAB_WRITE : 0,
2598         .start_index = index,
2599     };
2600     int ret;
2601 
2602     if (!cap_htab_fd) {
2603         error_setg(errp, "KVM version doesn't support %s the HPT",
2604                    write ? "writing" : "reading");
2605         return -ENOTSUP;
2606     }
2607 
2608     ret = kvm_vm_ioctl(kvm_state, KVM_PPC_GET_HTAB_FD, &s);
2609     if (ret < 0) {
2610         error_setg(errp, "Unable to open fd for %s HPT %s KVM: %s",
2611                    write ? "writing" : "reading", write ? "to" : "from",
2612                    strerror(errno));
2613         return -errno;
2614     }
2615 
2616     return ret;
2617 }
2618 
2619 int kvmppc_save_htab(QEMUFile *f, int fd, size_t bufsize, int64_t max_ns)
2620 {
2621     int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2622     uint8_t buf[bufsize];
2623     ssize_t rc;
2624 
2625     do {
2626         rc = read(fd, buf, bufsize);
2627         if (rc < 0) {
2628             fprintf(stderr, "Error reading data from KVM HTAB fd: %s\n",
2629                     strerror(errno));
2630             return rc;
2631         } else if (rc) {
2632             uint8_t *buffer = buf;
2633             ssize_t n = rc;
2634             while (n) {
2635                 struct kvm_get_htab_header *head =
2636                     (struct kvm_get_htab_header *) buffer;
2637                 size_t chunksize = sizeof(*head) +
2638                      HASH_PTE_SIZE_64 * head->n_valid;
2639 
2640                 qemu_put_be32(f, head->index);
2641                 qemu_put_be16(f, head->n_valid);
2642                 qemu_put_be16(f, head->n_invalid);
2643                 qemu_put_buffer(f, (void *)(head + 1),
2644                                 HASH_PTE_SIZE_64 * head->n_valid);
2645 
2646                 buffer += chunksize;
2647                 n -= chunksize;
2648             }
2649         }
2650     } while ((rc != 0)
2651              && ((max_ns < 0)
2652                  || ((qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) < max_ns)));
2653 
2654     return (rc == 0) ? 1 : 0;
2655 }
2656 
2657 int kvmppc_load_htab_chunk(QEMUFile *f, int fd, uint32_t index,
2658                            uint16_t n_valid, uint16_t n_invalid)
2659 {
2660     struct kvm_get_htab_header *buf;
2661     size_t chunksize = sizeof(*buf) + n_valid*HASH_PTE_SIZE_64;
2662     ssize_t rc;
2663 
2664     buf = alloca(chunksize);
2665     buf->index = index;
2666     buf->n_valid = n_valid;
2667     buf->n_invalid = n_invalid;
2668 
2669     qemu_get_buffer(f, (void *)(buf + 1), HASH_PTE_SIZE_64*n_valid);
2670 
2671     rc = write(fd, buf, chunksize);
2672     if (rc < 0) {
2673         fprintf(stderr, "Error writing KVM hash table: %s\n",
2674                 strerror(errno));
2675         return rc;
2676     }
2677     if (rc != chunksize) {
2678         /* We should never get a short write on a single chunk */
2679         fprintf(stderr, "Short write, restoring KVM hash table\n");
2680         return -1;
2681     }
2682     return 0;
2683 }
2684 
2685 bool kvm_arch_stop_on_emulation_error(CPUState *cpu)
2686 {
2687     return true;
2688 }
2689 
2690 void kvm_arch_init_irq_routing(KVMState *s)
2691 {
2692 }
2693 
2694 void kvmppc_read_hptes(ppc_hash_pte64_t *hptes, hwaddr ptex, int n)
2695 {
2696     int fd, rc;
2697     int i;
2698 
2699     fd = kvmppc_get_htab_fd(false, ptex, &error_abort);
2700 
2701     i = 0;
2702     while (i < n) {
2703         struct kvm_get_htab_header *hdr;
2704         int m = n < HPTES_PER_GROUP ? n : HPTES_PER_GROUP;
2705         char buf[sizeof(*hdr) + m * HASH_PTE_SIZE_64];
2706 
2707         rc = read(fd, buf, sizeof(buf));
2708         if (rc < 0) {
2709             hw_error("kvmppc_read_hptes: Unable to read HPTEs");
2710         }
2711 
2712         hdr = (struct kvm_get_htab_header *)buf;
2713         while ((i < n) && ((char *)hdr < (buf + rc))) {
2714             int invalid = hdr->n_invalid, valid = hdr->n_valid;
2715 
2716             if (hdr->index != (ptex + i)) {
2717                 hw_error("kvmppc_read_hptes: Unexpected HPTE index %"PRIu32
2718                          " != (%"HWADDR_PRIu" + %d", hdr->index, ptex, i);
2719             }
2720 
2721             if (n - i < valid) {
2722                 valid = n - i;
2723             }
2724             memcpy(hptes + i, hdr + 1, HASH_PTE_SIZE_64 * valid);
2725             i += valid;
2726 
2727             if ((n - i) < invalid) {
2728                 invalid = n - i;
2729             }
2730             memset(hptes + i, 0, invalid * HASH_PTE_SIZE_64);
2731             i += invalid;
2732 
2733             hdr = (struct kvm_get_htab_header *)
2734                 ((char *)(hdr + 1) + HASH_PTE_SIZE_64 * hdr->n_valid);
2735         }
2736     }
2737 
2738     close(fd);
2739 }
2740 
2741 void kvmppc_write_hpte(hwaddr ptex, uint64_t pte0, uint64_t pte1)
2742 {
2743     int fd, rc;
2744     struct {
2745         struct kvm_get_htab_header hdr;
2746         uint64_t pte0;
2747         uint64_t pte1;
2748     } buf;
2749 
2750     fd = kvmppc_get_htab_fd(true, 0 /* Ignored */, &error_abort);
2751 
2752     buf.hdr.n_valid = 1;
2753     buf.hdr.n_invalid = 0;
2754     buf.hdr.index = ptex;
2755     buf.pte0 = cpu_to_be64(pte0);
2756     buf.pte1 = cpu_to_be64(pte1);
2757 
2758     rc = write(fd, &buf, sizeof(buf));
2759     if (rc != sizeof(buf)) {
2760         hw_error("kvmppc_write_hpte: Unable to update KVM HPT");
2761     }
2762     close(fd);
2763 }
2764 
2765 int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
2766                              uint64_t address, uint32_t data, PCIDevice *dev)
2767 {
2768     return 0;
2769 }
2770 
2771 int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route,
2772                                 int vector, PCIDevice *dev)
2773 {
2774     return 0;
2775 }
2776 
/* Nothing to do: @virq is ignored and 0 is returned. */
int kvm_arch_release_virq_post(int virq)
{
    (void)virq;

    return 0;
}
2781 
/* Return the low 16 bits of the MSI @data word. */
int kvm_arch_msi_data_to_gsi(uint32_t data)
{
    const uint32_t gsi_mask = 0xffff;

    return data & gsi_mask;
}
2786 
2787 int kvmppc_enable_hwrng(void)
2788 {
2789     if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_PPC_HWRNG)) {
2790         return -1;
2791     }
2792 
2793     return kvmppc_enable_hcall(kvm_state, H_RANDOM);
2794 }
2795 
2796 void kvmppc_check_papr_resize_hpt(Error **errp)
2797 {
2798     if (!kvm_enabled()) {
2799         return; /* No KVM, we're good */
2800     }
2801 
2802     if (cap_resize_hpt) {
2803         return; /* Kernel has explicit support, we're good */
2804     }
2805 
2806     /* Otherwise fallback on looking for PR KVM */
2807     if (kvmppc_is_pr(kvm_state)) {
2808         return;
2809     }
2810 
2811     error_setg(errp,
2812                "Hash page table resizing not available with this KVM version");
2813 }
2814 
2815 int kvmppc_resize_hpt_prepare(PowerPCCPU *cpu, target_ulong flags, int shift)
2816 {
2817     CPUState *cs = CPU(cpu);
2818     struct kvm_ppc_resize_hpt rhpt = {
2819         .flags = flags,
2820         .shift = shift,
2821     };
2822 
2823     if (!cap_resize_hpt) {
2824         return -ENOSYS;
2825     }
2826 
2827     return kvm_vm_ioctl(cs->kvm_state, KVM_PPC_RESIZE_HPT_PREPARE, &rhpt);
2828 }
2829 
2830 int kvmppc_resize_hpt_commit(PowerPCCPU *cpu, target_ulong flags, int shift)
2831 {
2832     CPUState *cs = CPU(cpu);
2833     struct kvm_ppc_resize_hpt rhpt = {
2834         .flags = flags,
2835         .shift = shift,
2836     };
2837 
2838     if (!cap_resize_hpt) {
2839         return -ENOSYS;
2840     }
2841 
2842     return kvm_vm_ioctl(cs->kvm_state, KVM_PPC_RESIZE_HPT_COMMIT, &rhpt);
2843 }
2844 
2845 /*
2846  * This is a helper function to detect a post migration scenario
2847  * in which a guest, running as KVM-HV, freezes in cpu_post_load because
2848  * the guest kernel can't handle a PVR value other than the actual host
2849  * PVR in KVM_SET_SREGS, even if pvr_match() returns true.
2850  *
2851  * If we don't have cap_ppc_pvr_compat and we're not running in PR
2852  * (so, we're HV), return true. The workaround itself is done in
2853  * cpu_post_load.
2854  *
2855  * The order here is important: we'll only check for KVM PR as a
2856  * fallback if the guest kernel can't handle the situation itself.
2857  * We need to avoid as much as possible querying the running KVM type
2858  * in QEMU level.
2859  */
2860 bool kvmppc_pvr_workaround_required(PowerPCCPU *cpu)
2861 {
2862     CPUState *cs = CPU(cpu);
2863 
2864     if (!kvm_enabled()) {
2865         return false;
2866     }
2867 
2868     if (cap_ppc_pvr_compat) {
2869         return false;
2870     }
2871 
2872     return !kvmppc_is_pr(cs->kvm_state);
2873 }
2874 
2875 void kvmppc_set_reg_ppc_online(PowerPCCPU *cpu, unsigned int online)
2876 {
2877     CPUState *cs = CPU(cpu);
2878 
2879     if (kvm_enabled()) {
2880         kvm_set_one_reg(cs, KVM_REG_PPC_ONLINE, &online);
2881     }
2882 }
2883