xref: /qemu/accel/kvm/kvm-all.c (revision 98721058d6d50ef218e0c26e4f67c8ef96965859)
1 /*
2  * QEMU KVM support
3  *
4  * Copyright IBM, Corp. 2008
5  *           Red Hat, Inc. 2008
6  *
7  * Authors:
8  *  Anthony Liguori   <aliguori@us.ibm.com>
9  *  Glauber Costa     <gcosta@redhat.com>
10  *
11  * This work is licensed under the terms of the GNU GPL, version 2 or later.
12  * See the COPYING file in the top-level directory.
13  *
14  */
15 
16 #include "qemu/osdep.h"
17 #include <sys/ioctl.h>
18 #include <poll.h>
19 
20 #include <linux/kvm.h>
21 
22 #include "qemu/atomic.h"
23 #include "qemu/option.h"
24 #include "qemu/config-file.h"
25 #include "qemu/error-report.h"
26 #include "qapi/error.h"
27 #include "hw/pci/msi.h"
28 #include "hw/pci/msix.h"
29 #include "hw/s390x/adapter.h"
30 #include "gdbstub/enums.h"
31 #include "system/kvm_int.h"
32 #include "system/runstate.h"
33 #include "system/cpus.h"
34 #include "system/accel-blocker.h"
35 #include "qemu/bswap.h"
36 #include "exec/tswap.h"
37 #include "system/memory.h"
38 #include "system/ram_addr.h"
39 #include "qemu/event_notifier.h"
40 #include "qemu/main-loop.h"
41 #include "trace.h"
42 #include "hw/irq.h"
43 #include "qapi/visitor.h"
44 #include "qapi/qapi-types-common.h"
45 #include "qapi/qapi-visit-common.h"
46 #include "system/reset.h"
47 #include "qemu/guest-random.h"
48 #include "system/hw_accel.h"
49 #include "kvm-cpus.h"
50 #include "system/dirtylimit.h"
51 #include "qemu/range.h"
52 
53 #include "hw/boards.h"
54 #include "system/stats.h"
55 
56 /* This check must be after config-host.h is included */
57 #ifdef CONFIG_EVENTFD
58 #include <sys/eventfd.h>
59 #endif
60 
61 #if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__)
62 # define KVM_HAVE_MCE_INJECTION 1
63 #endif
64 
65 
66 /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
67  * need to use the real host PAGE_SIZE, as that's what KVM will use.
68  */
69 #ifdef PAGE_SIZE
70 #undef PAGE_SIZE
71 #endif
72 #define PAGE_SIZE qemu_real_host_page_size()
73 
74 #ifndef KVM_GUESTDBG_BLOCKIRQ
75 #define KVM_GUESTDBG_BLOCKIRQ 0
76 #endif
77 
78 /* Default num of memslots to be allocated when VM starts */
79 #define  KVM_MEMSLOTS_NR_ALLOC_DEFAULT                      16
80 /* Default max allowed memslots if kernel reported nothing */
81 #define  KVM_MEMSLOTS_NR_MAX_DEFAULT                        32
82 
83 struct KVMParkedVcpu {
84     unsigned long vcpu_id;
85     int kvm_fd;
86     QLIST_ENTRY(KVMParkedVcpu) node;
87 };
88 
89 KVMState *kvm_state;
90 bool kvm_kernel_irqchip;
91 bool kvm_split_irqchip;
92 bool kvm_async_interrupts_allowed;
93 bool kvm_halt_in_kernel_allowed;
94 bool kvm_resamplefds_allowed;
95 bool kvm_msi_via_irqfd_allowed;
96 bool kvm_gsi_routing_allowed;
97 bool kvm_gsi_direct_mapping;
98 bool kvm_allowed;
99 bool kvm_readonly_mem_allowed;
100 bool kvm_vm_attributes_allowed;
101 bool kvm_msi_use_devid;
102 static bool kvm_has_guest_debug;
103 static int kvm_sstep_flags;
104 static bool kvm_immediate_exit;
105 static uint64_t kvm_supported_memory_attributes;
106 static bool kvm_guest_memfd_supported;
107 static hwaddr kvm_max_slot_size = ~0;
108 
109 static const KVMCapabilityInfo kvm_required_capabilites[] = {
110     KVM_CAP_INFO(USER_MEMORY),
111     KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
112     KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
113     KVM_CAP_INFO(INTERNAL_ERROR_DATA),
114     KVM_CAP_INFO(IOEVENTFD),
115     KVM_CAP_INFO(IOEVENTFD_ANY_LENGTH),
116     KVM_CAP_LAST_INFO
117 };
118 
119 static NotifierList kvm_irqchip_change_notifiers =
120     NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
121 
122 struct KVMResampleFd {
123     int gsi;
124     EventNotifier *resample_event;
125     QLIST_ENTRY(KVMResampleFd) node;
126 };
127 typedef struct KVMResampleFd KVMResampleFd;
128 
129 /*
130  * Only used with split irqchip where we need to do the resample fd
131  * kick for the kernel from userspace.
132  */
133 static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
134     QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
135 
136 static QemuMutex kml_slots_lock;
137 
138 #define kvm_slots_lock()    qemu_mutex_lock(&kml_slots_lock)
139 #define kvm_slots_unlock()  qemu_mutex_unlock(&kml_slots_lock)
140 
141 static void kvm_slot_init_dirty_bitmap(KVMSlot *mem);
142 
143 static inline void kvm_resample_fd_remove(int gsi)
144 {
145     KVMResampleFd *rfd;
146 
147     QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
148         if (rfd->gsi == gsi) {
149             QLIST_REMOVE(rfd, node);
150             g_free(rfd);
151             break;
152         }
153     }
154 }
155 
156 static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
157 {
158     KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);
159 
160     rfd->gsi = gsi;
161     rfd->resample_event = event;
162 
163     QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
164 }
165 
166 void kvm_resample_fd_notify(int gsi)
167 {
168     KVMResampleFd *rfd;
169 
170     QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
171         if (rfd->gsi == gsi) {
172             event_notifier_set(rfd->resample_event);
173             trace_kvm_resample_fd_notify(gsi);
174             return;
175         }
176     }
177 }
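
/*
 * Usage sketch (illustrative, not part of the original source): with a split
 * irqchip the kernel cannot kick the resamplefd on EOI itself, so the
 * userspace interrupt controller is expected to call kvm_resample_fd_notify(gsi)
 * when a level-triggered interrupt is acknowledged; if an EventNotifier was
 * registered for that GSI via kvm_resample_fd_insert(), it is set so the irqfd
 * producer can re-assert the interrupt if it is still pending.
 */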
178 
179 /**
180  * kvm_slots_grow(): Grow the slots[] array in the KVMMemoryListener
181  *
182  * @kml: The KVMMemoryListener* to grow the slots[] array
183  * @nr_slots_new: The new size of slots[] array
184  *
185  * Returns: True if the array grows larger, false otherwise.
186  */
187 static bool kvm_slots_grow(KVMMemoryListener *kml, unsigned int nr_slots_new)
188 {
189     unsigned int i, cur = kml->nr_slots_allocated;
190     KVMSlot *slots;
191 
192     if (nr_slots_new > kvm_state->nr_slots_max) {
193         nr_slots_new = kvm_state->nr_slots_max;
194     }
195 
196     if (cur >= nr_slots_new) {
197         /* Big enough, no need to grow, or we reached max */
198         return false;
199     }
200 
201     if (cur == 0) {
202         slots = g_new0(KVMSlot, nr_slots_new);
203     } else {
204         assert(kml->slots);
205         slots = g_renew(KVMSlot, kml->slots, nr_slots_new);
206         /*
207          * g_renew() doesn't initialize the extended buffer; however, kvm
208          * memslots require fields to be zero-initialized. E.g. pointers,
209          * memory_size field, etc.
210          */
211         memset(&slots[cur], 0x0, sizeof(slots[0]) * (nr_slots_new - cur));
212     }
213 
214     for (i = cur; i < nr_slots_new; i++) {
215         slots[i].slot = i;
216     }
217 
218     kml->slots = slots;
219     kml->nr_slots_allocated = nr_slots_new;
220     trace_kvm_slots_grow(cur, nr_slots_new);
221 
222     return true;
223 }
224 
225 static bool kvm_slots_double(KVMMemoryListener *kml)
226 {
227     return kvm_slots_grow(kml, kml->nr_slots_allocated * 2);
228 }
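
/*
 * Worked example (illustrative, not part of the original source): starting
 * from the default allocation of KVM_MEMSLOTS_NR_ALLOC_DEFAULT (16) entries,
 * each kvm_slots_double() call grows the array to 32, 64, ... entries,
 * clamped to the kernel-reported kvm_state->nr_slots_max; kvm_slots_grow()
 * zero-fills the new entries and assigns their ->slot index.
 */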
229 
230 unsigned int kvm_get_max_memslots(void)
231 {
232     KVMState *s = KVM_STATE(current_accel());
233 
234     return s->nr_slots_max;
235 }
236 
237 unsigned int kvm_get_free_memslots(void)
238 {
239     unsigned int used_slots = 0;
240     KVMState *s = kvm_state;
241     int i;
242 
243     kvm_slots_lock();
244     for (i = 0; i < s->nr_as; i++) {
245         if (!s->as[i].ml) {
246             continue;
247         }
248         used_slots = MAX(used_slots, s->as[i].ml->nr_slots_used);
249     }
250     kvm_slots_unlock();
251 
252     return s->nr_slots_max - used_slots;
253 }
254 
255 /* Called with KVMMemoryListener.slots_lock held */
256 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
257 {
258     unsigned int n;
259     int i;
260 
261     for (i = 0; i < kml->nr_slots_allocated; i++) {
262         if (kml->slots[i].memory_size == 0) {
263             return &kml->slots[i];
264         }
265     }
266 
267     /*
268      * If no free slots, try to grow first by doubling.  Cache the old size
269      * here to avoid another round of search: if the grow succeeded, it
270      * means slots[] now must have the existing "n" slots occupied,
271      * followed by one or more free slots starting from slots[n].
272      */
273     n = kml->nr_slots_allocated;
274     if (kvm_slots_double(kml)) {
275         return &kml->slots[n];
276     }
277 
278     return NULL;
279 }
280 
281 /* Called with KVMMemoryListener.slots_lock held */
282 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
283 {
284     KVMSlot *slot = kvm_get_free_slot(kml);
285 
286     if (slot) {
287         return slot;
288     }
289 
290     fprintf(stderr, "%s: no free slot available\n", __func__);
291     abort();
292 }
293 
294 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
295                                          hwaddr start_addr,
296                                          hwaddr size)
297 {
298     int i;
299 
300     for (i = 0; i < kml->nr_slots_allocated; i++) {
301         KVMSlot *mem = &kml->slots[i];
302 
303         if (start_addr == mem->start_addr && size == mem->memory_size) {
304             return mem;
305         }
306     }
307 
308     return NULL;
309 }
310 
311 /*
312  * Calculate and align the start address and the size of the section.
313  * Return the size. If the size is 0, the aligned section is empty.
314  */
315 static hwaddr kvm_align_section(MemoryRegionSection *section,
316                                 hwaddr *start)
317 {
318     hwaddr size = int128_get64(section->size);
319     hwaddr delta, aligned;
320 
321     /* kvm works in page size chunks, but the function may be called
322        with sub-page size and unaligned start address. Round the start
323        address up to the next page boundary and truncate the size to the previous one. */
324     aligned = ROUND_UP(section->offset_within_address_space,
325                        qemu_real_host_page_size());
326     delta = aligned - section->offset_within_address_space;
327     *start = aligned;
328     if (delta > size) {
329         return 0;
330     }
331 
332     return (size - delta) & qemu_real_host_page_mask();
333 }
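
/*
 * Worked example (illustrative, not part of the original source): assuming a
 * 4 KiB host page size, a section at guest address 0x1800 with size 0x3000
 * yields:
 *
 *   aligned = ROUND_UP(0x1800, 0x1000)  = 0x2000   (*start)
 *   delta   = 0x2000 - 0x1800           = 0x0800
 *   return  = (0x3000 - 0x0800) & ~0xfff = 0x2000
 *
 * i.e. the sub-page head and tail are dropped and only the two fully covered
 * host pages are handed to KVM.
 */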
334 
335 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
336                                        hwaddr *phys_addr)
337 {
338     KVMMemoryListener *kml = &s->memory_listener;
339     int i, ret = 0;
340 
341     kvm_slots_lock();
342     for (i = 0; i < kml->nr_slots_allocated; i++) {
343         KVMSlot *mem = &kml->slots[i];
344 
345         if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
346             *phys_addr = mem->start_addr + (ram - mem->ram);
347             ret = 1;
348             break;
349         }
350     }
351     kvm_slots_unlock();
352 
353     return ret;
354 }
355 
356 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
357 {
358     KVMState *s = kvm_state;
359     struct kvm_userspace_memory_region2 mem;
360     int ret;
361 
362     mem.slot = slot->slot | (kml->as_id << 16);
363     mem.guest_phys_addr = slot->start_addr;
364     mem.userspace_addr = (unsigned long)slot->ram;
365     mem.flags = slot->flags;
366     mem.guest_memfd = slot->guest_memfd;
367     mem.guest_memfd_offset = slot->guest_memfd_offset;
368 
369     if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
370         /* Set the slot size to 0 before setting the slot to the desired
371          * value. This is needed based on KVM commit 75d61fbc. */
372         mem.memory_size = 0;
373 
374         if (kvm_guest_memfd_supported) {
375             ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, &mem);
376         } else {
377             ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
378         }
379         if (ret < 0) {
380             goto err;
381         }
382     }
383     mem.memory_size = slot->memory_size;
384     if (kvm_guest_memfd_supported) {
385         ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, &mem);
386     } else {
387         ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
388     }
389     slot->old_flags = mem.flags;
390 err:
391     trace_kvm_set_user_memory(mem.slot >> 16, (uint16_t)mem.slot, mem.flags,
392                               mem.guest_phys_addr, mem.memory_size,
393                               mem.userspace_addr, mem.guest_memfd,
394                               mem.guest_memfd_offset, ret);
395     if (ret < 0) {
396         if (kvm_guest_memfd_supported) {
397                 error_report("%s: KVM_SET_USER_MEMORY_REGION2 failed, slot=%d,"
398                         " start=0x%" PRIx64 ", size=0x%" PRIx64 ","
399                         " flags=0x%" PRIx32 ", guest_memfd=%" PRId32 ","
400                         " guest_memfd_offset=0x%" PRIx64 ": %s",
401                         __func__, mem.slot, slot->start_addr,
402                         (uint64_t)mem.memory_size, mem.flags,
403                         mem.guest_memfd, (uint64_t)mem.guest_memfd_offset,
404                         strerror(errno));
405         } else {
406                 error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
407                             " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
408                             __func__, mem.slot, slot->start_addr,
409                             (uint64_t)mem.memory_size, strerror(errno));
410         }
411     }
412     return ret;
413 }
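
/*
 * Illustrative note (not part of the original source): toggling
 * KVM_MEM_READONLY on a live slot is done as two ioctls, first with
 * mem.memory_size = 0 (deleting the slot) and then with the real size and
 * the new flags, because the kernel (see KVM commit 75d61fbc referenced
 * above) does not accept an in-place change of the read-only flag.
 */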
414 
415 void kvm_park_vcpu(CPUState *cpu)
416 {
417     struct KVMParkedVcpu *vcpu;
418 
419     trace_kvm_park_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
420 
421     vcpu = g_malloc0(sizeof(*vcpu));
422     vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
423     vcpu->kvm_fd = cpu->kvm_fd;
424     QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
425 }
426 
427 int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id)
428 {
429     struct KVMParkedVcpu *cpu;
430     int kvm_fd = -ENOENT;
431 
432     QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
433         if (cpu->vcpu_id == vcpu_id) {
434             QLIST_REMOVE(cpu, node);
435             kvm_fd = cpu->kvm_fd;
436             g_free(cpu);
437             break;
438         }
439     }
440 
441     trace_kvm_unpark_vcpu(vcpu_id, kvm_fd > 0 ? "unparked" : "!found parked");
442 
443     return kvm_fd;
444 }
445 
446 static void kvm_reset_parked_vcpus(KVMState *s)
447 {
448     struct KVMParkedVcpu *cpu;
449 
450     QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
451         kvm_arch_reset_parked_vcpu(cpu->vcpu_id, cpu->kvm_fd);
452     }
453 }
454 
455 int kvm_create_vcpu(CPUState *cpu)
456 {
457     unsigned long vcpu_id = kvm_arch_vcpu_id(cpu);
458     KVMState *s = kvm_state;
459     int kvm_fd;
460 
461     /* check if the KVM vCPU already exists but is parked */
462     kvm_fd = kvm_unpark_vcpu(s, vcpu_id);
463     if (kvm_fd < 0) {
464         /* vCPU not parked: create a new KVM vCPU */
465         kvm_fd = kvm_vm_ioctl(s, KVM_CREATE_VCPU, vcpu_id);
466         if (kvm_fd < 0) {
467             error_report("KVM_CREATE_VCPU IOCTL failed for vCPU %lu", vcpu_id);
468             return kvm_fd;
469         }
470     }
471 
472     cpu->kvm_fd = kvm_fd;
473     cpu->kvm_state = s;
474     if (!s->guest_state_protected) {
475         cpu->vcpu_dirty = true;
476     }
477     cpu->dirty_pages = 0;
478     cpu->throttle_us_per_full = 0;
479 
480     trace_kvm_create_vcpu(cpu->cpu_index, vcpu_id, kvm_fd);
481 
482     return 0;
483 }
484 
485 int kvm_create_and_park_vcpu(CPUState *cpu)
486 {
487     int ret = 0;
488 
489     ret = kvm_create_vcpu(cpu);
490     if (!ret) {
491         kvm_park_vcpu(cpu);
492     }
493 
494     return ret;
495 }
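
/*
 * Illustrative note (not part of the original source): "parking" keeps the
 * kernel vCPU fd on kvm_state->kvm_parked_vcpus instead of closing it, since
 * a KVM vCPU with a given id cannot be destroyed and re-created while the VM
 * exists; a later kvm_create_vcpu() for the same kvm_arch_vcpu_id() reuses
 * the parked fd via kvm_unpark_vcpu() before falling back to KVM_CREATE_VCPU.
 */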
496 
497 static int do_kvm_destroy_vcpu(CPUState *cpu)
498 {
499     KVMState *s = kvm_state;
500     int mmap_size;
501     int ret = 0;
502 
503     trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
504 
505     ret = kvm_arch_destroy_vcpu(cpu);
506     if (ret < 0) {
507         goto err;
508     }
509 
510     mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
511     if (mmap_size < 0) {
512         ret = mmap_size;
513         trace_kvm_failed_get_vcpu_mmap_size();
514         goto err;
515     }
516 
517     ret = munmap(cpu->kvm_run, mmap_size);
518     if (ret < 0) {
519         goto err;
520     }
521 
522     if (cpu->kvm_dirty_gfns) {
523         ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes);
524         if (ret < 0) {
525             goto err;
526         }
527     }
528 
529     kvm_park_vcpu(cpu);
530 err:
531     return ret;
532 }
533 
534 void kvm_destroy_vcpu(CPUState *cpu)
535 {
536     if (do_kvm_destroy_vcpu(cpu) < 0) {
537         error_report("kvm_destroy_vcpu failed");
538         exit(EXIT_FAILURE);
539     }
540 }
541 
542 int kvm_init_vcpu(CPUState *cpu, Error **errp)
543 {
544     KVMState *s = kvm_state;
545     int mmap_size;
546     int ret;
547 
548     trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
549 
550     ret = kvm_arch_pre_create_vcpu(cpu, errp);
551     if (ret < 0) {
552         goto err;
553     }
554 
555     ret = kvm_create_vcpu(cpu);
556     if (ret < 0) {
557         error_setg_errno(errp, -ret,
558                          "kvm_init_vcpu: kvm_create_vcpu failed (%lu)",
559                          kvm_arch_vcpu_id(cpu));
560         goto err;
561     }
562 
563     mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
564     if (mmap_size < 0) {
565         ret = mmap_size;
566         error_setg_errno(errp, -mmap_size,
567                          "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed");
568         goto err;
569     }
570 
571     cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
572                         cpu->kvm_fd, 0);
573     if (cpu->kvm_run == MAP_FAILED) {
574         ret = -errno;
575         error_setg_errno(errp, ret,
576                          "kvm_init_vcpu: mmap'ing vcpu state failed (%lu)",
577                          kvm_arch_vcpu_id(cpu));
578         goto err;
579     }
580 
581     if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
582         s->coalesced_mmio_ring =
583             (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
584     }
585 
586     if (s->kvm_dirty_ring_size) {
587         /* Use MAP_SHARED to share pages with the kernel */
588         cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_bytes,
589                                    PROT_READ | PROT_WRITE, MAP_SHARED,
590                                    cpu->kvm_fd,
591                                    PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
592         if (cpu->kvm_dirty_gfns == MAP_FAILED) {
593             ret = -errno;
594             goto err;
595         }
596     }
597 
598     ret = kvm_arch_init_vcpu(cpu);
599     if (ret < 0) {
600         error_setg_errno(errp, -ret,
601                          "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)",
602                          kvm_arch_vcpu_id(cpu));
603     }
604     cpu->kvm_vcpu_stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL);
605 
606 err:
607     return ret;
608 }
609 
610 /*
611  * dirty pages logging control
612  */
613 
614 static int kvm_mem_flags(MemoryRegion *mr)
615 {
616     bool readonly = mr->readonly || memory_region_is_romd(mr);
617     int flags = 0;
618 
619     if (memory_region_get_dirty_log_mask(mr) != 0) {
620         flags |= KVM_MEM_LOG_DIRTY_PAGES;
621     }
622     if (readonly && kvm_readonly_mem_allowed) {
623         flags |= KVM_MEM_READONLY;
624     }
625     if (memory_region_has_guest_memfd(mr)) {
626         assert(kvm_guest_memfd_supported);
627         flags |= KVM_MEM_GUEST_MEMFD;
628     }
629     return flags;
630 }
631 
632 /* Called with KVMMemoryListener.slots_lock held */
633 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
634                                  MemoryRegion *mr)
635 {
636     mem->flags = kvm_mem_flags(mr);
637 
638     /* If nothing changed effectively, no need to issue ioctl */
639     if (mem->flags == mem->old_flags) {
640         return 0;
641     }
642 
643     kvm_slot_init_dirty_bitmap(mem);
644     return kvm_set_user_memory_region(kml, mem, false);
645 }
646 
647 static int kvm_section_update_flags(KVMMemoryListener *kml,
648                                     MemoryRegionSection *section)
649 {
650     hwaddr start_addr, size, slot_size;
651     KVMSlot *mem;
652     int ret = 0;
653 
654     size = kvm_align_section(section, &start_addr);
655     if (!size) {
656         return 0;
657     }
658 
659     kvm_slots_lock();
660 
661     while (size && !ret) {
662         slot_size = MIN(kvm_max_slot_size, size);
663         mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
664         if (!mem) {
665             /* We don't have a slot if we want to trap every access. */
666             goto out;
667         }
668 
669         ret = kvm_slot_update_flags(kml, mem, section->mr);
670         start_addr += slot_size;
671         size -= slot_size;
672     }
673 
674 out:
675     kvm_slots_unlock();
676     return ret;
677 }
678 
679 static void kvm_log_start(MemoryListener *listener,
680                           MemoryRegionSection *section,
681                           int old, int new)
682 {
683     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
684     int r;
685 
686     if (old != 0) {
687         return;
688     }
689 
690     r = kvm_section_update_flags(kml, section);
691     if (r < 0) {
692         abort();
693     }
694 }
695 
696 static void kvm_log_stop(MemoryListener *listener,
697                           MemoryRegionSection *section,
698                           int old, int new)
699 {
700     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
701     int r;
702 
703     if (new != 0) {
704         return;
705     }
706 
707     r = kvm_section_update_flags(kml, section);
708     if (r < 0) {
709         abort();
710     }
711 }
712 
713 /* get kvm's dirty pages bitmap and update qemu's */
714 static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
715 {
716     ram_addr_t start = slot->ram_start_offset;
717     ram_addr_t pages = slot->memory_size / qemu_real_host_page_size();
718 
719     cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
720 }
721 
722 static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
723 {
724     memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
725 }
726 
727 #define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
728 
729 /* Allocate the dirty bitmap for a slot  */
730 static void kvm_slot_init_dirty_bitmap(KVMSlot *mem)
731 {
732     if (!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) || mem->dirty_bmap) {
733         return;
734     }
735 
736     /*
737      * XXX bad kernel interface alert
738      * For dirty bitmap, kernel allocates array of size aligned to
739      * bits-per-long.  But for case when the kernel is 64bits and
740      * the userspace is 32bits, userspace can't align to the same
741      * bits-per-long, since sizeof(long) is different between kernel
742      * and user space.  This way, userspace will provide a buffer which
743      * may be 4 bytes less than the kernel will use, resulting in
744      * userspace memory corruption (which is not detectable by valgrind
745      * either, in most cases).
746      * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
747      * a hope that sizeof(long) won't become >8 any time soon.
748      *
749      * Note: the granule of kvm dirty log is qemu_real_host_page_size.
750      * And mem->memory_size is aligned to it (otherwise this mem can't
751      * be registered to KVM).
752      */
753     hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size(),
754                                         /*HOST_LONG_BITS*/ 64) / 8;
755     mem->dirty_bmap = g_malloc0(bitmap_size);
756     mem->dirty_bmap_size = bitmap_size;
757 }
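
/*
 * Worked example (illustrative, not part of the original source): with 4 KiB
 * host pages, a 1 GiB memslot covers 262144 pages, so:
 *
 *   bitmap_size = ALIGN(262144, 64) / 8 = 32768 bytes (32 KiB)
 */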
758 
759 /*
760  * Sync dirty bitmap from kernel to KVMSlot.dirty_bmap, return true if
761  * succeeded, false otherwise
762  */
763 static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
764 {
765     struct kvm_dirty_log d = {};
766     int ret;
767 
768     d.dirty_bitmap = slot->dirty_bmap;
769     d.slot = slot->slot | (slot->as_id << 16);
770     ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);
771 
772     if (ret == -ENOENT) {
773         /* kernel does not have dirty bitmap in this slot */
774         ret = 0;
775     }
776     if (ret) {
777         error_report_once("%s: KVM_GET_DIRTY_LOG failed with %d",
778                           __func__, ret);
779     }
780     return ret == 0;
781 }
782 
783 /* Should be with all slots_lock held for the address spaces. */
784 static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
785                                      uint32_t slot_id, uint64_t offset)
786 {
787     KVMMemoryListener *kml;
788     KVMSlot *mem;
789 
790     if (as_id >= s->nr_as) {
791         return;
792     }
793 
794     kml = s->as[as_id].ml;
795     mem = &kml->slots[slot_id];
796 
797     if (!mem->memory_size || offset >=
798         (mem->memory_size / qemu_real_host_page_size())) {
799         return;
800     }
801 
802     set_bit(offset, mem->dirty_bmap);
803 }
804 
805 static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
806 {
807     /*
808      * Read the flags before the value.  Pairs with barrier in
809      * KVM's kvm_dirty_ring_push() function.
810      */
811     return qatomic_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY;
812 }
813 
814 static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
815 {
816     /*
817      * Use a store-release so that the CPU that executes KVM_RESET_DIRTY_RINGS
818      * sees the full content of the ring:
819      *
820      * CPU0                     CPU1                         CPU2
821      * ------------------------------------------------------------------------------
822      *                                                       fill gfn0
823      *                                                       store-rel flags for gfn0
824      * load-acq flags for gfn0
825      * store-rel RESET for gfn0
826      *                          ioctl(RESET_RINGS)
827      *                            load-acq flags for gfn0
828      *                            check if flags have RESET
829      *
830      * The synchronization goes from CPU2 to CPU0 to CPU1.
831      */
832     qatomic_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
833 }
834 
835 /*
836  * Should be with all slots_lock held for the address spaces.  It returns the
837  * dirty page we've collected on this dirty ring.
838  */
839 static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
840 {
841     struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
842     uint32_t ring_size = s->kvm_dirty_ring_size;
843     uint32_t count = 0, fetch = cpu->kvm_fetch_index;
844 
845     /*
846      * It's possible that we race with vcpu creation code where the vcpu is
847      * put onto the vcpus list but has not yet initialized the dirty ring
848      * structures.  If so, skip it.
849      */
850     if (!cpu->created) {
851         return 0;
852     }
853 
854     assert(dirty_gfns && ring_size);
855     trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);
856 
857     while (true) {
858         cur = &dirty_gfns[fetch % ring_size];
859         if (!dirty_gfn_is_dirtied(cur)) {
860             break;
861         }
862         kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
863                                  cur->offset);
864         dirty_gfn_set_collected(cur);
865         trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
866         fetch++;
867         count++;
868     }
869     cpu->kvm_fetch_index = fetch;
870     cpu->dirty_pages += count;
871 
872     return count;
873 }
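
/*
 * Worked example (illustrative, not part of the original source): the ring is
 * consumed as a circular buffer indexed by cpu->kvm_fetch_index; with
 * ring_size = 4096 and kvm_fetch_index = 4100, the next entry examined is
 * dirty_gfns[4100 % 4096] = dirty_gfns[4], and reaping stops at the first
 * entry whose flags are not KVM_DIRTY_GFN_F_DIRTY.
 */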
874 
875 /* Must be with slots_lock held */
876 static uint64_t kvm_dirty_ring_reap_locked(KVMState *s, CPUState* cpu)
877 {
878     int ret;
879     uint64_t total = 0;
880     int64_t stamp;
881 
882     stamp = get_clock();
883 
884     if (cpu) {
885         total = kvm_dirty_ring_reap_one(s, cpu);
886     } else {
887         CPU_FOREACH(cpu) {
888             total += kvm_dirty_ring_reap_one(s, cpu);
889         }
890     }
891 
892     if (total) {
893         ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
894         assert(ret == total);
895     }
896 
897     stamp = get_clock() - stamp;
898 
899     if (total) {
900         trace_kvm_dirty_ring_reap(total, stamp / 1000);
901     }
902 
903     return total;
904 }
905 
906 /*
907  * Currently for simplicity, we must hold BQL before calling this.  We can
908  * consider to drop the BQL if we're clear with all the race conditions.
909  */
910 static uint64_t kvm_dirty_ring_reap(KVMState *s, CPUState *cpu)
911 {
912     uint64_t total;
913 
914     /*
915      * We need to lock all kvm slots for all address spaces here,
916      * because:
917      *
918      * (1) We need to mark dirty for dirty bitmaps in multiple slots
919      *     and for tons of pages, so it's better to take the lock here
920      *     once rather than once per page.  And more importantly,
921      *
922      * (2) We must _NOT_ publish dirty bits to the other threads
923      *     (e.g., the migration thread) via the kvm memory slot dirty
924      *     bitmaps before correctly re-protect those dirtied pages.
925      *     Otherwise we can have potential risk of data corruption if
926      *     the page data is read in the other thread before we do
927      *     reset below.
928      */
929     kvm_slots_lock();
930     total = kvm_dirty_ring_reap_locked(s, cpu);
931     kvm_slots_unlock();
932 
933     return total;
934 }
935 
936 static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
937 {
938     /* No need to do anything */
939 }
940 
941 /*
942  * Kick all vcpus out in a synchronized way.  When returned, we
943  * guarantee that every vcpu has been kicked and at least returned to
944  * userspace once.
945  */
946 static void kvm_cpu_synchronize_kick_all(void)
947 {
948     CPUState *cpu;
949 
950     CPU_FOREACH(cpu) {
951         run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
952     }
953 }
954 
955 /*
956  * Flush all the existing dirty pages to the KVM slot buffers.  When
957  * this call returns, we guarantee that all the touched dirty pages
958  * before calling this function have been put into the per-kvmslot
959  * dirty bitmap.
960  *
961  * This function must be called with BQL held.
962  */
963 static void kvm_dirty_ring_flush(void)
964 {
965     trace_kvm_dirty_ring_flush(0);
966     /*
967      * The function needs to be serialized.  Since this function
968      * should always be with BQL held, serialization is guaranteed.
969      * However, let's be sure of it.
970      */
971     assert(bql_locked());
972     /*
973      * First make sure to flush the hardware buffers by kicking all
974      * vcpus out in a synchronous way.
975      */
976     kvm_cpu_synchronize_kick_all();
977     kvm_dirty_ring_reap(kvm_state, NULL);
978     trace_kvm_dirty_ring_flush(1);
979 }
980 
981 /**
982  * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
983  *
984  * This function will first try to fetch dirty bitmap from the kernel,
985  * and then updates qemu's dirty bitmap.
986  *
987  * NOTE: caller must be with kml->slots_lock held.
988  *
989  * @kml: the KVM memory listener object
990  * @section: the memory section to sync the dirty bitmap with
991  */
992 static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
993                                            MemoryRegionSection *section)
994 {
995     KVMState *s = kvm_state;
996     KVMSlot *mem;
997     hwaddr start_addr, size;
998     hwaddr slot_size;
999 
1000     size = kvm_align_section(section, &start_addr);
1001     while (size) {
1002         slot_size = MIN(kvm_max_slot_size, size);
1003         mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
1004         if (!mem) {
1005             /* We don't have a slot if we want to trap every access. */
1006             return;
1007         }
1008         if (kvm_slot_get_dirty_log(s, mem)) {
1009             kvm_slot_sync_dirty_pages(mem);
1010         }
1011         start_addr += slot_size;
1012         size -= slot_size;
1013     }
1014 }
1015 
1016 /* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */
1017 #define KVM_CLEAR_LOG_SHIFT  6
1018 #define KVM_CLEAR_LOG_ALIGN  (qemu_real_host_page_size() << KVM_CLEAR_LOG_SHIFT)
1019 #define KVM_CLEAR_LOG_MASK   (-KVM_CLEAR_LOG_ALIGN)
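
/*
 * Worked example (illustrative, not part of the original source): with 4 KiB
 * host pages, KVM_CLEAR_LOG_ALIGN = 4096 << 6 = 256 KiB, i.e.
 * KVM_CLEAR_DIRTY_LOG ranges must start on a 64-page boundary and, except at
 * the end of a memslot, cover a multiple of 64 pages.
 */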
1020 
1021 static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
1022                                   uint64_t size)
1023 {
1024     KVMState *s = kvm_state;
1025     uint64_t end, bmap_start, start_delta, bmap_npages;
1026     struct kvm_clear_dirty_log d;
1027     unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size();
1028     int ret;
1029 
1030     /*
1031      * We need to extend either the start or the size or both to
1032      * satisfy the KVM interface requirement.  Firstly, do the start
1033      * page alignment on 64 host pages
1034      */
1035     bmap_start = start & KVM_CLEAR_LOG_MASK;
1036     start_delta = start - bmap_start;
1037     bmap_start /= psize;
1038 
1039     /*
1040      * The kernel interface has restriction on the size too, that either:
1041      *
1042      * (1) the size is 64 host pages aligned (just like the start), or
1043      * (2) the size fills up until the end of the KVM memslot.
1044      */
1045     bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN)
1046         << KVM_CLEAR_LOG_SHIFT;
1047     end = mem->memory_size / psize;
1048     if (bmap_npages > end - bmap_start) {
1049         bmap_npages = end - bmap_start;
1050     }
1051     start_delta /= psize;
1052 
1053     /*
1054      * Prepare the bitmap to clear dirty bits.  Here we must guarantee
1055      * that we won't clear any unknown dirty bits otherwise we might
1056      * accidentally clear some set bits which are not yet synced from
1057      * the kernel into QEMU's bitmap, then we'll lose track of the
1058      * guest modifications upon those pages (which can directly lead
1059      * to guest data loss or panic after migration).
1060      *
1061      * Layout of the KVMSlot.dirty_bmap:
1062      *
1063      *                   |<-------- bmap_npages -----------..>|
1064      *                                                     [1]
1065      *                     start_delta         size
1066      *  |----------------|-------------|------------------|------------|
1067      *  ^                ^             ^                               ^
1068      *  |                |             |                               |
1069      * start          bmap_start     (start)                         end
1070      * of memslot                                             of memslot
1071      *
1072      * [1] bmap_npages can be aligned to either 64 pages or the end of slot
1073      */
1074 
1075     assert(bmap_start % BITS_PER_LONG == 0);
1076     /* We should never do log_clear before log_sync */
1077     assert(mem->dirty_bmap);
1078     if (start_delta || bmap_npages - size / psize) {
1079         /* Slow path - we need to manipulate a temp bitmap */
1080         bmap_clear = bitmap_new(bmap_npages);
1081         bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap,
1082                                     bmap_start, start_delta + size / psize);
1083         /*
1084          * We need to fill the holes at start because that was not
1085          * specified by the caller and we extended the bitmap only for
1086          * 64 pages alignment
1087          */
1088         bitmap_clear(bmap_clear, 0, start_delta);
1089         d.dirty_bitmap = bmap_clear;
1090     } else {
1091         /*
1092          * Fast path - both start and size align well with BITS_PER_LONG
1093          * (or the end of memory slot)
1094          */
1095         d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start);
1096     }
1097 
1098     d.first_page = bmap_start;
1099     /* It should never overflow.  If it happens, say something */
1100     assert(bmap_npages <= UINT32_MAX);
1101     d.num_pages = bmap_npages;
1102     d.slot = mem->slot | (as_id << 16);
1103 
1104     ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d);
1105     if (ret < 0 && ret != -ENOENT) {
1106         error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
1107                      "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d",
1108                      __func__, d.slot, (uint64_t)d.first_page,
1109                      (uint32_t)d.num_pages, ret);
1110     } else {
1111         ret = 0;
1112         trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages);
1113     }
1114 
1115     /*
1116      * After we have updated the remote dirty bitmap, we update the
1117      * cached bitmap as well for the memslot, then if another user
1118      * clears the same region we know we shouldn't clear it again on
1119      * the remote otherwise it's data loss as well.
1120      */
1121     bitmap_clear(mem->dirty_bmap, bmap_start + start_delta,
1122                  size / psize);
1123     /* This handles the NULL case well */
1124     g_free(bmap_clear);
1125     return ret;
1126 }
1127 
1128 
1129 /**
1130  * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
1131  *
1132  * NOTE: this will be a no-op if we haven't enabled manual dirty log
1133  * protection in the host kernel because in that case this operation
1134  * will be done within log_sync().
1135  *
1136  * @kml:     the kvm memory listener
1137  * @section: the memory range to clear dirty bitmap
1138  */
1139 static int kvm_physical_log_clear(KVMMemoryListener *kml,
1140                                   MemoryRegionSection *section)
1141 {
1142     KVMState *s = kvm_state;
1143     uint64_t start, size, offset, count;
1144     KVMSlot *mem;
1145     int ret = 0, i;
1146 
1147     if (!s->manual_dirty_log_protect) {
1148         /* No need to do explicit clear */
1149         return ret;
1150     }
1151 
1152     start = section->offset_within_address_space;
1153     size = int128_get64(section->size);
1154 
1155     if (!size) {
1156         /* Nothing more we can do... */
1157         return ret;
1158     }
1159 
1160     kvm_slots_lock();
1161 
1162     for (i = 0; i < kml->nr_slots_allocated; i++) {
1163         mem = &kml->slots[i];
1164         /* Discard slots that are empty or do not overlap the section */
1165         if (!mem->memory_size ||
1166             mem->start_addr > start + size - 1 ||
1167             start > mem->start_addr + mem->memory_size - 1) {
1168             continue;
1169         }
1170 
1171         if (start >= mem->start_addr) {
1172             /* The slot starts before section or is aligned to it.  */
1173             offset = start - mem->start_addr;
1174             count = MIN(mem->memory_size - offset, size);
1175         } else {
1176             /* The slot starts after section.  */
1177             offset = 0;
1178             count = MIN(mem->memory_size, size - (mem->start_addr - start));
1179         }
1180         ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count);
1181         if (ret < 0) {
1182             break;
1183         }
1184     }
1185 
1186     kvm_slots_unlock();
1187 
1188     return ret;
1189 }
1190 
1191 static void kvm_coalesce_mmio_region(MemoryListener *listener,
1192                                      MemoryRegionSection *secion,
1193                                      hwaddr start, hwaddr size)
1194 {
1195     KVMState *s = kvm_state;
1196 
1197     if (s->coalesced_mmio) {
1198         struct kvm_coalesced_mmio_zone zone;
1199 
1200         zone.addr = start;
1201         zone.size = size;
1202         zone.pad = 0;
1203 
1204         (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1205     }
1206 }
1207 
1208 static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
1209                                        MemoryRegionSection *secion,
1210                                        hwaddr start, hwaddr size)
1211 {
1212     KVMState *s = kvm_state;
1213 
1214     if (s->coalesced_mmio) {
1215         struct kvm_coalesced_mmio_zone zone;
1216 
1217         zone.addr = start;
1218         zone.size = size;
1219         zone.pad = 0;
1220 
1221         (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1222     }
1223 }
1224 
1225 static void kvm_coalesce_pio_add(MemoryListener *listener,
1226                                 MemoryRegionSection *section,
1227                                 hwaddr start, hwaddr size)
1228 {
1229     KVMState *s = kvm_state;
1230 
1231     if (s->coalesced_pio) {
1232         struct kvm_coalesced_mmio_zone zone;
1233 
1234         zone.addr = start;
1235         zone.size = size;
1236         zone.pio = 1;
1237 
1238         (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1239     }
1240 }
1241 
1242 static void kvm_coalesce_pio_del(MemoryListener *listener,
1243                                 MemoryRegionSection *section,
1244                                 hwaddr start, hwaddr size)
1245 {
1246     KVMState *s = kvm_state;
1247 
1248     if (s->coalesced_pio) {
1249         struct kvm_coalesced_mmio_zone zone;
1250 
1251         zone.addr = start;
1252         zone.size = size;
1253         zone.pio = 1;
1254 
1255         (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1256      }
1257 }
1258 
1259 int kvm_check_extension(KVMState *s, unsigned int extension)
1260 {
1261     int ret;
1262 
1263     ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
1264     if (ret < 0) {
1265         ret = 0;
1266     }
1267 
1268     return ret;
1269 }
1270 
1271 int kvm_vm_check_extension(KVMState *s, unsigned int extension)
1272 {
1273     int ret;
1274 
1275     ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
1276     if (ret < 0) {
1277         /* VM wide version not implemented, use global one instead */
1278         ret = kvm_check_extension(s, extension);
1279     }
1280 
1281     return ret;
1282 }
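
/*
 * Usage sketch (illustrative, not part of the original source; the capability
 * constant is only an example from <linux/kvm.h>): a check returns 0 when the
 * extension is absent and a positive, capability-specific value when present,
 * e.g.:
 *
 *   if (kvm_vm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE)) {
 *       ... the VM supports more than one KVM address space ...
 *   }
 */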
1283 
1284 /*
1285  * We track the poisoned pages to be able to:
1286  * - replace them on VM reset
1287  * - block a migration for a VM with a poisoned page
1288  */
1289 typedef struct HWPoisonPage {
1290     ram_addr_t ram_addr;
1291     QLIST_ENTRY(HWPoisonPage) list;
1292 } HWPoisonPage;
1293 
1294 static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
1295     QLIST_HEAD_INITIALIZER(hwpoison_page_list);
1296 
1297 static void kvm_unpoison_all(void *param)
1298 {
1299     HWPoisonPage *page, *next_page;
1300 
1301     QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
1302         QLIST_REMOVE(page, list);
1303         qemu_ram_remap(page->ram_addr);
1304         g_free(page);
1305     }
1306 }
1307 
1308 void kvm_hwpoison_page_add(ram_addr_t ram_addr)
1309 {
1310     HWPoisonPage *page;
1311 
1312     QLIST_FOREACH(page, &hwpoison_page_list, list) {
1313         if (page->ram_addr == ram_addr) {
1314             return;
1315         }
1316     }
1317     page = g_new(HWPoisonPage, 1);
1318     page->ram_addr = ram_addr;
1319     QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
1320 }
1321 
1322 bool kvm_hwpoisoned_mem(void)
1323 {
1324     return !QLIST_EMPTY(&hwpoison_page_list);
1325 }
1326 
1327 static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
1328 {
1329     if (target_needs_bswap()) {
1330         /*
1331          * The kernel expects ioeventfd values in HOST_BIG_ENDIAN
1332          * endianness, but the memory core hands them in target endianness.
1333          * For example, PPC is always treated as big-endian even if running
1334          * on KVM and on PPC64LE.  Correct here, swapping back.
1335          */
1336         switch (size) {
1337         case 2:
1338             val = bswap16(val);
1339             break;
1340         case 4:
1341             val = bswap32(val);
1342             break;
1343         }
1344     }
1345     return val;
1346 }
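
/*
 * Worked example (illustrative, not part of the original source): when
 * target_needs_bswap() is true, a 2-byte datamatch value of 0x1234 is handed
 * to KVM as bswap16(0x1234) = 0x3412 in the ioeventfd datamatch field.
 */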
1347 
1348 static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
1349                                   bool assign, uint32_t size, bool datamatch)
1350 {
1351     int ret;
1352     struct kvm_ioeventfd iofd = {
1353         .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
1354         .addr = addr,
1355         .len = size,
1356         .flags = 0,
1357         .fd = fd,
1358     };
1359 
1360     trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size,
1361                                  datamatch);
1362     if (!kvm_enabled()) {
1363         return -ENOSYS;
1364     }
1365 
1366     if (datamatch) {
1367         iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
1368     }
1369     if (!assign) {
1370         iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1371     }
1372 
1373     ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
1374 
1375     if (ret < 0) {
1376         return -errno;
1377     }
1378 
1379     return 0;
1380 }
1381 
1382 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
1383                                  bool assign, uint32_t size, bool datamatch)
1384 {
1385     struct kvm_ioeventfd kick = {
1386         .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
1387         .addr = addr,
1388         .flags = KVM_IOEVENTFD_FLAG_PIO,
1389         .len = size,
1390         .fd = fd,
1391     };
1392     int r;
1393     trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch);
1394     if (!kvm_enabled()) {
1395         return -ENOSYS;
1396     }
1397     if (datamatch) {
1398         kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
1399     }
1400     if (!assign) {
1401         kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1402     }
1403     r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
1404     if (r < 0) {
1405         return r;
1406     }
1407     return 0;
1408 }
1409 
1410 
1411 static const KVMCapabilityInfo *
1412 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
1413 {
1414     while (list->name) {
1415         if (!kvm_check_extension(s, list->value)) {
1416             return list;
1417         }
1418         list++;
1419     }
1420     return NULL;
1421 }
1422 
1423 void kvm_set_max_memslot_size(hwaddr max_slot_size)
1424 {
1425     g_assert(
1426         ROUND_UP(max_slot_size, qemu_real_host_page_size()) == max_slot_size
1427     );
1428     kvm_max_slot_size = max_slot_size;
1429 }
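
/*
 * Illustrative note (not part of the original source): when a memory section
 * is larger than kvm_max_slot_size, kvm_set_phys_mem() below registers it as
 * several consecutive memslots of at most kvm_max_slot_size bytes each (see
 * the MIN(kvm_max_slot_size, size) loops).
 */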
1430 
1431 static int kvm_set_memory_attributes(hwaddr start, uint64_t size, uint64_t attr)
1432 {
1433     struct kvm_memory_attributes attrs;
1434     int r;
1435 
1436     assert((attr & kvm_supported_memory_attributes) == attr);
1437     attrs.attributes = attr;
1438     attrs.address = start;
1439     attrs.size = size;
1440     attrs.flags = 0;
1441 
1442     r = kvm_vm_ioctl(kvm_state, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
1443     if (r) {
1444         error_report("failed to set memory (0x%" HWADDR_PRIx "+0x%" PRIx64 ") "
1445                      "with attr 0x%" PRIx64 " error '%s'",
1446                      start, size, attr, strerror(errno));
1447     }
1448     return r;
1449 }
1450 
1451 int kvm_set_memory_attributes_private(hwaddr start, uint64_t size)
1452 {
1453     return kvm_set_memory_attributes(start, size, KVM_MEMORY_ATTRIBUTE_PRIVATE);
1454 }
1455 
1456 int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size)
1457 {
1458     return kvm_set_memory_attributes(start, size, 0);
1459 }
1460 
1461 /* Called with KVMMemoryListener.slots_lock held */
1462 static void kvm_set_phys_mem(KVMMemoryListener *kml,
1463                              MemoryRegionSection *section, bool add)
1464 {
1465     KVMSlot *mem;
1466     int err;
1467     MemoryRegion *mr = section->mr;
1468     bool writable = !mr->readonly && !mr->rom_device;
1469     hwaddr start_addr, size, slot_size, mr_offset;
1470     ram_addr_t ram_start_offset;
1471     void *ram;
1472 
1473     if (!memory_region_is_ram(mr)) {
1474         if (writable || !kvm_readonly_mem_allowed) {
1475             return;
1476         } else if (!mr->romd_mode) {
1477             /* If the memory device is not in romd_mode, then we actually want
1478              * to remove the kvm memory slot so all accesses will trap. */
1479             add = false;
1480         }
1481     }
1482 
1483     size = kvm_align_section(section, &start_addr);
1484     if (!size) {
1485         return;
1486     }
1487 
1488     /* The offset of the kvmslot within the memory region */
1489     mr_offset = section->offset_within_region + start_addr -
1490         section->offset_within_address_space;
1491 
1492     /* use aligned delta to align the ram address and offset */
1493     ram = memory_region_get_ram_ptr(mr) + mr_offset;
1494     ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;
1495 
1496     if (!add) {
1497         do {
1498             slot_size = MIN(kvm_max_slot_size, size);
1499             mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
1500             if (!mem) {
1501                 return;
1502             }
1503             if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1504                 /*
1505                  * NOTE: We should be aware of the fact that here we're only
1506                  * doing a best effort to sync dirty bits.  No matter whether
1507                  * we're using dirty log or dirty ring, we ignored two facts:
1508                  *
1509                  * (1) dirty bits can reside in hardware buffers (PML)
1510                  *
1511                  * (2) after we collected dirty bits here, pages can be dirtied
1512                  * again before we do the final KVM_SET_USER_MEMORY_REGION to
1513                  * remove the slot.
1514                  *
1515                  * Not easy.  Let's cross our fingers until it's fixed.
1516                  */
1517                 if (kvm_state->kvm_dirty_ring_size) {
1518                     kvm_dirty_ring_reap_locked(kvm_state, NULL);
1519                     if (kvm_state->kvm_dirty_ring_with_bitmap) {
1520                         kvm_slot_sync_dirty_pages(mem);
1521                         kvm_slot_get_dirty_log(kvm_state, mem);
1522                     }
1523                 } else {
1524                     kvm_slot_get_dirty_log(kvm_state, mem);
1525                 }
1526                 kvm_slot_sync_dirty_pages(mem);
1527             }
1528 
1529             /* unregister the slot */
1530             g_free(mem->dirty_bmap);
1531             mem->dirty_bmap = NULL;
1532             mem->memory_size = 0;
1533             mem->flags = 0;
1534             err = kvm_set_user_memory_region(kml, mem, false);
1535             if (err) {
1536                 fprintf(stderr, "%s: error unregistering slot: %s\n",
1537                         __func__, strerror(-err));
1538                 abort();
1539             }
1540             start_addr += slot_size;
1541             size -= slot_size;
1542             kml->nr_slots_used--;
1543         } while (size);
1544         return;
1545     }
1546 
1547     /* register the new slot */
1548     do {
1549         slot_size = MIN(kvm_max_slot_size, size);
1550         mem = kvm_alloc_slot(kml);
1551         mem->as_id = kml->as_id;
1552         mem->memory_size = slot_size;
1553         mem->start_addr = start_addr;
1554         mem->ram_start_offset = ram_start_offset;
1555         mem->ram = ram;
1556         mem->flags = kvm_mem_flags(mr);
1557         mem->guest_memfd = mr->ram_block->guest_memfd;
1558         mem->guest_memfd_offset = (uint8_t*)ram - mr->ram_block->host;
1559 
1560         kvm_slot_init_dirty_bitmap(mem);
1561         err = kvm_set_user_memory_region(kml, mem, true);
1562         if (err) {
1563             fprintf(stderr, "%s: error registering slot: %s\n", __func__,
1564                     strerror(-err));
1565             abort();
1566         }
1567 
1568         if (memory_region_has_guest_memfd(mr)) {
1569             err = kvm_set_memory_attributes_private(start_addr, slot_size);
1570             if (err) {
1571                 error_report("%s: failed to set memory attribute private: %s",
1572                              __func__, strerror(-err));
1573                 exit(1);
1574             }
1575         }
1576 
1577         start_addr += slot_size;
1578         ram_start_offset += slot_size;
1579         ram += slot_size;
1580         size -= slot_size;
1581         kml->nr_slots_used++;
1582     } while (size);
1583 }
1584 
1585 static void *kvm_dirty_ring_reaper_thread(void *data)
1586 {
1587     KVMState *s = data;
1588     struct KVMDirtyRingReaper *r = &s->reaper;
1589 
1590     rcu_register_thread();
1591 
1592     trace_kvm_dirty_ring_reaper("init");
1593 
1594     while (true) {
1595         r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
1596         trace_kvm_dirty_ring_reaper("wait");
1597         /*
1598          * TODO: provide a smarter timeout rather than a constant?
1599          */
1600         sleep(1);
1601 
1602     /* keep sleeping so that the reaper does not interfere with dirtylimit */
1603         if (dirtylimit_in_service()) {
1604             continue;
1605         }
1606 
1607         trace_kvm_dirty_ring_reaper("wakeup");
1608         r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
1609 
1610         bql_lock();
1611         kvm_dirty_ring_reap(s, NULL);
1612         bql_unlock();
1613 
1614         r->reaper_iteration++;
1615     }
1616 
1617     g_assert_not_reached();
1618 }
1619 
1620 static void kvm_dirty_ring_reaper_init(KVMState *s)
1621 {
1622     struct KVMDirtyRingReaper *r = &s->reaper;
1623 
1624     qemu_thread_create(&r->reaper_thr, "kvm-reaper",
1625                        kvm_dirty_ring_reaper_thread,
1626                        s, QEMU_THREAD_JOINABLE);
1627 }
1628 
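/*
 * Probe and enable the KVM dirty ring.  Try KVM_CAP_DIRTY_LOG_RING first and
 * fall back to the acquire/release variant; if neither is available, leave
 * the ring size at zero so the bitmap-based dirty logging path is used.  The
 * backup bitmap is also enabled when the kernel supports it.
 */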
1629 static int kvm_dirty_ring_init(KVMState *s)
1630 {
1631     uint32_t ring_size = s->kvm_dirty_ring_size;
1632     uint64_t ring_bytes = ring_size * sizeof(struct kvm_dirty_gfn);
1633     unsigned int capability = KVM_CAP_DIRTY_LOG_RING;
1634     int ret;
1635 
1636     s->kvm_dirty_ring_size = 0;
1637     s->kvm_dirty_ring_bytes = 0;
1638 
1639     /* Bail if the dirty ring size isn't specified */
1640     if (!ring_size) {
1641         return 0;
1642     }
1643 
1644     /*
1645      * Read the max supported pages. Fall back to dirty logging mode
1646      * if the dirty ring isn't supported.
1647      */
1648     ret = kvm_vm_check_extension(s, capability);
1649     if (ret <= 0) {
1650         capability = KVM_CAP_DIRTY_LOG_RING_ACQ_REL;
1651         ret = kvm_vm_check_extension(s, capability);
1652     }
1653 
1654     if (ret <= 0) {
1655         warn_report("KVM dirty ring not available, using bitmap method");
1656         return 0;
1657     }
1658 
1659     if (ring_bytes > ret) {
1660         error_report("KVM dirty ring size %" PRIu32 " too big "
1661                      "(maximum is %ld).  Please use a smaller value.",
1662                      ring_size, (long)ret / sizeof(struct kvm_dirty_gfn));
1663         return -EINVAL;
1664     }
1665 
1666     ret = kvm_vm_enable_cap(s, capability, 0, ring_bytes);
1667     if (ret) {
1668         error_report("Enabling of KVM dirty ring failed: %s. "
1669                      "Suggested minimum value is 1024.", strerror(-ret));
1670         return -EIO;
1671     }
1672 
1673     /* Enable the backup bitmap if it is supported */
1674     ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP);
1675     if (ret > 0) {
1676         ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP, 0);
1677         if (ret) {
1678             error_report("Enabling of KVM dirty ring's backup bitmap failed: "
1679                          "%s. ", strerror(-ret));
1680             return -EIO;
1681         }
1682 
1683         s->kvm_dirty_ring_with_bitmap = true;
1684     }
1685 
1686     s->kvm_dirty_ring_size = ring_size;
1687     s->kvm_dirty_ring_bytes = ring_bytes;
1688 
1689     return 0;
1690 }
1691 
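/*
 * region_add/region_del only queue the affected sections; the actual memslot
 * updates are applied in kvm_region_commit(), so that overlapping add/remove
 * pairs can be handled as one (pseudo-)atomic transaction.
 */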
1692 static void kvm_region_add(MemoryListener *listener,
1693                            MemoryRegionSection *section)
1694 {
1695     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1696     KVMMemoryUpdate *update;
1697 
1698     update = g_new0(KVMMemoryUpdate, 1);
1699     update->section = *section;
1700 
1701     QSIMPLEQ_INSERT_TAIL(&kml->transaction_add, update, next);
1702 }
1703 
1704 static void kvm_region_del(MemoryListener *listener,
1705                            MemoryRegionSection *section)
1706 {
1707     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1708     KVMMemoryUpdate *update;
1709 
1710     update = g_new0(KVMMemoryUpdate, 1);
1711     update->section = *section;
1712 
1713     QSIMPLEQ_INSERT_TAIL(&kml->transaction_del, update, next);
1714 }
1715 
1716 static void kvm_region_commit(MemoryListener *listener)
1717 {
1718     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
1719                                           listener);
1720     KVMMemoryUpdate *u1, *u2;
1721     bool need_inhibit = false;
1722 
1723     if (QSIMPLEQ_EMPTY(&kml->transaction_add) &&
1724         QSIMPLEQ_EMPTY(&kml->transaction_del)) {
1725         return;
1726     }
1727 
1728     /*
1729      * We have to be careful when regions to add overlap with ranges to remove.
1730      * We have to simulate atomic KVM memslot updates by making sure no ioctl()
1731      * is currently active.
1732      *
1733      * The lists are ordered by address, so it's easy to find overlaps.
1734      */
1735     u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
1736     u2 = QSIMPLEQ_FIRST(&kml->transaction_add);
1737     while (u1 && u2) {
1738         Range r1, r2;
1739 
1740         range_init_nofail(&r1, u1->section.offset_within_address_space,
1741                           int128_get64(u1->section.size));
1742         range_init_nofail(&r2, u2->section.offset_within_address_space,
1743                           int128_get64(u2->section.size));
1744 
1745         if (range_overlaps_range(&r1, &r2)) {
1746             need_inhibit = true;
1747             break;
1748         }
1749         if (range_lob(&r1) < range_lob(&r2)) {
1750             u1 = QSIMPLEQ_NEXT(u1, next);
1751         } else {
1752             u2 = QSIMPLEQ_NEXT(u2, next);
1753         }
1754     }
1755 
1756     kvm_slots_lock();
1757     if (need_inhibit) {
1758         accel_ioctl_inhibit_begin();
1759     }
1760 
1761     /* Remove all memslots before adding the new ones. */
1762     while (!QSIMPLEQ_EMPTY(&kml->transaction_del)) {
1763         u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
1764         QSIMPLEQ_REMOVE_HEAD(&kml->transaction_del, next);
1765 
1766         kvm_set_phys_mem(kml, &u1->section, false);
1767         memory_region_unref(u1->section.mr);
1768 
1769         g_free(u1);
1770     }
1771     while (!QSIMPLEQ_EMPTY(&kml->transaction_add)) {
1772         u1 = QSIMPLEQ_FIRST(&kml->transaction_add);
1773         QSIMPLEQ_REMOVE_HEAD(&kml->transaction_add, next);
1774 
1775         memory_region_ref(u1->section.mr);
1776         kvm_set_phys_mem(kml, &u1->section, true);
1777 
1778         g_free(u1);
1779     }
1780 
1781     if (need_inhibit) {
1782         accel_ioctl_inhibit_end();
1783     }
1784     kvm_slots_unlock();
1785 }
1786 
1787 static void kvm_log_sync(MemoryListener *listener,
1788                          MemoryRegionSection *section)
1789 {
1790     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1791 
1792     kvm_slots_lock();
1793     kvm_physical_sync_dirty_bitmap(kml, section);
1794     kvm_slots_unlock();
1795 }
1796 
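/*
 * Global dirty log sync used on the dirty-ring path: flush the per-vCPU
 * rings into the per-slot bitmaps, optionally merge the kernel's backup
 * bitmap on the last migration stage, and reset the slot bitmaps afterwards
 * since the ring (unlike KVM_GET_DIRTY_LOG) does not overwrite them.
 */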
1797 static void kvm_log_sync_global(MemoryListener *l, bool last_stage)
1798 {
1799     KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
1800     KVMState *s = kvm_state;
1801     KVMSlot *mem;
1802     int i;
1803 
1804     /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
1805     kvm_dirty_ring_flush();
1806 
1807     kvm_slots_lock();
1808     for (i = 0; i < kml->nr_slots_allocated; i++) {
1809         mem = &kml->slots[i];
1810         if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1811             kvm_slot_sync_dirty_pages(mem);
1812 
1813             if (s->kvm_dirty_ring_with_bitmap && last_stage &&
1814                 kvm_slot_get_dirty_log(s, mem)) {
1815                 kvm_slot_sync_dirty_pages(mem);
1816             }
1817 
1818             /*
1819              * This is not needed by KVM_GET_DIRTY_LOG because the
1820              * ioctl will unconditionally overwrite the whole region.
1821              * However, the kvm dirty ring has no such side effect.
1822              */
1823             kvm_slot_reset_dirty_pages(mem);
1824         }
1825     }
1826     kvm_slots_unlock();
1827 }
1828 
1829 static void kvm_log_clear(MemoryListener *listener,
1830                           MemoryRegionSection *section)
1831 {
1832     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1833     int r;
1834 
1835     r = kvm_physical_log_clear(kml, section);
1836     if (r < 0) {
1837         error_report_once("%s: kvm log clear failed: mr=%s "
1838                           "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__,
1839                           section->mr->name, section->offset_within_region,
1840                           int128_get64(section->size));
1841         abort();
1842     }
1843 }
1844 
1845 static void kvm_mem_ioeventfd_add(MemoryListener *listener,
1846                                   MemoryRegionSection *section,
1847                                   bool match_data, uint64_t data,
1848                                   EventNotifier *e)
1849 {
1850     int fd = event_notifier_get_fd(e);
1851     int r;
1852 
1853     r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1854                                data, true, int128_get64(section->size),
1855                                match_data);
1856     if (r < 0) {
1857         fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1858                 __func__, strerror(-r), -r);
1859         abort();
1860     }
1861 }
1862 
1863 static void kvm_mem_ioeventfd_del(MemoryListener *listener,
1864                                   MemoryRegionSection *section,
1865                                   bool match_data, uint64_t data,
1866                                   EventNotifier *e)
1867 {
1868     int fd = event_notifier_get_fd(e);
1869     int r;
1870 
1871     r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1872                                data, false, int128_get64(section->size),
1873                                match_data);
1874     if (r < 0) {
1875         fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1876                 __func__, strerror(-r), -r);
1877         abort();
1878     }
1879 }
1880 
1881 static void kvm_io_ioeventfd_add(MemoryListener *listener,
1882                                  MemoryRegionSection *section,
1883                                  bool match_data, uint64_t data,
1884                                  EventNotifier *e)
1885 {
1886     int fd = event_notifier_get_fd(e);
1887     int r;
1888 
1889     r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1890                               data, true, int128_get64(section->size),
1891                               match_data);
1892     if (r < 0) {
1893         fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1894                 __func__, strerror(-r), -r);
1895         abort();
1896     }
1897 }
1898 
1899 static void kvm_io_ioeventfd_del(MemoryListener *listener,
1900                                  MemoryRegionSection *section,
1901                                  bool match_data, uint64_t data,
1902                                  EventNotifier *e)
1903 
1904 {
1905     int fd = event_notifier_get_fd(e);
1906     int r;
1907 
1908     r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1909                               data, false, int128_get64(section->size),
1910                               match_data);
1911     if (r < 0) {
1912         fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1913                 __func__, strerror(-r), -r);
1914         abort();
1915     }
1916 }
1917 
1918 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
1919                                   AddressSpace *as, int as_id, const char *name)
1920 {
1921     int i;
1922 
1923     kml->as_id = as_id;
1924 
1925     kvm_slots_grow(kml, KVM_MEMSLOTS_NR_ALLOC_DEFAULT);
1926 
1927     QSIMPLEQ_INIT(&kml->transaction_add);
1928     QSIMPLEQ_INIT(&kml->transaction_del);
1929 
1930     kml->listener.region_add = kvm_region_add;
1931     kml->listener.region_del = kvm_region_del;
1932     kml->listener.commit = kvm_region_commit;
1933     kml->listener.log_start = kvm_log_start;
1934     kml->listener.log_stop = kvm_log_stop;
1935     kml->listener.priority = MEMORY_LISTENER_PRIORITY_ACCEL;
1936     kml->listener.name = name;
1937 
1938     if (s->kvm_dirty_ring_size) {
1939         kml->listener.log_sync_global = kvm_log_sync_global;
1940     } else {
1941         kml->listener.log_sync = kvm_log_sync;
1942         kml->listener.log_clear = kvm_log_clear;
1943     }
1944 
1945     memory_listener_register(&kml->listener, as);
1946 
1947     for (i = 0; i < s->nr_as; ++i) {
1948         if (!s->as[i].as) {
1949             s->as[i].as = as;
1950             s->as[i].ml = kml;
1951             break;
1952         }
1953     }
1954 }
1955 
1956 static MemoryListener kvm_io_listener = {
1957     .name = "kvm-io",
1958     .coalesced_io_add = kvm_coalesce_pio_add,
1959     .coalesced_io_del = kvm_coalesce_pio_del,
1960     .eventfd_add = kvm_io_ioeventfd_add,
1961     .eventfd_del = kvm_io_ioeventfd_del,
1962     .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND,
1963 };
1964 
1965 int kvm_set_irq(KVMState *s, int irq, int level)
1966 {
1967     struct kvm_irq_level event;
1968     int ret;
1969 
1970     assert(kvm_async_interrupts_enabled());
1971 
1972     event.level = level;
1973     event.irq = irq;
1974     ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
1975     if (ret < 0) {
1976         perror("kvm_set_irq");
1977         abort();
1978     }
1979 
1980     return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
1981 }
1982 
1983 #ifdef KVM_CAP_IRQ_ROUTING
1984 typedef struct KVMMSIRoute {
1985     struct kvm_irq_routing_entry kroute;
1986     QTAILQ_ENTRY(KVMMSIRoute) entry;
1987 } KVMMSIRoute;
1988 
1989 static void set_gsi(KVMState *s, unsigned int gsi)
1990 {
1991     set_bit(gsi, s->used_gsi_bitmap);
1992 }
1993 
1994 static void clear_gsi(KVMState *s, unsigned int gsi)
1995 {
1996     clear_bit(gsi, s->used_gsi_bitmap);
1997 }
1998 
1999 void kvm_init_irq_routing(KVMState *s)
2000 {
2001     int gsi_count;
2002 
2003     gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
2004     if (gsi_count > 0) {
2005         /* Round up so we can search ints using ffs */
2006         s->used_gsi_bitmap = bitmap_new(gsi_count);
2007         s->gsi_count = gsi_count;
2008     }
2009 
2010     s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
2011     s->nr_allocated_irq_routes = 0;
2012 
2013     kvm_arch_init_irq_routing(s);
2014 }
2015 
2016 void kvm_irqchip_commit_routes(KVMState *s)
2017 {
2018     int ret;
2019 
2020     if (kvm_gsi_direct_mapping()) {
2021         return;
2022     }
2023 
2024     if (!kvm_gsi_routing_enabled()) {
2025         return;
2026     }
2027 
2028     s->irq_routes->flags = 0;
2029     trace_kvm_irqchip_commit_routes();
2030     ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
2031     assert(ret == 0);
2032 }
2033 
2034 void kvm_add_routing_entry(KVMState *s,
2035                            struct kvm_irq_routing_entry *entry)
2036 {
2037     struct kvm_irq_routing_entry *new;
2038     int n, size;
2039 
2040     if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
2041         n = s->nr_allocated_irq_routes * 2;
2042         if (n < 64) {
2043             n = 64;
2044         }
2045         size = sizeof(struct kvm_irq_routing);
2046         size += n * sizeof(*new);
2047         s->irq_routes = g_realloc(s->irq_routes, size);
2048         s->nr_allocated_irq_routes = n;
2049     }
2050     n = s->irq_routes->nr++;
2051     new = &s->irq_routes->entries[n];
2052 
2053     *new = *entry;
2054 
2055     set_gsi(s, entry->gsi);
2056 }
2057 
2058 static int kvm_update_routing_entry(KVMState *s,
2059                                     struct kvm_irq_routing_entry *new_entry)
2060 {
2061     struct kvm_irq_routing_entry *entry;
2062     int n;
2063 
2064     for (n = 0; n < s->irq_routes->nr; n++) {
2065         entry = &s->irq_routes->entries[n];
2066         if (entry->gsi != new_entry->gsi) {
2067             continue;
2068         }
2069 
2070          if (!memcmp(entry, new_entry, sizeof *entry)) {
2071             return 0;
2072         }
2073 
2074         *entry = *new_entry;
2075 
2076         return 0;
2077     }
2078 
2079     return -ESRCH;
2080 }
2081 
2082 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
2083 {
2084     struct kvm_irq_routing_entry e = {};
2085 
2086     assert(pin < s->gsi_count);
2087 
2088     e.gsi = irq;
2089     e.type = KVM_IRQ_ROUTING_IRQCHIP;
2090     e.flags = 0;
2091     e.u.irqchip.irqchip = irqchip;
2092     e.u.irqchip.pin = pin;
2093     kvm_add_routing_entry(s, &e);
2094 }
2095 
2096 void kvm_irqchip_release_virq(KVMState *s, int virq)
2097 {
2098     struct kvm_irq_routing_entry *e;
2099     int i;
2100 
2101     if (kvm_gsi_direct_mapping()) {
2102         return;
2103     }
2104 
2105     for (i = 0; i < s->irq_routes->nr; i++) {
2106         e = &s->irq_routes->entries[i];
2107         if (e->gsi == virq) {
2108             s->irq_routes->nr--;
2109             *e = s->irq_routes->entries[s->irq_routes->nr];
2110         }
2111     }
2112     clear_gsi(s, virq);
2113     kvm_arch_release_virq_post(virq);
2114     trace_kvm_irqchip_release_virq(virq);
2115 }
2116 
2117 void kvm_irqchip_add_change_notifier(Notifier *n)
2118 {
2119     notifier_list_add(&kvm_irqchip_change_notifiers, n);
2120 }
2121 
2122 void kvm_irqchip_remove_change_notifier(Notifier *n)
2123 {
2124     notifier_remove(n);
2125 }
2126 
2127 void kvm_irqchip_change_notify(void)
2128 {
2129     notifier_list_notify(&kvm_irqchip_change_notifiers, NULL);
2130 }
2131 
2132 int kvm_irqchip_get_virq(KVMState *s)
2133 {
2134     int next_virq;
2135 
2136     /* Return the lowest unused GSI in the bitmap */
2137     next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
2138     if (next_virq >= s->gsi_count) {
2139         return -ENOSPC;
2140     } else {
2141         return next_virq;
2142     }
2143 }
2144 
2145 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
2146 {
2147     struct kvm_msi msi;
2148 
2149     msi.address_lo = (uint32_t)msg.address;
2150     msi.address_hi = msg.address >> 32;
2151     msi.data = le32_to_cpu(msg.data);
2152     msi.flags = 0;
2153     memset(msi.pad, 0, sizeof(msi.pad));
2154 
2155     return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
2156 }
2157 
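/*
 * Allocate a virtual GSI and install an MSI routing entry for @vector of
 * @dev.  With direct GSI mapping, the MSI data already encodes the GSI and no
 * routing entry is needed.  The new route is only pushed to the kernel once
 * the accumulated KVMRouteChange is committed via kvm_irqchip_commit_routes().
 */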
2158 int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
2159 {
2160     struct kvm_irq_routing_entry kroute = {};
2161     int virq;
2162     KVMState *s = c->s;
2163     MSIMessage msg = {0, 0};
2164 
2165     if (pci_available && dev) {
2166         msg = pci_get_msi_message(dev, vector);
2167     }
2168 
2169     if (kvm_gsi_direct_mapping()) {
2170         return kvm_arch_msi_data_to_gsi(msg.data);
2171     }
2172 
2173     if (!kvm_gsi_routing_enabled()) {
2174         return -ENOSYS;
2175     }
2176 
2177     virq = kvm_irqchip_get_virq(s);
2178     if (virq < 0) {
2179         return virq;
2180     }
2181 
2182     kroute.gsi = virq;
2183     kroute.type = KVM_IRQ_ROUTING_MSI;
2184     kroute.flags = 0;
2185     kroute.u.msi.address_lo = (uint32_t)msg.address;
2186     kroute.u.msi.address_hi = msg.address >> 32;
2187     kroute.u.msi.data = le32_to_cpu(msg.data);
2188     if (pci_available && kvm_msi_devid_required()) {
2189         kroute.flags = KVM_MSI_VALID_DEVID;
2190         kroute.u.msi.devid = pci_requester_id(dev);
2191     }
2192     if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
2193         kvm_irqchip_release_virq(s, virq);
2194         return -EINVAL;
2195     }
2196 
2197     if (s->irq_routes->nr < s->gsi_count) {
2198         trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
2199                                         vector, virq);
2200 
2201         kvm_add_routing_entry(s, &kroute);
2202         kvm_arch_add_msi_route_post(&kroute, vector, dev);
2203         c->changes++;
2204     } else {
2205         kvm_irqchip_release_virq(s, virq);
2206         return -ENOSPC;
2207     }
2208 
2209     return virq;
2210 }
2211 
2212 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
2213                                  PCIDevice *dev)
2214 {
2215     struct kvm_irq_routing_entry kroute = {};
2216 
2217     if (kvm_gsi_direct_mapping()) {
2218         return 0;
2219     }
2220 
2221     if (!kvm_irqchip_in_kernel()) {
2222         return -ENOSYS;
2223     }
2224 
2225     kroute.gsi = virq;
2226     kroute.type = KVM_IRQ_ROUTING_MSI;
2227     kroute.flags = 0;
2228     kroute.u.msi.address_lo = (uint32_t)msg.address;
2229     kroute.u.msi.address_hi = msg.address >> 32;
2230     kroute.u.msi.data = le32_to_cpu(msg.data);
2231     if (pci_available && kvm_msi_devid_required()) {
2232         kroute.flags = KVM_MSI_VALID_DEVID;
2233         kroute.u.msi.devid = pci_requester_id(dev);
2234     }
2235     if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
2236         return -EINVAL;
2237     }
2238 
2239     trace_kvm_irqchip_update_msi_route(virq);
2240 
2241     return kvm_update_routing_entry(s, &kroute);
2242 }
2243 
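/*
 * Assign or deassign an irqfd (and optionally a resamplefd) for @virq via
 * KVM_IRQFD.  See the comment below for how the resamplefd is emulated in
 * userspace when the split irqchip is in use.
 */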
2244 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2245                                     EventNotifier *resample, int virq,
2246                                     bool assign)
2247 {
2248     int fd = event_notifier_get_fd(event);
2249     int rfd = resample ? event_notifier_get_fd(resample) : -1;
2250 
2251     struct kvm_irqfd irqfd = {
2252         .fd = fd,
2253         .gsi = virq,
2254         .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
2255     };
2256 
2257     if (rfd != -1) {
2258         assert(assign);
2259         if (kvm_irqchip_is_split()) {
2260             /*
2261              * When the slow irqchip (e.g. IOAPIC) lives in
2262              * userspace, the KVM kernel resamplefd will not work because
2263              * the EOI of the interrupt will be delivered to userspace
2264              * instead, so the KVM kernel resamplefd kick will be
2265              * skipped.  Userspace here mimics what the kernel
2266              * provides with resamplefd: remember the resamplefd and
2267              * kick it when we receive the EOI of this IRQ.
2268              *
2269              * This is hackery because the IOAPIC is mostly bypassed
2270              * (except EOI broadcasts) when irqfd is used.  However,
2271              * this brings much performance back for split irqchip
2272              * with INTx IRQs (for VFIO, this gives 93% of the perf of
2273              * the full fast path, a 46% perf boost compared to
2274              * the INTx slow path).
2275              */
2276             kvm_resample_fd_insert(virq, resample);
2277         } else {
2278             irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
2279             irqfd.resamplefd = rfd;
2280         }
2281     } else if (!assign) {
2282         if (kvm_irqchip_is_split()) {
2283             kvm_resample_fd_remove(virq);
2284         }
2285     }
2286 
2287     return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
2288 }
2289 
2290 #else /* !KVM_CAP_IRQ_ROUTING */
2291 
2292 void kvm_init_irq_routing(KVMState *s)
2293 {
2294 }
2295 
2296 void kvm_irqchip_release_virq(KVMState *s, int virq)
2297 {
2298 }
2299 
2300 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
2301 {
2302     abort();
2303 }
2304 
2305 int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
2306 {
2307     return -ENOSYS;
2308 }
2309 
2310 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
2311 {
2312     return -ENOSYS;
2313 }
2314 
2315 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
2316 {
2317     return -ENOSYS;
2318 }
2319 
2320 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2321                                     EventNotifier *resample, int virq,
2322                                     bool assign)
2323 {
2324     abort();
2325 }
2326 
2327 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
2328 {
2329     return -ENOSYS;
2330 }
2331 #endif /* !KVM_CAP_IRQ_ROUTING */
2332 
2333 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
2334                                        EventNotifier *rn, int virq)
2335 {
2336     return kvm_irqchip_assign_irqfd(s, n, rn, virq, true);
2337 }
2338 
2339 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
2340                                           int virq)
2341 {
2342     return kvm_irqchip_assign_irqfd(s, n, NULL, virq, false);
2343 }
2344 
2345 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
2346                                    EventNotifier *rn, qemu_irq irq)
2347 {
2348     gpointer key, gsi;
2349     gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
2350 
2351     if (!found) {
2352         return -ENXIO;
2353     }
2354     return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
2355 }
2356 
2357 int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
2358                                       qemu_irq irq)
2359 {
2360     gpointer key, gsi;
2361     gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
2362 
2363     if (!found) {
2364         return -ENXIO;
2365     }
2366     return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
2367 }
2368 
2369 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
2370 {
2371     g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
2372 }
2373 
2374 static void kvm_irqchip_create(KVMState *s)
2375 {
2376     int ret;
2377 
2378     assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO);
2379     if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
2380         ;
2381     } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
2382         ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
2383         if (ret < 0) {
2384             fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
2385             exit(1);
2386         }
2387     } else {
2388         return;
2389     }
2390 
2391     if (kvm_check_extension(s, KVM_CAP_IRQFD) <= 0) {
2392         fprintf(stderr, "kvm: irqfd not implemented\n");
2393         exit(1);
2394     }
2395 
2396      /* First probe and see if there's an arch-specific hook to create the
2397      * in-kernel irqchip for us */
2398     ret = kvm_arch_irqchip_create(s);
2399     if (ret == 0) {
2400         if (s->kernel_irqchip_split == ON_OFF_AUTO_ON) {
2401             error_report("Split IRQ chip mode not supported.");
2402             exit(1);
2403         } else {
2404             ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
2405         }
2406     }
2407     if (ret < 0) {
2408         fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
2409         exit(1);
2410     }
2411 
2412     kvm_kernel_irqchip = true;
2413     /* If we have an in-kernel IRQ chip then we must have asynchronous
2414      * interrupt delivery (though the reverse is not necessarily true)
2415      */
2416     kvm_async_interrupts_allowed = true;
2417     kvm_halt_in_kernel_allowed = true;
2418 
2419     kvm_init_irq_routing(s);
2420 
2421     s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
2422 }
2423 
2424 /* Find number of supported CPUs using the recommended
2425  * procedure from the kernel API documentation to cope with
2426  * older kernels that may be missing capabilities.
2427  */
2428 static int kvm_recommended_vcpus(KVMState *s)
2429 {
2430     int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
2431     return (ret) ? ret : 4;
2432 }
2433 
2434 static int kvm_max_vcpus(KVMState *s)
2435 {
2436     int ret = kvm_vm_check_extension(s, KVM_CAP_MAX_VCPUS);
2437     return (ret) ? ret : kvm_recommended_vcpus(s);
2438 }
2439 
2440 static int kvm_max_vcpu_id(KVMState *s)
2441 {
2442     int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
2443     return (ret) ? ret : kvm_max_vcpus(s);
2444 }
2445 
2446 bool kvm_vcpu_id_is_valid(int vcpu_id)
2447 {
2448     KVMState *s = KVM_STATE(current_accel());
2449     return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
2450 }
2451 
2452 bool kvm_dirty_ring_enabled(void)
2453 {
2454     return kvm_state && kvm_state->kvm_dirty_ring_size;
2455 }
2456 
2457 static void query_stats_cb(StatsResultList **result, StatsTarget target,
2458                            strList *names, strList *targets, Error **errp);
2459 static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp);
2460 
2461 uint32_t kvm_dirty_ring_size(void)
2462 {
2463     return kvm_state->kvm_dirty_ring_size;
2464 }
2465 
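/*
 * Issue KVM_CREATE_VM, retrying on EINTR (the ioctl can be interrupted by a
 * pending signal).  On failure, print architecture-specific hints for the
 * most common host setup problems.
 */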
2466 static int do_kvm_create_vm(MachineState *ms, int type)
2467 {
2468     KVMState *s;
2469     int ret;
2470 
2471     s = KVM_STATE(ms->accelerator);
2472 
2473     do {
2474         ret = kvm_ioctl(s, KVM_CREATE_VM, type);
2475     } while (ret == -EINTR);
2476 
2477     if (ret < 0) {
2478         error_report("ioctl(KVM_CREATE_VM) failed: %s", strerror(-ret));
2479 
2480 #ifdef TARGET_S390X
2481         if (ret == -EINVAL) {
2482             error_printf("Host kernel setup problem detected."
2483                          " Please verify:\n");
2484             error_printf("- for kernels supporting the"
2485                         " switch_amode or user_mode parameters, whether");
2486             error_printf(" user space is running in primary address space\n");
2487             error_printf("- for kernels supporting the vm.allocate_pgste"
2488                          " sysctl, whether it is enabled\n");
2489         }
2490 #elif defined(TARGET_PPC)
2491         if (ret == -EINVAL) {
2492             error_printf("PPC KVM module is not loaded. Try modprobe kvm_%s.\n",
2493                          (type == 2) ? "pr" : "hv");
2494         }
2495 #endif
2496     }
2497 
2498     return ret;
2499 }
2500 
2501 static int find_kvm_machine_type(MachineState *ms)
2502 {
2503     MachineClass *mc = MACHINE_GET_CLASS(ms);
2504     int type;
2505 
2506     if (object_property_find(OBJECT(current_machine), "kvm-type")) {
2507         g_autofree char *kvm_type;
2508         kvm_type = object_property_get_str(OBJECT(current_machine),
2509                                            "kvm-type",
2510                                            &error_abort);
2511         type = mc->kvm_type(ms, kvm_type);
2512     } else if (mc->kvm_type) {
2513         type = mc->kvm_type(ms, NULL);
2514     } else {
2515         type = kvm_arch_get_default_type(ms);
2516     }
2517     return type;
2518 }
2519 
2520 static int kvm_setup_dirty_ring(KVMState *s)
2521 {
2522     uint64_t dirty_log_manual_caps;
2523     int ret;
2524 
2525     /*
2526      * Enable KVM dirty ring if supported, otherwise fall back to
2527      * dirty logging mode
2528      */
2529     ret = kvm_dirty_ring_init(s);
2530     if (ret < 0) {
2531         return ret;
2532     }
2533 
2534     /*
2535      * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
2536      * enabled.  More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
2537      * page is wr-protected initially, which is against how kvm dirty ring is
2538      * page is wr-protected initially, which is against how the kvm dirty ring is
2539      * used - the kvm dirty ring requires that all pages be wr-protected at the very
2540      *
2541      * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log,
2542      * we may expect a higher stall time when starting the migration.  In the
2543      * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too:
2544      * instead of clearing dirty bit, it can be a way to explicitly wr-protect
2545      * guest pages.
2546      */
2547     if (!s->kvm_dirty_ring_size) {
2548         dirty_log_manual_caps =
2549             kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
2550         dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
2551                                   KVM_DIRTY_LOG_INITIALLY_SET);
2552         s->manual_dirty_log_protect = dirty_log_manual_caps;
2553         if (dirty_log_manual_caps) {
2554             ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
2555                                     dirty_log_manual_caps);
2556             if (ret) {
2557                 warn_report("Trying to enable capability %"PRIu64" of "
2558                             "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. "
2559                             "Falling back to the legacy mode. ",
2560                             dirty_log_manual_caps);
2561                 s->manual_dirty_log_protect = 0;
2562             }
2563         }
2564     }
2565 
2566     return 0;
2567 }
2568 
2569 static int kvm_init(MachineState *ms)
2570 {
2571     MachineClass *mc = MACHINE_GET_CLASS(ms);
2572     static const char upgrade_note[] =
2573         "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
2574         "(see http://sourceforge.net/projects/kvm).\n";
2575     const struct {
2576         const char *name;
2577         int num;
2578     } num_cpus[] = {
2579         { "SMP",          ms->smp.cpus },
2580         { "hotpluggable", ms->smp.max_cpus },
2581         { /* end of list */ }
2582     }, *nc = num_cpus;
2583     int soft_vcpus_limit, hard_vcpus_limit;
2584     KVMState *s;
2585     const KVMCapabilityInfo *missing_cap;
2586     int ret;
2587     int type;
2588 
2589     qemu_mutex_init(&kml_slots_lock);
2590 
2591     s = KVM_STATE(ms->accelerator);
2592 
2593     /*
2594      * On systems where the kernel can support different base page
2595      * sizes, host page size may be different from TARGET_PAGE_SIZE,
2596      * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
2597      * page size for the system though.
2598      */
2599     assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size());
2600 
2601     s->sigmask_len = 8;
2602     accel_blocker_init();
2603 
2604 #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
2605     QTAILQ_INIT(&s->kvm_sw_breakpoints);
2606 #endif
2607     QLIST_INIT(&s->kvm_parked_vcpus);
2608     s->fd = qemu_open_old(s->device ?: "/dev/kvm", O_RDWR);
2609     if (s->fd == -1) {
2610         error_report("Could not access KVM kernel module: %m");
2611         ret = -errno;
2612         goto err;
2613     }
2614 
2615     ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
2616     if (ret < KVM_API_VERSION) {
2617         if (ret >= 0) {
2618             ret = -EINVAL;
2619         }
2620         error_report("kvm version too old");
2621         goto err;
2622     }
2623 
2624     if (ret > KVM_API_VERSION) {
2625         ret = -EINVAL;
2626         error_report("kvm version not supported");
2627         goto err;
2628     }
2629 
2630     kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
2631     s->nr_slots_max = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
2632 
2633     /* If unspecified, use the default value */
2634     if (!s->nr_slots_max) {
2635         s->nr_slots_max = KVM_MEMSLOTS_NR_MAX_DEFAULT;
2636     }
2637 
2638     type = find_kvm_machine_type(ms);
2639     if (type < 0) {
2640         ret = -EINVAL;
2641         goto err;
2642     }
2643 
2644     ret = do_kvm_create_vm(ms, type);
2645     if (ret < 0) {
2646         goto err;
2647     }
2648 
2649     s->vmfd = ret;
2650 
2651     s->nr_as = kvm_vm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
2652     if (s->nr_as <= 1) {
2653         s->nr_as = 1;
2654     }
2655     s->as = g_new0(struct KVMAs, s->nr_as);
2656 
2657     /* check the vcpu limits */
2658     soft_vcpus_limit = kvm_recommended_vcpus(s);
2659     hard_vcpus_limit = kvm_max_vcpus(s);
2660 
2661     while (nc->name) {
2662         if (nc->num > soft_vcpus_limit) {
2663             warn_report("Number of %s cpus requested (%d) exceeds "
2664                         "the recommended cpus supported by KVM (%d)",
2665                         nc->name, nc->num, soft_vcpus_limit);
2666 
2667             if (nc->num > hard_vcpus_limit) {
2668                 error_report("Number of %s cpus requested (%d) exceeds "
2669                              "the maximum cpus supported by KVM (%d)",
2670                              nc->name, nc->num, hard_vcpus_limit);
2671                 exit(1);
2672             }
2673         }
2674         nc++;
2675     }
2676 
2677     missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
2678     if (!missing_cap) {
2679         missing_cap =
2680             kvm_check_extension_list(s, kvm_arch_required_capabilities);
2681     }
2682     if (missing_cap) {
2683         ret = -EINVAL;
2684         error_report("kvm does not support %s", missing_cap->name);
2685         error_printf("%s", upgrade_note);
2686         goto err;
2687     }
2688 
2689     s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
2690     s->coalesced_pio = s->coalesced_mmio &&
2691                        kvm_check_extension(s, KVM_CAP_COALESCED_PIO);
2692 
2693     ret = kvm_setup_dirty_ring(s);
2694     if (ret < 0) {
2695         goto err;
2696     }
2697 
2698 #ifdef KVM_CAP_VCPU_EVENTS
2699     s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
2700 #endif
2701     s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
2702 
2703     s->irq_set_ioctl = KVM_IRQ_LINE;
2704     if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
2705         s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
2706     }
2707 
2708     kvm_readonly_mem_allowed =
2709         (kvm_vm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
2710 
2711     kvm_resamplefds_allowed =
2712         (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
2713 
2714     kvm_vm_attributes_allowed =
2715         (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);
2716 
2717 #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
2718     kvm_has_guest_debug =
2719         (kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG) > 0);
2720 #endif
2721 
2722     kvm_sstep_flags = 0;
2723     if (kvm_has_guest_debug) {
2724         kvm_sstep_flags = SSTEP_ENABLE;
2725 
2726 #if defined TARGET_KVM_HAVE_GUEST_DEBUG
2727         int guest_debug_flags =
2728             kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG2);
2729 
2730         if (guest_debug_flags & KVM_GUESTDBG_BLOCKIRQ) {
2731             kvm_sstep_flags |= SSTEP_NOIRQ;
2732         }
2733 #endif
2734     }
2735 
2736     kvm_state = s;
2737 
2738     ret = kvm_arch_init(ms, s);
2739     if (ret < 0) {
2740         goto err;
2741     }
2742 
2743     kvm_supported_memory_attributes = kvm_vm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES);
2744     kvm_guest_memfd_supported =
2745         kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) &&
2746         kvm_check_extension(s, KVM_CAP_USER_MEMORY2) &&
2747         (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE);
2748 
2749     if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
2750         s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
2751     }
2752 
2753     qemu_register_reset(kvm_unpoison_all, NULL);
2754 
2755     if (s->kernel_irqchip_allowed) {
2756         kvm_irqchip_create(s);
2757     }
2758 
2759     s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
2760     s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
2761     s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region;
2762     s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region;
2763 
2764     kvm_memory_listener_register(s, &s->memory_listener,
2765                                  &address_space_memory, 0, "kvm-memory");
2766     memory_listener_register(&kvm_io_listener,
2767                              &address_space_io);
2768 
2769     s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
2770     if (!s->sync_mmu) {
2771         ret = ram_block_discard_disable(true);
2772         assert(!ret);
2773     }
2774 
2775     if (s->kvm_dirty_ring_size) {
2776         kvm_dirty_ring_reaper_init(s);
2777     }
2778 
2779     if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) {
2780         add_stats_callbacks(STATS_PROVIDER_KVM, query_stats_cb,
2781                             query_stats_schemas_cb);
2782     }
2783 
2784     return 0;
2785 
2786 err:
2787     assert(ret < 0);
2788     if (s->vmfd >= 0) {
2789         close(s->vmfd);
2790     }
2791     if (s->fd != -1) {
2792         close(s->fd);
2793     }
2794     g_free(s->as);
2795     g_free(s->memory_listener.slots);
2796 
2797     return ret;
2798 }
2799 
2800 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
2801 {
2802     s->sigmask_len = sigmask_len;
2803 }
2804 
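/*
 * Complete a KVM_EXIT_IO by replaying each element of the (possibly
 * string/rep) PIO access against address_space_io.
 */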
2805 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
2806                           int size, uint32_t count)
2807 {
2808     int i;
2809     uint8_t *ptr = data;
2810 
2811     for (i = 0; i < count; i++) {
2812         address_space_rw(&address_space_io, port, attrs,
2813                          ptr, size,
2814                          direction == KVM_EXIT_IO_OUT);
2815         ptr += size;
2816     }
2817 }
2818 
2819 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
2820 {
2821     int i;
2822 
2823     fprintf(stderr, "KVM internal error. Suberror: %d\n",
2824             run->internal.suberror);
2825 
2826     for (i = 0; i < run->internal.ndata; ++i) {
2827         fprintf(stderr, "extra data[%d]: 0x%016"PRIx64"\n",
2828                 i, (uint64_t)run->internal.data[i]);
2829     }
2830     if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
2831         fprintf(stderr, "emulation failure\n");
2832         if (!kvm_arch_stop_on_emulation_error(cpu)) {
2833             cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2834             return EXCP_INTERRUPT;
2835         }
2836     }
2837     /* FIXME: Should trigger a qmp message to let management know
2838      * something went wrong.
2839      */
2840     return -1;
2841 }
2842 
2843 void kvm_flush_coalesced_mmio_buffer(void)
2844 {
2845     KVMState *s = kvm_state;
2846 
2847     if (!s || s->coalesced_flush_in_progress) {
2848         return;
2849     }
2850 
2851     s->coalesced_flush_in_progress = true;
2852 
2853     if (s->coalesced_mmio_ring) {
2854         struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
2855         while (ring->first != ring->last) {
2856             struct kvm_coalesced_mmio *ent;
2857 
2858             ent = &ring->coalesced_mmio[ring->first];
2859 
2860             if (ent->pio == 1) {
2861                 address_space_write(&address_space_io, ent->phys_addr,
2862                                     MEMTXATTRS_UNSPECIFIED, ent->data,
2863                                     ent->len);
2864             } else {
2865                 cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
2866             }
2867             smp_wmb();
2868             ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
2869         }
2870     }
2871 
2872     s->coalesced_flush_in_progress = false;
2873 }
2874 
2875 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2876 {
2877     if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
2878         Error *err = NULL;
2879         int ret = kvm_arch_get_registers(cpu, &err);
2880         if (ret) {
2881             if (err) {
2882                 error_reportf_err(err, "Failed to synchronize CPU state: ");
2883             } else {
2884                 error_report("Failed to get registers: %s", strerror(-ret));
2885             }
2886 
2887             cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2888             vm_stop(RUN_STATE_INTERNAL_ERROR);
2889         }
2890 
2891         cpu->vcpu_dirty = true;
2892     }
2893 }
2894 
2895 void kvm_cpu_synchronize_state(CPUState *cpu)
2896 {
2897     if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
2898         run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
2899     }
2900 }
2901 
2902 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
2903 {
2904     Error *err = NULL;
2905     int ret = kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE, &err);
2906     if (ret) {
2907         if (err) {
2908              error_reportf_err(err, "Restoring registers after reset: ");
2909         } else {
2910             error_report("Failed to put registers after reset: %s",
2911                          strerror(-ret));
2912         }
2913         cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2914         vm_stop(RUN_STATE_INTERNAL_ERROR);
2915     }
2916 
2917     cpu->vcpu_dirty = false;
2918 }
2919 
2920 void kvm_cpu_synchronize_post_reset(CPUState *cpu)
2921 {
2922     run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
2923 
2924     if (cpu == first_cpu) {
2925         kvm_reset_parked_vcpus(kvm_state);
2926     }
2927 }
2928 
2929 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
2930 {
2931     Error *err = NULL;
2932     int ret = kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE, &err);
2933     if (ret) {
2934         if (err) {
2935             error_reportf_err(err, "Putting registers after init: ");
2936         } else {
2937             error_report("Failed to put registers after init: %s",
2938                          strerror(-ret));
2939         }
2940         exit(1);
2941     }
2942 
2943     cpu->vcpu_dirty = false;
2944 }
2945 
2946 void kvm_cpu_synchronize_post_init(CPUState *cpu)
2947 {
2948     if (!kvm_state->guest_state_protected) {
2949         /*
2950          * This runs before the machine_init_done notifiers, and is the last
2951          * opportunity to synchronize the state of confidential guests.
2952          */
2953         run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
2954     }
2955 }
2956 
2957 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
2958 {
2959     cpu->vcpu_dirty = true;
2960 }
2961 
2962 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
2963 {
2964     run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
2965 }
2966 
2967 #ifdef KVM_HAVE_MCE_INJECTION
2968 static __thread void *pending_sigbus_addr;
2969 static __thread int pending_sigbus_code;
2970 static __thread bool have_sigbus_pending;
2971 #endif
2972 
2973 static void kvm_cpu_kick(CPUState *cpu)
2974 {
2975     qatomic_set(&cpu->kvm_run->immediate_exit, 1);
2976 }
2977 
2978 static void kvm_cpu_kick_self(void)
2979 {
2980     if (kvm_immediate_exit) {
2981         kvm_cpu_kick(current_cpu);
2982     } else {
2983         qemu_cpu_kick_self();
2984     }
2985 }
2986 
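/*
 * Drain pending SIG_IPI wakeups after a KVM_RUN exit.  With
 * KVM_CAP_IMMEDIATE_EXIT there is no signal to consume; just clear
 * immediate_exit before the next run.
 */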
2987 static void kvm_eat_signals(CPUState *cpu)
2988 {
2989     struct timespec ts = { 0, 0 };
2990     siginfo_t siginfo;
2991     sigset_t waitset;
2992     sigset_t chkset;
2993     int r;
2994 
2995     if (kvm_immediate_exit) {
2996         qatomic_set(&cpu->kvm_run->immediate_exit, 0);
2997         /* Write kvm_run->immediate_exit before the cpu->exit_request
2998          * write in kvm_cpu_exec.
2999          */
3000         smp_wmb();
3001         return;
3002     }
3003 
3004     sigemptyset(&waitset);
3005     sigaddset(&waitset, SIG_IPI);
3006 
3007     do {
3008         r = sigtimedwait(&waitset, &siginfo, &ts);
3009         if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
3010             perror("sigtimedwait");
3011             exit(1);
3012         }
3013 
3014         r = sigpending(&chkset);
3015         if (r == -1) {
3016             perror("sigpending");
3017             exit(1);
3018         }
3019     } while (sigismember(&chkset, SIG_IPI));
3020 }
3021 
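/*
 * Convert a guest-memfd backed range between private and shared in response
 * to a guest request: update the KVM memory attributes, then discard the
 * now-unused backing (the shared RAM when going private, the guest_memfd
 * pages when going shared) so both copies are not resident at once.
 */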
3022 int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
3023 {
3024     MemoryRegionSection section;
3025     ram_addr_t offset;
3026     MemoryRegion *mr;
3027     RAMBlock *rb;
3028     void *addr;
3029     int ret = -EINVAL;
3030 
3031     trace_kvm_convert_memory(start, size, to_private ? "shared_to_private" : "private_to_shared");
3032 
3033     if (!QEMU_PTR_IS_ALIGNED(start, qemu_real_host_page_size()) ||
3034         !QEMU_PTR_IS_ALIGNED(size, qemu_real_host_page_size())) {
3035         return ret;
3036     }
3037 
3038     if (!size) {
3039         return ret;
3040     }
3041 
3042     section = memory_region_find(get_system_memory(), start, size);
3043     mr = section.mr;
3044     if (!mr) {
3045         /*
3046          * Ignore converting non-assigned region to shared.
3047          *
3048          * TDX requires vMMIO region to be shared to inject #VE to guest.
3049          * OVMF conservatively issues MapGPA(shared) on the 32bit PCI MMIO region,
3050          * and vIO-APIC 0xFEC00000 4K page.
3051          * OVMF assigns 32bit PCI MMIO region to
3052          * [top of low memory: typically 2GB=0xC000000,  0xFC00000)
3053          */
3054         if (!to_private) {
3055             return 0;
3056         }
3057         return ret;
3058     }
3059 
3060     if (!memory_region_has_guest_memfd(mr)) {
3061         /*
3062          * Because vMMIO regions must be shared, the guest TD may convert a vMMIO
3063          * region to shared explicitly.  Don't complain in that case.  See
3064          * memory_region_type() for checking whether the region is an MMIO region.
3065          */
3066         if (!to_private &&
3067             !memory_region_is_ram(mr) &&
3068             !memory_region_is_ram_device(mr) &&
3069             !memory_region_is_rom(mr) &&
3070             !memory_region_is_romd(mr)) {
3071             ret = 0;
3072         } else {
3073             error_report("Convert non guest_memfd backed memory region "
3074                         "(0x%"HWADDR_PRIx" ,+ 0x%"HWADDR_PRIx") to %s",
3075                         start, size, to_private ? "private" : "shared");
3076         }
3077         goto out_unref;
3078     }
3079 
3080     if (to_private) {
3081         ret = kvm_set_memory_attributes_private(start, size);
3082     } else {
3083         ret = kvm_set_memory_attributes_shared(start, size);
3084     }
3085     if (ret) {
3086         goto out_unref;
3087     }
3088 
3089     addr = memory_region_get_ram_ptr(mr) + section.offset_within_region;
3090     rb = qemu_ram_block_from_host(addr, false, &offset);
3091 
3092     if (to_private) {
3093         if (rb->page_size != qemu_real_host_page_size()) {
3094             /*
3095              * shared memory is backed by hugetlb, which is supposed to be
3096              * pre-allocated and doesn't need to be discarded
3097              */
3098             goto out_unref;
3099         }
3100         ret = ram_block_discard_range(rb, offset, size);
3101     } else {
3102         ret = ram_block_discard_guest_memfd_range(rb, offset, size);
3103     }
3104 
3105 out_unref:
3106     memory_region_unref(mr);
3107     return ret;
3108 }
3109 
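/*
 * Main vCPU execution loop: push dirty register state into the kernel, enter
 * KVM_RUN outside the BQL, and dispatch each exit reason (PIO, MMIO,
 * dirty-ring full, internal errors, ...) until an event requires returning to
 * the main loop.
 */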
3110 int kvm_cpu_exec(CPUState *cpu)
3111 {
3112     struct kvm_run *run = cpu->kvm_run;
3113     int ret, run_ret;
3114 
3115     trace_kvm_cpu_exec();
3116 
3117     if (kvm_arch_process_async_events(cpu)) {
3118         qatomic_set(&cpu->exit_request, 0);
3119         return EXCP_HLT;
3120     }
3121 
3122     bql_unlock();
3123     cpu_exec_start(cpu);
3124 
3125     do {
3126         MemTxAttrs attrs;
3127 
3128         if (cpu->vcpu_dirty) {
3129             Error *err = NULL;
3130             ret = kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE, &err);
3131             if (ret) {
3132                 if (err) {
3133                      error_reportf_err(err, "Putting runtime registers: ");
3134                  } else {
3135                      error_report("Failed to put runtime registers: %s",
3136                                  strerror(-ret));
3137                 }
3138                 ret = -1;
3139                 break;
3140             }
3141 
3142             cpu->vcpu_dirty = false;
3143         }
3144 
3145         kvm_arch_pre_run(cpu, run);
3146         if (qatomic_read(&cpu->exit_request)) {
3147             trace_kvm_interrupt_exit_request();
3148             /*
3149              * KVM requires us to reenter the kernel after IO exits to complete
3150              * instruction emulation. This self-signal will ensure that we
3151              * leave ASAP again.
3152              */
3153             kvm_cpu_kick_self();
3154         }
3155 
3156         /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
3157          * Matching barrier in kvm_eat_signals.
3158          */
3159         smp_rmb();
3160 
3161         run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
3162 
3163         attrs = kvm_arch_post_run(cpu, run);
3164 
3165 #ifdef KVM_HAVE_MCE_INJECTION
3166         if (unlikely(have_sigbus_pending)) {
3167             bql_lock();
3168             kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
3169                                     pending_sigbus_addr);
3170             have_sigbus_pending = false;
3171             bql_unlock();
3172         }
3173 #endif
3174 
3175         if (run_ret < 0) {
3176             if (run_ret == -EINTR || run_ret == -EAGAIN) {
3177                 trace_kvm_io_window_exit();
3178                 kvm_eat_signals(cpu);
3179                 ret = EXCP_INTERRUPT;
3180                 break;
3181             }
3182             if (!(run_ret == -EFAULT && run->exit_reason == KVM_EXIT_MEMORY_FAULT)) {
3183                 fprintf(stderr, "error: kvm run failed %s\n",
3184                         strerror(-run_ret));
3185 #ifdef TARGET_PPC
3186                 if (run_ret == -EBUSY) {
3187                     fprintf(stderr,
3188                             "This is probably because your SMT is enabled.\n"
3189                             "VCPU can only run on primary threads with all "
3190                             "secondary threads offline.\n");
3191                 }
3192 #endif
3193                 ret = -1;
3194                 break;
3195             }
3196         }
3197 
3198         trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
3199         switch (run->exit_reason) {
3200         case KVM_EXIT_IO:
3201             /* Called outside BQL */
3202             kvm_handle_io(run->io.port, attrs,
3203                           (uint8_t *)run + run->io.data_offset,
3204                           run->io.direction,
3205                           run->io.size,
3206                           run->io.count);
3207             ret = 0;
3208             break;
3209         case KVM_EXIT_MMIO:
3210             /* Called outside BQL */
3211             address_space_rw(&address_space_memory,
3212                              run->mmio.phys_addr, attrs,
3213                              run->mmio.data,
3214                              run->mmio.len,
3215                              run->mmio.is_write);
3216             ret = 0;
3217             break;
3218         case KVM_EXIT_IRQ_WINDOW_OPEN:
3219             ret = EXCP_INTERRUPT;
3220             break;
3221         case KVM_EXIT_SHUTDOWN:
3222             qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
3223             ret = EXCP_INTERRUPT;
3224             break;
3225         case KVM_EXIT_UNKNOWN:
3226             fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
3227                     (uint64_t)run->hw.hardware_exit_reason);
3228             ret = -1;
3229             break;
3230         case KVM_EXIT_INTERNAL_ERROR:
3231             ret = kvm_handle_internal_error(cpu, run);
3232             break;
3233         case KVM_EXIT_DIRTY_RING_FULL:
3234             /*
3235              * We shouldn't continue if the dirty ring of this vcpu is
3236              * still full.  Got kicked by KVM_RESET_DIRTY_RINGS.
3237              */
3238             trace_kvm_dirty_ring_full(cpu->cpu_index);
3239             bql_lock();
3240             /*
3241              * Throttle the vCPU by making it sleep once it exits to
3242              * userspace because its dirty ring is full.  In the dirtylimit
3243              * scenario, reaping all vCPUs after one vCPU's ring fills up
3244              * would skip that sleep, so only reap the vCPU whose ring is full.
3245              */
3246             if (dirtylimit_in_service()) {
3247                 kvm_dirty_ring_reap(kvm_state, cpu);
3248             } else {
3249                 kvm_dirty_ring_reap(kvm_state, NULL);
3250             }
3251             bql_unlock();
3252             dirtylimit_vcpu_execute(cpu);
3253             ret = 0;
3254             break;
3255         case KVM_EXIT_SYSTEM_EVENT:
3256             trace_kvm_run_exit_system_event(cpu->cpu_index, run->system_event.type);
3257             switch (run->system_event.type) {
3258             case KVM_SYSTEM_EVENT_SHUTDOWN:
3259                 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
3260                 ret = EXCP_INTERRUPT;
3261                 break;
3262             case KVM_SYSTEM_EVENT_RESET:
3263                 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
3264                 ret = EXCP_INTERRUPT;
3265                 break;
3266             case KVM_SYSTEM_EVENT_CRASH:
3267                 kvm_cpu_synchronize_state(cpu);
3268                 bql_lock();
3269                 qemu_system_guest_panicked(cpu_get_crash_info(cpu));
3270                 bql_unlock();
3271                 ret = 0;
3272                 break;
3273             default:
3274                 ret = kvm_arch_handle_exit(cpu, run);
3275                 break;
3276             }
3277             break;
3278         case KVM_EXIT_MEMORY_FAULT:
3279             trace_kvm_memory_fault(run->memory_fault.gpa,
3280                                    run->memory_fault.size,
3281                                    run->memory_fault.flags);
3282             if (run->memory_fault.flags & ~KVM_MEMORY_EXIT_FLAG_PRIVATE) {
3283                 error_report("KVM_EXIT_MEMORY_FAULT: Unknown flag 0x%" PRIx64,
3284                              (uint64_t)run->memory_fault.flags);
3285                 ret = -1;
3286                 break;
3287             }
3288             ret = kvm_convert_memory(run->memory_fault.gpa, run->memory_fault.size,
3289                                      run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE);
3290             break;
3291         default:
3292             ret = kvm_arch_handle_exit(cpu, run);
3293             break;
3294         }
3295     } while (ret == 0);
3296 
3297     cpu_exec_end(cpu);
3298     bql_lock();
3299 
3300     if (ret < 0) {
3301         cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
3302         vm_stop(RUN_STATE_INTERNAL_ERROR);
3303     }
3304 
3305     qatomic_set(&cpu->exit_request, 0);
3306     return ret;
3307 }
3308 
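/*
 * Thin wrappers around ioctl() for the KVM system, VM, vCPU and device file
 * descriptors.  On failure they return -errno instead of -1, so a caller can
 * test the result directly; illustrative use only:
 *
 *     ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_IRQ_ROUTING);
 *     if (ret < 0) {
 *         error_report("KVM_CHECK_EXTENSION failed: %s", strerror(-ret));
 *     }
 *
 * The VM and device variants bracket the ioctl with accel_ioctl_begin()/
 * accel_ioctl_end(), and the vCPU variant with accel_cpu_ioctl_begin()/
 * accel_cpu_ioctl_end().
 */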
3309 int kvm_ioctl(KVMState *s, unsigned long type, ...)
3310 {
3311     int ret;
3312     void *arg;
3313     va_list ap;
3314 
3315     va_start(ap, type);
3316     arg = va_arg(ap, void *);
3317     va_end(ap);
3318 
3319     trace_kvm_ioctl(type, arg);
3320     ret = ioctl(s->fd, type, arg);
3321     if (ret == -1) {
3322         ret = -errno;
3323     }
3324     return ret;
3325 }
3326 
3327 int kvm_vm_ioctl(KVMState *s, unsigned long type, ...)
3328 {
3329     int ret;
3330     void *arg;
3331     va_list ap;
3332 
3333     va_start(ap, type);
3334     arg = va_arg(ap, void *);
3335     va_end(ap);
3336 
3337     trace_kvm_vm_ioctl(type, arg);
3338     accel_ioctl_begin();
3339     ret = ioctl(s->vmfd, type, arg);
3340     accel_ioctl_end();
3341     if (ret == -1) {
3342         ret = -errno;
3343     }
3344     return ret;
3345 }
3346 
3347 int kvm_vcpu_ioctl(CPUState *cpu, unsigned long type, ...)
3348 {
3349     int ret;
3350     void *arg;
3351     va_list ap;
3352 
3353     va_start(ap, type);
3354     arg = va_arg(ap, void *);
3355     va_end(ap);
3356 
3357     trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
3358     accel_cpu_ioctl_begin(cpu);
3359     ret = ioctl(cpu->kvm_fd, type, arg);
3360     accel_cpu_ioctl_end(cpu);
3361     if (ret == -1) {
3362         ret = -errno;
3363     }
3364     return ret;
3365 }
3366 
3367 int kvm_device_ioctl(int fd, unsigned long type, ...)
3368 {
3369     int ret;
3370     void *arg;
3371     va_list ap;
3372 
3373     va_start(ap, type);
3374     arg = va_arg(ap, void *);
3375     va_end(ap);
3376 
3377     trace_kvm_device_ioctl(fd, type, arg);
3378     accel_ioctl_begin();
3379     ret = ioctl(fd, type, arg);
3380     accel_ioctl_end();
3381     if (ret == -1) {
3382         ret = -errno;
3383     }
3384     return ret;
3385 }
3386 
3387 int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
3388 {
3389     int ret;
3390     struct kvm_device_attr attribute = {
3391         .group = group,
3392         .attr = attr,
3393     };
3394 
3395     if (!kvm_vm_attributes_allowed) {
3396         return 0;
3397     }
3398 
3399     ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
3400     /* kvm returns 0 on success for HAS_DEVICE_ATTR */
3401     return ret ? 0 : 1;
3402 }
3403 
3404 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
3405 {
3406     struct kvm_device_attr attribute = {
3407         .group = group,
3408         .attr = attr,
3409         .flags = 0,
3410     };
3411 
3412     return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
3413 }
3414 
3415 int kvm_device_access(int fd, int group, uint64_t attr,
3416                       void *val, bool write, Error **errp)
3417 {
3418     struct kvm_device_attr kvmattr;
3419     int err;
3420 
3421     kvmattr.flags = 0;
3422     kvmattr.group = group;
3423     kvmattr.attr = attr;
3424     kvmattr.addr = (uintptr_t)val;
3425 
3426     err = kvm_device_ioctl(fd,
3427                            write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
3428                            &kvmattr);
3429     if (err < 0) {
3430         error_setg_errno(errp, -err,
3431                          "KVM_%s_DEVICE_ATTR failed: Group %d "
3432                          "attr 0x%016" PRIx64,
3433                          write ? "SET" : "GET", group, attr);
3434     }
3435     return err;
3436 }
3437 
3438 bool kvm_has_sync_mmu(void)
3439 {
3440     return kvm_state->sync_mmu;
3441 }
3442 
3443 int kvm_has_vcpu_events(void)
3444 {
3445     return kvm_state->vcpu_events;
3446 }
3447 
3448 int kvm_max_nested_state_length(void)
3449 {
3450     return kvm_state->max_nested_state_len;
3451 }
3452 
3453 int kvm_has_gsi_routing(void)
3454 {
3455 #ifdef KVM_CAP_IRQ_ROUTING
3456     return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
3457 #else
3458     return false;
3459 #endif
3460 }
3461 
3462 bool kvm_arm_supports_user_irq(void)
3463 {
3464     return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
3465 }
3466 
3467 #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
3468 struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, vaddr pc)
3469 {
3470     struct kvm_sw_breakpoint *bp;
3471 
3472     QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
3473         if (bp->pc == pc) {
3474             return bp;
3475         }
3476     }
3477     return NULL;
3478 }
3479 
3480 int kvm_sw_breakpoints_active(CPUState *cpu)
3481 {
3482     return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
3483 }
3484 
3485 struct kvm_set_guest_debug_data {
3486     struct kvm_guest_debug dbg;
3487     int err;
3488 };
3489 
3490 static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
3491 {
3492     struct kvm_set_guest_debug_data *dbg_data =
3493         (struct kvm_set_guest_debug_data *) data.host_ptr;
3494 
3495     dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
3496                                    &dbg_data->dbg);
3497 }
3498 
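/*
 * Build the kvm_guest_debug control word from the vCPU's single-step state
 * (plus any architecture-specific bits added by kvm_arch_update_guest_debug())
 * and apply it with KVM_SET_GUEST_DEBUG via run_on_cpu(), i.e. on the vCPU's
 * own thread.
 */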
3499 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
3500 {
3501     struct kvm_set_guest_debug_data data;
3502 
3503     data.dbg.control = reinject_trap;
3504 
3505     if (cpu->singlestep_enabled) {
3506         data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
3507 
3508         if (cpu->singlestep_enabled & SSTEP_NOIRQ) {
3509             data.dbg.control |= KVM_GUESTDBG_BLOCKIRQ;
3510         }
3511     }
3512     kvm_arch_update_guest_debug(cpu, &data.dbg);
3513 
3514     run_on_cpu(cpu, kvm_invoke_set_guest_debug,
3515                RUN_ON_CPU_HOST_PTR(&data));
3516     return data.err;
3517 }
3518 
3519 bool kvm_supports_guest_debug(void)
3520 {
3521     /* probed during kvm_init() */
3522     return kvm_has_guest_debug;
3523 }
3524 
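/*
 * gdbstub breakpoint handling: software breakpoints are reference-counted in
 * the per-VM kvm_sw_breakpoints list and installed through
 * kvm_arch_insert_sw_breakpoint(); hardware breakpoints go straight to the
 * architecture hooks.  After any change, guest debug state is refreshed on
 * every vCPU.
 */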
3525 int kvm_insert_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
3526 {
3527     struct kvm_sw_breakpoint *bp;
3528     int err;
3529 
3530     if (type == GDB_BREAKPOINT_SW) {
3531         bp = kvm_find_sw_breakpoint(cpu, addr);
3532         if (bp) {
3533             bp->use_count++;
3534             return 0;
3535         }
3536 
3537         bp = g_new(struct kvm_sw_breakpoint, 1);
3538         bp->pc = addr;
3539         bp->use_count = 1;
3540         err = kvm_arch_insert_sw_breakpoint(cpu, bp);
3541         if (err) {
3542             g_free(bp);
3543             return err;
3544         }
3545 
3546         QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
3547     } else {
3548         err = kvm_arch_insert_hw_breakpoint(addr, len, type);
3549         if (err) {
3550             return err;
3551         }
3552     }
3553 
3554     CPU_FOREACH(cpu) {
3555         err = kvm_update_guest_debug(cpu, 0);
3556         if (err) {
3557             return err;
3558         }
3559     }
3560     return 0;
3561 }
3562 
3563 int kvm_remove_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
3564 {
3565     struct kvm_sw_breakpoint *bp;
3566     int err;
3567 
3568     if (type == GDB_BREAKPOINT_SW) {
3569         bp = kvm_find_sw_breakpoint(cpu, addr);
3570         if (!bp) {
3571             return -ENOENT;
3572         }
3573 
3574         if (bp->use_count > 1) {
3575             bp->use_count--;
3576             return 0;
3577         }
3578 
3579         err = kvm_arch_remove_sw_breakpoint(cpu, bp);
3580         if (err) {
3581             return err;
3582         }
3583 
3584         QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
3585         g_free(bp);
3586     } else {
3587         err = kvm_arch_remove_hw_breakpoint(addr, len, type);
3588         if (err) {
3589             return err;
3590         }
3591     }
3592 
3593     CPU_FOREACH(cpu) {
3594         err = kvm_update_guest_debug(cpu, 0);
3595         if (err) {
3596             return err;
3597         }
3598     }
3599     return 0;
3600 }
3601 
3602 void kvm_remove_all_breakpoints(CPUState *cpu)
3603 {
3604     struct kvm_sw_breakpoint *bp, *next;
3605     KVMState *s = cpu->kvm_state;
3606     CPUState *tmpcpu;
3607 
3608     QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
3609         if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
3610             /* Try harder to find a CPU that currently sees the breakpoint. */
3611             CPU_FOREACH(tmpcpu) {
3612                 if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
3613                     break;
3614                 }
3615             }
3616         }
3617         QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
3618         g_free(bp);
3619     }
3620     kvm_arch_remove_all_hw_breakpoints();
3621 
3622     CPU_FOREACH(cpu) {
3623         kvm_update_guest_debug(cpu, 0);
3624     }
3625 }
3626 
3627 #endif /* TARGET_KVM_HAVE_GUEST_DEBUG */
3628 
3629 static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
3630 {
3631     KVMState *s = kvm_state;
3632     struct kvm_signal_mask *sigmask;
3633     int r;
3634 
3635     sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
3636 
3637     sigmask->len = s->sigmask_len;
3638     memcpy(sigmask->sigset, sigset, sizeof(*sigset));
3639     r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
3640     g_free(sigmask);
3641 
3642     return r;
3643 }
3644 
3645 static void kvm_ipi_signal(int sig)
3646 {
3647     if (current_cpu) {
3648         assert(kvm_immediate_exit);
3649         kvm_cpu_kick(current_cpu);
3650     }
3651 }
3652 
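/*
 * Per-vCPU signal setup: install the SIG_IPI handler used to kick a vCPU out
 * of KVM_RUN and unblock SIG_IPI (and SIGBUS when MCE injection is supported).
 * When kvm_immediate_exit is set the mask is applied directly with
 * pthread_sigmask(); otherwise it is handed to the kernel with
 * KVM_SET_SIGNAL_MASK so it only takes effect while the vCPU is inside KVM_RUN.
 */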
3653 void kvm_init_cpu_signals(CPUState *cpu)
3654 {
3655     int r;
3656     sigset_t set;
3657     struct sigaction sigact;
3658 
3659     memset(&sigact, 0, sizeof(sigact));
3660     sigact.sa_handler = kvm_ipi_signal;
3661     sigaction(SIG_IPI, &sigact, NULL);
3662 
3663     pthread_sigmask(SIG_BLOCK, NULL, &set);
3664 #if defined KVM_HAVE_MCE_INJECTION
3665     sigdelset(&set, SIGBUS);
3666     pthread_sigmask(SIG_SETMASK, &set, NULL);
3667 #endif
3668     sigdelset(&set, SIG_IPI);
3669     if (kvm_immediate_exit) {
3670         r = pthread_sigmask(SIG_SETMASK, &set, NULL);
3671     } else {
3672         r = kvm_set_signal_mask(cpu, &set);
3673     }
3674     if (r) {
3675         fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
3676         exit(1);
3677     }
3678 }
3679 
3680 /* Called asynchronously in VCPU thread.  */
3681 int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
3682 {
3683 #ifdef KVM_HAVE_MCE_INJECTION
3684     if (have_sigbus_pending) {
3685         return 1;
3686     }
3687     have_sigbus_pending = true;
3688     pending_sigbus_addr = addr;
3689     pending_sigbus_code = code;
3690     qatomic_set(&cpu->exit_request, 1);
3691     return 0;
3692 #else
3693     return 1;
3694 #endif
3695 }
3696 
3697 /* Called synchronously (via signalfd) in main thread.  */
3698 int kvm_on_sigbus(int code, void *addr)
3699 {
3700 #ifdef KVM_HAVE_MCE_INJECTION
3701     /* An action-required MCE kills the process if SIGBUS is blocked.  SIGBUS
3702      * is blocked in the I/O thread, where we handle MCEs via signalfd, so only
3703      * action-optional MCEs can reach this point.
3704      */
3705     assert(code != BUS_MCEERR_AR);
3706     kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
3707     return 0;
3708 #else
3709     return 1;
3710 #endif
3711 }
3712 
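/*
 * Create (or, with test == true, only probe for) an in-kernel device of the
 * given type via KVM_CREATE_DEVICE.  Returns the new device fd on success,
 * 0 for a successful probe, or a negative errno value.
 */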
3713 int kvm_create_device(KVMState *s, uint64_t type, bool test)
3714 {
3715     int ret;
3716     struct kvm_create_device create_dev;
3717 
3718     create_dev.type = type;
3719     create_dev.fd = -1;
3720     create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
3721 
3722     if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
3723         return -ENOTSUP;
3724     }
3725 
3726     ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
3727     if (ret) {
3728         return ret;
3729     }
3730 
3731     return test ? 0 : create_dev.fd;
3732 }
3733 
3734 bool kvm_device_supported(int vmfd, uint64_t type)
3735 {
3736     struct kvm_create_device create_dev = {
3737         .type = type,
3738         .fd = -1,
3739         .flags = KVM_CREATE_DEVICE_TEST,
3740     };
3741 
3742     if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
3743         return false;
3744     }
3745 
3746     return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
3747 }
3748 
3749 int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
3750 {
3751     struct kvm_one_reg reg;
3752     int r;
3753 
3754     reg.id = id;
3755     reg.addr = (uintptr_t) source;
3756     r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
3757     if (r) {
3758         trace_kvm_failed_reg_set(id, strerror(-r));
3759     }
3760     return r;
3761 }
3762 
3763 int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
3764 {
3765     struct kvm_one_reg reg;
3766     int r;
3767 
3768     reg.id = id;
3769     reg.addr = (uintptr_t) target;
3770     r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
3771     if (r) {
3772         trace_kvm_failed_reg_get(id, strerror(-r));
3773     }
3774     return r;
3775 }
3776 
3777 static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
3778                                  hwaddr start_addr, hwaddr size)
3779 {
3780     KVMState *kvm = KVM_STATE(ms->accelerator);
3781     int i;
3782 
3783     for (i = 0; i < kvm->nr_as; ++i) {
3784         if (kvm->as[i].as == as && kvm->as[i].ml) {
3785             size = MIN(kvm_max_slot_size, size);
3786             return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
3787                                                     start_addr, size);
3788         }
3789     }
3790 
3791     return false;
3792 }
3793 
3794 static void kvm_get_kvm_shadow_mem(Object *obj, Visitor *v,
3795                                    const char *name, void *opaque,
3796                                    Error **errp)
3797 {
3798     KVMState *s = KVM_STATE(obj);
3799     int64_t value = s->kvm_shadow_mem;
3800 
3801     visit_type_int(v, name, &value, errp);
3802 }
3803 
3804 static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v,
3805                                    const char *name, void *opaque,
3806                                    Error **errp)
3807 {
3808     KVMState *s = KVM_STATE(obj);
3809     int64_t value;
3810 
3811     if (s->fd != -1) {
3812         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3813         return;
3814     }
3815 
3816     if (!visit_type_int(v, name, &value, errp)) {
3817         return;
3818     }
3819 
3820     s->kvm_shadow_mem = value;
3821 }
3822 
3823 static void kvm_set_kernel_irqchip(Object *obj, Visitor *v,
3824                                    const char *name, void *opaque,
3825                                    Error **errp)
3826 {
3827     KVMState *s = KVM_STATE(obj);
3828     OnOffSplit mode;
3829 
3830     if (s->fd != -1) {
3831         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3832         return;
3833     }
3834 
3835     if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
3836         return;
3837     }
3838     switch (mode) {
3839     case ON_OFF_SPLIT_ON:
3840         s->kernel_irqchip_allowed = true;
3841         s->kernel_irqchip_required = true;
3842         s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
3843         break;
3844     case ON_OFF_SPLIT_OFF:
3845         s->kernel_irqchip_allowed = false;
3846         s->kernel_irqchip_required = false;
3847         s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
3848         break;
3849     case ON_OFF_SPLIT_SPLIT:
3850         s->kernel_irqchip_allowed = true;
3851         s->kernel_irqchip_required = true;
3852         s->kernel_irqchip_split = ON_OFF_AUTO_ON;
3853         break;
3854     default:
3855         /* The value was checked in visit_type_OnOffSplit() above. If
3856          * we get here, then something is wrong in QEMU.
3857          */
3858         abort();
3859     }
3860 }
3861 
3862 bool kvm_kernel_irqchip_allowed(void)
3863 {
3864     return kvm_state->kernel_irqchip_allowed;
3865 }
3866 
3867 bool kvm_kernel_irqchip_required(void)
3868 {
3869     return kvm_state->kernel_irqchip_required;
3870 }
3871 
3872 bool kvm_kernel_irqchip_split(void)
3873 {
3874     return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON;
3875 }
3876 
3877 static void kvm_get_dirty_ring_size(Object *obj, Visitor *v,
3878                                     const char *name, void *opaque,
3879                                     Error **errp)
3880 {
3881     KVMState *s = KVM_STATE(obj);
3882     uint32_t value = s->kvm_dirty_ring_size;
3883 
3884     visit_type_uint32(v, name, &value, errp);
3885 }
3886 
3887 static void kvm_set_dirty_ring_size(Object *obj, Visitor *v,
3888                                     const char *name, void *opaque,
3889                                     Error **errp)
3890 {
3891     KVMState *s = KVM_STATE(obj);
3892     uint32_t value;
3893 
3894     if (s->fd != -1) {
3895         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3896         return;
3897     }
3898 
3899     if (!visit_type_uint32(v, name, &value, errp)) {
3900         return;
3901     }
3902     if (value & (value - 1)) {
3903         error_setg(errp, "dirty-ring-size must be a power of two.");
3904         return;
3905     }
3906 
3907     s->kvm_dirty_ring_size = value;
3908 }
3909 
3910 static char *kvm_get_device(Object *obj,
3911                             Error **errp G_GNUC_UNUSED)
3912 {
3913     KVMState *s = KVM_STATE(obj);
3914 
3915     return g_strdup(s->device);
3916 }
3917 
3918 static void kvm_set_device(Object *obj,
3919                            const char *value,
3920                            Error **errp G_GNUC_UNUSED)
3921 {
3922     KVMState *s = KVM_STATE(obj);
3923 
3924     g_free(s->device);
3925     s->device = g_strdup(value);
3926 }
3927 
3928 static void kvm_set_kvm_rapl(Object *obj, bool value, Error **errp)
3929 {
3930     KVMState *s = KVM_STATE(obj);
3931     s->msr_energy.enable = value;
3932 }
3933 
3934 static void kvm_set_kvm_rapl_socket_path(Object *obj,
3935                                          const char *str,
3936                                          Error **errp)
3937 {
3938     KVMState *s = KVM_STATE(obj);
3939     g_free(s->msr_energy.socket_path);
3940     s->msr_energy.socket_path = g_strdup(str);
3941 }
3942 
3943 static void kvm_accel_instance_init(Object *obj)
3944 {
3945     KVMState *s = KVM_STATE(obj);
3946 
3947     s->fd = -1;
3948     s->vmfd = -1;
3949     s->kvm_shadow_mem = -1;
3950     s->kernel_irqchip_allowed = true;
3951     s->kernel_irqchip_split = ON_OFF_AUTO_AUTO;
3952     /* KVM dirty ring is by default off */
3953     s->kvm_dirty_ring_size = 0;
3954     s->kvm_dirty_ring_with_bitmap = false;
3955     s->kvm_eager_split_size = 0;
3956     s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN;
3957     s->notify_window = 0;
3958     s->xen_version = 0;
3959     s->xen_gnttab_max_frames = 64;
3960     s->xen_evtchn_max_pirq = 256;
3961     s->device = NULL;
3962     s->msr_energy.enable = false;
3963 }
3964 
3965 /**
3966  * kvm_gdbstub_sstep_flags():
3967  *
3968  * Returns: SSTEP_* flags that KVM supports for guest debug. The
3969  * support is probed during kvm_init()
3970  */
3971 static int kvm_gdbstub_sstep_flags(void)
3972 {
3973     return kvm_sstep_flags;
3974 }
3975 
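/*
 * Class init for the KVM accelerator: hook up kvm_init() as the machine init
 * function and register the accelerator's QOM properties (kernel-irqchip,
 * kvm-shadow-mem, dirty-ring-size, device, rapl, rapl-helper-socket).
 */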
3976 static void kvm_accel_class_init(ObjectClass *oc, const void *data)
3977 {
3978     AccelClass *ac = ACCEL_CLASS(oc);
3979     ac->name = "KVM";
3980     ac->init_machine = kvm_init;
3981     ac->has_memory = kvm_accel_has_memory;
3982     ac->allowed = &kvm_allowed;
3983     ac->gdbstub_supported_sstep_flags = kvm_gdbstub_sstep_flags;
3984 
3985     object_class_property_add(oc, "kernel-irqchip", "on|off|split",
3986         NULL, kvm_set_kernel_irqchip,
3987         NULL, NULL);
3988     object_class_property_set_description(oc, "kernel-irqchip",
3989         "Configure KVM in-kernel irqchip");
3990 
3991     object_class_property_add(oc, "kvm-shadow-mem", "int",
3992         kvm_get_kvm_shadow_mem, kvm_set_kvm_shadow_mem,
3993         NULL, NULL);
3994     object_class_property_set_description(oc, "kvm-shadow-mem",
3995         "KVM shadow MMU size");
3996 
3997     object_class_property_add(oc, "dirty-ring-size", "uint32",
3998         kvm_get_dirty_ring_size, kvm_set_dirty_ring_size,
3999         NULL, NULL);
4000     object_class_property_set_description(oc, "dirty-ring-size",
4001         "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)");
4002 
4003     object_class_property_add_str(oc, "device", kvm_get_device, kvm_set_device);
4004     object_class_property_set_description(oc, "device",
4005         "Path to the device node to use (default: /dev/kvm)");
4006 
4007     object_class_property_add_bool(oc, "rapl",
4008                                    NULL,
4009                                    kvm_set_kvm_rapl);
4010     object_class_property_set_description(oc, "rapl",
4011         "Allow energy-related MSRs for the RAPL interface in the guest");
4012 
4013     object_class_property_add_str(oc, "rapl-helper-socket", NULL,
4014                                   kvm_set_kvm_rapl_socket_path);
4015     object_class_property_set_description(oc, "rapl-helper-socket",
4016         "Socket path for communicating with the virtual MSR helper daemon");
4017 
4018     kvm_arch_accel_class_init(oc);
4019 }
4020 
4021 static const TypeInfo kvm_accel_type = {
4022     .name = TYPE_KVM_ACCEL,
4023     .parent = TYPE_ACCEL,
4024     .instance_init = kvm_accel_instance_init,
4025     .class_init = kvm_accel_class_init,
4026     .instance_size = sizeof(KVMState),
4027 };
4028 
4029 static void kvm_type_init(void)
4030 {
4031     type_register_static(&kvm_accel_type);
4032 }
4033 
4034 type_init(kvm_type_init);
4035 
4036 typedef struct StatsArgs {
4037     union StatsResultsType {
4038         StatsResultList **stats;
4039         StatsSchemaList **schema;
4040     } result;
4041     strList *names;
4042     Error **errp;
4043 } StatsArgs;
4044 
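/*
 * Convert one binary stats descriptor plus its raw data into a QAPI Stats
 * entry and prepend it to stats_list.  Descriptors with a type, unit or base
 * that QEMU does not understand are silently skipped.
 */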
4045 static StatsList *add_kvmstat_entry(struct kvm_stats_desc *pdesc,
4046                                     uint64_t *stats_data,
4047                                     StatsList *stats_list,
4048                                     Error **errp)
4049 {
4050 
4051     Stats *stats;
4052     uint64List *val_list = NULL;
4053 
4054     /* Only add stats that we understand.  */
4055     switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
4056     case KVM_STATS_TYPE_CUMULATIVE:
4057     case KVM_STATS_TYPE_INSTANT:
4058     case KVM_STATS_TYPE_PEAK:
4059     case KVM_STATS_TYPE_LINEAR_HIST:
4060     case KVM_STATS_TYPE_LOG_HIST:
4061         break;
4062     default:
4063         return stats_list;
4064     }
4065 
4066     switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
4067     case KVM_STATS_UNIT_NONE:
4068     case KVM_STATS_UNIT_BYTES:
4069     case KVM_STATS_UNIT_CYCLES:
4070     case KVM_STATS_UNIT_SECONDS:
4071     case KVM_STATS_UNIT_BOOLEAN:
4072         break;
4073     default:
4074         return stats_list;
4075     }
4076 
4077     switch (pdesc->flags & KVM_STATS_BASE_MASK) {
4078     case KVM_STATS_BASE_POW10:
4079     case KVM_STATS_BASE_POW2:
4080         break;
4081     default:
4082         return stats_list;
4083     }
4084 
4085     /* Alloc and populate data list */
4086     stats = g_new0(Stats, 1);
4087     stats->name = g_strdup(pdesc->name);
4088     stats->value = g_new0(StatsValue, 1);
4089 
4090     if ((pdesc->flags & KVM_STATS_UNIT_MASK) == KVM_STATS_UNIT_BOOLEAN) {
4091         stats->value->u.boolean = *stats_data;
4092         stats->value->type = QTYPE_QBOOL;
4093     } else if (pdesc->size == 1) {
4094         stats->value->u.scalar = *stats_data;
4095         stats->value->type = QTYPE_QNUM;
4096     } else {
4097         int i;
4098         for (i = 0; i < pdesc->size; i++) {
4099             QAPI_LIST_PREPEND(val_list, stats_data[i]);
4100         }
4101         stats->value->u.list = val_list;
4102         stats->value->type = QTYPE_QLIST;
4103     }
4104 
4105     QAPI_LIST_PREPEND(stats_list, stats);
4106     return stats_list;
4107 }
4108 
4109 static StatsSchemaValueList *add_kvmschema_entry(struct kvm_stats_desc *pdesc,
4110                                                  StatsSchemaValueList *list,
4111                                                  Error **errp)
4112 {
4113     StatsSchemaValueList *schema_entry = g_new0(StatsSchemaValueList, 1);
4114     schema_entry->value = g_new0(StatsSchemaValue, 1);
4115 
4116     switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
4117     case KVM_STATS_TYPE_CUMULATIVE:
4118         schema_entry->value->type = STATS_TYPE_CUMULATIVE;
4119         break;
4120     case KVM_STATS_TYPE_INSTANT:
4121         schema_entry->value->type = STATS_TYPE_INSTANT;
4122         break;
4123     case KVM_STATS_TYPE_PEAK:
4124         schema_entry->value->type = STATS_TYPE_PEAK;
4125         break;
4126     case KVM_STATS_TYPE_LINEAR_HIST:
4127         schema_entry->value->type = STATS_TYPE_LINEAR_HISTOGRAM;
4128         schema_entry->value->bucket_size = pdesc->bucket_size;
4129         schema_entry->value->has_bucket_size = true;
4130         break;
4131     case KVM_STATS_TYPE_LOG_HIST:
4132         schema_entry->value->type = STATS_TYPE_LOG2_HISTOGRAM;
4133         break;
4134     default:
4135         goto exit;
4136     }
4137 
4138     switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
4139     case KVM_STATS_UNIT_NONE:
4140         break;
4141     case KVM_STATS_UNIT_BOOLEAN:
4142         schema_entry->value->has_unit = true;
4143         schema_entry->value->unit = STATS_UNIT_BOOLEAN;
4144         break;
4145     case KVM_STATS_UNIT_BYTES:
4146         schema_entry->value->has_unit = true;
4147         schema_entry->value->unit = STATS_UNIT_BYTES;
4148         break;
4149     case KVM_STATS_UNIT_CYCLES:
4150         schema_entry->value->has_unit = true;
4151         schema_entry->value->unit = STATS_UNIT_CYCLES;
4152         break;
4153     case KVM_STATS_UNIT_SECONDS:
4154         schema_entry->value->has_unit = true;
4155         schema_entry->value->unit = STATS_UNIT_SECONDS;
4156         break;
4157     default:
4158         goto exit;
4159     }
4160 
4161     schema_entry->value->exponent = pdesc->exponent;
4162     if (pdesc->exponent) {
4163         switch (pdesc->flags & KVM_STATS_BASE_MASK) {
4164         case KVM_STATS_BASE_POW10:
4165             schema_entry->value->has_base = true;
4166             schema_entry->value->base = 10;
4167             break;
4168         case KVM_STATS_BASE_POW2:
4169             schema_entry->value->has_base = true;
4170             schema_entry->value->base = 2;
4171             break;
4172         default:
4173             goto exit;
4174         }
4175     }
4176 
4177     schema_entry->value->name = g_strdup(pdesc->name);
4178     schema_entry->next = list;
4179     return schema_entry;
4180 exit:
4181     g_free(schema_entry->value);
4182     g_free(schema_entry);
4183     return list;
4184 }
4185 
4186 /* Cached stats descriptors */
4187 typedef struct StatsDescriptors {
4188     const char *ident; /* cache key, currently the StatsTarget */
4189     struct kvm_stats_desc *kvm_stats_desc;
4190     struct kvm_stats_header kvm_stats_header;
4191     QTAILQ_ENTRY(StatsDescriptors) next;
4192 } StatsDescriptors;
4193 
4194 static QTAILQ_HEAD(, StatsDescriptors) stats_descriptors =
4195     QTAILQ_HEAD_INITIALIZER(stats_descriptors);
4196 
4197 /*
4198  * Return the descriptors for 'target': either the cached copy from a
4199  * previous call, or a fresh set read from 'stats_fd'.
4200  */
4201 static StatsDescriptors *find_stats_descriptors(StatsTarget target, int stats_fd,
4202                                                 Error **errp)
4203 {
4204     StatsDescriptors *descriptors;
4205     const char *ident;
4206     struct kvm_stats_desc *kvm_stats_desc;
4207     struct kvm_stats_header *kvm_stats_header;
4208     size_t size_desc;
4209     ssize_t ret;
4210 
4211     ident = StatsTarget_str(target);
4212     QTAILQ_FOREACH(descriptors, &stats_descriptors, next) {
4213         if (g_str_equal(descriptors->ident, ident)) {
4214             return descriptors;
4215         }
4216     }
4217 
4218     descriptors = g_new0(StatsDescriptors, 1);
4219 
4220     /* Read stats header */
4221     kvm_stats_header = &descriptors->kvm_stats_header;
4222     ret = pread(stats_fd, kvm_stats_header, sizeof(*kvm_stats_header), 0);
4223     if (ret != sizeof(*kvm_stats_header)) {
4224         error_setg(errp, "KVM stats: failed to read stats header: "
4225                    "expected %zu actual %zu",
4226                    sizeof(*kvm_stats_header), ret);
4227         g_free(descriptors);
4228         return NULL;
4229     }
4230     size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4231 
4232     /* Read stats descriptors */
4233     kvm_stats_desc = g_malloc0_n(kvm_stats_header->num_desc, size_desc);
4234     ret = pread(stats_fd, kvm_stats_desc,
4235                 size_desc * kvm_stats_header->num_desc,
4236                 kvm_stats_header->desc_offset);
4237 
4238     if (ret != size_desc * kvm_stats_header->num_desc) {
4239         error_setg(errp, "KVM stats: failed to read stats descriptors: "
4240                    "expected %zu actual %zu",
4241                    size_desc * kvm_stats_header->num_desc, ret);
4242         g_free(descriptors);
4243         g_free(kvm_stats_desc);
4244         return NULL;
4245     }
4246     descriptors->kvm_stats_desc = kvm_stats_desc;
4247     descriptors->ident = ident;
4248     QTAILQ_INSERT_TAIL(&stats_descriptors, descriptors, next);
4249     return descriptors;
4250 }
4251 
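/*
 * Read all stats values for one VM or vCPU fd: size the data area from the
 * cached descriptors, pread() the whole sample in one go, convert the entries
 * that pass the name filter and attach the resulting list to 'result' under
 * the KVM provider.
 */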
4252 static void query_stats(StatsResultList **result, StatsTarget target,
4253                         strList *names, int stats_fd, CPUState *cpu,
4254                         Error **errp)
4255 {
4256     struct kvm_stats_desc *kvm_stats_desc;
4257     struct kvm_stats_header *kvm_stats_header;
4258     StatsDescriptors *descriptors;
4259     g_autofree uint64_t *stats_data = NULL;
4260     struct kvm_stats_desc *pdesc;
4261     StatsList *stats_list = NULL;
4262     size_t size_desc, size_data = 0;
4263     ssize_t ret;
4264     int i;
4265 
4266     descriptors = find_stats_descriptors(target, stats_fd, errp);
4267     if (!descriptors) {
4268         return;
4269     }
4270 
4271     kvm_stats_header = &descriptors->kvm_stats_header;
4272     kvm_stats_desc = descriptors->kvm_stats_desc;
4273     size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4274 
4275     /* Tally the total data size; read schema data */
4276     /* Tally the total data size from the descriptors */
4277         pdesc = (void *)kvm_stats_desc + i * size_desc;
4278         size_data += pdesc->size * sizeof(*stats_data);
4279     }
4280 
4281     stats_data = g_malloc0(size_data);
4282     ret = pread(stats_fd, stats_data, size_data, kvm_stats_header->data_offset);
4283 
4284     if (ret != size_data) {
4285         error_setg(errp, "KVM stats: failed to read data: "
4286                    "expected %zu actual %zu", size_data, ret);
4287         return;
4288     }
4289 
4290     for (i = 0; i < kvm_stats_header->num_desc; ++i) {
4291         uint64_t *stats;
4292         pdesc = (void *)kvm_stats_desc + i * size_desc;
4293 
4294         /* Add entry to the list */
4295         stats = (void *)stats_data + pdesc->offset;
4296         if (!apply_str_list_filter(pdesc->name, names)) {
4297             continue;
4298         }
4299         stats_list = add_kvmstat_entry(pdesc, stats, stats_list, errp);
4300     }
4301 
4302     if (!stats_list) {
4303         return;
4304     }
4305 
4306     switch (target) {
4307     case STATS_TARGET_VM:
4308         add_stats_entry(result, STATS_PROVIDER_KVM, NULL, stats_list);
4309         break;
4310     case STATS_TARGET_VCPU:
4311         add_stats_entry(result, STATS_PROVIDER_KVM,
4312                         cpu->parent_obj.canonical_path,
4313                         stats_list);
4314         break;
4315     default:
4316         g_assert_not_reached();
4317     }
4318 }
4319 
4320 static void query_stats_schema(StatsSchemaList **result, StatsTarget target,
4321                                int stats_fd, Error **errp)
4322 {
4323     struct kvm_stats_desc *kvm_stats_desc;
4324     struct kvm_stats_header *kvm_stats_header;
4325     StatsDescriptors *descriptors;
4326     struct kvm_stats_desc *pdesc;
4327     StatsSchemaValueList *stats_list = NULL;
4328     size_t size_desc;
4329     int i;
4330 
4331     descriptors = find_stats_descriptors(target, stats_fd, errp);
4332     if (!descriptors) {
4333         return;
4334     }
4335 
4336     kvm_stats_header = &descriptors->kvm_stats_header;
4337     kvm_stats_desc = descriptors->kvm_stats_desc;
4338     size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4339 
4340     /* Convert each descriptor into a schema entry */
4341     for (i = 0; i < kvm_stats_header->num_desc; ++i) {
4342         pdesc = (void *)kvm_stats_desc + i * size_desc;
4343         stats_list = add_kvmschema_entry(pdesc, stats_list, errp);
4344     }
4345 
4346     add_stats_schema(result, STATS_PROVIDER_KVM, target, stats_list);
4347 }
4348 
4349 static void query_stats_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args)
4350 {
4351     int stats_fd = cpu->kvm_vcpu_stats_fd;
4352     Error *local_err = NULL;
4353 
4354     if (stats_fd == -1) {
4355         error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
4356         error_propagate(kvm_stats_args->errp, local_err);
4357         return;
4358     }
4359     query_stats(kvm_stats_args->result.stats, STATS_TARGET_VCPU,
4360                 kvm_stats_args->names, stats_fd, cpu,
4361                 kvm_stats_args->errp);
4362 }
4363 
4364 static void query_stats_schema_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args)
4365 {
4366     int stats_fd = cpu->kvm_vcpu_stats_fd;
4367     Error *local_err = NULL;
4368 
4369     if (stats_fd == -1) {
4370         error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
4371         error_propagate(kvm_stats_args->errp, local_err);
4372         return;
4373     }
4374     query_stats_schema(kvm_stats_args->result.schema, STATS_TARGET_VCPU, stats_fd,
4375                        kvm_stats_args->errp);
4376 }
4377 
4378 static void query_stats_cb(StatsResultList **result, StatsTarget target,
4379                            strList *names, strList *targets, Error **errp)
4380 {
4381     KVMState *s = kvm_state;
4382     CPUState *cpu;
4383     int stats_fd;
4384 
4385     switch (target) {
4386     case STATS_TARGET_VM:
4387     {
4388         stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
4389         if (stats_fd == -1) {
4390             error_setg_errno(errp, errno, "KVM stats: ioctl failed");
4391             return;
4392         }
4393         query_stats(result, target, names, stats_fd, NULL, errp);
4394         close(stats_fd);
4395         break;
4396     }
4397     case STATS_TARGET_VCPU:
4398     {
4399         StatsArgs stats_args;
4400         stats_args.result.stats = result;
4401         stats_args.names = names;
4402         stats_args.errp = errp;
4403         CPU_FOREACH(cpu) {
4404             if (!apply_str_list_filter(cpu->parent_obj.canonical_path, targets)) {
4405                 continue;
4406             }
4407             query_stats_vcpu(cpu, &stats_args);
4408         }
4409         break;
4410     }
4411     default:
4412         break;
4413     }
4414 }
4415 
4416 void query_stats_schemas_cb(StatsSchemaList **result, Error **errp)
4417 {
4418     StatsArgs stats_args;
4419     KVMState *s = kvm_state;
4420     int stats_fd;
4421 
4422     stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
4423     if (stats_fd == -1) {
4424         error_setg_errno(errp, errno, "KVM stats: ioctl failed");
4425         return;
4426     }
4427     query_stats_schema(result, STATS_TARGET_VM, stats_fd, errp);
4428     close(stats_fd);
4429 
4430     if (first_cpu) {
4431         stats_args.result.schema = result;
4432         stats_args.errp = errp;
4433         query_stats_schema_vcpu(first_cpu, &stats_args);
4434     }
4435 }
4436 
4437 void kvm_mark_guest_state_protected(void)
4438 {
4439     kvm_state->guest_state_protected = true;
4440 }
4441 
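/*
 * Allocate a guest_memfd of the requested size via KVM_CREATE_GUEST_MEMFD.
 * Returns the new file descriptor, or -1 (with *errp set) if the kernel lacks
 * guest_memfd support or the ioctl fails.
 */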
4442 int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp)
4443 {
4444     int fd;
4445     struct kvm_create_guest_memfd guest_memfd = {
4446         .size = size,
4447         .flags = flags,
4448     };
4449 
4450     if (!kvm_guest_memfd_supported) {
4451         error_setg(errp, "KVM does not support guest_memfd");
4452         return -1;
4453     }
4454 
4455     fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_GUEST_MEMFD, &guest_memfd);
4456     if (fd < 0) {
4457         error_setg_errno(errp, errno, "Error creating KVM guest_memfd");
4458         return -1;
4459     }
4460 
4461     return fd;
4462 }
4463