1 /*
2 * QEMU KVM support
3 *
4 * Copyright IBM, Corp. 2008
5 * Red Hat, Inc. 2008
6 *
7 * Authors:
8 * Anthony Liguori <aliguori@us.ibm.com>
9 * Glauber Costa <gcosta@redhat.com>
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2 or later.
12 * See the COPYING file in the top-level directory.
13 *
14 */
15
16 #include "qemu/osdep.h"
17 #include <sys/ioctl.h>
18 #include <poll.h>
19
20 #include <linux/kvm.h>
21
22 #include "qemu/atomic.h"
23 #include "qemu/option.h"
24 #include "qemu/config-file.h"
25 #include "qemu/error-report.h"
26 #include "qapi/error.h"
27 #include "hw/pci/msi.h"
28 #include "hw/pci/msix.h"
29 #include "hw/s390x/adapter.h"
30 #include "gdbstub/enums.h"
31 #include "system/kvm_int.h"
32 #include "system/runstate.h"
33 #include "system/cpus.h"
34 #include "system/accel-blocker.h"
35 #include "qemu/bswap.h"
36 #include "exec/tswap.h"
37 #include "system/memory.h"
38 #include "system/ram_addr.h"
39 #include "qemu/event_notifier.h"
40 #include "qemu/main-loop.h"
41 #include "trace.h"
42 #include "hw/irq.h"
43 #include "qapi/visitor.h"
44 #include "qapi/qapi-types-common.h"
45 #include "qapi/qapi-visit-common.h"
46 #include "system/reset.h"
47 #include "qemu/guest-random.h"
48 #include "system/hw_accel.h"
49 #include "kvm-cpus.h"
50 #include "system/dirtylimit.h"
51 #include "qemu/range.h"
52
53 #include "hw/boards.h"
54 #include "system/stats.h"
55
56 /* This check must be after config-host.h is included */
57 #ifdef CONFIG_EVENTFD
58 #include <sys/eventfd.h>
59 #endif
60
61 #if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__)
62 # define KVM_HAVE_MCE_INJECTION 1
63 #endif
64
65
66 /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
67 * need to use the real host PAGE_SIZE, as that's what KVM will use.
68 */
69 #ifdef PAGE_SIZE
70 #undef PAGE_SIZE
71 #endif
72 #define PAGE_SIZE qemu_real_host_page_size()
73
74 #ifndef KVM_GUESTDBG_BLOCKIRQ
75 #define KVM_GUESTDBG_BLOCKIRQ 0
76 #endif
77
78 /* Default num of memslots to be allocated when VM starts */
79 #define KVM_MEMSLOTS_NR_ALLOC_DEFAULT 16
80 /* Default max allowed memslots if kernel reported nothing */
81 #define KVM_MEMSLOTS_NR_MAX_DEFAULT 32
82
83 struct KVMParkedVcpu {
84 unsigned long vcpu_id;
85 int kvm_fd;
86 QLIST_ENTRY(KVMParkedVcpu) node;
87 };
88
89 KVMState *kvm_state;
90 bool kvm_kernel_irqchip;
91 bool kvm_split_irqchip;
92 bool kvm_async_interrupts_allowed;
93 bool kvm_halt_in_kernel_allowed;
94 bool kvm_resamplefds_allowed;
95 bool kvm_msi_via_irqfd_allowed;
96 bool kvm_gsi_routing_allowed;
97 bool kvm_gsi_direct_mapping;
98 bool kvm_allowed;
99 bool kvm_readonly_mem_allowed;
100 bool kvm_vm_attributes_allowed;
101 bool kvm_msi_use_devid;
102 static bool kvm_has_guest_debug;
103 static int kvm_sstep_flags;
104 static bool kvm_immediate_exit;
105 static uint64_t kvm_supported_memory_attributes;
106 static bool kvm_guest_memfd_supported;
107 static hwaddr kvm_max_slot_size = ~0;
108
109 static const KVMCapabilityInfo kvm_required_capabilites[] = {
110 KVM_CAP_INFO(USER_MEMORY),
111 KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
112 KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
113 KVM_CAP_INFO(INTERNAL_ERROR_DATA),
114 KVM_CAP_INFO(IOEVENTFD),
115 KVM_CAP_INFO(IOEVENTFD_ANY_LENGTH),
116 KVM_CAP_LAST_INFO
117 };
118
119 static NotifierList kvm_irqchip_change_notifiers =
120 NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
121
122 struct KVMResampleFd {
123 int gsi;
124 EventNotifier *resample_event;
125 QLIST_ENTRY(KVMResampleFd) node;
126 };
127 typedef struct KVMResampleFd KVMResampleFd;
128
129 /*
130 * Only used with split irqchip where we need to do the resample fd
131 * kick for the kernel from userspace.
132 */
133 static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
134 QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
135
136 static QemuMutex kml_slots_lock;
137
138 #define kvm_slots_lock() qemu_mutex_lock(&kml_slots_lock)
139 #define kvm_slots_unlock() qemu_mutex_unlock(&kml_slots_lock)
140
141 static void kvm_slot_init_dirty_bitmap(KVMSlot *mem);
142
143 static inline void kvm_resample_fd_remove(int gsi)
144 {
145 KVMResampleFd *rfd;
146
147 QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
148 if (rfd->gsi == gsi) {
149 QLIST_REMOVE(rfd, node);
150 g_free(rfd);
151 break;
152 }
153 }
154 }
155
156 static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
157 {
158 KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);
159
160 rfd->gsi = gsi;
161 rfd->resample_event = event;
162
163 QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
164 }
165
166 void kvm_resample_fd_notify(int gsi)
167 {
168 KVMResampleFd *rfd;
169
170 QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
171 if (rfd->gsi == gsi) {
172 event_notifier_set(rfd->resample_event);
173 trace_kvm_resample_fd_notify(gsi);
174 return;
175 }
176 }
177 }
178
179 /**
180 * kvm_slots_grow(): Grow the slots[] array in the KVMMemoryListener
181 *
182 * @kml: The KVMMemoryListener* to grow the slots[] array
183 * @nr_slots_new: The new size of slots[] array
184 *
185 * Returns: True if the array grows larger, false otherwise.
186 */
187 static bool kvm_slots_grow(KVMMemoryListener *kml, unsigned int nr_slots_new)
188 {
189 unsigned int i, cur = kml->nr_slots_allocated;
190 KVMSlot *slots;
191
192 if (nr_slots_new > kvm_state->nr_slots_max) {
193 nr_slots_new = kvm_state->nr_slots_max;
194 }
195
196 if (cur >= nr_slots_new) {
197 /* Big enough, no need to grow, or we reached max */
198 return false;
199 }
200
201 if (cur == 0) {
202 slots = g_new0(KVMSlot, nr_slots_new);
203 } else {
204 assert(kml->slots);
205 slots = g_renew(KVMSlot, kml->slots, nr_slots_new);
206 /*
207 * g_renew() doesn't initialize extended buffers, however kvm
208 * memslots require fields to be zero-initialized. E.g. pointers,
209 * memory_size field, etc.
210 */
211 memset(&slots[cur], 0x0, sizeof(slots[0]) * (nr_slots_new - cur));
212 }
213
214 for (i = cur; i < nr_slots_new; i++) {
215 slots[i].slot = i;
216 }
217
218 kml->slots = slots;
219 kml->nr_slots_allocated = nr_slots_new;
220 trace_kvm_slots_grow(cur, nr_slots_new);
221
222 return true;
223 }
224
225 static bool kvm_slots_double(KVMMemoryListener *kml)
226 {
227 return kvm_slots_grow(kml, kml->nr_slots_allocated * 2);
228 }
229
230 unsigned int kvm_get_max_memslots(void)
231 {
232 KVMState *s = KVM_STATE(current_accel());
233
234 return s->nr_slots_max;
235 }
236
237 unsigned int kvm_get_free_memslots(void)
238 {
239 unsigned int used_slots = 0;
240 KVMState *s = kvm_state;
241 int i;
242
243 kvm_slots_lock();
244 for (i = 0; i < s->nr_as; i++) {
245 if (!s->as[i].ml) {
246 continue;
247 }
248 used_slots = MAX(used_slots, s->as[i].ml->nr_slots_used);
249 }
250 kvm_slots_unlock();
251
252 return s->nr_slots_max - used_slots;
253 }
254
255 /* Called with KVMMemoryListener.slots_lock held */
256 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
257 {
258 unsigned int n;
259 int i;
260
261 for (i = 0; i < kml->nr_slots_allocated; i++) {
262 if (kml->slots[i].memory_size == 0) {
263 return &kml->slots[i];
264 }
265 }
266
267 /*
268 * If no free slots, try to grow first by doubling. Cache the old size
269 * here to avoid another round of search: if the grow succeeded, it
270 * means slots[] now must have the existing "n" slots occupied,
271 * followed by one or more free slots starting from slots[n].
272 */
273 n = kml->nr_slots_allocated;
274 if (kvm_slots_double(kml)) {
275 return &kml->slots[n];
276 }
277
278 return NULL;
279 }
280
281 /* Called with KVMMemoryListener.slots_lock held */
282 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
283 {
284 KVMSlot *slot = kvm_get_free_slot(kml);
285
286 if (slot) {
287 return slot;
288 }
289
290 fprintf(stderr, "%s: no free slot available\n", __func__);
291 abort();
292 }
293
294 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
295 hwaddr start_addr,
296 hwaddr size)
297 {
298 int i;
299
300 for (i = 0; i < kml->nr_slots_allocated; i++) {
301 KVMSlot *mem = &kml->slots[i];
302
303 if (start_addr == mem->start_addr && size == mem->memory_size) {
304 return mem;
305 }
306 }
307
308 return NULL;
309 }
310
311 /*
312 * Calculate and align the start address and the size of the section.
313 * Return the size. If the size is 0, the aligned section is empty.
314 */
315 static hwaddr kvm_align_section(MemoryRegionSection *section,
316 hwaddr *start)
317 {
318 hwaddr size = int128_get64(section->size);
319 hwaddr delta, aligned;
320
321 /* kvm works in page size chunks, but the function may be called
322 with sub-page size and unaligned start address. Pad the start
323 address to the next and truncate size to the previous page boundary. */
324 aligned = ROUND_UP(section->offset_within_address_space,
325 qemu_real_host_page_size());
326 delta = aligned - section->offset_within_address_space;
327 *start = aligned;
328 if (delta > size) {
329 return 0;
330 }
331
332 return (size - delta) & qemu_real_host_page_mask();
333 }
334
335 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
336 hwaddr *phys_addr)
337 {
338 KVMMemoryListener *kml = &s->memory_listener;
339 int i, ret = 0;
340
341 kvm_slots_lock();
342 for (i = 0; i < kml->nr_slots_allocated; i++) {
343 KVMSlot *mem = &kml->slots[i];
344
345 if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
346 *phys_addr = mem->start_addr + (ram - mem->ram);
347 ret = 1;
348 break;
349 }
350 }
351 kvm_slots_unlock();
352
353 return ret;
354 }
355
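/*
 * Program one memslot into KVM via KVM_SET_USER_MEMORY_REGION(2).  If only
 * the read-only flag of an existing slot changes, the slot is first deleted
 * (size 0) and then re-created, since KVM does not allow toggling that flag
 * in place (see the commit referenced below).
 */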
356 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
357 {
358 KVMState *s = kvm_state;
359 struct kvm_userspace_memory_region2 mem;
360 int ret;
361
362 mem.slot = slot->slot | (kml->as_id << 16);
363 mem.guest_phys_addr = slot->start_addr;
364 mem.userspace_addr = (unsigned long)slot->ram;
365 mem.flags = slot->flags;
366 mem.guest_memfd = slot->guest_memfd;
367 mem.guest_memfd_offset = slot->guest_memfd_offset;
368
369 if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
370 /* Set the slot size to 0 before setting the slot to the desired
371 * value. This is needed based on KVM commit 75d61fbc. */
372 mem.memory_size = 0;
373
374 if (kvm_guest_memfd_supported) {
375 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, &mem);
376 } else {
377 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
378 }
379 if (ret < 0) {
380 goto err;
381 }
382 }
383 mem.memory_size = slot->memory_size;
384 if (kvm_guest_memfd_supported) {
385 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, &mem);
386 } else {
387 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
388 }
389 slot->old_flags = mem.flags;
390 err:
391 trace_kvm_set_user_memory(mem.slot >> 16, (uint16_t)mem.slot, mem.flags,
392 mem.guest_phys_addr, mem.memory_size,
393 mem.userspace_addr, mem.guest_memfd,
394 mem.guest_memfd_offset, ret);
395 if (ret < 0) {
396 if (kvm_guest_memfd_supported) {
397 error_report("%s: KVM_SET_USER_MEMORY_REGION2 failed, slot=%d,"
398 " start=0x%" PRIx64 ", size=0x%" PRIx64 ","
399 " flags=0x%" PRIx32 ", guest_memfd=%" PRId32 ","
400 " guest_memfd_offset=0x%" PRIx64 ": %s",
401 __func__, mem.slot, slot->start_addr,
402 (uint64_t)mem.memory_size, mem.flags,
403 mem.guest_memfd, (uint64_t)mem.guest_memfd_offset,
404 strerror(errno));
405 } else {
406 error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
407 " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
408 __func__, mem.slot, slot->start_addr,
409 (uint64_t)mem.memory_size, strerror(errno));
410 }
411 }
412 return ret;
413 }
414
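/*
 * Park a vCPU instead of destroying it: KVM provides no way to destroy a
 * vCPU fd, so keep the fd on a list and reuse it if a vCPU with the same
 * vcpu_id is created again later.
 */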
415 void kvm_park_vcpu(CPUState *cpu)
416 {
417 struct KVMParkedVcpu *vcpu;
418
419 trace_kvm_park_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
420
421 vcpu = g_malloc0(sizeof(*vcpu));
422 vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
423 vcpu->kvm_fd = cpu->kvm_fd;
424 QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
425 }
426
427 int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id)
428 {
429 struct KVMParkedVcpu *cpu;
430 int kvm_fd = -ENOENT;
431
432 QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
433 if (cpu->vcpu_id == vcpu_id) {
434 QLIST_REMOVE(cpu, node);
435 kvm_fd = cpu->kvm_fd;
436 g_free(cpu);
437 break;
438 }
439 }
440
441 trace_kvm_unpark_vcpu(vcpu_id, kvm_fd > 0 ? "unparked" : "!found parked");
442
443 return kvm_fd;
444 }
445
446 static void kvm_reset_parked_vcpus(KVMState *s)
447 {
448 struct KVMParkedVcpu *cpu;
449
450 QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
451 kvm_arch_reset_parked_vcpu(cpu->vcpu_id, cpu->kvm_fd);
452 }
453 }
454
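/* Create the KVM vCPU for @cpu, reusing a parked fd if one exists */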
455 int kvm_create_vcpu(CPUState *cpu)
456 {
457 unsigned long vcpu_id = kvm_arch_vcpu_id(cpu);
458 KVMState *s = kvm_state;
459 int kvm_fd;
460
461 /* check if the KVM vCPU already exists but is parked */
462 kvm_fd = kvm_unpark_vcpu(s, vcpu_id);
463 if (kvm_fd < 0) {
464 /* vCPU not parked: create a new KVM vCPU */
465 kvm_fd = kvm_vm_ioctl(s, KVM_CREATE_VCPU, vcpu_id);
466 if (kvm_fd < 0) {
467 error_report("KVM_CREATE_VCPU IOCTL failed for vCPU %lu", vcpu_id);
468 return kvm_fd;
469 }
470 }
471
472 cpu->kvm_fd = kvm_fd;
473 cpu->kvm_state = s;
474 if (!s->guest_state_protected) {
475 cpu->vcpu_dirty = true;
476 }
477 cpu->dirty_pages = 0;
478 cpu->throttle_us_per_full = 0;
479
480 trace_kvm_create_vcpu(cpu->cpu_index, vcpu_id, kvm_fd);
481
482 return 0;
483 }
484
485 int kvm_create_and_park_vcpu(CPUState *cpu)
486 {
487 int ret = 0;
488
489 ret = kvm_create_vcpu(cpu);
490 if (!ret) {
491 kvm_park_vcpu(cpu);
492 }
493
494 return ret;
495 }
496
497 static int do_kvm_destroy_vcpu(CPUState *cpu)
498 {
499 KVMState *s = kvm_state;
500 int mmap_size;
501 int ret = 0;
502
503 trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
504
505 ret = kvm_arch_destroy_vcpu(cpu);
506 if (ret < 0) {
507 goto err;
508 }
509
510 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
511 if (mmap_size < 0) {
512 ret = mmap_size;
513 trace_kvm_failed_get_vcpu_mmap_size();
514 goto err;
515 }
516
517 ret = munmap(cpu->kvm_run, mmap_size);
518 if (ret < 0) {
519 goto err;
520 }
521
522 if (cpu->kvm_dirty_gfns) {
523 ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes);
524 if (ret < 0) {
525 goto err;
526 }
527 }
528
529 kvm_park_vcpu(cpu);
530 err:
531 return ret;
532 }
533
534 void kvm_destroy_vcpu(CPUState *cpu)
535 {
536 if (do_kvm_destroy_vcpu(cpu) < 0) {
537 error_report("kvm_destroy_vcpu failed");
538 exit(EXIT_FAILURE);
539 }
540 }
541
542 int kvm_init_vcpu(CPUState *cpu, Error **errp)
543 {
544 KVMState *s = kvm_state;
545 int mmap_size;
546 int ret;
547
548 trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
549
550 ret = kvm_arch_pre_create_vcpu(cpu, errp);
551 if (ret < 0) {
552 goto err;
553 }
554
555 ret = kvm_create_vcpu(cpu);
556 if (ret < 0) {
557 error_setg_errno(errp, -ret,
558 "kvm_init_vcpu: kvm_create_vcpu failed (%lu)",
559 kvm_arch_vcpu_id(cpu));
560 goto err;
561 }
562
563 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
564 if (mmap_size < 0) {
565 ret = mmap_size;
566 error_setg_errno(errp, -mmap_size,
567 "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed");
568 goto err;
569 }
570
571 cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
572 cpu->kvm_fd, 0);
573 if (cpu->kvm_run == MAP_FAILED) {
574 ret = -errno;
575 error_setg_errno(errp, ret,
576 "kvm_init_vcpu: mmap'ing vcpu state failed (%lu)",
577 kvm_arch_vcpu_id(cpu));
578 goto err;
579 }
580
581 if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
582 s->coalesced_mmio_ring =
583 (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
584 }
585
586 if (s->kvm_dirty_ring_size) {
587 /* Use MAP_SHARED to share pages with the kernel */
588 cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_bytes,
589 PROT_READ | PROT_WRITE, MAP_SHARED,
590 cpu->kvm_fd,
591 PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
592 if (cpu->kvm_dirty_gfns == MAP_FAILED) {
593 ret = -errno;
594 goto err;
595 }
596 }
597
598 ret = kvm_arch_init_vcpu(cpu);
599 if (ret < 0) {
600 error_setg_errno(errp, -ret,
601 "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)",
602 kvm_arch_vcpu_id(cpu));
603 }
604 cpu->kvm_vcpu_stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL);
605
606 err:
607 return ret;
608 }
609
610 /*
611 * dirty pages logging control
612 */
613
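/* Compute the KVM_MEM_* flags required for a memory region */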
614 static int kvm_mem_flags(MemoryRegion *mr)
615 {
616 bool readonly = mr->readonly || memory_region_is_romd(mr);
617 int flags = 0;
618
619 if (memory_region_get_dirty_log_mask(mr) != 0) {
620 flags |= KVM_MEM_LOG_DIRTY_PAGES;
621 }
622 if (readonly && kvm_readonly_mem_allowed) {
623 flags |= KVM_MEM_READONLY;
624 }
625 if (memory_region_has_guest_memfd(mr)) {
626 assert(kvm_guest_memfd_supported);
627 flags |= KVM_MEM_GUEST_MEMFD;
628 }
629 return flags;
630 }
631
632 /* Called with KVMMemoryListener.slots_lock held */
633 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
634 MemoryRegion *mr)
635 {
636 mem->flags = kvm_mem_flags(mr);
637
638 /* If nothing changed effectively, no need to issue ioctl */
639 if (mem->flags == mem->old_flags) {
640 return 0;
641 }
642
643 kvm_slot_init_dirty_bitmap(mem);
644 return kvm_set_user_memory_region(kml, mem, false);
645 }
646
647 static int kvm_section_update_flags(KVMMemoryListener *kml,
648 MemoryRegionSection *section)
649 {
650 hwaddr start_addr, size, slot_size;
651 KVMSlot *mem;
652 int ret = 0;
653
654 size = kvm_align_section(section, &start_addr);
655 if (!size) {
656 return 0;
657 }
658
659 kvm_slots_lock();
660
661 while (size && !ret) {
662 slot_size = MIN(kvm_max_slot_size, size);
663 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
664 if (!mem) {
665 /* We don't have a slot if we want to trap every access. */
666 goto out;
667 }
668
669 ret = kvm_slot_update_flags(kml, mem, section->mr);
670 start_addr += slot_size;
671 size -= slot_size;
672 }
673
674 out:
675 kvm_slots_unlock();
676 return ret;
677 }
678
679 static void kvm_log_start(MemoryListener *listener,
680 MemoryRegionSection *section,
681 int old, int new)
682 {
683 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
684 int r;
685
686 if (old != 0) {
687 return;
688 }
689
690 r = kvm_section_update_flags(kml, section);
691 if (r < 0) {
692 abort();
693 }
694 }
695
696 static void kvm_log_stop(MemoryListener *listener,
697 MemoryRegionSection *section,
698 int old, int new)
699 {
700 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
701 int r;
702
703 if (new != 0) {
704 return;
705 }
706
707 r = kvm_section_update_flags(kml, section);
708 if (r < 0) {
709 abort();
710 }
711 }
712
713 /* get kvm's dirty pages bitmap and update qemu's */
714 static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
715 {
716 ram_addr_t start = slot->ram_start_offset;
717 ram_addr_t pages = slot->memory_size / qemu_real_host_page_size();
718
719 cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
720 }
721
722 static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
723 {
724 memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
725 }
726
727 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))
728
729 /* Allocate the dirty bitmap for a slot */
730 static void kvm_slot_init_dirty_bitmap(KVMSlot *mem)
731 {
732 if (!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) || mem->dirty_bmap) {
733 return;
734 }
735
736 /*
737 * XXX bad kernel interface alert
738 * For dirty bitmap, kernel allocates array of size aligned to
739 * bits-per-long. But for case when the kernel is 64bits and
740 * the userspace is 32bits, userspace can't align to the same
741 * bits-per-long, since sizeof(long) is different between kernel
742 * and user space. This way, userspace will provide buffer which
743 * may be 4 bytes less than the kernel will use, resulting in
744 * userspace memory corruption (which, in most cases, is not even
745 * detectable by valgrind).
746 * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
747 * a hope that sizeof(long) won't become >8 any time soon.
748 *
749 * Note: the granule of kvm dirty log is qemu_real_host_page_size.
750 * And mem->memory_size is aligned to it (otherwise this mem can't
751 * be registered to KVM).
752 */
753 hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size(),
754 /*HOST_LONG_BITS*/ 64) / 8;
755 mem->dirty_bmap = g_malloc0(bitmap_size);
756 mem->dirty_bmap_size = bitmap_size;
757 }
758
759 /*
760 * Sync dirty bitmap from kernel to KVMSlot.dirty_bmap, return true if
761 * succeeded, false otherwise
762 */
763 static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
764 {
765 struct kvm_dirty_log d = {};
766 int ret;
767
768 d.dirty_bitmap = slot->dirty_bmap;
769 d.slot = slot->slot | (slot->as_id << 16);
770 ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);
771
772 if (ret == -ENOENT) {
773 /* kernel does not have dirty bitmap in this slot */
774 ret = 0;
775 }
776 if (ret) {
777 error_report_once("%s: KVM_GET_DIRTY_LOG failed with %d",
778 __func__, ret);
779 }
780 return ret == 0;
781 }
782
783 /* Should be called with the slots_lock of all address spaces held. */
784 static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
785 uint32_t slot_id, uint64_t offset)
786 {
787 KVMMemoryListener *kml;
788 KVMSlot *mem;
789
790 if (as_id >= s->nr_as) {
791 return;
792 }
793
794 kml = s->as[as_id].ml;
795 mem = &kml->slots[slot_id];
796
797 if (!mem->memory_size || offset >=
798 (mem->memory_size / qemu_real_host_page_size())) {
799 return;
800 }
801
802 set_bit(offset, mem->dirty_bmap);
803 }
804
805 static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
806 {
807 /*
808 * Read the flags before the value. Pairs with barrier in
809 * KVM's kvm_dirty_ring_push() function.
810 */
811 return qatomic_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY;
812 }
813
814 static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
815 {
816 /*
817 * Use a store-release so that the CPU that executes KVM_RESET_DIRTY_RINGS
818 * sees the full content of the ring:
819 *
820 * CPU0 CPU1 CPU2
821 * ------------------------------------------------------------------------------
822 * fill gfn0
823 * store-rel flags for gfn0
824 * load-acq flags for gfn0
825 * store-rel RESET for gfn0
826 * ioctl(RESET_RINGS)
827 * load-acq flags for gfn0
828 * check if flags have RESET
829 *
830 * The synchronization goes from CPU2 to CPU0 to CPU1.
831 */
832 qatomic_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
833 }
834
835 /*
836 * Should be called with the slots_lock of all address spaces held. Returns
837 * the number of dirty pages collected from this vCPU's dirty ring.
838 */
839 static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
840 {
841 struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
842 uint32_t ring_size = s->kvm_dirty_ring_size;
843 uint32_t count = 0, fetch = cpu->kvm_fetch_index;
844
845 /*
846 * It's possible that we race with vcpu creation code where the vcpu is
847 * put onto the vcpus list but has not yet initialized its dirty ring
848 * structures. If so, skip it.
849 */
850 if (!cpu->created) {
851 return 0;
852 }
853
854 assert(dirty_gfns && ring_size);
855 trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);
856
857 while (true) {
858 cur = &dirty_gfns[fetch % ring_size];
859 if (!dirty_gfn_is_dirtied(cur)) {
860 break;
861 }
862 kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
863 cur->offset);
864 dirty_gfn_set_collected(cur);
865 trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
866 fetch++;
867 count++;
868 }
869 cpu->kvm_fetch_index = fetch;
870 cpu->dirty_pages += count;
871
872 return count;
873 }
874
875 /* Must be with slots_lock held */
876 static uint64_t kvm_dirty_ring_reap_locked(KVMState *s, CPUState* cpu)
877 {
878 int ret;
879 uint64_t total = 0;
880 int64_t stamp;
881
882 stamp = get_clock();
883
884 if (cpu) {
885 total = kvm_dirty_ring_reap_one(s, cpu);
886 } else {
887 CPU_FOREACH(cpu) {
888 total += kvm_dirty_ring_reap_one(s, cpu);
889 }
890 }
891
892 if (total) {
893 ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
894 assert(ret == total);
895 }
896
897 stamp = get_clock() - stamp;
898
899 if (total) {
900 trace_kvm_dirty_ring_reap(total, stamp / 1000);
901 }
902
903 return total;
904 }
905
906 /*
907 * Currently, for simplicity, we must hold the BQL before calling this. We
908 * can consider dropping that requirement once the race conditions are clear.
909 */
910 static uint64_t kvm_dirty_ring_reap(KVMState *s, CPUState *cpu)
911 {
912 uint64_t total;
913
914 /*
915 * We need to lock all kvm slots for all address spaces here,
916 * because:
917 *
918 * (1) We need to mark dirty for dirty bitmaps in multiple slots
919 * and for tons of pages, so it's better to take the lock here
920 * once rather than once per page. And more importantly,
921 *
922 * (2) We must _NOT_ publish dirty bits to the other threads
923 * (e.g., the migration thread) via the kvm memory slot dirty
924 * bitmaps before correctly re-protect those dirtied pages.
925 * Otherwise we can have potential risk of data corruption if
926 * the page data is read in the other thread before we do
927 * reset below.
928 */
929 kvm_slots_lock();
930 total = kvm_dirty_ring_reap_locked(s, cpu);
931 kvm_slots_unlock();
932
933 return total;
934 }
935
936 static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
937 {
938 /* No need to do anything */
939 }
940
941 /*
942 * Kick all vcpus out in a synchronized way. When returned, we
943 * guarantee that every vcpu has been kicked and at least returned to
944 * userspace once.
945 */
946 static void kvm_cpu_synchronize_kick_all(void)
947 {
948 CPUState *cpu;
949
950 CPU_FOREACH(cpu) {
951 run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
952 }
953 }
954
955 /*
956 * Flush all the existing dirty pages to the KVM slot buffers. When
957 * this call returns, we guarantee that all the touched dirty pages
958 * before calling this function have been put into the per-kvmslot
959 * dirty bitmap.
960 *
961 * This function must be called with BQL held.
962 */
963 static void kvm_dirty_ring_flush(void)
964 {
965 trace_kvm_dirty_ring_flush(0);
966 /*
967 * The function needs to be serialized. Since this function
968 * should always be called with the BQL held, serialization is guaranteed.
969 * However, let's be sure of it.
970 */
971 assert(bql_locked());
972 /*
973 * First make sure to flush the hardware buffers by kicking all
974 * vcpus out in a synchronous way.
975 */
976 kvm_cpu_synchronize_kick_all();
977 kvm_dirty_ring_reap(kvm_state, NULL);
978 trace_kvm_dirty_ring_flush(1);
979 }
980
981 /**
982 * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
983 *
984 * This function will first try to fetch dirty bitmap from the kernel,
985 * and then updates qemu's dirty bitmap.
986 *
987 * NOTE: the caller must hold kml->slots_lock.
988 *
989 * @kml: the KVM memory listener object
990 * @section: the memory section to sync the dirty bitmap with
991 */
992 static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
993 MemoryRegionSection *section)
994 {
995 KVMState *s = kvm_state;
996 KVMSlot *mem;
997 hwaddr start_addr, size;
998 hwaddr slot_size;
999
1000 size = kvm_align_section(section, &start_addr);
1001 while (size) {
1002 slot_size = MIN(kvm_max_slot_size, size);
1003 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
1004 if (!mem) {
1005 /* We don't have a slot if we want to trap every access. */
1006 return;
1007 }
1008 if (kvm_slot_get_dirty_log(s, mem)) {
1009 kvm_slot_sync_dirty_pages(mem);
1010 }
1011 start_addr += slot_size;
1012 size -= slot_size;
1013 }
1014 }
1015
1016 /* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */
1017 #define KVM_CLEAR_LOG_SHIFT 6
1018 #define KVM_CLEAR_LOG_ALIGN (qemu_real_host_page_size() << KVM_CLEAR_LOG_SHIFT)
1019 #define KVM_CLEAR_LOG_MASK (-KVM_CLEAR_LOG_ALIGN)
1020
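/*
 * Clear dirty bits for a range within one memslot, both in the kernel via
 * KVM_CLEAR_DIRTY_LOG and in the cached KVMSlot.dirty_bmap.  The range is
 * widened as needed to satisfy the kernel's 64-page alignment requirement.
 */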
1021 static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
1022 uint64_t size)
1023 {
1024 KVMState *s = kvm_state;
1025 uint64_t end, bmap_start, start_delta, bmap_npages;
1026 struct kvm_clear_dirty_log d;
1027 unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size();
1028 int ret;
1029
1030 /*
1031 * We need to extend either the start or the size or both to
1032 * satisfy the KVM interface requirement. Firstly, do the start
1033 * page alignment on 64 host pages
1034 */
1035 bmap_start = start & KVM_CLEAR_LOG_MASK;
1036 start_delta = start - bmap_start;
1037 bmap_start /= psize;
1038
1039 /*
1040 * The kernel interface has restriction on the size too, that either:
1041 *
1042 * (1) the size is 64 host pages aligned (just like the start), or
1043 * (2) the size fills up until the end of the KVM memslot.
1044 */
1045 bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN)
1046 << KVM_CLEAR_LOG_SHIFT;
1047 end = mem->memory_size / psize;
1048 if (bmap_npages > end - bmap_start) {
1049 bmap_npages = end - bmap_start;
1050 }
1051 start_delta /= psize;
1052
1053 /*
1054 * Prepare the bitmap to clear dirty bits. Here we must guarantee
1055 * that we won't clear any unknown dirty bits otherwise we might
1056 * accidentally clear some set bits which are not yet synced from
1057 * the kernel into QEMU's bitmap, then we'll lose track of the
1058 * guest modifications upon those pages (which can directly lead
1059 * to guest data loss or panic after migration).
1060 *
1061 * Layout of the KVMSlot.dirty_bmap:
1062 *
1063 * |<-------- bmap_npages -----------..>|
1064 * [1]
1065 * start_delta size
1066 * |----------------|-------------|------------------|------------|
1067 * ^ ^ ^ ^
1068 * | | | |
1069 * start bmap_start (start) end
1070 * of memslot of memslot
1071 *
1072 * [1] bmap_npages can be aligned to either 64 pages or the end of slot
1073 */
1074
1075 assert(bmap_start % BITS_PER_LONG == 0);
1076 /* We should never do log_clear before log_sync */
1077 assert(mem->dirty_bmap);
1078 if (start_delta || bmap_npages - size / psize) {
1079 /* Slow path - we need to manipulate a temp bitmap */
1080 bmap_clear = bitmap_new(bmap_npages);
1081 bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap,
1082 bmap_start, start_delta + size / psize);
1083 /*
1084 * We need to fill the holes at start because that was not
1085 * specified by the caller and we extended the bitmap only for
1086 * 64 pages alignment
1087 */
1088 bitmap_clear(bmap_clear, 0, start_delta);
1089 d.dirty_bitmap = bmap_clear;
1090 } else {
1091 /*
1092 * Fast path - both start and size align well with BITS_PER_LONG
1093 * (or the end of memory slot)
1094 */
1095 d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start);
1096 }
1097
1098 d.first_page = bmap_start;
1099 /* It should never overflow. If it happens, say something */
1100 assert(bmap_npages <= UINT32_MAX);
1101 d.num_pages = bmap_npages;
1102 d.slot = mem->slot | (as_id << 16);
1103
1104 ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d);
1105 if (ret < 0 && ret != -ENOENT) {
1106 error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
1107 "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d",
1108 __func__, d.slot, (uint64_t)d.first_page,
1109 (uint32_t)d.num_pages, ret);
1110 } else {
1111 ret = 0;
1112 trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages);
1113 }
1114
1115 /*
1116 * After we have updated the remote dirty bitmap, we update the
1117 * cached bitmap as well for the memslot, then if another user
1118 * clears the same region we know we shouldn't clear it again on
1119 * the remote otherwise it's data loss as well.
1120 */
1121 bitmap_clear(mem->dirty_bmap, bmap_start + start_delta,
1122 size / psize);
1123 /* This handles the NULL case well */
1124 g_free(bmap_clear);
1125 return ret;
1126 }
1127
1128
1129 /**
1130 * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
1131 *
1132 * NOTE: this will be a no-op if we haven't enabled manual dirty log
1133 * protection in the host kernel because in that case this operation
1134 * will be done within log_sync().
1135 *
1136 * @kml: the kvm memory listener
1137 * @section: the memory range to clear dirty bitmap
1138 */
1139 static int kvm_physical_log_clear(KVMMemoryListener *kml,
1140 MemoryRegionSection *section)
1141 {
1142 KVMState *s = kvm_state;
1143 uint64_t start, size, offset, count;
1144 KVMSlot *mem;
1145 int ret = 0, i;
1146
1147 if (!s->manual_dirty_log_protect) {
1148 /* No need to do explicit clear */
1149 return ret;
1150 }
1151
1152 start = section->offset_within_address_space;
1153 size = int128_get64(section->size);
1154
1155 if (!size) {
1156 /* Nothing more we can do... */
1157 return ret;
1158 }
1159
1160 kvm_slots_lock();
1161
1162 for (i = 0; i < kml->nr_slots_allocated; i++) {
1163 mem = &kml->slots[i];
1164 /* Discard slots that are empty or do not overlap the section */
1165 if (!mem->memory_size ||
1166 mem->start_addr > start + size - 1 ||
1167 start > mem->start_addr + mem->memory_size - 1) {
1168 continue;
1169 }
1170
1171 if (start >= mem->start_addr) {
1172 /* The slot starts before section or is aligned to it. */
1173 offset = start - mem->start_addr;
1174 count = MIN(mem->memory_size - offset, size);
1175 } else {
1176 /* The slot starts after section. */
1177 offset = 0;
1178 count = MIN(mem->memory_size, size - (mem->start_addr - start));
1179 }
1180 ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count);
1181 if (ret < 0) {
1182 break;
1183 }
1184 }
1185
1186 kvm_slots_unlock();
1187
1188 return ret;
1189 }
1190
1191 static void kvm_coalesce_mmio_region(MemoryListener *listener,
1192 MemoryRegionSection *section,
1193 hwaddr start, hwaddr size)
1194 {
1195 KVMState *s = kvm_state;
1196
1197 if (s->coalesced_mmio) {
1198 struct kvm_coalesced_mmio_zone zone;
1199
1200 zone.addr = start;
1201 zone.size = size;
1202 zone.pad = 0;
1203
1204 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1205 }
1206 }
1207
1208 static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
1209 MemoryRegionSection *section,
1210 hwaddr start, hwaddr size)
1211 {
1212 KVMState *s = kvm_state;
1213
1214 if (s->coalesced_mmio) {
1215 struct kvm_coalesced_mmio_zone zone;
1216
1217 zone.addr = start;
1218 zone.size = size;
1219 zone.pad = 0;
1220
1221 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1222 }
1223 }
1224
1225 static void kvm_coalesce_pio_add(MemoryListener *listener,
1226 MemoryRegionSection *section,
1227 hwaddr start, hwaddr size)
1228 {
1229 KVMState *s = kvm_state;
1230
1231 if (s->coalesced_pio) {
1232 struct kvm_coalesced_mmio_zone zone;
1233
1234 zone.addr = start;
1235 zone.size = size;
1236 zone.pio = 1;
1237
1238 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1239 }
1240 }
1241
1242 static void kvm_coalesce_pio_del(MemoryListener *listener,
1243 MemoryRegionSection *section,
1244 hwaddr start, hwaddr size)
1245 {
1246 KVMState *s = kvm_state;
1247
1248 if (s->coalesced_pio) {
1249 struct kvm_coalesced_mmio_zone zone;
1250
1251 zone.addr = start;
1252 zone.size = size;
1253 zone.pio = 1;
1254
1255 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1256 }
1257 }
1258
1259 int kvm_check_extension(KVMState *s, unsigned int extension)
1260 {
1261 int ret;
1262
1263 ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
1264 if (ret < 0) {
1265 ret = 0;
1266 }
1267
1268 return ret;
1269 }
1270
1271 int kvm_vm_check_extension(KVMState *s, unsigned int extension)
1272 {
1273 int ret;
1274
1275 ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
1276 if (ret < 0) {
1277 /* VM wide version not implemented, use global one instead */
1278 ret = kvm_check_extension(s, extension);
1279 }
1280
1281 return ret;
1282 }
1283
1284 /*
1285 * We track the poisoned pages to be able to:
1286 * - replace them on VM reset
1287 * - block a migration for a VM with a poisoned page
1288 */
1289 typedef struct HWPoisonPage {
1290 ram_addr_t ram_addr;
1291 QLIST_ENTRY(HWPoisonPage) list;
1292 } HWPoisonPage;
1293
1294 static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
1295 QLIST_HEAD_INITIALIZER(hwpoison_page_list);
1296
1297 static void kvm_unpoison_all(void *param)
1298 {
1299 HWPoisonPage *page, *next_page;
1300
1301 QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
1302 QLIST_REMOVE(page, list);
1303 qemu_ram_remap(page->ram_addr);
1304 g_free(page);
1305 }
1306 }
1307
1308 void kvm_hwpoison_page_add(ram_addr_t ram_addr)
1309 {
1310 HWPoisonPage *page;
1311
1312 QLIST_FOREACH(page, &hwpoison_page_list, list) {
1313 if (page->ram_addr == ram_addr) {
1314 return;
1315 }
1316 }
1317 page = g_new(HWPoisonPage, 1);
1318 page->ram_addr = ram_addr;
1319 QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
1320 }
1321
1322 bool kvm_hwpoisoned_mem(void)
1323 {
1324 return !QLIST_EMPTY(&hwpoison_page_list);
1325 }
1326
1327 static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
1328 {
1329 if (target_needs_bswap()) {
1330 /*
1331 * The kernel expects ioeventfd values in HOST_BIG_ENDIAN
1332 * endianness, but the memory core hands them in target endianness.
1333 * For example, PPC is always treated as big-endian even if running
1334 * on KVM and on PPC64LE. Correct here, swapping back.
1335 */
1336 switch (size) {
1337 case 2:
1338 val = bswap16(val);
1339 break;
1340 case 4:
1341 val = bswap32(val);
1342 break;
1343 }
1344 }
1345 return val;
1346 }
1347
1348 static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
1349 bool assign, uint32_t size, bool datamatch)
1350 {
1351 int ret;
1352 struct kvm_ioeventfd iofd = {
1353 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
1354 .addr = addr,
1355 .len = size,
1356 .flags = 0,
1357 .fd = fd,
1358 };
1359
1360 trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size,
1361 datamatch);
1362 if (!kvm_enabled()) {
1363 return -ENOSYS;
1364 }
1365
1366 if (datamatch) {
1367 iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
1368 }
1369 if (!assign) {
1370 iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1371 }
1372
1373 ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
1374
1375 if (ret < 0) {
1376 return -errno;
1377 }
1378
1379 return 0;
1380 }
1381
1382 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
1383 bool assign, uint32_t size, bool datamatch)
1384 {
1385 struct kvm_ioeventfd kick = {
1386 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
1387 .addr = addr,
1388 .flags = KVM_IOEVENTFD_FLAG_PIO,
1389 .len = size,
1390 .fd = fd,
1391 };
1392 int r;
1393 trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch);
1394 if (!kvm_enabled()) {
1395 return -ENOSYS;
1396 }
1397 if (datamatch) {
1398 kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
1399 }
1400 if (!assign) {
1401 kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1402 }
1403 r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
1404 if (r < 0) {
1405 return r;
1406 }
1407 return 0;
1408 }
1409
1410
1411 static const KVMCapabilityInfo *
1412 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
1413 {
1414 while (list->name) {
1415 if (!kvm_check_extension(s, list->value)) {
1416 return list;
1417 }
1418 list++;
1419 }
1420 return NULL;
1421 }
1422
1423 void kvm_set_max_memslot_size(hwaddr max_slot_size)
1424 {
1425 g_assert(
1426 ROUND_UP(max_slot_size, qemu_real_host_page_size()) == max_slot_size
1427 );
1428 kvm_max_slot_size = max_slot_size;
1429 }
1430
1431 static int kvm_set_memory_attributes(hwaddr start, uint64_t size, uint64_t attr)
1432 {
1433 struct kvm_memory_attributes attrs;
1434 int r;
1435
1436 assert((attr & kvm_supported_memory_attributes) == attr);
1437 attrs.attributes = attr;
1438 attrs.address = start;
1439 attrs.size = size;
1440 attrs.flags = 0;
1441
1442 r = kvm_vm_ioctl(kvm_state, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
1443 if (r) {
1444 error_report("failed to set memory (0x%" HWADDR_PRIx "+0x%" PRIx64 ") "
1445 "with attr 0x%" PRIx64 " error '%s'",
1446 start, size, attr, strerror(errno));
1447 }
1448 return r;
1449 }
1450
1451 int kvm_set_memory_attributes_private(hwaddr start, uint64_t size)
1452 {
1453 return kvm_set_memory_attributes(start, size, KVM_MEMORY_ATTRIBUTE_PRIVATE);
1454 }
1455
1456 int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size)
1457 {
1458 return kvm_set_memory_attributes(start, size, 0);
1459 }
1460
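/*
 * Register or unregister the KVM memslot(s) backing a memory section,
 * splitting the section into chunks of at most kvm_max_slot_size.  On
 * removal, dirty bits are synced on a best-effort basis before the slot
 * disappears.
 */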
1461 /* Called with KVMMemoryListener.slots_lock held */
1462 static void kvm_set_phys_mem(KVMMemoryListener *kml,
1463 MemoryRegionSection *section, bool add)
1464 {
1465 KVMSlot *mem;
1466 int err;
1467 MemoryRegion *mr = section->mr;
1468 bool writable = !mr->readonly && !mr->rom_device;
1469 hwaddr start_addr, size, slot_size, mr_offset;
1470 ram_addr_t ram_start_offset;
1471 void *ram;
1472
1473 if (!memory_region_is_ram(mr)) {
1474 if (writable || !kvm_readonly_mem_allowed) {
1475 return;
1476 } else if (!mr->romd_mode) {
1477 /* If the memory device is not in romd_mode, then we actually want
1478 * to remove the kvm memory slot so all accesses will trap. */
1479 add = false;
1480 }
1481 }
1482
1483 size = kvm_align_section(section, &start_addr);
1484 if (!size) {
1485 return;
1486 }
1487
1488 /* The offset of the kvmslot within the memory region */
1489 mr_offset = section->offset_within_region + start_addr -
1490 section->offset_within_address_space;
1491
1492 /* use aligned delta to align the ram address and offset */
1493 ram = memory_region_get_ram_ptr(mr) + mr_offset;
1494 ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;
1495
1496 if (!add) {
1497 do {
1498 slot_size = MIN(kvm_max_slot_size, size);
1499 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
1500 if (!mem) {
1501 return;
1502 }
1503 if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1504 /*
1505 * NOTE: we only do a best-effort sync of dirty bits here. No
1506 * matter whether we're using the dirty log or the dirty ring, we
1507 * ignore two facts:
1508 *
1509 * (1) dirty bits can reside in hardware buffers (PML)
1510 *
1511 * (2) after we collected dirty bits here, pages can be dirtied
1512 * again before we do the final KVM_SET_USER_MEMORY_REGION to
1513 * remove the slot.
1514 *
1515 * Not easy. Let's cross our fingers until it's fixed.
1516 */
1517 if (kvm_state->kvm_dirty_ring_size) {
1518 kvm_dirty_ring_reap_locked(kvm_state, NULL);
1519 if (kvm_state->kvm_dirty_ring_with_bitmap) {
1520 kvm_slot_sync_dirty_pages(mem);
1521 kvm_slot_get_dirty_log(kvm_state, mem);
1522 }
1523 } else {
1524 kvm_slot_get_dirty_log(kvm_state, mem);
1525 }
1526 kvm_slot_sync_dirty_pages(mem);
1527 }
1528
1529 /* unregister the slot */
1530 g_free(mem->dirty_bmap);
1531 mem->dirty_bmap = NULL;
1532 mem->memory_size = 0;
1533 mem->flags = 0;
1534 err = kvm_set_user_memory_region(kml, mem, false);
1535 if (err) {
1536 fprintf(stderr, "%s: error unregistering slot: %s\n",
1537 __func__, strerror(-err));
1538 abort();
1539 }
1540 start_addr += slot_size;
1541 size -= slot_size;
1542 kml->nr_slots_used--;
1543 } while (size);
1544 return;
1545 }
1546
1547 /* register the new slot */
1548 do {
1549 slot_size = MIN(kvm_max_slot_size, size);
1550 mem = kvm_alloc_slot(kml);
1551 mem->as_id = kml->as_id;
1552 mem->memory_size = slot_size;
1553 mem->start_addr = start_addr;
1554 mem->ram_start_offset = ram_start_offset;
1555 mem->ram = ram;
1556 mem->flags = kvm_mem_flags(mr);
1557 mem->guest_memfd = mr->ram_block->guest_memfd;
1558 mem->guest_memfd_offset = (uint8_t*)ram - mr->ram_block->host;
1559
1560 kvm_slot_init_dirty_bitmap(mem);
1561 err = kvm_set_user_memory_region(kml, mem, true);
1562 if (err) {
1563 fprintf(stderr, "%s: error registering slot: %s\n", __func__,
1564 strerror(-err));
1565 abort();
1566 }
1567
1568 if (memory_region_has_guest_memfd(mr)) {
1569 err = kvm_set_memory_attributes_private(start_addr, slot_size);
1570 if (err) {
1571 error_report("%s: failed to set memory attribute private: %s",
1572 __func__, strerror(-err));
1573 exit(1);
1574 }
1575 }
1576
1577 start_addr += slot_size;
1578 ram_start_offset += slot_size;
1579 ram += slot_size;
1580 size -= slot_size;
1581 kml->nr_slots_used++;
1582 } while (size);
1583 }
1584
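/*
 * Background thread that reaps the per-vCPU dirty rings roughly once per
 * second so they do not fill up, skipping iterations while the dirty page
 * rate limiter (dirtylimit) is in service.
 */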
1585 static void *kvm_dirty_ring_reaper_thread(void *data)
1586 {
1587 KVMState *s = data;
1588 struct KVMDirtyRingReaper *r = &s->reaper;
1589
1590 rcu_register_thread();
1591
1592 trace_kvm_dirty_ring_reaper("init");
1593
1594 while (true) {
1595 r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
1596 trace_kvm_dirty_ring_reaper("wait");
1597 /*
1598 * TODO: provide a smarter timeout rather than a constant?
1599 */
1600 sleep(1);
1601
1602 /* keep sleeping so that the reaper does not interfere with dirtylimit */
1603 if (dirtylimit_in_service()) {
1604 continue;
1605 }
1606
1607 trace_kvm_dirty_ring_reaper("wakeup");
1608 r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
1609
1610 bql_lock();
1611 kvm_dirty_ring_reap(s, NULL);
1612 bql_unlock();
1613
1614 r->reaper_iteration++;
1615 }
1616
1617 g_assert_not_reached();
1618 }
1619
1620 static void kvm_dirty_ring_reaper_init(KVMState *s)
1621 {
1622 struct KVMDirtyRingReaper *r = &s->reaper;
1623
1624 qemu_thread_create(&r->reaper_thr, "kvm-reaper",
1625 kvm_dirty_ring_reaper_thread,
1626 s, QEMU_THREAD_JOINABLE);
1627 }
1628
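/*
 * Probe for and enable the KVM dirty ring (plus its optional backup
 * bitmap).  Returns 0 with kvm_dirty_ring_size cleared when the feature is
 * unavailable, so callers fall back to the dirty bitmap method.
 */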
1629 static int kvm_dirty_ring_init(KVMState *s)
1630 {
1631 uint32_t ring_size = s->kvm_dirty_ring_size;
1632 uint64_t ring_bytes = ring_size * sizeof(struct kvm_dirty_gfn);
1633 unsigned int capability = KVM_CAP_DIRTY_LOG_RING;
1634 int ret;
1635
1636 s->kvm_dirty_ring_size = 0;
1637 s->kvm_dirty_ring_bytes = 0;
1638
1639 /* Bail if the dirty ring size isn't specified */
1640 if (!ring_size) {
1641 return 0;
1642 }
1643
1644 /*
1645 * Read the max supported pages. Fall back to dirty logging mode
1646 * if the dirty ring isn't supported.
1647 */
1648 ret = kvm_vm_check_extension(s, capability);
1649 if (ret <= 0) {
1650 capability = KVM_CAP_DIRTY_LOG_RING_ACQ_REL;
1651 ret = kvm_vm_check_extension(s, capability);
1652 }
1653
1654 if (ret <= 0) {
1655 warn_report("KVM dirty ring not available, using bitmap method");
1656 return 0;
1657 }
1658
1659 if (ring_bytes > ret) {
1660 error_report("KVM dirty ring size %" PRIu32 " too big "
1661 "(maximum is %ld). Please use a smaller value.",
1662 ring_size, (long)ret / sizeof(struct kvm_dirty_gfn));
1663 return -EINVAL;
1664 }
1665
1666 ret = kvm_vm_enable_cap(s, capability, 0, ring_bytes);
1667 if (ret) {
1668 error_report("Enabling of KVM dirty ring failed: %s. "
1669 "Suggested minimum value is 1024.", strerror(-ret));
1670 return -EIO;
1671 }
1672
1673 /* Enable the backup bitmap if it is supported */
1674 ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP);
1675 if (ret > 0) {
1676 ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP, 0);
1677 if (ret) {
1678 error_report("Enabling of KVM dirty ring's backup bitmap failed: "
1679 "%s. ", strerror(-ret));
1680 return -EIO;
1681 }
1682
1683 s->kvm_dirty_ring_with_bitmap = true;
1684 }
1685
1686 s->kvm_dirty_ring_size = ring_size;
1687 s->kvm_dirty_ring_bytes = ring_bytes;
1688
1689 return 0;
1690 }
1691
1692 static void kvm_region_add(MemoryListener *listener,
1693 MemoryRegionSection *section)
1694 {
1695 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1696 KVMMemoryUpdate *update;
1697
1698 update = g_new0(KVMMemoryUpdate, 1);
1699 update->section = *section;
1700
1701 QSIMPLEQ_INSERT_TAIL(&kml->transaction_add, update, next);
1702 }
1703
1704 static void kvm_region_del(MemoryListener *listener,
1705 MemoryRegionSection *section)
1706 {
1707 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1708 KVMMemoryUpdate *update;
1709
1710 update = g_new0(KVMMemoryUpdate, 1);
1711 update->section = *section;
1712
1713 QSIMPLEQ_INSERT_TAIL(&kml->transaction_del, update, next);
1714 }
1715
1716 static void kvm_region_commit(MemoryListener *listener)
1717 {
1718 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
1719 listener);
1720 KVMMemoryUpdate *u1, *u2;
1721 bool need_inhibit = false;
1722
1723 if (QSIMPLEQ_EMPTY(&kml->transaction_add) &&
1724 QSIMPLEQ_EMPTY(&kml->transaction_del)) {
1725 return;
1726 }
1727
1728 /*
1729 * We have to be careful when regions to add overlap with ranges to remove.
1730 * We have to simulate atomic KVM memslot updates by making sure no ioctl()
1731 * is currently active.
1732 *
1733 * The lists are ordered by address, so it's easy to find overlaps.
1734 */
1735 u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
1736 u2 = QSIMPLEQ_FIRST(&kml->transaction_add);
1737 while (u1 && u2) {
1738 Range r1, r2;
1739
1740 range_init_nofail(&r1, u1->section.offset_within_address_space,
1741 int128_get64(u1->section.size));
1742 range_init_nofail(&r2, u2->section.offset_within_address_space,
1743 int128_get64(u2->section.size));
1744
1745 if (range_overlaps_range(&r1, &r2)) {
1746 need_inhibit = true;
1747 break;
1748 }
1749 if (range_lob(&r1) < range_lob(&r2)) {
1750 u1 = QSIMPLEQ_NEXT(u1, next);
1751 } else {
1752 u2 = QSIMPLEQ_NEXT(u2, next);
1753 }
1754 }
1755
1756 kvm_slots_lock();
1757 if (need_inhibit) {
1758 accel_ioctl_inhibit_begin();
1759 }
1760
1761 /* Remove all memslots before adding the new ones. */
1762 while (!QSIMPLEQ_EMPTY(&kml->transaction_del)) {
1763 u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
1764 QSIMPLEQ_REMOVE_HEAD(&kml->transaction_del, next);
1765
1766 kvm_set_phys_mem(kml, &u1->section, false);
1767 memory_region_unref(u1->section.mr);
1768
1769 g_free(u1);
1770 }
1771 while (!QSIMPLEQ_EMPTY(&kml->transaction_add)) {
1772 u1 = QSIMPLEQ_FIRST(&kml->transaction_add);
1773 QSIMPLEQ_REMOVE_HEAD(&kml->transaction_add, next);
1774
1775 memory_region_ref(u1->section.mr);
1776 kvm_set_phys_mem(kml, &u1->section, true);
1777
1778 g_free(u1);
1779 }
1780
1781 if (need_inhibit) {
1782 accel_ioctl_inhibit_end();
1783 }
1784 kvm_slots_unlock();
1785 }
1786
1787 static void kvm_log_sync(MemoryListener *listener,
1788 MemoryRegionSection *section)
1789 {
1790 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1791
1792 kvm_slots_lock();
1793 kvm_physical_sync_dirty_bitmap(kml, section);
1794 kvm_slots_unlock();
1795 }
1796
1797 static void kvm_log_sync_global(MemoryListener *l, bool last_stage)
1798 {
1799 KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
1800 KVMState *s = kvm_state;
1801 KVMSlot *mem;
1802 int i;
1803
1804 /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
1805 kvm_dirty_ring_flush();
1806
1807 kvm_slots_lock();
1808 for (i = 0; i < kml->nr_slots_allocated; i++) {
1809 mem = &kml->slots[i];
1810 if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1811 kvm_slot_sync_dirty_pages(mem);
1812
1813 if (s->kvm_dirty_ring_with_bitmap && last_stage &&
1814 kvm_slot_get_dirty_log(s, mem)) {
1815 kvm_slot_sync_dirty_pages(mem);
1816 }
1817
1818 /*
1819 * This is not needed by KVM_GET_DIRTY_LOG because the
1820 * ioctl will unconditionally overwrite the whole region.
1821 * However kvm dirty ring has no such side effect.
1822 */
1823 kvm_slot_reset_dirty_pages(mem);
1824 }
1825 }
1826 kvm_slots_unlock();
1827 }
1828
1829 static void kvm_log_clear(MemoryListener *listener,
1830 MemoryRegionSection *section)
1831 {
1832 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1833 int r;
1834
1835 r = kvm_physical_log_clear(kml, section);
1836 if (r < 0) {
1837 error_report_once("%s: kvm log clear failed: mr=%s "
1838 "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__,
1839 section->mr->name, section->offset_within_region,
1840 int128_get64(section->size));
1841 abort();
1842 }
1843 }
1844
1845 static void kvm_mem_ioeventfd_add(MemoryListener *listener,
1846 MemoryRegionSection *section,
1847 bool match_data, uint64_t data,
1848 EventNotifier *e)
1849 {
1850 int fd = event_notifier_get_fd(e);
1851 int r;
1852
1853 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1854 data, true, int128_get64(section->size),
1855 match_data);
1856 if (r < 0) {
1857 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1858 __func__, strerror(-r), -r);
1859 abort();
1860 }
1861 }
1862
1863 static void kvm_mem_ioeventfd_del(MemoryListener *listener,
1864 MemoryRegionSection *section,
1865 bool match_data, uint64_t data,
1866 EventNotifier *e)
1867 {
1868 int fd = event_notifier_get_fd(e);
1869 int r;
1870
1871 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1872 data, false, int128_get64(section->size),
1873 match_data);
1874 if (r < 0) {
1875 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1876 __func__, strerror(-r), -r);
1877 abort();
1878 }
1879 }
1880
1881 static void kvm_io_ioeventfd_add(MemoryListener *listener,
1882 MemoryRegionSection *section,
1883 bool match_data, uint64_t data,
1884 EventNotifier *e)
1885 {
1886 int fd = event_notifier_get_fd(e);
1887 int r;
1888
1889 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1890 data, true, int128_get64(section->size),
1891 match_data);
1892 if (r < 0) {
1893 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1894 __func__, strerror(-r), -r);
1895 abort();
1896 }
1897 }
1898
1899 static void kvm_io_ioeventfd_del(MemoryListener *listener,
1900 MemoryRegionSection *section,
1901 bool match_data, uint64_t data,
1902 EventNotifier *e)
1904 {
1905 int fd = event_notifier_get_fd(e);
1906 int r;
1907
1908 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1909 data, false, int128_get64(section->size),
1910 match_data);
1911 if (r < 0) {
1912 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1913 __func__, strerror(-r), -r);
1914 abort();
1915 }
1916 }
1917
1918 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
1919 AddressSpace *as, int as_id, const char *name)
1920 {
1921 int i;
1922
1923 kml->as_id = as_id;
1924
1925 kvm_slots_grow(kml, KVM_MEMSLOTS_NR_ALLOC_DEFAULT);
1926
1927 QSIMPLEQ_INIT(&kml->transaction_add);
1928 QSIMPLEQ_INIT(&kml->transaction_del);
1929
1930 kml->listener.region_add = kvm_region_add;
1931 kml->listener.region_del = kvm_region_del;
1932 kml->listener.commit = kvm_region_commit;
1933 kml->listener.log_start = kvm_log_start;
1934 kml->listener.log_stop = kvm_log_stop;
1935 kml->listener.priority = MEMORY_LISTENER_PRIORITY_ACCEL;
1936 kml->listener.name = name;
1937
1938 if (s->kvm_dirty_ring_size) {
1939 kml->listener.log_sync_global = kvm_log_sync_global;
1940 } else {
1941 kml->listener.log_sync = kvm_log_sync;
1942 kml->listener.log_clear = kvm_log_clear;
1943 }
1944
1945 memory_listener_register(&kml->listener, as);
1946
1947 for (i = 0; i < s->nr_as; ++i) {
1948 if (!s->as[i].as) {
1949 s->as[i].as = as;
1950 s->as[i].ml = kml;
1951 break;
1952 }
1953 }
1954 }
1955
1956 static MemoryListener kvm_io_listener = {
1957 .name = "kvm-io",
1958 .coalesced_io_add = kvm_coalesce_pio_add,
1959 .coalesced_io_del = kvm_coalesce_pio_del,
1960 .eventfd_add = kvm_io_ioeventfd_add,
1961 .eventfd_del = kvm_io_ioeventfd_del,
1962 .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND,
1963 };
1964
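/*
 * Raise or lower an in-kernel irqchip line.  Returns 1 when the plain
 * KVM_IRQ_LINE ioctl is in use, otherwise the injection status reported
 * by KVM_IRQ_LINE_STATUS.
 */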
1965 int kvm_set_irq(KVMState *s, int irq, int level)
1966 {
1967 struct kvm_irq_level event;
1968 int ret;
1969
1970 assert(kvm_async_interrupts_enabled());
1971
1972 event.level = level;
1973 event.irq = irq;
1974 ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
1975 if (ret < 0) {
1976 perror("kvm_set_irq");
1977 abort();
1978 }
1979
1980 return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
1981 }
1982
1983 #ifdef KVM_CAP_IRQ_ROUTING
1984 typedef struct KVMMSIRoute {
1985 struct kvm_irq_routing_entry kroute;
1986 QTAILQ_ENTRY(KVMMSIRoute) entry;
1987 } KVMMSIRoute;
1988
1989 static void set_gsi(KVMState *s, unsigned int gsi)
1990 {
1991 set_bit(gsi, s->used_gsi_bitmap);
1992 }
1993
1994 static void clear_gsi(KVMState *s, unsigned int gsi)
1995 {
1996 clear_bit(gsi, s->used_gsi_bitmap);
1997 }
1998
1999 void kvm_init_irq_routing(KVMState *s)
2000 {
2001 int gsi_count;
2002
2003 gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
2004 if (gsi_count > 0) {
2005 /* Round up so we can search ints using ffs */
2006 s->used_gsi_bitmap = bitmap_new(gsi_count);
2007 s->gsi_count = gsi_count;
2008 }
2009
2010 s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
2011 s->nr_allocated_irq_routes = 0;
2012
2013 kvm_arch_init_irq_routing(s);
2014 }
2015
2016 void kvm_irqchip_commit_routes(KVMState *s)
2017 {
2018 int ret;
2019
2020 if (kvm_gsi_direct_mapping()) {
2021 return;
2022 }
2023
2024 if (!kvm_gsi_routing_enabled()) {
2025 return;
2026 }
2027
2028 s->irq_routes->flags = 0;
2029 trace_kvm_irqchip_commit_routes();
2030 ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
2031 assert(ret == 0);
2032 }
2033
2034 void kvm_add_routing_entry(KVMState *s,
2035 struct kvm_irq_routing_entry *entry)
2036 {
2037 struct kvm_irq_routing_entry *new;
2038 int n, size;
2039
2040 if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
2041 n = s->nr_allocated_irq_routes * 2;
2042 if (n < 64) {
2043 n = 64;
2044 }
2045 size = sizeof(struct kvm_irq_routing);
2046 size += n * sizeof(*new);
2047 s->irq_routes = g_realloc(s->irq_routes, size);
2048 s->nr_allocated_irq_routes = n;
2049 }
2050 n = s->irq_routes->nr++;
2051 new = &s->irq_routes->entries[n];
2052
2053 *new = *entry;
2054
2055 set_gsi(s, entry->gsi);
2056 }
2057
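/*
 * Replace the existing routing entry for new_entry->gsi in place.
 * Returns 0 if an entry with that GSI was found (or was already
 * identical), -ESRCH otherwise.
 */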
2058 static int kvm_update_routing_entry(KVMState *s,
2059 struct kvm_irq_routing_entry *new_entry)
2060 {
2061 struct kvm_irq_routing_entry *entry;
2062 int n;
2063
2064 for (n = 0; n < s->irq_routes->nr; n++) {
2065 entry = &s->irq_routes->entries[n];
2066 if (entry->gsi != new_entry->gsi) {
2067 continue;
2068 }
2069
2070 if (!memcmp(entry, new_entry, sizeof *entry)) {
2071 return 0;
2072 }
2073
2074 *entry = *new_entry;
2075
2076 return 0;
2077 }
2078
2079 return -ESRCH;
2080 }
2081
2082 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
2083 {
2084 struct kvm_irq_routing_entry e = {};
2085
2086 assert(pin < s->gsi_count);
2087
2088 e.gsi = irq;
2089 e.type = KVM_IRQ_ROUTING_IRQCHIP;
2090 e.flags = 0;
2091 e.u.irqchip.irqchip = irqchip;
2092 e.u.irqchip.pin = pin;
2093 kvm_add_routing_entry(s, &e);
2094 }
2095
2096 void kvm_irqchip_release_virq(KVMState *s, int virq)
2097 {
2098 struct kvm_irq_routing_entry *e;
2099 int i;
2100
2101 if (kvm_gsi_direct_mapping()) {
2102 return;
2103 }
2104
2105 for (i = 0; i < s->irq_routes->nr; i++) {
2106 e = &s->irq_routes->entries[i];
2107 if (e->gsi == virq) {
2108 s->irq_routes->nr--;
2109 *e = s->irq_routes->entries[s->irq_routes->nr];
2110 }
2111 }
2112 clear_gsi(s, virq);
2113 kvm_arch_release_virq_post(virq);
2114 trace_kvm_irqchip_release_virq(virq);
2115 }
2116
2117 void kvm_irqchip_add_change_notifier(Notifier *n)
2118 {
2119 notifier_list_add(&kvm_irqchip_change_notifiers, n);
2120 }
2121
2122 void kvm_irqchip_remove_change_notifier(Notifier *n)
2123 {
2124 notifier_remove(n);
2125 }
2126
2127 void kvm_irqchip_change_notify(void)
2128 {
2129 notifier_list_notify(&kvm_irqchip_change_notifiers, NULL);
2130 }
2131
2132 int kvm_irqchip_get_virq(KVMState *s)
2133 {
2134 int next_virq;
2135
2136 /* Return the lowest unused GSI in the bitmap */
2137 next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
2138 if (next_virq >= s->gsi_count) {
2139 return -ENOSPC;
2140 } else {
2141 return next_virq;
2142 }
2143 }
2144
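/* Inject an MSI directly via KVM_SIGNAL_MSI, without allocating a route. */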
2145 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
2146 {
2147 struct kvm_msi msi;
2148
2149 msi.address_lo = (uint32_t)msg.address;
2150 msi.address_hi = msg.address >> 32;
2151 msi.data = le32_to_cpu(msg.data);
2152 msi.flags = 0;
2153 memset(msi.pad, 0, sizeof(msi.pad));
2154
2155 return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
2156 }
2157
2158 int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
2159 {
2160 struct kvm_irq_routing_entry kroute = {};
2161 int virq;
2162 KVMState *s = c->s;
2163 MSIMessage msg = {0, 0};
2164
2165 if (pci_available && dev) {
2166 msg = pci_get_msi_message(dev, vector);
2167 }
2168
2169 if (kvm_gsi_direct_mapping()) {
2170 return kvm_arch_msi_data_to_gsi(msg.data);
2171 }
2172
2173 if (!kvm_gsi_routing_enabled()) {
2174 return -ENOSYS;
2175 }
2176
2177 virq = kvm_irqchip_get_virq(s);
2178 if (virq < 0) {
2179 return virq;
2180 }
2181
2182 kroute.gsi = virq;
2183 kroute.type = KVM_IRQ_ROUTING_MSI;
2184 kroute.flags = 0;
2185 kroute.u.msi.address_lo = (uint32_t)msg.address;
2186 kroute.u.msi.address_hi = msg.address >> 32;
2187 kroute.u.msi.data = le32_to_cpu(msg.data);
2188 if (pci_available && kvm_msi_devid_required()) {
2189 kroute.flags = KVM_MSI_VALID_DEVID;
2190 kroute.u.msi.devid = pci_requester_id(dev);
2191 }
2192 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
2193 kvm_irqchip_release_virq(s, virq);
2194 return -EINVAL;
2195 }
2196
2197 if (s->irq_routes->nr < s->gsi_count) {
2198 trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
2199 vector, virq);
2200
2201 kvm_add_routing_entry(s, &kroute);
2202 kvm_arch_add_msi_route_post(&kroute, vector, dev);
2203 c->changes++;
2204 } else {
2205 kvm_irqchip_release_virq(s, virq);
2206 return -ENOSPC;
2207 }
2208
2209 return virq;
2210 }
2211
2212 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
2213 PCIDevice *dev)
2214 {
2215 struct kvm_irq_routing_entry kroute = {};
2216
2217 if (kvm_gsi_direct_mapping()) {
2218 return 0;
2219 }
2220
2221 if (!kvm_irqchip_in_kernel()) {
2222 return -ENOSYS;
2223 }
2224
2225 kroute.gsi = virq;
2226 kroute.type = KVM_IRQ_ROUTING_MSI;
2227 kroute.flags = 0;
2228 kroute.u.msi.address_lo = (uint32_t)msg.address;
2229 kroute.u.msi.address_hi = msg.address >> 32;
2230 kroute.u.msi.data = le32_to_cpu(msg.data);
2231 if (pci_available && kvm_msi_devid_required()) {
2232 kroute.flags = KVM_MSI_VALID_DEVID;
2233 kroute.u.msi.devid = pci_requester_id(dev);
2234 }
2235 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
2236 return -EINVAL;
2237 }
2238
2239 trace_kvm_irqchip_update_msi_route(virq);
2240
2241 return kvm_update_routing_entry(s, &kroute);
2242 }
2243
2244 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2245 EventNotifier *resample, int virq,
2246 bool assign)
2247 {
2248 int fd = event_notifier_get_fd(event);
2249 int rfd = resample ? event_notifier_get_fd(resample) : -1;
2250
2251 struct kvm_irqfd irqfd = {
2252 .fd = fd,
2253 .gsi = virq,
2254 .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
2255 };
2256
2257 if (rfd != -1) {
2258 assert(assign);
2259 if (kvm_irqchip_is_split()) {
2260 /*
2261 * When the slow irqchip (e.g. IOAPIC) is in the
2262 * userspace, KVM kernel resamplefd will not work because
2263 * the EOI of the interrupt will be delivered to userspace
2264 * instead, so the KVM kernel resamplefd kick will be
2265 * skipped. Userspace here mimics what the kernel
2266 * provides with resamplefd: remember the resamplefd and
2267 * kick it when we receive the EOI of this IRQ.
2268 *
2269 * This is hackery because IOAPIC is mostly bypassed
2270 * (except EOI broadcasts) when irqfd is used. However
2271 * this can bring much performance back for split irqchip
2272 * with INTx IRQs (for VFIO, this gives 93% perf of the
2273 * full fast path, a 46% perf boost compared to
2274 * the INTx slow path).
2275 */
2276 kvm_resample_fd_insert(virq, resample);
2277 } else {
2278 irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
2279 irqfd.resamplefd = rfd;
2280 }
2281 } else if (!assign) {
2282 if (kvm_irqchip_is_split()) {
2283 kvm_resample_fd_remove(virq);
2284 }
2285 }
2286
2287 return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
2288 }
2289
2290 #else /* !KVM_CAP_IRQ_ROUTING */
2291
2292 void kvm_init_irq_routing(KVMState *s)
2293 {
2294 }
2295
2296 void kvm_irqchip_release_virq(KVMState *s, int virq)
2297 {
2298 }
2299
2300 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
2301 {
2302 abort();
2303 }
2304
2305 int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
2306 {
2307 return -ENOSYS;
2308 }
2309
2310 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
2311 {
2312 return -ENOSYS;
2313 }
2314
2315 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
2316 {
2317 return -ENOSYS;
2318 }
2319
2320 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2321 EventNotifier *resample, int virq,
2322 bool assign)
2323 {
2324 abort();
2325 }
2326
2327 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
2328 {
2329 return -ENOSYS;
2330 }
2331 #endif /* !KVM_CAP_IRQ_ROUTING */
2332
2333 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
2334 EventNotifier *rn, int virq)
2335 {
2336 return kvm_irqchip_assign_irqfd(s, n, rn, virq, true);
2337 }
2338
2339 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
2340 int virq)
2341 {
2342 return kvm_irqchip_assign_irqfd(s, n, NULL, virq, false);
2343 }
2344
2345 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
2346 EventNotifier *rn, qemu_irq irq)
2347 {
2348 gpointer key, gsi;
2349 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
2350
2351 if (!found) {
2352 return -ENXIO;
2353 }
2354 return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
2355 }
2356
2357 int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
2358 qemu_irq irq)
2359 {
2360 gpointer key, gsi;
2361 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
2362
2363 if (!found) {
2364 return -ENXIO;
2365 }
2366 return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
2367 }
2368
2369 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
2370 {
2371 g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
2372 }
2373
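/*
 * Probe for and create the in-kernel irqchip.  Requires irqfd support;
 * on success this enables asynchronous interrupt delivery and in-kernel
 * halt, and sets up GSI routing plus the qemu_irq-to-GSI map.
 */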
2374 static void kvm_irqchip_create(KVMState *s)
2375 {
2376 int ret;
2377
2378 assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO);
2379 if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
2380 ;
2381 } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
2382 ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
2383 if (ret < 0) {
2384 fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
2385 exit(1);
2386 }
2387 } else {
2388 return;
2389 }
2390
2391 if (kvm_check_extension(s, KVM_CAP_IRQFD) <= 0) {
2392 fprintf(stderr, "kvm: irqfd not implemented\n");
2393 exit(1);
2394 }
2395
2396 /* First probe and see if there's an arch-specific hook to create the
2397 * in-kernel irqchip for us */
2398 ret = kvm_arch_irqchip_create(s);
2399 if (ret == 0) {
2400 if (s->kernel_irqchip_split == ON_OFF_AUTO_ON) {
2401 error_report("Split IRQ chip mode not supported.");
2402 exit(1);
2403 } else {
2404 ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
2405 }
2406 }
2407 if (ret < 0) {
2408 fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
2409 exit(1);
2410 }
2411
2412 kvm_kernel_irqchip = true;
2413 /* If we have an in-kernel IRQ chip then we must have asynchronous
2414 * interrupt delivery (though the reverse is not necessarily true)
2415 */
2416 kvm_async_interrupts_allowed = true;
2417 kvm_halt_in_kernel_allowed = true;
2418
2419 kvm_init_irq_routing(s);
2420
2421 s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
2422 }
2423
2424 /* Find number of supported CPUs using the recommended
2425 * procedure from the kernel API documentation to cope with
2426 * older kernels that may be missing capabilities.
2427 */
2428 static int kvm_recommended_vcpus(KVMState *s)
2429 {
2430 int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
2431 return (ret) ? ret : 4;
2432 }
2433
2434 static int kvm_max_vcpus(KVMState *s)
2435 {
2436 int ret = kvm_vm_check_extension(s, KVM_CAP_MAX_VCPUS);
2437 return (ret) ? ret : kvm_recommended_vcpus(s);
2438 }
2439
2440 static int kvm_max_vcpu_id(KVMState *s)
2441 {
2442 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
2443 return (ret) ? ret : kvm_max_vcpus(s);
2444 }
2445
2446 bool kvm_vcpu_id_is_valid(int vcpu_id)
2447 {
2448 KVMState *s = KVM_STATE(current_accel());
2449 return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
2450 }
2451
2452 bool kvm_dirty_ring_enabled(void)
2453 {
2454 return kvm_state && kvm_state->kvm_dirty_ring_size;
2455 }
2456
2457 static void query_stats_cb(StatsResultList **result, StatsTarget target,
2458 strList *names, strList *targets, Error **errp);
2459 static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp);
2460
2461 uint32_t kvm_dirty_ring_size(void)
2462 {
2463 return kvm_state->kvm_dirty_ring_size;
2464 }
2465
2466 static int do_kvm_create_vm(MachineState *ms, int type)
2467 {
2468 KVMState *s;
2469 int ret;
2470
2471 s = KVM_STATE(ms->accelerator);
2472
2473 do {
2474 ret = kvm_ioctl(s, KVM_CREATE_VM, type);
2475 } while (ret == -EINTR);
2476
2477 if (ret < 0) {
2478 error_report("ioctl(KVM_CREATE_VM) failed: %s", strerror(-ret));
2479
2480 #ifdef TARGET_S390X
2481 if (ret == -EINVAL) {
2482 error_printf("Host kernel setup problem detected."
2483 " Please verify:\n");
2484 error_printf("- for kernels supporting the"
2485 " switch_amode or user_mode parameters, whether");
2486 error_printf(" user space is running in primary address space\n");
2487 error_printf("- for kernels supporting the vm.allocate_pgste"
2488 " sysctl, whether it is enabled\n");
2489 }
2490 #elif defined(TARGET_PPC)
2491 if (ret == -EINVAL) {
2492 error_printf("PPC KVM module is not loaded. Try modprobe kvm_%s.\n",
2493 (type == 2) ? "pr" : "hv");
2494 }
2495 #endif
2496 }
2497
2498 return ret;
2499 }
2500
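/*
 * Resolve the VM type passed to KVM_CREATE_VM: prefer an explicit
 * "kvm-type" machine property, then the machine class hook, and finally
 * the architecture default.
 */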
2501 static int find_kvm_machine_type(MachineState *ms)
2502 {
2503 MachineClass *mc = MACHINE_GET_CLASS(ms);
2504 int type;
2505
2506 if (object_property_find(OBJECT(current_machine), "kvm-type")) {
2507 g_autofree char *kvm_type;
2508 kvm_type = object_property_get_str(OBJECT(current_machine),
2509 "kvm-type",
2510 &error_abort);
2511 type = mc->kvm_type(ms, kvm_type);
2512 } else if (mc->kvm_type) {
2513 type = mc->kvm_type(ms, NULL);
2514 } else {
2515 type = kvm_arch_get_default_type(ms);
2516 }
2517 return type;
2518 }
2519
2520 static int kvm_setup_dirty_ring(KVMState *s)
2521 {
2522 uint64_t dirty_log_manual_caps;
2523 int ret;
2524
2525 /*
2526 * Enable KVM dirty ring if supported, otherwise fall back to
2527 * dirty logging mode
2528 */
2529 ret = kvm_dirty_ring_init(s);
2530 if (ret < 0) {
2531 return ret;
2532 }
2533
2534 /*
2535 * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
2536 * enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
2537 * page is wr-protected initially, which is against how the kvm dirty ring
2538 * is used - the kvm dirty ring requires all pages to be wr-protected at the
2539 * very beginning. Enabling this feature for the dirty ring causes data corruption.
2540 *
2541 * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log,
2542 * we may expect a higher stall time when starting the migration. In the
2543 * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too:
2544 * instead of clearing dirty bit, it can be a way to explicitly wr-protect
2545 * guest pages.
2546 */
2547 if (!s->kvm_dirty_ring_size) {
2548 dirty_log_manual_caps =
2549 kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
2550 dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
2551 KVM_DIRTY_LOG_INITIALLY_SET);
2552 s->manual_dirty_log_protect = dirty_log_manual_caps;
2553 if (dirty_log_manual_caps) {
2554 ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
2555 dirty_log_manual_caps);
2556 if (ret) {
2557 warn_report("Failed to enable capability %"PRIu64" of "
2558 "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2. "
2559 "Falling back to the legacy mode.",
2560 dirty_log_manual_caps);
2561 s->manual_dirty_log_protect = 0;
2562 }
2563 }
2564 }
2565
2566 return 0;
2567 }
2568
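/*
 * Accelerator init: open the KVM device, check the API version, create
 * the VM, probe capabilities and vcpu limits, set up dirty-page tracking
 * and the in-kernel irqchip, and register the memory listeners.
 */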
2569 static int kvm_init(MachineState *ms)
2570 {
2571 MachineClass *mc = MACHINE_GET_CLASS(ms);
2572 static const char upgrade_note[] =
2573 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
2574 "(see http://sourceforge.net/projects/kvm).\n";
2575 const struct {
2576 const char *name;
2577 int num;
2578 } num_cpus[] = {
2579 { "SMP", ms->smp.cpus },
2580 { "hotpluggable", ms->smp.max_cpus },
2581 { /* end of list */ }
2582 }, *nc = num_cpus;
2583 int soft_vcpus_limit, hard_vcpus_limit;
2584 KVMState *s;
2585 const KVMCapabilityInfo *missing_cap;
2586 int ret;
2587 int type;
2588
2589 qemu_mutex_init(&kml_slots_lock);
2590
2591 s = KVM_STATE(ms->accelerator);
2592
2593 /*
2594 * On systems where the kernel can support different base page
2595 * sizes, host page size may be different from TARGET_PAGE_SIZE,
2596 * even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum
2597 * page size for the system though.
2598 */
2599 assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size());
2600
2601 s->sigmask_len = 8;
2602 accel_blocker_init();
2603
2604 #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
2605 QTAILQ_INIT(&s->kvm_sw_breakpoints);
2606 #endif
2607 QLIST_INIT(&s->kvm_parked_vcpus);
2608 s->fd = qemu_open_old(s->device ?: "/dev/kvm", O_RDWR);
2609 if (s->fd == -1) {
2610 error_report("Could not access KVM kernel module: %m");
2611 ret = -errno;
2612 goto err;
2613 }
2614
2615 ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
2616 if (ret < KVM_API_VERSION) {
2617 if (ret >= 0) {
2618 ret = -EINVAL;
2619 }
2620 error_report("kvm version too old");
2621 goto err;
2622 }
2623
2624 if (ret > KVM_API_VERSION) {
2625 ret = -EINVAL;
2626 error_report("kvm version not supported");
2627 goto err;
2628 }
2629
2630 kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
2631 s->nr_slots_max = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
2632
2633 /* If unspecified, use the default value */
2634 if (!s->nr_slots_max) {
2635 s->nr_slots_max = KVM_MEMSLOTS_NR_MAX_DEFAULT;
2636 }
2637
2638 type = find_kvm_machine_type(ms);
2639 if (type < 0) {
2640 ret = -EINVAL;
2641 goto err;
2642 }
2643
2644 ret = do_kvm_create_vm(ms, type);
2645 if (ret < 0) {
2646 goto err;
2647 }
2648
2649 s->vmfd = ret;
2650
2651 s->nr_as = kvm_vm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
2652 if (s->nr_as <= 1) {
2653 s->nr_as = 1;
2654 }
2655 s->as = g_new0(struct KVMAs, s->nr_as);
2656
2657 /* check the vcpu limits */
2658 soft_vcpus_limit = kvm_recommended_vcpus(s);
2659 hard_vcpus_limit = kvm_max_vcpus(s);
2660
2661 while (nc->name) {
2662 if (nc->num > soft_vcpus_limit) {
2663 warn_report("Number of %s cpus requested (%d) exceeds "
2664 "the recommended cpus supported by KVM (%d)",
2665 nc->name, nc->num, soft_vcpus_limit);
2666
2667 if (nc->num > hard_vcpus_limit) {
2668 error_report("Number of %s cpus requested (%d) exceeds "
2669 "the maximum cpus supported by KVM (%d)",
2670 nc->name, nc->num, hard_vcpus_limit);
2671 exit(1);
2672 }
2673 }
2674 nc++;
2675 }
2676
2677 missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
2678 if (!missing_cap) {
2679 missing_cap =
2680 kvm_check_extension_list(s, kvm_arch_required_capabilities);
2681 }
2682 if (missing_cap) {
2683 ret = -EINVAL;
2684 error_report("kvm does not support %s", missing_cap->name);
2685 error_printf("%s", upgrade_note);
2686 goto err;
2687 }
2688
2689 s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
2690 s->coalesced_pio = s->coalesced_mmio &&
2691 kvm_check_extension(s, KVM_CAP_COALESCED_PIO);
2692
2693 ret = kvm_setup_dirty_ring(s);
2694 if (ret < 0) {
2695 goto err;
2696 }
2697
2698 #ifdef KVM_CAP_VCPU_EVENTS
2699 s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
2700 #endif
2701 s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
2702
2703 s->irq_set_ioctl = KVM_IRQ_LINE;
2704 if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
2705 s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
2706 }
2707
2708 kvm_readonly_mem_allowed =
2709 (kvm_vm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
2710
2711 kvm_resamplefds_allowed =
2712 (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
2713
2714 kvm_vm_attributes_allowed =
2715 (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);
2716
2717 #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
2718 kvm_has_guest_debug =
2719 (kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG) > 0);
2720 #endif
2721
2722 kvm_sstep_flags = 0;
2723 if (kvm_has_guest_debug) {
2724 kvm_sstep_flags = SSTEP_ENABLE;
2725
2726 #if defined TARGET_KVM_HAVE_GUEST_DEBUG
2727 int guest_debug_flags =
2728 kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG2);
2729
2730 if (guest_debug_flags & KVM_GUESTDBG_BLOCKIRQ) {
2731 kvm_sstep_flags |= SSTEP_NOIRQ;
2732 }
2733 #endif
2734 }
2735
2736 kvm_state = s;
2737
2738 ret = kvm_arch_init(ms, s);
2739 if (ret < 0) {
2740 goto err;
2741 }
2742
2743 kvm_supported_memory_attributes = kvm_vm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES);
2744 kvm_guest_memfd_supported =
2745 kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) &&
2746 kvm_check_extension(s, KVM_CAP_USER_MEMORY2) &&
2747 (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE);
2748
2749 if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
2750 s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
2751 }
2752
2753 qemu_register_reset(kvm_unpoison_all, NULL);
2754
2755 if (s->kernel_irqchip_allowed) {
2756 kvm_irqchip_create(s);
2757 }
2758
2759 s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
2760 s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
2761 s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region;
2762 s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region;
2763
2764 kvm_memory_listener_register(s, &s->memory_listener,
2765 &address_space_memory, 0, "kvm-memory");
2766 memory_listener_register(&kvm_io_listener,
2767 &address_space_io);
2768
2769 s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
2770 if (!s->sync_mmu) {
2771 ret = ram_block_discard_disable(true);
2772 assert(!ret);
2773 }
2774
2775 if (s->kvm_dirty_ring_size) {
2776 kvm_dirty_ring_reaper_init(s);
2777 }
2778
2779 if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) {
2780 add_stats_callbacks(STATS_PROVIDER_KVM, query_stats_cb,
2781 query_stats_schemas_cb);
2782 }
2783
2784 return 0;
2785
2786 err:
2787 assert(ret < 0);
2788 if (s->vmfd >= 0) {
2789 close(s->vmfd);
2790 }
2791 if (s->fd != -1) {
2792 close(s->fd);
2793 }
2794 g_free(s->as);
2795 g_free(s->memory_listener.slots);
2796
2797 return ret;
2798 }
2799
2800 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
2801 {
2802 s->sigmask_len = sigmask_len;
2803 }
2804
2805 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
2806 int size, uint32_t count)
2807 {
2808 int i;
2809 uint8_t *ptr = data;
2810
2811 for (i = 0; i < count; i++) {
2812 address_space_rw(&address_space_io, port, attrs,
2813 ptr, size,
2814 direction == KVM_EXIT_IO_OUT);
2815 ptr += size;
2816 }
2817 }
2818
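/*
 * Report a KVM_EXIT_INTERNAL_ERROR.  Returns EXCP_INTERRUPT if the
 * architecture allows resuming after an emulation failure, -1 otherwise.
 */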
2819 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
2820 {
2821 int i;
2822
2823 fprintf(stderr, "KVM internal error. Suberror: %d\n",
2824 run->internal.suberror);
2825
2826 for (i = 0; i < run->internal.ndata; ++i) {
2827 fprintf(stderr, "extra data[%d]: 0x%016"PRIx64"\n",
2828 i, (uint64_t)run->internal.data[i]);
2829 }
2830 if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
2831 fprintf(stderr, "emulation failure\n");
2832 if (!kvm_arch_stop_on_emulation_error(cpu)) {
2833 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2834 return EXCP_INTERRUPT;
2835 }
2836 }
2837 /* FIXME: Should trigger a QMP message to let management know
2838 * something went wrong.
2839 */
2840 return -1;
2841 }
2842
2843 void kvm_flush_coalesced_mmio_buffer(void)
2844 {
2845 KVMState *s = kvm_state;
2846
2847 if (!s || s->coalesced_flush_in_progress) {
2848 return;
2849 }
2850
2851 s->coalesced_flush_in_progress = true;
2852
2853 if (s->coalesced_mmio_ring) {
2854 struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
2855 while (ring->first != ring->last) {
2856 struct kvm_coalesced_mmio *ent;
2857
2858 ent = &ring->coalesced_mmio[ring->first];
2859
2860 if (ent->pio == 1) {
2861 address_space_write(&address_space_io, ent->phys_addr,
2862 MEMTXATTRS_UNSPECIFIED, ent->data,
2863 ent->len);
2864 } else {
2865 cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
2866 }
2867 smp_wmb();
2868 ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
2869 }
2870 }
2871
2872 s->coalesced_flush_in_progress = false;
2873 }
2874
2875 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2876 {
2877 if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
2878 Error *err = NULL;
2879 int ret = kvm_arch_get_registers(cpu, &err);
2880 if (ret) {
2881 if (err) {
2882 error_reportf_err(err, "Failed to synchronize CPU state: ");
2883 } else {
2884 error_report("Failed to get registers: %s", strerror(-ret));
2885 }
2886
2887 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2888 vm_stop(RUN_STATE_INTERNAL_ERROR);
2889 }
2890
2891 cpu->vcpu_dirty = true;
2892 }
2893 }
2894
2895 void kvm_cpu_synchronize_state(CPUState *cpu)
2896 {
2897 if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
2898 run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
2899 }
2900 }
2901
2902 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
2903 {
2904 Error *err = NULL;
2905 int ret = kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE, &err);
2906 if (ret) {
2907 if (err) {
2908 error_reportf_err(err, "Restoring registers after reset: ");
2909 } else {
2910 error_report("Failed to put registers after reset: %s",
2911 strerror(-ret));
2912 }
2913 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2914 vm_stop(RUN_STATE_INTERNAL_ERROR);
2915 }
2916
2917 cpu->vcpu_dirty = false;
2918 }
2919
2920 void kvm_cpu_synchronize_post_reset(CPUState *cpu)
2921 {
2922 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
2923
2924 if (cpu == first_cpu) {
2925 kvm_reset_parked_vcpus(kvm_state);
2926 }
2927 }
2928
2929 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
2930 {
2931 Error *err = NULL;
2932 int ret = kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE, &err);
2933 if (ret) {
2934 if (err) {
2935 error_reportf_err(err, "Putting registers after init: ");
2936 } else {
2937 error_report("Failed to put registers after init: %s",
2938 strerror(-ret));
2939 }
2940 exit(1);
2941 }
2942
2943 cpu->vcpu_dirty = false;
2944 }
2945
2946 void kvm_cpu_synchronize_post_init(CPUState *cpu)
2947 {
2948 if (!kvm_state->guest_state_protected) {
2949 /*
2950 * This runs before the machine_init_done notifiers, and is the last
2951 * opportunity to synchronize the state of confidential guests.
2952 */
2953 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
2954 }
2955 }
2956
2957 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
2958 {
2959 cpu->vcpu_dirty = true;
2960 }
2961
2962 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
2963 {
2964 run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
2965 }
2966
2967 #ifdef KVM_HAVE_MCE_INJECTION
2968 static __thread void *pending_sigbus_addr;
2969 static __thread int pending_sigbus_code;
2970 static __thread bool have_sigbus_pending;
2971 #endif
2972
2973 static void kvm_cpu_kick(CPUState *cpu)
2974 {
2975 qatomic_set(&cpu->kvm_run->immediate_exit, 1);
2976 }
2977
2978 static void kvm_cpu_kick_self(void)
2979 {
2980 if (kvm_immediate_exit) {
2981 kvm_cpu_kick(current_cpu);
2982 } else {
2983 qemu_cpu_kick_self();
2984 }
2985 }
2986
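/*
 * Drain any pending SIG_IPI so a kick cannot be lost across KVM_RUN.
 * With KVM_CAP_IMMEDIATE_EXIT the kick is delivered via immediate_exit
 * instead, so only that flag needs to be cleared here.
 */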
2987 static void kvm_eat_signals(CPUState *cpu)
2988 {
2989 struct timespec ts = { 0, 0 };
2990 siginfo_t siginfo;
2991 sigset_t waitset;
2992 sigset_t chkset;
2993 int r;
2994
2995 if (kvm_immediate_exit) {
2996 qatomic_set(&cpu->kvm_run->immediate_exit, 0);
2997 /* Write kvm_run->immediate_exit before the cpu->exit_request
2998 * write in kvm_cpu_exec.
2999 */
3000 smp_wmb();
3001 return;
3002 }
3003
3004 sigemptyset(&waitset);
3005 sigaddset(&waitset, SIG_IPI);
3006
3007 do {
3008 r = sigtimedwait(&waitset, &siginfo, &ts);
3009 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
3010 perror("sigtimedwait");
3011 exit(1);
3012 }
3013
3014 r = sigpending(&chkset);
3015 if (r == -1) {
3016 perror("sigpending");
3017 exit(1);
3018 }
3019 } while (sigismember(&chkset, SIG_IPI));
3020 }
3021
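/*
 * Convert a guest_memfd-backed range between private and shared by
 * updating the KVM memory attributes, then discard the backing pages of
 * the now-unused side.  Returns 0 on success or a negative errno.
 */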
3022 int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
3023 {
3024 MemoryRegionSection section;
3025 ram_addr_t offset;
3026 MemoryRegion *mr;
3027 RAMBlock *rb;
3028 void *addr;
3029 int ret = -EINVAL;
3030
3031 trace_kvm_convert_memory(start, size, to_private ? "shared_to_private" : "private_to_shared");
3032
3033 if (!QEMU_PTR_IS_ALIGNED(start, qemu_real_host_page_size()) ||
3034 !QEMU_PTR_IS_ALIGNED(size, qemu_real_host_page_size())) {
3035 return ret;
3036 }
3037
3038 if (!size) {
3039 return ret;
3040 }
3041
3042 section = memory_region_find(get_system_memory(), start, size);
3043 mr = section.mr;
3044 if (!mr) {
3045 /*
3046 * Ignore converting non-assigned region to shared.
3047 *
3048 * TDX requires vMMIO region to be shared to inject #VE to guest.
3049 * OVMF conservatively issues MapGPA(shared) on the 32bit PCI MMIO region,
3050 * and vIO-APIC 0xFEC00000 4K page.
3051 * OVMF assigns 32bit PCI MMIO region to
3052 * [top of low memory: typically 2GB=0xC000000, 0xFC00000)
3053 */
3054 if (!to_private) {
3055 return 0;
3056 }
3057 return ret;
3058 }
3059
3060 if (!memory_region_has_guest_memfd(mr)) {
3061 /*
3062 * Because the vMMIO region must be shared, the guest TD may explicitly
3063 * convert the vMMIO region to shared. Don't complain about such a case.
3064 * See memory_region_type() for checking whether the region is an MMIO region.
3065 */
3066 if (!to_private &&
3067 !memory_region_is_ram(mr) &&
3068 !memory_region_is_ram_device(mr) &&
3069 !memory_region_is_rom(mr) &&
3070 !memory_region_is_romd(mr)) {
3071 ret = 0;
3072 } else {
3073 error_report("Convert non guest_memfd backed memory region "
3074 "(0x%"HWADDR_PRIx" ,+ 0x%"HWADDR_PRIx") to %s",
3075 start, size, to_private ? "private" : "shared");
3076 }
3077 goto out_unref;
3078 }
3079
3080 if (to_private) {
3081 ret = kvm_set_memory_attributes_private(start, size);
3082 } else {
3083 ret = kvm_set_memory_attributes_shared(start, size);
3084 }
3085 if (ret) {
3086 goto out_unref;
3087 }
3088
3089 addr = memory_region_get_ram_ptr(mr) + section.offset_within_region;
3090 rb = qemu_ram_block_from_host(addr, false, &offset);
3091
3092 if (to_private) {
3093 if (rb->page_size != qemu_real_host_page_size()) {
3094 /*
3095 * shared memory is backed by hugetlb, which is supposed to be
3096 * pre-allocated and doesn't need to be discarded
3097 */
3098 goto out_unref;
3099 }
3100 ret = ram_block_discard_range(rb, offset, size);
3101 } else {
3102 ret = ram_block_discard_guest_memfd_range(rb, offset, size);
3103 }
3104
3105 out_unref:
3106 memory_region_unref(mr);
3107 return ret;
3108 }
3109
3110 int kvm_cpu_exec(CPUState *cpu)
3111 {
3112 struct kvm_run *run = cpu->kvm_run;
3113 int ret, run_ret;
3114
3115 trace_kvm_cpu_exec();
3116
3117 if (kvm_arch_process_async_events(cpu)) {
3118 qatomic_set(&cpu->exit_request, 0);
3119 return EXCP_HLT;
3120 }
3121
3122 bql_unlock();
3123 cpu_exec_start(cpu);
3124
3125 do {
3126 MemTxAttrs attrs;
3127
3128 if (cpu->vcpu_dirty) {
3129 Error *err = NULL;
3130 ret = kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE, &err);
3131 if (ret) {
3132 if (err) {
3133 error_reportf_err(err, "Putting runtime registers: ");
3134 } else {
3135 error_report("Failed to put runtime registers: %s",
3136 strerror(-ret));
3137 }
3138 ret = -1;
3139 break;
3140 }
3141
3142 cpu->vcpu_dirty = false;
3143 }
3144
3145 kvm_arch_pre_run(cpu, run);
3146 if (qatomic_read(&cpu->exit_request)) {
3147 trace_kvm_interrupt_exit_request();
3148 /*
3149 * KVM requires us to reenter the kernel after IO exits to complete
3150 * instruction emulation. This self-signal will ensure that we
3151 * leave ASAP again.
3152 */
3153 kvm_cpu_kick_self();
3154 }
3155
3156 /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
3157 * Matching barrier in kvm_eat_signals.
3158 */
3159 smp_rmb();
3160
3161 run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
3162
3163 attrs = kvm_arch_post_run(cpu, run);
3164
3165 #ifdef KVM_HAVE_MCE_INJECTION
3166 if (unlikely(have_sigbus_pending)) {
3167 bql_lock();
3168 kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
3169 pending_sigbus_addr);
3170 have_sigbus_pending = false;
3171 bql_unlock();
3172 }
3173 #endif
3174
3175 if (run_ret < 0) {
3176 if (run_ret == -EINTR || run_ret == -EAGAIN) {
3177 trace_kvm_io_window_exit();
3178 kvm_eat_signals(cpu);
3179 ret = EXCP_INTERRUPT;
3180 break;
3181 }
3182 if (!(run_ret == -EFAULT && run->exit_reason == KVM_EXIT_MEMORY_FAULT)) {
3183 fprintf(stderr, "error: kvm run failed %s\n",
3184 strerror(-run_ret));
3185 #ifdef TARGET_PPC
3186 if (run_ret == -EBUSY) {
3187 fprintf(stderr,
3188 "This is probably because your SMT is enabled.\n"
3189 "VCPU can only run on primary threads with all "
3190 "secondary threads offline.\n");
3191 }
3192 #endif
3193 ret = -1;
3194 break;
3195 }
3196 }
3197
3198 trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
3199 switch (run->exit_reason) {
3200 case KVM_EXIT_IO:
3201 /* Called outside BQL */
3202 kvm_handle_io(run->io.port, attrs,
3203 (uint8_t *)run + run->io.data_offset,
3204 run->io.direction,
3205 run->io.size,
3206 run->io.count);
3207 ret = 0;
3208 break;
3209 case KVM_EXIT_MMIO:
3210 /* Called outside BQL */
3211 address_space_rw(&address_space_memory,
3212 run->mmio.phys_addr, attrs,
3213 run->mmio.data,
3214 run->mmio.len,
3215 run->mmio.is_write);
3216 ret = 0;
3217 break;
3218 case KVM_EXIT_IRQ_WINDOW_OPEN:
3219 ret = EXCP_INTERRUPT;
3220 break;
3221 case KVM_EXIT_SHUTDOWN:
3222 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
3223 ret = EXCP_INTERRUPT;
3224 break;
3225 case KVM_EXIT_UNKNOWN:
3226 fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
3227 (uint64_t)run->hw.hardware_exit_reason);
3228 ret = -1;
3229 break;
3230 case KVM_EXIT_INTERNAL_ERROR:
3231 ret = kvm_handle_internal_error(cpu, run);
3232 break;
3233 case KVM_EXIT_DIRTY_RING_FULL:
3234 /*
3235 * We shouldn't continue if the dirty ring of this vcpu is
3236 * still full. Got kicked by KVM_RESET_DIRTY_RINGS.
3237 */
3238 trace_kvm_dirty_ring_full(cpu->cpu_index);
3239 bql_lock();
3240 /*
3241 * We throttle the vCPU by making it sleep once it exits the kernel
3242 * due to a full dirty ring. In the dirtylimit scenario, reaping
3243 * all vCPUs after a single vCPU's dirty ring gets full would miss
3244 * that sleep, so only reap the vCPU whose ring is full.
3245 */
3246 if (dirtylimit_in_service()) {
3247 kvm_dirty_ring_reap(kvm_state, cpu);
3248 } else {
3249 kvm_dirty_ring_reap(kvm_state, NULL);
3250 }
3251 bql_unlock();
3252 dirtylimit_vcpu_execute(cpu);
3253 ret = 0;
3254 break;
3255 case KVM_EXIT_SYSTEM_EVENT:
3256 trace_kvm_run_exit_system_event(cpu->cpu_index, run->system_event.type);
3257 switch (run->system_event.type) {
3258 case KVM_SYSTEM_EVENT_SHUTDOWN:
3259 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
3260 ret = EXCP_INTERRUPT;
3261 break;
3262 case KVM_SYSTEM_EVENT_RESET:
3263 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
3264 ret = EXCP_INTERRUPT;
3265 break;
3266 case KVM_SYSTEM_EVENT_CRASH:
3267 kvm_cpu_synchronize_state(cpu);
3268 bql_lock();
3269 qemu_system_guest_panicked(cpu_get_crash_info(cpu));
3270 bql_unlock();
3271 ret = 0;
3272 break;
3273 default:
3274 ret = kvm_arch_handle_exit(cpu, run);
3275 break;
3276 }
3277 break;
3278 case KVM_EXIT_MEMORY_FAULT:
3279 trace_kvm_memory_fault(run->memory_fault.gpa,
3280 run->memory_fault.size,
3281 run->memory_fault.flags);
3282 if (run->memory_fault.flags & ~KVM_MEMORY_EXIT_FLAG_PRIVATE) {
3283 error_report("KVM_EXIT_MEMORY_FAULT: Unknown flag 0x%" PRIx64,
3284 (uint64_t)run->memory_fault.flags);
3285 ret = -1;
3286 break;
3287 }
3288 ret = kvm_convert_memory(run->memory_fault.gpa, run->memory_fault.size,
3289 run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE);
3290 break;
3291 default:
3292 ret = kvm_arch_handle_exit(cpu, run);
3293 break;
3294 }
3295 } while (ret == 0);
3296
3297 cpu_exec_end(cpu);
3298 bql_lock();
3299
3300 if (ret < 0) {
3301 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
3302 vm_stop(RUN_STATE_INTERNAL_ERROR);
3303 }
3304
3305 qatomic_set(&cpu->exit_request, 0);
3306 return ret;
3307 }
3308
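/*
 * Thin wrappers around ioctl() on the KVM system, VM, vCPU and device
 * file descriptors; they return the ioctl result, or -errno on failure.
 */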
3309 int kvm_ioctl(KVMState *s, unsigned long type, ...)
3310 {
3311 int ret;
3312 void *arg;
3313 va_list ap;
3314
3315 va_start(ap, type);
3316 arg = va_arg(ap, void *);
3317 va_end(ap);
3318
3319 trace_kvm_ioctl(type, arg);
3320 ret = ioctl(s->fd, type, arg);
3321 if (ret == -1) {
3322 ret = -errno;
3323 }
3324 return ret;
3325 }
3326
3327 int kvm_vm_ioctl(KVMState *s, unsigned long type, ...)
3328 {
3329 int ret;
3330 void *arg;
3331 va_list ap;
3332
3333 va_start(ap, type);
3334 arg = va_arg(ap, void *);
3335 va_end(ap);
3336
3337 trace_kvm_vm_ioctl(type, arg);
3338 accel_ioctl_begin();
3339 ret = ioctl(s->vmfd, type, arg);
3340 accel_ioctl_end();
3341 if (ret == -1) {
3342 ret = -errno;
3343 }
3344 return ret;
3345 }
3346
3347 int kvm_vcpu_ioctl(CPUState *cpu, unsigned long type, ...)
3348 {
3349 int ret;
3350 void *arg;
3351 va_list ap;
3352
3353 va_start(ap, type);
3354 arg = va_arg(ap, void *);
3355 va_end(ap);
3356
3357 trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
3358 accel_cpu_ioctl_begin(cpu);
3359 ret = ioctl(cpu->kvm_fd, type, arg);
3360 accel_cpu_ioctl_end(cpu);
3361 if (ret == -1) {
3362 ret = -errno;
3363 }
3364 return ret;
3365 }
3366
3367 int kvm_device_ioctl(int fd, unsigned long type, ...)
3368 {
3369 int ret;
3370 void *arg;
3371 va_list ap;
3372
3373 va_start(ap, type);
3374 arg = va_arg(ap, void *);
3375 va_end(ap);
3376
3377 trace_kvm_device_ioctl(fd, type, arg);
3378 accel_ioctl_begin();
3379 ret = ioctl(fd, type, arg);
3380 accel_ioctl_end();
3381 if (ret == -1) {
3382 ret = -errno;
3383 }
3384 return ret;
3385 }
3386
3387 int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
3388 {
3389 int ret;
3390 struct kvm_device_attr attribute = {
3391 .group = group,
3392 .attr = attr,
3393 };
3394
3395 if (!kvm_vm_attributes_allowed) {
3396 return 0;
3397 }
3398
3399 ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
3400 /* kvm returns 0 on success for HAS_DEVICE_ATTR */
3401 return ret ? 0 : 1;
3402 }
3403
3404 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
3405 {
3406 struct kvm_device_attr attribute = {
3407 .group = group,
3408 .attr = attr,
3409 .flags = 0,
3410 };
3411
3412 return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
3413 }
3414
3415 int kvm_device_access(int fd, int group, uint64_t attr,
3416 void *val, bool write, Error **errp)
3417 {
3418 struct kvm_device_attr kvmattr;
3419 int err;
3420
3421 kvmattr.flags = 0;
3422 kvmattr.group = group;
3423 kvmattr.attr = attr;
3424 kvmattr.addr = (uintptr_t)val;
3425
3426 err = kvm_device_ioctl(fd,
3427 write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
3428 &kvmattr);
3429 if (err < 0) {
3430 error_setg_errno(errp, -err,
3431 "KVM_%s_DEVICE_ATTR failed: Group %d "
3432 "attr 0x%016" PRIx64,
3433 write ? "SET" : "GET", group, attr);
3434 }
3435 return err;
3436 }
3437
3438 bool kvm_has_sync_mmu(void)
3439 {
3440 return kvm_state->sync_mmu;
3441 }
3442
3443 int kvm_has_vcpu_events(void)
3444 {
3445 return kvm_state->vcpu_events;
3446 }
3447
3448 int kvm_max_nested_state_length(void)
3449 {
3450 return kvm_state->max_nested_state_len;
3451 }
3452
3453 int kvm_has_gsi_routing(void)
3454 {
3455 #ifdef KVM_CAP_IRQ_ROUTING
3456 return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
3457 #else
3458 return false;
3459 #endif
3460 }
3461
3462 bool kvm_arm_supports_user_irq(void)
3463 {
3464 return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
3465 }
3466
3467 #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
3468 struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, vaddr pc)
3469 {
3470 struct kvm_sw_breakpoint *bp;
3471
3472 QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
3473 if (bp->pc == pc) {
3474 return bp;
3475 }
3476 }
3477 return NULL;
3478 }
3479
3480 int kvm_sw_breakpoints_active(CPUState *cpu)
3481 {
3482 return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
3483 }
3484
3485 struct kvm_set_guest_debug_data {
3486 struct kvm_guest_debug dbg;
3487 int err;
3488 };
3489
3490 static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
3491 {
3492 struct kvm_set_guest_debug_data *dbg_data =
3493 (struct kvm_set_guest_debug_data *) data.host_ptr;
3494
3495 dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
3496 &dbg_data->dbg);
3497 }
3498
3499 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
3500 {
3501 struct kvm_set_guest_debug_data data;
3502
3503 data.dbg.control = reinject_trap;
3504
3505 if (cpu->singlestep_enabled) {
3506 data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
3507
3508 if (cpu->singlestep_enabled & SSTEP_NOIRQ) {
3509 data.dbg.control |= KVM_GUESTDBG_BLOCKIRQ;
3510 }
3511 }
3512 kvm_arch_update_guest_debug(cpu, &data.dbg);
3513
3514 run_on_cpu(cpu, kvm_invoke_set_guest_debug,
3515 RUN_ON_CPU_HOST_PTR(&data));
3516 return data.err;
3517 }
3518
3519 bool kvm_supports_guest_debug(void)
3520 {
3521 /* probed during kvm_init() */
3522 return kvm_has_guest_debug;
3523 }
3524
3525 int kvm_insert_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
3526 {
3527 struct kvm_sw_breakpoint *bp;
3528 int err;
3529
3530 if (type == GDB_BREAKPOINT_SW) {
3531 bp = kvm_find_sw_breakpoint(cpu, addr);
3532 if (bp) {
3533 bp->use_count++;
3534 return 0;
3535 }
3536
3537 bp = g_new(struct kvm_sw_breakpoint, 1);
3538 bp->pc = addr;
3539 bp->use_count = 1;
3540 err = kvm_arch_insert_sw_breakpoint(cpu, bp);
3541 if (err) {
3542 g_free(bp);
3543 return err;
3544 }
3545
3546 QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
3547 } else {
3548 err = kvm_arch_insert_hw_breakpoint(addr, len, type);
3549 if (err) {
3550 return err;
3551 }
3552 }
3553
3554 CPU_FOREACH(cpu) {
3555 err = kvm_update_guest_debug(cpu, 0);
3556 if (err) {
3557 return err;
3558 }
3559 }
3560 return 0;
3561 }
3562
3563 int kvm_remove_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
3564 {
3565 struct kvm_sw_breakpoint *bp;
3566 int err;
3567
3568 if (type == GDB_BREAKPOINT_SW) {
3569 bp = kvm_find_sw_breakpoint(cpu, addr);
3570 if (!bp) {
3571 return -ENOENT;
3572 }
3573
3574 if (bp->use_count > 1) {
3575 bp->use_count--;
3576 return 0;
3577 }
3578
3579 err = kvm_arch_remove_sw_breakpoint(cpu, bp);
3580 if (err) {
3581 return err;
3582 }
3583
3584 QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
3585 g_free(bp);
3586 } else {
3587 err = kvm_arch_remove_hw_breakpoint(addr, len, type);
3588 if (err) {
3589 return err;
3590 }
3591 }
3592
3593 CPU_FOREACH(cpu) {
3594 err = kvm_update_guest_debug(cpu, 0);
3595 if (err) {
3596 return err;
3597 }
3598 }
3599 return 0;
3600 }
3601
3602 void kvm_remove_all_breakpoints(CPUState *cpu)
3603 {
3604 struct kvm_sw_breakpoint *bp, *next;
3605 KVMState *s = cpu->kvm_state;
3606 CPUState *tmpcpu;
3607
3608 QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
3609 if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
3610 /* Try harder to find a CPU that currently sees the breakpoint. */
3611 CPU_FOREACH(tmpcpu) {
3612 if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
3613 break;
3614 }
3615 }
3616 }
3617 QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
3618 g_free(bp);
3619 }
3620 kvm_arch_remove_all_hw_breakpoints();
3621
3622 CPU_FOREACH(cpu) {
3623 kvm_update_guest_debug(cpu, 0);
3624 }
3625 }
3626
3627 #endif /* TARGET_KVM_HAVE_GUEST_DEBUG */
3628
3629 static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
3630 {
3631 KVMState *s = kvm_state;
3632 struct kvm_signal_mask *sigmask;
3633 int r;
3634
3635 sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
3636
3637 sigmask->len = s->sigmask_len;
3638 memcpy(sigmask->sigset, sigset, sizeof(*sigset));
3639 r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
3640 g_free(sigmask);
3641
3642 return r;
3643 }
3644
3645 static void kvm_ipi_signal(int sig)
3646 {
3647 if (current_cpu) {
3648 assert(kvm_immediate_exit);
3649 kvm_cpu_kick(current_cpu);
3650 }
3651 }
3652
3653 void kvm_init_cpu_signals(CPUState *cpu)
3654 {
3655 int r;
3656 sigset_t set;
3657 struct sigaction sigact;
3658
3659 memset(&sigact, 0, sizeof(sigact));
3660 sigact.sa_handler = kvm_ipi_signal;
3661 sigaction(SIG_IPI, &sigact, NULL);
3662
3663 pthread_sigmask(SIG_BLOCK, NULL, &set);
3664 #if defined KVM_HAVE_MCE_INJECTION
3665 sigdelset(&set, SIGBUS);
3666 pthread_sigmask(SIG_SETMASK, &set, NULL);
3667 #endif
3668 sigdelset(&set, SIG_IPI);
3669 if (kvm_immediate_exit) {
3670 r = pthread_sigmask(SIG_SETMASK, &set, NULL);
3671 } else {
3672 r = kvm_set_signal_mask(cpu, &set);
3673 }
3674 if (r) {
3675 fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
3676 exit(1);
3677 }
3678 }
3679
3680 /* Called asynchronously in VCPU thread. */
3681 int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
3682 {
3683 #ifdef KVM_HAVE_MCE_INJECTION
3684 if (have_sigbus_pending) {
3685 return 1;
3686 }
3687 have_sigbus_pending = true;
3688 pending_sigbus_addr = addr;
3689 pending_sigbus_code = code;
3690 qatomic_set(&cpu->exit_request, 1);
3691 return 0;
3692 #else
3693 return 1;
3694 #endif
3695 }
3696
3697 /* Called synchronously (via signalfd) in main thread. */
3698 int kvm_on_sigbus(int code, void *addr)
3699 {
3700 #ifdef KVM_HAVE_MCE_INJECTION
3701 /* An action-required MCE kills the process if SIGBUS is blocked. Because
3702 * that's what happens in the I/O thread, where we handle MCEs via signalfd,
3703 * we can only get action-optional MCEs here.
3704 */
3705 assert(code != BUS_MCEERR_AR);
3706 kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
3707 return 0;
3708 #else
3709 return 1;
3710 #endif
3711 }
3712
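/*
 * Create an in-kernel device of the given type, or merely test for
 * support when @test is true.  Returns the new device fd, 0 for a
 * successful test, or a negative errno.
 */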
3713 int kvm_create_device(KVMState *s, uint64_t type, bool test)
3714 {
3715 int ret;
3716 struct kvm_create_device create_dev;
3717
3718 create_dev.type = type;
3719 create_dev.fd = -1;
3720 create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
3721
3722 if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
3723 return -ENOTSUP;
3724 }
3725
3726 ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
3727 if (ret) {
3728 return ret;
3729 }
3730
3731 return test ? 0 : create_dev.fd;
3732 }
3733
3734 bool kvm_device_supported(int vmfd, uint64_t type)
3735 {
3736 struct kvm_create_device create_dev = {
3737 .type = type,
3738 .fd = -1,
3739 .flags = KVM_CREATE_DEVICE_TEST,
3740 };
3741
3742 if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
3743 return false;
3744 }
3745
3746 return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
3747 }
3748
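/*
 * Accessors for the KVM_SET_ONE_REG/KVM_GET_ONE_REG interface; failures
 * are traced and the ioctl result is returned to the caller.
 */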
3749 int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
3750 {
3751 struct kvm_one_reg reg;
3752 int r;
3753
3754 reg.id = id;
3755 reg.addr = (uintptr_t) source;
3756 r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
3757 if (r) {
3758 trace_kvm_failed_reg_set(id, strerror(-r));
3759 }
3760 return r;
3761 }
3762
3763 int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
3764 {
3765 struct kvm_one_reg reg;
3766 int r;
3767
3768 reg.id = id;
3769 reg.addr = (uintptr_t) target;
3770 r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
3771 if (r) {
3772 trace_kvm_failed_reg_get(id, strerror(-r));
3773 }
3774 return r;
3775 }
3776
3777 static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
3778 hwaddr start_addr, hwaddr size)
3779 {
3780 KVMState *kvm = KVM_STATE(ms->accelerator);
3781 int i;
3782
3783 for (i = 0; i < kvm->nr_as; ++i) {
3784 if (kvm->as[i].as == as && kvm->as[i].ml) {
3785 size = MIN(kvm_max_slot_size, size);
3786 return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
3787 start_addr, size);
3788 }
3789 }
3790
3791 return false;
3792 }
3793
3794 static void kvm_get_kvm_shadow_mem(Object *obj, Visitor *v,
3795 const char *name, void *opaque,
3796 Error **errp)
3797 {
3798 KVMState *s = KVM_STATE(obj);
3799 int64_t value = s->kvm_shadow_mem;
3800
3801 visit_type_int(v, name, &value, errp);
3802 }
3803
3804 static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v,
3805 const char *name, void *opaque,
3806 Error **errp)
3807 {
3808 KVMState *s = KVM_STATE(obj);
3809 int64_t value;
3810
3811 if (s->fd != -1) {
3812 error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3813 return;
3814 }
3815
3816 if (!visit_type_int(v, name, &value, errp)) {
3817 return;
3818 }
3819
3820 s->kvm_shadow_mem = value;
3821 }
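
/*
 * Setter for the "kernel-irqchip" accelerator property.  The on/off/split
 * tristate is mapped onto the allowed/required/split flags that are later
 * queried through kvm_kernel_irqchip_*() below.  Typical command-line use
 * (shown for illustration):
 *
 *     -accel kvm,kernel-irqchip=split
 *
 * which requests the split irqchip mode on targets that support it.
 */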
static void kvm_set_kernel_irqchip(Object *obj, Visitor *v,
                                   const char *name, void *opaque,
                                   Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    OnOffSplit mode;

    if (s->fd != -1) {
        error_setg(errp, "Cannot set properties after the accelerator has been initialized");
        return;
    }

    if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
        return;
    }
    switch (mode) {
    case ON_OFF_SPLIT_ON:
        s->kernel_irqchip_allowed = true;
        s->kernel_irqchip_required = true;
        s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
        break;
    case ON_OFF_SPLIT_OFF:
        s->kernel_irqchip_allowed = false;
        s->kernel_irqchip_required = false;
        s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
        break;
    case ON_OFF_SPLIT_SPLIT:
        s->kernel_irqchip_allowed = true;
        s->kernel_irqchip_required = true;
        s->kernel_irqchip_split = ON_OFF_AUTO_ON;
        break;
    default:
        /* The value was checked in visit_type_OnOffSplit() above. If
         * we get here, then something is wrong in QEMU.
         */
        abort();
    }
}

bool kvm_kernel_irqchip_allowed(void)
{
    return kvm_state->kernel_irqchip_allowed;
}

bool kvm_kernel_irqchip_required(void)
{
    return kvm_state->kernel_irqchip_required;
}

bool kvm_kernel_irqchip_split(void)
{
    return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON;
}

static void kvm_get_dirty_ring_size(Object *obj, Visitor *v,
                                    const char *name, void *opaque,
                                    Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    uint32_t value = s->kvm_dirty_ring_size;

    visit_type_uint32(v, name, &value, errp);
}
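
/*
 * Setter for the "dirty-ring-size" accelerator property: the number of
 * entries in each vCPU's dirty ring, which must be a power of two; zero
 * (the default) keeps dirty tracking on the bitmap path.  For example
 * (illustrative command line):
 *
 *     -accel kvm,dirty-ring-size=4096
 */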
static void kvm_set_dirty_ring_size(Object *obj, Visitor *v,
                                    const char *name, void *opaque,
                                    Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    uint32_t value;

    if (s->fd != -1) {
        error_setg(errp, "Cannot set properties after the accelerator has been initialized");
        return;
    }

    if (!visit_type_uint32(v, name, &value, errp)) {
        return;
    }
    if (value & (value - 1)) {
        error_setg(errp, "dirty-ring-size must be a power of two.");
        return;
    }

    s->kvm_dirty_ring_size = value;
}

static char *kvm_get_device(Object *obj,
                            Error **errp G_GNUC_UNUSED)
{
    KVMState *s = KVM_STATE(obj);

    return g_strdup(s->device);
}

static void kvm_set_device(Object *obj,
                           const char *value,
                           Error **errp G_GNUC_UNUSED)
{
    KVMState *s = KVM_STATE(obj);

    g_free(s->device);
    s->device = g_strdup(value);
}

static void kvm_set_kvm_rapl(Object *obj, bool value, Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    s->msr_energy.enable = value;
}

static void kvm_set_kvm_rapl_socket_path(Object *obj,
                                         const char *str,
                                         Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    g_free(s->msr_energy.socket_path);
    s->msr_energy.socket_path = g_strdup(str);
}

static void kvm_accel_instance_init(Object *obj)
{
    KVMState *s = KVM_STATE(obj);

    s->fd = -1;
    s->vmfd = -1;
    s->kvm_shadow_mem = -1;
    s->kernel_irqchip_allowed = true;
    s->kernel_irqchip_split = ON_OFF_AUTO_AUTO;
    /* KVM dirty ring is by default off */
    s->kvm_dirty_ring_size = 0;
    s->kvm_dirty_ring_with_bitmap = false;
    s->kvm_eager_split_size = 0;
    s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN;
    s->notify_window = 0;
    s->xen_version = 0;
    s->xen_gnttab_max_frames = 64;
    s->xen_evtchn_max_pirq = 256;
    s->device = NULL;
    s->msr_energy.enable = false;
}

/**
 * kvm_gdbstub_sstep_flags():
 *
 * Returns: SSTEP_* flags that KVM supports for guest debug. The
 * support is probed during kvm_init().
 */
static int kvm_gdbstub_sstep_flags(void)
{
    return kvm_sstep_flags;
}

static void kvm_accel_class_init(ObjectClass *oc, const void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "KVM";
    ac->init_machine = kvm_init;
    ac->has_memory = kvm_accel_has_memory;
    ac->allowed = &kvm_allowed;
    ac->gdbstub_supported_sstep_flags = kvm_gdbstub_sstep_flags;

    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
                              NULL, kvm_set_kernel_irqchip,
                              NULL, NULL);
    object_class_property_set_description(oc, "kernel-irqchip",
        "Configure KVM in-kernel irqchip");

    object_class_property_add(oc, "kvm-shadow-mem", "int",
                              kvm_get_kvm_shadow_mem, kvm_set_kvm_shadow_mem,
                              NULL, NULL);
    object_class_property_set_description(oc, "kvm-shadow-mem",
        "KVM shadow MMU size");

    object_class_property_add(oc, "dirty-ring-size", "uint32",
                              kvm_get_dirty_ring_size, kvm_set_dirty_ring_size,
                              NULL, NULL);
    object_class_property_set_description(oc, "dirty-ring-size",
        "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)");

    object_class_property_add_str(oc, "device", kvm_get_device, kvm_set_device);
    object_class_property_set_description(oc, "device",
        "Path to the device node to use (default: /dev/kvm)");

    object_class_property_add_bool(oc, "rapl",
                                   NULL,
                                   kvm_set_kvm_rapl);
    object_class_property_set_description(oc, "rapl",
        "Allow energy-related MSRs for the RAPL interface in the guest");

    object_class_property_add_str(oc, "rapl-helper-socket", NULL,
                                  kvm_set_kvm_rapl_socket_path);
    object_class_property_set_description(oc, "rapl-helper-socket",
        "Socket path for communicating with the Virtual MSR helper daemon");

    kvm_arch_accel_class_init(oc);
}

static const TypeInfo kvm_accel_type = {
    .name = TYPE_KVM_ACCEL,
    .parent = TYPE_ACCEL,
    .instance_init = kvm_accel_instance_init,
    .class_init = kvm_accel_class_init,
    .instance_size = sizeof(KVMState),
};

static void kvm_type_init(void)
{
    type_register_static(&kvm_accel_type);
}

type_init(kvm_type_init);

typedef struct StatsArgs {
    union StatsResultsType {
        StatsResultList **stats;
        StatsSchemaList **schema;
    } result;
    strList *names;
    Error **errp;
} StatsArgs;
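
/*
 * Convert one binary stats descriptor plus its data into a QAPI Stats
 * entry and prepend it to @stats_list.  Descriptors whose type, unit or
 * base is unknown to QEMU are skipped, so new kernel stat kinds do not
 * break the query.
 */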
static StatsList *add_kvmstat_entry(struct kvm_stats_desc *pdesc,
                                    uint64_t *stats_data,
                                    StatsList *stats_list,
                                    Error **errp)
{
    Stats *stats;
    uint64List *val_list = NULL;

    /* Only add stats that we understand. */
    switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
    case KVM_STATS_TYPE_CUMULATIVE:
    case KVM_STATS_TYPE_INSTANT:
    case KVM_STATS_TYPE_PEAK:
    case KVM_STATS_TYPE_LINEAR_HIST:
    case KVM_STATS_TYPE_LOG_HIST:
        break;
    default:
        return stats_list;
    }

    switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
    case KVM_STATS_UNIT_NONE:
    case KVM_STATS_UNIT_BYTES:
    case KVM_STATS_UNIT_CYCLES:
    case KVM_STATS_UNIT_SECONDS:
    case KVM_STATS_UNIT_BOOLEAN:
        break;
    default:
        return stats_list;
    }

    switch (pdesc->flags & KVM_STATS_BASE_MASK) {
    case KVM_STATS_BASE_POW10:
    case KVM_STATS_BASE_POW2:
        break;
    default:
        return stats_list;
    }

    /* Alloc and populate data list */
    stats = g_new0(Stats, 1);
    stats->name = g_strdup(pdesc->name);
    stats->value = g_new0(StatsValue, 1);

    if ((pdesc->flags & KVM_STATS_UNIT_MASK) == KVM_STATS_UNIT_BOOLEAN) {
        stats->value->u.boolean = *stats_data;
        stats->value->type = QTYPE_QBOOL;
    } else if (pdesc->size == 1) {
        stats->value->u.scalar = *stats_data;
        stats->value->type = QTYPE_QNUM;
    } else {
        int i;
        for (i = 0; i < pdesc->size; i++) {
            QAPI_LIST_PREPEND(val_list, stats_data[i]);
        }
        stats->value->u.list = val_list;
        stats->value->type = QTYPE_QLIST;
    }

    QAPI_LIST_PREPEND(stats_list, stats);
    return stats_list;
}

static StatsSchemaValueList *add_kvmschema_entry(struct kvm_stats_desc *pdesc,
                                                 StatsSchemaValueList *list,
                                                 Error **errp)
{
    StatsSchemaValueList *schema_entry = g_new0(StatsSchemaValueList, 1);
    schema_entry->value = g_new0(StatsSchemaValue, 1);

    switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
    case KVM_STATS_TYPE_CUMULATIVE:
        schema_entry->value->type = STATS_TYPE_CUMULATIVE;
        break;
    case KVM_STATS_TYPE_INSTANT:
        schema_entry->value->type = STATS_TYPE_INSTANT;
        break;
    case KVM_STATS_TYPE_PEAK:
        schema_entry->value->type = STATS_TYPE_PEAK;
        break;
    case KVM_STATS_TYPE_LINEAR_HIST:
        schema_entry->value->type = STATS_TYPE_LINEAR_HISTOGRAM;
        schema_entry->value->bucket_size = pdesc->bucket_size;
        schema_entry->value->has_bucket_size = true;
        break;
    case KVM_STATS_TYPE_LOG_HIST:
        schema_entry->value->type = STATS_TYPE_LOG2_HISTOGRAM;
        break;
    default:
        goto exit;
    }

    switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
    case KVM_STATS_UNIT_NONE:
        break;
    case KVM_STATS_UNIT_BOOLEAN:
        schema_entry->value->has_unit = true;
        schema_entry->value->unit = STATS_UNIT_BOOLEAN;
        break;
    case KVM_STATS_UNIT_BYTES:
        schema_entry->value->has_unit = true;
        schema_entry->value->unit = STATS_UNIT_BYTES;
        break;
    case KVM_STATS_UNIT_CYCLES:
        schema_entry->value->has_unit = true;
        schema_entry->value->unit = STATS_UNIT_CYCLES;
        break;
    case KVM_STATS_UNIT_SECONDS:
        schema_entry->value->has_unit = true;
        schema_entry->value->unit = STATS_UNIT_SECONDS;
        break;
    default:
        goto exit;
    }

    schema_entry->value->exponent = pdesc->exponent;
    if (pdesc->exponent) {
        switch (pdesc->flags & KVM_STATS_BASE_MASK) {
        case KVM_STATS_BASE_POW10:
            schema_entry->value->has_base = true;
            schema_entry->value->base = 10;
            break;
        case KVM_STATS_BASE_POW2:
            schema_entry->value->has_base = true;
            schema_entry->value->base = 2;
            break;
        default:
            goto exit;
        }
    }

    schema_entry->value->name = g_strdup(pdesc->name);
    schema_entry->next = list;
    return schema_entry;
exit:
    g_free(schema_entry->value);
    g_free(schema_entry);
    return list;
}

/* Cached stats descriptors */
typedef struct StatsDescriptors {
    const char *ident; /* cache key, currently the StatsTarget */
    struct kvm_stats_desc *kvm_stats_desc;
    struct kvm_stats_header kvm_stats_header;
    QTAILQ_ENTRY(StatsDescriptors) next;
} StatsDescriptors;

static QTAILQ_HEAD(, StatsDescriptors) stats_descriptors =
    QTAILQ_HEAD_INITIALIZER(stats_descriptors);
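
/*
 * Layout of the binary stats file descriptor returned by KVM_GET_STATS_FD,
 * as consumed below (see the kernel's Documentation/virt/kvm/api.rst):
 * a struct kvm_stats_header, then at desc_offset an array of num_desc
 * struct kvm_stats_desc entries, each followed by name_size bytes for the
 * stat name, then at data_offset the u64 values that each descriptor's
 * offset/size fields index into.
 */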
/*
 * Return the descriptors for 'target', either the cached ones that have
 * already been read or ones newly retrieved from 'stats_fd'.
 */
static StatsDescriptors *find_stats_descriptors(StatsTarget target, int stats_fd,
                                                Error **errp)
{
    StatsDescriptors *descriptors;
    const char *ident;
    struct kvm_stats_desc *kvm_stats_desc;
    struct kvm_stats_header *kvm_stats_header;
    size_t size_desc;
    ssize_t ret;

    ident = StatsTarget_str(target);
    QTAILQ_FOREACH(descriptors, &stats_descriptors, next) {
        if (g_str_equal(descriptors->ident, ident)) {
            return descriptors;
        }
    }

    descriptors = g_new0(StatsDescriptors, 1);

    /* Read stats header */
    kvm_stats_header = &descriptors->kvm_stats_header;
    ret = pread(stats_fd, kvm_stats_header, sizeof(*kvm_stats_header), 0);
    if (ret != sizeof(*kvm_stats_header)) {
        error_setg(errp, "KVM stats: failed to read stats header: "
                   "expected %zu actual %zu",
                   sizeof(*kvm_stats_header), ret);
        g_free(descriptors);
        return NULL;
    }
    size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;

    /* Read stats descriptors */
    kvm_stats_desc = g_malloc0_n(kvm_stats_header->num_desc, size_desc);
    ret = pread(stats_fd, kvm_stats_desc,
                size_desc * kvm_stats_header->num_desc,
                kvm_stats_header->desc_offset);

    if (ret != size_desc * kvm_stats_header->num_desc) {
        error_setg(errp, "KVM stats: failed to read stats descriptors: "
                   "expected %zu actual %zu",
                   size_desc * kvm_stats_header->num_desc, ret);
        g_free(descriptors);
        g_free(kvm_stats_desc);
        return NULL;
    }
    descriptors->kvm_stats_desc = kvm_stats_desc;
    descriptors->ident = ident;
    QTAILQ_INSERT_TAIL(&stats_descriptors, descriptors, next);
    return descriptors;
}

static void query_stats(StatsResultList **result, StatsTarget target,
                        strList *names, int stats_fd, CPUState *cpu,
                        Error **errp)
{
    struct kvm_stats_desc *kvm_stats_desc;
    struct kvm_stats_header *kvm_stats_header;
    StatsDescriptors *descriptors;
    g_autofree uint64_t *stats_data = NULL;
    struct kvm_stats_desc *pdesc;
    StatsList *stats_list = NULL;
    size_t size_desc, size_data = 0;
    ssize_t ret;
    int i;

    descriptors = find_stats_descriptors(target, stats_fd, errp);
    if (!descriptors) {
        return;
    }

    kvm_stats_header = &descriptors->kvm_stats_header;
    kvm_stats_desc = descriptors->kvm_stats_desc;
    size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;

    /* Tally the total data size, then read all of the data in one go */
    for (i = 0; i < kvm_stats_header->num_desc; ++i) {
        pdesc = (void *)kvm_stats_desc + i * size_desc;
        size_data += pdesc->size * sizeof(*stats_data);
    }

    stats_data = g_malloc0(size_data);
    ret = pread(stats_fd, stats_data, size_data, kvm_stats_header->data_offset);

    if (ret != size_data) {
        error_setg(errp, "KVM stats: failed to read data: "
                   "expected %zu actual %zu", size_data, ret);
        return;
    }

    for (i = 0; i < kvm_stats_header->num_desc; ++i) {
        uint64_t *stats;
        pdesc = (void *)kvm_stats_desc + i * size_desc;

        /* Add entry to the list */
        stats = (void *)stats_data + pdesc->offset;
        if (!apply_str_list_filter(pdesc->name, names)) {
            continue;
        }
        stats_list = add_kvmstat_entry(pdesc, stats, stats_list, errp);
    }

    if (!stats_list) {
        return;
    }

    switch (target) {
    case STATS_TARGET_VM:
        add_stats_entry(result, STATS_PROVIDER_KVM, NULL, stats_list);
        break;
    case STATS_TARGET_VCPU:
        add_stats_entry(result, STATS_PROVIDER_KVM,
                        cpu->parent_obj.canonical_path,
                        stats_list);
        break;
    default:
        g_assert_not_reached();
    }
}

static void query_stats_schema(StatsSchemaList **result, StatsTarget target,
                               int stats_fd, Error **errp)
{
    struct kvm_stats_desc *kvm_stats_desc;
    struct kvm_stats_header *kvm_stats_header;
    StatsDescriptors *descriptors;
    struct kvm_stats_desc *pdesc;
    StatsSchemaValueList *stats_list = NULL;
    size_t size_desc;
    int i;

    descriptors = find_stats_descriptors(target, stats_fd, errp);
    if (!descriptors) {
        return;
    }

    kvm_stats_header = &descriptors->kvm_stats_header;
    kvm_stats_desc = descriptors->kvm_stats_desc;
    size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;

    /* Convert each descriptor into a schema entry */
    for (i = 0; i < kvm_stats_header->num_desc; ++i) {
        pdesc = (void *)kvm_stats_desc + i * size_desc;
        stats_list = add_kvmschema_entry(pdesc, stats_list, errp);
    }

    add_stats_schema(result, STATS_PROVIDER_KVM, target, stats_list);
}

static void query_stats_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args)
{
    int stats_fd = cpu->kvm_vcpu_stats_fd;
    Error *local_err = NULL;

    if (stats_fd == -1) {
        error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
        error_propagate(kvm_stats_args->errp, local_err);
        return;
    }
    query_stats(kvm_stats_args->result.stats, STATS_TARGET_VCPU,
                kvm_stats_args->names, stats_fd, cpu,
                kvm_stats_args->errp);
}

static void query_stats_schema_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args)
{
    int stats_fd = cpu->kvm_vcpu_stats_fd;
    Error *local_err = NULL;

    if (stats_fd == -1) {
        error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
        error_propagate(kvm_stats_args->errp, local_err);
        return;
    }
    query_stats_schema(kvm_stats_args->result.schema, STATS_TARGET_VCPU, stats_fd,
                       kvm_stats_args->errp);
}

static void query_stats_cb(StatsResultList **result, StatsTarget target,
                           strList *names, strList *targets, Error **errp)
{
    KVMState *s = kvm_state;
    CPUState *cpu;
    int stats_fd;

    switch (target) {
    case STATS_TARGET_VM:
    {
        stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
        if (stats_fd == -1) {
            error_setg_errno(errp, errno, "KVM stats: ioctl failed");
            return;
        }
        query_stats(result, target, names, stats_fd, NULL, errp);
        close(stats_fd);
        break;
    }
    case STATS_TARGET_VCPU:
    {
        StatsArgs stats_args;
        stats_args.result.stats = result;
        stats_args.names = names;
        stats_args.errp = errp;
        CPU_FOREACH(cpu) {
            if (!apply_str_list_filter(cpu->parent_obj.canonical_path, targets)) {
                continue;
            }
            query_stats_vcpu(cpu, &stats_args);
        }
        break;
    }
    default:
        break;
    }
}

void query_stats_schemas_cb(StatsSchemaList **result, Error **errp)
{
    StatsArgs stats_args;
    KVMState *s = kvm_state;
    int stats_fd;

    stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
    if (stats_fd == -1) {
        error_setg_errno(errp, errno, "KVM stats: ioctl failed");
        return;
    }
    query_stats_schema(result, STATS_TARGET_VM, stats_fd, errp);
    close(stats_fd);

    if (first_cpu) {
        stats_args.result.schema = result;
        stats_args.errp = errp;
        query_stats_schema_vcpu(first_cpu, &stats_args);
    }
}

void kvm_mark_guest_state_protected(void)
{
    kvm_state->guest_state_protected = true;
}
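
/*
 * kvm_create_guest_memfd:
 * @size: size of the guest_memfd in bytes
 * @flags: KVM_CREATE_GUEST_MEMFD creation flags
 * @errp: pointer to an error object, filled on failure
 *
 * Returns the new guest_memfd file descriptor, or -1 with @errp set when
 * guest_memfd is unsupported or the ioctl fails.  A minimal sketch with no
 * creation flags (mem_size is only a placeholder here):
 *
 *     Error *local_err = NULL;
 *     int fd = kvm_create_guest_memfd(mem_size, 0, &local_err);
 *     if (fd < 0) {
 *         error_report_err(local_err);
 *     }
 */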
int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp)
{
    int fd;
    struct kvm_create_guest_memfd guest_memfd = {
        .size = size,
        .flags = flags,
    };

    if (!kvm_guest_memfd_supported) {
        error_setg(errp, "KVM does not support guest_memfd");
        return -1;
    }

    fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_GUEST_MEMFD, &guest_memfd);
    if (fd < 0) {
        error_setg_errno(errp, errno, "Error creating KVM guest_memfd");
        return -1;
    }

    return fd;
}