/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#ifdef CONFIG_KVM
#include <linux/kvm.h>
#endif
#include <linux/vfio.h>

#include "hw/vfio/vfio-device.h"
#include "hw/vfio/pci.h"
#include "system/address-spaces.h"
#include "system/memory.h"
#include "system/ram_addr.h"
#include "hw/hw.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/range.h"
#include "system/kvm.h"
#include "system/reset.h"
#include "system/runstate.h"
#include "trace.h"
#include "qapi/error.h"
#include "migration/misc.h"
#include "migration/qemu-file.h"
#include "system/tcg.h"
#include "system/tpm.h"
#include "vfio-migration-internal.h"
#include "vfio-helpers.h"
#include "vfio-listener.h"

/*
 * Device state interfaces
 */

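/*
 * Return true if a dirty bitmap sync is needed for this container: dirty
 * tracking has been started and every attached device is migratable and
 * does not opt out of pre-copy dirty page tracking while running or in
 * the pre-copy phase.
 */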
static bool vfio_log_sync_needed(const VFIOContainerBase *bcontainer)
{
    VFIODevice *vbasedev;

    if (!vfio_container_dirty_tracking_is_started(bcontainer)) {
        return false;
    }

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        VFIOMigration *migration = vbasedev->migration;

        if (!migration) {
            return false;
        }

        if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
            (vfio_device_state_is_running(vbasedev) ||
             vfio_device_state_is_precopy(vbasedev))) {
            return false;
        }
    }
    return true;
}

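/*
 * Skip sections that cannot (or should not) be DMA mapped: anything that is
 * neither RAM nor an IOMMU region, protected memory, and mappings in the
 * upper half of the 64-bit address space (see the comment below).
 */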
static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
    return (!memory_region_is_ram(section->mr) &&
            !memory_region_is_iommu(section->mr)) ||
           memory_region_is_protected(section->mr) ||
           /*
            * Sizing an enabled 64-bit BAR can cause spurious mappings to
            * addresses in the upper part of the 64-bit address space. These
            * are never accessed by the CPU and beyond the address width of
            * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width.
            */
           section->offset_within_address_space & (1ULL << 63);
}

/* Called with rcu_read_lock held. */
static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
                               ram_addr_t *ram_addr, bool *read_only,
                               Error **errp)
{
    bool ret, mr_has_discard_manager;

    ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
                               &mr_has_discard_manager, errp);
    if (ret && mr_has_discard_manager) {
        /*
         * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
         * pages will remain pinned inside vfio until unmapped, resulting in a
         * higher memory consumption than expected. If memory would get
         * populated again later, there would be an inconsistency between pages
         * pinned by vfio and pages seen by QEMU. This is the case until
         * unmapped from the IOMMU (e.g., during device reset).
         *
         * With malicious guests, we really only care about pinning more memory
         * than expected. RLIMIT_MEMLOCK set for the user/process can never be
         * exceeded and can be used to mitigate this problem.
         */
        warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
                         " RAM (e.g., virtio-mem) works, however, malicious"
                         " guests can trigger pinning of more memory than"
                         " intended via an IOMMU. It's possible to mitigate"
                         " this by setting/adjusting RLIMIT_MEMLOCK.");
    }
    return ret;
}

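/*
 * vIOMMU MAP/UNMAP notifier: mirror guest IOMMU mappings into the host
 * container with vfio_container_dma_map()/unmap(). While migration is
 * running, errors are forwarded to the migration stream instead of being
 * reported directly.
 */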
static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
    VFIOContainerBase *bcontainer = giommu->bcontainer;
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    void *vaddr;
    int ret;
    Error *local_err = NULL;

    trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
                                iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_setg(&local_err,
                   "Wrong target AS \"%s\", only system memory is allowed",
                   iotlb->target_as->name ? iotlb->target_as->name : "none");
        if (migration_is_running()) {
            migration_file_set_error(-EINVAL, local_err);
        } else {
            error_report_err(local_err);
        }
        return;
    }

    rcu_read_lock();

    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
        bool read_only;

        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) {
            error_report_err(local_err);
            goto out;
        }
        /*
         * vaddr is only valid until rcu_read_unlock(). But after
         * vfio_dma_map has set up the mapping the pages will be
         * pinned by the kernel. This makes sure that the RAM backend
         * of vaddr will always be there, even if the memory object is
         * destroyed and its backing memory munmap-ed.
         */
        ret = vfio_container_dma_map(bcontainer, iova,
                                     iotlb->addr_mask + 1, vaddr,
                                     read_only);
        if (ret) {
            error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx", %p) = %d (%s)",
                         bcontainer, iova,
                         iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
        }
    } else {
        ret = vfio_container_dma_unmap(bcontainer, iova,
                                       iotlb->addr_mask + 1, iotlb, false);
        if (ret) {
            error_setg(&local_err,
                       "vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                       "0x%"HWADDR_PRIx") = %d (%s)",
                       bcontainer, iova,
                       iotlb->addr_mask + 1, ret, strerror(-ret));
            if (migration_is_running()) {
                migration_file_set_error(ret, local_err);
            } else {
                error_report_err(local_err);
            }
        }
    }
out:
    rcu_read_unlock();
}

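/*
 * RamDiscardManager notification that a range was discarded: drop the
 * corresponding host container mapping with a single unmap call.
 */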
static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    VFIOContainerBase *bcontainer = vrdl->bcontainer;
    const hwaddr size = int128_get64(section->size);
    const hwaddr iova = section->offset_within_address_space;
    int ret;

    /* Unmap with a single call. */
    ret = vfio_container_dma_unmap(bcontainer, iova, size, NULL, false);
    if (ret) {
        error_report("%s: vfio_container_dma_unmap() failed: %s", __func__,
                     strerror(-ret));
    }
}

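/*
 * RamDiscardManager notification that a range was populated: DMA map it in
 * chunks of the listener granularity so it can later be unmapped in that
 * same granularity. On failure, the already mapped chunks are rolled back.
 */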
static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    VFIOContainerBase *bcontainer = vrdl->bcontainer;
    const hwaddr end = section->offset_within_region +
                       int128_get64(section->size);
    hwaddr start, next, iova;
    void *vaddr;
    int ret;

    /*
     * Map in (aligned within memory region) minimum granularity, so we can
     * unmap in minimum granularity later.
     */
    for (start = section->offset_within_region; start < end; start = next) {
        next = ROUND_UP(start + 1, vrdl->granularity);
        next = MIN(next, end);

        iova = start - section->offset_within_region +
               section->offset_within_address_space;
        vaddr = memory_region_get_ram_ptr(section->mr) + start;

        ret = vfio_container_dma_map(bcontainer, iova, next - start,
                                     vaddr, section->readonly);
        if (ret) {
            /* Rollback */
            vfio_ram_discard_notify_discard(rdl, section);
            return ret;
        }
    }
    return 0;
}

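/*
 * Register a RamDiscardListener for a RAM section backed by a
 * RamDiscardManager (e.g. virtio-mem), so that only populated parts get DMA
 * mapped, and warn if the resulting setup could exceed the container's
 * maximum number of DMA mappings over time.
 */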
static void vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer,
                                               MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    int target_page_size = qemu_target_page_size();
    VFIORamDiscardListener *vrdl;

    /* Ignore some corner cases not relevant in practice. */
    g_assert(QEMU_IS_ALIGNED(section->offset_within_region, target_page_size));
    g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
                             target_page_size));
    g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), target_page_size));

    vrdl = g_new0(VFIORamDiscardListener, 1);
    vrdl->bcontainer = bcontainer;
    vrdl->mr = section->mr;
    vrdl->offset_within_address_space = section->offset_within_address_space;
    vrdl->size = int128_get64(section->size);
    vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
                                                                section->mr);

    g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
    g_assert(bcontainer->pgsizes &&
             vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes));

    ram_discard_listener_init(&vrdl->listener,
                              vfio_ram_discard_notify_populate,
                              vfio_ram_discard_notify_discard, true);
    ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
    QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next);

    /*
     * Sanity-check if we have a theoretically problematic setup where we could
     * exceed the maximum number of possible DMA mappings over time. We assume
     * that each mapped section in the same address space as a RamDiscardManager
     * section consumes exactly one DMA mapping, with the exception of
     * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
     * in the same address space as RamDiscardManager sections.
     *
     * We assume that each section in the address space consumes one memslot.
     * We take the number of KVM memory slots as a best guess for the maximum
     * number of sections in the address space we could have over time,
     * also consuming DMA mappings.
     */
    if (bcontainer->dma_max_mappings) {
        unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;

#ifdef CONFIG_KVM
        if (kvm_enabled()) {
            max_memslots = kvm_get_max_memslots();
        }
#endif

        QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
            hwaddr start, end;

            start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
                                    vrdl->granularity);
            end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
                           vrdl->granularity);
            vrdl_mappings += (end - start) / vrdl->granularity;
            vrdl_count++;
        }

        if (vrdl_mappings + max_memslots - vrdl_count >
            bcontainer->dma_max_mappings) {
            warn_report("%s: possibly running out of DMA mappings. E.g., try"
                        " increasing the 'block-size' of virtio-mem devices."
                        " Maximum possible DMA mappings: %d, Maximum possible"
                        " memslots: %d", __func__, bcontainer->dma_max_mappings,
                        max_memslots);
        }
    }
}

static void vfio_ram_discard_unregister_listener(VFIOContainerBase *bcontainer,
                                                 MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl = NULL;

    QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
        if (vrdl->mr == section->mr &&
            vrdl->offset_within_address_space ==
            section->offset_within_address_space) {
            break;
        }
    }

    if (!vrdl) {
        hw_error("vfio: Trying to unregister missing RAM discard listener");
    }

    ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
    QLIST_REMOVE(vrdl, next);
    g_free(vrdl);
}

static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (!TPM_IS_CRB(mr->owner)) {
        return false;
    }

    /* this is a known safe misaligned region, just trace for debug purpose */
    trace_vfio_known_safe_misalignment(memory_region_name(mr),
                                       section->offset_within_address_space,
                                       section->offset_within_region,
                                       qemu_real_host_page_size());
    return true;
}

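/*
 * Return true if the section should be handled by the listener; skipped and
 * (unknown) misaligned sections are rejected, with a trace or error report
 * explaining why.
 */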
static bool vfio_listener_valid_section(MemoryRegionSection *section,
                                        const char *name)
{
    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_skip(name,
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return false;
    }

    if (unlikely((section->offset_within_address_space &
                  ~qemu_real_host_page_mask()) !=
                 (section->offset_within_region &
                  ~qemu_real_host_page_mask()))) {
        if (!vfio_known_safe_misalignment(section)) {
            error_report("%s received unaligned region %s iova=0x%"PRIx64
                         " offset_within_region=0x%"PRIx64
                         " qemu_real_host_page_size=0x%"PRIxPTR,
                         __func__, memory_region_name(section->mr),
                         section->offset_within_address_space,
                         section->offset_within_region,
                         qemu_real_host_page_size());
        }
        return false;
    }

    return true;
}

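/*
 * Compute the host-page-aligned IOVA range covered by a section. Returns
 * false if the aligned range is empty; otherwise fills in the first IOVA,
 * the last IOVA (inclusive) and, optionally, the exclusive 128-bit end.
 */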
static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer,
                                        MemoryRegionSection *section,
                                        hwaddr *out_iova, hwaddr *out_end,
                                        Int128 *out_llend)
{
    Int128 llend;
    hwaddr iova;

    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
    llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));

    if (int128_ge(int128_make64(iova), llend)) {
        return false;
    }

    *out_iova = iova;
    *out_end = int128_get64(int128_sub(llend, int128_one()));
    if (out_llend) {
        *out_llend = llend;
    }
    return true;
}

static void vfio_listener_begin(MemoryListener *listener)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    void (*listener_begin)(VFIOContainerBase *bcontainer);

    listener_begin = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_begin;

    if (listener_begin) {
        listener_begin(bcontainer);
    }
}

static void vfio_listener_commit(MemoryListener *listener)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    void (*listener_commit)(VFIOContainerBase *bcontainer);

    listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_commit;

    if (listener_commit) {
        listener_commit(bcontainer);
    }
}

static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp)
{
    /*
     * MMIO region mapping failures are not fatal but in this case PCI
     * peer-to-peer transactions are broken.
     */
    if (vbasedev && vbasedev->type == VFIO_DEVICE_TYPE_PCI) {
        error_append_hint(errp, "%s: PCI peer-to-peer transactions "
                          "on BARs are not supported.\n", vbasedev->name);
    }
}

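/*
 * Map a newly added section into the container: IOMMU regions get an IOMMU
 * notifier registered and replayed, RamDiscardManager-backed RAM is handled
 * by a RamDiscardListener, and plain RAM is DMA mapped directly. Mapping
 * failures for RAM device regions (MMIO BARs) are downgraded to warnings.
 */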
static void vfio_listener_region_add(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    void *vaddr;
    int ret;
    Error *err = NULL;

    if (!vfio_listener_valid_section(section, "region_add")) {
        return;
    }

    if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
                                     &llend)) {
        if (memory_region_is_ram_device(section->mr)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                qemu_real_host_page_size());
        }
        return;
    }

    /* PPC64/pseries machine only */
    if (!vfio_container_add_section_window(bcontainer, section, &err)) {
        goto mmio_dma_error;
    }

    memory_region_ref(section->mr);

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        int iommu_idx;

        trace_vfio_listener_region_add_iommu(section->mr->name, iova, end);
        /*
         * FIXME: For VFIO iommu types which have KVM acceleration to
         * avoid bouncing all map/unmaps through qemu this way, this
         * would be the right place to wire that up (tell the KVM
         * device emulation the VFIO iommu handles to use).
         */
        giommu = g_malloc0(sizeof(*giommu));
        giommu->iommu_mr = iommu_mr;
        giommu->iommu_offset = section->offset_within_address_space -
                               section->offset_within_region;
        giommu->bcontainer = bcontainer;
        llend = int128_add(int128_make64(section->offset_within_region),
                           section->size);
        llend = int128_sub(llend, int128_one());
        iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
                                                       MEMTXATTRS_UNSPECIFIED);
        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
                            IOMMU_NOTIFIER_IOTLB_EVENTS,
                            section->offset_within_region,
                            int128_get64(llend),
                            iommu_idx);

        ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
                                                    &err);
        if (ret) {
            g_free(giommu);
            goto fail;
        }
        QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next);
        memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);

        return;
    }

    /* Here we assume that memory_region_is_ram(section->mr)==true */

    /*
     * For RAM memory regions with a RamDiscardManager, we only want to map the
     * actually populated parts - and update the mapping whenever we're notified
     * about changes.
     */
    if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_ram_discard_register_listener(bcontainer, section);
        return;
    }

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vfio_listener_region_add_ram(iova, end, vaddr);

    llsize = int128_sub(llend, int128_make64(iova));

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;

        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                pgmask + 1);
            return;
        }
    }

    ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize),
                                 vaddr, section->readonly);
    if (ret) {
        error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
                   "0x%"HWADDR_PRIx", %p) = %d (%s)",
                   bcontainer, iova, int128_get64(llsize), vaddr, ret,
                   strerror(-ret));
    mmio_dma_error:
        if (memory_region_is_ram_device(section->mr)) {
            /* Allow unexpected mappings not to be fatal for RAM devices */
            VFIODevice *vbasedev =
                vfio_get_vfio_device(memory_region_owner(section->mr));
            vfio_device_error_append(vbasedev, &err);
            warn_report_err_once(err);
            return;
        }
        goto fail;
    }

    return;

fail:
    if (!bcontainer->initialized) {
        /*
         * At machine init time or when the device is attached to the
         * VM, store the first error in the container so we can
         * gracefully fail the device realize routine.
         */
        if (!bcontainer->error) {
            error_propagate_prepend(&bcontainer->error, err,
                                    "Region %s: ",
                                    memory_region_name(section->mr));
        } else {
            error_free(err);
        }
    } else {
        /*
         * At runtime, there's not much we can do other than throw a
         * hardware error.
         */
        error_report_err(err);
        hw_error("vfio: DMA mapping failed, unable to continue");
    }
}

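/*
 * Tear down the mappings installed by vfio_listener_region_add() when a
 * section is removed: unregister IOMMU notifiers or RamDiscardListeners and
 * unmap the IOVA range from the container.
 */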
static void vfio_listener_region_del(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    int ret;
    bool try_unmap = true;

    if (!vfio_listener_valid_section(section, "region_del")) {
        return;
    }

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        trace_vfio_listener_region_del_iommu(section->mr->name);
        QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
            if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
                giommu->n.start == section->offset_within_region) {
                memory_region_unregister_iommu_notifier(section->mr,
                                                        &giommu->n);
                QLIST_REMOVE(giommu, giommu_next);
                g_free(giommu);
                break;
            }
        }

        /*
         * FIXME: We assume the one big unmap below is adequate to
         * remove any individual page mappings in the IOMMU which
         * might have been copied into VFIO. This works for a page table
         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
         * That may not be true for all IOMMU types.
         */
    }

    if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
                                     &llend)) {
        return;
    }

    llsize = int128_sub(llend, int128_make64(iova));

    trace_vfio_listener_region_del(iova, end);

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask;

        pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;
        try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
    } else if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_ram_discard_unregister_listener(bcontainer, section);
        /* Unregistering will trigger an unmap. */
        try_unmap = false;
    }

    if (try_unmap) {
        bool unmap_all = false;

        if (int128_eq(llsize, int128_2_64())) {
            unmap_all = true;
            llsize = int128_zero();
        }
        ret = vfio_container_dma_unmap(bcontainer, iova, int128_get64(llsize),
                                       NULL, unmap_all);
        if (ret) {
            error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%s)",
                         bcontainer, iova, int128_get64(llsize), ret,
                         strerror(-ret));
        }
    }

    memory_region_unref(section->mr);

    /* PPC64/pseries machine only */
    vfio_container_del_section_window(bcontainer, section);
}

typedef struct VFIODirtyRanges {
    hwaddr min32;
    hwaddr max32;
    hwaddr min64;
    hwaddr max64;
    hwaddr minpci64;
    hwaddr maxpci64;
} VFIODirtyRanges;

typedef struct VFIODirtyRangesListener {
    VFIOContainerBase *bcontainer;
    VFIODirtyRanges ranges;
    MemoryListener listener;
} VFIODirtyRangesListener;

static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
                                     VFIOContainerBase *bcontainer)
{
    VFIOPCIDevice *pcidev;
    VFIODevice *vbasedev;
    Object *owner;

    owner = memory_region_owner(section->mr);

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
            continue;
        }
        pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
        if (OBJECT(pcidev) == owner) {
            return true;
        }
    }

    return false;
}

static void vfio_dirty_tracking_update_range(VFIODirtyRanges *range,
                                             hwaddr iova, hwaddr end,
                                             bool update_pci)
{
    hwaddr *min, *max;

    /*
     * The address space passed to the dirty tracker is reduced to three ranges:
     * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the
     * PCI 64-bit hole.
     *
     * The underlying reports of dirty will query a sub-interval of each of
     * these ranges.
     *
     * The purpose of the three range handling is to handle known cases of big
     * holes in the address space, like the x86 AMD 1T hole, and firmware (like
     * OVMF) which may relocate the pci-hole64 to the end of the address space.
     * The latter would otherwise generate large ranges for tracking, stressing
     * the limits of supported hardware. The pci-hole32 will always be below 4G
     * (overlapping or not) so it doesn't need special handling and is part of
     * the 32-bit range.
     *
     * The alternative would be an IOVATree but that has a much bigger runtime
     * overhead and unnecessary complexity.
     */
    if (update_pci && iova >= UINT32_MAX) {
        min = &range->minpci64;
        max = &range->maxpci64;
    } else {
        min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
        max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
    }
    if (*min > iova) {
        *min = iova;
    }
    if (*max < end) {
        *max = end;
    }

    trace_vfio_device_dirty_tracking_update(iova, end, *min, *max);
}

static void vfio_dirty_tracking_update(MemoryListener *listener,
                                       MemoryRegionSection *section)
{
    VFIODirtyRangesListener *dirty =
        container_of(listener, VFIODirtyRangesListener, listener);
    hwaddr iova, end;

    if (!vfio_listener_valid_section(section, "tracking_update") ||
        !vfio_get_section_iova_range(dirty->bcontainer, section,
                                     &iova, &end, NULL)) {
        return;
    }

    vfio_dirty_tracking_update_range(&dirty->ranges, iova, end,
                                     vfio_section_is_vfio_pci(section,
                                                              dirty->bcontainer));
}

static const MemoryListener vfio_dirty_tracking_listener = {
    .name = "vfio-tracking",
    .region_add = vfio_dirty_tracking_update,
};

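/*
 * Compute the IOVA ranges (32-bit, 64-bit and PCI 64-bit hole) to hand to
 * the device dirty tracker by replaying the container's address space
 * through a temporary, synchronous memory listener.
 */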
static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer,
                                     VFIODirtyRanges *ranges)
{
    VFIODirtyRangesListener dirty;

    memset(&dirty, 0, sizeof(dirty));
    dirty.ranges.min32 = UINT32_MAX;
    dirty.ranges.min64 = UINT64_MAX;
    dirty.ranges.minpci64 = UINT64_MAX;
    dirty.listener = vfio_dirty_tracking_listener;
    dirty.bcontainer = bcontainer;

    memory_listener_register(&dirty.listener,
                             bcontainer->space->as);

    *ranges = dirty.ranges;

    /*
     * The memory listener is synchronous, and used to calculate the range
     * to dirty tracking. Unregister it after we are done as we are not
     * interested in any follow-up updates.
     */
    memory_listener_unregister(&dirty.listener);
}

static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    VFIODevice *vbasedev;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_SET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        int ret;

        if (!vbasedev->dirty_tracking) {
            continue;
        }

        ret = vbasedev->io_ops->device_feature(vbasedev, feature);

        if (ret != 0) {
            warn_report("%s: Failed to stop DMA logging, err %d (%s)",
                        vbasedev->name, -ret, strerror(-ret));
        }
        vbasedev->dirty_tracking = false;
    }
}

static struct vfio_device_feature *
vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer,
                                             VFIODirtyRanges *tracking)
{
    struct vfio_device_feature *feature;
    size_t feature_size;
    struct vfio_device_feature_dma_logging_control *control;
    struct vfio_device_feature_dma_logging_range *ranges;

    feature_size = sizeof(struct vfio_device_feature) +
                   sizeof(struct vfio_device_feature_dma_logging_control);
    feature = g_try_malloc0(feature_size);
    if (!feature) {
        errno = ENOMEM;
        return NULL;
    }
    feature->argsz = feature_size;
    feature->flags = VFIO_DEVICE_FEATURE_SET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;

    control = (struct vfio_device_feature_dma_logging_control *)feature->data;
    control->page_size = qemu_real_host_page_size();

    /*
     * DMA logging uAPI guarantees to support at least a number of ranges that
     * fits into a single host kernel base page.
     */
    control->num_ranges = !!tracking->max32 + !!tracking->max64 +
                          !!tracking->maxpci64;
    ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
                        control->num_ranges);
    if (!ranges) {
        g_free(feature);
        errno = ENOMEM;

        return NULL;
    }

    control->ranges = (uintptr_t)ranges;
    if (tracking->max32) {
        ranges->iova = tracking->min32;
        ranges->length = (tracking->max32 - tracking->min32) + 1;
        ranges++;
    }
    if (tracking->max64) {
        ranges->iova = tracking->min64;
        ranges->length = (tracking->max64 - tracking->min64) + 1;
        ranges++;
    }
    if (tracking->maxpci64) {
        ranges->iova = tracking->minpci64;
        ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1;
    }

    trace_vfio_device_dirty_tracking_start(control->num_ranges,
                                           tracking->min32, tracking->max32,
                                           tracking->min64, tracking->max64,
                                           tracking->minpci64,
                                           tracking->maxpci64);

    return feature;
}

static void vfio_device_feature_dma_logging_start_destroy(
    struct vfio_device_feature *feature)
{
    struct vfio_device_feature_dma_logging_control *control =
        (struct vfio_device_feature_dma_logging_control *)feature->data;
    struct vfio_device_feature_dma_logging_range *ranges =
        (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges;

    g_free(ranges);
    g_free(feature);
}

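/*
 * Enable device DMA logging on all devices of the container using the
 * VFIO_DEVICE_FEATURE_DMA_LOGGING_START feature. On any failure, logging is
 * stopped again on the devices that were already started.
 */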
static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer,
                                           Error **errp)
{
    struct vfio_device_feature *feature;
    VFIODirtyRanges ranges;
    VFIODevice *vbasedev;
    int ret = 0;

    vfio_dirty_tracking_init(bcontainer, &ranges);
    feature = vfio_device_feature_dma_logging_start_create(bcontainer,
                                                           &ranges);
    if (!feature) {
        error_setg_errno(errp, errno, "Failed to prepare DMA logging");
        return false;
    }

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        if (vbasedev->dirty_tracking) {
            continue;
        }

        ret = vbasedev->io_ops->device_feature(vbasedev, feature);
        if (ret) {
            error_setg_errno(errp, -ret, "%s: Failed to start DMA logging",
                             vbasedev->name);
            goto out;
        }
        vbasedev->dirty_tracking = true;
    }

out:
    if (ret) {
        vfio_devices_dma_logging_stop(bcontainer);
    }

    vfio_device_feature_dma_logging_start_destroy(feature);

    return ret == 0;
}

static bool vfio_listener_log_global_start(MemoryListener *listener,
                                           Error **errp)
{
    ERRP_GUARD();
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    bool ret;

    if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) {
        ret = vfio_devices_dma_logging_start(bcontainer, errp);
    } else {
        ret = vfio_container_set_dirty_page_tracking(bcontainer, true, errp) == 0;
    }

    if (!ret) {
        error_prepend(errp, "vfio: Could not start dirty page tracking - ");
    }
    return ret;
}

static void vfio_listener_log_global_stop(MemoryListener *listener)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    Error *local_err = NULL;
    int ret = 0;

    if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) {
        vfio_devices_dma_logging_stop(bcontainer);
    } else {
        ret = vfio_container_set_dirty_page_tracking(bcontainer, false,
                                                     &local_err);
    }

    if (ret) {
        error_prepend(&local_err,
                      "vfio: Could not stop dirty page tracking - ");
        if (migration_is_running()) {
            migration_file_set_error(ret, local_err);
        } else {
            error_report_err(local_err);
        }
    }
}

typedef struct {
    IOMMUNotifier n;
    VFIOGuestIOMMU *giommu;
} vfio_giommu_dirty_notifier;

static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    vfio_giommu_dirty_notifier *gdn = container_of(n,
                                                   vfio_giommu_dirty_notifier,
                                                   n);
    VFIOGuestIOMMU *giommu = gdn->giommu;
    VFIOContainerBase *bcontainer = giommu->bcontainer;
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    ram_addr_t translated_addr;
    Error *local_err = NULL;
    int ret = -EINVAL;

    trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_setg(&local_err,
                   "Wrong target AS \"%s\", only system memory is allowed",
                   iotlb->target_as->name ? iotlb->target_as->name : "none");
        goto out;
    }

    rcu_read_lock();
    if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) {
        goto out_unlock;
    }

    ret = vfio_container_query_dirty_bitmap(bcontainer, iova,
                                            iotlb->addr_mask + 1,
                                            translated_addr, &local_err);
    if (ret) {
        error_prepend(&local_err,
                      "vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
                      "0x%"HWADDR_PRIx") failed - ", bcontainer, iova,
                      iotlb->addr_mask + 1);
    }

out_unlock:
    rcu_read_unlock();

out:
    if (ret) {
        if (migration_is_running()) {
            migration_file_set_error(ret, local_err);
        } else {
            error_report_err(local_err);
        }
    }
}

static int vfio_ram_discard_query_dirty_bitmap(MemoryRegionSection *section,
                                               void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    const hwaddr iova = section->offset_within_address_space;
    const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
                                section->offset_within_region;
    VFIORamDiscardListener *vrdl = opaque;
    Error *local_err = NULL;
    int ret;

    /*
     * Sync the whole mapped region (spanning multiple individual mappings)
     * in one go.
     */
    ret = vfio_container_query_dirty_bitmap(vrdl->bcontainer, iova, size,
                                            ram_addr, &local_err);
    if (ret) {
        error_report_err(local_err);
    }
    return ret;
}

static int
vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer,
                                            MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl = NULL;

    QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
        if (vrdl->mr == section->mr &&
            vrdl->offset_within_address_space ==
            section->offset_within_address_space) {
            break;
        }
    }

    if (!vrdl) {
        hw_error("vfio: Trying to sync missing RAM discard listener");
    }

    /*
     * We only want/can synchronize the bitmap for actually mapped parts -
     * which correspond to populated parts. Replay all populated parts.
     */
    return ram_discard_manager_replay_populated(rdm, section,
                                                vfio_ram_discard_query_dirty_bitmap,
                                                vrdl);
}

static int vfio_sync_iommu_dirty_bitmap(VFIOContainerBase *bcontainer,
                                        MemoryRegionSection *section)
{
    VFIOGuestIOMMU *giommu;
    bool found = false;
    Int128 llend;
    vfio_giommu_dirty_notifier gdn;
    int idx;

    QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
        if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
            giommu->n.start == section->offset_within_region) {
            found = true;
            break;
        }
    }

    if (!found) {
        return 0;
    }

    gdn.giommu = giommu;
    idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr,
                                             MEMTXATTRS_UNSPECIFIED);

    llend = int128_add(int128_make64(section->offset_within_region),
                       section->size);
    llend = int128_sub(llend, int128_one());

    iommu_notifier_init(&gdn.n, vfio_iommu_map_dirty_notify, IOMMU_NOTIFIER_MAP,
                        section->offset_within_region, int128_get64(llend),
                        idx);
    memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);

    return 0;
}

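/*
 * Sync the dirty bitmap for one section, dispatching on the region type:
 * vIOMMU regions are replayed through a dirty notifier, RamDiscardManager
 * regions sync only their populated parts, and plain RAM is queried directly
 * from the container.
 */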
static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer,
                                  MemoryRegionSection *section, Error **errp)
{
    ram_addr_t ram_addr;

    if (memory_region_is_iommu(section->mr)) {
        return vfio_sync_iommu_dirty_bitmap(bcontainer, section);
    } else if (memory_region_has_ram_discard_manager(section->mr)) {
        int ret;

        ret = vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section);
        if (ret) {
            error_setg(errp,
                       "Failed to sync dirty bitmap with RAM discard listener");
        }
        return ret;
    }

    ram_addr = memory_region_get_ram_addr(section->mr) +
               section->offset_within_region;

    return vfio_container_query_dirty_bitmap(bcontainer,
                   REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
                   int128_get64(section->size), ram_addr, errp);
}

static void vfio_listener_log_sync(MemoryListener *listener,
                                   MemoryRegionSection *section)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    int ret;
    Error *local_err = NULL;

    if (vfio_listener_skipped_section(section)) {
        return;
    }

    if (vfio_log_sync_needed(bcontainer)) {
        ret = vfio_sync_dirty_bitmap(bcontainer, section, &local_err);
        if (ret) {
            if (migration_is_running()) {
                migration_file_set_error(ret, local_err);
            } else {
                error_report_err(local_err);
            }
        }
    }
}

static const MemoryListener vfio_memory_listener = {
    .name = "vfio",
    .begin = vfio_listener_begin,
    .commit = vfio_listener_commit,
    .region_add = vfio_listener_region_add,
    .region_del = vfio_listener_region_del,
    .log_global_start = vfio_listener_log_global_start,
    .log_global_stop = vfio_listener_log_global_stop,
    .log_sync = vfio_listener_log_sync,
};

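/*
 * Register the VFIO memory listener on the container's address space and
 * report any error recorded while replaying the existing memory map.
 */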
bool vfio_listener_register(VFIOContainerBase *bcontainer, Error **errp)
{
    bcontainer->listener = vfio_memory_listener;
    memory_listener_register(&bcontainer->listener, bcontainer->space->as);

    if (bcontainer->error) {
        error_propagate_prepend(errp, bcontainer->error,
                                "memory listener initialization failed: ");
        return false;
    }

    return true;
}

void vfio_listener_unregister(VFIOContainerBase *bcontainer)
{
    memory_listener_unregister(&bcontainer->listener);
}