1 /*
2 * generic functions used by VFIO devices
3 *
4 * Copyright Red Hat, Inc. 2012
5 *
6 * Authors:
7 * Alex Williamson <alex.williamson@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
11 *
12 * Based on qemu-kvm device-assignment:
13 * Adapted for KVM by Qumranet.
14 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19 */
20
21 #include "qemu/osdep.h"
22 #include <sys/ioctl.h>
23 #ifdef CONFIG_KVM
24 #include <linux/kvm.h>
25 #endif
26 #include <linux/vfio.h>
27
28 #include "hw/vfio/vfio-device.h"
29 #include "hw/vfio/pci.h"
30 #include "system/address-spaces.h"
31 #include "system/memory.h"
32 #include "system/ram_addr.h"
33 #include "hw/hw.h"
34 #include "qemu/error-report.h"
35 #include "qemu/main-loop.h"
36 #include "qemu/range.h"
37 #include "system/kvm.h"
38 #include "system/reset.h"
39 #include "system/runstate.h"
40 #include "trace.h"
41 #include "qapi/error.h"
42 #include "migration/misc.h"
43 #include "migration/qemu-file.h"
44 #include "system/tcg.h"
45 #include "system/tpm.h"
46 #include "vfio-migration-internal.h"
47 #include "vfio-helpers.h"
48 #include "vfio-listener.h"
49
50 /*
51 * Device state interfaces
52 */
53
54
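/*
 * Return true when the dirty bitmap should be synced for this container:
 * dirty tracking must be started, every device must support migration, and
 * no device with pre-copy dirty page tracking disabled may still be in the
 * RUNNING or PRE_COPY states.
 */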
55 static bool vfio_log_sync_needed(const VFIOContainerBase *bcontainer)
56 {
57 VFIODevice *vbasedev;
58
59 if (!vfio_container_dirty_tracking_is_started(bcontainer)) {
60 return false;
61 }
62
63 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
64 VFIOMigration *migration = vbasedev->migration;
65
66 if (!migration) {
67 return false;
68 }
69
70 if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
71 (vfio_device_state_is_running(vbasedev) ||
72 vfio_device_state_is_precopy(vbasedev))) {
73 return false;
74 }
75 }
76 return true;
77 }
78
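/*
 * Return true if this section should be ignored by the listener: it is
 * neither RAM nor an IOMMU region, it is protected memory, or it sits in
 * the upper half of the 64-bit address space (see the comment below).
 */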
79 static bool vfio_listener_skipped_section(MemoryRegionSection *section)
80 {
81 return (!memory_region_is_ram(section->mr) &&
82 !memory_region_is_iommu(section->mr)) ||
83 memory_region_is_protected(section->mr) ||
84 /*
85 * Sizing an enabled 64-bit BAR can cause spurious mappings to
86 * addresses in the upper part of the 64-bit address space. These
87 * are never accessed by the CPU and beyond the address width of
88 * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width.
89 */
90 section->offset_within_address_space & (1ULL << 63);
91 }
92
93 /*
94 * Called with rcu_read_lock held.
95 * The returned MemoryRegion must not be accessed after calling rcu_read_unlock.
96 */
97 static MemoryRegion *vfio_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p,
98 Error **errp)
99 {
100 MemoryRegion *mr;
101
102 mr = memory_translate_iotlb(iotlb, xlat_p, errp);
103 if (mr && memory_region_has_ram_discard_manager(mr)) {
104 /*
105 * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
106 * pages will remain pinned inside vfio until unmapped, resulting in a
107 * higher memory consumption than expected. If memory would get
108 * populated again later, there would be an inconsistency between pages
109 * pinned by vfio and pages seen by QEMU. This is the case until
110 * unmapped from the IOMMU (e.g., during device reset).
111 *
112 * With malicious guests, we really only care about pinning more memory
113 * than expected. RLIMIT_MEMLOCK set for the user/process can never be
114 * exceeded and can be used to mitigate this problem.
115 */
116 warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
117 " RAM (e.g., virtio-mem) works, however, malicious"
118 " guests can trigger pinning of more memory than"
119 " intended via an IOMMU. It's possible to mitigate "
120 " by setting/adjusting RLIMIT_MEMLOCK.");
121 }
122 return mr;
123 }
124
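/*
 * vIOMMU notifier: translate the IOTLB entry and mirror the guest's
 * MAP/UNMAP into the VFIO container so the host IOMMU stays in sync.
 */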
125 static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
126 {
127 VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
128 VFIOContainerBase *bcontainer = giommu->bcontainer;
129 hwaddr iova = iotlb->iova + giommu->iommu_offset;
130 MemoryRegion *mr;
131 hwaddr xlat;
132 void *vaddr;
133 int ret;
134 Error *local_err = NULL;
135
136 trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
137 iova, iova + iotlb->addr_mask);
138
139 if (iotlb->target_as != &address_space_memory) {
140 error_setg(&local_err,
141 "Wrong target AS \"%s\", only system memory is allowed",
142 iotlb->target_as->name ? iotlb->target_as->name : "none");
143 if (migration_is_running()) {
144 migration_file_set_error(-EINVAL, local_err);
145 } else {
146 error_report_err(local_err);
147 }
148 return;
149 }
150
151 rcu_read_lock();
152
153 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
154 bool read_only;
155
156 mr = vfio_translate_iotlb(iotlb, &xlat, &local_err);
157 if (!mr) {
158 error_report_err(local_err);
159 goto out;
160 }
161 vaddr = memory_region_get_ram_ptr(mr) + xlat;
162 read_only = !(iotlb->perm & IOMMU_WO) || mr->readonly;
163
164 /*
165 * vaddr is only valid until rcu_read_unlock(). But after
166 * vfio_dma_map has set up the mapping the pages will be
167 * pinned by the kernel. This makes sure that the RAM backend
168 * of vaddr will always be there, even if the memory object is
169 * destroyed and its backing memory munmap-ed.
170 */
171 ret = vfio_container_dma_map(bcontainer, iova,
172 iotlb->addr_mask + 1, vaddr,
173 read_only, mr);
174 if (ret) {
175 error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
176 "0x%"HWADDR_PRIx", %p) = %d (%s)",
177 bcontainer, iova,
178 iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
179 }
180 } else {
181 ret = vfio_container_dma_unmap(bcontainer, iova,
182 iotlb->addr_mask + 1, iotlb, false);
183 if (ret) {
184 error_setg(&local_err,
185 "vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
186 "0x%"HWADDR_PRIx") = %d (%s)",
187 bcontainer, iova,
188 iotlb->addr_mask + 1, ret, strerror(-ret));
189 if (migration_is_running()) {
190 migration_file_set_error(ret, local_err);
191 } else {
192 error_report_err(local_err);
193 }
194 }
195 }
196 out:
197 rcu_read_unlock();
198 }
199
200 static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
201 MemoryRegionSection *section)
202 {
203 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
204 listener);
205 VFIOContainerBase *bcontainer = vrdl->bcontainer;
206 const hwaddr size = int128_get64(section->size);
207 const hwaddr iova = section->offset_within_address_space;
208 int ret;
209
210 /* Unmap with a single call. */
211 ret = vfio_container_dma_unmap(bcontainer, iova, size, NULL, false);
212 if (ret) {
213 error_report("%s: vfio_container_dma_unmap() failed: %s", __func__,
214 strerror(-ret));
215 }
216 }
217
218 static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
219 MemoryRegionSection *section)
220 {
221 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
222 listener);
223 VFIOContainerBase *bcontainer = vrdl->bcontainer;
224 const hwaddr end = section->offset_within_region +
225 int128_get64(section->size);
226 hwaddr start, next, iova;
227 void *vaddr;
228 int ret;
229
230 /*
231 * Map in (aligned within memory region) minimum granularity, so we can
232 * unmap in minimum granularity later.
233 */
234 for (start = section->offset_within_region; start < end; start = next) {
235 next = ROUND_UP(start + 1, vrdl->granularity);
236 next = MIN(next, end);
237
238 iova = start - section->offset_within_region +
239 section->offset_within_address_space;
240 vaddr = memory_region_get_ram_ptr(section->mr) + start;
241
242 ret = vfio_container_dma_map(bcontainer, iova, next - start,
243 vaddr, section->readonly, section->mr);
244 if (ret) {
245 /* Rollback */
246 vfio_ram_discard_notify_discard(rdl, section);
247 return ret;
248 }
249 }
250 return 0;
251 }
252
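/*
 * Register a RamDiscardListener for a RAM region managed by a
 * RamDiscardManager (e.g. virtio-mem), so that only its populated parts
 * get DMA-mapped and the mappings are updated on populate/discard events.
 */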
253 static void vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer,
254 MemoryRegionSection *section)
255 {
256 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
257 int target_page_size = qemu_target_page_size();
258 VFIORamDiscardListener *vrdl;
259
260 /* Ignore some corner cases not relevant in practice. */
261 g_assert(QEMU_IS_ALIGNED(section->offset_within_region, target_page_size));
262 g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
263 target_page_size));
264 g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), target_page_size));
265
266 vrdl = g_new0(VFIORamDiscardListener, 1);
267 vrdl->bcontainer = bcontainer;
268 vrdl->mr = section->mr;
269 vrdl->offset_within_address_space = section->offset_within_address_space;
270 vrdl->size = int128_get64(section->size);
271 vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
272 section->mr);
273
274 g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
275 g_assert(bcontainer->pgsizes &&
276 vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes));
277
278 ram_discard_listener_init(&vrdl->listener,
279 vfio_ram_discard_notify_populate,
280 vfio_ram_discard_notify_discard, true);
281 ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
282 QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next);
283
284 /*
285 * Sanity-check if we have a theoretically problematic setup where we could
286 * exceed the maximum number of possible DMA mappings over time. We assume
287 * that each mapped section in the same address space as a RamDiscardManager
288 * section consumes exactly one DMA mapping, with the exception of
289 * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
290 * in the same address space as RamDiscardManager sections.
291 *
292 * We assume that each section in the address space consumes one memslot.
293 * We take the number of KVM memory slots as a best guess for the maximum
294 * number of sections in the address space we could have over time,
295 * also consuming DMA mappings.
296 */
297 if (bcontainer->dma_max_mappings) {
298 unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;
299
300 #ifdef CONFIG_KVM
301 if (kvm_enabled()) {
302 max_memslots = kvm_get_max_memslots();
303 }
304 #endif
305
306 QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
307 hwaddr start, end;
308
309 start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
310 vrdl->granularity);
311 end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
312 vrdl->granularity);
313 vrdl_mappings += (end - start) / vrdl->granularity;
314 vrdl_count++;
315 }
316
317 if (vrdl_mappings + max_memslots - vrdl_count >
318 bcontainer->dma_max_mappings) {
319 warn_report("%s: possibly running out of DMA mappings. E.g., try"
320 " increasing the 'block-size' of virtio-mem devies."
321 " Maximum possible DMA mappings: %d, Maximum possible"
322 " memslots: %d", __func__, bcontainer->dma_max_mappings,
323 max_memslots);
324 }
325 }
326 }
327
328 static void vfio_ram_discard_unregister_listener(VFIOContainerBase *bcontainer,
329 MemoryRegionSection *section)
330 {
331 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
332 VFIORamDiscardListener *vrdl = NULL;
333
334 QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
335 if (vrdl->mr == section->mr &&
336 vrdl->offset_within_address_space ==
337 section->offset_within_address_space) {
338 break;
339 }
340 }
341
342 if (!vrdl) {
343 hw_error("vfio: Trying to unregister missing RAM discard listener");
344 }
345
346 ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
347 QLIST_REMOVE(vrdl, next);
348 g_free(vrdl);
349 }
350
351 static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
352 {
353 MemoryRegion *mr = section->mr;
354
355 if (!TPM_IS_CRB(mr->owner)) {
356 return false;
357 }
358
359 /* this is a known safe misaligned region, just trace for debug purposes */
360 trace_vfio_known_safe_misalignment(memory_region_name(mr),
361 section->offset_within_address_space,
362 section->offset_within_region,
363 qemu_real_host_page_size());
364 return true;
365 }
366
367 static bool vfio_listener_valid_section(MemoryRegionSection *section,
368 const char *name)
369 {
370 if (vfio_listener_skipped_section(section)) {
371 trace_vfio_listener_region_skip(name,
372 section->offset_within_address_space,
373 section->offset_within_address_space +
374 int128_get64(int128_sub(section->size, int128_one())));
375 return false;
376 }
377
378 if (unlikely((section->offset_within_address_space &
379 ~qemu_real_host_page_mask()) !=
380 (section->offset_within_region & ~qemu_real_host_page_mask()))) {
381 if (!vfio_known_safe_misalignment(section)) {
382 error_report("%s received unaligned region %s iova=0x%"PRIx64
383 " offset_within_region=0x%"PRIx64
384 " qemu_real_host_page_size=0x%"PRIxPTR,
385 __func__, memory_region_name(section->mr),
386 section->offset_within_address_space,
387 section->offset_within_region,
388 qemu_real_host_page_size());
389 }
390 return false;
391 }
392
393 return true;
394 }
395
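/*
 * Compute the host-page-aligned IOVA range covered by a section.
 * Returns false if the aligned range is empty.
 */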
396 static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer,
397 MemoryRegionSection *section,
398 hwaddr *out_iova, hwaddr *out_end,
399 Int128 *out_llend)
400 {
401 Int128 llend;
402 hwaddr iova;
403
404 iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
405 llend = int128_make64(section->offset_within_address_space);
406 llend = int128_add(llend, section->size);
407 llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
408
409 if (int128_ge(int128_make64(iova), llend)) {
410 return false;
411 }
412
413 *out_iova = iova;
414 *out_end = int128_get64(int128_sub(llend, int128_one()));
415 if (out_llend) {
416 *out_llend = llend;
417 }
418 return true;
419 }
420
421 static void vfio_listener_begin(MemoryListener *listener)
422 {
423 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
424 listener);
425 void (*listener_begin)(VFIOContainerBase *bcontainer);
426
427 listener_begin = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_begin;
428
429 if (listener_begin) {
430 listener_begin(bcontainer);
431 }
432 }
433
434 static void vfio_listener_commit(MemoryListener *listener)
435 {
436 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
437 listener);
438 void (*listener_commit)(VFIOContainerBase *bcontainer);
439
440 listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_commit;
441
442 if (listener_commit) {
443 listener_commit(bcontainer);
444 }
445 }
446
447 static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp)
448 {
449 /*
450 * MMIO region mapping failures are not fatal but in this case PCI
451 * peer-to-peer transactions are broken.
452 */
453 if (vbasedev && vbasedev->type == VFIO_DEVICE_TYPE_PCI) {
454 error_append_hint(errp, "%s: PCI peer-to-peer transactions "
455 "on BARs are not supported.\n", vbasedev->name);
456 }
457 }
458
459 VFIORamDiscardListener *vfio_find_ram_discard_listener(
460 VFIOContainerBase *bcontainer, MemoryRegionSection *section)
461 {
462 VFIORamDiscardListener *vrdl = NULL;
463
464 QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
465 if (vrdl->mr == section->mr &&
466 vrdl->offset_within_address_space ==
467 section->offset_within_address_space) {
468 break;
469 }
470 }
471
472 if (!vrdl) {
473 hw_error("vfio: Trying to sync missing RAM discard listener");
474 /* does not return */
475 }
476 return vrdl;
477 }
478
479 static void vfio_listener_region_add(MemoryListener *listener,
480 MemoryRegionSection *section)
481 {
482 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
483 listener);
484 hwaddr iova, end;
485 Int128 llend, llsize;
486 void *vaddr;
487 int ret;
488 Error *err = NULL;
489
490 if (!vfio_listener_valid_section(section, "region_add")) {
491 return;
492 }
493
494 if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
495 &llend)) {
496 if (memory_region_is_ram_device(section->mr)) {
497 trace_vfio_listener_region_add_no_dma_map(
498 memory_region_name(section->mr),
499 section->offset_within_address_space,
500 int128_getlo(section->size),
501 qemu_real_host_page_size());
502 }
503 return;
504 }
505
506 /* PPC64/pseries machine only */
507 if (!vfio_container_add_section_window(bcontainer, section, &err)) {
508 goto mmio_dma_error;
509 }
510
511 memory_region_ref(section->mr);
512
513 if (memory_region_is_iommu(section->mr)) {
514 VFIOGuestIOMMU *giommu;
515 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
516 int iommu_idx;
517
518 trace_vfio_listener_region_add_iommu(section->mr->name, iova, end);
519 /*
520 * FIXME: For VFIO iommu types which have KVM acceleration to
521 * avoid bouncing all map/unmaps through qemu this way, this
522 * would be the right place to wire that up (tell the KVM
523 * device emulation the VFIO iommu handles to use).
524 */
525 giommu = g_malloc0(sizeof(*giommu));
526 giommu->iommu_mr = iommu_mr;
527 giommu->iommu_offset = section->offset_within_address_space -
528 section->offset_within_region;
529 giommu->bcontainer = bcontainer;
530 llend = int128_add(int128_make64(section->offset_within_region),
531 section->size);
532 llend = int128_sub(llend, int128_one());
533 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
534 MEMTXATTRS_UNSPECIFIED);
535 iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
536 IOMMU_NOTIFIER_IOTLB_EVENTS,
537 section->offset_within_region,
538 int128_get64(llend),
539 iommu_idx);
540
541 ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
542 &err);
543 if (ret) {
544 g_free(giommu);
545 goto fail;
546 }
547 QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next);
548 memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);
549
550 return;
551 }
552
553 /* Here we assume that memory_region_is_ram(section->mr)==true */
554
555 /*
556 * For RAM memory regions with a RamDiscardManager, we only want to map the
557 * actually populated parts - and update the mapping whenever we're notified
558 * about changes.
559 */
560 if (memory_region_has_ram_discard_manager(section->mr)) {
561 vfio_ram_discard_register_listener(bcontainer, section);
562 return;
563 }
564
565 vaddr = memory_region_get_ram_ptr(section->mr) +
566 section->offset_within_region +
567 (iova - section->offset_within_address_space);
568
569 trace_vfio_listener_region_add_ram(iova, end, vaddr);
570
571 llsize = int128_sub(llend, int128_make64(iova));
572
573 if (memory_region_is_ram_device(section->mr)) {
574 hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;
575
576 if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
577 trace_vfio_listener_region_add_no_dma_map(
578 memory_region_name(section->mr),
579 section->offset_within_address_space,
580 int128_getlo(section->size),
581 pgmask + 1);
582 return;
583 }
584 }
585
586 ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize),
587 vaddr, section->readonly, section->mr);
588 if (ret) {
589 error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
590 "0x%"HWADDR_PRIx", %p) = %d (%s)",
591 bcontainer, iova, int128_get64(llsize), vaddr, ret,
592 strerror(-ret));
593 mmio_dma_error:
594 if (memory_region_is_ram_device(section->mr)) {
595 /* Allow unexpected mappings not to be fatal for RAM devices */
596 VFIODevice *vbasedev =
597 vfio_get_vfio_device(memory_region_owner(section->mr));
598 vfio_device_error_append(vbasedev, &err);
599 warn_report_err_once(err);
600 return;
601 }
602 goto fail;
603 }
604
605 return;
606
607 fail:
608 if (!bcontainer->initialized) {
609 /*
610 * At machine init time or when the device is attached to the
611 * VM, store the first error in the container so we can
612 * gracefully fail the device realize routine.
613 */
614 if (!bcontainer->error) {
615 error_propagate_prepend(&bcontainer->error, err,
616 "Region %s: ",
617 memory_region_name(section->mr));
618 } else {
619 error_free(err);
620 }
621 } else {
622 /*
623 * At runtime, there's not much we can do other than throw a
624 * hardware error.
625 */
626 error_report_err(err);
627 hw_error("vfio: DMA mapping failed, unable to continue");
628 }
629 }
630
631 static void vfio_listener_region_del(MemoryListener *listener,
632 MemoryRegionSection *section)
633 {
634 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
635 listener);
636 hwaddr iova, end;
637 Int128 llend, llsize;
638 int ret;
639 bool try_unmap = true;
640
641 if (!vfio_listener_valid_section(section, "region_del")) {
642 return;
643 }
644
645 if (memory_region_is_iommu(section->mr)) {
646 VFIOGuestIOMMU *giommu;
647
648 trace_vfio_listener_region_del_iommu(section->mr->name);
649 QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
650 if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
651 giommu->n.start == section->offset_within_region) {
652 memory_region_unregister_iommu_notifier(section->mr,
653 &giommu->n);
654 QLIST_REMOVE(giommu, giommu_next);
655 g_free(giommu);
656 break;
657 }
658 }
659
660 /*
661 * FIXME: We assume the one big unmap below is adequate to
662 * remove any individual page mappings in the IOMMU which
663 * might have been copied into VFIO. This works for a page table
664 * based IOMMU where a big unmap flattens a large range of IO-PTEs.
665 * That may not be true for all IOMMU types.
666 */
667 }
668
669 if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
670 &llend)) {
671 return;
672 }
673
674 llsize = int128_sub(llend, int128_make64(iova));
675
676 trace_vfio_listener_region_del(iova, end);
677
678 if (memory_region_is_ram_device(section->mr)) {
679 hwaddr pgmask;
680
681 pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;
682 try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
683 } else if (memory_region_has_ram_discard_manager(section->mr)) {
684 vfio_ram_discard_unregister_listener(bcontainer, section);
685 /* Unregistering will trigger an unmap. */
686 try_unmap = false;
687 }
688
689 if (try_unmap) {
690 bool unmap_all = false;
691
692 if (int128_eq(llsize, int128_2_64())) {
693 unmap_all = true;
694 llsize = int128_zero();
695 }
696 ret = vfio_container_dma_unmap(bcontainer, iova, int128_get64(llsize),
697 NULL, unmap_all);
698 if (ret) {
699 error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
700 "0x%"HWADDR_PRIx") = %d (%s)",
701 bcontainer, iova, int128_get64(llsize), ret,
702 strerror(-ret));
703 }
704 }
705
706 memory_region_unref(section->mr);
707
708 /* PPC64/pseries machine only */
709 vfio_container_del_section_window(bcontainer, section);
710 }
711
712 typedef struct VFIODirtyRanges {
713 hwaddr min32;
714 hwaddr max32;
715 hwaddr min64;
716 hwaddr max64;
717 hwaddr minpci64;
718 hwaddr maxpci64;
719 } VFIODirtyRanges;
720
721 typedef struct VFIODirtyRangesListener {
722 VFIOContainerBase *bcontainer;
723 VFIODirtyRanges ranges;
724 MemoryListener listener;
725 } VFIODirtyRangesListener;
726
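/*
 * Return true if the section's memory region is owned by a VFIO PCI device
 * of this container, i.e. it is device MMIO such as a BAR.
 */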
727 static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
728 VFIOContainerBase *bcontainer)
729 {
730 VFIOPCIDevice *pcidev;
731 VFIODevice *vbasedev;
732 Object *owner;
733
734 owner = memory_region_owner(section->mr);
735
736 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
737 if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
738 continue;
739 }
740 pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
741 if (OBJECT(pcidev) == owner) {
742 return true;
743 }
744 }
745
746 return false;
747 }
748
749 static void vfio_dirty_tracking_update_range(VFIODirtyRanges *range,
750 hwaddr iova, hwaddr end,
751 bool update_pci)
752 {
753 hwaddr *min, *max;
754
755 /*
756 * The address space passed to the dirty tracker is reduced to three ranges:
757 * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the
758 * PCI 64-bit hole.
759 *
760 * The underlying reports of dirty will query a sub-interval of each of
761 * these ranges.
762 *
763 * The purpose of the three range handling is to handle known cases of big
764 * holes in the address space, like the x86 AMD 1T hole, and firmware (like
765 * OVMF) which may relocate the pci-hole64 to the end of the address space.
766 * The latter would otherwise generate large ranges for tracking, stressing
767 * the limits of supported hardware. The pci-hole32 will always be below 4G
768 * (overlapping or not) so it doesn't need special handling and is part of
769 * the 32-bit range.
770 *
771 * The alternative would be an IOVATree but that has a much bigger runtime
772 * overhead and unnecessary complexity.
773 */
774 if (update_pci && iova >= UINT32_MAX) {
775 min = &range->minpci64;
776 max = &range->maxpci64;
777 } else {
778 min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
779 max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
780 }
781 if (*min > iova) {
782 *min = iova;
783 }
784 if (*max < end) {
785 *max = end;
786 }
787
788 trace_vfio_device_dirty_tracking_update(iova, end, *min, *max);
789 }
790
791 static void vfio_dirty_tracking_update(MemoryListener *listener,
792 MemoryRegionSection *section)
793 {
794 VFIODirtyRangesListener *dirty =
795 container_of(listener, VFIODirtyRangesListener, listener);
796 hwaddr iova, end;
797
798 if (!vfio_listener_valid_section(section, "tracking_update") ||
799 !vfio_get_section_iova_range(dirty->bcontainer, section,
800 &iova, &end, NULL)) {
801 return;
802 }
803
804 vfio_dirty_tracking_update_range(&dirty->ranges, iova, end,
805 vfio_section_is_vfio_pci(section, dirty->bcontainer));
806 }
807
808 static const MemoryListener vfio_dirty_tracking_listener = {
809 .name = "vfio-tracking",
810 .region_add = vfio_dirty_tracking_update,
811 };
812
813 static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer,
814 VFIODirtyRanges *ranges)
815 {
816 VFIODirtyRangesListener dirty;
817
818 memset(&dirty, 0, sizeof(dirty));
819 dirty.ranges.min32 = UINT32_MAX;
820 dirty.ranges.min64 = UINT64_MAX;
821 dirty.ranges.minpci64 = UINT64_MAX;
822 dirty.listener = vfio_dirty_tracking_listener;
823 dirty.bcontainer = bcontainer;
824
825 memory_listener_register(&dirty.listener,
826 bcontainer->space->as);
827
828 *ranges = dirty.ranges;
829
830 /*
831 * The memory listener is synchronous, and used to calculate the ranges
832 * for dirty tracking. Unregister it after we are done as we are not
833 * interested in any follow-up updates.
834 */
835 memory_listener_unregister(&dirty.listener);
836 }
837
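/*
 * Stop device dirty tracking on every device of the container using the
 * VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP feature.
 */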
838 static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer)
839 {
840 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
841 sizeof(uint64_t))] = {};
842 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
843 VFIODevice *vbasedev;
844
845 feature->argsz = sizeof(buf);
846 feature->flags = VFIO_DEVICE_FEATURE_SET |
847 VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
848
849 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
850 int ret;
851
852 if (!vbasedev->dirty_tracking) {
853 continue;
854 }
855
856 ret = vbasedev->io_ops->device_feature(vbasedev, feature);
857
858 if (ret != 0) {
859 warn_report("%s: Failed to stop DMA logging, err %d (%s)",
860 vbasedev->name, -ret, strerror(-ret));
861 }
862 vbasedev->dirty_tracking = false;
863 }
864 }
865
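/*
 * Build a VFIO_DEVICE_FEATURE_DMA_LOGGING_START request covering the
 * computed 32-bit, 64-bit and PCI 64-bit hole ranges. The caller releases
 * it with vfio_device_feature_dma_logging_start_destroy().
 */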
866 static struct vfio_device_feature *
867 vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer,
868 VFIODirtyRanges *tracking)
869 {
870 struct vfio_device_feature *feature;
871 size_t feature_size;
872 struct vfio_device_feature_dma_logging_control *control;
873 struct vfio_device_feature_dma_logging_range *ranges;
874
875 feature_size = sizeof(struct vfio_device_feature) +
876 sizeof(struct vfio_device_feature_dma_logging_control);
877 feature = g_try_malloc0(feature_size);
878 if (!feature) {
879 errno = ENOMEM;
880 return NULL;
881 }
882 feature->argsz = feature_size;
883 feature->flags = VFIO_DEVICE_FEATURE_SET |
884 VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
885
886 control = (struct vfio_device_feature_dma_logging_control *)feature->data;
887 control->page_size = qemu_real_host_page_size();
888
889 /*
890 * DMA logging uAPI guarantees to support at least a number of ranges that
891 * fits into a single host kernel base page.
892 */
893 control->num_ranges = !!tracking->max32 + !!tracking->max64 +
894 !!tracking->maxpci64;
895 ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
896 control->num_ranges);
897 if (!ranges) {
898 g_free(feature);
899 errno = ENOMEM;
900
901 return NULL;
902 }
903
904 control->ranges = (uintptr_t)ranges;
905 if (tracking->max32) {
906 ranges->iova = tracking->min32;
907 ranges->length = (tracking->max32 - tracking->min32) + 1;
908 ranges++;
909 }
910 if (tracking->max64) {
911 ranges->iova = tracking->min64;
912 ranges->length = (tracking->max64 - tracking->min64) + 1;
913 ranges++;
914 }
915 if (tracking->maxpci64) {
916 ranges->iova = tracking->minpci64;
917 ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1;
918 }
919
920 trace_vfio_device_dirty_tracking_start(control->num_ranges,
921 tracking->min32, tracking->max32,
922 tracking->min64, tracking->max64,
923 tracking->minpci64, tracking->maxpci64);
924
925 return feature;
926 }
927
928 static void vfio_device_feature_dma_logging_start_destroy(
929 struct vfio_device_feature *feature)
930 {
931 struct vfio_device_feature_dma_logging_control *control =
932 (struct vfio_device_feature_dma_logging_control *)feature->data;
933 struct vfio_device_feature_dma_logging_range *ranges =
934 (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges;
935
936 g_free(ranges);
937 g_free(feature);
938 }
939
940 static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer,
941 Error **errp)
942 {
943 struct vfio_device_feature *feature;
944 VFIODirtyRanges ranges;
945 VFIODevice *vbasedev;
946 int ret = 0;
947
948 vfio_dirty_tracking_init(bcontainer, &ranges);
949 feature = vfio_device_feature_dma_logging_start_create(bcontainer,
950 &ranges);
951 if (!feature) {
952 error_setg_errno(errp, errno, "Failed to prepare DMA logging");
953 return false;
954 }
955
956 QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
957 if (vbasedev->dirty_tracking) {
958 continue;
959 }
960
961 ret = vbasedev->io_ops->device_feature(vbasedev, feature);
962 if (ret) {
963 error_setg_errno(errp, -ret, "%s: Failed to start DMA logging",
964 vbasedev->name);
965 goto out;
966 }
967 vbasedev->dirty_tracking = true;
968 }
969
970 out:
971 if (ret) {
972 vfio_devices_dma_logging_stop(bcontainer);
973 }
974
975 vfio_device_feature_dma_logging_start_destroy(feature);
976
977 return ret == 0;
978 }
979
980 static bool vfio_listener_log_global_start(MemoryListener *listener,
981 Error **errp)
982 {
983 ERRP_GUARD();
984 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
985 listener);
986 bool ret;
987
988 if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) {
989 ret = vfio_devices_dma_logging_start(bcontainer, errp);
990 } else {
991 ret = vfio_container_set_dirty_page_tracking(bcontainer, true, errp) == 0;
992 }
993
994 if (!ret) {
995 error_prepend(errp, "vfio: Could not start dirty page tracking - ");
996 }
997 return ret;
998 }
999
1000 static void vfio_listener_log_global_stop(MemoryListener *listener)
1001 {
1002 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
1003 listener);
1004 Error *local_err = NULL;
1005 int ret = 0;
1006
1007 if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) {
1008 vfio_devices_dma_logging_stop(bcontainer);
1009 } else {
1010 ret = vfio_container_set_dirty_page_tracking(bcontainer, false,
1011 &local_err);
1012 }
1013
1014 if (ret) {
1015 error_prepend(&local_err,
1016 "vfio: Could not stop dirty page tracking - ");
1017 if (migration_is_running()) {
1018 migration_file_set_error(ret, local_err);
1019 } else {
1020 error_report_err(local_err);
1021 }
1022 }
1023 }
1024
1025 typedef struct {
1026 IOMMUNotifier n;
1027 VFIOGuestIOMMU *giommu;
1028 } vfio_giommu_dirty_notifier;
1029
1030 static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
1031 {
1032 vfio_giommu_dirty_notifier *gdn = container_of(n,
1033 vfio_giommu_dirty_notifier, n);
1034 VFIOGuestIOMMU *giommu = gdn->giommu;
1035 VFIOContainerBase *bcontainer = giommu->bcontainer;
1036 hwaddr iova = iotlb->iova + giommu->iommu_offset;
1037 ram_addr_t translated_addr;
1038 Error *local_err = NULL;
1039 int ret = -EINVAL;
1040 MemoryRegion *mr;
1041 hwaddr xlat;
1042
1043 trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
1044
1045 if (iotlb->target_as != &address_space_memory) {
1046 error_setg(&local_err,
1047 "Wrong target AS \"%s\", only system memory is allowed",
1048 iotlb->target_as->name ? iotlb->target_as->name : "none");
1049 goto out;
1050 }
1051
1052 rcu_read_lock();
1053 mr = vfio_translate_iotlb(iotlb, &xlat, &local_err);
1054 if (!mr) {
1055 goto out_unlock;
1056 }
1057 translated_addr = memory_region_get_ram_addr(mr) + xlat;
1058
1059 ret = vfio_container_query_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1,
1060 translated_addr, &local_err);
1061 if (ret) {
1062 error_prepend(&local_err,
1063 "vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
1064 "0x%"HWADDR_PRIx") failed - ", bcontainer, iova,
1065 iotlb->addr_mask + 1);
1066 }
1067
1068 out_unlock:
1069 rcu_read_unlock();
1070
1071 out:
1072 if (ret) {
1073 if (migration_is_running()) {
1074 migration_file_set_error(ret, local_err);
1075 } else {
1076 error_report_err(local_err);
1077 }
1078 }
1079 }
1080
1081 static int vfio_ram_discard_query_dirty_bitmap(MemoryRegionSection *section,
1082 void *opaque)
1083 {
1084 const hwaddr size = int128_get64(section->size);
1085 const hwaddr iova = section->offset_within_address_space;
1086 const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
1087 section->offset_within_region;
1088 VFIORamDiscardListener *vrdl = opaque;
1089 Error *local_err = NULL;
1090 int ret;
1091
1092 /*
1093 * Sync the whole mapped region (spanning multiple individual mappings)
1094 * in one go.
1095 */
1096 ret = vfio_container_query_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr,
1097 &local_err);
1098 if (ret) {
1099 error_report_err(local_err);
1100 }
1101 return ret;
1102 }
1103
1104 static int
1105 vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer,
1106 MemoryRegionSection *section)
1107 {
1108 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
1109 VFIORamDiscardListener *vrdl =
1110 vfio_find_ram_discard_listener(bcontainer, section);
1111
1112 /*
1113 * We only want/can synchronize the bitmap for actually mapped parts -
1114 * which correspond to populated parts. Replay all populated parts.
1115 */
1116 return ram_discard_manager_replay_populated(rdm, section,
1117 vfio_ram_discard_query_dirty_bitmap,
1118 vrdl);
1119 }
1120
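/*
 * Sync dirty bits for a vIOMMU section by replaying its mappings through a
 * MAP notifier that queries the dirty bitmap for each translated range.
 */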
1121 static int vfio_sync_iommu_dirty_bitmap(VFIOContainerBase *bcontainer,
1122 MemoryRegionSection *section)
1123 {
1124 VFIOGuestIOMMU *giommu;
1125 bool found = false;
1126 Int128 llend;
1127 vfio_giommu_dirty_notifier gdn;
1128 int idx;
1129
1130 QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
1131 if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
1132 giommu->n.start == section->offset_within_region) {
1133 found = true;
1134 break;
1135 }
1136 }
1137
1138 if (!found) {
1139 return 0;
1140 }
1141
1142 gdn.giommu = giommu;
1143 idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr,
1144 MEMTXATTRS_UNSPECIFIED);
1145
1146 llend = int128_add(int128_make64(section->offset_within_region),
1147 section->size);
1148 llend = int128_sub(llend, int128_one());
1149
1150 iommu_notifier_init(&gdn.n, vfio_iommu_map_dirty_notify, IOMMU_NOTIFIER_MAP,
1151 section->offset_within_region, int128_get64(llend),
1152 idx);
1153 memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);
1154
1155 return 0;
1156 }
1157
1158 static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer,
1159 MemoryRegionSection *section, Error **errp)
1160 {
1161 ram_addr_t ram_addr;
1162
1163 if (memory_region_is_iommu(section->mr)) {
1164 return vfio_sync_iommu_dirty_bitmap(bcontainer, section);
1165 } else if (memory_region_has_ram_discard_manager(section->mr)) {
1166 int ret;
1167
1168 ret = vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section);
1169 if (ret) {
1170 error_setg(errp,
1171 "Failed to sync dirty bitmap with RAM discard listener");
1172 }
1173 return ret;
1174 }
1175
1176 ram_addr = memory_region_get_ram_addr(section->mr) +
1177 section->offset_within_region;
1178
1179 return vfio_container_query_dirty_bitmap(bcontainer,
1180 REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
1181 int128_get64(section->size), ram_addr, errp);
1182 }
1183
1184 static void vfio_listener_log_sync(MemoryListener *listener,
1185 MemoryRegionSection *section)
1186 {
1187 VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
1188 listener);
1189 int ret;
1190 Error *local_err = NULL;
1191
1192 if (vfio_listener_skipped_section(section)) {
1193 return;
1194 }
1195
1196 if (vfio_log_sync_needed(bcontainer)) {
1197 ret = vfio_sync_dirty_bitmap(bcontainer, section, &local_err);
1198 if (ret) {
1199 if (migration_is_running()) {
1200 migration_file_set_error(ret, local_err);
1201 } else {
1202 error_report_err(local_err);
1203 }
1204 }
1205 }
1206 }
1207
1208 static const MemoryListener vfio_memory_listener = {
1209 .name = "vfio",
1210 .begin = vfio_listener_begin,
1211 .commit = vfio_listener_commit,
1212 .region_add = vfio_listener_region_add,
1213 .region_del = vfio_listener_region_del,
1214 .log_global_start = vfio_listener_log_global_start,
1215 .log_global_stop = vfio_listener_log_global_stop,
1216 .log_sync = vfio_listener_log_sync,
1217 };
1218
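/*
 * Register the VFIO memory listener on the container's address space.
 * Mapping errors recorded during the initial replay fail the registration
 * so device realize can be failed gracefully.
 */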
1219 bool vfio_listener_register(VFIOContainerBase *bcontainer, Error **errp)
1220 {
1221 bcontainer->listener = vfio_memory_listener;
1222 memory_listener_register(&bcontainer->listener, bcontainer->space->as);
1223
1224 if (bcontainer->error) {
1225 error_propagate_prepend(errp, bcontainer->error,
1226 "memory listener initialization failed: ");
1227 return false;
1228 }
1229
1230 return true;
1231 }
1232
1233 void vfio_listener_unregister(VFIOContainerBase *bcontainer)
1234 {
1235 memory_listener_unregister(&bcontainer->listener);
1236 }
1237