/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#ifdef CONFIG_KVM
#include <linux/kvm.h>
#endif
#include <linux/vfio.h>

#include "hw/vfio/vfio-device.h"
#include "hw/vfio/pci.h"
#include "system/address-spaces.h"
#include "system/memory.h"
#include "system/ram_addr.h"
#include "hw/hw.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/range.h"
#include "system/kvm.h"
#include "system/reset.h"
#include "system/runstate.h"
#include "trace.h"
#include "qapi/error.h"
#include "migration/misc.h"
#include "migration/qemu-file.h"
#include "system/tcg.h"
#include "system/tpm.h"
#include "vfio-migration-internal.h"
#include "vfio-helpers.h"
#include "vfio-listener.h"

/*
 * Device state interfaces
 */

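/*
 * Dirty page log sync is only useful once dirty tracking has actually been
 * started on the container. Even then, the sync is skipped if any device
 * either has no migration state yet, or has pre-copy dirty page tracking
 * disabled while it is still in the running or pre-copy states.
 */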
static bool vfio_log_sync_needed(const VFIOContainerBase *bcontainer)
{
    VFIODevice *vbasedev;

    if (!vfio_container_dirty_tracking_is_started(bcontainer)) {
        return false;
    }

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        VFIOMigration *migration = vbasedev->migration;

        if (!migration) {
            return false;
        }

        if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
            (vfio_device_state_is_running(vbasedev) ||
             vfio_device_state_is_precopy(vbasedev))) {
            return false;
        }
    }
    return true;
}

static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
    return (!memory_region_is_ram(section->mr) &&
            !memory_region_is_iommu(section->mr)) ||
           memory_region_is_protected(section->mr) ||
           /*
            * Sizing an enabled 64-bit BAR can cause spurious mappings to
            * addresses in the upper part of the 64-bit address space. These
            * are never accessed by the CPU and beyond the address width of
            * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width.
            */
           section->offset_within_address_space & (1ULL << 63);
}

/*
 * Called with rcu_read_lock held.
 * The returned MemoryRegion must not be accessed after calling rcu_read_unlock.
 */
static MemoryRegion *vfio_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p,
                                          Error **errp)
{
    MemoryRegion *mr;

    mr = memory_translate_iotlb(iotlb, xlat_p, errp);
    if (mr && memory_region_has_ram_discard_manager(mr)) {
        /*
         * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
         * pages will remain pinned inside vfio until unmapped, resulting in a
         * higher memory consumption than expected. If memory would get
         * populated again later, there would be an inconsistency between pages
         * pinned by vfio and pages seen by QEMU. This is the case until
         * unmapped from the IOMMU (e.g., during device reset).
         *
         * With malicious guests, we really only care about pinning more memory
         * than expected. RLIMIT_MEMLOCK set for the user/process can never be
         * exceeded and can be used to mitigate this problem.
         */
        warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
                         " RAM (e.g., virtio-mem) works, however, malicious"
                         " guests can trigger pinning of more memory than"
                         " intended via an IOMMU. It's possible to mitigate"
                         " this by setting/adjusting RLIMIT_MEMLOCK.");
    }
    return mr;
}

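/*
 * vIOMMU notifier: called for each guest IOTLB MAP/UNMAP event within the
 * registered range. MAP events are translated to a host virtual address
 * under the RCU read lock and forwarded to the container as DMA mappings;
 * UNMAP events tear the corresponding mapping down again. While a migration
 * is running, unmap and address-space errors are reported through the
 * migration stream so the migration can fail cleanly.
 */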
static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
    VFIOContainerBase *bcontainer = giommu->bcontainer;
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    MemoryRegion *mr;
    hwaddr xlat;
    void *vaddr;
    int ret;
    Error *local_err = NULL;

    trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
                                iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_setg(&local_err,
                   "Wrong target AS \"%s\", only system memory is allowed",
                   iotlb->target_as->name ? iotlb->target_as->name : "none");
        if (migration_is_running()) {
            migration_file_set_error(-EINVAL, local_err);
        } else {
            error_report_err(local_err);
        }
        return;
    }

    rcu_read_lock();

    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
        bool read_only;

        mr = vfio_translate_iotlb(iotlb, &xlat, &local_err);
        if (!mr) {
            error_report_err(local_err);
            goto out;
        }
        vaddr = memory_region_get_ram_ptr(mr) + xlat;
        read_only = !(iotlb->perm & IOMMU_WO) || mr->readonly;

        /*
         * vaddr is only valid until rcu_read_unlock(). But after
         * vfio_dma_map has set up the mapping the pages will be
         * pinned by the kernel. This makes sure that the RAM backend
         * of vaddr will always be there, even if the memory object is
         * destroyed and its backing memory munmap-ed.
         */
        ret = vfio_container_dma_map(bcontainer, iova,
                                     iotlb->addr_mask + 1, vaddr,
                                     read_only, mr);
        if (ret) {
            error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx", %p) = %d (%s)",
                         bcontainer, iova,
                         iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
        }
    } else {
        ret = vfio_container_dma_unmap(bcontainer, iova,
                                       iotlb->addr_mask + 1, iotlb, false);
        if (ret) {
            error_setg(&local_err,
                       "vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                       "0x%"HWADDR_PRIx") = %d (%s)",
                       bcontainer, iova,
                       iotlb->addr_mask + 1, ret, strerror(-ret));
            if (migration_is_running()) {
                migration_file_set_error(ret, local_err);
            } else {
                error_report_err(local_err);
            }
        }
    }
out:
    rcu_read_unlock();
}

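/*
 * RamDiscardListener callbacks: for RAM regions managed by a
 * RamDiscardManager (e.g., virtio-mem), only the populated parts are kept
 * mapped. A discard unmaps the whole notified section with a single call,
 * which is why populating maps in multiples of the minimum granularity -
 * each chunk can then be unmapped individually by a later discard.
 */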
static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    VFIOContainerBase *bcontainer = vrdl->bcontainer;
    const hwaddr size = int128_get64(section->size);
    const hwaddr iova = section->offset_within_address_space;
    int ret;

    /* Unmap with a single call. */
    ret = vfio_container_dma_unmap(bcontainer, iova, size, NULL, false);
    if (ret) {
        error_report("%s: vfio_container_dma_unmap() failed: %s", __func__,
                     strerror(-ret));
    }
}

static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    VFIOContainerBase *bcontainer = vrdl->bcontainer;
    const hwaddr end = section->offset_within_region +
                       int128_get64(section->size);
    hwaddr start, next, iova;
    void *vaddr;
    int ret;

    /*
     * Map in (aligned within memory region) minimum granularity, so we can
     * unmap in minimum granularity later.
     */
    for (start = section->offset_within_region; start < end; start = next) {
        next = ROUND_UP(start + 1, vrdl->granularity);
        next = MIN(next, end);

        iova = start - section->offset_within_region +
               section->offset_within_address_space;
        vaddr = memory_region_get_ram_ptr(section->mr) + start;

        ret = vfio_container_dma_map(bcontainer, iova, next - start,
                                     vaddr, section->readonly, section->mr);
        if (ret) {
            /* Rollback */
            vfio_ram_discard_notify_discard(rdl, section);
            return ret;
        }
    }
    return 0;
}

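/*
 * Register a RamDiscardListener for the given section so that only its
 * currently populated parts get (and stay) mapped. The granularity comes
 * from the RamDiscardManager and must be compatible with the container's
 * supported IOMMU page sizes; see the asserts below.
 */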
static void vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer,
                                               MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    int target_page_size = qemu_target_page_size();
    VFIORamDiscardListener *vrdl;

    /* Ignore some corner cases not relevant in practice. */
    g_assert(QEMU_IS_ALIGNED(section->offset_within_region, target_page_size));
    g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
                             target_page_size));
    g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), target_page_size));

    vrdl = g_new0(VFIORamDiscardListener, 1);
    vrdl->bcontainer = bcontainer;
    vrdl->mr = section->mr;
    vrdl->offset_within_address_space = section->offset_within_address_space;
    vrdl->size = int128_get64(section->size);
    vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
                                                                section->mr);

    g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
    g_assert(bcontainer->pgsizes &&
             vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes));

    ram_discard_listener_init(&vrdl->listener,
                              vfio_ram_discard_notify_populate,
                              vfio_ram_discard_notify_discard, true);
    ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
    QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next);

    /*
     * Sanity-check if we have a theoretically problematic setup where we could
     * exceed the maximum number of possible DMA mappings over time. We assume
     * that each mapped section in the same address space as a RamDiscardManager
     * section consumes exactly one DMA mapping, with the exception of
     * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
     * in the same address space as RamDiscardManager sections.
     *
     * We assume that each section in the address space consumes one memslot.
     * We take the number of KVM memory slots as a best guess for the maximum
     * number of sections in the address space we could have over time,
     * also consuming DMA mappings.
     */
    if (bcontainer->dma_max_mappings) {
        unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;

#ifdef CONFIG_KVM
        if (kvm_enabled()) {
            max_memslots = kvm_get_max_memslots();
        }
#endif

        QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
            hwaddr start, end;

            start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
                                    vrdl->granularity);
            end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
                           vrdl->granularity);
            vrdl_mappings += (end - start) / vrdl->granularity;
            vrdl_count++;
        }

        if (vrdl_mappings + max_memslots - vrdl_count >
            bcontainer->dma_max_mappings) {
            warn_report("%s: possibly running out of DMA mappings. E.g., try"
                        " increasing the 'block-size' of virtio-mem devices."
                        " Maximum possible DMA mappings: %d, Maximum possible"
                        " memslots: %d", __func__, bcontainer->dma_max_mappings,
                        max_memslots);
        }
    }
}

static void vfio_ram_discard_unregister_listener(VFIOContainerBase *bcontainer,
                                                 MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl = NULL;

    QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
        if (vrdl->mr == section->mr &&
            vrdl->offset_within_address_space ==
            section->offset_within_address_space) {
            break;
        }
    }

    if (!vrdl) {
        hw_error("vfio: Trying to unregister missing RAM discard listener");
    }

    ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
    QLIST_REMOVE(vrdl, next);
    g_free(vrdl);
}

static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (!TPM_IS_CRB(mr->owner)) {
        return false;
    }

    /* this is a known safe misaligned region, just trace for debug purpose */
    trace_vfio_known_safe_misalignment(memory_region_name(mr),
                                       section->offset_within_address_space,
                                       section->offset_within_region,
                                       qemu_real_host_page_size());
    return true;
}

static bool vfio_listener_valid_section(MemoryRegionSection *section,
                                        const char *name)
{
    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_skip(name,
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return false;
    }

    if (unlikely((section->offset_within_address_space &
                  ~qemu_real_host_page_mask()) !=
                 (section->offset_within_region & ~qemu_real_host_page_mask()))) {
        if (!vfio_known_safe_misalignment(section)) {
            error_report("%s received unaligned region %s iova=0x%"PRIx64
                         " offset_within_region=0x%"PRIx64
                         " qemu_real_host_page_size=0x%"PRIxPTR,
                         __func__, memory_region_name(section->mr),
                         section->offset_within_address_space,
                         section->offset_within_region,
                         qemu_real_host_page_size());
        }
        return false;
    }

    return true;
}

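/*
 * Compute the host-page-aligned IOVA window covered by a section: the start
 * is aligned up, the end aligned down. Returns false when the aligned
 * window is empty, i.e. the section does not cover a full host page.
 */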
static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer,
                                        MemoryRegionSection *section,
                                        hwaddr *out_iova, hwaddr *out_end,
                                        Int128 *out_llend)
{
    Int128 llend;
    hwaddr iova;

    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
    llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));

    if (int128_ge(int128_make64(iova), llend)) {
        return false;
    }

    *out_iova = iova;
    *out_end = int128_get64(int128_sub(llend, int128_one()));
    if (out_llend) {
        *out_llend = llend;
    }
    return true;
}

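/*
 * begin/commit bracket a batch of address space updates from the memory
 * core. Both are simply forwarded to the IOMMU backend, if it implements
 * the corresponding optional callbacks (backends may use them, for
 * instance, to batch map/unmap operations).
 */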
static void vfio_listener_begin(MemoryListener *listener)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    void (*listener_begin)(VFIOContainerBase *bcontainer);

    listener_begin = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_begin;

    if (listener_begin) {
        listener_begin(bcontainer);
    }
}

static void vfio_listener_commit(MemoryListener *listener)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    void (*listener_commit)(VFIOContainerBase *bcontainer);

    listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_commit;

    if (listener_commit) {
        listener_commit(bcontainer);
    }
}

static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp)
{
    /*
     * MMIO region mapping failures are not fatal but in this case PCI
     * peer-to-peer transactions are broken.
     */
    if (vbasedev && vbasedev->type == VFIO_DEVICE_TYPE_PCI) {
        error_append_hint(errp, "%s: PCI peer-to-peer transactions "
                          "on BARs are not supported.\n", vbasedev->name);
    }
}

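/*
 * Look up the VFIORamDiscardListener that was registered for this exact
 * section; it must exist, so a failed lookup is a fatal hardware error.
 */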
VFIORamDiscardListener *vfio_find_ram_discard_listener(
    VFIOContainerBase *bcontainer, MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = NULL;

    QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
        if (vrdl->mr == section->mr &&
            vrdl->offset_within_address_space ==
            section->offset_within_address_space) {
            break;
        }
    }

    if (!vrdl) {
        hw_error("vfio: Trying to sync missing RAM discard listener");
        /* does not return */
    }
    return vrdl;
}

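/*
 * A new section is handled in one of three ways:
 *  - IOMMU regions get a vfio_iommu_map_notify() notifier and an initial
 *    replay of existing mappings;
 *  - RAM regions with a RamDiscardManager register a listener so that only
 *    populated parts are mapped;
 *  - plain RAM is mapped directly into the container.
 */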
static void vfio_listener_region_add(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    void *vaddr;
    int ret;
    Error *err = NULL;

    if (!vfio_listener_valid_section(section, "region_add")) {
        return;
    }

    if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
                                     &llend)) {
        if (memory_region_is_ram_device(section->mr)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                qemu_real_host_page_size());
        }
        return;
    }

    /* PPC64/pseries machine only */
    if (!vfio_container_add_section_window(bcontainer, section, &err)) {
        goto mmio_dma_error;
    }

    memory_region_ref(section->mr);

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        int iommu_idx;

        trace_vfio_listener_region_add_iommu(section->mr->name, iova, end);
        /*
         * FIXME: For VFIO iommu types which have KVM acceleration to
         * avoid bouncing all map/unmaps through qemu this way, this
         * would be the right place to wire that up (tell the KVM
         * device emulation the VFIO iommu handles to use).
         */
        giommu = g_malloc0(sizeof(*giommu));
        giommu->iommu_mr = iommu_mr;
        giommu->iommu_offset = section->offset_within_address_space -
                               section->offset_within_region;
        giommu->bcontainer = bcontainer;
        llend = int128_add(int128_make64(section->offset_within_region),
                           section->size);
        llend = int128_sub(llend, int128_one());
        iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
                                                       MEMTXATTRS_UNSPECIFIED);
        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
                            IOMMU_NOTIFIER_IOTLB_EVENTS,
                            section->offset_within_region,
                            int128_get64(llend),
                            iommu_idx);

        ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
                                                    &err);
        if (ret) {
            g_free(giommu);
            goto fail;
        }
        QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next);
        memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);

        return;
    }

    /* Here we assume that memory_region_is_ram(section->mr) == true */

    /*
     * For RAM memory regions with a RamDiscardManager, we only want to map the
     * actually populated parts - and update the mapping whenever we're notified
     * about changes.
     */
    if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_ram_discard_register_listener(bcontainer, section);
        return;
    }

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vfio_listener_region_add_ram(iova, end, vaddr);

    llsize = int128_sub(llend, int128_make64(iova));

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;

        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                pgmask + 1);
            return;
        }
    }

    ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize),
                                 vaddr, section->readonly, section->mr);
    if (ret) {
        error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
                   "0x%"HWADDR_PRIx", %p) = %d (%s)",
                   bcontainer, iova, int128_get64(llsize), vaddr, ret,
                   strerror(-ret));
    mmio_dma_error:
        if (memory_region_is_ram_device(section->mr)) {
            /* Allow unexpected mappings not to be fatal for RAM devices */
            VFIODevice *vbasedev =
                vfio_get_vfio_device(memory_region_owner(section->mr));
            vfio_device_error_append(vbasedev, &err);
            warn_report_err_once(err);
            return;
        }
        goto fail;
    }

    return;

fail:
    if (!bcontainer->initialized) {
        /*
         * At machine init time or when the device is attached to the
         * VM, store the first error in the container so we can
         * gracefully fail the device realize routine.
         */
        if (!bcontainer->error) {
            error_propagate_prepend(&bcontainer->error, err,
                                    "Region %s: ",
                                    memory_region_name(section->mr));
        } else {
            error_free(err);
        }
    } else {
        /*
         * At runtime, there's not much we can do other than throw a
         * hardware error.
         */
        error_report_err(err);
        hw_error("vfio: DMA mapping failed, unable to continue");
    }
}

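/*
 * Mirror of vfio_listener_region_add(): unregister the IOMMU notifier or
 * the RAM discard listener, and unmap the IOVA range unless the
 * RamDiscardManager teardown already takes care of it, or a ram_device
 * was never mapped because of container page size constraints.
 */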
static void vfio_listener_region_del(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                 listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    int ret;
    bool try_unmap = true;

    if (!vfio_listener_valid_section(section, "region_del")) {
        return;
    }

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        trace_vfio_listener_region_del_iommu(section->mr->name);
        QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
            if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
                giommu->n.start == section->offset_within_region) {
                memory_region_unregister_iommu_notifier(section->mr,
                                                        &giommu->n);
                QLIST_REMOVE(giommu, giommu_next);
                g_free(giommu);
                break;
            }
        }

        /*
         * FIXME: We assume the one big unmap below is adequate to
         * remove any individual page mappings in the IOMMU which
         * might have been copied into VFIO. This works for a page table
         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
         * That may not be true for all IOMMU types.
         */
    }

    if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
                                     &llend)) {
        return;
    }

    llsize = int128_sub(llend, int128_make64(iova));

    trace_vfio_listener_region_del(iova, end);

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask;

        pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;
        try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
    } else if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_ram_discard_unregister_listener(bcontainer, section);
        /* Unregistering will trigger an unmap. */
        try_unmap = false;
    }

    if (try_unmap) {
        bool unmap_all = false;

        if (int128_eq(llsize, int128_2_64())) {
            unmap_all = true;
            llsize = int128_zero();
        }
        ret = vfio_container_dma_unmap(bcontainer, iova, int128_get64(llsize),
                                       NULL, unmap_all);
        if (ret) {
            error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%s)",
                         bcontainer, iova, int128_get64(llsize), ret,
                         strerror(-ret));
        }
    }

    memory_region_unref(section->mr);

    /* PPC64/pseries machine only */
    vfio_container_del_section_window(bcontainer, section);
}

typedef struct VFIODirtyRanges {
    hwaddr min32;
    hwaddr max32;
    hwaddr min64;
    hwaddr max64;
    hwaddr minpci64;
    hwaddr maxpci64;
} VFIODirtyRanges;

typedef struct VFIODirtyRangesListener {
    VFIOContainerBase *bcontainer;
    VFIODirtyRanges ranges;
    MemoryListener listener;
} VFIODirtyRangesListener;

static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
                                     VFIOContainerBase *bcontainer)
{
    VFIOPCIDevice *pcidev;
    VFIODevice *vbasedev;
    Object *owner;

    owner = memory_region_owner(section->mr);

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
            continue;
        }
        pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
        if (OBJECT(pcidev) == owner) {
            return true;
        }
    }

    return false;
}

static void vfio_dirty_tracking_update_range(VFIODirtyRanges *range,
                                             hwaddr iova, hwaddr end,
                                             bool update_pci)
{
    hwaddr *min, *max;

    /*
     * The address space passed to the dirty tracker is reduced to three ranges:
     * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the
     * PCI 64-bit hole.
     *
     * The underlying reports of dirty will query a sub-interval of each of
     * these ranges.
     *
     * The purpose of the three range handling is to handle known cases of big
     * holes in the address space, like the x86 AMD 1T hole, and firmware (like
     * OVMF) which may relocate the pci-hole64 to the end of the address space.
     * The latter would otherwise generate large ranges for tracking, stressing
     * the limits of supported hardware. The pci-hole32 will always be below 4G
     * (overlapping or not) so it doesn't need special handling and is part of
     * the 32-bit range.
     *
     * The alternative would be an IOVATree but that has a much bigger runtime
     * overhead and unnecessary complexity.
     */
    if (update_pci && iova >= UINT32_MAX) {
        min = &range->minpci64;
        max = &range->maxpci64;
    } else {
        min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
        max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
    }
    if (*min > iova) {
        *min = iova;
    }
    if (*max < end) {
        *max = end;
    }

    trace_vfio_device_dirty_tracking_update(iova, end, *min, *max);
}

static void vfio_dirty_tracking_update(MemoryListener *listener,
                                       MemoryRegionSection *section)
{
    VFIODirtyRangesListener *dirty =
        container_of(listener, VFIODirtyRangesListener, listener);
    hwaddr iova, end;

    if (!vfio_listener_valid_section(section, "tracking_update") ||
        !vfio_get_section_iova_range(dirty->bcontainer, section,
                                     &iova, &end, NULL)) {
        return;
    }

    vfio_dirty_tracking_update_range(&dirty->ranges, iova, end,
                                     vfio_section_is_vfio_pci(section, dirty->bcontainer));
}

static const MemoryListener vfio_dirty_tracking_listener = {
    .name = "vfio-tracking",
    .region_add = vfio_dirty_tracking_update,
};

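/*
 * Compute the dirty tracking ranges for the whole address space with a
 * transient, synchronous memory listener. The min fields start out larger
 * than the max fields, so a range that never sees an update is recognized
 * by its max staying 0 and is simply left out of the logging request.
 */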
static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer,
                                     VFIODirtyRanges *ranges)
{
    VFIODirtyRangesListener dirty;

    memset(&dirty, 0, sizeof(dirty));
    dirty.ranges.min32 = UINT32_MAX;
    dirty.ranges.min64 = UINT64_MAX;
    dirty.ranges.minpci64 = UINT64_MAX;
    dirty.listener = vfio_dirty_tracking_listener;
    dirty.bcontainer = bcontainer;

    memory_listener_register(&dirty.listener,
                             bcontainer->space->as);

    *ranges = dirty.ranges;

    /*
     * The memory listener is synchronous, and used only to calculate the
     * ranges for dirty tracking. Unregister it after we are done as we are
     * not interested in any follow-up updates.
     */
    memory_listener_unregister(&dirty.listener);
}

static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    VFIODevice *vbasedev;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_SET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        int ret;

        if (!vbasedev->dirty_tracking) {
            continue;
        }

        ret = vbasedev->io_ops->device_feature(vbasedev, feature);

        if (ret != 0) {
            warn_report("%s: Failed to stop DMA logging, err %d (%s)",
                        vbasedev->name, -ret, strerror(-ret));
        }
        vbasedev->dirty_tracking = false;
    }
}

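/*
 * Build the VFIO_DEVICE_FEATURE_DMA_LOGGING_START request. The buffer
 * layout is (sketch):
 *
 *   struct vfio_device_feature                          <- feature
 *     struct vfio_device_feature_dma_logging_control    <- feature->data
 *
 * with control->ranges pointing at a separately allocated array of up to
 * three vfio_device_feature_dma_logging_range entries (32-bit, 64-bit,
 * PCI 64-bit hole), one per range actually used by the tracker.
 */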
static struct vfio_device_feature *
vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer,
                                             VFIODirtyRanges *tracking)
{
    struct vfio_device_feature *feature;
    size_t feature_size;
    struct vfio_device_feature_dma_logging_control *control;
    struct vfio_device_feature_dma_logging_range *ranges;

    feature_size = sizeof(struct vfio_device_feature) +
                   sizeof(struct vfio_device_feature_dma_logging_control);
    feature = g_try_malloc0(feature_size);
    if (!feature) {
        errno = ENOMEM;
        return NULL;
    }
    feature->argsz = feature_size;
    feature->flags = VFIO_DEVICE_FEATURE_SET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;

    control = (struct vfio_device_feature_dma_logging_control *)feature->data;
    control->page_size = qemu_real_host_page_size();

    /*
     * DMA logging uAPI guarantees to support at least a number of ranges that
     * fits into a single host kernel base page.
     */
    control->num_ranges = !!tracking->max32 + !!tracking->max64 +
                          !!tracking->maxpci64;
    ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
                        control->num_ranges);
    if (!ranges) {
        g_free(feature);
        errno = ENOMEM;

        return NULL;
    }

    control->ranges = (uintptr_t)ranges;
    if (tracking->max32) {
        ranges->iova = tracking->min32;
        ranges->length = (tracking->max32 - tracking->min32) + 1;
        ranges++;
    }
    if (tracking->max64) {
        ranges->iova = tracking->min64;
        ranges->length = (tracking->max64 - tracking->min64) + 1;
        ranges++;
    }
    if (tracking->maxpci64) {
        ranges->iova = tracking->minpci64;
        ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1;
    }

    trace_vfio_device_dirty_tracking_start(control->num_ranges,
                                           tracking->min32, tracking->max32,
                                           tracking->min64, tracking->max64,
                                           tracking->minpci64, tracking->maxpci64);

    return feature;
}

static void vfio_device_feature_dma_logging_start_destroy(
    struct vfio_device_feature *feature)
{
    struct vfio_device_feature_dma_logging_control *control =
        (struct vfio_device_feature_dma_logging_control *)feature->data;
    struct vfio_device_feature_dma_logging_range *ranges =
        (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges;

    g_free(ranges);
    g_free(feature);
}

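/*
 * Start device-side DMA logging on every device of the container using a
 * single shared feature request. On any failure, logging is stopped again
 * on all devices that already had it enabled.
 */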
static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer,
                                           Error **errp)
{
    struct vfio_device_feature *feature;
    VFIODirtyRanges ranges;
    VFIODevice *vbasedev;
    int ret = 0;

    vfio_dirty_tracking_init(bcontainer, &ranges);
    feature = vfio_device_feature_dma_logging_start_create(bcontainer,
                                                           &ranges);
    if (!feature) {
        error_setg_errno(errp, errno, "Failed to prepare DMA logging");
        return false;
    }

    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
        if (vbasedev->dirty_tracking) {
            continue;
        }

        ret = vbasedev->io_ops->device_feature(vbasedev, feature);
        if (ret) {
            error_setg_errno(errp, -ret, "%s: Failed to start DMA logging",
                             vbasedev->name);
            goto out;
        }
        vbasedev->dirty_tracking = true;
    }

out:
    if (ret) {
        vfio_devices_dma_logging_stop(bcontainer);
    }

    vfio_device_feature_dma_logging_start_destroy(feature);

    return ret == 0;
}

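/*
 * Called by the memory core when dirty page logging is enabled globally
 * (typically at the start of migration). Device-side DMA logging is
 * preferred; containers without that support fall back to the IOMMU
 * backend's dirty page tracking.
 */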
vfio_listener_log_global_start(MemoryListener * listener,Error ** errp)9803688fec8SCédric Le Goater static bool vfio_listener_log_global_start(MemoryListener *listener,
9813688fec8SCédric Le Goater Error **errp)
982758b96b6SKeqian Zhu {
9830f21358fSCédric Le Goater ERRP_GUARD();
984c7b313d3SEric Auger VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
985c7b313d3SEric Auger listener);
986332b9b0dSCédric Le Goater bool ret;
987758b96b6SKeqian Zhu
98860f29d08SCédric Le Goater if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) {
9890f21358fSCédric Le Goater ret = vfio_devices_dma_logging_start(bcontainer, errp);
9905255bbf4SJoao Martins } else {
991332b9b0dSCédric Le Goater ret = vfio_container_set_dirty_page_tracking(bcontainer, true, errp) == 0;
9925255bbf4SJoao Martins }
9935255bbf4SJoao Martins
994332b9b0dSCédric Le Goater if (!ret) {
9950f21358fSCédric Le Goater error_prepend(errp, "vfio: Could not start dirty page tracking - ");
996236e0a45SAvihai Horon }
997332b9b0dSCédric Le Goater return ret;
998758b96b6SKeqian Zhu }
999758b96b6SKeqian Zhu
vfio_listener_log_global_stop(MemoryListener * listener)1000758b96b6SKeqian Zhu static void vfio_listener_log_global_stop(MemoryListener *listener)
1001758b96b6SKeqian Zhu {
1002c7b313d3SEric Auger VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
1003c7b313d3SEric Auger listener);
10040f21358fSCédric Le Goater Error *local_err = NULL;
10055255bbf4SJoao Martins int ret = 0;
1006758b96b6SKeqian Zhu
100760f29d08SCédric Le Goater if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) {
1008c7b313d3SEric Auger vfio_devices_dma_logging_stop(bcontainer);
10095255bbf4SJoao Martins } else {
10100f21358fSCédric Le Goater ret = vfio_container_set_dirty_page_tracking(bcontainer, false,
10110f21358fSCédric Le Goater &local_err);
10125255bbf4SJoao Martins }
10135255bbf4SJoao Martins
1014236e0a45SAvihai Horon if (ret) {
10150f21358fSCédric Le Goater error_prepend(&local_err,
10160f21358fSCédric Le Goater "vfio: Could not stop dirty page tracking - ");
101710c7f1cfSCédric Le Goater if (migration_is_running()) {
101810c7f1cfSCédric Le Goater migration_file_set_error(ret, local_err);
101910c7f1cfSCédric Le Goater } else {
10200f21358fSCédric Le Goater error_report_err(local_err);
102110c7f1cfSCédric Le Goater }
1022236e0a45SAvihai Horon }
1023758b96b6SKeqian Zhu }
1024758b96b6SKeqian Zhu
10259a04fe09SKirti Wankhede typedef struct {
10269a04fe09SKirti Wankhede IOMMUNotifier n;
10279a04fe09SKirti Wankhede VFIOGuestIOMMU *giommu;
10289a04fe09SKirti Wankhede } vfio_giommu_dirty_notifier;
10299a04fe09SKirti Wankhede
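/*
 * Called for each valid mapping while memory_region_iommu_replay() walks a
 * guest IOMMU region: translate the guest IOVA to a RAM address and query
 * the container's dirty bitmap for that range.
 */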
10309a04fe09SKirti Wankhede static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
10319a04fe09SKirti Wankhede {
10329a04fe09SKirti Wankhede vfio_giommu_dirty_notifier *gdn = container_of(n,
10339a04fe09SKirti Wankhede vfio_giommu_dirty_notifier, n);
10349a04fe09SKirti Wankhede VFIOGuestIOMMU *giommu = gdn->giommu;
1035dddf83abSEric Auger VFIOContainerBase *bcontainer = giommu->bcontainer;
10369a04fe09SKirti Wankhede hwaddr iova = iotlb->iova + giommu->iommu_offset;
10379a04fe09SKirti Wankhede ram_addr_t translated_addr;
1038ebb481c0SCédric Le Goater Error *local_err = NULL;
1039236e0a45SAvihai Horon int ret = -EINVAL;
1040e3353d63SSteve Sistare MemoryRegion *mr;
1041e3353d63SSteve Sistare hwaddr xlat;
10429a04fe09SKirti Wankhede
10439a04fe09SKirti Wankhede trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
10449a04fe09SKirti Wankhede
10459a04fe09SKirti Wankhede if (iotlb->target_as != &address_space_memory) {
104610c7f1cfSCédric Le Goater error_setg(&local_err,
104710c7f1cfSCédric Le Goater "Wrong target AS \"%s\", only system memory is allowed",
10489a04fe09SKirti Wankhede iotlb->target_as->name ? iotlb->target_as->name : "none");
1049236e0a45SAvihai Horon goto out;
10509a04fe09SKirti Wankhede }
10519a04fe09SKirti Wankhede
10529a04fe09SKirti Wankhede rcu_read_lock();
1053e3353d63SSteve Sistare mr = vfio_translate_iotlb(iotlb, &xlat, &local_err);
1054e3353d63SSteve Sistare if (!mr) {
105594d12088SCédric Le Goater goto out_unlock;
105694d12088SCédric Le Goater }
1057e3353d63SSteve Sistare translated_addr = memory_region_get_ram_addr(mr) + xlat;
105894d12088SCédric Le Goater
1059c51358bdSCédric Le Goater ret = vfio_container_query_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1,
10602da5f9e4SCédric Le Goater translated_addr, &local_err);
10619a04fe09SKirti Wankhede if (ret) {
10622da5f9e4SCédric Le Goater error_prepend(&local_err,
10632da5f9e4SCédric Le Goater "vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
10642da5f9e4SCédric Le Goater "0x%"HWADDR_PRIx") failed - ", bcontainer, iova,
10652da5f9e4SCédric Le Goater iotlb->addr_mask + 1);
10669a04fe09SKirti Wankhede }
106794d12088SCédric Le Goater
106894d12088SCédric Le Goater out_unlock:
10699a04fe09SKirti Wankhede rcu_read_unlock();
1070236e0a45SAvihai Horon
1071236e0a45SAvihai Horon out:
1072236e0a45SAvihai Horon if (ret) {
107310c7f1cfSCédric Le Goater if (migration_is_running()) {
107410c7f1cfSCédric Le Goater migration_file_set_error(ret, local_err);
107510c7f1cfSCédric Le Goater } else {
107610c7f1cfSCédric Le Goater error_report_err(local_err);
107710c7f1cfSCédric Le Goater }
1078236e0a45SAvihai Horon }
10799a04fe09SKirti Wankhede }
10809a04fe09SKirti Wankhede
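/* Per-section callback for ram_discard_manager_replay_populated() below. */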
108174d37637SCédric Le Goater static int vfio_ram_discard_query_dirty_bitmap(MemoryRegionSection *section,
10825e3b981cSDavid Hildenbrand void *opaque)
10835e3b981cSDavid Hildenbrand {
10845e3b981cSDavid Hildenbrand const hwaddr size = int128_get64(section->size);
10855e3b981cSDavid Hildenbrand const hwaddr iova = section->offset_within_address_space;
10865e3b981cSDavid Hildenbrand const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
10875e3b981cSDavid Hildenbrand section->offset_within_region;
10885e3b981cSDavid Hildenbrand VFIORamDiscardListener *vrdl = opaque;
10892da5f9e4SCédric Le Goater Error *local_err = NULL;
10902da5f9e4SCédric Le Goater int ret;
10915e3b981cSDavid Hildenbrand
10925e3b981cSDavid Hildenbrand /*
10935e3b981cSDavid Hildenbrand * Sync the whole mapped region (spanning multiple individual mappings)
10945e3b981cSDavid Hildenbrand * in one go.
10955e3b981cSDavid Hildenbrand */
1096c51358bdSCédric Le Goater ret = vfio_container_query_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr,
10972da5f9e4SCédric Le Goater &local_err);
10982da5f9e4SCédric Le Goater if (ret) {
10992da5f9e4SCédric Le Goater error_report_err(local_err);
11002da5f9e4SCédric Le Goater }
11012da5f9e4SCédric Le Goater return ret;
11025e3b981cSDavid Hildenbrand }
11035e3b981cSDavid Hildenbrand
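/*
 * Sync the dirty bitmap for a section backed by a RamDiscardManager (such
 * as virtio-mem): only the populated parts are mapped, so only those need
 * to be replayed.
 */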
1104dc74a4b0SZhenzhong Duan static int
1105dc74a4b0SZhenzhong Duan vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer,
11065e3b981cSDavid Hildenbrand MemoryRegionSection *section)
11075e3b981cSDavid Hildenbrand {
11085e3b981cSDavid Hildenbrand RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
1109*2372f8d9SSteve Sistare VFIORamDiscardListener *vrdl =
1110*2372f8d9SSteve Sistare vfio_find_ram_discard_listener(bcontainer, section);
11115e3b981cSDavid Hildenbrand
11125e3b981cSDavid Hildenbrand /*
11135e3b981cSDavid Hildenbrand * We can (and want to) synchronize the bitmap only for the actually
11145e3b981cSDavid Hildenbrand * mapped parts, which correspond to the populated parts. Replay all
11145e3b981cSDavid Hildenbrand * populated parts.
11155e3b981cSDavid Hildenbrand */
11165e3b981cSDavid Hildenbrand return ram_discard_manager_replay_populated(rdm, section,
111774d37637SCédric Le Goater vfio_ram_discard_query_dirty_bitmap,
11185e3b981cSDavid Hildenbrand vrdl);
11195e3b981cSDavid Hildenbrand }
11205e3b981cSDavid Hildenbrand
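/*
 * Sync the dirty bitmap for a guest IOMMU section by replaying its current
 * mappings through vfio_iommu_map_dirty_notify(). A section without a
 * matching VFIOGuestIOMMU has nothing mapped, hence nothing to sync.
 */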
1121723f702bSAvihai Horon static int vfio_sync_iommu_dirty_bitmap(VFIOContainerBase *bcontainer,
1122723f702bSAvihai Horon MemoryRegionSection *section)
1123b6dd6504SKirti Wankhede {
11249a04fe09SKirti Wankhede VFIOGuestIOMMU *giommu;
1125723f702bSAvihai Horon bool found = false;
1126723f702bSAvihai Horon Int128 llend;
1127723f702bSAvihai Horon vfio_giommu_dirty_notifier gdn;
1128723f702bSAvihai Horon int idx;
11299a04fe09SKirti Wankhede
1130dddf83abSEric Auger QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
113144ee6aaaSYi Liu if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
11329a04fe09SKirti Wankhede giommu->n.start == section->offset_within_region) {
1133723f702bSAvihai Horon found = true;
1134723f702bSAvihai Horon break;
1135723f702bSAvihai Horon }
1136723f702bSAvihai Horon }
1137723f702bSAvihai Horon
1138723f702bSAvihai Horon if (!found) {
1139723f702bSAvihai Horon return 0;
1140723f702bSAvihai Horon }
1141723f702bSAvihai Horon
1142723f702bSAvihai Horon gdn.giommu = giommu;
1143723f702bSAvihai Horon idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr,
11449a04fe09SKirti Wankhede MEMTXATTRS_UNSPECIFIED);
11459a04fe09SKirti Wankhede
11469a04fe09SKirti Wankhede llend = int128_add(int128_make64(section->offset_within_region),
11479a04fe09SKirti Wankhede section->size);
11489a04fe09SKirti Wankhede llend = int128_sub(llend, int128_one());
11499a04fe09SKirti Wankhede
1150723f702bSAvihai Horon iommu_notifier_init(&gdn.n, vfio_iommu_map_dirty_notify, IOMMU_NOTIFIER_MAP,
1151723f702bSAvihai Horon section->offset_within_region, int128_get64(llend),
11529a04fe09SKirti Wankhede idx);
115344ee6aaaSYi Liu memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);
1154723f702bSAvihai Horon
11559a04fe09SKirti Wankhede return 0;
1156723f702bSAvihai Horon }
1157723f702bSAvihai Horon
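/*
 * Dispatch on the section type: guest IOMMU and RamDiscardManager backed
 * regions need their mappings replayed; plain RAM sections are queried
 * directly.
 */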
1158723f702bSAvihai Horon static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer,
1159723f702bSAvihai Horon MemoryRegionSection *section, Error **errp)
1160723f702bSAvihai Horon {
1161723f702bSAvihai Horon ram_addr_t ram_addr;
1162723f702bSAvihai Horon
1163723f702bSAvihai Horon if (memory_region_is_iommu(section->mr)) {
1164723f702bSAvihai Horon return vfio_sync_iommu_dirty_bitmap(bcontainer, section);
11655e3b981cSDavid Hildenbrand } else if (memory_region_has_ram_discard_manager(section->mr)) {
11662da5f9e4SCédric Le Goater int ret;
11672da5f9e4SCédric Le Goater
11682da5f9e4SCédric Le Goater ret = vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section);
11692da5f9e4SCédric Le Goater if (ret) {
11702da5f9e4SCédric Le Goater error_setg(errp,
11712da5f9e4SCédric Le Goater "Failed to sync dirty bitmap with RAM discard listener");
11722da5f9e4SCédric Le Goater }
11732da5f9e4SCédric Le Goater return ret;
11749a04fe09SKirti Wankhede }
11759a04fe09SKirti Wankhede
1176b6dd6504SKirti Wankhede ram_addr = memory_region_get_ram_addr(section->mr) +
1177b6dd6504SKirti Wankhede section->offset_within_region;
1178b6dd6504SKirti Wankhede
1179c51358bdSCédric Le Goater return vfio_container_query_dirty_bitmap(bcontainer,
11801eb7f642SKunkun Jiang REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
11812da5f9e4SCédric Le Goater int128_get64(section->size), ram_addr, errp);
1182b6dd6504SKirti Wankhede }
1183b6dd6504SKirti Wankhede
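/*
 * log_sync callback of the VFIO memory listener, called for each section
 * whenever the migration code harvests dirty pages. A no-op unless dirty
 * tracking has been started for the container's devices.
 */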
11844292d501SZenghui Yu static void vfio_listener_log_sync(MemoryListener *listener,
1185b6dd6504SKirti Wankhede MemoryRegionSection *section)
1186b6dd6504SKirti Wankhede {
1187c7b313d3SEric Auger VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
1188c7b313d3SEric Auger listener);
1189236e0a45SAvihai Horon int ret;
11902da5f9e4SCédric Le Goater Error *local_err = NULL;
1191b6dd6504SKirti Wankhede
1192b051a3f6SAvihai Horon if (vfio_listener_skipped_section(section)) {
1193b6dd6504SKirti Wankhede return;
1194b6dd6504SKirti Wankhede }
1195b6dd6504SKirti Wankhede
11961f21670eSAvihai Horon if (vfio_log_sync_needed(bcontainer)) {
11972da5f9e4SCédric Le Goater ret = vfio_sync_dirty_bitmap(bcontainer, section, &local_err);
1198236e0a45SAvihai Horon if (ret) {
119910c7f1cfSCédric Le Goater if (migration_is_running()) {
120010c7f1cfSCédric Le Goater migration_file_set_error(ret, local_err);
120110c7f1cfSCédric Le Goater } else {
12022da5f9e4SCédric Le Goater error_report_err(local_err);
120310c7f1cfSCédric Le Goater }
1204236e0a45SAvihai Horon }
1205b6dd6504SKirti Wankhede }
1206b6dd6504SKirti Wankhede }
1207b6dd6504SKirti Wankhede
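/*
 * The region_add/region_del and begin/commit callbacks referenced here are
 * implemented earlier in this file.
 */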
1208a9183378SCédric Le Goater static const MemoryListener vfio_memory_listener = {
1209142518bdSPeter Xu .name = "vfio",
1210d9b7d8b6SJohn Levon .begin = vfio_listener_begin,
1211d9b7d8b6SJohn Levon .commit = vfio_listener_commit,
1212e2c7d025SEric Auger .region_add = vfio_listener_region_add,
1213e2c7d025SEric Auger .region_del = vfio_listener_region_del,
1214758b96b6SKeqian Zhu .log_global_start = vfio_listener_log_global_start,
1215758b96b6SKeqian Zhu .log_global_stop = vfio_listener_log_global_stop,
12164292d501SZenghui Yu .log_sync = vfio_listener_log_sync,
1217e2c7d025SEric Auger };
1218a9183378SCédric Le Goater
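/*
 * Registering the listener replays all existing memory regions through the
 * region_add callback; any error those callbacks record in
 * bcontainer->error during the replay is propagated here.
 */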
1219a9183378SCédric Le Goater bool vfio_listener_register(VFIOContainerBase *bcontainer, Error **errp)
1220a9183378SCédric Le Goater {
1221a9183378SCédric Le Goater bcontainer->listener = vfio_memory_listener;
1222a9183378SCédric Le Goater memory_listener_register(&bcontainer->listener, bcontainer->space->as);
1223a9183378SCédric Le Goater
1224a9183378SCédric Le Goater if (bcontainer->error) {
1225a9183378SCédric Le Goater error_propagate_prepend(errp, bcontainer->error,
1226a9183378SCédric Le Goater "memory listener initialization failed: ");
1227a9183378SCédric Le Goater return false;
1228a9183378SCédric Le Goater }
1229a9183378SCédric Le Goater
1230a9183378SCédric Le Goater return true;
1231a9183378SCédric Le Goater }
1232a9183378SCédric Le Goater
1233a9183378SCédric Le Goater void vfio_listener_unregister(VFIOContainerBase *bcontainer)
1234a9183378SCédric Le Goater {
1235a9183378SCédric Le Goater memory_listener_unregister(&bcontainer->listener);
1236a9183378SCédric Le Goater }
1237