xref: /qemu/hw/vfio/cpr-legacy.c (revision d9ce74873a6a5a7c504379857461e4ae64fcf0cd)
154857b08SSteve Sistare /*
254857b08SSteve Sistare  * Copyright (c) 2021-2025 Oracle and/or its affiliates.
354857b08SSteve Sistare  *
454857b08SSteve Sistare  * SPDX-License-Identifier: GPL-2.0-or-later
554857b08SSteve Sistare  */
654857b08SSteve Sistare 
754857b08SSteve Sistare #include <sys/ioctl.h>
854857b08SSteve Sistare #include <linux/vfio.h>
954857b08SSteve Sistare #include "qemu/osdep.h"
1054857b08SSteve Sistare #include "hw/vfio/vfio-container.h"
11c29a65edSSteve Sistare #include "hw/vfio/vfio-device.h"
127e9f2141SSteve Sistare #include "hw/vfio/vfio-listener.h"
1354857b08SSteve Sistare #include "migration/blocker.h"
1454857b08SSteve Sistare #include "migration/cpr.h"
1554857b08SSteve Sistare #include "migration/migration.h"
1654857b08SSteve Sistare #include "migration/vmstate.h"
1754857b08SSteve Sistare #include "qapi/error.h"
187e9f2141SSteve Sistare #include "qemu/error-report.h"
1954857b08SSteve Sistare 
vfio_dma_unmap_vaddr_all(VFIOContainer * container,Error ** errp)201faadd96SSteve Sistare static bool vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp)
211faadd96SSteve Sistare {
221faadd96SSteve Sistare     struct vfio_iommu_type1_dma_unmap unmap = {
231faadd96SSteve Sistare         .argsz = sizeof(unmap),
241faadd96SSteve Sistare         .flags = VFIO_DMA_UNMAP_FLAG_VADDR | VFIO_DMA_UNMAP_FLAG_ALL,
251faadd96SSteve Sistare         .iova = 0,
261faadd96SSteve Sistare         .size = 0,
271faadd96SSteve Sistare     };
281faadd96SSteve Sistare     if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
291faadd96SSteve Sistare         error_setg_errno(errp, errno, "vfio_dma_unmap_vaddr_all");
301faadd96SSteve Sistare         return false;
311faadd96SSteve Sistare     }
32*eba1f657SSteve Sistare     container->cpr.vaddr_unmapped = true;
331faadd96SSteve Sistare     return true;
341faadd96SSteve Sistare }
351faadd96SSteve Sistare 
367e9f2141SSteve Sistare /*
377e9f2141SSteve Sistare  * Set the new @vaddr for any mappings registered during cpr load.
387e9f2141SSteve Sistare  * The incoming state is cleared thereafter.
397e9f2141SSteve Sistare  */
vfio_legacy_cpr_dma_map(const VFIOContainerBase * bcontainer,hwaddr iova,ram_addr_t size,void * vaddr,bool readonly,MemoryRegion * mr)407e9f2141SSteve Sistare static int vfio_legacy_cpr_dma_map(const VFIOContainerBase *bcontainer,
417e9f2141SSteve Sistare                                    hwaddr iova, ram_addr_t size, void *vaddr,
427e9f2141SSteve Sistare                                    bool readonly, MemoryRegion *mr)
437e9f2141SSteve Sistare {
447e9f2141SSteve Sistare     const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
457e9f2141SSteve Sistare                                                   bcontainer);
467e9f2141SSteve Sistare     struct vfio_iommu_type1_dma_map map = {
477e9f2141SSteve Sistare         .argsz = sizeof(map),
487e9f2141SSteve Sistare         .flags = VFIO_DMA_MAP_FLAG_VADDR,
497e9f2141SSteve Sistare         .vaddr = (__u64)(uintptr_t)vaddr,
507e9f2141SSteve Sistare         .iova = iova,
517e9f2141SSteve Sistare         .size = size,
527e9f2141SSteve Sistare     };
537e9f2141SSteve Sistare 
547e9f2141SSteve Sistare     g_assert(cpr_is_incoming());
557e9f2141SSteve Sistare 
567e9f2141SSteve Sistare     if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) {
577e9f2141SSteve Sistare         return -errno;
587e9f2141SSteve Sistare     }
597e9f2141SSteve Sistare 
607e9f2141SSteve Sistare     return 0;
617e9f2141SSteve Sistare }
621faadd96SSteve Sistare 
/*
 * region_add callback of the temporary cpr.remap_listener: recover the
 * owning container from @listener and re-add @section to it, passing true
 * as the final argument of vfio_container_region_add().
 */
static void vfio_region_remap(MemoryListener *listener,
                              MemoryRegionSection *section)
{
    VFIOContainer *owner;

    owner = container_of(listener, VFIOContainer, cpr.remap_listener);
    vfio_container_region_add(&owner->bcontainer, section, true);
}
70*eba1f657SSteve Sistare 
vfio_cpr_supported(VFIOContainer * container,Error ** errp)7154857b08SSteve Sistare static bool vfio_cpr_supported(VFIOContainer *container, Error **errp)
7254857b08SSteve Sistare {
7354857b08SSteve Sistare     if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR)) {
7454857b08SSteve Sistare         error_setg(errp, "VFIO container does not support VFIO_UPDATE_VADDR");
7554857b08SSteve Sistare         return false;
7654857b08SSteve Sistare 
7754857b08SSteve Sistare     } else if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL)) {
7854857b08SSteve Sistare         error_setg(errp, "VFIO container does not support VFIO_UNMAP_ALL");
7954857b08SSteve Sistare         return false;
8054857b08SSteve Sistare 
8154857b08SSteve Sistare     } else {
8254857b08SSteve Sistare         return true;
8354857b08SSteve Sistare     }
8454857b08SSteve Sistare }
8554857b08SSteve Sistare 
vfio_container_pre_save(void * opaque)861faadd96SSteve Sistare static int vfio_container_pre_save(void *opaque)
871faadd96SSteve Sistare {
881faadd96SSteve Sistare     VFIOContainer *container = opaque;
891faadd96SSteve Sistare     Error *local_err = NULL;
901faadd96SSteve Sistare 
911faadd96SSteve Sistare     if (!vfio_dma_unmap_vaddr_all(container, &local_err)) {
921faadd96SSteve Sistare         error_report_err(local_err);
931faadd96SSteve Sistare         return -1;
941faadd96SSteve Sistare     }
951faadd96SSteve Sistare     return 0;
961faadd96SSteve Sistare }
971faadd96SSteve Sistare 
vfio_container_post_load(void * opaque,int version_id)987e9f2141SSteve Sistare static int vfio_container_post_load(void *opaque, int version_id)
997e9f2141SSteve Sistare {
1007e9f2141SSteve Sistare     VFIOContainer *container = opaque;
1017e9f2141SSteve Sistare     VFIOContainerBase *bcontainer = &container->bcontainer;
1027e9f2141SSteve Sistare     VFIOGroup *group;
1037e9f2141SSteve Sistare     Error *local_err = NULL;
1047e9f2141SSteve Sistare 
1057e9f2141SSteve Sistare     if (!vfio_listener_register(bcontainer, &local_err)) {
1067e9f2141SSteve Sistare         error_report_err(local_err);
1077e9f2141SSteve Sistare         return -1;
1087e9f2141SSteve Sistare     }
1097e9f2141SSteve Sistare 
1107e9f2141SSteve Sistare     QLIST_FOREACH(group, &container->group_list, container_next) {
1117e9f2141SSteve Sistare         VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
1127e9f2141SSteve Sistare 
1137e9f2141SSteve Sistare         /* Restore original dma_map function */
1147e9f2141SSteve Sistare         vioc->dma_map = container->cpr.saved_dma_map;
1157e9f2141SSteve Sistare     }
1167e9f2141SSteve Sistare     return 0;
1177e9f2141SSteve Sistare }
1187e9f2141SSteve Sistare 
11954857b08SSteve Sistare static const VMStateDescription vfio_container_vmstate = {
12054857b08SSteve Sistare     .name = "vfio-container",
12154857b08SSteve Sistare     .version_id = 0,
12254857b08SSteve Sistare     .minimum_version_id = 0,
1237e9f2141SSteve Sistare     .priority = MIG_PRI_LOW,  /* Must happen after devices and groups */
1241faadd96SSteve Sistare     .pre_save = vfio_container_pre_save,
1257e9f2141SSteve Sistare     .post_load = vfio_container_post_load,
12654857b08SSteve Sistare     .needed = cpr_incoming_needed,
12754857b08SSteve Sistare     .fields = (VMStateField[]) {
12854857b08SSteve Sistare         VMSTATE_END_OF_LIST()
12954857b08SSteve Sistare     }
13054857b08SSteve Sistare };
13154857b08SSteve Sistare 
vfio_cpr_fail_notifier(NotifierWithReturn * notifier,MigrationEvent * e,Error ** errp)132*eba1f657SSteve Sistare static int vfio_cpr_fail_notifier(NotifierWithReturn *notifier,
133*eba1f657SSteve Sistare                                   MigrationEvent *e, Error **errp)
134*eba1f657SSteve Sistare {
135*eba1f657SSteve Sistare     VFIOContainer *container =
136*eba1f657SSteve Sistare         container_of(notifier, VFIOContainer, cpr.transfer_notifier);
137*eba1f657SSteve Sistare     VFIOContainerBase *bcontainer = &container->bcontainer;
138*eba1f657SSteve Sistare 
139*eba1f657SSteve Sistare     if (e->type != MIG_EVENT_PRECOPY_FAILED) {
140*eba1f657SSteve Sistare         return 0;
141*eba1f657SSteve Sistare     }
142*eba1f657SSteve Sistare 
143*eba1f657SSteve Sistare     if (container->cpr.vaddr_unmapped) {
144*eba1f657SSteve Sistare         /*
145*eba1f657SSteve Sistare          * Force a call to vfio_region_remap for each mapped section by
146*eba1f657SSteve Sistare          * temporarily registering a listener, and temporarily diverting
147*eba1f657SSteve Sistare          * dma_map to vfio_legacy_cpr_dma_map.  The latter restores vaddr.
148*eba1f657SSteve Sistare          */
149*eba1f657SSteve Sistare 
150*eba1f657SSteve Sistare         VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
151*eba1f657SSteve Sistare         vioc->dma_map = vfio_legacy_cpr_dma_map;
152*eba1f657SSteve Sistare 
153*eba1f657SSteve Sistare         container->cpr.remap_listener = (MemoryListener) {
154*eba1f657SSteve Sistare             .name = "vfio cpr recover",
155*eba1f657SSteve Sistare             .region_add = vfio_region_remap
156*eba1f657SSteve Sistare         };
157*eba1f657SSteve Sistare         memory_listener_register(&container->cpr.remap_listener,
158*eba1f657SSteve Sistare                                  bcontainer->space->as);
159*eba1f657SSteve Sistare         memory_listener_unregister(&container->cpr.remap_listener);
160*eba1f657SSteve Sistare         container->cpr.vaddr_unmapped = false;
161*eba1f657SSteve Sistare         vioc->dma_map = container->cpr.saved_dma_map;
162*eba1f657SSteve Sistare     }
163*eba1f657SSteve Sistare     return 0;
164*eba1f657SSteve Sistare }
165*eba1f657SSteve Sistare 
vfio_legacy_cpr_register_container(VFIOContainer * container,Error ** errp)16654857b08SSteve Sistare bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp)
16754857b08SSteve Sistare {
16854857b08SSteve Sistare     VFIOContainerBase *bcontainer = &container->bcontainer;
16954857b08SSteve Sistare     Error **cpr_blocker = &container->cpr.blocker;
17054857b08SSteve Sistare 
17154857b08SSteve Sistare     migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier,
17254857b08SSteve Sistare                                 vfio_cpr_reboot_notifier,
17354857b08SSteve Sistare                                 MIG_MODE_CPR_REBOOT);
17454857b08SSteve Sistare 
17554857b08SSteve Sistare     if (!vfio_cpr_supported(container, cpr_blocker)) {
17654857b08SSteve Sistare         return migrate_add_blocker_modes(cpr_blocker, errp,
17754857b08SSteve Sistare                                          MIG_MODE_CPR_TRANSFER, -1) == 0;
17854857b08SSteve Sistare     }
17954857b08SSteve Sistare 
18054857b08SSteve Sistare     vmstate_register(NULL, -1, &vfio_container_vmstate, container);
18154857b08SSteve Sistare 
1827e9f2141SSteve Sistare     /* During incoming CPR, divert calls to dma_map. */
1837e9f2141SSteve Sistare     if (cpr_is_incoming()) {
1847e9f2141SSteve Sistare         VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
1857e9f2141SSteve Sistare         container->cpr.saved_dma_map = vioc->dma_map;
1867e9f2141SSteve Sistare         vioc->dma_map = vfio_legacy_cpr_dma_map;
1877e9f2141SSteve Sistare     }
188*eba1f657SSteve Sistare 
189*eba1f657SSteve Sistare     migration_add_notifier_mode(&container->cpr.transfer_notifier,
190*eba1f657SSteve Sistare                                 vfio_cpr_fail_notifier,
191*eba1f657SSteve Sistare                                 MIG_MODE_CPR_TRANSFER);
19254857b08SSteve Sistare     return true;
19354857b08SSteve Sistare }
19454857b08SSteve Sistare 
/*
 * Undo vfio_legacy_cpr_register_container(): remove the cpr-reboot
 * notifier, delete the cpr-transfer blocker, unregister the container
 * vmstate, and remove the cpr-transfer failure notifier.
 */
void vfio_legacy_cpr_unregister_container(VFIOContainer *container)
{
    migration_remove_notifier(&container->bcontainer.cpr_reboot_notifier);
    migrate_del_blocker(&container->cpr.blocker);
    vmstate_unregister(NULL, &vfio_container_vmstate, container);
    migration_remove_notifier(&container->cpr.transfer_notifier);
}
204*eba1f657SSteve Sistare 
205*eba1f657SSteve Sistare /*
206*eba1f657SSteve Sistare  * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after
207*eba1f657SSteve Sistare  * succeeding for others, so the latter have lost their vaddr.  Call this
208*eba1f657SSteve Sistare  * to restore vaddr for a section with a giommu.
209*eba1f657SSteve Sistare  *
210*eba1f657SSteve Sistare  * The giommu already exists.  Find it and replay it, which calls
211*eba1f657SSteve Sistare  * vfio_legacy_cpr_dma_map further down the stack.
212*eba1f657SSteve Sistare  */
vfio_cpr_giommu_remap(VFIOContainerBase * bcontainer,MemoryRegionSection * section)213*eba1f657SSteve Sistare void vfio_cpr_giommu_remap(VFIOContainerBase *bcontainer,
214*eba1f657SSteve Sistare                            MemoryRegionSection *section)
215*eba1f657SSteve Sistare {
216*eba1f657SSteve Sistare     VFIOGuestIOMMU *giommu = NULL;
217*eba1f657SSteve Sistare     hwaddr as_offset = section->offset_within_address_space;
218*eba1f657SSteve Sistare     hwaddr iommu_offset = as_offset - section->offset_within_region;
219*eba1f657SSteve Sistare 
220*eba1f657SSteve Sistare     QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
221*eba1f657SSteve Sistare         if (giommu->iommu_mr == IOMMU_MEMORY_REGION(section->mr) &&
222*eba1f657SSteve Sistare             giommu->iommu_offset == iommu_offset) {
223*eba1f657SSteve Sistare             break;
224*eba1f657SSteve Sistare         }
225*eba1f657SSteve Sistare     }
226*eba1f657SSteve Sistare     g_assert(giommu);
227*eba1f657SSteve Sistare     memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);
228*eba1f657SSteve Sistare }
229*eba1f657SSteve Sistare 
230*eba1f657SSteve Sistare /*
231*eba1f657SSteve Sistare  * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after
232*eba1f657SSteve Sistare  * succeeding for others, so the latter have lost their vaddr.  Call this
233*eba1f657SSteve Sistare  * to restore vaddr for a section with a RamDiscardManager.
234*eba1f657SSteve Sistare  *
235*eba1f657SSteve Sistare  * The ram discard listener already exists.  Call its populate function
236*eba1f657SSteve Sistare  * directly, which calls vfio_legacy_cpr_dma_map.
237*eba1f657SSteve Sistare  */
vfio_cpr_ram_discard_register_listener(VFIOContainerBase * bcontainer,MemoryRegionSection * section)238*eba1f657SSteve Sistare bool vfio_cpr_ram_discard_register_listener(VFIOContainerBase *bcontainer,
239*eba1f657SSteve Sistare                                             MemoryRegionSection *section)
240*eba1f657SSteve Sistare {
241*eba1f657SSteve Sistare     VFIORamDiscardListener *vrdl =
242*eba1f657SSteve Sistare         vfio_find_ram_discard_listener(bcontainer, section);
243*eba1f657SSteve Sistare 
244*eba1f657SSteve Sistare     g_assert(vrdl);
245*eba1f657SSteve Sistare     return vrdl->listener.notify_populate(&vrdl->listener, section) == 0;
24654857b08SSteve Sistare }
247c29a65edSSteve Sistare 
vfio_cpr_group_get_device_fd(int d,const char * name)248c29a65edSSteve Sistare int vfio_cpr_group_get_device_fd(int d, const char *name)
249c29a65edSSteve Sistare {
250c29a65edSSteve Sistare     const int id = 0;
251c29a65edSSteve Sistare     int fd = cpr_find_fd(name, id);
252c29a65edSSteve Sistare 
253c29a65edSSteve Sistare     if (fd < 0) {
254c29a65edSSteve Sistare         fd = ioctl(d, VFIO_GROUP_GET_DEVICE_FD, name);
255c29a65edSSteve Sistare         if (fd >= 0) {
256c29a65edSSteve Sistare             cpr_save_fd(name, id, fd);
257c29a65edSSteve Sistare         }
258c29a65edSSteve Sistare     }
259c29a65edSSteve Sistare     return fd;
260c29a65edSSteve Sistare }
261c29a65edSSteve Sistare 
/*
 * Return true when fstat() succeeds on both descriptors and they report the
 * same st_dev; return false if either fstat() fails or the values differ.
 * @fd2 is only examined if fstat() on @fd1 succeeded.
 */
static bool same_device(int fd1, int fd2)
{
    struct stat info1;
    struct stat info2;

    if (fstat(fd1, &info1) != 0) {
        return false;
    }
    if (fstat(fd2, &info2) != 0) {
        return false;
    }
    return info1.st_dev == info2.st_dev;
}
268c29a65edSSteve Sistare 
/*
 * Decide whether @fd refers to @container.
 *
 * Returns true if @fd is the container's own fd.  Returns false if
 * same_device() says the two descriptors differ.  Otherwise @fd is treated
 * as a duplicate of the container fd: it is closed, and the CPR entry
 * "vfio_container_for_group" for @group is replaced so that it refers to
 * container->fd; true is returned.
 */
bool vfio_cpr_container_match(VFIOContainer *container, VFIOGroup *group,
                              int fd)
{
    const char *key = "vfio_container_for_group";

    if (container->fd != fd) {
        if (!same_device(container->fd, fd)) {
            return false;
        }

        /*
         * Same device, different fd.  This occurs when the container fd is
         * cpr_save'd multiple times, once for each groupid, so SCM_RIGHTS
         * produces duplicates.  De-dup it.
         */
        cpr_delete_fd(key, group->groupid);
        close(fd);
        cpr_save_fd(key, group->groupid, container->fd);
    }

    return true;
}
288