154857b08SSteve Sistare /* 254857b08SSteve Sistare * Copyright (c) 2021-2025 Oracle and/or its affiliates. 354857b08SSteve Sistare * 454857b08SSteve Sistare * SPDX-License-Identifier: GPL-2.0-or-later 554857b08SSteve Sistare */ 654857b08SSteve Sistare 754857b08SSteve Sistare #include <sys/ioctl.h> 854857b08SSteve Sistare #include <linux/vfio.h> 954857b08SSteve Sistare #include "qemu/osdep.h" 1054857b08SSteve Sistare #include "hw/vfio/vfio-container.h" 11c29a65edSSteve Sistare #include "hw/vfio/vfio-device.h" 127e9f2141SSteve Sistare #include "hw/vfio/vfio-listener.h" 1354857b08SSteve Sistare #include "migration/blocker.h" 1454857b08SSteve Sistare #include "migration/cpr.h" 1554857b08SSteve Sistare #include "migration/migration.h" 1654857b08SSteve Sistare #include "migration/vmstate.h" 1754857b08SSteve Sistare #include "qapi/error.h" 187e9f2141SSteve Sistare #include "qemu/error-report.h" 1954857b08SSteve Sistare 201faadd96SSteve Sistare static bool vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp) 211faadd96SSteve Sistare { 221faadd96SSteve Sistare struct vfio_iommu_type1_dma_unmap unmap = { 231faadd96SSteve Sistare .argsz = sizeof(unmap), 241faadd96SSteve Sistare .flags = VFIO_DMA_UNMAP_FLAG_VADDR | VFIO_DMA_UNMAP_FLAG_ALL, 251faadd96SSteve Sistare .iova = 0, 261faadd96SSteve Sistare .size = 0, 271faadd96SSteve Sistare }; 281faadd96SSteve Sistare if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { 291faadd96SSteve Sistare error_setg_errno(errp, errno, "vfio_dma_unmap_vaddr_all"); 301faadd96SSteve Sistare return false; 311faadd96SSteve Sistare } 32*eba1f657SSteve Sistare container->cpr.vaddr_unmapped = true; 331faadd96SSteve Sistare return true; 341faadd96SSteve Sistare } 351faadd96SSteve Sistare 367e9f2141SSteve Sistare /* 377e9f2141SSteve Sistare * Set the new @vaddr for any mappings registered during cpr load. 387e9f2141SSteve Sistare * The incoming state is cleared thereafter. 397e9f2141SSteve Sistare */ 407e9f2141SSteve Sistare static int vfio_legacy_cpr_dma_map(const VFIOContainerBase *bcontainer, 417e9f2141SSteve Sistare hwaddr iova, ram_addr_t size, void *vaddr, 427e9f2141SSteve Sistare bool readonly, MemoryRegion *mr) 437e9f2141SSteve Sistare { 447e9f2141SSteve Sistare const VFIOContainer *container = container_of(bcontainer, VFIOContainer, 457e9f2141SSteve Sistare bcontainer); 467e9f2141SSteve Sistare struct vfio_iommu_type1_dma_map map = { 477e9f2141SSteve Sistare .argsz = sizeof(map), 487e9f2141SSteve Sistare .flags = VFIO_DMA_MAP_FLAG_VADDR, 497e9f2141SSteve Sistare .vaddr = (__u64)(uintptr_t)vaddr, 507e9f2141SSteve Sistare .iova = iova, 517e9f2141SSteve Sistare .size = size, 527e9f2141SSteve Sistare }; 537e9f2141SSteve Sistare 547e9f2141SSteve Sistare g_assert(cpr_is_incoming()); 557e9f2141SSteve Sistare 567e9f2141SSteve Sistare if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) { 577e9f2141SSteve Sistare return -errno; 587e9f2141SSteve Sistare } 597e9f2141SSteve Sistare 607e9f2141SSteve Sistare return 0; 617e9f2141SSteve Sistare } 621faadd96SSteve Sistare 63*eba1f657SSteve Sistare static void vfio_region_remap(MemoryListener *listener, 64*eba1f657SSteve Sistare MemoryRegionSection *section) 65*eba1f657SSteve Sistare { 66*eba1f657SSteve Sistare VFIOContainer *container = container_of(listener, VFIOContainer, 67*eba1f657SSteve Sistare cpr.remap_listener); 68*eba1f657SSteve Sistare vfio_container_region_add(&container->bcontainer, section, true); 69*eba1f657SSteve Sistare } 70*eba1f657SSteve Sistare 7154857b08SSteve Sistare static bool vfio_cpr_supported(VFIOContainer *container, Error **errp) 7254857b08SSteve Sistare { 7354857b08SSteve Sistare if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR)) { 7454857b08SSteve Sistare error_setg(errp, "VFIO container does not support VFIO_UPDATE_VADDR"); 7554857b08SSteve Sistare return false; 7654857b08SSteve Sistare 7754857b08SSteve Sistare } else if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL)) { 7854857b08SSteve Sistare error_setg(errp, "VFIO container does not support VFIO_UNMAP_ALL"); 7954857b08SSteve Sistare return false; 8054857b08SSteve Sistare 8154857b08SSteve Sistare } else { 8254857b08SSteve Sistare return true; 8354857b08SSteve Sistare } 8454857b08SSteve Sistare } 8554857b08SSteve Sistare 861faadd96SSteve Sistare static int vfio_container_pre_save(void *opaque) 871faadd96SSteve Sistare { 881faadd96SSteve Sistare VFIOContainer *container = opaque; 891faadd96SSteve Sistare Error *local_err = NULL; 901faadd96SSteve Sistare 911faadd96SSteve Sistare if (!vfio_dma_unmap_vaddr_all(container, &local_err)) { 921faadd96SSteve Sistare error_report_err(local_err); 931faadd96SSteve Sistare return -1; 941faadd96SSteve Sistare } 951faadd96SSteve Sistare return 0; 961faadd96SSteve Sistare } 971faadd96SSteve Sistare 987e9f2141SSteve Sistare static int vfio_container_post_load(void *opaque, int version_id) 997e9f2141SSteve Sistare { 1007e9f2141SSteve Sistare VFIOContainer *container = opaque; 1017e9f2141SSteve Sistare VFIOContainerBase *bcontainer = &container->bcontainer; 1027e9f2141SSteve Sistare VFIOGroup *group; 1037e9f2141SSteve Sistare Error *local_err = NULL; 1047e9f2141SSteve Sistare 1057e9f2141SSteve Sistare if (!vfio_listener_register(bcontainer, &local_err)) { 1067e9f2141SSteve Sistare error_report_err(local_err); 1077e9f2141SSteve Sistare return -1; 1087e9f2141SSteve Sistare } 1097e9f2141SSteve Sistare 1107e9f2141SSteve Sistare QLIST_FOREACH(group, &container->group_list, container_next) { 1117e9f2141SSteve Sistare VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); 1127e9f2141SSteve Sistare 1137e9f2141SSteve Sistare /* Restore original dma_map function */ 1147e9f2141SSteve Sistare vioc->dma_map = container->cpr.saved_dma_map; 1157e9f2141SSteve Sistare } 1167e9f2141SSteve Sistare return 0; 1177e9f2141SSteve Sistare } 1187e9f2141SSteve Sistare 11954857b08SSteve Sistare static const VMStateDescription vfio_container_vmstate = { 12054857b08SSteve Sistare .name = "vfio-container", 12154857b08SSteve Sistare .version_id = 0, 12254857b08SSteve Sistare .minimum_version_id = 0, 1237e9f2141SSteve Sistare .priority = MIG_PRI_LOW, /* Must happen after devices and groups */ 1241faadd96SSteve Sistare .pre_save = vfio_container_pre_save, 1257e9f2141SSteve Sistare .post_load = vfio_container_post_load, 12654857b08SSteve Sistare .needed = cpr_incoming_needed, 12754857b08SSteve Sistare .fields = (VMStateField[]) { 12854857b08SSteve Sistare VMSTATE_END_OF_LIST() 12954857b08SSteve Sistare } 13054857b08SSteve Sistare }; 13154857b08SSteve Sistare 132*eba1f657SSteve Sistare static int vfio_cpr_fail_notifier(NotifierWithReturn *notifier, 133*eba1f657SSteve Sistare MigrationEvent *e, Error **errp) 134*eba1f657SSteve Sistare { 135*eba1f657SSteve Sistare VFIOContainer *container = 136*eba1f657SSteve Sistare container_of(notifier, VFIOContainer, cpr.transfer_notifier); 137*eba1f657SSteve Sistare VFIOContainerBase *bcontainer = &container->bcontainer; 138*eba1f657SSteve Sistare 139*eba1f657SSteve Sistare if (e->type != MIG_EVENT_PRECOPY_FAILED) { 140*eba1f657SSteve Sistare return 0; 141*eba1f657SSteve Sistare } 142*eba1f657SSteve Sistare 143*eba1f657SSteve Sistare if (container->cpr.vaddr_unmapped) { 144*eba1f657SSteve Sistare /* 145*eba1f657SSteve Sistare * Force a call to vfio_region_remap for each mapped section by 146*eba1f657SSteve Sistare * temporarily registering a listener, and temporarily diverting 147*eba1f657SSteve Sistare * dma_map to vfio_legacy_cpr_dma_map. The latter restores vaddr. 148*eba1f657SSteve Sistare */ 149*eba1f657SSteve Sistare 150*eba1f657SSteve Sistare VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); 151*eba1f657SSteve Sistare vioc->dma_map = vfio_legacy_cpr_dma_map; 152*eba1f657SSteve Sistare 153*eba1f657SSteve Sistare container->cpr.remap_listener = (MemoryListener) { 154*eba1f657SSteve Sistare .name = "vfio cpr recover", 155*eba1f657SSteve Sistare .region_add = vfio_region_remap 156*eba1f657SSteve Sistare }; 157*eba1f657SSteve Sistare memory_listener_register(&container->cpr.remap_listener, 158*eba1f657SSteve Sistare bcontainer->space->as); 159*eba1f657SSteve Sistare memory_listener_unregister(&container->cpr.remap_listener); 160*eba1f657SSteve Sistare container->cpr.vaddr_unmapped = false; 161*eba1f657SSteve Sistare vioc->dma_map = container->cpr.saved_dma_map; 162*eba1f657SSteve Sistare } 163*eba1f657SSteve Sistare return 0; 164*eba1f657SSteve Sistare } 165*eba1f657SSteve Sistare 16654857b08SSteve Sistare bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp) 16754857b08SSteve Sistare { 16854857b08SSteve Sistare VFIOContainerBase *bcontainer = &container->bcontainer; 16954857b08SSteve Sistare Error **cpr_blocker = &container->cpr.blocker; 17054857b08SSteve Sistare 17154857b08SSteve Sistare migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier, 17254857b08SSteve Sistare vfio_cpr_reboot_notifier, 17354857b08SSteve Sistare MIG_MODE_CPR_REBOOT); 17454857b08SSteve Sistare 17554857b08SSteve Sistare if (!vfio_cpr_supported(container, cpr_blocker)) { 17654857b08SSteve Sistare return migrate_add_blocker_modes(cpr_blocker, errp, 17754857b08SSteve Sistare MIG_MODE_CPR_TRANSFER, -1) == 0; 17854857b08SSteve Sistare } 17954857b08SSteve Sistare 18054857b08SSteve Sistare vmstate_register(NULL, -1, &vfio_container_vmstate, container); 18154857b08SSteve Sistare 1827e9f2141SSteve Sistare /* During incoming CPR, divert calls to dma_map. */ 1837e9f2141SSteve Sistare if (cpr_is_incoming()) { 1847e9f2141SSteve Sistare VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); 1857e9f2141SSteve Sistare container->cpr.saved_dma_map = vioc->dma_map; 1867e9f2141SSteve Sistare vioc->dma_map = vfio_legacy_cpr_dma_map; 1877e9f2141SSteve Sistare } 188*eba1f657SSteve Sistare 189*eba1f657SSteve Sistare migration_add_notifier_mode(&container->cpr.transfer_notifier, 190*eba1f657SSteve Sistare vfio_cpr_fail_notifier, 191*eba1f657SSteve Sistare MIG_MODE_CPR_TRANSFER); 19254857b08SSteve Sistare return true; 19354857b08SSteve Sistare } 19454857b08SSteve Sistare 19554857b08SSteve Sistare void vfio_legacy_cpr_unregister_container(VFIOContainer *container) 19654857b08SSteve Sistare { 19754857b08SSteve Sistare VFIOContainerBase *bcontainer = &container->bcontainer; 19854857b08SSteve Sistare 19954857b08SSteve Sistare migration_remove_notifier(&bcontainer->cpr_reboot_notifier); 20054857b08SSteve Sistare migrate_del_blocker(&container->cpr.blocker); 20154857b08SSteve Sistare vmstate_unregister(NULL, &vfio_container_vmstate, container); 202*eba1f657SSteve Sistare migration_remove_notifier(&container->cpr.transfer_notifier); 203*eba1f657SSteve Sistare } 204*eba1f657SSteve Sistare 205*eba1f657SSteve Sistare /* 206*eba1f657SSteve Sistare * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after 207*eba1f657SSteve Sistare * succeeding for others, so the latter have lost their vaddr. Call this 208*eba1f657SSteve Sistare * to restore vaddr for a section with a giommu. 209*eba1f657SSteve Sistare * 210*eba1f657SSteve Sistare * The giommu already exists. Find it and replay it, which calls 211*eba1f657SSteve Sistare * vfio_legacy_cpr_dma_map further down the stack. 212*eba1f657SSteve Sistare */ 213*eba1f657SSteve Sistare void vfio_cpr_giommu_remap(VFIOContainerBase *bcontainer, 214*eba1f657SSteve Sistare MemoryRegionSection *section) 215*eba1f657SSteve Sistare { 216*eba1f657SSteve Sistare VFIOGuestIOMMU *giommu = NULL; 217*eba1f657SSteve Sistare hwaddr as_offset = section->offset_within_address_space; 218*eba1f657SSteve Sistare hwaddr iommu_offset = as_offset - section->offset_within_region; 219*eba1f657SSteve Sistare 220*eba1f657SSteve Sistare QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { 221*eba1f657SSteve Sistare if (giommu->iommu_mr == IOMMU_MEMORY_REGION(section->mr) && 222*eba1f657SSteve Sistare giommu->iommu_offset == iommu_offset) { 223*eba1f657SSteve Sistare break; 224*eba1f657SSteve Sistare } 225*eba1f657SSteve Sistare } 226*eba1f657SSteve Sistare g_assert(giommu); 227*eba1f657SSteve Sistare memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); 228*eba1f657SSteve Sistare } 229*eba1f657SSteve Sistare 230*eba1f657SSteve Sistare /* 231*eba1f657SSteve Sistare * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after 232*eba1f657SSteve Sistare * succeeding for others, so the latter have lost their vaddr. Call this 233*eba1f657SSteve Sistare * to restore vaddr for a section with a RamDiscardManager. 234*eba1f657SSteve Sistare * 235*eba1f657SSteve Sistare * The ram discard listener already exists. Call its populate function 236*eba1f657SSteve Sistare * directly, which calls vfio_legacy_cpr_dma_map. 237*eba1f657SSteve Sistare */ 238*eba1f657SSteve Sistare bool vfio_cpr_ram_discard_register_listener(VFIOContainerBase *bcontainer, 239*eba1f657SSteve Sistare MemoryRegionSection *section) 240*eba1f657SSteve Sistare { 241*eba1f657SSteve Sistare VFIORamDiscardListener *vrdl = 242*eba1f657SSteve Sistare vfio_find_ram_discard_listener(bcontainer, section); 243*eba1f657SSteve Sistare 244*eba1f657SSteve Sistare g_assert(vrdl); 245*eba1f657SSteve Sistare return vrdl->listener.notify_populate(&vrdl->listener, section) == 0; 24654857b08SSteve Sistare } 247c29a65edSSteve Sistare 248c29a65edSSteve Sistare int vfio_cpr_group_get_device_fd(int d, const char *name) 249c29a65edSSteve Sistare { 250c29a65edSSteve Sistare const int id = 0; 251c29a65edSSteve Sistare int fd = cpr_find_fd(name, id); 252c29a65edSSteve Sistare 253c29a65edSSteve Sistare if (fd < 0) { 254c29a65edSSteve Sistare fd = ioctl(d, VFIO_GROUP_GET_DEVICE_FD, name); 255c29a65edSSteve Sistare if (fd >= 0) { 256c29a65edSSteve Sistare cpr_save_fd(name, id, fd); 257c29a65edSSteve Sistare } 258c29a65edSSteve Sistare } 259c29a65edSSteve Sistare return fd; 260c29a65edSSteve Sistare } 261c29a65edSSteve Sistare 262c29a65edSSteve Sistare static bool same_device(int fd1, int fd2) 263c29a65edSSteve Sistare { 264c29a65edSSteve Sistare struct stat st1, st2; 265c29a65edSSteve Sistare 266c29a65edSSteve Sistare return !fstat(fd1, &st1) && !fstat(fd2, &st2) && st1.st_dev == st2.st_dev; 267c29a65edSSteve Sistare } 268c29a65edSSteve Sistare 269c29a65edSSteve Sistare bool vfio_cpr_container_match(VFIOContainer *container, VFIOGroup *group, 270c29a65edSSteve Sistare int fd) 271c29a65edSSteve Sistare { 272c29a65edSSteve Sistare if (container->fd == fd) { 273c29a65edSSteve Sistare return true; 274c29a65edSSteve Sistare } 275c29a65edSSteve Sistare if (!same_device(container->fd, fd)) { 276c29a65edSSteve Sistare return false; 277c29a65edSSteve Sistare } 278c29a65edSSteve Sistare /* 279c29a65edSSteve Sistare * Same device, different fd. This occurs when the container fd is 280c29a65edSSteve Sistare * cpr_save'd multiple times, once for each groupid, so SCM_RIGHTS 281c29a65edSSteve Sistare * produces duplicates. De-dup it. 282c29a65edSSteve Sistare */ 283c29a65edSSteve Sistare cpr_delete_fd("vfio_container_for_group", group->groupid); 284c29a65edSSteve Sistare close(fd); 285c29a65edSSteve Sistare cpr_save_fd("vfio_container_for_group", group->groupid, container->fd); 286c29a65edSSteve Sistare return true; 287c29a65edSSteve Sistare } 288