/*
 * Copyright (c) 2021-2025 Oracle and/or its affiliates.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "qemu/osdep.h"
#include "hw/vfio/vfio-container.h"
#include "hw/vfio/vfio-device.h"
#include "hw/vfio/vfio-listener.h"
#include "migration/blocker.h"
#include "migration/cpr.h"
#include "migration/migration.h"
#include "migration/vmstate.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

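/*
 * Clear the vaddr of every DMA mapping in the container with a single
 * ioctl, using VFIO_DMA_UNMAP_FLAG_VADDR | VFIO_DMA_UNMAP_FLAG_ALL.
 * The IOVA mappings themselves remain in place, so device DMA can
 * continue while a new QEMU process prepares to take over the container.
 */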
static bool vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = VFIO_DMA_UNMAP_FLAG_VADDR | VFIO_DMA_UNMAP_FLAG_ALL,
        .iova = 0,
        .size = 0,
    };
    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_setg_errno(errp, errno, "vfio_dma_unmap_vaddr_all");
        return false;
    }
    container->cpr.vaddr_unmapped = true;
    return true;
}

/*
 * Set the new @vaddr for any mappings registered during cpr load.
 * The incoming state is cleared thereafter.
 */
static int vfio_legacy_cpr_dma_map(const VFIOContainerBase *bcontainer,
                                   hwaddr iova, ram_addr_t size, void *vaddr,
                                   bool readonly, MemoryRegion *mr)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_VADDR,
        .vaddr = (__u64)(uintptr_t)vaddr,
        .iova = iova,
        .size = size,
    };

    g_assert(cpr_is_incoming());

    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) {
        return -errno;
    }

    return 0;
}

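/*
 * region_add handler for the temporary listener registered in
 * vfio_cpr_fail_notifier. Replays a mapped section with the remap flag
 * set, so that its vaddr is restored via vfio_legacy_cpr_dma_map.
 */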
static void vfio_region_remap(MemoryListener *listener,
                              MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            cpr.remap_listener);
    vfio_container_region_add(&container->bcontainer, section, true);
}

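/*
 * CPR transfer requires the VFIO_UPDATE_VADDR and VFIO_UNMAP_ALL kernel
 * extensions. If either is missing, fill errp, which the caller uses as
 * the CPR blocker message.
 */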
static bool vfio_cpr_supported(VFIOContainer *container, Error **errp)
{
    if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR)) {
        error_setg(errp, "VFIO container does not support VFIO_UPDATE_VADDR");
        return false;

    } else if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL)) {
        error_setg(errp, "VFIO container does not support VFIO_UNMAP_ALL");
        return false;

    } else {
        return true;
    }
}

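/*
 * Outgoing side of CPR: clear every vaddr before the container fd is
 * handed to the new process, which restores the vaddrs in
 * vfio_legacy_cpr_dma_map.
 */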
static int vfio_container_pre_save(void *opaque)
{
    VFIOContainer *container = opaque;
    Error *local_err = NULL;

    if (!vfio_dma_unmap_vaddr_all(container, &local_err)) {
        error_report_err(local_err);
        return -1;
    }
    return 0;
}

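/*
 * Incoming side of CPR: all mappings have been replayed by this point,
 * so register the listener and undo the dma_map diversion installed by
 * vfio_legacy_cpr_register_container.
 */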
static int vfio_container_post_load(void *opaque, int version_id)
{
    VFIOContainer *container = opaque;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    VFIOGroup *group;
    Error *local_err = NULL;

    if (!vfio_listener_register(bcontainer, &local_err)) {
        error_report_err(local_err);
        return -1;
    }

    QLIST_FOREACH(group, &container->group_list, container_next) {
        VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);

        /* Restore original dma_map function */
        vioc->dma_map = container->cpr.saved_dma_map;
    }
    return 0;
}

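/*
 * The container vmstate carries no fields; it exists to run the
 * pre_save and post_load hooks at the right point in the CPR stream.
 */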
static const VMStateDescription vfio_container_vmstate = {
    .name = "vfio-container",
    .version_id = 0,
    .minimum_version_id = 0,
    .priority = MIG_PRI_LOW, /* Must happen after devices and groups */
    .pre_save = vfio_container_pre_save,
    .post_load = vfio_container_post_load,
    .needed = cpr_incoming_needed,
    .fields = (VMStateField[]) {
        VMSTATE_END_OF_LIST()
    }
};

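/*
 * If CPR fails after the vaddrs were cleared in pre_save, the old
 * process resumes the guest, so every vaddr must be restored first.
 */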
static int vfio_cpr_fail_notifier(NotifierWithReturn *notifier,
                                  MigrationEvent *e, Error **errp)
{
    VFIOContainer *container =
        container_of(notifier, VFIOContainer, cpr.transfer_notifier);
    VFIOContainerBase *bcontainer = &container->bcontainer;

    if (e->type != MIG_EVENT_PRECOPY_FAILED) {
        return 0;
    }

    if (container->cpr.vaddr_unmapped) {
        /*
         * Force a call to vfio_region_remap for each mapped section by
         * temporarily registering a listener, and temporarily diverting
         * dma_map to vfio_legacy_cpr_dma_map. The latter restores vaddr.
         */

        VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
        vioc->dma_map = vfio_legacy_cpr_dma_map;

        container->cpr.remap_listener = (MemoryListener) {
            .name = "vfio cpr recover",
            .region_add = vfio_region_remap
        };
        memory_listener_register(&container->cpr.remap_listener,
                                 bcontainer->space->as);
        memory_listener_unregister(&container->cpr.remap_listener);
        container->cpr.vaddr_unmapped = false;
        vioc->dma_map = container->cpr.saved_dma_map;
    }
    return 0;
}

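/*
 * Register CPR support for the container: always add the cpr-reboot
 * notifier; for cpr-transfer, either add a blocker if the kernel lacks
 * the required extensions, or register vmstate, divert dma_map while
 * incoming, and watch for migration failure. Returns false only if
 * adding the blocker fails.
 */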
bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp)
{
    VFIOContainerBase *bcontainer = &container->bcontainer;
    Error **cpr_blocker = &container->cpr.blocker;

    migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier,
                                vfio_cpr_reboot_notifier,
                                MIG_MODE_CPR_REBOOT);

    if (!vfio_cpr_supported(container, cpr_blocker)) {
        return migrate_add_blocker_modes(cpr_blocker, errp,
                                         MIG_MODE_CPR_TRANSFER, -1) == 0;
    }

    vmstate_register(NULL, -1, &vfio_container_vmstate, container);

    /* During incoming CPR, divert calls to dma_map. */
    if (cpr_is_incoming()) {
        VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
        container->cpr.saved_dma_map = vioc->dma_map;
        vioc->dma_map = vfio_legacy_cpr_dma_map;
    }

    migration_add_notifier_mode(&container->cpr.transfer_notifier,
                                vfio_cpr_fail_notifier,
                                MIG_MODE_CPR_TRANSFER);
    return true;
}

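/* Undo everything that vfio_legacy_cpr_register_container installed. */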
void vfio_legacy_cpr_unregister_container(VFIOContainer *container)
{
    VFIOContainerBase *bcontainer = &container->bcontainer;

    migration_remove_notifier(&bcontainer->cpr_reboot_notifier);
    migrate_del_blocker(&container->cpr.blocker);
    vmstate_unregister(NULL, &vfio_container_vmstate, container);
    migration_remove_notifier(&container->cpr.transfer_notifier);
}

/*
 * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after
 * succeeding for others, so the latter have lost their vaddr. Call this
 * to restore vaddr for a section with a giommu.
 *
 * The giommu already exists. Find it and replay it, which calls
 * vfio_legacy_cpr_dma_map further down the stack.
 */
void vfio_cpr_giommu_remap(VFIOContainerBase *bcontainer,
                           MemoryRegionSection *section)
{
    VFIOGuestIOMMU *giommu = NULL;
    hwaddr as_offset = section->offset_within_address_space;
    hwaddr iommu_offset = as_offset - section->offset_within_region;

    QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
        if (giommu->iommu_mr == IOMMU_MEMORY_REGION(section->mr) &&
            giommu->iommu_offset == iommu_offset) {
            break;
        }
    }
    g_assert(giommu);
    memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);
}

/*
 * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after
 * succeeding for others, so the latter have lost their vaddr. Call this
 * to restore vaddr for a section with a RamDiscardManager.
 *
 * The ram discard listener already exists. Call its populate function
 * directly, which calls vfio_legacy_cpr_dma_map.
 */
bool vfio_cpr_ram_discard_register_listener(VFIOContainerBase *bcontainer,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl =
        vfio_find_ram_discard_listener(bcontainer, section);

    g_assert(vrdl);
    return vrdl->listener.notify_populate(&vrdl->listener, section) == 0;
}

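/*
 * Return the device fd preserved in CPR state if one exists, else fetch
 * it from the group and save it for a future CPR.
 */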
int vfio_cpr_group_get_device_fd(int d, const char *name)
{
    const int id = 0;
    int fd = cpr_find_fd(name, id);

    if (fd < 0) {
        fd = ioctl(d, VFIO_GROUP_GET_DEVICE_FD, name);
        if (fd >= 0) {
            cpr_save_fd(name, id, fd);
        }
    }
    return fd;
}

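/* Return true if fstat reports the same st_dev for both fds. */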
static bool same_device(int fd1, int fd2)
{
    struct stat st1, st2;

    return !fstat(fd1, &st1) && !fstat(fd2, &st2) && st1.st_dev == st2.st_dev;
}

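/*
 * Return true if @fd refers to the same container as @container->fd,
 * closing and de-duplicating @fd if it is a duplicate received via
 * SCM_RIGHTS.
 */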
bool vfio_cpr_container_match(VFIOContainer *container, VFIOGroup *group,
                              int fd)
{
    if (container->fd == fd) {
        return true;
    }
    if (!same_device(container->fd, fd)) {
        return false;
    }
    /*
     * Same device, different fd. This occurs when the container fd is
     * cpr_save'd multiple times, once for each groupid, so SCM_RIGHTS
     * produces duplicates. De-dup it.
     */
    cpr_delete_fd("vfio_container_for_group", group->groupid);
    close(fd);
    cpr_save_fd("vfio_container_for_group", group->groupid, container->fd);
    return true;
}