/*
 * Copyright (c) 2021-2025 Oracle and/or its affiliates.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "qemu/osdep.h"
#include "hw/vfio/vfio-container.h"
#include "hw/vfio/vfio-device.h"
#include "hw/vfio/vfio-listener.h"
#include "migration/blocker.h"
#include "migration/cpr.h"
#include "migration/migration.h"
#include "migration/vmstate.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

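/*
 * Invalidate the host virtual address of every DMA mapping in the container
 * (VFIO_DMA_UNMAP_FLAG_VADDR with VFIO_DMA_UNMAP_FLAG_ALL). The IOVA
 * mappings themselves remain in place, so device DMA continues while the
 * new process supplies replacement vaddr's after CPR load.
 */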
static bool vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = VFIO_DMA_UNMAP_FLAG_VADDR | VFIO_DMA_UNMAP_FLAG_ALL,
        .iova = 0,
        .size = 0,
    };

    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_setg_errno(errp, errno, "vfio_dma_unmap_vaddr_all");
        return false;
    }
    container->cpr.vaddr_unmapped = true;
    return true;
}

/*
 * Set the new @vaddr for any mappings registered during cpr load.
 * The incoming state is cleared thereafter.
 */
static int vfio_legacy_cpr_dma_map(const VFIOContainerBase *bcontainer,
                                   hwaddr iova, ram_addr_t size, void *vaddr,
                                   bool readonly, MemoryRegion *mr)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_VADDR,
        .vaddr = (__u64)(uintptr_t)vaddr,
        .iova = iova,
        .size = size,
    };

    g_assert(cpr_is_incoming());

    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) {
        return -errno;
    }

    return 0;
}

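/*
 * MemoryListener region_add hook used while recovering from a failed CPR:
 * replay each mapped section into the container so that the diverted
 * dma_map handler (vfio_legacy_cpr_dma_map) restores its vaddr.
 */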
static void vfio_region_remap(MemoryListener *listener,
                              MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            cpr.remap_listener);
    vfio_container_region_add(&container->bcontainer, section, true);
}

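/*
 * CPR transfer is only possible if the kernel can update the vaddr of
 * existing mappings (VFIO_UPDATE_VADDR) and unmap all mappings in a single
 * call (VFIO_UNMAP_ALL).
 */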
static bool vfio_cpr_supported(VFIOContainer *container, Error **errp)
{
    if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR)) {
        error_setg(errp, "VFIO container does not support VFIO_UPDATE_VADDR");
        return false;

    } else if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL)) {
        error_setg(errp, "VFIO container does not support VFIO_UNMAP_ALL");
        return false;

    } else {
        return true;
    }
}

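/* vmstate pre_save handler: invalidate all vaddr's before CPR saves state. */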
static int vfio_container_pre_save(void *opaque)
{
    VFIOContainer *container = opaque;
    Error *local_err = NULL;

    if (!vfio_dma_unmap_vaddr_all(container, &local_err)) {
        error_report_err(local_err);
        return -1;
    }
    return 0;
}

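/*
 * vmstate post_load handler: register the memory listener with dma_map
 * diverted to vfio_legacy_cpr_dma_map, so the mappings preserved in the
 * kernel receive their new vaddr instead of being recreated.
 */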
static int vfio_container_post_load(void *opaque, int version_id)
{
    VFIOContainer *container = opaque;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
    dma_map_fn saved_dma_map = vioc->dma_map;
    Error *local_err = NULL;

    /* During incoming CPR, divert calls to dma_map. */
    vioc->dma_map = vfio_legacy_cpr_dma_map;

    if (!vfio_listener_register(bcontainer, &local_err)) {
        error_report_err(local_err);
        return -1;
    }

    /* Restore the original dma_map function. */
    vioc->dma_map = saved_dma_map;

    return 0;
}

static const VMStateDescription vfio_container_vmstate = {
    .name = "vfio-container",
    .version_id = 0,
    .minimum_version_id = 0,
    .priority = MIG_PRI_LOW,  /* Must happen after devices and groups */
    .pre_save = vfio_container_pre_save,
    .post_load = vfio_container_post_load,
    .needed = cpr_incoming_needed,
    .fields = (VMStateField[]) {
        VMSTATE_END_OF_LIST()
    }
};

static int vfio_cpr_fail_notifier(NotifierWithReturn *notifier,
                                  MigrationEvent *e, Error **errp)
{
    VFIOContainer *container =
        container_of(notifier, VFIOContainer, cpr.transfer_notifier);
    VFIOContainerBase *bcontainer = &container->bcontainer;

    if (e->type != MIG_EVENT_PRECOPY_FAILED) {
        return 0;
    }

    if (container->cpr.vaddr_unmapped) {
        /*
         * Force a call to vfio_region_remap for each mapped section by
         * temporarily registering a listener, and temporarily diverting
         * dma_map to vfio_legacy_cpr_dma_map. The latter restores vaddr.
         */

        VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
        dma_map_fn saved_dma_map = vioc->dma_map;
        vioc->dma_map = vfio_legacy_cpr_dma_map;

        container->cpr.remap_listener = (MemoryListener) {
            .name = "vfio cpr recover",
            .region_add = vfio_region_remap
        };
        memory_listener_register(&container->cpr.remap_listener,
                                 bcontainer->space->as);
        memory_listener_unregister(&container->cpr.remap_listener);
        container->cpr.vaddr_unmapped = false;
        vioc->dma_map = saved_dma_map;
    }
    return 0;
}

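/*
 * Register CPR support for a legacy container: block CPR transfer if the
 * kernel lacks the required extensions, otherwise register the container
 * vmstate and the notifier that recovers vaddr's if CPR fails.
 */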
bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp)
{
    VFIOContainerBase *bcontainer = &container->bcontainer;
    Error **cpr_blocker = &container->cpr.blocker;

    migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier,
                                vfio_cpr_reboot_notifier,
                                MIG_MODE_CPR_REBOOT);

    if (!vfio_cpr_supported(container, cpr_blocker)) {
        return migrate_add_blocker_modes(cpr_blocker, errp,
                                         MIG_MODE_CPR_TRANSFER, -1) == 0;
    }

    vfio_cpr_add_kvm_notifier();

    vmstate_register(NULL, -1, &vfio_container_vmstate, container);

    migration_add_notifier_mode(&container->cpr.transfer_notifier,
                                vfio_cpr_fail_notifier,
                                MIG_MODE_CPR_TRANSFER);
    return true;
}

void vfio_legacy_cpr_unregister_container(VFIOContainer *container)
{
    VFIOContainerBase *bcontainer = &container->bcontainer;

    migration_remove_notifier(&bcontainer->cpr_reboot_notifier);
    migrate_del_blocker(&container->cpr.blocker);
    vmstate_unregister(NULL, &vfio_container_vmstate, container);
    migration_remove_notifier(&container->cpr.transfer_notifier);
}

/*
 * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after
 * succeeding for others, so the latter have lost their vaddr. Call this
 * to restore vaddr for a section with a giommu.
 *
 * The giommu already exists. Find it and replay it, which calls
 * vfio_legacy_cpr_dma_map further down the stack.
 */
void vfio_cpr_giommu_remap(VFIOContainerBase *bcontainer,
                           MemoryRegionSection *section)
{
    VFIOGuestIOMMU *giommu = NULL;
    hwaddr as_offset = section->offset_within_address_space;
    hwaddr iommu_offset = as_offset - section->offset_within_region;

    QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
        if (giommu->iommu_mr == IOMMU_MEMORY_REGION(section->mr) &&
            giommu->iommu_offset == iommu_offset) {
            break;
        }
    }
    g_assert(giommu);
    memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);
}

/*
 * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after
 * succeeding for others, so the latter have lost their vaddr. Call this
 * to restore vaddr for a section with a RamDiscardManager.
 *
 * The ram discard listener already exists. Call its populate function
 * directly, which calls vfio_legacy_cpr_dma_map.
 */
bool vfio_cpr_ram_discard_register_listener(VFIOContainerBase *bcontainer,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl =
        vfio_find_ram_discard_listener(bcontainer, section);

    g_assert(vrdl);
    return vrdl->listener.notify_populate(&vrdl->listener, section) == 0;
}

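/*
 * Return the device fd preserved from a previous CPR if one exists, else
 * fetch it with VFIO_GROUP_GET_DEVICE_FD and save it for a future CPR.
 */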
int vfio_cpr_group_get_device_fd(int d, const char *name)
{
    const int id = 0;
    int fd = cpr_find_fd(name, id);

    if (fd < 0) {
        fd = ioctl(d, VFIO_GROUP_GET_DEVICE_FD, name);
        if (fd >= 0) {
            cpr_save_fd(name, id, fd);
        }
    }
    return fd;
}

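/* Two fds refer to the same device if fstat reports the same st_dev. */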
static bool same_device(int fd1, int fd2)
{
    struct stat st1, st2;

    return !fstat(fd1, &st1) && !fstat(fd2, &st2) && st1.st_dev == st2.st_dev;
}

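/*
 * Return true if @fd refers to the same container as @container->fd; if it
 * is a duplicate fd produced by SCM_RIGHTS, close it and re-save the
 * canonical container fd for @group.
 */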
bool vfio_cpr_container_match(VFIOContainer *container, VFIOGroup *group,
                              int fd)
{
    if (container->fd == fd) {
        return true;
    }
    if (!same_device(container->fd, fd)) {
        return false;
    }
    /*
     * Same device, different fd. This occurs when the container fd is
     * cpr_save'd multiple times, once for each groupid, so SCM_RIGHTS
     * produces duplicates. De-dup it.
     */
    cpr_delete_fd("vfio_container_for_group", group->groupid);
    close(fd);
    cpr_save_fd("vfio_container_for_group", group->groupid, container->fd);
    return true;
}