xref: /qemu/hw/vfio/container.c (revision 86c54a3a418e462e67444ac4db25b2757fd62079)
/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>

#include "hw/vfio/vfio-device.h"
#include "system/address-spaces.h"
#include "system/memory.h"
#include "system/ram_addr.h"
#include "qemu/error-report.h"
#include "qemu/range.h"
#include "system/reset.h"
#include "trace.h"
#include "qapi/error.h"
#include "pci.h"
#include "hw/vfio/vfio-container.h"
#include "vfio-helpers.h"
#include "vfio-cpr.h"
#include "vfio-listener.h"

#define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio"

typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
static VFIOGroupList vfio_group_list =
    QLIST_HEAD_INITIALIZER(vfio_group_list);

static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
{
    switch (container->iommu_type) {
    case VFIO_TYPE1v2_IOMMU:
    case VFIO_TYPE1_IOMMU:
        /*
         * We support coordinated discarding of RAM via the RamDiscardManager.
         */
        return ram_block_uncoordinated_discard_disable(state);
    default:
        /*
         * VFIO_SPAPR_TCE_IOMMU most probably works just fine with the
         * RamDiscardManager; however, it is completely untested.
         *
         * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" works in
         * exactly the opposite way to the dynamic mapping/pinning required by
         * the RamDiscardManager. We would have to special-case sections with
         * a RamDiscardManager.
         */
        return ram_block_discard_disable(state);
    }
}

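/*
 * Unmap a range and fetch the dirty bitmap for it in a single
 * VFIO_IOMMU_UNMAP_DMA call, so that pages dirtied by the device are not
 * lost between the unmap and a separate bitmap query.
 */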
static int vfio_dma_unmap_bitmap(const VFIOContainer *container,
                                 hwaddr iova, ram_addr_t size,
                                 IOMMUTLBEntry *iotlb)
{
    const VFIOContainerBase *bcontainer = &container->bcontainer;
    struct vfio_iommu_type1_dma_unmap *unmap;
    struct vfio_bitmap *bitmap;
    VFIOBitmap vbmap;
    int ret;

    ret = vfio_bitmap_alloc(&vbmap, size);
    if (ret) {
        return ret;
    }

    unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));

    unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
    unmap->iova = iova;
    unmap->size = size;
    unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
    bitmap = (struct vfio_bitmap *)&unmap->data;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() expects dirty bitmaps with a
     * granularity of qemu_real_host_page_size(), so set bitmap->pgsize to
     * qemu_real_host_page_size().
     */
    bitmap->pgsize = qemu_real_host_page_size();
    bitmap->size = vbmap.size;
    bitmap->data = (__u64 *)vbmap.bitmap;

    if (vbmap.size > bcontainer->max_dirty_bitmap_size) {
        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
        ret = -E2BIG;
        goto unmap_exit;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
    if (!ret) {
        cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
                iotlb->translated_addr, vbmap.pages);
    } else {
        error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
    }

unmap_exit:
    g_free(unmap);
    g_free(vbmap.bitmap);

    return ret;
}

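/*
 * Unmap a single IOVA range.  When dirty tracking is active, either fold the
 * dirty bitmap query into the unmap ioctl (container-based tracking) or sync
 * the bitmap separately after the unmap (device-based tracking).
 */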
static int vfio_legacy_dma_unmap_one(const VFIOContainerBase *bcontainer,
                                     hwaddr iova, ram_addr_t size,
                                     IOMMUTLBEntry *iotlb)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = iova,
        .size = size,
    };
    bool need_dirty_sync = false;
    int ret;
    Error *local_err = NULL;

    if (iotlb && vfio_container_dirty_tracking_is_started(bcontainer)) {
        if (!vfio_container_devices_dirty_tracking_is_supported(bcontainer) &&
            bcontainer->dirty_pages_supported) {
            return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
        }

        need_dirty_sync = true;
    }

    while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        /*
         * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
         * v4.15) where an overflow in its wrap-around check prevents us from
         * unmapping the last page of the address space.  Test for the error
         * condition and re-try the unmap excluding the last page.  The
         * expectation is that we've never mapped the last page anyway and this
         * unmap request comes via vIOMMU support which also makes it unlikely
         * that this page is used.  This bug was introduced well after type1 v2
         * support was introduced, so we shouldn't need to test for v1.  A fix
         * is queued for kernel v5.0 so this workaround can be removed once
         * affected kernels are sufficiently deprecated.
         */
        if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
            container->iommu_type == VFIO_TYPE1v2_IOMMU) {
            trace_vfio_legacy_dma_unmap_overflow_workaround();
            unmap.size -= 1ULL << ctz64(bcontainer->pgsizes);
            continue;
        }
        return -errno;
    }

    if (need_dirty_sync) {
        ret = vfio_container_query_dirty_bitmap(bcontainer, iova, size,
                                    iotlb->translated_addr, &local_err);
        if (ret) {
            error_report_err(local_err);
            return ret;
        }
    }

    return 0;
}

/*
 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
 */
static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
                                 hwaddr iova, ram_addr_t size,
                                 IOMMUTLBEntry *iotlb, bool unmap_all)
{
    int ret;

    if (unmap_all) {
        /* The unmap ioctl doesn't accept a full 64-bit span. */
        Int128 llsize = int128_rshift(int128_2_64(), 1);

        ret = vfio_legacy_dma_unmap_one(bcontainer, 0, int128_get64(llsize),
                                        iotlb);

        if (ret == 0) {
            ret = vfio_legacy_dma_unmap_one(bcontainer, int128_get64(llsize),
                                            int128_get64(llsize), iotlb);
        }

    } else {
        ret = vfio_legacy_dma_unmap_one(bcontainer, iova, size, iotlb);
    }

    return ret;
}

static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
                               ram_addr_t size, void *vaddr, bool readonly)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ,
        .vaddr = (__u64)(uintptr_t)vaddr,
        .iova = iova,
        .size = size,
    };

    if (!readonly) {
        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
    }

    /*
     * Try the mapping, if it fails with EBUSY, unmap the region and try
     * again.  This shouldn't be necessary, but we sometimes see it in
     * the VGA ROM space.
     */
    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
        (errno == EBUSY &&
         vfio_legacy_dma_unmap(bcontainer, iova, size, NULL, false) == 0 &&
         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
        return 0;
    }

    return -errno;
}

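/* Start or stop dirty page tracking in the type1 IOMMU driver. */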
static int
vfio_legacy_set_dirty_page_tracking(const VFIOContainerBase *bcontainer,
                                    bool start, Error **errp)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    int ret;
    struct vfio_iommu_type1_dirty_bitmap dirty = {
        .argsz = sizeof(dirty),
    };

    if (start) {
        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
    } else {
        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
    if (ret) {
        ret = -errno;
        error_setg_errno(errp, errno, "Failed to set dirty tracking flag 0x%x",
                         dirty.flags);
    }

    return ret;
}

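/*
 * Retrieve the dirty bitmap for an IOVA range from the type1 IOMMU driver
 * into the caller-provided VFIOBitmap.
 */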
static int vfio_legacy_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
                      VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp)
{
    const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                                  bcontainer);
    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
    struct vfio_iommu_type1_dirty_bitmap_get *range;
    int ret;

    dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));

    dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
    range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
    range->iova = iova;
    range->size = size;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() expects dirty bitmaps with a
     * granularity of qemu_real_host_page_size(), so set the bitmap's pgsize
     * to qemu_real_host_page_size().
     */
    range->bitmap.pgsize = qemu_real_host_page_size();
    range->bitmap.size = vbmap->size;
    range->bitmap.data = (__u64 *)vbmap->bitmap;

    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
    if (ret) {
        ret = -errno;
        error_setg_errno(errp, errno,
                         "Failed to get dirty bitmap for iova: 0x%"PRIx64
                         " size: 0x%"PRIx64, (uint64_t)range->iova,
                         (uint64_t)range->size);
    }

    g_free(dbitmap);

    return ret;
}

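/*
 * Record the usable IOVA ranges advertised by the IOVA range capability,
 * if present, into the base container.
 */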
static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info,
                                     VFIOContainerBase *bcontainer)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_cap_iova_range *cap;

    hdr = vfio_get_iommu_type1_info_cap(info,
                                        VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE);
    if (!hdr) {
        return false;
    }

    cap = (void *)hdr;

    for (int i = 0; i < cap->nr_iovas; i++) {
        Range *range = g_new(Range, 1);

        range_set_bounds(range, cap->iova_ranges[i].start,
                         cap->iova_ranges[i].end);
        bcontainer->iova_ranges =
            range_list_insert(bcontainer->iova_ranges, range);
    }

    return true;
}

static void vfio_group_add_kvm_device(VFIOGroup *group)
{
    Error *err = NULL;

    if (vfio_kvm_device_add_fd(group->fd, &err)) {
        error_reportf_err(err, "group ID %d: ", group->groupid);
    }
}

static void vfio_group_del_kvm_device(VFIOGroup *group)
{
    Error *err = NULL;

    if (vfio_kvm_device_del_fd(group->fd, &err)) {
        error_reportf_err(err, "group ID %d: ", group->groupid);
    }
}

/*
 * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
 */
static int vfio_get_iommu_type(int container_fd,
                               Error **errp)
{
    int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
                          VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
    int i;

    for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
        if (ioctl(container_fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
            return iommu_types[i];
        }
    }
    error_setg(errp, "No available IOMMU models");
    return -EINVAL;
}

/*
 * vfio_get_iommu_class_name - get the QOM class name associated with an
 * iommu_type
 */
static const char *vfio_get_iommu_class_name(int iommu_type)
{
    switch (iommu_type) {
    case VFIO_TYPE1v2_IOMMU:
    case VFIO_TYPE1_IOMMU:
        return TYPE_VFIO_IOMMU_LEGACY;
    case VFIO_SPAPR_TCE_v2_IOMMU:
    case VFIO_SPAPR_TCE_IOMMU:
        return TYPE_VFIO_IOMMU_SPAPR;
    default:
        g_assert_not_reached();
    }
}

static bool vfio_set_iommu(int container_fd, int group_fd,
                           int *iommu_type, Error **errp)
{
    if (ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container_fd)) {
        error_setg_errno(errp, errno, "Failed to set group container");
        return false;
    }

    while (ioctl(container_fd, VFIO_SET_IOMMU, *iommu_type)) {
        if (*iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
            /*
             * On sPAPR, the IOMMU subdriver always advertises both v1 and v2,
             * but the running platform may not support v2, and there is no
             * way to tell until an IOMMU group gets added to the container.
             * So if setting v2 fails, fall back to v1.
             */
            *iommu_type = VFIO_SPAPR_TCE_IOMMU;
            continue;
        }
        error_setg_errno(errp, errno, "Failed to set iommu for container");
        return false;
    }

    return true;
}

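/*
 * Pick an IOMMU type supported by both the host and QEMU, bind the group to
 * the container fd, and instantiate the matching VFIOContainer object.
 */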
static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group,
                                            Error **errp)
{
    int iommu_type;
    const char *vioc_name;
    VFIOContainer *container;

    iommu_type = vfio_get_iommu_type(fd, errp);
    if (iommu_type < 0) {
        return NULL;
    }

    if (!vfio_set_iommu(fd, group->fd, &iommu_type, errp)) {
        return NULL;
    }

    vioc_name = vfio_get_iommu_class_name(iommu_type);

    container = VFIO_IOMMU_LEGACY(object_new(vioc_name));
    container->fd = fd;
    container->iommu_type = iommu_type;
    return container;
}

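/*
 * Query VFIO_IOMMU_GET_INFO, growing the buffer and retrying until the
 * kernel-reported argsz fits, so that any trailing capability chain is
 * captured as well.
 */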
static int vfio_get_iommu_info(VFIOContainer *container,
                               struct vfio_iommu_type1_info **info)
{
    size_t argsz = sizeof(struct vfio_iommu_type1_info);

    *info = g_new0(struct vfio_iommu_type1_info, 1);
again:
    (*info)->argsz = argsz;

    if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
        g_free(*info);
        *info = NULL;
        return -errno;
    }

    if ((*info)->argsz > argsz) {
        argsz = (*info)->argsz;
        *info = g_realloc(*info, argsz);
        goto again;
    }

    return 0;
}

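/* Walk the capability chain of an iommu info struct looking for @id. */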
static struct vfio_info_cap_header *
vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
    struct vfio_info_cap_header *hdr;
    void *ptr = info;

    if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
        return NULL;
    }

    for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
        if (hdr->id == id) {
            return hdr;
        }
    }

    return NULL;
}

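/*
 * Probe the migration capability to find out whether the IOMMU driver
 * supports dirty page tracking, and record its limits in the base container.
 */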
static void vfio_get_iommu_info_migration(VFIOContainer *container,
                                          struct vfio_iommu_type1_info *info)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_cap_migration *cap_mig;
    VFIOContainerBase *bcontainer = &container->bcontainer;

    hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
    if (!hdr) {
        return;
    }

    cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
                           header);

    /*
     * cpu_physical_memory_set_dirty_lebitmap() expects dirty bitmaps with a
     * granularity of qemu_real_host_page_size(), so only enable dirty page
     * tracking if the IOMMU driver supports that page size.
     */
    if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
        bcontainer->dirty_pages_supported = true;
        bcontainer->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
        bcontainer->dirty_pgsizes = cap_mig->pgsize_bitmap;
    }
}

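/*
 * Fill in the base container's IOMMU properties (page sizes, maximum DMA
 * mappings, usable IOVA ranges, dirty tracking support) from the kernel.
 */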
static bool vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    g_autofree struct vfio_iommu_type1_info *info = NULL;
    int ret;

    ret = vfio_get_iommu_info(container, &info);
    if (ret) {
        error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
        return false;
    }

    if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
        bcontainer->pgsizes = info->iova_pgsizes;
    } else {
        bcontainer->pgsizes = qemu_real_host_page_size();
    }

    if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) {
        bcontainer->dma_max_mappings = 65535;
    }

    vfio_get_info_iova_range(info, bcontainer);

    vfio_get_iommu_info_migration(container, info);
    return true;
}

static bool vfio_container_attach_discard_disable(VFIOContainer *container,
                                            VFIOGroup *group, Error **errp)
{
    int ret;

    /*
     * VFIO is currently incompatible with discarding of RAM insofar as the
     * madvise to purge (zap) the page from QEMU's address space does not
     * interact with the memory API and therefore leaves stale virtual to
     * physical mappings in the IOMMU if the page was previously pinned.  We
     * therefore set discarding broken for each group added to a container,
     * whether the container is used individually or shared.  This provides
     * us with options to allow devices within a group to opt-in and allow
     * discarding, so long as it is done consistently for a group (for instance
     * if the device is an mdev device where it is known that the host vendor
     * driver will never pin pages outside of the working set of the guest
     * driver, which would thus not be discarding candidates).
     *
     * The first opportunity to induce pinning occurs here where we attempt to
     * attach the group to existing containers within the AddressSpace.  If any
     * pages are already zapped from the virtual address space, such as from
     * previous discards, new pinning will cause valid mappings to be
     * re-established.  Likewise, when the overall MemoryListener for a new
     * container is registered, a replay of mappings within the AddressSpace
     * will occur, re-establishing any previously zapped pages as well.
     *
     * Notably, virtio-balloon is currently only prevented from discarding new
     * memory; it does not yet set ram_block_discard_set_required() and
     * therefore neither stops us here nor deals with the sudden memory
     * consumption of inflated memory.
     *
     * We do support discarding of memory coordinated via the RamDiscardManager
     * with some IOMMU types. vfio_ram_block_discard_disable() handles the
     * details once we know which type of IOMMU we are using.
     */

    ret = vfio_ram_block_discard_disable(container, true);
    if (ret) {
        error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
        if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
            error_report("vfio: error disconnecting group %d from"
                         " container", group->groupid);
        }
    }
    return !ret;
}

static bool vfio_container_group_add(VFIOContainer *container, VFIOGroup *group,
                                     Error **errp)
{
    if (!vfio_container_attach_discard_disable(container, group, errp)) {
        return false;
    }
    group->container = container;
    QLIST_INSERT_HEAD(&container->group_list, group, container_next);
    vfio_group_add_kvm_device(group);
    return true;
}

static void vfio_container_group_del(VFIOContainer *container, VFIOGroup *group)
{
    QLIST_REMOVE(group, container_next);
    group->container = NULL;
    vfio_group_del_kvm_device(group);
    vfio_ram_block_discard_disable(container, false);
}

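/*
 * Attach @group to a container in @as: reuse an existing container in the
 * address space if the group can be bound to it, otherwise open a new
 * /dev/vfio/vfio container, set it up and register its memory listener.
 */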
static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as,
                                   Error **errp)
{
    VFIOContainer *container;
    VFIOContainerBase *bcontainer;
    int ret, fd = -1;
    VFIOAddressSpace *space;
    VFIOIOMMUClass *vioc = NULL;
    bool new_container = false;
    bool group_was_added = false;

    space = vfio_address_space_get(as);

    QLIST_FOREACH(bcontainer, &space->containers, next) {
        container = container_of(bcontainer, VFIOContainer, bcontainer);
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            return vfio_container_group_add(container, group, errp);
        }
    }

    fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
    if (fd < 0) {
        goto fail;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_setg(errp, "supported vfio version: %d, "
                   "reported version: %d", VFIO_API_VERSION, ret);
        goto fail;
    }

    container = vfio_create_container(fd, group, errp);
    if (!container) {
        goto fail;
    }
    new_container = true;
    bcontainer = &container->bcontainer;

    if (!vfio_cpr_register_container(bcontainer, errp)) {
        goto fail;
    }

    vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
    assert(vioc->setup);

    if (!vioc->setup(bcontainer, errp)) {
        goto fail;
    }

    vfio_address_space_insert(space, bcontainer);

    if (!vfio_container_group_add(container, group, errp)) {
        goto fail;
    }
    group_was_added = true;

    if (!vfio_listener_register(bcontainer, errp)) {
        goto fail;
    }

    bcontainer->initialized = true;

    return true;

fail:
    if (new_container) {
        /* bcontainer is only valid once a new container has been created */
        vfio_listener_unregister(bcontainer);
    }

    if (group_was_added) {
        vfio_container_group_del(container, group);
    }
    if (vioc && vioc->release) {
        vioc->release(bcontainer);
    }
    if (new_container) {
        vfio_cpr_unregister_container(bcontainer);
        object_unref(container);
    }
    if (fd >= 0) {
        close(fd);
    }
    vfio_address_space_put(space);

    return false;
}

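/*
 * Detach @group from its container and tear the container down once the
 * last group is gone.
 */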
static void vfio_container_disconnect(VFIOGroup *group)
{
    VFIOContainer *container = group->container;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    /*
     * Explicitly release the listener first before unset container,
     * since unset may destroy the backend container if it's the last
     * group.
     */
    if (QLIST_EMPTY(&container->group_list)) {
        vfio_listener_unregister(bcontainer);
        if (vioc->release) {
            vioc->release(bcontainer);
        }
    }

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    if (QLIST_EMPTY(&container->group_list)) {
        VFIOAddressSpace *space = bcontainer->space;

        trace_vfio_container_disconnect(container->fd);
        vfio_cpr_unregister_container(bcontainer);
        close(container->fd);
        object_unref(container);

        vfio_address_space_put(space);
    }
}

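/*
 * Look up or open the VFIO group for @groupid and connect it to a container
 * in @as.  A group may only ever be used with a single address space.
 */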
static VFIOGroup *vfio_group_get(int groupid, AddressSpace *as, Error **errp)
{
    ERRP_GUARD();
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &vfio_group_list, next) {
        if (group->groupid == groupid) {
            /* Found it.  Now is it already in the right context? */
            if (group->container->bcontainer.space->as == as) {
                return group;
            } else {
                error_setg(errp, "group %d used in multiple address spaces",
                           group->groupid);
                return NULL;
            }
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open(path, O_RDWR, errp);
    if (group->fd < 0) {
        goto free_group_exit;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_setg_errno(errp, errno, "failed to get group %d status", groupid);
        goto close_fd_exit;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "group %d is not viable", groupid);
        error_append_hint(errp,
                          "Please ensure all devices within the iommu_group "
                          "are bound to their vfio bus driver.\n");
        goto close_fd_exit;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (!vfio_container_connect(group, as, errp)) {
        error_prepend(errp, "failed to setup container for group %d: ",
                      groupid);
        goto close_fd_exit;
    }

    QLIST_INSERT_HEAD(&vfio_group_list, group, next);

    return group;

close_fd_exit:
    close(group->fd);

free_group_exit:
    g_free(group);

    return NULL;
}

static void vfio_group_put(VFIOGroup *group)
{
    if (!group || !QLIST_EMPTY(&group->device_list)) {
        return;
    }

    if (!group->ram_block_discard_allowed) {
        vfio_ram_block_discard_disable(group->container, false);
    }
    vfio_group_del_kvm_device(group);
    vfio_container_disconnect(group);
    QLIST_REMOVE(group, next);
    trace_vfio_group_put(group->fd);
    close(group->fd);
    g_free(group);
}

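/*
 * Obtain a device fd from the group and bind @vbasedev to the group's
 * container, honouring the per-group RAM block discard policy.
 */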
static bool vfio_device_get(VFIOGroup *group, const char *name,
                            VFIODevice *vbasedev, Error **errp)
{
    g_autofree struct vfio_device_info *info = NULL;
    int fd;

    fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (fd < 0) {
        error_setg_errno(errp, errno, "error getting device from group %d",
                         group->groupid);
        error_append_hint(errp,
                      "Verify all devices in group %d are bound to vfio-<bus> "
                      "or pci-stub and not already in use\n", group->groupid);
        return false;
    }

    info = vfio_get_device_info(fd);
    if (!info) {
        error_setg_errno(errp, errno, "error getting device info");
        close(fd);
        return false;
    }

    /*
     * Set discarding of RAM as not broken for this group if the driver knows
     * the device operates compatibly with discarding.  Setting must be
     * consistent per group, but since compatibility is really only possible
     * with mdev currently, we expect singleton groups.
     */
    if (vbasedev->ram_block_discard_allowed !=
        group->ram_block_discard_allowed) {
        if (!QLIST_EMPTY(&group->device_list)) {
            error_setg(errp, "Inconsistent setting of support for discarding "
                       "RAM (e.g., balloon) within group");
            close(fd);
            return false;
        }

        if (!group->ram_block_discard_allowed) {
            group->ram_block_discard_allowed = true;
            vfio_ram_block_discard_disable(group->container, false);
        }
    }

    vfio_device_prepare(vbasedev, &group->container->bcontainer, info);

    vbasedev->fd = fd;
    vbasedev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);

    trace_vfio_device_get(name, info->flags, info->num_regions, info->num_irqs);

    return true;
}

static void vfio_device_put(VFIODevice *vbasedev)
{
    if (!vbasedev->group) {
        return;
    }
    QLIST_REMOVE(vbasedev, next);
    vbasedev->group = NULL;
    trace_vfio_device_put(vbasedev->fd);
    close(vbasedev->fd);
}

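/* Resolve the IOMMU group number from the device's sysfs iommu_group link. */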
static int vfio_device_get_groupid(VFIODevice *vbasedev, Error **errp)
{
    char *tmp, group_path[PATH_MAX];
    g_autofree char *group_name = NULL;
    int ret, groupid;
    ssize_t len;

    tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev);
    len = readlink(tmp, group_path, sizeof(group_path));
    g_free(tmp);

    if (len <= 0 || len >= sizeof(group_path)) {
        ret = len < 0 ? -errno : -ENAMETOOLONG;
        error_setg_errno(errp, -ret, "no iommu_group found");
        return ret;
    }

    group_path[len] = 0;

    group_name = g_path_get_basename(group_path);
    if (sscanf(group_name, "%d", &groupid) != 1) {
        error_setg_errno(errp, errno, "failed to read %s", group_path);
        return -errno;
    }
    return groupid;
}

/*
 * vfio_legacy_attach_device: attach a device to a security context
 * @name and @vbasedev->name are likely to be different depending
 * on the type of the device, hence the need for passing @name
 */
static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
                                      AddressSpace *as, Error **errp)
{
    int groupid = vfio_device_get_groupid(vbasedev, errp);
    VFIODevice *vbasedev_iter;
    VFIOGroup *group;

    if (groupid < 0) {
        return false;
    }

    trace_vfio_device_attach(vbasedev->name, groupid);

    group = vfio_group_get(groupid, as, errp);
    if (!group) {
        return false;
    }

    QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
        if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
            error_setg(errp, "device is already attached");
            goto group_put_exit;
        }
    }
    if (!vfio_device_get(group, name, vbasedev, errp)) {
        goto group_put_exit;
    }

    if (!vfio_device_hiod_create_and_realize(vbasedev,
                                             TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO,
                                             errp)) {
        goto device_put_exit;
    }

    return true;

device_put_exit:
    vfio_device_put(vbasedev);
group_put_exit:
    vfio_group_put(group);
    return false;
}

static void vfio_legacy_detach_device(VFIODevice *vbasedev)
{
    VFIOGroup *group = vbasedev->group;

    trace_vfio_device_detach(vbasedev->name, group->groupid);

    vfio_device_unprepare(vbasedev);

    object_unref(vbasedev->hiod);
    vfio_device_put(vbasedev);
    vfio_group_put(group);
}

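/*
 * Perform a PCI hot reset of @vbasedev.  All affected devices listed by the
 * kernel must belong to groups owned by this QEMU instance; their group fds
 * are passed along with the VFIO_DEVICE_PCI_HOT_RESET ioctl.
 */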
static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
    VFIOGroup *group;
    struct vfio_pci_hot_reset_info *info = NULL;
    struct vfio_pci_dependent_device *devices;
    struct vfio_pci_hot_reset *reset;
    int32_t *fds;
    int ret, i, count;
    bool multi = false;

    trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");

    if (!single) {
        vfio_pci_pre_reset(vdev);
    }
    vdev->vbasedev.needs_reset = false;

    ret = vfio_pci_get_pci_hot_reset_info(vdev, &info);

    if (ret) {
        goto out_single;
    }
    devices = &info->devices[0];

    trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);

    /* Verify that we have all the groups required */
    for (i = 0; i < info->count; i++) {
        PCIHostDeviceAddress host;
        VFIOPCIDevice *tmp;
        VFIODevice *vbasedev_iter;

        host.domain = devices[i].segment;
        host.bus = devices[i].bus;
        host.slot = PCI_SLOT(devices[i].devfn);
        host.function = PCI_FUNC(devices[i].devfn);

        trace_vfio_pci_hot_reset_dep_devices(host.domain,
                host.bus, host.slot, host.function, devices[i].group_id);

        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
            continue;
        }

        QLIST_FOREACH(group, &vfio_group_list, next) {
            if (group->groupid == devices[i].group_id) {
                break;
            }
        }

        if (!group) {
            if (!vdev->has_pm_reset) {
                error_report("vfio: Cannot reset device %s, "
                             "depends on group %d which is not owned.",
                             vdev->vbasedev.name, devices[i].group_id);
            }
            ret = -EPERM;
            goto out;
        }

        /* Prep dependent devices for reset and clear our marker. */
        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
            if (!vbasedev_iter->dev->realized ||
                vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
                continue;
            }
            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
                if (single) {
                    ret = -EINVAL;
                    goto out_single;
                }
                vfio_pci_pre_reset(tmp);
                tmp->vbasedev.needs_reset = false;
                multi = true;
                break;
            }
        }
    }

    if (!single && !multi) {
        ret = -EINVAL;
        goto out_single;
    }

    /* Determine how many group fds need to be passed */
    count = 0;
    QLIST_FOREACH(group, &vfio_group_list, next) {
        for (i = 0; i < info->count; i++) {
            if (group->groupid == devices[i].group_id) {
                count++;
                break;
            }
        }
    }

    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
    fds = &reset->group_fds[0];

    /* Fill in group fds */
    QLIST_FOREACH(group, &vfio_group_list, next) {
        for (i = 0; i < info->count; i++) {
            if (group->groupid == devices[i].group_id) {
                fds[reset->count++] = group->fd;
                break;
            }
        }
    }

    /* Bus reset! */
    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
    g_free(reset);
    if (ret) {
        ret = -errno;
    }

    trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
                                    ret ? strerror(errno) : "Success");

out:
    /* Re-enable INTx on affected devices */
    for (i = 0; i < info->count; i++) {
        PCIHostDeviceAddress host;
        VFIOPCIDevice *tmp;
        VFIODevice *vbasedev_iter;

        host.domain = devices[i].segment;
        host.bus = devices[i].bus;
        host.slot = PCI_SLOT(devices[i].devfn);
        host.function = PCI_FUNC(devices[i].devfn);

        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
            continue;
        }

        QLIST_FOREACH(group, &vfio_group_list, next) {
            if (group->groupid == devices[i].group_id) {
                break;
            }
        }

        if (!group) {
            break;
        }

        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
            if (!vbasedev_iter->dev->realized ||
                vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
                continue;
            }
            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
                vfio_pci_post_reset(tmp);
                break;
            }
        }
    }
out_single:
    if (!single) {
        vfio_pci_post_reset(vdev);
    }
    g_free(info);

    return ret;
}

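/* VFIOIOMMUClass implementation for the legacy (group/container) backend. */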
static void vfio_iommu_legacy_class_init(ObjectClass *klass, const void *data)
{
    VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);

    vioc->setup = vfio_legacy_setup;
    vioc->dma_map = vfio_legacy_dma_map;
    vioc->dma_unmap = vfio_legacy_dma_unmap;
    vioc->attach_device = vfio_legacy_attach_device;
    vioc->detach_device = vfio_legacy_detach_device;
    vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking;
    vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap;
    vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
}

static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque,
                                     Error **errp)
{
    VFIODevice *vdev = opaque;

    hiod->name = g_strdup(vdev->name);
    hiod->agent = opaque;

    return true;
}

static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap,
                                    Error **errp)
{
    switch (cap) {
    case HOST_IOMMU_DEVICE_CAP_AW_BITS:
        return vfio_device_get_aw_bits(hiod->agent);
    default:
        error_setg(errp, "%s: unsupported capability %x", hiod->name, cap);
        return -EINVAL;
    }
}

static GList *
hiod_legacy_vfio_get_iova_ranges(HostIOMMUDevice *hiod)
{
    VFIODevice *vdev = hiod->agent;

    g_assert(vdev);
    return vfio_container_get_iova_ranges(vdev->bcontainer);
}

static uint64_t
hiod_legacy_vfio_get_page_size_mask(HostIOMMUDevice *hiod)
{
    VFIODevice *vdev = hiod->agent;

    g_assert(vdev);
    return vfio_container_get_page_size_mask(vdev->bcontainer);
}

static void vfio_iommu_legacy_instance_init(Object *obj)
{
    VFIOContainer *container = VFIO_IOMMU_LEGACY(obj);

    QLIST_INIT(&container->group_list);
}

static void hiod_legacy_vfio_class_init(ObjectClass *oc, const void *data)
{
    HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);

    hioc->realize = hiod_legacy_vfio_realize;
    hioc->get_cap = hiod_legacy_vfio_get_cap;
    hioc->get_iova_ranges = hiod_legacy_vfio_get_iova_ranges;
    hioc->get_page_size_mask = hiod_legacy_vfio_get_page_size_mask;
}

static const TypeInfo types[] = {
    {
        .name = TYPE_VFIO_IOMMU_LEGACY,
        .parent = TYPE_VFIO_IOMMU,
        .instance_init = vfio_iommu_legacy_instance_init,
        .instance_size = sizeof(VFIOContainer),
        .class_init = vfio_iommu_legacy_class_init,
    }, {
        .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO,
        .parent = TYPE_HOST_IOMMU_DEVICE,
        .class_init = hiod_legacy_vfio_class_init,
    }
};

DEFINE_TYPES(types)
1219