xref: /qemu/hw/vfio/spapr.c (revision 06b40d250ecfa1633209c2e431a7a38acfd03a98)
/*
 * DMA memory preregistration
 *
 * Authors:
 *  Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "system/kvm.h"
#include "system/hostmem.h"
#include "system/address-spaces.h"

#include "hw/vfio/vfio-container.h"
#include "hw/hw.h"
#include "system/ram_addr.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "trace.h"
#include "vfio-helpers.h"

typedef struct VFIOHostDMAWindow {
    hwaddr min_iova;
    hwaddr max_iova;
    uint64_t iova_pgsizes;
    QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next;
} VFIOHostDMAWindow;

typedef struct VFIOSpaprContainer {
    VFIOContainer container;
    MemoryListener prereg_listener;
    QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
    unsigned int levels;
} VFIOSpaprContainer;

OBJECT_DECLARE_SIMPLE_TYPE(VFIOSpaprContainer, VFIO_IOMMU_SPAPR);

static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
{
    if (memory_region_is_iommu(section->mr)) {
        hw_error("Cannot possibly preregister IOMMU memory");
    }

    return !memory_region_is_ram(section->mr) ||
            memory_region_is_ram_device(section->mr);
}

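/*
 * Translate a guest physical address inside @section into the QEMU
 * virtual address of the backing RAM.
 */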
static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
{
    return memory_region_get_ram_ptr(section->mr) +
        section->offset_within_region +
        (gpa - section->offset_within_address_space);
}

static void vfio_prereg_listener_region_add(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
                                                  prereg_listener);
    VFIOContainer *container = &scontainer->container;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask();
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_add_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    memory_region_ref(section->mr);

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

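    /*
     * Preregister this chunk of guest RAM with the kernel: the pages are
     * pinned and accounted against the locked memory limit so that later
     * TCE (DMA window) updates can be performed without faulting.
     */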
    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
    /* ioctl() returns -1 and sets errno on failure; normalize to -errno */
    ret = ret ? -errno : 0;
    trace_vfio_prereg_register(reg.vaddr, reg.size, ret);
    if (ret) {
        /*
         * On the initfn path, store the first error in the container so we
         * can gracefully fail.  At runtime there's not much we can do other
         * than throw a hardware error.
         */
        if (!bcontainer->initialized) {
            if (!bcontainer->error) {
                error_setg_errno(&bcontainer->error, -ret,
                                 "Memory registering failed");
            }
        } else {
            hw_error("vfio: Memory registering failed, unable to continue");
        }
    }
}

static void vfio_prereg_listener_region_del(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
                                                  prereg_listener);
    VFIOContainer *container = &scontainer->container;
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask();
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_del_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
    trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
}

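/*
 * For TCE v2 containers this listener follows address_space_memory and
 * (un)preregisters guest RAM with the kernel as regions come and go.
 */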
static const MemoryListener vfio_prereg_listener = {
    .name = "vfio-pre-reg",
    .region_add = vfio_prereg_listener_region_add,
    .region_del = vfio_prereg_listener_region_del,
};

static void vfio_host_win_add(VFIOSpaprContainer *scontainer, hwaddr min_iova,
                              hwaddr max_iova, uint64_t iova_pgsizes)
{
    VFIOHostDMAWindow *hostwin;

    QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           min_iova,
                           max_iova - min_iova + 1)) {
            hw_error("%s: overlapping host DMA windows are not supported",
                     __func__);
        }
    }

    hostwin = g_malloc0(sizeof(*hostwin));

    hostwin->min_iova = min_iova;
    hostwin->max_iova = max_iova;
    hostwin->iova_pgsizes = iova_pgsizes;
    QLIST_INSERT_HEAD(&scontainer->hostwin_list, hostwin, hostwin_next);
}

static int vfio_host_win_del(VFIOSpaprContainer *scontainer,
                             hwaddr min_iova, hwaddr max_iova)
{
    VFIOHostDMAWindow *hostwin;

    QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
        if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
            QLIST_REMOVE(hostwin, hostwin_next);
            g_free(hostwin);
            return 0;
        }
    }

    return -1;
}

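/*
 * Find the host DMA window that entirely contains [iova, end];
 * return NULL if there is none.
 */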
static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container,
                                            hwaddr iova, hwaddr end)
{
    VFIOHostDMAWindow *hostwin;
    bool hostwin_found = false;

    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
            hostwin_found = true;
            break;
        }
    }

    return hostwin_found ? hostwin : NULL;
}

static int vfio_spapr_remove_window(VFIOContainer *container,
                                    hwaddr offset_within_address_space)
{
    struct vfio_iommu_spapr_tce_remove remove = {
        .argsz = sizeof(remove),
        .start_addr = offset_within_address_space,
    };
    int ret;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
    if (ret) {
        error_report("Failed to remove window at %"PRIx64,
                     (uint64_t)remove.start_addr);
        return -errno;
    }

    trace_vfio_spapr_remove_window(offset_within_address_space);

    return 0;
}

static bool vfio_spapr_create_window(VFIOContainer *container,
                                     MemoryRegionSection *section,
                                     hwaddr *pgsize, Error **errp)
{
    int ret = 0;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
    uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
    unsigned entries, bits_total, bits_per_level, max_levels, ddw_levels;
    struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
    long rampagesize = qemu_minrampagesize();

    /*
     * The host might not support the guest-supported IOMMU page size,
     * so we will use smaller physical IOMMU pages to back them.
     */
    if (pagesize > rampagesize) {
        pagesize = rampagesize;
    }
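    /*
     * (pagesize | (pagesize - 1)) is a mask of all page sizes up to and
     * including the requested one; AND it with the host-supported sizes
     * and keep the largest remaining bit.  E.g. with pagesize = 64K and
     * host pgsizes = 4K | 64K | 16M, pgmask = 4K | 64K and the chosen
     * pagesize is 64K.
     */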
    pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1));
    pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0;
    if (!pagesize) {
        error_setg_errno(errp, EINVAL, "Host doesn't support page size 0x%"PRIx64
                         ", the supported mask is 0x%lx",
                         memory_region_iommu_get_min_page_size(iommu_mr),
                         bcontainer->pgsizes);
        return false;
    }

    /*
     * FIXME: For VFIO iommu types which have KVM acceleration to
     * avoid bouncing all map/unmaps through qemu this way, this
     * would be the right place to wire that up (tell the KVM
     * device emulation the VFIO iommu handles to use).
     */
    create.window_size = int128_get64(section->size);
    create.page_shift = ctz64(pagesize);
    /*
     * The SPAPR host supports multilevel TCE tables.  We try to guess the
     * optimal number of levels and, if this fails (for example due to host
     * memory fragmentation), we increase the number of levels.  The DMA
     * address structure is:
     * rrrrrrrr rxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx iiiiiiii
     * where:
     *   r = reserved (bits >= 55 are reserved in the existing hardware)
     *   i = IOMMU page offset (64K in this example)
     *   x = bits to index a TCE which can be split into equal chunks to
     *       index within each level.
     * The aim is to split "x" across the smallest possible number of levels.
     */
    entries = create.window_size >> create.page_shift;
    /* bits_total is the number of "x" bits needed */
    bits_total = ctz64(entries * sizeof(uint64_t));
    /*
     * bits_per_level is a safe guess of how much we can allocate per level:
     * 8 is the current minimum for CONFIG_FORCE_MAX_ZONEORDER and MAX_ORDER
     * is usually bigger than that.
     * Below we look at qemu_real_host_page_size as TCEs are allocated from
     * system pages.
     */
    bits_per_level = ctz64(qemu_real_host_page_size()) + 8;
    create.levels = bits_total / bits_per_level;

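    /*
     * scontainer->levels holds the maximum TCE table depth reported by
     * the host via VFIO_IOMMU_SPAPR_TCE_GET_INFO.  When more than one
     * level is supported, retry the create ioctl with progressively
     * deeper tables until the allocation succeeds.
     */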
    ddw_levels = scontainer->levels;
    if (ddw_levels > 1) {
        if (bits_total % bits_per_level) {
            ++create.levels;
        }
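        /*
         * Each level is backed by system pages, so a table can index at
         * most (64 - page_shift) bits at log2(host page size) bits per
         * level.
         */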
        max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
        for ( ; create.levels <= max_levels; ++create.levels) {
            ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
            if (!ret) {
                break;
            }
        }
    } else { /* ddw_levels == 1 */
        if (create.levels > ddw_levels) {
            error_setg_errno(errp, EINVAL, "Host doesn't support multi-level TCE tables"
                             ". Use larger IO page size. Supported mask is 0x%lx",
                             bcontainer->pgsizes);
            return false;
        }
        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
    }

    if (ret) {
        error_setg_errno(errp, errno, "Failed to create a window, ret = %d", ret);
        return false;
    }

    if (create.start_addr != section->offset_within_address_space) {
        vfio_spapr_remove_window(container, create.start_addr);

        error_setg_errno(errp, EINVAL, "Host doesn't support DMA window at %"HWADDR_PRIx
                         ", must be %"PRIx64, section->offset_within_address_space,
                         (uint64_t)create.start_addr);
        return false;
    }
    trace_vfio_spapr_create_window(create.page_shift,
                                   create.levels,
                                   create.window_size,
                                   create.start_addr);
    *pgsize = pagesize;

    return true;
}

static bool
vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
                                        MemoryRegionSection *section,
                                        Error **errp)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    VFIOHostDMAWindow *hostwin;
    hwaddr pgsize = 0;
    int ret;

    /*
     * VFIO_SPAPR_TCE_IOMMU supports a single host window between
     * [dma32_window_start, dma32_window_start + dma32_window_size), so we
     * need to ensure the section falls within this range.
     */
    if (container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
        hwaddr iova, end;

        iova = section->offset_within_address_space;
        end = iova + int128_get64(section->size) - 1;

        if (!vfio_find_hostwin(scontainer, iova, end)) {
            error_setg(errp, "Container %p can't map guest IOVA region"
                       " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
                       iova, end);
            return false;
        }
        return true;
    }

    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return true;
    }

    /* For now intersections are not allowed, we may relax this later */
    QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           section->offset_within_address_space,
                           int128_get64(section->size))) {
            error_setg(errp,
                "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing "
                "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
                section->offset_within_address_space,
                section->offset_within_address_space +
                    int128_get64(section->size) - 1,
                hostwin->min_iova, hostwin->max_iova);
            return false;
        }
    }

    ret = vfio_spapr_create_window(container, section, &pgsize, errp);
    if (!ret) {
        return false;
    }

    vfio_host_win_add(scontainer, section->offset_within_address_space,
                      section->offset_within_address_space +
                      int128_get64(section->size) - 1, pgsize);
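    /*
     * With KVM, hand the new window's TCE table fd to the in-kernel VFIO
     * device (KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE) so that H_PUT_TCE and
     * friends can be serviced in the kernel instead of bouncing through
     * QEMU.
     */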
#ifdef CONFIG_KVM
    if (kvm_enabled()) {
        VFIOGroup *group;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        struct kvm_vfio_spapr_tce param;
        struct kvm_device_attr attr = {
            .group = KVM_DEV_VFIO_GROUP,
            .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
            .addr = (uint64_t)(unsigned long)&param,
        };

        if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
                                          &param.tablefd)) {
            QLIST_FOREACH(group, &container->group_list, container_next) {
                param.groupfd = group->fd;
                if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
                    error_setg_errno(errp, errno,
                                     "vfio: failed GROUP_SET_SPAPR_TCE for "
                                     "KVM VFIO device %d and group fd %d",
                                     param.tablefd, param.groupfd);
                    return false;
                }
                trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
            }
        }
    }
#endif
    return true;
}

static void
vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer,
                                        MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);

    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return;
    }

    vfio_spapr_remove_window(container,
                             section->offset_within_address_space);
    if (vfio_host_win_del(scontainer,
                          section->offset_within_address_space,
                          section->offset_within_address_space +
                          int128_get64(section->size) - 1) < 0) {
        hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
                 __func__, section->offset_within_address_space);
    }
}

static void vfio_spapr_container_release(VFIOContainerBase *bcontainer)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    VFIOHostDMAWindow *hostwin, *next;

    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        memory_listener_unregister(&scontainer->prereg_listener);
    }
    QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next,
                       next) {
        QLIST_REMOVE(hostwin, hostwin_next);
        g_free(hostwin);
    }
}

static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
                                       Error **errp)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    struct vfio_iommu_spapr_tce_info info;
    bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
    int ret, fd = container->fd;

    QLIST_INIT(&scontainer->hostwin_list);

    /*
     * The host kernel code implementing VFIO_IOMMU_DISABLE is called
     * when the container fd is closed so we do not call it explicitly
     * in this file.
     */
    if (!v2) {
        ret = ioctl(fd, VFIO_IOMMU_ENABLE);
        if (ret) {
            error_setg_errno(errp, errno, "failed to enable container");
            return false;
        }
    } else {
        scontainer->prereg_listener = vfio_prereg_listener;

        memory_listener_register(&scontainer->prereg_listener,
                                 &address_space_memory);
        if (bcontainer->error) {
            error_propagate_prepend(errp, bcontainer->error,
                    "RAM memory listener initialization failed: ");
            goto listener_unregister_exit;
        }
    }

    info.argsz = sizeof(info);
    ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
    if (ret) {
        error_setg_errno(errp, errno,
                         "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
        goto listener_unregister_exit;
    }

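    /*
     * Remember the maximum number of TCE table levels the host supports;
     * vfio_spapr_create_window() uses this when sizing new DMA windows.
     */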
    scontainer->levels = info.ddw.levels;

    if (v2) {
        bcontainer->pgsizes = info.ddw.pgsizes;
        /*
         * A just-created container comes with a default DMA window.
         * To keep region_add/del simple, we remove this window now
         * and let the iommu_listener callbacks create and remove
         * windows as needed.
         */
        ret = vfio_spapr_remove_window(container, info.dma32_window_start);
        if (ret) {
            error_setg_errno(errp, -ret,
                             "failed to remove existing window");
            goto listener_unregister_exit;
        }
    } else {
        /* The default table uses 4K pages */
        bcontainer->pgsizes = 0x1000;
        vfio_host_win_add(scontainer, info.dma32_window_start,
                          info.dma32_window_start +
                          info.dma32_window_size - 1,
                          0x1000);
    }

    return true;

listener_unregister_exit:
    if (v2) {
        memory_listener_unregister(&scontainer->prereg_listener);
    }
    return false;
}

static void vfio_iommu_spapr_class_init(ObjectClass *klass, const void *data)
{
    VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);

    vioc->add_window = vfio_spapr_container_add_section_window;
    vioc->del_window = vfio_spapr_container_del_section_window;
    vioc->release = vfio_spapr_container_release;
    vioc->setup = vfio_spapr_container_setup;
}

static const TypeInfo types[] = {
    {
        .name = TYPE_VFIO_IOMMU_SPAPR,
        .parent = TYPE_VFIO_IOMMU_LEGACY,
        .instance_size = sizeof(VFIOSpaprContainer),
        .class_init = vfio_iommu_spapr_class_init,
    },
};

DEFINE_TYPES(types)