/*
 * DMA memory preregistration
 *
 * Authors:
 *  Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "system/kvm.h"
#include "system/hostmem.h"
#include "system/address-spaces.h"

#include "hw/vfio/vfio-container.h"
#include "hw/hw.h"
#include "system/ram_addr.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "trace.h"
#include "vfio-helpers.h"

typedef struct VFIOHostDMAWindow {
    hwaddr min_iova;
    hwaddr max_iova;
    uint64_t iova_pgsizes;
    QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next;
} VFIOHostDMAWindow;

typedef struct VFIOSpaprContainer {
    VFIOContainer container;
    MemoryListener prereg_listener;
    QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
    unsigned int levels;
} VFIOSpaprContainer;

OBJECT_DECLARE_SIMPLE_TYPE(VFIOSpaprContainer, VFIO_IOMMU_SPAPR);

static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
{
    if (memory_region_is_iommu(section->mr)) {
        hw_error("Cannot possibly preregister IOMMU memory");
    }

    return !memory_region_is_ram(section->mr) ||
           memory_region_is_ram_device(section->mr);
}

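/*
 * Translate a guest physical address within @section into the QEMU
 * userspace virtual address that backs it.
 */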
static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
{
    return memory_region_get_ram_ptr(section->mr) +
           section->offset_within_region +
           (gpa - section->offset_within_address_space);
}

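/*
 * Preregister the RAM backing @section with the sPAPR TCE IOMMU
 * (VFIO_IOMMU_SPAPR_REGISTER_MEMORY), so the host kernel pins and
 * accounts the pages once up front rather than on every DMA map/unmap.
 */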
static void vfio_prereg_listener_region_add(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
                                                  prereg_listener);
    VFIOContainer *container = &scontainer->container;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask();
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_add_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    memory_region_ref(section->mr);

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
    trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0);
    if (ret) {
        /*
         * On the initfn path, store the first error in the container so we
         * can gracefully fail.  Runtime, there's not much we can do other
         * than throw a hardware error.
         */
        if (!bcontainer->initialized) {
            if (!bcontainer->error) {
                error_setg_errno(&bcontainer->error, -ret,
                                 "Memory registering failed");
            }
        } else {
            hw_error("vfio: Memory registering failed, unable to continue");
        }
    }
}

static void vfio_prereg_listener_region_del(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
                                                  prereg_listener);
    VFIOContainer *container = &scontainer->container;
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask();
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_del_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
    trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
}

static const MemoryListener vfio_prereg_listener = {
    .name = "vfio-pre-reg",
    .region_add = vfio_prereg_listener_region_add,
    .region_del = vfio_prereg_listener_region_del,
};

static void vfio_host_win_add(VFIOSpaprContainer *scontainer, hwaddr min_iova,
                              hwaddr max_iova, uint64_t iova_pgsizes)
{
    VFIOHostDMAWindow *hostwin;

    QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           min_iova,
                           max_iova - min_iova + 1)) {
177 hw_error("%s: Overlapped IOMMU are not enabled", __func__);
        }
    }

    hostwin = g_malloc0(sizeof(*hostwin));

    hostwin->min_iova = min_iova;
    hostwin->max_iova = max_iova;
    hostwin->iova_pgsizes = iova_pgsizes;
    QLIST_INSERT_HEAD(&scontainer->hostwin_list, hostwin, hostwin_next);
}

static int vfio_host_win_del(VFIOSpaprContainer *scontainer,
                             hwaddr min_iova, hwaddr max_iova)
{
    VFIOHostDMAWindow *hostwin;

    QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
        if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
            QLIST_REMOVE(hostwin, hostwin_next);
            g_free(hostwin);
            return 0;
        }
    }

    return -1;
}

static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container,
                                            hwaddr iova, hwaddr end)
{
    VFIOHostDMAWindow *hostwin;
    bool hostwin_found = false;

    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
            hostwin_found = true;
            break;
        }
    }

    return hostwin_found ? hostwin : NULL;
}

static int vfio_spapr_remove_window(VFIOContainer *container,
                                    hwaddr offset_within_address_space)
{
    struct vfio_iommu_spapr_tce_remove remove = {
        .argsz = sizeof(remove),
        .start_addr = offset_within_address_space,
    };
    int ret;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
    if (ret) {
        error_report("Failed to remove window at %"PRIx64,
                     (uint64_t)remove.start_addr);
        return -errno;
    }

    trace_vfio_spapr_remove_window(offset_within_address_space);

    return 0;
}

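/*
 * Create a host DMA window matching @section: pick the largest
 * host-supported IOMMU page size not exceeding the guest's, then ask
 * the kernel for a TCE table, retrying with more levels if allocating
 * a shallower table fails.
 */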
static bool vfio_spapr_create_window(VFIOContainer *container,
                                     MemoryRegionSection *section,
                                     hwaddr *pgsize, Error **errp)
{
    int ret = 0;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
    uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
    unsigned entries, bits_total, bits_per_level, max_levels, ddw_levels;
    struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
    long rampagesize = qemu_minrampagesize();

    /*
     * The host might not support the guest supported IOMMU page size,
     * so we will use smaller physical IOMMU pages to back them.
     */
    if (pagesize > rampagesize) {
        pagesize = rampagesize;
    }
    pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1));
    pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0;
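    /*
     * Illustrative example (hypothetical numbers): for a guest minimum
     * IOMMU page size of 64K (pagesize == 0x10000) and host-supported
     * sizes of 4K, 64K and 16M (bcontainer->pgsizes == 0x1011000), the
     * mask above keeps only the sizes <= 64K (pgmask == 0x11000) and
     * the shift picks the largest remaining one: pagesize == 0x10000.
     */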
    if (!pagesize) {
        error_setg_errno(errp, EINVAL, "Host doesn't support page size 0x%"PRIx64
                         ", the supported mask is 0x%lx",
                         memory_region_iommu_get_min_page_size(iommu_mr),
                         bcontainer->pgsizes);
        return false;
    }

    /*
     * FIXME: For VFIO iommu types which have KVM acceleration to
     * avoid bouncing all map/unmaps through qemu this way, this
     * would be the right place to wire that up (tell the KVM
     * device emulation the VFIO iommu handles to use).
     */
    create.window_size = int128_get64(section->size);
    create.page_shift = ctz64(pagesize);
    /*
     * The sPAPR host supports multilevel TCE tables.  We try to guess an
     * optimal number of levels and, if that fails (for example due to
     * host memory fragmentation), we increase the number of levels.
     * The DMA address structure is:
     * rrrrrrrr rxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx iiiiiiii
     * where:
     *   r = reserved (bits >= 55 are reserved in the existing hardware)
     *   i = IOMMU page offset (64K in this example)
     *   x = bits to index a TCE which can be split into equal chunks to
     *       index within each level.
     * The aim is to split "x" into the smallest possible number of levels.
     */
    entries = create.window_size >> create.page_shift;
    /* bits_total is the number of "x" bits needed */
    bits_total = ctz64(entries * sizeof(uint64_t));
    /*
     * bits_per_level is a safe guess of how much we can allocate per level:
     * 8 is the current minimum for CONFIG_FORCE_MAX_ZONEORDER and MAX_ORDER
     * is usually bigger than that.
     * Below we look at qemu_real_host_page_size as TCEs are allocated from
     * system pages.
     */
    bits_per_level = ctz64(qemu_real_host_page_size()) + 8;
    create.levels = bits_total / bits_per_level;
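    /*
     * Illustrative example (hypothetical numbers): a 1TB window of 64K
     * IOMMU pages holds 2^24 TCEs, i.e. a 2^27 byte table, so bits_total
     * is 27.  With 4K host pages bits_per_level is 12 + 8 == 20, so the
     * division above yields one level, and the remainder check in the
     * multi-level branch below bumps it to two.
     */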

    ddw_levels = scontainer->levels;
    if (ddw_levels > 1) {
        if (bits_total % bits_per_level) {
            ++create.levels;
        }
        max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
        for ( ; create.levels <= max_levels; ++create.levels) {
            ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
            if (!ret) {
                break;
            }
        }
    } else { /* ddw_levels == 1 */
        if (create.levels > ddw_levels) {
            error_setg_errno(errp, EINVAL, "Host doesn't support multi-level TCE tables"
                             ". Use larger IO page size. Supported mask is 0x%lx",
                             bcontainer->pgsizes);
            return false;
        }
        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
    }

    if (ret) {
        error_setg_errno(errp, errno, "Failed to create a window, ret = %d", ret);
        return false;
    }

    if (create.start_addr != section->offset_within_address_space) {
        vfio_spapr_remove_window(container, create.start_addr);

        error_setg_errno(errp, EINVAL, "Host doesn't support DMA window at %"HWADDR_PRIx
                         ", must be %"PRIx64, section->offset_within_address_space,
                         (uint64_t)create.start_addr);
        return false;
    }
    trace_vfio_spapr_create_window(create.page_shift,
                                   create.levels,
                                   create.window_size,
                                   create.start_addr);
    *pgsize = pagesize;

    return true;
}

static bool
vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
                                        MemoryRegionSection *section,
                                        Error **errp)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    VFIOHostDMAWindow *hostwin;
    hwaddr pgsize = 0;
    int ret;

    /*
     * VFIO_SPAPR_TCE_IOMMU supports a single host window between
     * [dma32_window_start, dma32_window_start + dma32_window_size),
     * so we need to ensure the section falls within this range.
     */
    if (container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
        hwaddr iova, end;

        iova = section->offset_within_address_space;
        end = iova + int128_get64(section->size) - 1;

        if (!vfio_find_hostwin(scontainer, iova, end)) {
            error_setg(errp, "Container %p can't map guest IOVA region"
                       " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
                       iova, end);
            return false;
        }
        return true;
    }

    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return true;
    }

    /* For now intersections are not allowed, we may relax this later */
    QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           section->offset_within_address_space,
                           int128_get64(section->size))) {
            error_setg(errp,
                "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing "
                "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(section->size) - 1,
                hostwin->min_iova, hostwin->max_iova);
            return false;
        }
    }

    ret = vfio_spapr_create_window(container, section, &pgsize, errp);
    if (!ret) {
        return false;
    }

    vfio_host_win_add(scontainer, section->offset_within_address_space,
                      section->offset_within_address_space +
                      int128_get64(section->size) - 1, pgsize);
#ifdef CONFIG_KVM
    if (kvm_enabled()) {
        VFIOGroup *group;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        struct kvm_vfio_spapr_tce param;
        struct kvm_device_attr attr = {
            .group = KVM_DEV_VFIO_GROUP,
            .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
            .addr = (uint64_t)(unsigned long)&param,
        };

        if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
                                          &param.tablefd)) {
            QLIST_FOREACH(group, &container->group_list, container_next) {
                param.groupfd = group->fd;
                if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
                    error_setg_errno(errp, errno,
                                     "vfio: failed GROUP_SET_SPAPR_TCE for "
                                     "KVM VFIO device %d and group fd %d",
                                     param.tablefd, param.groupfd);
                    return false;
                }
                trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
            }
        }
    }
#endif
    return true;
}

static void
vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer,
                                        MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);

    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return;
    }

    vfio_spapr_remove_window(container,
                             section->offset_within_address_space);
    if (vfio_host_win_del(scontainer,
                          section->offset_within_address_space,
                          section->offset_within_address_space +
                          int128_get64(section->size) - 1) < 0) {
        hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
                 __func__, section->offset_within_address_space);
    }
}

static void vfio_spapr_container_release(VFIOContainerBase *bcontainer)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    VFIOHostDMAWindow *hostwin, *next;

    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        memory_listener_unregister(&scontainer->prereg_listener);
    }
    QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next,
                       next) {
        QLIST_REMOVE(hostwin, hostwin_next);
        g_free(hostwin);
    }
}

static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
                                       Error **errp)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    struct vfio_iommu_spapr_tce_info info;
    bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
    int ret, fd = container->fd;

    QLIST_INIT(&scontainer->hostwin_list);

    /*
     * The host kernel code implementing VFIO_IOMMU_DISABLE is called
     * when the container fd is closed, so we do not call it explicitly
     * in this file.
     */
    if (!v2) {
        ret = ioctl(fd, VFIO_IOMMU_ENABLE);
        if (ret) {
            error_setg_errno(errp, errno, "failed to enable container");
            return false;
        }
    } else {
        scontainer->prereg_listener = vfio_prereg_listener;

        memory_listener_register(&scontainer->prereg_listener,
                                 &address_space_memory);
        if (bcontainer->error) {
            error_propagate_prepend(errp, bcontainer->error,
                    "RAM memory listener initialization failed: ");
            goto listener_unregister_exit;
        }
    }

    info.argsz = sizeof(info);
    ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
    if (ret) {
        error_setg_errno(errp, errno,
                         "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
        goto listener_unregister_exit;
    }

    scontainer->levels = info.ddw.levels;

    if (v2) {
        bcontainer->pgsizes = info.ddw.pgsizes;
        /*
         * A newly created container comes with a default window.  To keep
         * region_add/del simple, we remove it now and let the
         * iommu_listener callbacks create and remove windows as needed.
         */
        ret = vfio_spapr_remove_window(container, info.dma32_window_start);
        if (ret) {
            error_setg_errno(errp, -ret,
                             "failed to remove existing window");
            goto listener_unregister_exit;
        }
    } else {
        /* The default table uses 4K pages */
        bcontainer->pgsizes = 0x1000;
        vfio_host_win_add(scontainer, info.dma32_window_start,
                          info.dma32_window_start +
                          info.dma32_window_size - 1,
                          0x1000);
    }

    return true;

listener_unregister_exit:
    if (v2) {
        memory_listener_unregister(&scontainer->prereg_listener);
    }
    return false;
}

static void vfio_iommu_spapr_class_init(ObjectClass *klass, const void *data)
{
    VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);

    vioc->add_window = vfio_spapr_container_add_section_window;
    vioc->del_window = vfio_spapr_container_del_section_window;
    vioc->release = vfio_spapr_container_release;
    vioc->setup = vfio_spapr_container_setup;
}

static const TypeInfo types[] = {
    {
        .name = TYPE_VFIO_IOMMU_SPAPR,
        .parent = TYPE_VFIO_IOMMU_LEGACY,
        .instance_size = sizeof(VFIOSpaprContainer),
        .class_init = vfio_iommu_spapr_class_init,
    },
};

DEFINE_TYPES(types)