/*
 * DMA memory preregistration
 *
 * Authors:
 *  Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "system/kvm.h"
#include "system/hostmem.h"
#include "system/address-spaces.h"

#include "hw/vfio/vfio-container.h"
#include "hw/hw.h"
#include "system/ram_addr.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "trace.h"
#include "vfio-helpers.h"

typedef struct VFIOHostDMAWindow {
    hwaddr min_iova;
    hwaddr max_iova;
    uint64_t iova_pgsizes;
    QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next;
} VFIOHostDMAWindow;

typedef struct VFIOSpaprContainer {
    VFIOContainer container;
    MemoryListener prereg_listener;
    QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
    unsigned int levels;
} VFIOSpaprContainer;

OBJECT_DECLARE_SIMPLE_TYPE(VFIOSpaprContainer, VFIO_IOMMU_SPAPR);

static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
{
    if (memory_region_is_iommu(section->mr)) {
        hw_error("Cannot possibly preregister IOMMU memory");
    }

    return !memory_region_is_ram(section->mr) ||
           memory_region_is_ram_device(section->mr);
}

static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
{
    return memory_region_get_ram_ptr(section->mr) +
        section->offset_within_region +
        (gpa - section->offset_within_address_space);
}

static void vfio_prereg_listener_region_add(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
                                                  prereg_listener);
    VFIOContainer *container = &scontainer->container;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask();
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_add_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    memory_region_ref(section->mr);

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
    trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0);
    if (ret) {
        /*
         * On the initfn path, store the first error in the container so we
         * can fail gracefully. At runtime, there's not much we can do other
         * than throw a hardware error.
         */
        if (!bcontainer->initialized) {
            if (!bcontainer->error) {
                error_setg_errno(&bcontainer->error, -ret,
                                 "Memory registering failed");
            }
        } else {
            hw_error("vfio: Memory registering failed, unable to continue");
        }
    }
}
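/*
 * Illustrative example (editor's sketch, not part of the original code):
 * for a RAM section with offset_within_address_space = 0x100000000,
 * offset_within_region = 0 and host backing starting at vaddr V,
 * vfio_prereg_gpa_to_vaddr(section, 0x100010000) yields V + 0x10000, i.e.
 * the userspace address whose backing pages the
 * VFIO_IOMMU_SPAPR_REGISTER_MEMORY ioctl above asks the kernel to pin.
 */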
static void vfio_prereg_listener_region_del(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
                                                  prereg_listener);
    VFIOContainer *container = &scontainer->container;
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask();
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_del_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
    trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
}

static const MemoryListener vfio_prereg_listener = {
    .name = "vfio-pre-reg",
    .region_add = vfio_prereg_listener_region_add,
    .region_del = vfio_prereg_listener_region_del,
};

static void vfio_host_win_add(VFIOSpaprContainer *scontainer, hwaddr min_iova,
                              hwaddr max_iova, uint64_t iova_pgsizes)
{
    VFIOHostDMAWindow *hostwin;

    QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           min_iova,
                           max_iova - min_iova + 1)) {
            hw_error("%s: Overlapping host DMA windows are not supported",
                     __func__);
        }
    }

    hostwin = g_malloc0(sizeof(*hostwin));

    hostwin->min_iova = min_iova;
    hostwin->max_iova = max_iova;
    hostwin->iova_pgsizes = iova_pgsizes;
    QLIST_INSERT_HEAD(&scontainer->hostwin_list, hostwin, hostwin_next);
}

static int vfio_host_win_del(VFIOSpaprContainer *scontainer,
                             hwaddr min_iova, hwaddr max_iova)
{
    VFIOHostDMAWindow *hostwin;

    QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
        if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
            QLIST_REMOVE(hostwin, hostwin_next);
            g_free(hostwin);
            return 0;
        }
    }

    return -1;
}

static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container,
                                            hwaddr iova, hwaddr end)
{
    VFIOHostDMAWindow *hostwin;
    bool hostwin_found = false;

    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
            hostwin_found = true;
            break;
        }
    }

    return hostwin_found ? hostwin : NULL;
}
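/*
 * Illustrative example (editor's sketch, not part of the original code):
 * with a single 32-bit window covering [0x0 .. 0x7fffffff],
 * vfio_find_hostwin(scontainer, 0x1000, 0x1fff) returns that window,
 * whereas a lookup for [0x7ffff000 .. 0x80000fff] returns NULL: the range
 * must be fully contained in one window, partial overlap is not enough.
 */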
static int vfio_spapr_remove_window(VFIOContainer *container,
                                    hwaddr offset_within_address_space)
{
    struct vfio_iommu_spapr_tce_remove remove = {
        .argsz = sizeof(remove),
        .start_addr = offset_within_address_space,
    };
    int ret;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
    if (ret) {
        error_report("Failed to remove window at %"PRIx64,
                     (uint64_t)remove.start_addr);
        return -errno;
    }

    trace_vfio_spapr_remove_window(offset_within_address_space);

    return 0;
}

static bool vfio_spapr_create_window(VFIOContainer *container,
                                     MemoryRegionSection *section,
                                     hwaddr *pgsize, Error **errp)
{
    int ret = 0;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
    uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
    unsigned entries, bits_total, bits_per_level, max_levels, ddw_levels;
    struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
    long rampagesize = qemu_minrampagesize();

    /*
     * The host might not support the IOMMU page size the guest expects,
     * so we will use smaller physical IOMMU pages to back guest pages.
     */
    if (pagesize > rampagesize) {
        pagesize = rampagesize;
    }
    pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1));
    pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0;
    if (!pagesize) {
        error_setg_errno(errp, EINVAL, "Host doesn't support page size 0x%"PRIx64
                         ", the supported mask is 0x%lx",
                         memory_region_iommu_get_min_page_size(iommu_mr),
                         bcontainer->pgsizes);
        return false;
    }

    /*
     * FIXME: For VFIO iommu types which have KVM acceleration to
     * avoid bouncing all map/unmaps through qemu this way, this
     * would be the right place to wire that up (tell the KVM
     * device emulation the VFIO iommu handles to use).
     */
    create.window_size = int128_get64(section->size);
    create.page_shift = ctz64(pagesize);
    /*
     * The SPAPR host supports multilevel TCE tables. We try to guess an
     * optimal number of levels and, if that fails (for example due to
     * host memory fragmentation), we increase the number of levels.
     * The DMA address structure is:
     * rrrrrrrr rxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx iiiiiiii
     * where:
     *   r = reserved (bits >= 55 are reserved in the existing hardware)
     *   i = IOMMU page offset (64K in this example)
     *   x = bits to index a TCE which can be split into equal chunks to
     *       index within the level.
     * The aim is to split "x" across the smallest possible number of levels.
     */
    entries = create.window_size >> create.page_shift;
    /* bits_total is the number of "x" bits needed */
    bits_total = ctz64(entries * sizeof(uint64_t));
    /*
     * bits_per_level is a safe guess of how much we can allocate per level:
     * 8 is the current minimum for CONFIG_FORCE_MAX_ZONEORDER and MAX_ORDER
     * is usually bigger than that.
     * Below we look at qemu_real_host_page_size as TCEs are allocated from
     * system pages.
     */
    bits_per_level = ctz64(qemu_real_host_page_size()) + 8;
    create.levels = bits_total / bits_per_level;

    ddw_levels = scontainer->levels;
    if (ddw_levels > 1) {
        if (bits_total % bits_per_level) {
            ++create.levels;
        }
        max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
        for ( ; create.levels <= max_levels; ++create.levels) {
            ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
            if (!ret) {
                break;
            }
        }
    } else { /* ddw_levels == 1 */
        if (create.levels > ddw_levels) {
            error_setg_errno(errp, EINVAL,
                             "Host doesn't support multi-level TCE tables"
                             ". Use a larger IO page size. Supported mask is 0x%lx",
                             bcontainer->pgsizes);
            return false;
        }
        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
    }

    if (ret) {
        error_setg_errno(errp, errno, "Failed to create a window, ret = %d", ret);
        return false;
    }

    if (create.start_addr != section->offset_within_address_space) {
        vfio_spapr_remove_window(container, create.start_addr);

        error_setg_errno(errp, EINVAL,
                         "Host doesn't support DMA window at %"HWADDR_PRIx
                         ", must be %"PRIx64,
                         section->offset_within_address_space,
                         (uint64_t)create.start_addr);
        return false;
    }
    trace_vfio_spapr_create_window(create.page_shift,
                                   create.levels,
                                   create.window_size,
                                   create.start_addr);
    *pgsize = pagesize;

    return true;
}
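/*
 * Worked example for the level arithmetic above (editor's sketch,
 * illustrative numbers): a 1TB window (2^40) with 64K IOMMU pages
 * (page_shift = 16) needs entries = 2^24 TCEs, i.e. 2^27 bytes of table,
 * so bits_total = 27. With 4K host pages, bits_per_level = 12 + 8 = 20,
 * giving create.levels = 27 / 20 = 1, bumped to 2 by the remainder check,
 * while max_levels = (64 - 16) / 12 = 4 bounds the retry loop.
 */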
static bool
vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
                                        MemoryRegionSection *section,
                                        Error **errp)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    VFIOHostDMAWindow *hostwin;
    hwaddr pgsize = 0;
    int ret;

    /*
     * VFIO_SPAPR_TCE_IOMMU supports a single host window between
     * [dma32_window_start, dma32_window_start + dma32_window_size),
     * so we need to ensure the section falls within this range.
     */
    if (container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
        hwaddr iova, end;

        iova = section->offset_within_address_space;
        end = iova + int128_get64(section->size) - 1;

        if (!vfio_find_hostwin(scontainer, iova, end)) {
            error_setg(errp, "Container %p can't map guest IOVA region"
                       " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
                       iova, end);
            return false;
        }
        return true;
    }

    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return true;
    }

    /* For now intersections are not allowed, we may relax this later */
    QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           section->offset_within_address_space,
                           int128_get64(section->size))) {
            error_setg(errp,
                "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing "
                "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
                section->offset_within_address_space,
                section->offset_within_address_space +
                    int128_get64(section->size) - 1,
                hostwin->min_iova, hostwin->max_iova);
            return false;
        }
    }

    ret = vfio_spapr_create_window(container, section, &pgsize, errp);
    if (!ret) {
        return false;
    }

    vfio_host_win_add(scontainer, section->offset_within_address_space,
                      section->offset_within_address_space +
                      int128_get64(section->size) - 1, pgsize);
#ifdef CONFIG_KVM
    if (kvm_enabled()) {
        VFIOGroup *group;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        struct kvm_vfio_spapr_tce param;
        struct kvm_device_attr attr = {
            .group = KVM_DEV_VFIO_GROUP,
            .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
            .addr = (uint64_t)(unsigned long)&param,
        };

        if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
                                          &param.tablefd)) {
            QLIST_FOREACH(group, &container->group_list, container_next) {
                param.groupfd = group->fd;
                if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
                    error_setg_errno(errp, errno,
                                     "vfio: failed GROUP_SET_SPAPR_TCE for "
                                     "KVM VFIO device %d and group fd %d",
                                     param.tablefd, param.groupfd);
                    return false;
                }
                trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
            }
        }
    }
#endif
    return true;
}
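/*
 * Editor's note on the CONFIG_KVM block above (assumed semantics, not
 * original text): it hands each VFIO group fd together with the TCE table
 * fd to the in-kernel KVM VFIO device via KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
 * so guest TCE updates can be serviced in the kernel rather than bouncing
 * every map/unmap through QEMU.
 */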
static void
vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer,
                                        MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);

    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return;
    }

    vfio_spapr_remove_window(container,
                             section->offset_within_address_space);
    if (vfio_host_win_del(scontainer,
                          section->offset_within_address_space,
                          section->offset_within_address_space +
                          int128_get64(section->size) - 1) < 0) {
        hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
                 __func__, section->offset_within_address_space);
    }
}

static void vfio_spapr_container_release(VFIOContainerBase *bcontainer)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    VFIOHostDMAWindow *hostwin, *next;

    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        memory_listener_unregister(&scontainer->prereg_listener);
    }
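    /* (release() continues below; the example comment follows the function) */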
    QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next,
                       next) {
        QLIST_REMOVE(hostwin, hostwin_next);
        g_free(hostwin);
    }
}

/*
 * Illustrative example of what setup() below may see (editor's sketch,
 * values are made up): on a VFIO_SPAPR_TCE_v2_IOMMU host,
 * VFIO_IOMMU_SPAPR_TCE_GET_INFO could report dma32_window_start = 0,
 * dma32_window_size = 0x80000000, ddw.pgsizes = 0x11000 (4K | 64K) and
 * ddw.levels > 1, in which case the default 32-bit window is removed and
 * windows are created on demand by the callbacks above.
 */

static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
                                       Error **errp)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    struct vfio_iommu_spapr_tce_info info;
    bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
    int ret, fd = container->fd;

    QLIST_INIT(&scontainer->hostwin_list);

    /*
     * The host kernel code implementing VFIO_IOMMU_DISABLE is called
     * when the container fd is closed, so we do not call it explicitly
     * in this file.
     */
    if (!v2) {
        ret = ioctl(fd, VFIO_IOMMU_ENABLE);
        if (ret) {
            error_setg_errno(errp, errno, "failed to enable container");
            return false;
        }
    } else {
        scontainer->prereg_listener = vfio_prereg_listener;

        memory_listener_register(&scontainer->prereg_listener,
                                 &address_space_memory);
        if (bcontainer->error) {
            error_propagate_prepend(errp, bcontainer->error,
                    "RAM memory listener initialization failed: ");
            goto listener_unregister_exit;
        }
    }

    info.argsz = sizeof(info);
    ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
    if (ret) {
        error_setg_errno(errp, errno,
                         "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
        goto listener_unregister_exit;
    }

    scontainer->levels = info.ddw.levels;

    if (v2) {
        bcontainer->pgsizes = info.ddw.pgsizes;
        /*
         * A newly created container comes with a default window, so to
         * keep region_add/del simple, we remove that window now and let
         * the iommu_listener callbacks create/remove windows as needed.
         */
        ret = vfio_spapr_remove_window(container, info.dma32_window_start);
        if (ret) {
            error_setg_errno(errp, -ret,
                             "failed to remove existing window");
            goto listener_unregister_exit;
        }
    } else {
        /* The default table uses 4K pages */
        bcontainer->pgsizes = 0x1000;
        vfio_host_win_add(scontainer, info.dma32_window_start,
                          info.dma32_window_start +
                          info.dma32_window_size - 1,
                          0x1000);
    }

    return true;

listener_unregister_exit:
    if (v2) {
        memory_listener_unregister(&scontainer->prereg_listener);
    }
    return false;
}

static void vfio_iommu_spapr_class_init(ObjectClass *klass, const void *data)
{
    VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);

    vioc->add_window = vfio_spapr_container_add_section_window;
    vioc->del_window = vfio_spapr_container_del_section_window;
    vioc->release = vfio_spapr_container_release;
    vioc->setup = vfio_spapr_container_setup;
}

static const TypeInfo types[] = {
    {
        .name = TYPE_VFIO_IOMMU_SPAPR,
        .parent = TYPE_VFIO_IOMMU_LEGACY,
        .instance_size = sizeof(VFIOSpaprContainer),
        .class_init = vfio_iommu_spapr_class_init,
    },
};

DEFINE_TYPES(types)