/*
 * vhost-vdpa
 *
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <linux/vhost.h>
#include <linux/vfio.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-backend.h"
#include "hw/virtio/virtio-net.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"
#include "hw/virtio/vhost-vdpa.h"
#include "exec/address-spaces.h"
#include "migration/blocker.h"
#include "qemu/cutils.h"
#include "qemu/main-loop.h"
#include "cpu.h"
#include "trace.h"
#include "qapi/error.h"

/*
 * Return one past the end of the section. Be careful with uint64_t
 * conversions!
 */
static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
{
    Int128 llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));

    return llend;
}

static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
                                                uint64_t iova_min,
                                                uint64_t iova_max)
{
    Int128 llend;

    if ((!memory_region_is_ram(section->mr) &&
         !memory_region_is_iommu(section->mr)) ||
        memory_region_is_protected(section->mr) ||
        /* vhost-vDPA doesn't allow MMIO to be mapped */
        memory_region_is_ram_device(section->mr)) {
        return true;
    }

    if (section->offset_within_address_space < iova_min) {
        error_report("RAM section out of device range (min=0x%" PRIx64
                     ", addr=0x%" HWADDR_PRIx ")",
                     iova_min, section->offset_within_address_space);
        return true;
    }

    llend = vhost_vdpa_section_end(section);
    if (int128_gt(llend, int128_make64(iova_max))) {
        error_report("RAM section out of device range (max=0x%" PRIx64
                     ", end addr=0x%" PRIx64 ")",
                     iova_max, int128_get64(llend));
        return true;
    }

    return false;
}

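/*
 * Map [iova, iova + size) to the host virtual address @vaddr by sending a
 * VHOST_IOTLB_UPDATE message over the vhost-vdpa device fd.
 */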
int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
                       void *vaddr, bool readonly)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
    msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
    msg.iotlb.type = VHOST_IOTLB_UPDATE;

    trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
                             msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}

int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.type = VHOST_IOTLB_INVALIDATE;

    trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
                               msg.iotlb.size, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}

static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
{
    int fd = v->device_fd;
    struct vhost_msg_v2 msg = {
        .type = v->msg_type,
        .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
    };

    trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }
}

static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
{
    if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
        !v->iotlb_batch_begin_sent) {
        vhost_vdpa_listener_begin_batch(v);
    }

    v->iotlb_batch_begin_sent = true;
}

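/*
 * Close an open IOTLB batch: if the backend advertised
 * VHOST_BACKEND_F_IOTLB_BATCH and a BATCH_BEGIN was sent, emit the matching
 * VHOST_IOTLB_BATCH_END so the device can apply the queued updates.
 */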
static void vhost_vdpa_listener_commit(MemoryListener *listener)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    struct vhost_dev *dev = v->dev;
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;

    if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
        return;
    }

    if (!v->iotlb_batch_begin_sent) {
        return;
    }

    msg.type = v->msg_type;
    msg.iotlb.type = VHOST_IOTLB_BATCH_END;

    trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }

    v->iotlb_batch_begin_sent = false;
}

static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    void *vaddr;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);
    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    memory_region_ref(section->mr);

    /* Here we assume that memory_region_is_ram(section->mr) == true */

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
                                         vaddr, section->readonly);

    llsize = int128_sub(llend, int128_make64(iova));
    if (v->shadow_vqs_enabled) {
        DMAMap mem_region = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = int128_get64(llsize) - 1,
            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
        };

        int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
        if (unlikely(r != IOVA_OK)) {
            error_report("Can't allocate a mapping (%d)", r);
            goto fail;
        }

        iova = mem_region.iova;
    }

    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
                             vaddr, section->readonly);
    if (ret) {
        error_report("vhost vdpa map fail!");
        goto fail;
    }

    return;

fail:
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail. Runtime, there's not much we can do other
     * than throw a hardware error.
     */
    error_report("vhost-vdpa: DMA mapping failed, unable to continue");
    return;

}

static void vhost_vdpa_listener_region_del(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);

    trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));

    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    llsize = int128_sub(llend, int128_make64(iova));

    if (v->shadow_vqs_enabled) {
        const DMAMap *result;
        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);
        DMAMap mem_region = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = int128_get64(llsize) - 1,
        };

        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
        if (!result) {
            /* The memory listener map wasn't mapped */
            return;
        }
        iova = result->iova;
        vhost_iova_tree_remove(v->iova_tree, result);
    }
    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
    if (ret) {
        error_report("vhost_vdpa dma unmap error!");
    }

    memory_region_unref(section->mr);
}
/*
 * The IOTLB API is used by vhost-vdpa, which requires incremental updating
 * of the mapping. So we cannot use the generic vhost memory listener, which
 * depends on the addnop().
 */
static const MemoryListener vhost_vdpa_memory_listener = {
    .name = "vhost-vdpa",
    .commit = vhost_vdpa_listener_commit,
    .region_add = vhost_vdpa_listener_region_add,
    .region_del = vhost_vdpa_listener_region_del,
};

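/*
 * Issue a vhost-vdpa ioctl on the device fd and convert the POSIX-style
 * failure (-1 plus errno) into a negative errno return value.
 */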
static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
                           void *arg)
{
    struct vhost_vdpa *v = dev->opaque;
    int fd = v->device_fd;
    int ret;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);

    ret = ioctl(fd, request, arg);
    return ret < 0 ? -errno : ret;
}

static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
{
    uint8_t s;
    int ret;

    trace_vhost_vdpa_add_status(dev, status);
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    s |= status;

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    if (!(s & status)) {
        return -EIO;
    }

    return 0;
}

static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v)
{
    int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE,
                              &v->iova_range);
    if (ret != 0) {
        v->iova_range.first = 0;
        v->iova_range.last = UINT64_MAX;
    }

    trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first,
                                    v->iova_range.last);
}

/*
 * This function is intended for requests that only need to be applied once.
 * Typically such a request occurs at the beginning of operation, before the
 * queues are set up. It should not be used for a request that keeps operating
 * until all queues are set, which would need to check dev->vq_index_end
 * instead.
 */
static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    return v->index == 0;
}

static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
                                       uint64_t *features)
{
    int ret;

    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
    trace_vhost_vdpa_get_features(dev, *features);
    return ret;
}

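/*
 * Allocate one shadow virtqueue per vhost queue when SVQ mode is enabled,
 * after validating that the device features are supported by SVQ.
 */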
static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                               Error **errp)
{
    g_autoptr(GPtrArray) shadow_vqs = NULL;
    uint64_t dev_features, svq_features;
    int r;
    bool ok;

    if (!v->shadow_vqs_enabled) {
        return 0;
    }

    r = vhost_vdpa_get_dev_features(hdev, &dev_features);
    if (r != 0) {
        error_setg_errno(errp, -r, "Can't get vdpa device features");
        return r;
    }

    svq_features = dev_features;
    ok = vhost_svq_valid_features(svq_features, errp);
    if (unlikely(!ok)) {
        return -1;
    }

    shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
    for (unsigned n = 0; n < hdev->nvqs; ++n) {
        g_autoptr(VhostShadowVirtqueue) svq;

        svq = vhost_svq_new(v->iova_tree, v->shadow_vq_ops,
                            v->shadow_vq_ops_opaque);
        if (unlikely(!svq)) {
            error_setg(errp, "Cannot create svq %u", n);
            return -1;
        }
        g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
    }

    v->shadow_vqs = g_steal_pointer(&shadow_vqs);
    return 0;
}

static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
{
    struct vhost_vdpa *v;
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    trace_vhost_vdpa_init(dev, opaque);
    int ret;

    /*
     * Similar to VFIO, we end up pinning all guest memory and have to
     * disable discarding of RAM.
     */
    ret = ram_block_discard_disable(true);
    if (ret) {
        error_report("Cannot set discarding of RAM broken");
        return ret;
    }

    v = opaque;
    v->dev = dev;
    dev->opaque = opaque;
    v->listener = vhost_vdpa_memory_listener;
    v->msg_type = VHOST_IOTLB_MSG_V2;
    ret = vhost_vdpa_init_svq(dev, v, errp);
    if (ret) {
        goto err;
    }

    vhost_vdpa_get_iova_range(v);

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                               VIRTIO_CONFIG_S_DRIVER);

    return 0;

err:
    ram_block_discard_disable(false);
    return ret;
}

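/*
 * Host notifiers expose the device's per-queue notification (doorbell) pages:
 * each one is mmap()ed from the vhost-vdpa fd at queue_index * page_size and
 * wrapped in a RAM device memory region, so guest notifications can reach the
 * device directly.
 */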
static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
                                            int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;

    n = &v->notifier[queue_index];

    if (n->addr) {
        virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
        object_unparent(OBJECT(&n->mr));
        munmap(n->addr, page_size);
        n->addr = NULL;
    }
}

static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;
    int fd = v->device_fd;
    void *addr;
    char *name;

    vhost_vdpa_host_notifier_uninit(dev, queue_index);

    n = &v->notifier[queue_index];

    addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
                queue_index * page_size);
    if (addr == MAP_FAILED) {
        goto err;
    }

    name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
                           v, queue_index);
    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
                                      page_size, addr);
    g_free(name);

    if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
        object_unparent(OBJECT(&n->mr));
        munmap(addr, page_size);
        goto err;
    }
    n->addr = addr;

    return 0;

err:
    return -1;
}

static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
{
    int i;

    for (i = dev->vq_index; i < dev->vq_index + n; i++) {
        vhost_vdpa_host_notifier_uninit(dev, i);
    }
}

static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int i;

    if (v->shadow_vqs_enabled) {
        /* FIXME SVQ is not compatible with host notifiers mr */
        return;
    }

    for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
        if (vhost_vdpa_host_notifier_init(dev, i)) {
            goto err;
        }
    }

    return;

err:
    vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
    return;
}

static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    size_t idx;

    if (!v->shadow_vqs) {
        return;
    }

    for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
        vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
    }
    g_ptr_array_free(v->shadow_vqs, true);
}

static int vhost_vdpa_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v;
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    v = dev->opaque;
    trace_vhost_vdpa_cleanup(dev, v);
    vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    memory_listener_unregister(&v->listener);
    vhost_vdpa_svq_cleanup(dev);

    dev->opaque = NULL;
    ram_block_discard_disable(false);

    return 0;
}

static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
{
    trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
    return INT_MAX;
}

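/*
 * The memory table is not forwarded to the device: mappings are maintained
 * incrementally by the memory listener through the IOTLB API above. Only the
 * first vhost_dev of the device validates and traces the table here.
 */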
static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
                                    struct vhost_memory *mem)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
        int i;
        for (i = 0; i < mem->nregions; i++) {
            trace_vhost_vdpa_dump_regions(dev, i,
                                          mem->regions[i].guest_phys_addr,
                                          mem->regions[i].memory_size,
                                          mem->regions[i].userspace_addr,
                                          mem->regions[i].flags_padding);
        }
    }
    if (mem->padding) {
        return -EINVAL;
    }

    return 0;
}

static int vhost_vdpa_set_features(struct vhost_dev *dev,
                                   uint64_t features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    if (v->shadow_vqs_enabled) {
        if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
            /*
             * QEMU is just trying to enable or disable logging. SVQ handles
             * this separately, so no need to forward this.
             */
            v->acked_features = features;
            return 0;
        }

        v->acked_features = features;

        /* We must not ack _F_LOG if SVQ is enabled */
        features &= ~BIT_ULL(VHOST_F_LOG_ALL);
    }

    trace_vhost_vdpa_set_features(dev, features);
    ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
    if (ret) {
        return ret;
    }

    return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
}

static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
{
    uint64_t features;
    uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
                 0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
    int r;

    if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
        return -EFAULT;
    }

    features &= f;

    if (vhost_vdpa_first_dev(dev)) {
        r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
        if (r) {
            return -EFAULT;
        }
    }

    dev->backend_cap = features;

    return 0;
}

static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
                                    uint32_t *device_id)
{
    int ret;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
    trace_vhost_vdpa_get_device_id(dev, *device_id);
    return ret;
}

static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
{
    if (!v->shadow_vqs_enabled) {
        return;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        vhost_svq_stop(svq);
    }
}

static int vhost_vdpa_reset_device(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;
    uint8_t status = 0;

    vhost_vdpa_reset_svq(v);

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
    trace_vhost_vdpa_reset_device(dev, status);
    return ret;
}

static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
{
    assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);

    trace_vhost_vdpa_get_vq_index(dev, idx, idx);
    return idx;
}

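/*
 * Enable every virtqueue owned by this vhost_dev with
 * VHOST_VDPA_SET_VRING_ENABLE once the rings have been configured.
 */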
static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
{
    int i;
    trace_vhost_vdpa_set_vring_ready(dev);
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_vring_state state = {
            .index = dev->vq_index + i,
            .num = 1,
        };
        vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
    }
    return 0;
}

static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
                                   uint32_t config_len)
{
    int b, len;
    char line[QEMU_HEXDUMP_LINE_LEN];

    for (b = 0; b < config_len; b += 16) {
        len = config_len - b;
        qemu_hexdump_line(line, b, config, len, false);
        trace_vhost_vdpa_dump_config(dev, line);
    }
}

static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
                                 uint32_t offset, uint32_t size,
                                 uint32_t flags)
{
    struct vhost_vdpa_config *config;
    int ret;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);

    trace_vhost_vdpa_set_config(dev, offset, size, flags);
    config = g_malloc(size + config_size);
    config->off = offset;
    config->len = size;
    memcpy(config->buf, data, size);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, data, size);
    }
    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
    g_free(config);
    return ret;
}

static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
                                 uint32_t config_len, Error **errp)
{
    struct vhost_vdpa_config *v_config;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    int ret;

    trace_vhost_vdpa_get_config(dev, config, config_len);
    v_config = g_malloc(config_len + config_size);
    v_config->len = config_len;
    v_config->off = 0;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
    memcpy(config, v_config->buf, config_len);
    g_free(v_config);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, config, config_len);
    }
    return ret;
}

static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
                                         struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
}

static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
}

static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
}

static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
                                         struct vhost_vring_addr *addr)
{
    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
                                    addr->desc_user_addr, addr->used_user_addr,
                                    addr->avail_user_addr,
                                    addr->log_guest_addr);

    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);

}

/**
 * Set the shadow virtqueue descriptors to the device
 *
 * @dev: The vhost device model
 * @svq: The shadow virtqueue
 * @idx: The index of the virtqueue in the vhost device
 * @errp: Error
 *
 * Note that this function does not rewind the kick file descriptor if it
 * cannot set the call one.
 */
static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
                                  VhostShadowVirtqueue *svq, unsigned idx,
                                  Error **errp)
{
    struct vhost_vring_file file = {
        .index = dev->vq_index + idx,
    };
    const EventNotifier *event_notifier = &svq->hdev_kick;
    int r;

    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_kick(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device kick fd");
        return r;
    }

    event_notifier = &svq->hdev_call;
    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_call(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device call fd");
    }

    return r;
}

/**
 * Unmap a SVQ area in the device
 */
static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
                                      const DMAMap *needle)
{
    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
    hwaddr size;
    int r;

    if (unlikely(!result)) {
        error_report("Unable to find SVQ address to unmap");
        return false;
    }

    size = ROUND_UP(result->size, qemu_real_host_page_size());
    r = vhost_vdpa_dma_unmap(v, result->iova, size);
    return r == 0;
}

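/**
 * Unmap both the descriptor/driver area and the device (used) area of a
 * shadow virtqueue from the device IOVA space.
 */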
static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                       const VhostShadowVirtqueue *svq)
{
    DMAMap needle = {};
    struct vhost_vdpa *v = dev->opaque;
    struct vhost_vring_addr svq_addr;
    bool ok;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    needle.translated_addr = svq_addr.desc_user_addr;
    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
    if (unlikely(!ok)) {
        return false;
    }

    needle.translated_addr = svq_addr.used_user_addr;
    return vhost_vdpa_svq_unmap_ring(v, &needle);
}

/**
 * Map the SVQ area in the device
 *
 * @v: Vhost-vdpa device
 * @needle: The area to map; on success its iova field holds the assigned IOVA
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
                                    Error **errp)
{
    int r;

    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
    if (unlikely(r != IOVA_OK)) {
        error_setg(errp, "Cannot allocate iova (%d)", r);
        return false;
    }

    r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
                           (void *)(uintptr_t)needle->translated_addr,
                           needle->perm == IOMMU_RO);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Cannot map region to device");
        vhost_iova_tree_remove(v->iova_tree, needle);
    }

    return r == 0;
}

/**
 * Map the shadow virtqueue rings in the device
 *
 * @dev: The vhost device
 * @svq: The shadow virtqueue
 * @addr: Assigned IOVA addresses
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                     const VhostShadowVirtqueue *svq,
                                     struct vhost_vring_addr *addr,
                                     Error **errp)
{
    DMAMap device_region, driver_region;
    struct vhost_vring_addr svq_addr;
    struct vhost_vdpa *v = dev->opaque;
    size_t device_size = vhost_svq_device_area_size(svq);
    size_t driver_size = vhost_svq_driver_area_size(svq);
    size_t avail_offset;
    bool ok;

    ERRP_GUARD();
    vhost_svq_get_vring_addr(svq, &svq_addr);

    driver_region = (DMAMap) {
        .translated_addr = svq_addr.desc_user_addr,
        .size = driver_size - 1,
        .perm = IOMMU_RO,
    };
    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq driver region: ");
        return false;
    }
    addr->desc_user_addr = driver_region.iova;
    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
    addr->avail_user_addr = driver_region.iova + avail_offset;

    device_region = (DMAMap) {
        .translated_addr = svq_addr.used_user_addr,
        .size = device_size - 1,
        .perm = IOMMU_RW,
    };
    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq device region: ");
        vhost_vdpa_svq_unmap_ring(v, &driver_region);
    }
    addr->used_user_addr = device_region.iova;

    return ok;
}

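/**
 * Program one shadow virtqueue into the device: set its vring base and wire
 * the SVQ kick/call eventfds to the device queue.
 */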
static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
                                 VhostShadowVirtqueue *svq, unsigned idx,
                                 Error **errp)
{
    uint16_t vq_index = dev->vq_index + idx;
    struct vhost_vring_state s = {
        .index = vq_index,
    };
    int r;

    r = vhost_vdpa_set_dev_vring_base(dev, &s);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set vring base");
        return false;
    }

    r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
    return r == 0;
}

static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    Error *err = NULL;
    unsigned i;

    if (!v->shadow_vqs) {
        return true;
    }

    if (v->migration_blocker) {
        int r = migrate_add_blocker(v->migration_blocker, &err);
        if (unlikely(r < 0)) {
            return false;
        }
    }

    for (i = 0; i < v->shadow_vqs->len; ++i) {
        VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        struct vhost_vring_addr addr = {
            .index = dev->vq_index + i,
        };
        int r;
        bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
        if (unlikely(!ok)) {
            goto err;
        }

        vhost_svq_start(svq, dev->vdev, vq);
        ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
        if (unlikely(!ok)) {
            goto err_map;
        }

        /* Override vring GPA set by vhost subsystem */
        r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
        if (unlikely(r != 0)) {
            error_setg_errno(&err, -r, "Cannot set device address");
            goto err_set_addr;
        }
    }

    return true;

err_set_addr:
    vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));

err_map:
    vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));

err:
    error_reportf_err(err, "Cannot setup SVQ %u: ", i);
    for (unsigned j = 0; j < i; ++j) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
        vhost_vdpa_svq_unmap_rings(dev, svq);
        vhost_svq_stop(svq);
    }

    if (v->migration_blocker) {
        migrate_del_blocker(v->migration_blocker);
    }

    return false;
}

static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    if (!v->shadow_vqs) {
        return true;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
        if (unlikely(!ok)) {
            return false;
        }
    }

    if (v->migration_blocker) {
        migrate_del_blocker(v->migration_blocker);
    }
    return true;
}

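/*
 * Start/stop one vhost_dev of the device. Host notifiers and shadow
 * virtqueues are handled per vhost_dev, while the memory listener and the
 * DRIVER_OK / reset transitions are only performed once, by the last
 * vhost_dev (the one reaching dev->vq_index_end).
 */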
static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
{
    struct vhost_vdpa *v = dev->opaque;
    bool ok;
    trace_vhost_vdpa_dev_start(dev, started);

    if (started) {
        vhost_vdpa_host_notifiers_init(dev);
        ok = vhost_vdpa_svqs_start(dev);
        if (unlikely(!ok)) {
            return -1;
        }
        vhost_vdpa_set_vring_ready(dev);
    } else {
        ok = vhost_vdpa_svqs_stop(dev);
        if (unlikely(!ok)) {
            return -1;
        }
        vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    }

    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
        return 0;
    }

    if (started) {
        memory_listener_register(&v->listener, &address_space_memory);
        return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
    } else {
        vhost_vdpa_reset_device(dev);
        vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                                   VIRTIO_CONFIG_S_DRIVER);
        memory_listener_unregister(&v->listener);

        return 0;
    }
}

static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
                                   struct vhost_log *log)
{
    struct vhost_vdpa *v = dev->opaque;
    if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
                                  log->log);
    return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
}

static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
                                     struct vhost_vring_addr *addr)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /*
         * Device vring addr was set at device start. SVQ base is handled by
         * VirtQueue code.
         */
        return 0;
    }

    return vhost_vdpa_set_vring_dev_addr(dev, addr);
}

static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
                                    struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
}

static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;
    VirtQueue *vq = virtio_get_queue(dev->vdev, ring->index);

    /*
     * vhost-vdpa devices do not support in-flight requests. Set all of them
     * as available.
     *
     * TODO: This is ok for networking, but other kinds of devices might
     * have problems with these retransmissions.
     */
    while (virtqueue_rewind(vq, 1)) {
        continue;
    }
    if (v->shadow_vqs_enabled) {
        /*
         * Device vring base was set at device start. SVQ base is handled by
         * VirtQueue code.
         */
        return 0;
    }

    return vhost_vdpa_set_dev_vring_base(dev, ring);
}

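/*
 * With SVQ enabled the device vring state is private to the shadow virtqueue,
 * so report the guest-visible last avail index tracked by the VirtQueue
 * instead of querying the device.
 */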
static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (v->shadow_vqs_enabled) {
        ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
        return 0;
    }

    ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
    trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
    return ret;
}

static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;
    int vdpa_idx = file->index - dev->vq_index;

    if (v->shadow_vqs_enabled) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
        vhost_svq_set_svq_kick_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_kick(dev, file);
    }
}

static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        int vdpa_idx = file->index - dev->vq_index;
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);

        vhost_svq_set_svq_call_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_call(dev, file);
    }
}

static int vhost_vdpa_get_features(struct vhost_dev *dev,
                                   uint64_t *features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret = vhost_vdpa_get_dev_features(dev, features);

    if (ret == 0 && v->shadow_vqs_enabled) {
        /* Add SVQ logging capabilities */
        *features |= BIT_ULL(VHOST_F_LOG_ALL);
    }

    return ret;
}

static int vhost_vdpa_set_owner(struct vhost_dev *dev)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_owner(dev);
    return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
}

static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
                                  struct vhost_vring_addr *addr,
                                  struct vhost_virtqueue *vq)
{
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
    addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
    addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
    trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
                                 addr->avail_user_addr, addr->used_user_addr);
    return 0;
}

static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
{
    return true;
}

const VhostOps vdpa_ops = {
    .backend_type = VHOST_BACKEND_TYPE_VDPA,
    .vhost_backend_init = vhost_vdpa_init,
    .vhost_backend_cleanup = vhost_vdpa_cleanup,
    .vhost_set_log_base = vhost_vdpa_set_log_base,
    .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
    .vhost_set_vring_num = vhost_vdpa_set_vring_num,
    .vhost_set_vring_base = vhost_vdpa_set_vring_base,
    .vhost_get_vring_base = vhost_vdpa_get_vring_base,
    .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
    .vhost_set_vring_call = vhost_vdpa_set_vring_call,
    .vhost_get_features = vhost_vdpa_get_features,
    .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
    .vhost_set_owner = vhost_vdpa_set_owner,
    .vhost_set_vring_endian = NULL,
    .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
    .vhost_set_mem_table = vhost_vdpa_set_mem_table,
    .vhost_set_features = vhost_vdpa_set_features,
    .vhost_reset_device = vhost_vdpa_reset_device,
    .vhost_get_vq_index = vhost_vdpa_get_vq_index,
    .vhost_get_config = vhost_vdpa_get_config,
    .vhost_set_config = vhost_vdpa_set_config,
    .vhost_requires_shm_log = NULL,
    .vhost_migration_done = NULL,
    .vhost_backend_can_merge = NULL,
    .vhost_net_set_mtu = NULL,
    .vhost_set_iotlb_callback = NULL,
    .vhost_send_device_iotlb_msg = NULL,
    .vhost_dev_start = vhost_vdpa_dev_start,
    .vhost_get_device_id = vhost_vdpa_get_device_id,
    .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
    .vhost_force_iommu = vhost_vdpa_force_iommu,
};