/*
 * vhost-vdpa
 *
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <linux/vhost.h>
#include <linux/vfio.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-backend.h"
#include "hw/virtio/virtio-net.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"
#include "hw/virtio/vhost-vdpa.h"
#include "exec/address-spaces.h"
#include "migration/blocker.h"
#include "qemu/cutils.h"
#include "qemu/main-loop.h"
#include "cpu.h"
#include "trace.h"
#include "qapi/error.h"

/*
 * Return one past the end of the section. Be careful with uint64_t
 * conversions!
 */
static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
{
    Int128 llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));

    return llend;
}

static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
                                                uint64_t iova_min,
                                                uint64_t iova_max)
{
    Int128 llend;

    if ((!memory_region_is_ram(section->mr) &&
         !memory_region_is_iommu(section->mr)) ||
        memory_region_is_protected(section->mr) ||
        /* vhost-vDPA doesn't allow MMIO to be mapped */
        memory_region_is_ram_device(section->mr)) {
        return true;
    }

    if (section->offset_within_address_space < iova_min) {
        error_report("RAM section out of device range (min=0x%" PRIx64
                     ", addr=0x%" HWADDR_PRIx ")",
                     iova_min, section->offset_within_address_space);
        return true;
    }

    llend = vhost_vdpa_section_end(section);
    if (int128_gt(llend, int128_make64(iova_max))) {
        error_report("RAM section out of device range (max=0x%" PRIx64
                     ", end addr=0x%" PRIx64 ")",
                     iova_max, int128_get64(llend));
        return true;
    }

    return false;
}
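
/*
 * DMA helpers: map and unmap are expressed as IOTLB messages (struct
 * vhost_msg_v2 with VHOST_IOTLB_UPDATE / VHOST_IOTLB_INVALIDATE) written
 * directly to the vhost-vdpa device fd.  iova/size describe the
 * device-visible range and uaddr the backing host virtual address.
 */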
int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
                       void *vaddr, bool readonly)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
    msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
    msg.iotlb.type = VHOST_IOTLB_UPDATE;

    trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
                             msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}

int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.type = VHOST_IOTLB_INVALIDATE;

    trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
                               msg.iotlb.size, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}

static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
{
    int fd = v->device_fd;
    struct vhost_msg_v2 msg = {
        .type = v->msg_type,
        .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
    };

    trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }
}

static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
{
    if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
        !v->iotlb_batch_begin_sent) {
        vhost_vdpa_listener_begin_batch(v);
    }

    v->iotlb_batch_begin_sent = true;
}

static void vhost_vdpa_listener_commit(MemoryListener *listener)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    struct vhost_dev *dev = v->dev;
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;

    if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
        return;
    }

    if (!v->iotlb_batch_begin_sent) {
        return;
    }

    msg.type = v->msg_type;
    msg.iotlb.type = VHOST_IOTLB_BATCH_END;

    trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }

    v->iotlb_batch_begin_sent = false;
}
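
/*
 * Note: when the backend advertises VHOST_BACKEND_F_IOTLB_BATCH, the
 * listener callbacks below wrap their updates between a lazily sent
 * VHOST_IOTLB_BATCH_BEGIN (vhost_vdpa_iotlb_batch_begin_once()) and the
 * VHOST_IOTLB_BATCH_END issued from the commit callback above.
 */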
static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    DMAMap mem_region = {};
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    void *vaddr;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);
    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    memory_region_ref(section->mr);

    /* Here we assume that memory_region_is_ram(section->mr)==true */

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
                                         vaddr, section->readonly);

    llsize = int128_sub(llend, int128_make64(iova));
    if (v->shadow_vqs_enabled) {
        int r;

        mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr;
        mem_region.size = int128_get64(llsize) - 1;
        mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly);

        r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
        if (unlikely(r != IOVA_OK)) {
            error_report("Can't allocate a mapping (%d)", r);
            goto fail;
        }

        iova = mem_region.iova;
    }

    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
                             vaddr, section->readonly);
    if (ret) {
        error_report("vhost vdpa map fail!");
        goto fail_map;
    }

    return;

fail_map:
    if (v->shadow_vqs_enabled) {
        vhost_iova_tree_remove(v->iova_tree, mem_region);
    }

fail:
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail.  At runtime, there's not much we can do other
     * than throw a hardware error.
     */
    error_report("vhost-vdpa: DMA mapping failed, unable to continue");
    return;

}

static void vhost_vdpa_listener_region_del(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);

    trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));

    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    llsize = int128_sub(llend, int128_make64(iova));

    if (v->shadow_vqs_enabled) {
        const DMAMap *result;
        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
                            section->offset_within_region +
                            (iova - section->offset_within_address_space);
        DMAMap mem_region = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = int128_get64(llsize) - 1,
        };

        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
        if (!result) {
            /* The region was not mapped by the memory listener */
            return;
        }
        iova = result->iova;
        vhost_iova_tree_remove(v->iova_tree, *result);
    }
    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
    if (ret) {
        error_report("vhost_vdpa dma unmap error!");
    }

    memory_region_unref(section->mr);
}
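
/*
 * region_del mirrors region_add: with shadow virtqueues enabled, the IOVA
 * chosen at map time is recovered by looking up the host virtual address
 * in the IOVA tree, and the tree entry is dropped before the range is
 * unmapped from the device.
 */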
/*
 * IOTLB API is used by vhost-vdpa which requires incremental updating
 * of the mapping. So we cannot use the generic vhost memory listener,
 * which depends on addnop().
 */
static const MemoryListener vhost_vdpa_memory_listener = {
    .name = "vhost-vdpa",
    .commit = vhost_vdpa_listener_commit,
    .region_add = vhost_vdpa_listener_region_add,
    .region_del = vhost_vdpa_listener_region_del,
};

static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
                           void *arg)
{
    struct vhost_vdpa *v = dev->opaque;
    int fd = v->device_fd;
    int ret;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);

    ret = ioctl(fd, request, arg);
    return ret < 0 ? -errno : ret;
}

static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
{
    uint8_t s;
    int ret;

    trace_vhost_vdpa_add_status(dev, status);
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    s |= status;

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    if (!(s & status)) {
        return -EIO;
    }

    return 0;
}

static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v)
{
    int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE,
                              &v->iova_range);
    if (ret != 0) {
        v->iova_range.first = 0;
        v->iova_range.last = UINT64_MAX;
    }

    trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first,
                                    v->iova_range.last);
}

/*
 * The use of this function is for requests that only need to be
 * applied once. Typically such a request occurs at the beginning
 * of operation, before the queues are set up. It should not be
 * used for requests that must wait until all queues are set, which
 * would need to check dev->vq_index_end instead.
 */
static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    return v->index == 0;
}

static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
                                       uint64_t *features)
{
    int ret;

    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
    trace_vhost_vdpa_get_features(dev, *features);
    return ret;
}

static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                               Error **errp)
{
    g_autoptr(GPtrArray) shadow_vqs = NULL;
    uint64_t dev_features, svq_features;
    int r;
    bool ok;

    if (!v->shadow_vqs_enabled) {
        return 0;
    }

    r = vhost_vdpa_get_dev_features(hdev, &dev_features);
    if (r != 0) {
        error_setg_errno(errp, -r, "Can't get vdpa device features");
        return r;
    }

    svq_features = dev_features;
    ok = vhost_svq_valid_features(svq_features, errp);
    if (unlikely(!ok)) {
        return -1;
    }

    shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
    for (unsigned n = 0; n < hdev->nvqs; ++n) {
        VhostShadowVirtqueue *svq;

        svq = vhost_svq_new(v->iova_tree, v->shadow_vq_ops,
                            v->shadow_vq_ops_opaque);
        g_ptr_array_add(shadow_vqs, svq);
    }

    v->shadow_vqs = g_steal_pointer(&shadow_vqs);
    return 0;
}
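
/*
 * Backend init: disable RAM discard (the device may end up pinning all
 * guest memory), create the shadow virtqueues if enabled, query the usable
 * IOVA range and, for the first device only, move the status to
 * ACKNOWLEDGE | DRIVER.  The memory listener is only registered later, at
 * device start.
 */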
static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
{
    struct vhost_vdpa *v;
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    trace_vhost_vdpa_init(dev, opaque);
    int ret;

    /*
     * Similar to VFIO, we end up pinning all guest memory and have to
     * disable discarding of RAM.
     */
    ret = ram_block_discard_disable(true);
    if (ret) {
        error_report("Cannot set discarding of RAM broken");
        return ret;
    }

    v = opaque;
    v->dev = dev;
    dev->opaque = opaque;
    v->listener = vhost_vdpa_memory_listener;
    v->msg_type = VHOST_IOTLB_MSG_V2;
    ret = vhost_vdpa_init_svq(dev, v, errp);
    if (ret) {
        goto err;
    }

    vhost_vdpa_get_iova_range(v);

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                               VIRTIO_CONFIG_S_DRIVER);

    return 0;

err:
    ram_block_discard_disable(false);
    return ret;
}

static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
                                            int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;

    n = &v->notifier[queue_index];

    if (n->addr) {
        virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
        object_unparent(OBJECT(&n->mr));
        munmap(n->addr, page_size);
        n->addr = NULL;
    }
}

/*
 * Expose the device's per-queue notify page (mapped from the vhost-vdpa fd
 * at offset queue_index * page_size) to the guest as a ram_device region,
 * so that kicks go straight to the hardware doorbell.
 */
static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;
    int fd = v->device_fd;
    void *addr;
    char *name;

    vhost_vdpa_host_notifier_uninit(dev, queue_index);

    n = &v->notifier[queue_index];

    addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
                queue_index * page_size);
    if (addr == MAP_FAILED) {
        goto err;
    }

    name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
                           v, queue_index);
    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
                                      page_size, addr);
    g_free(name);

    if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
        object_unparent(OBJECT(&n->mr));
        munmap(addr, page_size);
        goto err;
    }
    n->addr = addr;

    return 0;

err:
    return -1;
}

static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
{
    int i;

    for (i = dev->vq_index; i < dev->vq_index + n; i++) {
        vhost_vdpa_host_notifier_uninit(dev, i);
    }
}

static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int i;

    if (v->shadow_vqs_enabled) {
        /* FIXME SVQ is not compatible with host notifiers mr */
        return;
    }

    for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
        if (vhost_vdpa_host_notifier_init(dev, i)) {
            goto err;
        }
    }

    return;

err:
    vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
    return;
}

static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    size_t idx;

    if (!v->shadow_vqs) {
        return;
    }

    for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
        vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
    }
    g_ptr_array_free(v->shadow_vqs, true);
}
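
/*
 * Backend cleanup undoes init and start in reverse order: host notifiers,
 * memory listener, shadow virtqueues, and finally the RAM discard
 * inhibitor taken in vhost_vdpa_init().
 */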
static int vhost_vdpa_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v;
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    v = dev->opaque;
    trace_vhost_vdpa_cleanup(dev, v);
    vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    memory_listener_unregister(&v->listener);
    vhost_vdpa_svq_cleanup(dev);

    dev->opaque = NULL;
    ram_block_discard_disable(false);

    return 0;
}

static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
{
    trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
    return INT_MAX;
}

static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
                                    struct vhost_memory *mem)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
        int i;
        for (i = 0; i < mem->nregions; i++) {
            trace_vhost_vdpa_dump_regions(dev, i,
                                          mem->regions[i].guest_phys_addr,
                                          mem->regions[i].memory_size,
                                          mem->regions[i].userspace_addr,
                                          mem->regions[i].flags_padding);
        }
    }
    if (mem->padding) {
        return -EINVAL;
    }

    return 0;
}

static int vhost_vdpa_set_features(struct vhost_dev *dev,
                                   uint64_t features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    if (v->shadow_vqs_enabled) {
        if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
            /*
             * QEMU is just trying to enable or disable logging. SVQ handles
             * this separately, so no need to forward this.
             */
            v->acked_features = features;
            return 0;
        }

        v->acked_features = features;

        /* We must not ack _F_LOG if SVQ is enabled */
        features &= ~BIT_ULL(VHOST_F_LOG_ALL);
    }

    trace_vhost_vdpa_set_features(dev, features);
    ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
    if (ret) {
        return ret;
    }

    return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
}

static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
{
    uint64_t features;
    uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
        0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
    int r;

    if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
        return -EFAULT;
    }

    features &= f;

    if (vhost_vdpa_first_dev(dev)) {
        r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
        if (r) {
            return -EFAULT;
        }
    }

    dev->backend_cap = features;

    return 0;
}

static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
                                    uint32_t *device_id)
{
    int ret;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
    trace_vhost_vdpa_get_device_id(dev, *device_id);
    return ret;
}

static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
{
    if (!v->shadow_vqs_enabled) {
        return;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        vhost_svq_stop(svq);
    }
}

static int vhost_vdpa_reset_device(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;
    uint8_t status = 0;

    vhost_vdpa_reset_svq(v);

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
    trace_vhost_vdpa_reset_device(dev, status);
    return ret;
}

static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
{
    assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);

    trace_vhost_vdpa_get_vq_index(dev, idx, idx);
    return idx;
}
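
/*
 * Enable every vring owned by this vhost_dev.  Called from
 * vhost_vdpa_dev_start() once the rings (or their shadow counterparts)
 * have been configured.
 */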
static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
{
    int i;
    trace_vhost_vdpa_set_vring_ready(dev);
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_vring_state state = {
            .index = dev->vq_index + i,
            .num = 1,
        };
        vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
    }
    return 0;
}

static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
                                   uint32_t config_len)
{
    int b, len;
    char line[QEMU_HEXDUMP_LINE_LEN];

    for (b = 0; b < config_len; b += 16) {
        len = config_len - b;
        qemu_hexdump_line(line, b, config, len, false);
        trace_vhost_vdpa_dump_config(dev, line);
    }
}

static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
                                 uint32_t offset, uint32_t size,
                                 uint32_t flags)
{
    struct vhost_vdpa_config *config;
    int ret;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);

    trace_vhost_vdpa_set_config(dev, offset, size, flags);
    config = g_malloc(size + config_size);
    config->off = offset;
    config->len = size;
    memcpy(config->buf, data, size);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, data, size);
    }
    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
    g_free(config);
    return ret;
}

static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
                                 uint32_t config_len, Error **errp)
{
    struct vhost_vdpa_config *v_config;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    int ret;

    trace_vhost_vdpa_get_config(dev, config, config_len);
    v_config = g_malloc(config_len + config_size);
    v_config->len = config_len;
    v_config->off = 0;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
    memcpy(config, v_config->buf, config_len);
    g_free(v_config);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, config, config_len);
    }
    return ret;
}

static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
                                         struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
}

static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
}

static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
}

static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
                                         struct vhost_vring_addr *addr)
{
    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
                                    addr->desc_user_addr, addr->used_user_addr,
                                    addr->avail_user_addr,
                                    addr->log_guest_addr);

    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);

}
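
/*
 * Shadow virtqueue (SVQ) support: the helpers below wire the device's
 * kick/call file descriptors to SVQ-owned event notifiers and map the SVQ
 * rings through the IOVA tree, so the device operates on QEMU-owned vrings
 * and notifiers rather than the guest's.
 */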
/**
 * Set the shadow virtqueue descriptors to the device
 *
 * @dev: The vhost device model
 * @svq: The shadow virtqueue
 * @idx: The index of the virtqueue in the vhost device
 * @errp: Error
 *
 * Note that this function does not rewind the kick file descriptor if it
 * cannot set the call one.
 */
static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
                                  VhostShadowVirtqueue *svq, unsigned idx,
                                  Error **errp)
{
    struct vhost_vring_file file = {
        .index = dev->vq_index + idx,
    };
    const EventNotifier *event_notifier = &svq->hdev_kick;
    int r;

    r = event_notifier_init(&svq->hdev_kick, 0);
    if (r != 0) {
        error_setg_errno(errp, -r, "Couldn't create kick event notifier");
        goto err_init_hdev_kick;
    }

    r = event_notifier_init(&svq->hdev_call, 0);
    if (r != 0) {
        error_setg_errno(errp, -r, "Couldn't create call event notifier");
        goto err_init_hdev_call;
    }

    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_kick(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device kick fd");
        goto err_init_set_dev_fd;
    }

    event_notifier = &svq->hdev_call;
    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_call(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device call fd");
        goto err_init_set_dev_fd;
    }

    return 0;

err_init_set_dev_fd:
    event_notifier_set_handler(&svq->hdev_call, NULL);

err_init_hdev_call:
    event_notifier_cleanup(&svq->hdev_kick);

err_init_hdev_kick:
    return r;
}

/**
 * Unmap a SVQ area in the device
 */
static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr addr)
{
    const DMAMap needle = {
        .translated_addr = addr,
    };
    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, &needle);
    hwaddr size;
    int r;

    if (unlikely(!result)) {
        error_report("Unable to find SVQ address to unmap");
        return;
    }

    size = ROUND_UP(result->size, qemu_real_host_page_size());
    r = vhost_vdpa_dma_unmap(v, result->iova, size);
    if (unlikely(r < 0)) {
        error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r);
        return;
    }

    vhost_iova_tree_remove(v->iova_tree, *result);
}

static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                       const VhostShadowVirtqueue *svq)
{
    struct vhost_vdpa *v = dev->opaque;
    struct vhost_vring_addr svq_addr;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr);

    vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr);
}

/**
 * Map the SVQ area in the device
 *
 * @v: Vhost-vdpa device
 * @needle: The area to map; the allocated iova is returned in it
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
                                    Error **errp)
{
    int r;

    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
    if (unlikely(r != IOVA_OK)) {
        error_setg(errp, "Cannot allocate iova (%d)", r);
        return false;
    }

    r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
                           (void *)(uintptr_t)needle->translated_addr,
                           needle->perm == IOMMU_RO);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Cannot map region to device");
        vhost_iova_tree_remove(v->iova_tree, *needle);
    }

    return r == 0;
}
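
/*
 * Note: DMAMap sizes are inclusive (one less than the length), hence the
 * "needle->size + 1" passed to vhost_vdpa_dma_map() above and the "- 1"
 * when the driver/device regions are built below.
 */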
/**
 * Map the shadow virtqueue rings in the device
 *
 * @dev: The vhost device
 * @svq: The shadow virtqueue
 * @addr: Assigned IOVA addresses
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                     const VhostShadowVirtqueue *svq,
                                     struct vhost_vring_addr *addr,
                                     Error **errp)
{
    ERRP_GUARD();
    DMAMap device_region, driver_region;
    struct vhost_vring_addr svq_addr;
    struct vhost_vdpa *v = dev->opaque;
    size_t device_size = vhost_svq_device_area_size(svq);
    size_t driver_size = vhost_svq_driver_area_size(svq);
    size_t avail_offset;
    bool ok;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    driver_region = (DMAMap) {
        .translated_addr = svq_addr.desc_user_addr,
        .size = driver_size - 1,
        .perm = IOMMU_RO,
    };
    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq driver region: ");
        return false;
    }
    addr->desc_user_addr = driver_region.iova;
    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
    addr->avail_user_addr = driver_region.iova + avail_offset;

    device_region = (DMAMap) {
        .translated_addr = svq_addr.used_user_addr,
        .size = device_size - 1,
        .perm = IOMMU_RW,
    };
    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq device region: ");
        vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr);
    }
    addr->used_user_addr = device_region.iova;

    return ok;
}

static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
                                 VhostShadowVirtqueue *svq, unsigned idx,
                                 Error **errp)
{
    uint16_t vq_index = dev->vq_index + idx;
    struct vhost_vring_state s = {
        .index = vq_index,
    };
    int r;

    r = vhost_vdpa_set_dev_vring_base(dev, &s);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set vring base");
        return false;
    }

    r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
    return r == 0;
}

static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    Error *err = NULL;
    unsigned i;

    if (!v->shadow_vqs_enabled) {
        return true;
    }

    for (i = 0; i < v->shadow_vqs->len; ++i) {
        VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        struct vhost_vring_addr addr = {
            .index = dev->vq_index + i,
        };
        int r;
        bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
        if (unlikely(!ok)) {
            goto err;
        }

        vhost_svq_start(svq, dev->vdev, vq);
        ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
        if (unlikely(!ok)) {
            goto err_map;
        }

        /* Override vring GPA set by vhost subsystem */
        r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
        if (unlikely(r != 0)) {
            error_setg_errno(&err, -r, "Cannot set device address");
            goto err_set_addr;
        }
    }

    return true;

err_set_addr:
    vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));

err_map:
    vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));

err:
    error_reportf_err(err, "Cannot setup SVQ %u: ", i);
    for (unsigned j = 0; j < i; ++j) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
        vhost_vdpa_svq_unmap_rings(dev, svq);
        vhost_svq_stop(svq);
    }

    return false;
}
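
/*
 * Stop path: unmap every SVQ ring from the device and release the kick/call
 * event notifiers created in vhost_vdpa_svq_set_fds().
 */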
static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    if (!v->shadow_vqs_enabled) {
        return;
    }

    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        vhost_vdpa_svq_unmap_rings(dev, svq);

        event_notifier_cleanup(&svq->hdev_kick);
        event_notifier_cleanup(&svq->hdev_call);
    }
}

static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
{
    struct vhost_vdpa *v = dev->opaque;
    bool ok;
    trace_vhost_vdpa_dev_start(dev, started);

    if (started) {
        vhost_vdpa_host_notifiers_init(dev);
        ok = vhost_vdpa_svqs_start(dev);
        if (unlikely(!ok)) {
            return -1;
        }
        vhost_vdpa_set_vring_ready(dev);
    } else {
        vhost_vdpa_svqs_stop(dev);
        vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    }

    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
        return 0;
    }

    if (started) {
        memory_listener_register(&v->listener, &address_space_memory);
        return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
    } else {
        vhost_vdpa_reset_device(dev);
        vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                                   VIRTIO_CONFIG_S_DRIVER);
        memory_listener_unregister(&v->listener);

        return 0;
    }
}

static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
                                   struct vhost_log *log)
{
    struct vhost_vdpa *v = dev->opaque;
    if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
                                  log->log);
    return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
}

static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
                                     struct vhost_vring_addr *addr)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /*
         * Device vring addr was set at device start. SVQ base is handled by
         * VirtQueue code.
         */
        return 0;
    }

    return vhost_vdpa_set_vring_dev_addr(dev, addr);
}

static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
                                    struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
}
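
/*
 * The vring base/kick/call handlers below are where SVQ interposition is
 * visible to generic vhost code: with SVQ enabled, the guest-facing state
 * stays in QEMU and only the shadow virtqueue's own state reaches the
 * device.
 */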
static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;
    VirtQueue *vq = virtio_get_queue(dev->vdev, ring->index);

    /*
     * vhost-vdpa devices do not support in-flight requests. Set all of them
     * as available.
     *
     * TODO: This is ok for networking, but other kinds of devices might
     * have problems with these retransmissions.
     */
    while (virtqueue_rewind(vq, 1)) {
        continue;
    }
    if (v->shadow_vqs_enabled) {
        /*
         * Device vring base was set at device start. SVQ base is handled by
         * VirtQueue code.
         */
        return 0;
    }

    return vhost_vdpa_set_dev_vring_base(dev, ring);
}

static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (v->shadow_vqs_enabled) {
        ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
        return 0;
    }

    ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
    trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
    return ret;
}

static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;
    int vdpa_idx = file->index - dev->vq_index;

    if (v->shadow_vqs_enabled) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
        vhost_svq_set_svq_kick_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_kick(dev, file);
    }
}

static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        int vdpa_idx = file->index - dev->vq_index;
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);

        vhost_svq_set_svq_call_fd(svq, file->fd);
        return 0;
    } else {
        return vhost_vdpa_set_vring_dev_call(dev, file);
    }
}

static int vhost_vdpa_get_features(struct vhost_dev *dev,
                                   uint64_t *features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret = vhost_vdpa_get_dev_features(dev, features);

    if (ret == 0 && v->shadow_vqs_enabled) {
        /* Add SVQ logging capabilities */
        *features |= BIT_ULL(VHOST_F_LOG_ALL);
    }

    return ret;
}

static int vhost_vdpa_set_owner(struct vhost_dev *dev)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_owner(dev);
    return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
}

static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
                    struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
{
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
    addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
    addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
    trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
                                 addr->avail_user_addr, addr->used_user_addr);
    return 0;
}

static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
{
    return true;
}
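
/*
 * Dispatch table registered as VHOST_BACKEND_TYPE_VDPA; generic vhost code
 * calls into this backend through these hooks.
 */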
const VhostOps vdpa_ops = {
    .backend_type = VHOST_BACKEND_TYPE_VDPA,
    .vhost_backend_init = vhost_vdpa_init,
    .vhost_backend_cleanup = vhost_vdpa_cleanup,
    .vhost_set_log_base = vhost_vdpa_set_log_base,
    .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
    .vhost_set_vring_num = vhost_vdpa_set_vring_num,
    .vhost_set_vring_base = vhost_vdpa_set_vring_base,
    .vhost_get_vring_base = vhost_vdpa_get_vring_base,
    .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
    .vhost_set_vring_call = vhost_vdpa_set_vring_call,
    .vhost_get_features = vhost_vdpa_get_features,
    .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
    .vhost_set_owner = vhost_vdpa_set_owner,
    .vhost_set_vring_endian = NULL,
    .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
    .vhost_set_mem_table = vhost_vdpa_set_mem_table,
    .vhost_set_features = vhost_vdpa_set_features,
    .vhost_reset_device = vhost_vdpa_reset_device,
    .vhost_get_vq_index = vhost_vdpa_get_vq_index,
    .vhost_get_config = vhost_vdpa_get_config,
    .vhost_set_config = vhost_vdpa_set_config,
    .vhost_requires_shm_log = NULL,
    .vhost_migration_done = NULL,
    .vhost_backend_can_merge = NULL,
    .vhost_net_set_mtu = NULL,
    .vhost_set_iotlb_callback = NULL,
    .vhost_send_device_iotlb_msg = NULL,
    .vhost_dev_start = vhost_vdpa_dev_start,
    .vhost_get_device_id = vhost_vdpa_get_device_id,
    .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
    .vhost_force_iommu = vhost_vdpa_force_iommu,
};