/*
 * vhost support
 *
 * Copyright Red Hat, Inc. 2010
 *
 * Authors:
 *  Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/virtio/vhost.h"
#include "hw/hw.h"
#include "qemu/atomic.h"
#include "qemu/range.h"
#include "qemu/error-report.h"
#include "qemu/memfd.h"
#include <linux/vhost.h>
#include "exec/address-spaces.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h"
#include "migration/blocker.h"
#include "sysemu/dma.h"
#include "trace.h"

/* enabled until disconnected backend stabilizes */
#define _VHOST_DEBUG 1

#ifdef _VHOST_DEBUG
#define VHOST_OPS_DEBUG(fmt, ...) \
    do { error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
                      strerror(errno), errno); } while (0)
#else
#define VHOST_OPS_DEBUG(fmt, ...) \
    do { } while (0)
#endif

static struct vhost_log *vhost_log;
static struct vhost_log *vhost_log_shm;

static unsigned int used_memslots;
static QLIST_HEAD(, vhost_dev) vhost_devices =
    QLIST_HEAD_INITIALIZER(vhost_devices);

bool vhost_has_free_slot(void)
{
    unsigned int slots_limit = ~0U;
    struct vhost_dev *hdev;

    QLIST_FOREACH(hdev, &vhost_devices, entry) {
        unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
        slots_limit = MIN(slots_limit, r);
    }
    return slots_limit > used_memslots;
}

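/*
 * Note on the dirty log layout (illustrative, assuming the usual definitions
 * in hw/virtio/vhost.h: VHOST_LOG_PAGE is one 4 KiB page and a
 * vhost_log_chunk_t is one machine word of VHOST_LOG_BITS bits, so on a
 * 64-bit host one chunk covers 256 KiB of guest memory):
 *
 *   a write to guest physical address 'gpa' dirties page gpa / VHOST_LOG_PAGE,
 *   i.e. bit (gpa / VHOST_LOG_PAGE) % VHOST_LOG_BITS of chunk
 *   gpa / VHOST_LOG_CHUNK.
 *
 * vhost_dev_sync_region() below walks these chunks and forwards each set
 * bit to memory_region_set_dirty().
 */
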
static void vhost_dev_sync_region(struct vhost_dev *dev,
                                  MemoryRegionSection *section,
                                  uint64_t mfirst, uint64_t mlast,
                                  uint64_t rfirst, uint64_t rlast)
{
    vhost_log_chunk_t *log = dev->log->log;

    uint64_t start = MAX(mfirst, rfirst);
    uint64_t end = MIN(mlast, rlast);
    vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
    vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
    uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);

    if (end < start) {
        return;
    }
    assert(end / VHOST_LOG_CHUNK < dev->log_size);
    assert(start / VHOST_LOG_CHUNK < dev->log_size);

    for (;from < to; ++from) {
        vhost_log_chunk_t log;
        /* We first check with non-atomic: much cheaper,
         * and we expect non-dirty to be the common case. */
        if (!*from) {
            addr += VHOST_LOG_CHUNK;
            continue;
        }
        /* Data must be read atomically. We don't really need barrier semantics
         * but it's easier to use atomic_* than roll our own. */
        log = atomic_xchg(from, 0);
        while (log) {
            int bit = ctzl(log);
            hwaddr page_addr;
            hwaddr section_offset;
            hwaddr mr_offset;
            page_addr = addr + bit * VHOST_LOG_PAGE;
            section_offset = page_addr - section->offset_within_address_space;
            mr_offset = section_offset + section->offset_within_region;
            memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
            log &= ~(0x1ull << bit);
        }
        addr += VHOST_LOG_CHUNK;
    }
}

static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
                                   MemoryRegionSection *section,
                                   hwaddr first,
                                   hwaddr last)
{
    int i;
    hwaddr start_addr;
    hwaddr end_addr;

    if (!dev->log_enabled || !dev->started) {
        return 0;
    }
    start_addr = section->offset_within_address_space;
    end_addr = range_get_last(start_addr, int128_get64(section->size));
    start_addr = MAX(first, start_addr);
    end_addr = MIN(last, end_addr);

    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        vhost_dev_sync_region(dev, section, start_addr, end_addr,
                              reg->guest_phys_addr,
                              range_get_last(reg->guest_phys_addr,
                                             reg->memory_size));
    }
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;
        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
                              range_get_last(vq->used_phys, vq->used_size));
    }
    return 0;
}

static void vhost_log_sync(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
}

static void vhost_log_sync_range(struct vhost_dev *dev,
                                 hwaddr first, hwaddr last)
{
    int i;
    /* FIXME: this is N^2 in number of sections */
    for (i = 0; i < dev->n_mem_sections; ++i) {
        MemoryRegionSection *section = &dev->mem_sections[i];
        vhost_sync_dirty_bitmap(dev, section, first, last);
    }
}

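/*
 * Example of the bookkeeping below (hypothetical addresses, for illustration
 * only): unassigning [0x2000, 0x2fff] from an existing region
 * [0x1000, 0x4fff] splits it into [0x1000, 0x1fff] (the shrunk first part)
 * and [0x3000, 0x4fff] (the shifted second part), so nregions grows by one;
 * unassigning a range that fully covers a region removes it outright.
 */
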
/* Assign/unassign. Keep an unsorted array of non-overlapping
 * memory regions in dev->mem. */
static void vhost_dev_unassign_memory(struct vhost_dev *dev,
                                      uint64_t start_addr,
                                      uint64_t size)
{
    int from, to, n = dev->mem->nregions;
    /* Track overlapping/split regions for sanity checking. */
    int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;

    for (from = 0, to = 0; from < n; ++from, ++to) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        uint64_t reglast;
        uint64_t memlast;
        uint64_t change;

        /* clone old region */
        if (to != from) {
            memcpy(reg, dev->mem->regions + from, sizeof *reg);
        }

        /* No overlap is simple */
        if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
                            start_addr, size)) {
            continue;
        }

        /* Split only happens if supplied region
         * is in the middle of an existing one. Thus it can not
         * overlap with any other existing region. */
        assert(!split);

        reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
        memlast = range_get_last(start_addr, size);

        /* Remove whole region */
        if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
            --dev->mem->nregions;
            --to;
            ++overlap_middle;
            continue;
        }

        /* Shrink region */
        if (memlast >= reglast) {
            reg->memory_size = start_addr - reg->guest_phys_addr;
            assert(reg->memory_size);
            assert(!overlap_end);
            ++overlap_end;
            continue;
        }

        /* Shift region */
        if (start_addr <= reg->guest_phys_addr) {
            change = memlast + 1 - reg->guest_phys_addr;
            reg->memory_size -= change;
            reg->guest_phys_addr += change;
            reg->userspace_addr += change;
            assert(reg->memory_size);
            assert(!overlap_start);
            ++overlap_start;
            continue;
        }

        /* This only happens if supplied region
         * is in the middle of an existing one. Thus it can not
         * overlap with any other existing region. */
        assert(!overlap_start);
        assert(!overlap_end);
        assert(!overlap_middle);
        /* Split region: shrink first part, shift second part. */
        memcpy(dev->mem->regions + n, reg, sizeof *reg);
        reg->memory_size = start_addr - reg->guest_phys_addr;
        assert(reg->memory_size);
        change = memlast + 1 - reg->guest_phys_addr;
        reg = dev->mem->regions + n;
        reg->memory_size -= change;
        assert(reg->memory_size);
        reg->guest_phys_addr += change;
        reg->userspace_addr += change;
        /* Never add more than 1 region */
        assert(dev->mem->nregions == n);
        ++dev->mem->nregions;
        ++split;
    }
}

/* Called after unassign, so no regions overlap the given range. */
static void vhost_dev_assign_memory(struct vhost_dev *dev,
                                    uint64_t start_addr,
                                    uint64_t size,
                                    uint64_t uaddr)
{
    int from, to;
    struct vhost_memory_region *merged = NULL;
    for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        uint64_t prlast, urlast;
        uint64_t pmlast, umlast;
        uint64_t s, e, u;

        /* clone old region */
        if (to != from) {
            memcpy(reg, dev->mem->regions + from, sizeof *reg);
        }
        prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
        pmlast = range_get_last(start_addr, size);
        urlast = range_get_last(reg->userspace_addr, reg->memory_size);
        umlast = range_get_last(uaddr, size);

        /* check for overlapping regions: should never happen. */
        assert(prlast < start_addr || pmlast < reg->guest_phys_addr);
        /* Not an adjacent or overlapping region - do not merge. */
        if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
            (pmlast + 1 != reg->guest_phys_addr ||
             umlast + 1 != reg->userspace_addr)) {
            continue;
        }

        if (dev->vhost_ops->vhost_backend_can_merge &&
            !dev->vhost_ops->vhost_backend_can_merge(dev, uaddr, size,
                                                     reg->userspace_addr,
                                                     reg->memory_size)) {
            continue;
        }

        if (merged) {
            --to;
            assert(to >= 0);
        } else {
            merged = reg;
        }
        u = MIN(uaddr, reg->userspace_addr);
        s = MIN(start_addr, reg->guest_phys_addr);
        e = MAX(pmlast, prlast);
        uaddr = merged->userspace_addr = u;
        start_addr = merged->guest_phys_addr = s;
        size = merged->memory_size = e - s + 1;
        assert(merged->memory_size);
    }

    if (!merged) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        memset(reg, 0, sizeof *reg);
        reg->memory_size = size;
        assert(reg->memory_size);
        reg->guest_phys_addr = start_addr;
        reg->userspace_addr = uaddr;
        ++to;
    }
    assert(to <= dev->mem->nregions + 1);
    dev->mem->nregions = to;
}

static uint64_t vhost_get_log_size(struct vhost_dev *dev)
{
    uint64_t log_size = 0;
    int i;
    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        uint64_t last = range_get_last(reg->guest_phys_addr,
                                       reg->memory_size);
        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
    }
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;
        uint64_t last = vq->used_phys + vq->used_size - 1;
        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
    }
    return log_size;
}

static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
{
    Error *err = NULL;
    struct vhost_log *log;
    uint64_t logsize = size * sizeof(*(log->log));
    int fd = -1;

    log = g_new0(struct vhost_log, 1);
    if (share) {
        log->log = qemu_memfd_alloc("vhost-log", logsize,
                                    F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
                                    &fd, &err);
        if (err) {
            error_report_err(err);
            g_free(log);
            return NULL;
        }
        memset(log->log, 0, logsize);
    } else {
        log->log = g_malloc0(logsize);
    }

    log->size = size;
    log->refcnt = 1;
    log->fd = fd;

    return log;
}

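/*
 * The dirty log is shared process-wide: one plain log and one memfd-backed
 * log (for backends that need a shareable fd) are kept in the globals above
 * and reference counted.  vhost_log_get() reuses the current log when the
 * requested size matches, vhost_log_put() drops a reference and frees the
 * log once the last user is gone, syncing any remaining dirty bits first
 * when asked to.
 */
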
static struct vhost_log *vhost_log_get(uint64_t size, bool share)
{
    struct vhost_log *log = share ? vhost_log_shm : vhost_log;

    if (!log || log->size != size) {
        log = vhost_log_alloc(size, share);
        if (share) {
            vhost_log_shm = log;
        } else {
            vhost_log = log;
        }
    } else {
        ++log->refcnt;
    }

    return log;
}

static void vhost_log_put(struct vhost_dev *dev, bool sync)
{
    struct vhost_log *log = dev->log;

    if (!log) {
        return;
    }

    --log->refcnt;
    if (log->refcnt == 0) {
        /* Sync only the range covered by the old log */
        if (dev->log_size && sync) {
            vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
        }

        if (vhost_log == log) {
            g_free(log->log);
            vhost_log = NULL;
        } else if (vhost_log_shm == log) {
            qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
                            log->fd);
            vhost_log_shm = NULL;
        }

        g_free(log);
    }

    dev->log = NULL;
    dev->log_size = 0;
}

static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
{
    return dev->vhost_ops->vhost_requires_shm_log &&
           dev->vhost_ops->vhost_requires_shm_log(dev);
}

static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
{
    struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
    uint64_t log_base = (uintptr_t)log->log;
    int r;

    /* inform backend of log switching, this must be done before
       releasing the current log, to ensure no logging is lost */
    r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost_set_log_base failed");
    }

    vhost_log_put(dev, true);
    dev->log = log;
    dev->log_size = size;
}

static int vhost_dev_has_iommu(struct vhost_dev *dev)
{
    VirtIODevice *vdev = dev->vdev;

    return virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
}

static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
                              hwaddr *plen, int is_write)
{
    if (!vhost_dev_has_iommu(dev)) {
        return cpu_physical_memory_map(addr, plen, is_write);
    } else {
        return (void *)(uintptr_t)addr;
    }
}

static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
                               hwaddr len, int is_write,
                               hwaddr access_len)
{
    if (!vhost_dev_has_iommu(dev)) {
        cpu_physical_memory_unmap(buffer, len, is_write, access_len);
    }
}

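/*
 * After a memory layout change, the vrings that were set up earlier must
 * still be reachable: each of the descriptor table, available ring and used
 * ring has to stay within a single region and keep the same host address it
 * was mapped at.  The two helpers below check exactly that and are called
 * from vhost_commit() for every region.
 */
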
static int vhost_verify_ring_part_mapping(void *ring_hva,
                                          uint64_t ring_gpa,
                                          uint64_t ring_size,
                                          void *reg_hva,
                                          uint64_t reg_gpa,
                                          uint64_t reg_size)
{
    uint64_t hva_ring_offset;
    uint64_t ring_last = range_get_last(ring_gpa, ring_size);
    uint64_t reg_last = range_get_last(reg_gpa, reg_size);

    if (ring_last < reg_gpa || ring_gpa > reg_last) {
        return 0;
    }
    /* check that the whole ring is mapped */
    if (ring_last > reg_last) {
        return -ENOMEM;
    }
    /* check that ring's MemoryRegion wasn't replaced */
    hva_ring_offset = ring_gpa - reg_gpa;
    if (ring_hva != reg_hva + hva_ring_offset) {
        return -EBUSY;
    }

    return 0;
}

static int vhost_verify_ring_mappings(struct vhost_dev *dev,
                                      void *reg_hva,
                                      uint64_t reg_gpa,
                                      uint64_t reg_size)
{
    int i, j;
    int r = 0;
    const char *part_name[] = {
        "descriptor table",
        "available ring",
        "used ring"
    };

    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;

        j = 0;
        r = vhost_verify_ring_part_mapping(
                vq->desc, vq->desc_phys, vq->desc_size,
                reg_hva, reg_gpa, reg_size);
        if (r) {
            break;
        }

        j++;
        r = vhost_verify_ring_part_mapping(
                vq->avail, vq->avail_phys, vq->avail_size,
                reg_hva, reg_gpa, reg_size);
        if (r) {
            break;
        }

        j++;
        r = vhost_verify_ring_part_mapping(
                vq->used, vq->used_phys, vq->used_size,
                reg_hva, reg_gpa, reg_size);
        if (r) {
            break;
        }
    }

    if (r == -ENOMEM) {
        error_report("Unable to map %s for ring %d", part_name[j], i);
    } else if (r == -EBUSY) {
        error_report("%s relocated for ring %d", part_name[j], i);
    }
    return r;
}

static struct vhost_memory_region *vhost_dev_find_reg(struct vhost_dev *dev,
                                                      uint64_t start_addr,
                                                      uint64_t size)
{
    int i, n = dev->mem->nregions;
    for (i = 0; i < n; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        if (ranges_overlap(reg->guest_phys_addr, reg->memory_size,
                           start_addr, size)) {
            return reg;
        }
    }
    return NULL;
}

static bool vhost_dev_cmp_memory(struct vhost_dev *dev,
                                 uint64_t start_addr,
                                 uint64_t size,
                                 uint64_t uaddr)
{
    struct vhost_memory_region *reg = vhost_dev_find_reg(dev, start_addr, size);
    uint64_t reglast;
    uint64_t memlast;

    if (!reg) {
        return true;
    }

    reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
    memlast = range_get_last(start_addr, size);

    /* Need to extend region? */
    if (start_addr < reg->guest_phys_addr || memlast > reglast) {
        return true;
    }
    /* userspace_addr changed? */
    return uaddr != reg->userspace_addr + start_addr - reg->guest_phys_addr;
}

static void vhost_set_memory(MemoryListener *listener,
                             MemoryRegionSection *section,
                             bool add)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    hwaddr start_addr = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    bool log_dirty =
        memory_region_get_dirty_log_mask(section->mr) & ~(1 << DIRTY_MEMORY_MIGRATION);
    int s = offsetof(struct vhost_memory, regions) +
        (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
    void *ram;

    dev->mem = g_realloc(dev->mem, s);

    if (log_dirty) {
        add = false;
    }

    assert(size);

    /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
    ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
    if (add) {
        if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
            /* Region exists with same address. Nothing to do. */
            return;
        }
    } else {
        if (!vhost_dev_find_reg(dev, start_addr, size)) {
            /* Removing region that we don't access. Nothing to do. */
            return;
        }
    }

    vhost_dev_unassign_memory(dev, start_addr, size);
    if (add) {
        /* Add given mapping, merging adjacent regions if any */
        vhost_dev_assign_memory(dev, start_addr, size, (uintptr_t)ram);
    } else {
        /* Remove old mapping for this memory, if any. */
        vhost_dev_unassign_memory(dev, start_addr, size);
    }
    dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr);
    dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1);
    dev->memory_changed = true;
    used_memslots = dev->mem->nregions;
}

static bool vhost_section(MemoryRegionSection *section)
{
    return memory_region_is_ram(section->mr) &&
        !memory_region_is_rom(section->mr);
}

static void vhost_begin(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    dev->mem_changed_end_addr = 0;
    dev->mem_changed_start_addr = -1;
    dev->tmp_sections = NULL;
    dev->n_tmp_sections = 0;
}

static void vhost_commit(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    MemoryRegionSection *old_sections;
    int n_old_sections;
    uint64_t log_size;
    size_t regions_size;
    int r;
    int i;
    bool changed = false;

    /* Note we can be called before the device is started, but then
     * starting the device calls set_mem_table, so we need to have
     * built the data structures.
     */
    old_sections = dev->mem_sections;
    n_old_sections = dev->n_mem_sections;
    dev->mem_sections = dev->tmp_sections;
    dev->n_mem_sections = dev->n_tmp_sections;

    if (dev->n_mem_sections != n_old_sections) {
        changed = true;
    } else {
        /* Same size, let's check the contents */
        changed = n_old_sections && memcmp(dev->mem_sections, old_sections,
                         n_old_sections * sizeof(old_sections[0])) != 0;
    }

    trace_vhost_commit(dev->started, changed);
    if (!changed) {
        goto out;
    }

    /* Rebuild the regions list from the new sections list */
    regions_size = offsetof(struct vhost_memory, regions) +
                       dev->n_mem_sections * sizeof dev->mem->regions[0];
    dev->mem = g_realloc(dev->mem, regions_size);
    dev->mem->nregions = dev->n_mem_sections;
    used_memslots = dev->mem->nregions;
    for (i = 0; i < dev->n_mem_sections; i++) {
        struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
        struct MemoryRegionSection *mrs = dev->mem_sections + i;

        cur_vmr->guest_phys_addr = mrs->offset_within_address_space;
        cur_vmr->memory_size = int128_get64(mrs->size);
        cur_vmr->userspace_addr =
            (uintptr_t)memory_region_get_ram_ptr(mrs->mr) +
            mrs->offset_within_region;
        cur_vmr->flags_padding = 0;
    }

    if (!dev->started) {
        goto out;
    }

    for (i = 0; i < dev->mem->nregions; i++) {
        if (vhost_verify_ring_mappings(dev,
                       (void *)(uintptr_t)dev->mem->regions[i].userspace_addr,
                       dev->mem->regions[i].guest_phys_addr,
                       dev->mem->regions[i].memory_size)) {
            error_report("Verify ring failure on region %d", i);
            abort();
        }
    }

    if (!dev->log_enabled) {
        r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
        if (r < 0) {
            VHOST_OPS_DEBUG("vhost_set_mem_table failed");
        }
        dev->memory_changed = false;
        goto out;
    }
    log_size = vhost_get_log_size(dev);
    /* We allocate an extra 4K bytes to log,
     * to reduce the number of reallocations. */
#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
    /* To log more, must increase log size before table update. */
    if (dev->log_size < log_size) {
        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
    }
    r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost_set_mem_table failed");
    }
    /* To log less, can only decrease log size after table update. */
    if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
        vhost_dev_log_resize(dev, log_size);
    }
    dev->memory_changed = false;

out:
    /* Deref the old list of sections, this must happen _after_ the
     * vhost_set_mem_table to ensure the client isn't still using the
     * section we're about to unref.
     */
    while (n_old_sections--) {
        memory_region_unref(old_sections[n_old_sections].mr);
    }
    g_free(old_sections);
    return;
}

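/*
 * Memory layout updates arrive as a transaction: vhost_begin() resets the
 * temporary section list, vhost_region_add()/vhost_region_nop() append RAM
 * sections to it (merging abutting ones), and vhost_commit() compares the
 * result with the previous list, only pushing a new memory table to the
 * backend when something actually changed.
 */
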
/* Adds the section data to the tmp_section structure.
 * It relies on the listener calling us in memory address order
 * and for each region (via the _add and _nop methods) to
 * join neighbours.
 */
static void vhost_region_add_section(struct vhost_dev *dev,
                                     MemoryRegionSection *section)
{
    bool need_add = true;
    uint64_t mrs_size = int128_get64(section->size);
    uint64_t mrs_gpa = section->offset_within_address_space;
    uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
                         section->offset_within_region;

    trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
                                   mrs_host);

    bool log_dirty = memory_region_get_dirty_log_mask(section->mr) &
        ~(1 << DIRTY_MEMORY_MIGRATION);
    if (log_dirty) {
        return;
    }

    if (dev->n_tmp_sections) {
        /* Since we already have at least one section, let's see if
         * this extends it; since we're scanning in order, we only
         * have to look at the last one, and the FlatView that calls
         * us shouldn't have overlaps.
         */
        MemoryRegionSection *prev_sec = dev->tmp_sections +
                                        (dev->n_tmp_sections - 1);
        uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
        uint64_t prev_size = int128_get64(prev_sec->size);
        uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size);
        uint64_t prev_host_start =
            (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
            prev_sec->offset_within_region;
        uint64_t prev_host_end = range_get_last(prev_host_start, prev_size);

        if (prev_gpa_end + 1 == mrs_gpa &&
            prev_host_end + 1 == mrs_host &&
            section->mr == prev_sec->mr &&
            (!dev->vhost_ops->vhost_backend_can_merge ||
                dev->vhost_ops->vhost_backend_can_merge(dev,
                    mrs_host, mrs_size,
                    prev_host_start, prev_size))) {
            /* The two sections abut */
            need_add = false;
            prev_sec->size = int128_add(prev_sec->size, section->size);
            trace_vhost_region_add_section_abut(section->mr->name,
                                                mrs_size + prev_size);
        }
    }

    if (need_add) {
        ++dev->n_tmp_sections;
        dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections,
                                    dev->n_tmp_sections);
        dev->tmp_sections[dev->n_tmp_sections - 1] = *section;
        /* The flatview isn't stable and we don't use it, making it NULL
         * means we can memcmp the list.
         */
        dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL;
        memory_region_ref(section->mr);
    }
}

static void vhost_region_add(MemoryListener *listener,
                             MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);

    if (!vhost_section(section)) {
        return;
    }
    vhost_region_add_section(dev, section);

    vhost_set_memory(listener, section, true);
}

/* Called on regions that have not changed */
static void vhost_region_nop(MemoryListener *listener,
                             MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);

    if (!vhost_section(section)) {
        return;
    }

    vhost_region_add_section(dev, section);
}

static void vhost_region_del(MemoryListener *listener,
                             MemoryRegionSection *section)
{
    if (!vhost_section(section)) {
        return;
    }

    vhost_set_memory(listener, section, false);
}

static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
    struct vhost_dev *hdev = iommu->hdev;
    hwaddr iova = iotlb->iova + iommu->iommu_offset;

    if (vhost_backend_invalidate_device_iotlb(hdev, iova,
                                              iotlb->addr_mask + 1)) {
        error_report("Fail to invalidate device iotlb");
    }
}

static void vhost_iommu_region_add(MemoryListener *listener,
                                   MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         iommu_listener);
    struct vhost_iommu *iommu;
    Int128 end;

    if (!memory_region_is_iommu(section->mr)) {
        return;
    }

    iommu = g_malloc0(sizeof(*iommu));
    end = int128_add(int128_make64(section->offset_within_region),
                     section->size);
    end = int128_sub(end, int128_one());
    iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
                        IOMMU_NOTIFIER_UNMAP,
                        section->offset_within_region,
                        int128_get64(end));
    iommu->mr = section->mr;
    iommu->iommu_offset = section->offset_within_address_space -
                          section->offset_within_region;
    iommu->hdev = dev;
    memory_region_register_iommu_notifier(section->mr, &iommu->n);
    QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
    /* TODO: can replay help performance here? */
}

static void vhost_iommu_region_del(MemoryListener *listener,
                                   MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         iommu_listener);
    struct vhost_iommu *iommu;

    if (!memory_region_is_iommu(section->mr)) {
        return;
    }

    QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
        if (iommu->mr == section->mr &&
            iommu->n.start == section->offset_within_region) {
            memory_region_unregister_iommu_notifier(iommu->mr,
                                                    &iommu->n);
            QLIST_REMOVE(iommu, iommu_next);
            g_free(iommu);
            break;
        }
    }
}

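/*
 * Hand the backend the userspace addresses of a ring.  log_guest_addr is the
 * guest physical address of the used ring; as the vhost backends interpret
 * it, when VHOST_VRING_F_LOG is set their used-ring writes are logged
 * against that address so that they show up in the dirty bitmap during
 * migration.
 */
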
static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
                                    struct vhost_virtqueue *vq,
                                    unsigned idx, bool enable_log)
{
    struct vhost_vring_addr addr = {
        .index = idx,
        .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
        .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
        .used_user_addr = (uint64_t)(unsigned long)vq->used,
        .log_guest_addr = vq->used_phys,
        .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
    };
    int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost_set_vring_addr failed");
        return -errno;
    }
    return 0;
}

static int vhost_dev_set_features(struct vhost_dev *dev,
                                  bool enable_log)
{
    uint64_t features = dev->acked_features;
    int r;
    if (enable_log) {
        features |= 0x1ULL << VHOST_F_LOG_ALL;
    }
    r = dev->vhost_ops->vhost_set_features(dev, features);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost_set_features failed");
    }
    return r < 0 ? -errno : 0;
}

static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
{
    int r, i, idx;
    r = vhost_dev_set_features(dev, enable_log);
    if (r < 0) {
        goto err_features;
    }
    for (i = 0; i < dev->nvqs; ++i) {
        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
                                     enable_log);
        if (r < 0) {
            goto err_vq;
        }
    }
    return 0;
err_vq:
    for (; i >= 0; --i) {
        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
        vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
                                 dev->log_enabled);
    }
    vhost_dev_set_features(dev, dev->log_enabled);
err_features:
    return r;
}

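/*
 * Called by the global dirty-log listeners below when migration starts or
 * stops.  Enabling goes: allocate/resize the log, then re-negotiate
 * VHOST_F_LOG_ALL and re-send the ring addresses with VHOST_VRING_F_LOG set;
 * disabling reverses that and drops this device's reference on the log.
 */
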
static int vhost_migration_log(MemoryListener *listener, int enable)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    int r;
    if (!!enable == dev->log_enabled) {
        return 0;
    }
    if (!dev->started) {
        dev->log_enabled = enable;
        return 0;
    }
    if (!enable) {
        r = vhost_dev_set_log(dev, false);
        if (r < 0) {
            return r;
        }
        vhost_log_put(dev, false);
    } else {
        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
        r = vhost_dev_set_log(dev, true);
        if (r < 0) {
            return r;
        }
    }
    dev->log_enabled = enable;
    return 0;
}

static void vhost_log_global_start(MemoryListener *listener)
{
    int r;

    r = vhost_migration_log(listener, true);
    if (r < 0) {
        abort();
    }
}

static void vhost_log_global_stop(MemoryListener *listener)
{
    int r;

    r = vhost_migration_log(listener, false);
    if (r < 0) {
        abort();
    }
}

static void vhost_log_start(MemoryListener *listener,
                            MemoryRegionSection *section,
                            int old, int new)
{
    /* FIXME: implement */
}

static void vhost_log_stop(MemoryListener *listener,
                           MemoryRegionSection *section,
                           int old, int new)
{
    /* FIXME: implement */
}

/* The vhost driver natively knows how to handle the vrings of non
 * cross-endian legacy devices and modern devices. Only legacy devices
 * exposed to a bi-endian guest may require the vhost driver to use a
 * specific endianness.
 */
static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
{
    if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
        return false;
    }
#ifdef HOST_WORDS_BIGENDIAN
    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
#else
    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
#endif
}

static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
                                                   bool is_big_endian,
                                                   int vhost_vq_index)
{
    struct vhost_vring_state s = {
        .index = vhost_vq_index,
        .num = is_big_endian
    };

    if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
        return 0;
    }

    VHOST_OPS_DEBUG("vhost_set_vring_endian failed");
    if (errno == ENOTTY) {
        error_report("vhost does not support cross-endian");
        return -ENOSYS;
    }

    return -errno;
}

static int vhost_memory_region_lookup(struct vhost_dev *hdev,
                                      uint64_t gpa, uint64_t *uaddr,
                                      uint64_t *len)
{
    int i;

    for (i = 0; i < hdev->mem->nregions; i++) {
        struct vhost_memory_region *reg = hdev->mem->regions + i;

        if (gpa >= reg->guest_phys_addr &&
            reg->guest_phys_addr + reg->memory_size > gpa) {
            *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
            *len = reg->guest_phys_addr + reg->memory_size - gpa;
            return 0;
        }
    }

    return -EFAULT;
}

int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
{
    IOMMUTLBEntry iotlb;
    uint64_t uaddr, len;
    int ret = -EFAULT;

    rcu_read_lock();

    iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
                                          iova, write);
    if (iotlb.target_as != NULL) {
        ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
                                         &uaddr, &len);
        if (ret) {
            error_report("Fail to lookup the translated address "
                         "%"PRIx64, iotlb.translated_addr);
            goto out;
        }

        len = MIN(iotlb.addr_mask + 1, len);
        iova = iova & ~iotlb.addr_mask;

        ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
                                                len, iotlb.perm);
        if (ret) {
            error_report("Fail to update device iotlb");
            goto out;
        }
    }
out:
    rcu_read_unlock();

    return ret;
}

static int vhost_virtqueue_start(struct vhost_dev *dev,
                                 struct VirtIODevice *vdev,
                                 struct vhost_virtqueue *vq,
                                 unsigned idx)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    hwaddr s, l, a;
    int r;
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_file file = {
        .index = vhost_vq_index
    };
    struct vhost_vring_state state = {
        .index = vhost_vq_index
    };
    struct VirtQueue *vvq = virtio_get_queue(vdev, idx);

    vq->num = state.num = virtio_queue_get_num(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG("vhost_set_vring_num failed");
        return -errno;
    }

    state.num = virtio_queue_get_last_avail_idx(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG("vhost_set_vring_base failed");
        return -errno;
    }

    if (vhost_needs_vring_endian(vdev)) {
        r = vhost_virtqueue_set_vring_endian_legacy(dev,
                                                    virtio_is_big_endian(vdev),
                                                    vhost_vq_index);
        if (r) {
            return -errno;
        }
    }

    vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
    vq->desc_phys = a = virtio_queue_get_desc_addr(vdev, idx);
    vq->desc = vhost_memory_map(dev, a, &l, 0);
    if (!vq->desc || l != s) {
        r = -ENOMEM;
        goto fail_alloc_desc;
    }
    vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
    vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
    vq->avail = vhost_memory_map(dev, a, &l, 0);
    if (!vq->avail || l != s) {
        r = -ENOMEM;
        goto fail_alloc_avail;
    }
    vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
    vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
    vq->used = vhost_memory_map(dev, a, &l, 1);
    if (!vq->used || l != s) {
        r = -ENOMEM;
        goto fail_alloc_used;
    }

    r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
    if (r < 0) {
        r = -errno;
        goto fail_alloc;
    }

    file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
    if (r) {
        VHOST_OPS_DEBUG("vhost_set_vring_kick failed");
        r = -errno;
        goto fail_kick;
    }

    /* Clear and discard previous events if any. */
    event_notifier_test_and_clear(&vq->masked_notifier);

    /* Init vring in unmasked state, unless guest_notifier_mask
     * will do it later.
     */
    if (!vdev->use_guest_notifier_mask) {
        /* TODO: check and handle errors. */
        vhost_virtqueue_mask(dev, vdev, idx, false);
    }

    if (k->query_guest_notifiers &&
        k->query_guest_notifiers(qbus->parent) &&
        virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
        file.fd = -1;
        r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
        if (r) {
            goto fail_vector;
        }
    }

    return 0;

fail_vector:
fail_kick:
fail_alloc:
    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
                       0, 0);
fail_alloc_used:
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
                       0, 0);
fail_alloc_avail:
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
                       0, 0);
fail_alloc_desc:
    return r;
}

static void vhost_virtqueue_stop(struct vhost_dev *dev,
                                 struct VirtIODevice *vdev,
                                 struct vhost_virtqueue *vq,
                                 unsigned idx)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,
    };
    int r;

    r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost VQ %d ring restore failed: %d", idx, r);
        /* Connection to the backend is broken, so let's sync internal
         * last avail idx to the device used idx.
         */
        virtio_queue_restore_last_avail_idx(vdev, idx);
    } else {
        virtio_queue_set_last_avail_idx(vdev, idx, state.num);
    }
    virtio_queue_invalidate_signalled_used(vdev, idx);
    virtio_queue_update_used_idx(vdev, idx);

    /* In the cross-endian case, we need to reset the vring endianness to
     * native as legacy devices expect so by default.
     */
    if (vhost_needs_vring_endian(vdev)) {
        vhost_virtqueue_set_vring_endian_legacy(dev,
                                                !virtio_is_big_endian(vdev),
                                                vhost_vq_index);
    }

    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
                       1, virtio_queue_get_used_size(vdev, idx));
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
                       0, virtio_queue_get_avail_size(vdev, idx));
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
                       0, virtio_queue_get_desc_size(vdev, idx));
}

static void vhost_eventfd_add(MemoryListener *listener,
                              MemoryRegionSection *section,
                              bool match_data, uint64_t data, EventNotifier *e)
{
}

static void vhost_eventfd_del(MemoryListener *listener,
                              MemoryRegionSection *section,
                              bool match_data, uint64_t data, EventNotifier *e)
{
}

static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
                                                int n, uint32_t timeout)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,
        .num = timeout,
    };
    int r;

    if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
        return -EINVAL;
    }

    r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG("vhost_set_vring_busyloop_timeout failed");
        return r;
    }

    return 0;
}

static int vhost_virtqueue_init(struct vhost_dev *dev,
                                struct vhost_virtqueue *vq, int n)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_file file = {
        .index = vhost_vq_index,
    };
    int r = event_notifier_init(&vq->masked_notifier, 0);
    if (r < 0) {
        return r;
    }

    file.fd = event_notifier_get_fd(&vq->masked_notifier);
    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
    if (r) {
        VHOST_OPS_DEBUG("vhost_set_vring_call failed");
        r = -errno;
        goto fail_call;
    }

    vq->dev = dev;

    return 0;
fail_call:
    event_notifier_cleanup(&vq->masked_notifier);
    return r;
}

static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
{
    event_notifier_cleanup(&vq->masked_notifier);
}

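/*
 * Rough life cycle expected by callers such as the vhost_net and vhost-user
 * device code (illustrative sketch, not a new API):
 *
 *   vhost_dev_init(hdev, opaque, backend_type, busyloop_timeout);
 *   vhost_dev_enable_notifiers(hdev, vdev);
 *   vhost_dev_start(hdev, vdev);
 *   ...
 *   vhost_dev_stop(hdev, vdev);
 *   vhost_dev_disable_notifiers(hdev, vdev);
 *   vhost_dev_cleanup(hdev);
 */
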
int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                   VhostBackendType backend_type, uint32_t busyloop_timeout)
{
    uint64_t features;
    int i, r, n_initialized_vqs = 0;
    Error *local_err = NULL;

    hdev->vdev = NULL;
    hdev->migration_blocker = NULL;

    r = vhost_set_backend_type(hdev, backend_type);
    assert(r >= 0);

    r = hdev->vhost_ops->vhost_backend_init(hdev, opaque);
    if (r < 0) {
        goto fail;
    }

    if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
        error_report("vhost backend memory slots limit is less"
                " than current number of present memory slots");
        r = -1;
        goto fail;
    }

    r = hdev->vhost_ops->vhost_set_owner(hdev);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost_set_owner failed");
        goto fail;
    }

    r = hdev->vhost_ops->vhost_get_features(hdev, &features);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost_get_features failed");
        goto fail;
    }

    for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
        r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
        if (r < 0) {
            goto fail;
        }
    }

    if (busyloop_timeout) {
        for (i = 0; i < hdev->nvqs; ++i) {
            r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
                                                     busyloop_timeout);
            if (r < 0) {
                goto fail_busyloop;
            }
        }
    }

    hdev->features = features;

    hdev->memory_listener = (MemoryListener) {
        .begin = vhost_begin,
        .commit = vhost_commit,
        .region_add = vhost_region_add,
        .region_del = vhost_region_del,
        .region_nop = vhost_region_nop,
        .log_start = vhost_log_start,
        .log_stop = vhost_log_stop,
        .log_sync = vhost_log_sync,
        .log_global_start = vhost_log_global_start,
        .log_global_stop = vhost_log_global_stop,
        .eventfd_add = vhost_eventfd_add,
        .eventfd_del = vhost_eventfd_del,
        .priority = 10
    };

    hdev->iommu_listener = (MemoryListener) {
        .region_add = vhost_iommu_region_add,
        .region_del = vhost_iommu_region_del,
    };

    if (hdev->migration_blocker == NULL) {
        if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
        } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_check()) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: failed to allocate shared memory");
        }
    }

    if (hdev->migration_blocker != NULL) {
        r = migrate_add_blocker(hdev->migration_blocker, &local_err);
        if (local_err) {
            error_report_err(local_err);
            error_free(hdev->migration_blocker);
            goto fail_busyloop;
        }
    }

    hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
    hdev->n_mem_sections = 0;
    hdev->mem_sections = NULL;
    hdev->log = NULL;
    hdev->log_size = 0;
    hdev->log_enabled = false;
    hdev->started = false;
    hdev->memory_changed = false;
    memory_listener_register(&hdev->memory_listener, &address_space_memory);
    QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
    return 0;

fail_busyloop:
    while (--i >= 0) {
        vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
    }
fail:
    hdev->nvqs = n_initialized_vqs;
    vhost_dev_cleanup(hdev);
    return r;
}

void vhost_dev_cleanup(struct vhost_dev *hdev)
{
    int i;

    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_cleanup(hdev->vqs + i);
    }
    if (hdev->mem) {
        /* those are only safe after successful init */
        memory_listener_unregister(&hdev->memory_listener);
        QLIST_REMOVE(hdev, entry);
    }
    if (hdev->migration_blocker) {
        migrate_del_blocker(hdev->migration_blocker);
        error_free(hdev->migration_blocker);
    }
    g_free(hdev->mem);
    g_free(hdev->mem_sections);
    if (hdev->vhost_ops) {
        hdev->vhost_ops->vhost_backend_cleanup(hdev);
    }
    assert(!hdev->log);

    memset(hdev, 0, sizeof(struct vhost_dev));
}

/* Stop processing guest IO notifications in qemu.
 * Start processing them in vhost in kernel.
 */
int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    int i, r, e;

    /* We will pass the notifiers to the kernel, make sure that QEMU
     * doesn't interfere.
     */
    r = virtio_device_grab_ioeventfd(vdev);
    if (r < 0) {
        error_report("binding does not support host notifiers");
        goto fail;
    }

    for (i = 0; i < hdev->nvqs; ++i) {
        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                         true);
        if (r < 0) {
            error_report("vhost VQ %d notifier binding failed: %d", i, -r);
            goto fail_vq;
        }
    }

    return 0;
fail_vq:
    while (--i >= 0) {
        e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                         false);
        if (e < 0) {
            error_report("vhost VQ %d notifier cleanup error: %d", i, -e);
        }
        assert(e >= 0);
        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
    }
    virtio_device_release_ioeventfd(vdev);
fail:
    return r;
}

/* Stop processing guest IO notifications in vhost.
 * Start processing them in qemu.
 * This might actually run the qemu handlers right away,
 * so virtio in qemu must be completely setup when this is called.
 */
void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    int i, r;

    for (i = 0; i < hdev->nvqs; ++i) {
        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                         false);
        if (r < 0) {
            error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
        }
        assert(r >= 0);
        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
    }
    virtio_device_release_ioeventfd(vdev);
}

/* Test and clear event pending status.
 * Should be called after unmask to avoid losing events.
 */
bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
{
    struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
    return event_notifier_test_and_clear(&vq->masked_notifier);
}

/* Mask/unmask events from this vq. */
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
                          bool mask)
{
    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
    int r, index = n - hdev->vq_index;
    struct vhost_vring_file file;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    if (mask) {
        assert(vdev->use_guest_notifier_mask);
        file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
    } else {
        file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
    }

    file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
    r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost_set_vring_call failed");
    }
}

uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
                            uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (!(hdev->features & bit_mask)) {
            features &= ~bit_mask;
        }
        bit++;
    }
    return features;
}

void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
                        uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (features & bit_mask) {
            hdev->acked_features |= bit_mask;
        }
        bit++;
    }
}

int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
                         uint32_t config_len)
{
    assert(hdev->vhost_ops);

    if (hdev->vhost_ops->vhost_get_config) {
        return hdev->vhost_ops->vhost_get_config(hdev, config, config_len);
    }

    return -1;
}

int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
                         uint32_t offset, uint32_t size, uint32_t flags)
{
    assert(hdev->vhost_ops);

    if (hdev->vhost_ops->vhost_set_config) {
        return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
                                                 size, flags);
    }

    return -1;
}

void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
                                   const VhostDevConfigOps *ops)
{
    assert(hdev->vhost_ops);
    hdev->config_ops = ops;
}

/* Host notifiers must be enabled at this point. */
int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int i, r;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    hdev->started = true;
    hdev->vdev = vdev;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        goto fail_features;
    }

    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
    }

    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost_set_mem_table failed");
        r = -errno;
        goto fail_mem;
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_start(hdev,
                                  vdev,
                                  hdev->vqs + i,
                                  hdev->vq_index + i);
        if (r < 0) {
            goto fail_vq;
        }
    }

    if (hdev->log_enabled) {
        uint64_t log_base;

        hdev->log_size = vhost_get_log_size(hdev);
        hdev->log = vhost_log_get(hdev->log_size,
                                  vhost_dev_log_is_shared(hdev));
        log_base = (uintptr_t)hdev->log->log;
        r = hdev->vhost_ops->vhost_set_log_base(hdev,
                                                hdev->log_size ? log_base : 0,
                                                hdev->log);
        if (r < 0) {
            VHOST_OPS_DEBUG("vhost_set_log_base failed");
            r = -errno;
            goto fail_log;
        }
    }

    if (vhost_dev_has_iommu(hdev)) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);

        /* Update used ring information for IOTLB to work correctly,
         * vhost-kernel code requires this. */
        for (i = 0; i < hdev->nvqs; ++i) {
            struct vhost_virtqueue *vq = hdev->vqs + i;
            vhost_device_iotlb_miss(hdev, vq->used_phys, true);
        }
    }
    return 0;
fail_log:
    vhost_log_put(hdev, false);
fail_vq:
    while (--i >= 0) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }
    i = hdev->nvqs;

fail_mem:
fail_features:

    hdev->started = false;
    return r;
}

/* Host notifiers must be enabled at this point. */
void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int i;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }

    if (vhost_dev_has_iommu(hdev)) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
        memory_listener_unregister(&hdev->iommu_listener);
    }
    vhost_log_put(hdev, true);
    hdev->started = false;
    hdev->vdev = NULL;
}

int vhost_net_set_backend(struct vhost_dev *hdev,
                          struct vhost_vring_file *file)
{
    if (hdev->vhost_ops->vhost_net_set_backend) {
        return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
    }

    return -1;
}