/*
 * vhost support
 *
 * Copyright Red Hat, Inc. 2010
 *
 * Authors:
 *  Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "hw/virtio/vhost.h"
#include "hw/hw.h"
#include "qemu/atomic.h"
#include "qemu/range.h"
#include "qemu/error-report.h"
#include <linux/vhost.h>
#include "exec/address-spaces.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h"
#include "migration/migration.h"

static struct vhost_log *vhost_log;

static unsigned int used_memslots;
static QLIST_HEAD(, vhost_dev) vhost_devices =
    QLIST_HEAD_INITIALIZER(vhost_devices);

bool vhost_has_free_slot(void)
{
    unsigned int slots_limit = ~0U;
    struct vhost_dev *hdev;

    QLIST_FOREACH(hdev, &vhost_devices, entry) {
        unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
        slots_limit = MIN(slots_limit, r);
    }
    return slots_limit > used_memslots;
}

static void vhost_dev_sync_region(struct vhost_dev *dev,
                                  MemoryRegionSection *section,
                                  uint64_t mfirst, uint64_t mlast,
                                  uint64_t rfirst, uint64_t rlast)
{
    vhost_log_chunk_t *log = dev->log->log;

    uint64_t start = MAX(mfirst, rfirst);
    uint64_t end = MIN(mlast, rlast);
    vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
    vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
    uint64_t addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK;

    if (end < start) {
        return;
    }
    assert(end / VHOST_LOG_CHUNK < dev->log_size);
    assert(start / VHOST_LOG_CHUNK < dev->log_size);

    for (; from < to; ++from) {
        vhost_log_chunk_t log;
        /* We first check with non-atomic: much cheaper,
         * and we expect non-dirty to be the common case. */
        if (!*from) {
            addr += VHOST_LOG_CHUNK;
            continue;
        }
        /* Data must be read atomically. We don't really need barrier semantics
         * but it's easier to use atomic_* than roll our own. */
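        /* atomic_xchg() fetches the chunk and clears it in one step; each set
         * bit below marks one VHOST_LOG_PAGE-sized page as dirty. */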
        log = atomic_xchg(from, 0);
        while (log) {
            int bit = ctzl(log);
            hwaddr page_addr;
            hwaddr section_offset;
            hwaddr mr_offset;
            page_addr = addr + bit * VHOST_LOG_PAGE;
            section_offset = page_addr - section->offset_within_address_space;
            mr_offset = section_offset + section->offset_within_region;
            memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
            log &= ~(0x1ull << bit);
        }
        addr += VHOST_LOG_CHUNK;
    }
}

static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
                                   MemoryRegionSection *section,
                                   hwaddr first,
                                   hwaddr last)
{
    int i;
    hwaddr start_addr;
    hwaddr end_addr;

    if (!dev->log_enabled || !dev->started) {
        return 0;
    }
    start_addr = section->offset_within_address_space;
    end_addr = range_get_last(start_addr, int128_get64(section->size));
    start_addr = MAX(first, start_addr);
    end_addr = MIN(last, end_addr);

    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        vhost_dev_sync_region(dev, section, start_addr, end_addr,
                              reg->guest_phys_addr,
                              range_get_last(reg->guest_phys_addr,
                                             reg->memory_size));
    }
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;
        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
                              range_get_last(vq->used_phys, vq->used_size));
    }
    return 0;
}

static void vhost_log_sync(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
}

static void vhost_log_sync_range(struct vhost_dev *dev,
                                 hwaddr first, hwaddr last)
{
    int i;
    /* FIXME: this is N^2 in number of sections */
    for (i = 0; i < dev->n_mem_sections; ++i) {
        MemoryRegionSection *section = &dev->mem_sections[i];
        vhost_sync_dirty_bitmap(dev, section, first, last);
    }
}

/* Assign/unassign. Keep an unsorted array of non-overlapping
 * memory regions in dev->mem. */
static void vhost_dev_unassign_memory(struct vhost_dev *dev,
                                      uint64_t start_addr,
                                      uint64_t size)
{
    int from, to, n = dev->mem->nregions;
    /* Track overlapping/split regions for sanity checking. */
    int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;

    for (from = 0, to = 0; from < n; ++from, ++to) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        uint64_t reglast;
        uint64_t memlast;
        uint64_t change;

        /* clone old region */
        if (to != from) {
            memcpy(reg, dev->mem->regions + from, sizeof *reg);
        }

        /* No overlap is simple */
        if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
                            start_addr, size)) {
            continue;
        }

        /* Split only happens if supplied region
         * is in the middle of an existing one. Thus it can not
         * overlap with any other existing region. */
        assert(!split);

        reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
        memlast = range_get_last(start_addr, size);

        /* Remove whole region */
        if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
            --dev->mem->nregions;
            --to;
            ++overlap_middle;
            continue;
        }

        /* Shrink region */
        if (memlast >= reglast) {
            reg->memory_size = start_addr - reg->guest_phys_addr;
            assert(reg->memory_size);
            assert(!overlap_end);
            ++overlap_end;
            continue;
        }

        /* Shift region */
        if (start_addr <= reg->guest_phys_addr) {
            change = memlast + 1 - reg->guest_phys_addr;
            reg->memory_size -= change;
            reg->guest_phys_addr += change;
            reg->userspace_addr += change;
            assert(reg->memory_size);
            assert(!overlap_start);
            ++overlap_start;
            continue;
        }

        /* This only happens if supplied region
         * is in the middle of an existing one. Thus it can not
         * overlap with any other existing region. */
        assert(!overlap_start);
        assert(!overlap_end);
        assert(!overlap_middle);
        /* Split region: shrink first part, shift second part. */
        memcpy(dev->mem->regions + n, reg, sizeof *reg);
        reg->memory_size = start_addr - reg->guest_phys_addr;
        assert(reg->memory_size);
        change = memlast + 1 - reg->guest_phys_addr;
        reg = dev->mem->regions + n;
        reg->memory_size -= change;
        assert(reg->memory_size);
        reg->guest_phys_addr += change;
        reg->userspace_addr += change;
        /* Never add more than 1 region */
        assert(dev->mem->nregions == n);
        ++dev->mem->nregions;
        ++split;
    }
}

/* Called after unassign, so no regions overlap the given range. */
static void vhost_dev_assign_memory(struct vhost_dev *dev,
                                    uint64_t start_addr,
                                    uint64_t size,
                                    uint64_t uaddr)
{
    int from, to;
    struct vhost_memory_region *merged = NULL;
    for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        uint64_t prlast, urlast;
        uint64_t pmlast, umlast;
        uint64_t s, e, u;

        /* clone old region */
        if (to != from) {
            memcpy(reg, dev->mem->regions + from, sizeof *reg);
        }
        prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
        pmlast = range_get_last(start_addr, size);
        urlast = range_get_last(reg->userspace_addr, reg->memory_size);
        umlast = range_get_last(uaddr, size);

        /* check for overlapping regions: should never happen. */
        assert(prlast < start_addr || pmlast < reg->guest_phys_addr);
        /* Not an adjacent or overlapping region - do not merge. */
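        /* Merge only when the new range is contiguous with this region, on
         * either side, in both guest-physical and userspace addressing. */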
        if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
            (pmlast + 1 != reg->guest_phys_addr ||
             umlast + 1 != reg->userspace_addr)) {
            continue;
        }

        if (merged) {
            --to;
            assert(to >= 0);
        } else {
            merged = reg;
        }
        u = MIN(uaddr, reg->userspace_addr);
        s = MIN(start_addr, reg->guest_phys_addr);
        e = MAX(pmlast, prlast);
        uaddr = merged->userspace_addr = u;
        start_addr = merged->guest_phys_addr = s;
        size = merged->memory_size = e - s + 1;
        assert(merged->memory_size);
    }

    if (!merged) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        memset(reg, 0, sizeof *reg);
        reg->memory_size = size;
        assert(reg->memory_size);
        reg->guest_phys_addr = start_addr;
        reg->userspace_addr = uaddr;
        ++to;
    }
    assert(to <= dev->mem->nregions + 1);
    dev->mem->nregions = to;
}

static uint64_t vhost_get_log_size(struct vhost_dev *dev)
{
    uint64_t log_size = 0;
    int i;
    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        uint64_t last = range_get_last(reg->guest_phys_addr,
                                       reg->memory_size);
        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
    }
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;
        uint64_t last = vq->used_phys + vq->used_size - 1;
        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
    }
    return log_size;
}

static struct vhost_log *vhost_log_alloc(uint64_t size)
{
    struct vhost_log *log = g_malloc0(sizeof *log + size * sizeof(*(log->log)));

    log->size = size;
    log->refcnt = 1;

    return log;
}

static struct vhost_log *vhost_log_get(uint64_t size)
{
    if (!vhost_log || vhost_log->size != size) {
        vhost_log = vhost_log_alloc(size);
    } else {
        ++vhost_log->refcnt;
    }

    return vhost_log;
}

static void vhost_log_put(struct vhost_dev *dev, bool sync)
{
    struct vhost_log *log = dev->log;

    if (!log) {
        return;
    }

    --log->refcnt;
    if (log->refcnt == 0) {
        /* Sync only the range covered by the old log */
        if (dev->log_size && sync) {
            vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
        }
        if (vhost_log == log) {
            vhost_log = NULL;
        }
        g_free(log);
    }
}

static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
{
    struct vhost_log *log = vhost_log_get(size);
    uint64_t log_base = (uintptr_t)log->log;
    int r;

    /* Inform the backend of the log switch; this must be done before
     * releasing the current log, to ensure no logging is lost. */
    r = dev->vhost_ops->vhost_set_log_base(dev, log_base);
    assert(r >= 0);
    vhost_log_put(dev, true);
    dev->log = log;
    dev->log_size = size;
}

static int vhost_verify_ring_mappings(struct vhost_dev *dev,
                                      uint64_t start_addr,
                                      uint64_t size)
{
    int i;
    int r = 0;

    for (i = 0; !r && i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;
        hwaddr l;
        void *p;

        if (!ranges_overlap(start_addr, size, vq->ring_phys, vq->ring_size)) {
            continue;
        }
        l = vq->ring_size;
        p = cpu_physical_memory_map(vq->ring_phys, &l, 1);
        if (!p || l != vq->ring_size) {
            fprintf(stderr, "Unable to map ring buffer for ring %d\n", i);
            r = -ENOMEM;
        }
        if (p != vq->ring) {
            fprintf(stderr, "Ring buffer relocated for ring %d\n", i);
            r = -EBUSY;
        }
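        /* The mapping was only needed for the address comparison above, so
         * drop it without writing anything back (access_len 0). */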
        cpu_physical_memory_unmap(p, l, 0, 0);
    }
    return r;
}

static struct vhost_memory_region *vhost_dev_find_reg(struct vhost_dev *dev,
                                                      uint64_t start_addr,
                                                      uint64_t size)
{
    int i, n = dev->mem->nregions;
    for (i = 0; i < n; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        if (ranges_overlap(reg->guest_phys_addr, reg->memory_size,
                           start_addr, size)) {
            return reg;
        }
    }
    return NULL;
}

static bool vhost_dev_cmp_memory(struct vhost_dev *dev,
                                 uint64_t start_addr,
                                 uint64_t size,
                                 uint64_t uaddr)
{
    struct vhost_memory_region *reg = vhost_dev_find_reg(dev, start_addr, size);
    uint64_t reglast;
    uint64_t memlast;

    if (!reg) {
        return true;
    }

    reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
    memlast = range_get_last(start_addr, size);

    /* Need to extend region? */
    if (start_addr < reg->guest_phys_addr || memlast > reglast) {
        return true;
    }
    /* userspace_addr changed? */
    return uaddr != reg->userspace_addr + start_addr - reg->guest_phys_addr;
}

static void vhost_set_memory(MemoryListener *listener,
                             MemoryRegionSection *section,
                             bool add)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    hwaddr start_addr = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    bool log_dirty =
        memory_region_get_dirty_log_mask(section->mr) & ~(1 << DIRTY_MEMORY_MIGRATION);
    int s = offsetof(struct vhost_memory, regions) +
        (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
    void *ram;

    dev->mem = g_realloc(dev->mem, s);

    if (log_dirty) {
        add = false;
    }

    assert(size);

    /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
    ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
    if (add) {
        if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
            /* Region exists with same address. Nothing to do. */
            return;
        }
    } else {
        if (!vhost_dev_find_reg(dev, start_addr, size)) {
            /* Removing region that we don't access. Nothing to do. */
            return;
        }
    }

    vhost_dev_unassign_memory(dev, start_addr, size);
    if (add) {
        /* Add given mapping, merging adjacent regions if any */
        vhost_dev_assign_memory(dev, start_addr, size, (uintptr_t)ram);
    } else {
        /* Remove old mapping for this memory, if any. */
        vhost_dev_unassign_memory(dev, start_addr, size);
    }
    dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr);
    dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1);
    dev->memory_changed = true;
    used_memslots = dev->mem->nregions;
}

static bool vhost_section(MemoryRegionSection *section)
{
    return memory_region_is_ram(section->mr);
}

static void vhost_begin(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    dev->mem_changed_end_addr = 0;
    dev->mem_changed_start_addr = -1;
}

static void vhost_commit(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    hwaddr start_addr = 0;
    ram_addr_t size = 0;
    uint64_t log_size;
    int r;

    if (!dev->memory_changed) {
        return;
    }
    if (!dev->started) {
        return;
    }
    if (dev->mem_changed_start_addr > dev->mem_changed_end_addr) {
        return;
    }

    if (dev->started) {
        start_addr = dev->mem_changed_start_addr;
        size = dev->mem_changed_end_addr - dev->mem_changed_start_addr + 1;

        r = vhost_verify_ring_mappings(dev, start_addr, size);
        assert(r >= 0);
    }

    if (!dev->log_enabled) {
        r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem);
        assert(r >= 0);
        dev->memory_changed = false;
        return;
    }
    log_size = vhost_get_log_size(dev);
    /* We allocate an extra 4K bytes to log,
     * to reduce the number of reallocations. */
#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
    /* To log more, must increase log size before table update. */
    if (dev->log_size < log_size) {
        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
    }
    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem);
    assert(r >= 0);
    /* To log less, can only decrease log size after table update. */
    if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
        vhost_dev_log_resize(dev, log_size);
    }
    dev->memory_changed = false;
}

static void vhost_region_add(MemoryListener *listener,
                             MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);

    if (!vhost_section(section)) {
        return;
    }

    ++dev->n_mem_sections;
    dev->mem_sections = g_renew(MemoryRegionSection, dev->mem_sections,
                                dev->n_mem_sections);
    dev->mem_sections[dev->n_mem_sections - 1] = *section;
    memory_region_ref(section->mr);
    vhost_set_memory(listener, section, true);
}

static void vhost_region_del(MemoryListener *listener,
                             MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    int i;

    if (!vhost_section(section)) {
        return;
    }

    vhost_set_memory(listener, section, false);
    memory_region_unref(section->mr);
    for (i = 0; i < dev->n_mem_sections; ++i) {
        if (dev->mem_sections[i].offset_within_address_space
            == section->offset_within_address_space) {
            --dev->n_mem_sections;
            memmove(&dev->mem_sections[i], &dev->mem_sections[i + 1],
                    (dev->n_mem_sections - i) * sizeof(*dev->mem_sections));
            break;
        }
    }
}

static void vhost_region_nop(MemoryListener *listener,
                             MemoryRegionSection *section)
{
}

static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
                                    struct vhost_virtqueue *vq,
                                    unsigned idx, bool enable_log)
{
    struct vhost_vring_addr addr = {
        .index = idx,
        .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
        .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
        .used_user_addr = (uint64_t)(unsigned long)vq->used,
        .log_guest_addr = vq->used_phys,
        .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
    };
    int r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_ADDR, &addr);
    if (r < 0) {
        return -errno;
    }
    return 0;
}

static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
{
    uint64_t features = dev->acked_features;
    int r;
    if (enable_log) {
        features |= 0x1ULL << VHOST_F_LOG_ALL;
    }
    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_FEATURES, &features);
    return r < 0 ? -errno : 0;
}

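/* Switch dirty-page logging on or off: update the backend features first,
 * then reprogram every vring address with the matching log flag, rolling
 * back to the previous logging state if any step fails. */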
static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
{
    int r, t, i;
    r = vhost_dev_set_features(dev, enable_log);
    if (r < 0) {
        goto err_features;
    }
    for (i = 0; i < dev->nvqs; ++i) {
        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, i,
                                     enable_log);
        if (r < 0) {
            goto err_vq;
        }
    }
    return 0;
err_vq:
    for (; i >= 0; --i) {
        t = vhost_virtqueue_set_addr(dev, dev->vqs + i, i,
                                     dev->log_enabled);
        assert(t >= 0);
    }
    t = vhost_dev_set_features(dev, dev->log_enabled);
    assert(t >= 0);
err_features:
    return r;
}

static int vhost_migration_log(MemoryListener *listener, int enable)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    int r;
    if (!!enable == dev->log_enabled) {
        return 0;
    }
    if (!dev->started) {
        dev->log_enabled = enable;
        return 0;
    }
    if (!enable) {
        r = vhost_dev_set_log(dev, false);
        if (r < 0) {
            return r;
        }
        vhost_log_put(dev, false);
        dev->log = NULL;
        dev->log_size = 0;
    } else {
        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
        r = vhost_dev_set_log(dev, true);
        if (r < 0) {
            return r;
        }
    }
    dev->log_enabled = enable;
    return 0;
}

static void vhost_log_global_start(MemoryListener *listener)
{
    int r;

    r = vhost_migration_log(listener, true);
    if (r < 0) {
        abort();
    }
}

static void vhost_log_global_stop(MemoryListener *listener)
{
    int r;

    r = vhost_migration_log(listener, false);
    if (r < 0) {
        abort();
    }
}

static void vhost_log_start(MemoryListener *listener,
                            MemoryRegionSection *section,
                            int old, int new)
{
    /* FIXME: implement */
}

static void vhost_log_stop(MemoryListener *listener,
                           MemoryRegionSection *section,
                           int old, int new)
{
    /* FIXME: implement */
}

static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
                                                   bool is_big_endian,
                                                   int vhost_vq_index)
{
    struct vhost_vring_state s = {
        .index = vhost_vq_index,
        .num = is_big_endian
    };

    if (!dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_ENDIAN, &s)) {
        return 0;
    }

    if (errno == ENOTTY) {
        error_report("vhost does not support cross-endian");
        return -ENOSYS;
    }

    return -errno;
}

static int vhost_virtqueue_start(struct vhost_dev *dev,
                                 struct VirtIODevice *vdev,
                                 struct vhost_virtqueue *vq,
                                 unsigned idx)
{
    hwaddr s, l, a;
    int r;
    int vhost_vq_index = dev->vhost_ops->vhost_backend_get_vq_index(dev, idx);
    struct vhost_vring_file file = {
        .index = vhost_vq_index
    };
    struct vhost_vring_state state = {
        .index = vhost_vq_index
    };
    struct VirtQueue *vvq = virtio_get_queue(vdev, idx);

    vq->num = state.num = virtio_queue_get_num(vdev, idx);
    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_NUM, &state);
    if (r) {
        return -errno;
    }

    state.num = virtio_queue_get_last_avail_idx(vdev, idx);
    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_BASE, &state);
    if (r) {
        return -errno;
    }

    if (!virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1) &&
        virtio_legacy_is_cross_endian(vdev)) {
        r = vhost_virtqueue_set_vring_endian_legacy(dev,
                                                    virtio_is_big_endian(vdev),
                                                    vhost_vq_index);
        if (r) {
            return -errno;
        }
    }

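    /* Map the descriptor, avail and used rings into QEMU's address space so
     * their userspace addresses can be handed to the vhost backend. */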
    s = l = virtio_queue_get_desc_size(vdev, idx);
    a = virtio_queue_get_desc_addr(vdev, idx);
    vq->desc = cpu_physical_memory_map(a, &l, 0);
    if (!vq->desc || l != s) {
        r = -ENOMEM;
        goto fail_alloc_desc;
    }
    s = l = virtio_queue_get_avail_size(vdev, idx);
    a = virtio_queue_get_avail_addr(vdev, idx);
    vq->avail = cpu_physical_memory_map(a, &l, 0);
    if (!vq->avail || l != s) {
        r = -ENOMEM;
        goto fail_alloc_avail;
    }
    vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
    vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
    vq->used = cpu_physical_memory_map(a, &l, 1);
    if (!vq->used || l != s) {
        r = -ENOMEM;
        goto fail_alloc_used;
    }

    vq->ring_size = s = l = virtio_queue_get_ring_size(vdev, idx);
    vq->ring_phys = a = virtio_queue_get_ring_addr(vdev, idx);
    vq->ring = cpu_physical_memory_map(a, &l, 1);
    if (!vq->ring || l != s) {
        r = -ENOMEM;
        goto fail_alloc_ring;
    }

    r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
    if (r < 0) {
        r = -errno;
        goto fail_alloc;
    }

    file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_KICK, &file);
    if (r) {
        r = -errno;
        goto fail_kick;
    }

    /* Clear and discard previous events if any. */
    event_notifier_test_and_clear(&vq->masked_notifier);

    return 0;

fail_kick:
fail_alloc:
    cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
                              0, 0);
fail_alloc_ring:
    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
                              0, 0);
fail_alloc_used:
    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
                              0, 0);
fail_alloc_avail:
    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
                              0, 0);
fail_alloc_desc:
    return r;
}

static void vhost_virtqueue_stop(struct vhost_dev *dev,
                                 struct VirtIODevice *vdev,
                                 struct vhost_virtqueue *vq,
                                 unsigned idx)
{
    int vhost_vq_index = dev->vhost_ops->vhost_backend_get_vq_index(dev, idx);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,
    };
    int r;

    r = dev->vhost_ops->vhost_call(dev, VHOST_GET_VRING_BASE, &state);
    if (r < 0) {
        fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
        fflush(stderr);
    }
    virtio_queue_set_last_avail_idx(vdev, idx, state.num);
    virtio_queue_invalidate_signalled_used(vdev, idx);

    /* In the cross-endian case, we need to reset the vring endianness to
     * native, which legacy devices expect by default.
     */
    if (!virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1) &&
        virtio_legacy_is_cross_endian(vdev)) {
        r = vhost_virtqueue_set_vring_endian_legacy(dev,
                                                    !virtio_is_big_endian(vdev),
                                                    vhost_vq_index);
        if (r < 0) {
            error_report("failed to reset vring endianness");
        }
    }

    assert(r >= 0);
    cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
                              0, virtio_queue_get_ring_size(vdev, idx));
    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
                              1, virtio_queue_get_used_size(vdev, idx));
    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
                              0, virtio_queue_get_avail_size(vdev, idx));
    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
                              0, virtio_queue_get_desc_size(vdev, idx));
}

static void vhost_eventfd_add(MemoryListener *listener,
                              MemoryRegionSection *section,
                              bool match_data, uint64_t data, EventNotifier *e)
{
}

static void vhost_eventfd_del(MemoryListener *listener,
                              MemoryRegionSection *section,
                              bool match_data, uint64_t data, EventNotifier *e)
{
}

static int vhost_virtqueue_init(struct vhost_dev *dev,
                                struct vhost_virtqueue *vq, int n)
{
    int vhost_vq_index = dev->vhost_ops->vhost_backend_get_vq_index(dev, n);
    struct vhost_vring_file file = {
        .index = vhost_vq_index,
    };
    int r = event_notifier_init(&vq->masked_notifier, 0);
    if (r < 0) {
        return r;
    }

    file.fd = event_notifier_get_fd(&vq->masked_notifier);
    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_CALL, &file);
    if (r) {
        r = -errno;
        goto fail_call;
    }
    return 0;
fail_call:
    event_notifier_cleanup(&vq->masked_notifier);
    return r;
}

static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
{
    event_notifier_cleanup(&vq->masked_notifier);
}

int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                   VhostBackendType backend_type)
{
    uint64_t features;
    int i, r;

    if (vhost_set_backend_type(hdev, backend_type) < 0) {
        close((uintptr_t)opaque);
        return -1;
    }

    if (hdev->vhost_ops->vhost_backend_init(hdev, opaque) < 0) {
        close((uintptr_t)opaque);
        return -errno;
    }

    if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
        fprintf(stderr, "vhost backend memory slots limit is less"
                " than current number of present memory slots\n");
        close((uintptr_t)opaque);
        return -1;
    }
    QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);

    r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_OWNER, NULL);
    if (r < 0) {
        goto fail;
    }

    r = hdev->vhost_ops->vhost_call(hdev, VHOST_GET_FEATURES, &features);
    if (r < 0) {
        goto fail;
    }

    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
        if (r < 0) {
            goto fail_vq;
        }
    }
    hdev->features = features;

    hdev->memory_listener = (MemoryListener) {
        .begin = vhost_begin,
        .commit = vhost_commit,
        .region_add = vhost_region_add,
        .region_del = vhost_region_del,
        .region_nop = vhost_region_nop,
        .log_start = vhost_log_start,
        .log_stop = vhost_log_stop,
        .log_sync = vhost_log_sync,
        .log_global_start = vhost_log_global_start,
        .log_global_stop = vhost_log_global_stop,
        .eventfd_add = vhost_eventfd_add,
        .eventfd_del = vhost_eventfd_del,
        .priority = 10
    };

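    /* Without VHOST_F_LOG_ALL the backend cannot log its guest-memory writes,
     * so dirty pages would be missed during migration; block it. */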
    hdev->migration_blocker = NULL;
    if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
        error_setg(&hdev->migration_blocker,
                   "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
        migrate_add_blocker(hdev->migration_blocker);
    }
    hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
    hdev->n_mem_sections = 0;
    hdev->mem_sections = NULL;
    hdev->log = NULL;
    hdev->log_size = 0;
    hdev->log_enabled = false;
    hdev->started = false;
    hdev->memory_changed = false;
    memory_listener_register(&hdev->memory_listener, &address_space_memory);
    return 0;
fail_vq:
    while (--i >= 0) {
        vhost_virtqueue_cleanup(hdev->vqs + i);
    }
fail:
    r = -errno;
    hdev->vhost_ops->vhost_backend_cleanup(hdev);
    QLIST_REMOVE(hdev, entry);
    return r;
}

void vhost_dev_cleanup(struct vhost_dev *hdev)
{
    int i;
    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_cleanup(hdev->vqs + i);
    }
    memory_listener_unregister(&hdev->memory_listener);
    if (hdev->migration_blocker) {
        migrate_del_blocker(hdev->migration_blocker);
        error_free(hdev->migration_blocker);
    }
    g_free(hdev->mem);
    g_free(hdev->mem_sections);
    hdev->vhost_ops->vhost_backend_cleanup(hdev);
    QLIST_REMOVE(hdev, entry);
}

/* Stop processing guest IO notifications in qemu.
 * Start processing them in vhost in kernel.
 */
int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    int i, r, e;
    if (!k->set_host_notifier) {
        fprintf(stderr, "binding does not support host notifiers\n");
        r = -ENOSYS;
        goto fail;
    }

    for (i = 0; i < hdev->nvqs; ++i) {
        r = k->set_host_notifier(qbus->parent, hdev->vq_index + i, true);
        if (r < 0) {
            fprintf(stderr, "vhost VQ %d notifier binding failed: %d\n", i, -r);
            goto fail_vq;
        }
    }

    return 0;
fail_vq:
    while (--i >= 0) {
        e = k->set_host_notifier(qbus->parent, hdev->vq_index + i, false);
        if (e < 0) {
            fprintf(stderr, "vhost VQ %d notifier cleanup error: %d\n", i, -e);
            fflush(stderr);
        }
        assert(e >= 0);
    }
fail:
    return r;
}

/* Stop processing guest IO notifications in vhost.
 * Start processing them in qemu.
 * This might actually run the qemu handlers right away,
 * so virtio in qemu must be completely setup when this is called.
 */
void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    int i, r;

    for (i = 0; i < hdev->nvqs; ++i) {
        r = k->set_host_notifier(qbus->parent, hdev->vq_index + i, false);
        if (r < 0) {
            fprintf(stderr, "vhost VQ %d notifier cleanup failed: %d\n", i, -r);
            fflush(stderr);
        }
        assert(r >= 0);
    }
}

/* Test and clear event pending status.
 * Should be called after unmask to avoid losing events.
 */
bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
{
    struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
    return event_notifier_test_and_clear(&vq->masked_notifier);
}

/* Mask/unmask events from this vq. */
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
                          bool mask)
{
    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
    int r, index = n - hdev->vq_index;
    struct vhost_vring_file file;

    if (mask) {
        file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
    } else {
        file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
    }

    file.index = hdev->vhost_ops->vhost_backend_get_vq_index(hdev, n);
    r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_VRING_CALL, &file);
    assert(r >= 0);
}

uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
                            uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (!(hdev->features & bit_mask)) {
            features &= ~bit_mask;
        }
        bit++;
    }
    return features;
}

void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
                        uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (features & bit_mask) {
            hdev->acked_features |= bit_mask;
        }
        bit++;
    }
}

/* Host notifiers must be enabled at this point. */
int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int i, r;

    hdev->started = true;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        goto fail_features;
    }
    r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_MEM_TABLE, hdev->mem);
    if (r < 0) {
        r = -errno;
        goto fail_mem;
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_start(hdev,
                                  vdev,
                                  hdev->vqs + i,
                                  hdev->vq_index + i);
        if (r < 0) {
            goto fail_vq;
        }
    }

    if (hdev->log_enabled) {
        uint64_t log_base;

        hdev->log_size = vhost_get_log_size(hdev);
        hdev->log = vhost_log_get(hdev->log_size);
        log_base = (uintptr_t)hdev->log->log;
        r = hdev->vhost_ops->vhost_set_log_base(hdev,
                                                hdev->log_size ? log_base : 0);
        if (r < 0) {
            r = -errno;
            goto fail_log;
        }
    }

    return 0;
fail_log:
    vhost_log_put(hdev, false);
fail_vq:
    while (--i >= 0) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }
    i = hdev->nvqs;
fail_mem:
fail_features:

    hdev->started = false;
    return r;
}

/* Host notifiers must be enabled at this point. */
void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int i;

    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }

    vhost_log_put(hdev, true);
    hdev->started = false;
    hdev->log = NULL;
    hdev->log_size = 0;
}