/*
 * vfio based device assignment support
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include <dirent.h>
#include <linux/vfio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "config.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci.h"
#include "qemu-common.h"
#include "qemu/error-report.h"
#include "qemu/event_notifier.h"
#include "qemu/queue.h"
#include "qemu/range.h"
#include "sysemu/kvm.h"
#include "sysemu/sysemu.h"
#include "trace.h"
#include "hw/vfio/vfio.h"
#include "hw/vfio/vfio-common.h"

struct VFIOPCIDevice;

typedef struct VFIOQuirk {
    MemoryRegion mem;
    struct VFIOPCIDevice *vdev;
    QLIST_ENTRY(VFIOQuirk) next;
    struct {
        uint32_t base_offset:TARGET_PAGE_BITS;
        uint32_t address_offset:TARGET_PAGE_BITS;
        uint32_t address_size:3;
        uint32_t bar:3;

        uint32_t address_match;
        uint32_t address_mask;

        uint32_t address_val:TARGET_PAGE_BITS;
        uint32_t data_offset:TARGET_PAGE_BITS;
        uint32_t data_size:3;

        uint8_t flags;
        uint8_t read_flags;
        uint8_t write_flags;
    } data;
} VFIOQuirk;

typedef struct VFIOBAR {
    VFIORegion region;
    bool ioport;
    bool mem64;
    QLIST_HEAD(, VFIOQuirk) quirks;
} VFIOBAR;

typedef struct VFIOVGARegion {
    MemoryRegion mem;
    off_t offset;
    int nr;
    QLIST_HEAD(, VFIOQuirk) quirks;
} VFIOVGARegion;

typedef struct VFIOVGA {
    off_t fd_offset;
    int fd;
    VFIOVGARegion region[QEMU_PCI_VGA_NUM_REGIONS];
} VFIOVGA;

typedef struct VFIOINTx {
    bool pending; /* interrupt pending */
    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
    uint8_t pin; /* which pin to pull for qemu_set_irq */
    EventNotifier interrupt; /* eventfd triggered on interrupt */
    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
    PCIINTxRoute route; /* routing info for QEMU bypass */
    uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */
    QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
} VFIOINTx;

typedef struct VFIOMSIVector {
    /*
     * Two interrupt paths are configured per vector.  The first is only used
     * for interrupts injected via QEMU.  This is typically the non-accel path,
     * but may also be used when we want QEMU to handle masking and pending
     * bits.  The KVM path bypasses QEMU and is therefore higher performance,
     * but requires masking at the device.  virq is used to track the MSI route
     * through KVM, thus kvm_interrupt is only available when virq is set to a
     * valid (>= 0) value.
     */
    EventNotifier interrupt;
    EventNotifier kvm_interrupt;
    struct VFIOPCIDevice *vdev; /* back pointer to device */
    int virq;
    bool use;
} VFIOMSIVector;
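
/*
 * A sketch of the two signaling paths described above, for one vector:
 *
 *   device --> 'interrupt' eventfd -----> QEMU handler --> msi(x)_notify()
 *   device --> 'kvm_interrupt' eventfd -> KVM irqfd -----> guest (bypass)
 *
 * Only one of the two eventfds is wired to VFIO at any given time; the
 * selection logic lives in vfio_enable_vectors() below.
 */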

enum {
    VFIO_INT_NONE = 0,
    VFIO_INT_INTx = 1,
    VFIO_INT_MSI  = 2,
    VFIO_INT_MSIX = 3,
};

/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
typedef struct VFIOMSIXInfo {
    uint8_t table_bar;
    uint8_t pba_bar;
    uint16_t entries;
    uint32_t table_offset;
    uint32_t pba_offset;
    MemoryRegion mmap_mem;
    void *mmap;
} VFIOMSIXInfo;

typedef struct VFIOPCIDevice {
    PCIDevice pdev;
    VFIODevice vbasedev;
    VFIOINTx intx;
    unsigned int config_size;
    uint8_t *emulated_config_bits; /* QEMU emulated bits, little-endian */
    off_t config_offset; /* Offset of config space region within device fd */
    unsigned int rom_size;
    off_t rom_offset; /* Offset of ROM region within device fd */
    void *rom;
    int msi_cap_size;
    VFIOMSIVector *msi_vectors;
    VFIOMSIXInfo *msix;
    int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
    int interrupt; /* Current interrupt type */
    VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
    VFIOVGA vga; /* 0xa0000, 0x3b0, 0x3c0 */
    PCIHostDeviceAddress host;
    EventNotifier err_notifier;
    uint32_t features;
#define VFIO_FEATURE_ENABLE_VGA_BIT 0
#define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
    int32_t bootindex;
    uint8_t pm_cap;
    bool has_vga;
    bool pci_aer;
    bool has_flr;
    bool has_pm_reset;
    bool rom_read_failed;
} VFIOPCIDevice;

typedef struct VFIORomBlacklistEntry {
    uint16_t vendor_id;
    uint16_t device_id;
} VFIORomBlacklistEntry;

/*
 * List of device ids/vendor ids for which to disable
 * option rom loading.  This avoids guest hangs during rom
 * execution, as noticed with the BCM 57810 card, for lack of
 * a better way to handle such issues.
 * The user can still override by specifying a romfile or
 * rombar=1.
 * Please see https://bugs.launchpad.net/qemu/+bug/1284874
 * for an analysis of the 57810 card hang.  When adding
 * a new vendor id/device id combination below, please also add
 * your card/environment details and information that could
 * help in debugging to the bug tracking this issue
 */
static const VFIORomBlacklistEntry romblacklist[] = {
    /* Broadcom BCM 57810 */
    { 0x14e4, 0x168e }
};

#define MSIX_CAP_LENGTH 12

static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                  uint32_t val, int len);
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
static int vfio_populate_device(VFIODevice *vbasedev);

/*
 * Disabling BAR mmapping can be slow, but toggling it around INTx can
 * also be a huge overhead.  We try to get the best of both worlds by
 * waiting until an interrupt to disable mmaps (subsequent transitions
 * to the same state are effectively no overhead).  If the interrupt has
 * been serviced and the time gap is long enough, we re-enable mmaps for
 * performance.  This works well for things like graphics cards, which
 * may not use their interrupt at all and are penalized to an unusable
 * level by read/write BAR traps.  Other devices, like NICs, have more
 * regular interrupts and see much better latency by staying in non-mmap
 * mode.  We therefore set the default mmap_timeout such that a ping
 * is just enough to keep the mmap disabled.  Users can experiment with
 * other options with the x-intx-mmap-timeout-ms parameter (a value of
 * zero disables the timer).
 */
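
/*
 * Rough timeline of the scheme described above:
 *
 *   INTx fires -> mmaps disabled, mmap_timer armed for mmap_timeout ms
 *   INTx fires again -> timer re-armed, mmaps stay disabled
 *   timer expires with no interrupt pending -> mmaps re-enabled
 */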

static void vfio_intx_mmap_enable(void *opaque)
{
    VFIOPCIDevice *vdev = opaque;

    if (vdev->intx.pending) {
        timer_mod(vdev->intx.mmap_timer,
                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
        return;
    }

    vfio_mmap_set_enabled(vdev, true);
}

static void vfio_intx_interrupt(void *opaque)
{
    VFIOPCIDevice *vdev = opaque;

    if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
        return;
    }

    trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);

    vdev->intx.pending = true;
    pci_irq_assert(&vdev->pdev);
    vfio_mmap_set_enabled(vdev, false);
    if (vdev->intx.mmap_timeout) {
        timer_mod(vdev->intx.mmap_timer,
                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
    }
}

static void vfio_eoi(VFIODevice *vbasedev)
{
    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);

    if (!vdev->intx.pending) {
        return;
    }

    trace_vfio_eoi(vbasedev->name);

    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);
    vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
}

static void vfio_enable_intx_kvm(VFIOPCIDevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_RESAMPLE,
    };
    struct vfio_irq_set *irq_set;
    int ret, argsz;
    int32_t *pfd;

    if (!VFIO_ALLOW_KVM_INTX || !kvm_irqfds_enabled() ||
        vdev->intx.route.mode != PCI_INTX_ENABLED ||
        !kvm_resamplefds_enabled()) {
        return;
    }

    /* Get to a known interrupt state */
    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Get an eventfd for resample/unmask */
    if (event_notifier_init(&vdev->intx.unmask, 0)) {
        error_report("vfio: Error: event_notifier_init failed eoi");
        goto fail;
    }

    /* KVM triggers it, VFIO listens for it */
    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);

    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to setup resample irqfd: %m");
        goto fail_irqfd;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = irqfd.resamplefd;

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx unmask fd: %m");
        goto fail_vfio;
    }

    /* Let'em rip */
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    vdev->intx.kvm_accel = true;

    trace_vfio_enable_intx_kvm(vdev->vbasedev.name);

    return;

fail_vfio:
    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
fail_irqfd:
    event_notifier_cleanup(&vdev->intx.unmask);
fail:
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
#endif
}
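
/*
 * With the bypass above in place, a single INTx cycle looks like:
 *
 *   device asserts INTx -> VFIO signals the intx.interrupt eventfd
 *   -> KVM irqfd injects the IRQ into the guest, QEMU never wakes up
 *   guest driver EOIs the IRQ -> KVM signals the resamplefd (intx.unmask)
 *   -> VFIO unmasks the device, ready for the next interrupt
 */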

static void vfio_disable_intx_kvm(VFIOPCIDevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_DEASSIGN,
    };

    if (!vdev->intx.kvm_accel) {
        return;
    }

    /*
     * Get to a known state, hardware masked, QEMU ready to accept new
     * interrupts, QEMU IRQ de-asserted.
     */
    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Tell KVM to stop listening for an INTx irqfd */
    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to disable INTx irqfd: %m");
    }

    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
    event_notifier_cleanup(&vdev->intx.unmask);

    /* QEMU starts listening for interrupt events. */
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);

    vdev->intx.kvm_accel = false;

    /* If we've missed an event, let it re-fire through QEMU */
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    trace_vfio_disable_intx_kvm(vdev->vbasedev.name);
#endif
}

static void vfio_update_irq(PCIDevice *pdev)
{
    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
    PCIINTxRoute route;

    if (vdev->interrupt != VFIO_INT_INTx) {
        return;
    }

    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);

    if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
        return; /* Nothing changed */
    }

    trace_vfio_update_irq(vdev->vbasedev.name,
                          vdev->intx.route.irq, route.irq);

    vfio_disable_intx_kvm(vdev);

    vdev->intx.route = route;

    if (route.mode != PCI_INTX_ENABLED) {
        return;
    }

    vfio_enable_intx_kvm(vdev);

    /* Re-enable the interrupt in case we missed an EOI */
    vfio_eoi(&vdev->vbasedev);
}

static int vfio_enable_intx(VFIOPCIDevice *vdev)
{
    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
    int ret, argsz;
    struct vfio_irq_set *irq_set;
    int32_t *pfd;

    if (!pin) {
        return 0;
    }

    vfio_disable_interrupts(vdev);

    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
    pci_config_set_interrupt_pin(vdev->pdev.config, pin);

#ifdef CONFIG_KVM
    /*
     * Only conditional to avoid generating error messages on platforms
     * where we won't actually use the result anyway.
     */
    if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
        vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
                                                        vdev->intx.pin);
    }
#endif

    ret = event_notifier_init(&vdev->intx.interrupt, 0);
    if (ret) {
        error_report("vfio: Error: event_notifier_init failed");
        return ret;
    }

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev);

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx fd: %m");
        qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
        event_notifier_cleanup(&vdev->intx.interrupt);
        return -errno;
    }

    vfio_enable_intx_kvm(vdev);

    vdev->interrupt = VFIO_INT_INTx;

    trace_vfio_enable_intx(vdev->vbasedev.name);

    return 0;
}
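
/*
 * For reference, the variable-length buffer handed to VFIO_DEVICE_SET_IRQS
 * above (declared in <linux/vfio.h>) is laid out as:
 *
 *   struct vfio_irq_set {
 *       __u32 argsz;  // total size of header plus data
 *       __u32 flags;  // e.g. VFIO_IRQ_SET_DATA_EVENTFD | ACTION_TRIGGER
 *       __u32 index;  // which interrupt, e.g. VFIO_PCI_INTX_IRQ_INDEX
 *       __u32 start;  // first vector affected
 *       __u32 count;  // number of vectors
 *       __u8  data[]; // here: one int32_t eventfd per vector
 *   };
 */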

static void vfio_disable_intx(VFIOPCIDevice *vdev)
{
    int fd;

    timer_del(vdev->intx.mmap_timer);
    vfio_disable_intx_kvm(vdev);
    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);
    vfio_mmap_set_enabled(vdev, true);

    fd = event_notifier_get_fd(&vdev->intx.interrupt);
    qemu_set_fd_handler(fd, NULL, NULL, vdev);
    event_notifier_cleanup(&vdev->intx.interrupt);

    vdev->interrupt = VFIO_INT_NONE;

    trace_vfio_disable_intx(vdev->vbasedev.name);
}

/*
 * MSI/X
 */
static void vfio_msi_interrupt(void *opaque)
{
    VFIOMSIVector *vector = opaque;
    VFIOPCIDevice *vdev = vector->vdev;
    int nr = vector - vdev->msi_vectors;

    if (!event_notifier_test_and_clear(&vector->interrupt)) {
        return;
    }

#ifdef DEBUG_VFIO
    MSIMessage msg;

    if (vdev->interrupt == VFIO_INT_MSIX) {
        msg = msix_get_message(&vdev->pdev, nr);
    } else if (vdev->interrupt == VFIO_INT_MSI) {
        msg = msi_get_message(&vdev->pdev, nr);
    } else {
        abort();
    }

    trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
#endif

    if (vdev->interrupt == VFIO_INT_MSIX) {
        msix_notify(&vdev->pdev, nr);
    } else if (vdev->interrupt == VFIO_INT_MSI) {
        msi_notify(&vdev->pdev, nr);
    } else {
        error_report("vfio: MSI interrupt received, but not enabled?");
    }
}

static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
{
    struct vfio_irq_set *irq_set;
    int ret = 0, i, argsz;
    int32_t *fds;

    argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = vdev->nr_vectors;
    fds = (int32_t *)&irq_set->data;

    for (i = 0; i < vdev->nr_vectors; i++) {
        int fd = -1;

        /*
         * MSI vs MSI-X - The guest has direct access to MSI mask and pending
         * bits, therefore we always use the KVM signaling path when it is set
         * up.  MSI-X mask and pending bits are emulated, so we want to use the
         * KVM signaling path only when configured and unmasked.
         */
        if (vdev->msi_vectors[i].use) {
            if (vdev->msi_vectors[i].virq < 0 ||
                (msix && msix_is_masked(&vdev->pdev, i))) {
                fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
            } else {
                fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
            }
        }

        fds[i] = fd;
    }

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);

    g_free(irq_set);

    return ret;
}
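
/*
 * Summary of which eventfd vfio_enable_vectors() hands to the host for a
 * given vector:
 *
 *   vector not in use              -> -1 (vector stays disabled)
 *   no KVM route (virq < 0)        -> 'interrupt' (QEMU path)
 *   MSI-X vector currently masked  -> 'interrupt' (QEMU emulates the mask)
 *   otherwise                      -> 'kvm_interrupt' (KVM bypass)
 */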

static void vfio_add_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage *msg,
                                  bool msix)
{
    int virq;

    if ((msix && !VFIO_ALLOW_KVM_MSIX) ||
        (!msix && !VFIO_ALLOW_KVM_MSI) || !msg) {
        return;
    }

    if (event_notifier_init(&vector->kvm_interrupt, 0)) {
        return;
    }

    virq = kvm_irqchip_add_msi_route(kvm_state, *msg);
    if (virq < 0) {
        event_notifier_cleanup(&vector->kvm_interrupt);
        return;
    }

    if (kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->kvm_interrupt,
                                       NULL, virq) < 0) {
        kvm_irqchip_release_virq(kvm_state, virq);
        event_notifier_cleanup(&vector->kvm_interrupt);
        return;
    }

    vector->virq = virq;
}

static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
{
    kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->kvm_interrupt,
                                      vector->virq);
    kvm_irqchip_release_virq(kvm_state, vector->virq);
    vector->virq = -1;
    event_notifier_cleanup(&vector->kvm_interrupt);
}

static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg)
{
    kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg);
}
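
/*
 * The three helpers above cover the lifetime of a KVM bypass route:
 * vfio_add_kvm_msi_virq() allocates a virq for the MSI message and attaches
 * the kvm_interrupt eventfd to it as an irqfd, vfio_update_kvm_msi_virq()
 * re-targets the route when the guest rewrites the message address/data,
 * and vfio_remove_kvm_msi_virq() tears both down again.  Failure at any
 * step leaves virq at -1, which callers treat as "use the QEMU path".
 */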

static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                                   MSIMessage *msg, IOHandler *handler)
{
    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
    VFIOMSIVector *vector;
    int ret;

    trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);

    vector = &vdev->msi_vectors[nr];

    if (!vector->use) {
        vector->vdev = vdev;
        vector->virq = -1;
        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed");
        }
        vector->use = true;
        msix_vector_use(pdev, nr);
    }

    qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                        handler, NULL, vector);

    /*
     * Attempt to enable route through KVM irqchip,
     * default to userspace handling if unavailable.
     */
    if (vector->virq >= 0) {
        if (!msg) {
            vfio_remove_kvm_msi_virq(vector);
        } else {
            vfio_update_kvm_msi_virq(vector, *msg);
        }
    } else {
        vfio_add_kvm_msi_virq(vector, msg, true);
    }

    /*
     * We don't want to have the host allocate all possible MSI vectors
     * for a device if they're not in use, so we shut them down and
     * incrementally increase the count as needed.
     */
    if (vdev->nr_vectors < nr + 1) {
        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
        vdev->nr_vectors = nr + 1;
        ret = vfio_enable_vectors(vdev, true);
        if (ret) {
            error_report("vfio: failed to enable vectors, %d", ret);
        }
    } else {
        int argsz;
        struct vfio_irq_set *irq_set;
        int32_t *pfd;

        argsz = sizeof(*irq_set) + sizeof(*pfd);

        irq_set = g_malloc0(argsz);
        irq_set->argsz = argsz;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = nr;
        irq_set->count = 1;
        pfd = (int32_t *)&irq_set->data;

        if (vector->virq >= 0) {
            *pfd = event_notifier_get_fd(&vector->kvm_interrupt);
        } else {
            *pfd = event_notifier_get_fd(&vector->interrupt);
        }

        ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
        g_free(irq_set);
        if (ret) {
            error_report("vfio: failed to modify vector, %d", ret);
        }
    }

    return 0;
}
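
/*
 * Example of the incremental growth above: if the guest has been using
 * vectors 0..2 and now unmasks vector 5, nr_vectors grows from 3 to 6 and
 * the whole set is re-enabled with a single VFIO_DEVICE_SET_IRQS call; the
 * unused vectors in between are passed as -1 and remain untriggered.
 */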

static int vfio_msix_vector_use(PCIDevice *pdev,
                                unsigned int nr, MSIMessage msg)
{
    return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
}

static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
{
    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
    VFIOMSIVector *vector = &vdev->msi_vectors[nr];

    trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);

    /*
     * There are still old guests that mask and unmask vectors on every
     * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
     * the KVM setup in place, simply switch VFIO to use the non-bypass
     * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
     * core will mask the interrupt and set pending bits, allowing it to
     * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
     */
    if (vector->virq >= 0) {
        int argsz;
        struct vfio_irq_set *irq_set;
        int32_t *pfd;

        argsz = sizeof(*irq_set) + sizeof(*pfd);

        irq_set = g_malloc0(argsz);
        irq_set->argsz = argsz;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = nr;
        irq_set->count = 1;
        pfd = (int32_t *)&irq_set->data;

        *pfd = event_notifier_get_fd(&vector->interrupt);

        ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);

        g_free(irq_set);
    }
}

static void vfio_enable_msix(VFIOPCIDevice *vdev)
{
    vfio_disable_interrupts(vdev);

    vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(VFIOMSIVector));

    vdev->interrupt = VFIO_INT_MSIX;

    /*
     * Some communication channels between VF & PF or PF & fw rely on the
     * physical state of the device and expect that enabling MSI-X from the
     * guest enables the same on the host.  When our guest is Linux, the
     * guest driver call to pci_enable_msix() sets the enabling bit in the
     * MSI-X capability, but leaves the vector table masked.  We therefore
     * can't rely on a vector_use callback (from request_irq() in the guest)
     * to switch the physical device into MSI-X mode because that may come a
     * long time after pci_enable_msix().  This code enables vector 0 with
     * triggering to userspace, then immediately releases the vector, leaving
     * the physical device with no vectors enabled, but MSI-X enabled, just
     * like the guest view.
     */
    vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
    vfio_msix_vector_release(&vdev->pdev, 0);

    if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
                                  vfio_msix_vector_release, NULL)) {
        error_report("vfio: msix_set_vector_notifiers failed");
    }

    trace_vfio_enable_msix(vdev->vbasedev.name);
}

static void vfio_enable_msi(VFIOPCIDevice *vdev)
{
    int ret, i;

    vfio_disable_interrupts(vdev);

    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
retry:
    vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        MSIMessage msg = msi_get_message(&vdev->pdev, i);

        vector->vdev = vdev;
        vector->virq = -1;
        vector->use = true;

        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed");
        }

        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                            vfio_msi_interrupt, NULL, vector);

        /*
         * Attempt to enable route through KVM irqchip,
         * default to userspace handling if unavailable.
         */
        vfio_add_kvm_msi_virq(vector, &msg, false);
    }

    /* Set interrupt type prior to possible interrupts */
    vdev->interrupt = VFIO_INT_MSI;

    ret = vfio_enable_vectors(vdev, false);
    if (ret) {
        if (ret < 0) {
            error_report("vfio: Error: Failed to setup MSI fds: %m");
        } else if (ret != vdev->nr_vectors) {
            error_report("vfio: Error: Failed to enable %d "
                         "MSI vectors, retry with %d", vdev->nr_vectors, ret);
        }

        for (i = 0; i < vdev->nr_vectors; i++) {
            VFIOMSIVector *vector = &vdev->msi_vectors[i];
            if (vector->virq >= 0) {
                vfio_remove_kvm_msi_virq(vector);
            }
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                NULL, NULL, NULL);
            event_notifier_cleanup(&vector->interrupt);
        }

        g_free(vdev->msi_vectors);

        if (ret > 0 && ret != vdev->nr_vectors) {
            vdev->nr_vectors = ret;
            goto retry;
        }
        vdev->nr_vectors = 0;

        /*
         * Failing to setup MSI doesn't really fall within any specification.
         * Let's try leaving interrupts disabled and hope the guest figures
         * out to fall back to INTx for this device.
         */
        error_report("vfio: Error: Failed to enable MSI");
        vdev->interrupt = VFIO_INT_NONE;

        return;
    }

    trace_vfio_enable_msi(vdev->vbasedev.name, vdev->nr_vectors);
}
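
/*
 * Note on the retry path above: a positive return from
 * VFIO_DEVICE_SET_IRQS reports how many MSI vectors the host can actually
 * support, so everything is torn down and set up again with that count.
 */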

static void vfio_disable_msi_common(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        if (vdev->msi_vectors[i].use) {
            if (vector->virq >= 0) {
                vfio_remove_kvm_msi_virq(vector);
            }
            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                                NULL, NULL, NULL);
            event_notifier_cleanup(&vector->interrupt);
        }
    }

    g_free(vdev->msi_vectors);
    vdev->msi_vectors = NULL;
    vdev->nr_vectors = 0;
    vdev->interrupt = VFIO_INT_NONE;

    vfio_enable_intx(vdev);
}

static void vfio_disable_msix(VFIOPCIDevice *vdev)
{
    int i;

    msix_unset_vector_notifiers(&vdev->pdev);

    /*
     * MSI-X will only release vectors if MSI-X is still enabled on the
     * device, check through the rest and release it ourselves if necessary.
     */
    for (i = 0; i < vdev->nr_vectors; i++) {
        if (vdev->msi_vectors[i].use) {
            vfio_msix_vector_release(&vdev->pdev, i);
            msix_vector_unuse(&vdev->pdev, i);
        }
    }

    if (vdev->nr_vectors) {
        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
    }

    vfio_disable_msi_common(vdev);

    trace_vfio_disable_msix(vdev->vbasedev.name);
}

static void vfio_disable_msi(VFIOPCIDevice *vdev)
{
    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
    vfio_disable_msi_common(vdev);

    trace_vfio_disable_msi(vdev->vbasedev.name);
}

static void vfio_update_msi(VFIOPCIDevice *vdev)
{
    int i;

    for (i = 0; i < vdev->nr_vectors; i++) {
        VFIOMSIVector *vector = &vdev->msi_vectors[i];
        MSIMessage msg;

        if (!vector->use || vector->virq < 0) {
            continue;
        }

        msg = msi_get_message(&vdev->pdev, i);
        vfio_update_kvm_msi_virq(vector, msg);
    }
}

static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
{
    struct vfio_region_info reg_info = {
        .argsz = sizeof(reg_info),
        .index = VFIO_PCI_ROM_REGION_INDEX
    };
    uint64_t size;
    off_t off = 0;
    ssize_t bytes; /* pread() can return -1, so this must be signed */

    if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info)) {
        error_report("vfio: Error getting ROM info: %m");
        return;
    }

    trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info.size,
                            (unsigned long)reg_info.offset,
                            (unsigned long)reg_info.flags);

    vdev->rom_size = size = reg_info.size;
    vdev->rom_offset = reg_info.offset;

    if (!vdev->rom_size) {
        vdev->rom_read_failed = true;
        error_report("vfio-pci: Cannot read device rom at "
                     "%s", vdev->vbasedev.name);
        error_printf("Device option ROM contents are probably invalid "
                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
                     "or load from file with romfile=\n");
        return;
    }

    vdev->rom = g_malloc(size);
    memset(vdev->rom, 0xff, size);

    while (size) {
        bytes = pread(vdev->vbasedev.fd, vdev->rom + off,
                      size, vdev->rom_offset + off);
        if (bytes == 0) {
            break;
        } else if (bytes > 0) {
            off += bytes;
            size -= bytes;
        } else {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            error_report("vfio: Error reading device ROM: %m");
            break;
        }
    }
}

static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
{
    VFIOPCIDevice *vdev = opaque;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } val = { .qword = 0 }; /* zero-fill so short copies don't return junk */
    uint64_t data = 0;

    /* Load the ROM lazily when the guest tries to read it */
    if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
        vfio_pci_load_rom(vdev);
    }

    /* Copy nothing if the read starts beyond the end of the ROM */
    memcpy(&val, vdev->rom + addr,
           (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);

    switch (size) {
    case 1:
        data = val.byte;
        break;
    case 2:
        data = le16_to_cpu(val.word);
        break;
    case 4:
        data = le32_to_cpu(val.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes\n", size);
        break;
    }

    trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);

    return data;
}

static void vfio_rom_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
}

static const MemoryRegionOps vfio_rom_ops = {
    .read = vfio_rom_read,
    .write = vfio_rom_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    uint16_t vendor_id, device_id;
    int count = 0;

    vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
    device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);

    while (count < ARRAY_SIZE(romblacklist)) {
        if (romblacklist[count].vendor_id == vendor_id &&
            romblacklist[count].device_id == device_id) {
            return true;
        }
        count++;
    }

    return false;
}

static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
{
    uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
    off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
    DeviceState *dev = DEVICE(vdev);
    char name[32];
    int fd = vdev->vbasedev.fd;

    if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
        /* Since pci handles romfile, just print a message and return */
        if (vfio_blacklist_opt_rom(vdev) && vdev->pdev.romfile) {
            error_printf("Warning : Device at %04x:%02x:%02x.%x "
                         "is known to cause system instability issues during "
                         "option rom execution. "
                         "Proceeding anyway since user specified romfile\n",
                         vdev->host.domain, vdev->host.bus, vdev->host.slot,
                         vdev->host.function);
        }
        return;
    }

    /*
     * Use the same size ROM BAR as the physical device.  The contents
     * will get filled in later when the guest tries to read it.
     */
    if (pread(fd, &orig, 4, offset) != 4 ||
        pwrite(fd, &size, 4, offset) != 4 ||
        pread(fd, &size, 4, offset) != 4 ||
        pwrite(fd, &orig, 4, offset) != 4) {
        error_report("%s(%04x:%02x:%02x.%x) failed: %m",
                     __func__, vdev->host.domain, vdev->host.bus,
                     vdev->host.slot, vdev->host.function);
        return;
    }

    size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;

    if (!size) {
        return;
    }

    if (vfio_blacklist_opt_rom(vdev)) {
        if (dev->opts && qemu_opt_get(dev->opts, "rombar")) {
            error_printf("Warning : Device at %04x:%02x:%02x.%x "
                         "is known to cause system instability issues during "
                         "option rom execution. "
                         "Proceeding anyway since user specified a non-zero "
                         "value for rombar\n",
                         vdev->host.domain, vdev->host.bus, vdev->host.slot,
                         vdev->host.function);
        } else {
            error_printf("Warning : Rom loading for device at "
                         "%04x:%02x:%02x.%x has been disabled due to "
                         "system instability issues. "
                         "Specify rombar=1 or romfile to force\n",
                         vdev->host.domain, vdev->host.bus, vdev->host.slot,
                         vdev->host.function);
            return;
        }
    }

    trace_vfio_pci_size_rom(vdev->vbasedev.name, size);

    snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function);

    memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
                          &vfio_rom_ops, vdev, name, size);

    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
                     PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);

    vdev->pdev.has_rom = true;
    vdev->rom_read_failed = false;
}
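
/*
 * The probe in vfio_pci_size_rom() above is the standard PCI BAR sizing
 * handshake: write all 1s to the ROM BAR, read it back, and the device
 * leaves 0s in the address bits it decodes.  For example, a masked
 * read-back of 0xffff0000 yields ~0xffff0000 + 1 = 64KB of ROM.
 */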
" 1107 "Specify rombar=1 or romfile to force\n", 1108 vdev->host.domain, vdev->host.bus, vdev->host.slot, 1109 vdev->host.function); 1110 return; 1111 } 1112 } 1113 1114 trace_vfio_pci_size_rom(vdev->vbasedev.name, size); 1115 1116 snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom", 1117 vdev->host.domain, vdev->host.bus, vdev->host.slot, 1118 vdev->host.function); 1119 1120 memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev), 1121 &vfio_rom_ops, vdev, name, size); 1122 1123 pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 1124 PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom); 1125 1126 vdev->pdev.has_rom = true; 1127 vdev->rom_read_failed = false; 1128 } 1129 1130 static void vfio_vga_write(void *opaque, hwaddr addr, 1131 uint64_t data, unsigned size) 1132 { 1133 VFIOVGARegion *region = opaque; 1134 VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]); 1135 union { 1136 uint8_t byte; 1137 uint16_t word; 1138 uint32_t dword; 1139 uint64_t qword; 1140 } buf; 1141 off_t offset = vga->fd_offset + region->offset + addr; 1142 1143 switch (size) { 1144 case 1: 1145 buf.byte = data; 1146 break; 1147 case 2: 1148 buf.word = cpu_to_le16(data); 1149 break; 1150 case 4: 1151 buf.dword = cpu_to_le32(data); 1152 break; 1153 default: 1154 hw_error("vfio: unsupported write size, %d bytes", size); 1155 break; 1156 } 1157 1158 if (pwrite(vga->fd, &buf, size, offset) != size) { 1159 error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m", 1160 __func__, region->offset + addr, data, size); 1161 } 1162 1163 trace_vfio_vga_write(region->offset + addr, data, size); 1164 } 1165 1166 static uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size) 1167 { 1168 VFIOVGARegion *region = opaque; 1169 VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]); 1170 union { 1171 uint8_t byte; 1172 uint16_t word; 1173 uint32_t dword; 1174 uint64_t qword; 1175 } buf; 1176 uint64_t data = 0; 1177 off_t offset = vga->fd_offset + region->offset + addr; 1178 1179 if (pread(vga->fd, &buf, size, offset) != size) { 1180 error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m", 1181 __func__, region->offset + addr, size); 1182 return (uint64_t)-1; 1183 } 1184 1185 switch (size) { 1186 case 1: 1187 data = buf.byte; 1188 break; 1189 case 2: 1190 data = le16_to_cpu(buf.word); 1191 break; 1192 case 4: 1193 data = le32_to_cpu(buf.dword); 1194 break; 1195 default: 1196 hw_error("vfio: unsupported read size, %d bytes", size); 1197 break; 1198 } 1199 1200 trace_vfio_vga_read(region->offset + addr, size, data); 1201 1202 return data; 1203 } 1204 1205 static const MemoryRegionOps vfio_vga_ops = { 1206 .read = vfio_vga_read, 1207 .write = vfio_vga_write, 1208 .endianness = DEVICE_LITTLE_ENDIAN, 1209 }; 1210 1211 /* 1212 * Device specific quirks 1213 */ 1214 1215 /* Is range1 fully contained within range2? 

static uint64_t vfio_generic_window_quirk_read(void *opaque,
                                               hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    uint64_t data;

    if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
        ranges_overlap(addr, size,
                       quirk->data.data_offset, quirk->data.data_size)) {
        hwaddr offset = addr - quirk->data.data_offset;

        if (!vfio_range_contained(addr, size, quirk->data.data_offset,
                                  quirk->data.data_size)) {
            hw_error("%s: window data read not fully contained: %s",
                     __func__, memory_region_name(&quirk->mem));
        }

        data = vfio_pci_read_config(&vdev->pdev,
                                    quirk->data.address_val + offset, size);

        trace_vfio_generic_window_quirk_read(memory_region_name(&quirk->mem),
                                             vdev->vbasedev.name,
                                             quirk->data.bar,
                                             addr, size, data);
    } else {
        data = vfio_region_read(&vdev->bars[quirk->data.bar].region,
                                addr + quirk->data.base_offset, size);
    }

    return data;
}

static void vfio_generic_window_quirk_write(void *opaque, hwaddr addr,
                                            uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;

    if (ranges_overlap(addr, size,
                       quirk->data.address_offset, quirk->data.address_size)) {

        if (addr != quirk->data.address_offset) {
            hw_error("%s: offset write into address window: %s",
                     __func__, memory_region_name(&quirk->mem));
        }

        if ((data & ~quirk->data.address_mask) == quirk->data.address_match) {
            quirk->data.flags |= quirk->data.write_flags |
                                 quirk->data.read_flags;
            quirk->data.address_val = data & quirk->data.address_mask;
        } else {
            quirk->data.flags &= ~(quirk->data.write_flags |
                                   quirk->data.read_flags);
        }
    }

    if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
        ranges_overlap(addr, size,
                       quirk->data.data_offset, quirk->data.data_size)) {
        hwaddr offset = addr - quirk->data.data_offset;

        if (!vfio_range_contained(addr, size, quirk->data.data_offset,
                                  quirk->data.data_size)) {
            hw_error("%s: window data write not fully contained: %s",
                     __func__, memory_region_name(&quirk->mem));
        }

        vfio_pci_write_config(&vdev->pdev,
                              quirk->data.address_val + offset, data, size);
        trace_vfio_generic_window_quirk_write(memory_region_name(&quirk->mem),
                                              vdev->vbasedev.name,
                                              quirk->data.bar,
                                              addr, data, size);
        return;
    }

    vfio_region_write(&vdev->bars[quirk->data.bar].region,
                      addr + quirk->data.base_offset, data, size);
}

static const MemoryRegionOps vfio_generic_window_quirk = {
    .read = vfio_generic_window_quirk_read,
    .write = vfio_generic_window_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
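
/*
 * The generic quirk variant below has no address register: it is a plain
 * mirror of config space at a fixed BAR offset.  address_match gives the
 * base of the mirror within the BAR and address_mask its size - 1; any
 * access inside that range is redirected to config space, everything else
 * passes through to the device.
 */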

static uint64_t vfio_generic_quirk_read(void *opaque,
                                        hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
    hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;
    uint64_t data;

    if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
        ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
        if (!vfio_range_contained(addr, size, offset,
                                  quirk->data.address_mask + 1)) {
            hw_error("%s: read not fully contained: %s",
                     __func__, memory_region_name(&quirk->mem));
        }

        data = vfio_pci_read_config(&vdev->pdev, addr - offset, size);

        trace_vfio_generic_quirk_read(memory_region_name(&quirk->mem),
                                      vdev->vbasedev.name, quirk->data.bar,
                                      addr + base, size, data);
    } else {
        data = vfio_region_read(&vdev->bars[quirk->data.bar].region,
                                addr + base, size);
    }

    return data;
}

static void vfio_generic_quirk_write(void *opaque, hwaddr addr,
                                     uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
    hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;

    if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
        ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
        if (!vfio_range_contained(addr, size, offset,
                                  quirk->data.address_mask + 1)) {
            hw_error("%s: write not fully contained: %s",
                     __func__, memory_region_name(&quirk->mem));
        }

        vfio_pci_write_config(&vdev->pdev, addr - offset, data, size);

        trace_vfio_generic_quirk_write(memory_region_name(&quirk->mem),
                                       vdev->vbasedev.name, quirk->data.bar,
                                       addr + base, data, size);
    } else {
        vfio_region_write(&vdev->bars[quirk->data.bar].region,
                          addr + base, data, size);
    }
}

static const MemoryRegionOps vfio_generic_quirk = {
    .read = vfio_generic_quirk_read,
    .write = vfio_generic_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

#define PCI_VENDOR_ID_ATI 0x1002

/*
 * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
 * through VGA register 0x3c3.  On newer cards, the I/O port BAR is always
 * BAR4 (older cards like the X550 used BAR1, but we don't care to support
 * those).  Note that on bare metal, a read of 0x3c3 doesn't always return the
 * I/O port BAR address.  Originally this was coded to return the virtual BAR
 * address only if the physical register read returns the actual BAR address,
 * but users have reported greater success if we return the virtual address
 * unconditionally.
 */
static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
                                        hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    /* Read the second byte of BAR4, i.e. I/O port address bits 8:15 */
    uint64_t data = vfio_pci_read_config(&vdev->pdev,
                                         PCI_BASE_ADDRESS_0 + (4 * 4) + 1,
                                         size);
    trace_vfio_ati_3c3_quirk_read(data);

    return data;
}

static const MemoryRegionOps vfio_ati_3c3_quirk = {
    .read = vfio_ati_3c3_quirk_read,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
        return;
    }

    /*
     * As long as the BAR is >= 256 bytes it will be aligned such that the
     * lower byte is always zero.  Filter out anything else, if it exists.
     */
    if (!vdev->bars[4].ioport || vdev->bars[4].region.size < 256) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;

    memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, quirk,
                          "vfio-ati-3c3-quirk", 1);
    memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
                                3 /* offset 3 bytes from 0x3c0 */, &quirk->mem);

    QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
                      quirk, next);

    trace_vfio_vga_probe_ati_3c3_quirk(vdev->vbasedev.name);
}

/*
 * Newer ATI/AMD devices, including HD5450 and HD7850, have a window to PCI
 * config space through MMIO BAR2 at offset 0x4000.  Nothing seems to access
 * the MMIO space directly, but a window to this space is provided through
 * I/O port BAR4.  Offset 0x0 is the address register and offset 0x4 is the
 * data register.  When the address is programmed to a range of 0x4000-0x4fff
 * PCI configuration space is available.  Experimentation seems to indicate
 * that only read-only access is provided, but we drop writes when the window
 * is enabled to config space nonetheless.
 */
static void vfio_probe_ati_bar4_window_quirk(VFIOPCIDevice *vdev, int nr)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    if (!vdev->has_vga || nr != 4 ||
        pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;
    quirk->data.address_size = 4;
    quirk->data.data_offset = 4;
    quirk->data.data_size = 4;
    quirk->data.address_match = 0x4000;
    quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
    quirk->data.bar = nr;
    quirk->data.read_flags = quirk->data.write_flags = 1;

    memory_region_init_io(&quirk->mem, OBJECT(vdev),
                          &vfio_generic_window_quirk, quirk,
                          "vfio-ati-bar4-window-quirk", 8);
    memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
                                        quirk->data.base_offset, &quirk->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_probe_ati_bar4_window_quirk(vdev->vbasedev.name);
}

#define PCI_VENDOR_ID_REALTEK 0x10ec

/*
 * RTL8168 devices have a backdoor that can access the MSI-X table.  At BAR2
 * offset 0x70 there is a dword data register, offset 0x74 is a dword address
 * register.  According to the Linux r8169 driver, the MSI-X table is addressed
 * when the "type" portion of the address register is set to 0x1.  This appears
 * to be bits 16:30.  Bit 31 is both a write indicator and some sort of
 * "address latched" indicator.  Bits 12:15 are a mask field, which we can
 * ignore because the MSI-X table should always be accessed as a dword (full
 * mask).  Bits 0:11 are the offset within the type.
 *
 * Example trace:
 *
 * Read from MSI-X table offset 0
 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
 *
 * Write 0xfee00000 to MSI-X table offset 0
 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
 */

static uint64_t vfio_rtl8168_window_quirk_read(void *opaque,
                                               hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;

    switch (addr) {
    case 4: /* address */
        if (quirk->data.flags) {
            trace_vfio_rtl8168_window_quirk_read_fake(
                    memory_region_name(&quirk->mem),
                    vdev->vbasedev.name);

            return quirk->data.address_match ^ 0x10000000U;
        }
        break;
    case 0: /* data */
        if (quirk->data.flags) {
            uint64_t val;

            trace_vfio_rtl8168_window_quirk_read_table(
                    memory_region_name(&quirk->mem),
                    vdev->vbasedev.name);

            if (!(vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
                return 0;
            }

            io_mem_read(&vdev->pdev.msix_table_mmio,
                        (hwaddr)(quirk->data.address_match & 0xfff),
                        &val, size);
            return val;
        }
    }

    trace_vfio_rtl8168_window_quirk_read_direct(memory_region_name(&quirk->mem),
                                                vdev->vbasedev.name);

    return vfio_region_read(&vdev->bars[quirk->data.bar].region,
                            addr + 0x70, size);
}

static void vfio_rtl8168_window_quirk_write(void *opaque, hwaddr addr,
                                            uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;

    switch (addr) {
    case 4: /* address */
        if ((data & 0x7fff0000) == 0x10000) {
            if (data & 0x10000000U &&
                vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {

                trace_vfio_rtl8168_window_quirk_write_table(
                        memory_region_name(&quirk->mem),
                        vdev->vbasedev.name);

                io_mem_write(&vdev->pdev.msix_table_mmio,
                             (hwaddr)(quirk->data.address_match & 0xfff),
                             data, size);
            }

            quirk->data.flags = 1;
            quirk->data.address_match = data;

            return;
        }
        quirk->data.flags = 0;
        break;
    case 0: /* data */
        quirk->data.address_mask = data;
        break;
    }

    trace_vfio_rtl8168_window_quirk_write_direct(
            memory_region_name(&quirk->mem),
            vdev->vbasedev.name);

    vfio_region_write(&vdev->bars[quirk->data.bar].region,
                      addr + 0x70, data, size);
}

static const MemoryRegionOps vfio_rtl8168_window_quirk = {
    .read = vfio_rtl8168_window_quirk_read,
    .write = vfio_rtl8168_window_quirk_write,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
        .unaligned = false,
    },
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_probe_rtl8168_bar2_window_quirk(VFIOPCIDevice *vdev, int nr)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_REALTEK ||
        pci_get_word(pdev->config + PCI_DEVICE_ID) != 0x8168 || nr != 2) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;
    quirk->data.bar = nr;

    memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_rtl8168_window_quirk,
                          quirk, "vfio-rtl8168-window-quirk", 8);
    memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
                                        0x70, &quirk->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_probe_rtl8168_bar2_window_quirk(vdev->vbasedev.name);
}

/*
 * Trap the BAR2 MMIO window to config space as well.
 */
static void vfio_probe_ati_bar2_4000_quirk(VFIOPCIDevice *vdev, int nr)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    /* Only enable on newer devices where BAR2 is 64bit */
    if (!vdev->has_vga || nr != 2 || !vdev->bars[2].mem64 ||
        pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;
    quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
    quirk->data.address_match = 0x4000;
    quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
    quirk->data.bar = nr;

    memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
                          "vfio-ati-bar2-4000-quirk",
                          TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
    memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
                                        quirk->data.address_match & TARGET_PAGE_MASK,
                                        &quirk->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_probe_ati_bar2_4000_quirk(vdev->vbasedev.name);
}

/*
 * Older ATI/AMD cards like the X550 have a similar window to that above.
 * I/O port BAR1 provides a window to a mirror of PCI config space located
 * in BAR2 at offset 0xf00.  We don't care to support such older cards, but
 * note it for future reference.
 */

#define PCI_VENDOR_ID_NVIDIA 0x10de

/*
 * Nvidia has several different methods to get to config space, the
 * nouveau project has several of these documented here:
 * https://github.com/pathscale/envytools/tree/master/hwdocs
 *
 * The first quirk is actually not documented in envytools and is found
 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]).  This is an
 * NV46 chipset.  The backdoor uses the legacy VGA I/O ports to access
 * the mirror of PCI config space found at BAR0 offset 0x1800.  The access
 * sequence first writes 0x338 to I/O port 0x3d4.  The target offset is
 * then written to 0x3d0.  Finally 0x538 is written for a read and 0x738
 * is written for a write to 0x3d4.  The BAR0 offset is then accessible
 * through 0x3d0.  This quirk doesn't seem to be necessary on newer cards
 * that use the I/O port BAR5 window, but it doesn't hurt to leave it.
 */
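
/*
 * The access sequence above maps onto the state machine below (0x3d4 is
 * the address port, 0x3d0 the data port):
 *
 *   NONE   --(0x3d4 <= 0x338)-----------------> SELECT
 *   SELECT --(0x3d0 <= 0x1800 | offset)-------> WINDOW
 *   WINDOW --(0x3d4 <= 0x538)-----------------> READ  (next 0x3d0 read)
 *   WINDOW --(0x3d4 <= 0x738)-----------------> WRITE (next 0x3d0 write)
 *
 * Any other access falls back to NONE and passes through to VGA.
 */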

enum {
    NV_3D0_NONE = 0,
    NV_3D0_SELECT,
    NV_3D0_WINDOW,
    NV_3D0_READ,
    NV_3D0_WRITE,
};

static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
                                           hwaddr addr, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    PCIDevice *pdev = &vdev->pdev;
    uint64_t data = vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
                                  addr + quirk->data.base_offset, size);

    if (quirk->data.flags == NV_3D0_READ && addr == quirk->data.data_offset) {
        data = vfio_pci_read_config(pdev, quirk->data.address_val, size);
        trace_vfio_nvidia_3d0_quirk_read(size, data);
    }

    quirk->data.flags = NV_3D0_NONE;

    return data;
}

static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
                                        uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    PCIDevice *pdev = &vdev->pdev;

    switch (quirk->data.flags) {
    case NV_3D0_NONE:
        if (addr == quirk->data.address_offset && data == 0x338) {
            quirk->data.flags = NV_3D0_SELECT;
        }
        break;
    case NV_3D0_SELECT:
        quirk->data.flags = NV_3D0_NONE;
        if (addr == quirk->data.data_offset &&
            (data & ~quirk->data.address_mask) == quirk->data.address_match) {
            quirk->data.flags = NV_3D0_WINDOW;
            quirk->data.address_val = data & quirk->data.address_mask;
        }
        break;
    case NV_3D0_WINDOW:
        quirk->data.flags = NV_3D0_NONE;
        if (addr == quirk->data.address_offset) {
            if (data == 0x538) {
                quirk->data.flags = NV_3D0_READ;
            } else if (data == 0x738) {
                quirk->data.flags = NV_3D0_WRITE;
            }
        }
        break;
    case NV_3D0_WRITE:
        quirk->data.flags = NV_3D0_NONE;
        if (addr == quirk->data.data_offset) {
            vfio_pci_write_config(pdev, quirk->data.address_val, data, size);
            trace_vfio_nvidia_3d0_quirk_write(data, size);
            return;
        }
        break;
    }

    vfio_vga_write(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
                   addr + quirk->data.base_offset, data, size);
}

static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
    .read = vfio_nvidia_3d0_quirk_read,
    .write = vfio_nvidia_3d0_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA ||
        !vdev->bars[1].region.size) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;
    quirk->data.base_offset = 0x10;
    quirk->data.address_offset = 4;
    quirk->data.address_size = 2;
    quirk->data.address_match = 0x1800;
    quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
    quirk->data.data_offset = 0;
    quirk->data.data_size = 4;

    memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_3d0_quirk,
                          quirk, "vfio-nvidia-3d0-quirk", 6);
    memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
                                quirk->data.base_offset, &quirk->mem);

    QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
                      quirk, next);

    trace_vfio_vga_probe_nvidia_3d0_quirk(vdev->vbasedev.name);
}

/*
 * The second quirk is documented in envytools.  The I/O port BAR5 is just
 * a set of address/data ports to the MMIO BARs.  The BAR we care about is
 * again BAR0.  This backdoor is apparently a bit newer than the one above,
 * so we need to trap not only the 256 bytes @0x1800, but all of PCI config
 * space, including extended space, which is available in the 4K window
 * @0x88000.
 */
enum {
    NV_BAR5_ADDRESS = 0x1,
    NV_BAR5_ENABLE = 0x2,
    NV_BAR5_MASTER = 0x4,
    NV_BAR5_VALID = 0x7,
};

static void vfio_nvidia_bar5_window_quirk_write(void *opaque, hwaddr addr,
                                                uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;

    switch (addr) {
    case 0x0:
        if (data & 0x1) {
            quirk->data.flags |= NV_BAR5_MASTER;
        } else {
            quirk->data.flags &= ~NV_BAR5_MASTER;
        }
        break;
    case 0x4:
        if (data & 0x1) {
            quirk->data.flags |= NV_BAR5_ENABLE;
        } else {
            quirk->data.flags &= ~NV_BAR5_ENABLE;
        }
        break;
    case 0x8:
        if (quirk->data.flags & NV_BAR5_MASTER) {
            if ((data & ~0xfff) == 0x88000) {
                quirk->data.flags |= NV_BAR5_ADDRESS;
                quirk->data.address_val = data & 0xfff;
            } else if ((data & ~0xff) == 0x1800) {
                quirk->data.flags |= NV_BAR5_ADDRESS;
                quirk->data.address_val = data & 0xff;
            } else {
                quirk->data.flags &= ~NV_BAR5_ADDRESS;
            }
        }
        break;
    }

    vfio_generic_window_quirk_write(opaque, addr, data, size);
}

static const MemoryRegionOps vfio_nvidia_bar5_window_quirk = {
    .read = vfio_generic_window_quirk_read,
    .write = vfio_nvidia_bar5_window_quirk_write,
    .valid.min_access_size = 4,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void vfio_probe_nvidia_bar5_window_quirk(VFIOPCIDevice *vdev, int nr)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOQuirk *quirk;

    if (!vdev->has_vga || nr != 5 ||
        pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
        return;
    }

    quirk = g_malloc0(sizeof(*quirk));
    quirk->vdev = vdev;
    quirk->data.read_flags = quirk->data.write_flags = NV_BAR5_VALID;
    quirk->data.address_offset = 0x8;
    quirk->data.address_size = 0; /* actually 4, but avoids generic code */
    quirk->data.data_offset = 0xc;
    quirk->data.data_size = 4;
    quirk->data.bar = nr;

    memory_region_init_io(&quirk->mem, OBJECT(vdev),
                          &vfio_nvidia_bar5_window_quirk, quirk,
                          "vfio-nvidia-bar5-window-quirk", 16);
    memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
                                        0, &quirk->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_probe_nvidia_bar5_window_quirk(vdev->vbasedev.name);
}

static void vfio_nvidia_88000_quirk_write(void *opaque, hwaddr addr,
                                          uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    PCIDevice *pdev = &vdev->pdev;
    hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;

    vfio_generic_quirk_write(opaque, addr, data, size);

    /*
     * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
     * MSI capability ID register.  Both the ID and next register are
     * read-only, so we allow writes covering either of those to real hw.
     * NB - only fixed for the 0x88000 MMIO window.
     */
    if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
        vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
        vfio_region_write(&vdev->bars[quirk->data.bar].region,
                          addr + base, data, size);
    }
}

static const MemoryRegionOps vfio_nvidia_88000_quirk = {
    .read = vfio_generic_quirk_read,
    .write = vfio_nvidia_88000_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1890 */
1891 if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
1892 vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
1893 vfio_region_write(&vdev->bars[quirk->data.bar].region,
1894 addr + base, data, size);
1895 }
1896 }
1897
1898 static const MemoryRegionOps vfio_nvidia_88000_quirk = {
1899 .read = vfio_generic_quirk_read,
1900 .write = vfio_nvidia_88000_quirk_write,
1901 .endianness = DEVICE_LITTLE_ENDIAN,
1902 };
1903
1904 /*
1905 * Finally, BAR0 itself. We want to redirect any accesses to either
1906 * 0x1800 or 0x88000 through the PCI config space access functions.
1907 *
1908 * NB - the quirks must be registered at page granularity, or else they
1909 * don't seem to work when BARs are mmap'd
1910 *
1911 * Here's offset 0x88000...
1912 */
1913 static void vfio_probe_nvidia_bar0_88000_quirk(VFIOPCIDevice *vdev, int nr)
1914 {
1915 PCIDevice *pdev = &vdev->pdev;
1916 VFIOQuirk *quirk;
1917 uint16_t vendor, class;
1918
1919 vendor = pci_get_word(pdev->config + PCI_VENDOR_ID);
1920 class = pci_get_word(pdev->config + PCI_CLASS_DEVICE);
1921
1922 if (nr != 0 || vendor != PCI_VENDOR_ID_NVIDIA ||
1923 class != PCI_CLASS_DISPLAY_VGA) {
1924 return;
1925 }
1926
1927 quirk = g_malloc0(sizeof(*quirk));
1928 quirk->vdev = vdev;
1929 quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
1930 quirk->data.address_match = 0x88000;
1931 quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
1932 quirk->data.bar = nr;
1933
1934 memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_88000_quirk,
1935 quirk, "vfio-nvidia-bar0-88000-quirk",
1936 TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
1937 memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
1938 quirk->data.address_match & TARGET_PAGE_MASK,
1939 &quirk->mem, 1);
1940
1941 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1942
1943 trace_vfio_probe_nvidia_bar0_88000_quirk(vdev->vbasedev.name);
1944 }
1945
1946 /*
1947 * And here's the same for BAR0 offset 0x1800...
1948 */
1949 static void vfio_probe_nvidia_bar0_1800_quirk(VFIOPCIDevice *vdev, int nr)
1950 {
1951 PCIDevice *pdev = &vdev->pdev;
1952 VFIOQuirk *quirk;
1953
1954 if (!vdev->has_vga || nr != 0 ||
1955 pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
1956 return;
1957 }
1958
1959 /* Log the chipset ID */
1960 trace_vfio_probe_nvidia_bar0_1800_quirk_id(
1961 (unsigned int)(vfio_region_read(&vdev->bars[0].region, 0, 4) >> 20)
1962 & 0xff);
1963
1964 quirk = g_malloc0(sizeof(*quirk));
1965 quirk->vdev = vdev;
1966 quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
1967 quirk->data.address_match = 0x1800;
1968 quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
1969 quirk->data.bar = nr;
1970
1971 memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
1972 "vfio-nvidia-bar0-1800-quirk",
1973 TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
1974 memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
1975 quirk->data.address_match & TARGET_PAGE_MASK,
1976 &quirk->mem, 1);
1977
1978 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1979
1980 trace_vfio_probe_nvidia_bar0_1800_quirk(vdev->vbasedev.name);
1981 }
1982
1983 /*
1984 * TODO - Some Nvidia devices provide config access to their companion HDA
1985 * device and even to their parent bridge via these config space mirrors.
1986 * Add quirks for those regions.
1987 */
1988
1989 /*
1990 * Common quirk probe entry points.
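 * Each probe function checks for itself whether its quirk applies (vendor,
 * class, BAR number, VGA availability) and quietly returns when it doesn't,
 * so the setup functions below can simply call every probe for every region.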
1991 */ 1992 static void vfio_vga_quirk_setup(VFIOPCIDevice *vdev) 1993 { 1994 vfio_vga_probe_ati_3c3_quirk(vdev); 1995 vfio_vga_probe_nvidia_3d0_quirk(vdev); 1996 } 1997 1998 static void vfio_vga_quirk_teardown(VFIOPCIDevice *vdev) 1999 { 2000 int i; 2001 2002 for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) { 2003 while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) { 2004 VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks); 2005 memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem); 2006 object_unparent(OBJECT(&quirk->mem)); 2007 QLIST_REMOVE(quirk, next); 2008 g_free(quirk); 2009 } 2010 } 2011 } 2012 2013 static void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) 2014 { 2015 vfio_probe_ati_bar4_window_quirk(vdev, nr); 2016 vfio_probe_ati_bar2_4000_quirk(vdev, nr); 2017 vfio_probe_nvidia_bar5_window_quirk(vdev, nr); 2018 vfio_probe_nvidia_bar0_88000_quirk(vdev, nr); 2019 vfio_probe_nvidia_bar0_1800_quirk(vdev, nr); 2020 vfio_probe_rtl8168_bar2_window_quirk(vdev, nr); 2021 } 2022 2023 static void vfio_bar_quirk_teardown(VFIOPCIDevice *vdev, int nr) 2024 { 2025 VFIOBAR *bar = &vdev->bars[nr]; 2026 2027 while (!QLIST_EMPTY(&bar->quirks)) { 2028 VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks); 2029 memory_region_del_subregion(&bar->region.mem, &quirk->mem); 2030 object_unparent(OBJECT(&quirk->mem)); 2031 QLIST_REMOVE(quirk, next); 2032 g_free(quirk); 2033 } 2034 } 2035 2036 /* 2037 * PCI config space 2038 */ 2039 static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) 2040 { 2041 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); 2042 uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val; 2043 2044 memcpy(&emu_bits, vdev->emulated_config_bits + addr, len); 2045 emu_bits = le32_to_cpu(emu_bits); 2046 2047 if (emu_bits) { 2048 emu_val = pci_default_read_config(pdev, addr, len); 2049 } 2050 2051 if (~emu_bits & (0xffffffffU >> (32 - len * 8))) { 2052 ssize_t ret; 2053 2054 ret = pread(vdev->vbasedev.fd, &phys_val, len, 2055 vdev->config_offset + addr); 2056 if (ret != len) { 2057 error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m", 2058 __func__, vdev->host.domain, vdev->host.bus, 2059 vdev->host.slot, vdev->host.function, addr, len); 2060 return -errno; 2061 } 2062 phys_val = le32_to_cpu(phys_val); 2063 } 2064 2065 val = (emu_val & emu_bits) | (phys_val & ~emu_bits); 2066 2067 trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val); 2068 2069 return val; 2070 } 2071 2072 static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, 2073 uint32_t val, int len) 2074 { 2075 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); 2076 uint32_t val_le = cpu_to_le32(val); 2077 2078 trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len); 2079 2080 /* Write everything to VFIO, let it filter out what we can't write */ 2081 if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr) 2082 != len) { 2083 error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m", 2084 __func__, vdev->host.domain, vdev->host.bus, 2085 vdev->host.slot, vdev->host.function, addr, val, len); 2086 } 2087 2088 /* MSI/MSI-X Enabling/Disabling */ 2089 if (pdev->cap_present & QEMU_PCI_CAP_MSI && 2090 ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) { 2091 int is_enabled, was_enabled = msi_enabled(pdev); 2092 2093 pci_default_write_config(pdev, addr, val, len); 2094 2095 is_enabled = msi_enabled(pdev); 2096 2097 if (!was_enabled) { 2098 if (is_enabled) { 2099 vfio_enable_msi(vdev); 2100 } 2101 } else { 2102 if 
(!is_enabled) { 2103 vfio_disable_msi(vdev); 2104 } else { 2105 vfio_update_msi(vdev); 2106 } 2107 } 2108 } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX && 2109 ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) { 2110 int is_enabled, was_enabled = msix_enabled(pdev); 2111 2112 pci_default_write_config(pdev, addr, val, len); 2113 2114 is_enabled = msix_enabled(pdev); 2115 2116 if (!was_enabled && is_enabled) { 2117 vfio_enable_msix(vdev); 2118 } else if (was_enabled && !is_enabled) { 2119 vfio_disable_msix(vdev); 2120 } 2121 } else { 2122 /* Write everything to QEMU to keep emulated bits correct */ 2123 pci_default_write_config(pdev, addr, val, len); 2124 } 2125 } 2126 2127 /* 2128 * Interrupt setup 2129 */ 2130 static void vfio_disable_interrupts(VFIOPCIDevice *vdev) 2131 { 2132 /* 2133 * More complicated than it looks. Disabling MSI/X transitions the 2134 * device to INTx mode (if supported). Therefore we need to first 2135 * disable MSI/X and then cleanup by disabling INTx. 2136 */ 2137 if (vdev->interrupt == VFIO_INT_MSIX) { 2138 vfio_disable_msix(vdev); 2139 } else if (vdev->interrupt == VFIO_INT_MSI) { 2140 vfio_disable_msi(vdev); 2141 } 2142 2143 if (vdev->interrupt == VFIO_INT_INTx) { 2144 vfio_disable_intx(vdev); 2145 } 2146 } 2147 2148 static int vfio_setup_msi(VFIOPCIDevice *vdev, int pos) 2149 { 2150 uint16_t ctrl; 2151 bool msi_64bit, msi_maskbit; 2152 int ret, entries; 2153 2154 if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl), 2155 vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { 2156 return -errno; 2157 } 2158 ctrl = le16_to_cpu(ctrl); 2159 2160 msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT); 2161 msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT); 2162 entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1); 2163 2164 trace_vfio_setup_msi(vdev->vbasedev.name, pos); 2165 2166 ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit); 2167 if (ret < 0) { 2168 if (ret == -ENOTSUP) { 2169 return 0; 2170 } 2171 error_report("vfio: msi_init failed"); 2172 return ret; 2173 } 2174 vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0); 2175 2176 return 0; 2177 } 2178 2179 /* 2180 * We don't have any control over how pci_add_capability() inserts 2181 * capabilities into the chain. In order to setup MSI-X we need a 2182 * MemoryRegion for the BAR. In order to setup the BAR and not 2183 * attempt to mmap the MSI-X table area, which VFIO won't allow, we 2184 * need to first look for where the MSI-X table lives. So we 2185 * unfortunately split MSI-X setup across two functions. 
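 * For example (values hypothetical): a device reporting an MSI-X table at
 * BIR 2, offset 0x0, with 16 entries tells us the first page of BAR2 must
 * be trapped rather than mmap'd. vfio_early_setup_msix() below caches
 * exactly that table/PBA/BIR information in vdev->msix for later use by
 * vfio_map_bar() and vfio_setup_msix().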
2186 */ 2187 static int vfio_early_setup_msix(VFIOPCIDevice *vdev) 2188 { 2189 uint8_t pos; 2190 uint16_t ctrl; 2191 uint32_t table, pba; 2192 int fd = vdev->vbasedev.fd; 2193 2194 pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX); 2195 if (!pos) { 2196 return 0; 2197 } 2198 2199 if (pread(fd, &ctrl, sizeof(ctrl), 2200 vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { 2201 return -errno; 2202 } 2203 2204 if (pread(fd, &table, sizeof(table), 2205 vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) { 2206 return -errno; 2207 } 2208 2209 if (pread(fd, &pba, sizeof(pba), 2210 vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) { 2211 return -errno; 2212 } 2213 2214 ctrl = le16_to_cpu(ctrl); 2215 table = le32_to_cpu(table); 2216 pba = le32_to_cpu(pba); 2217 2218 vdev->msix = g_malloc0(sizeof(*(vdev->msix))); 2219 vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK; 2220 vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK; 2221 vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK; 2222 vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK; 2223 vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; 2224 2225 trace_vfio_early_setup_msix(vdev->vbasedev.name, pos, 2226 vdev->msix->table_bar, 2227 vdev->msix->table_offset, 2228 vdev->msix->entries); 2229 2230 return 0; 2231 } 2232 2233 static int vfio_setup_msix(VFIOPCIDevice *vdev, int pos) 2234 { 2235 int ret; 2236 2237 ret = msix_init(&vdev->pdev, vdev->msix->entries, 2238 &vdev->bars[vdev->msix->table_bar].region.mem, 2239 vdev->msix->table_bar, vdev->msix->table_offset, 2240 &vdev->bars[vdev->msix->pba_bar].region.mem, 2241 vdev->msix->pba_bar, vdev->msix->pba_offset, pos); 2242 if (ret < 0) { 2243 if (ret == -ENOTSUP) { 2244 return 0; 2245 } 2246 error_report("vfio: msix_init failed"); 2247 return ret; 2248 } 2249 2250 return 0; 2251 } 2252 2253 static void vfio_teardown_msi(VFIOPCIDevice *vdev) 2254 { 2255 msi_uninit(&vdev->pdev); 2256 2257 if (vdev->msix) { 2258 msix_uninit(&vdev->pdev, 2259 &vdev->bars[vdev->msix->table_bar].region.mem, 2260 &vdev->bars[vdev->msix->pba_bar].region.mem); 2261 } 2262 } 2263 2264 /* 2265 * Resource setup 2266 */ 2267 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled) 2268 { 2269 int i; 2270 2271 for (i = 0; i < PCI_ROM_SLOT; i++) { 2272 VFIOBAR *bar = &vdev->bars[i]; 2273 2274 if (!bar->region.size) { 2275 continue; 2276 } 2277 2278 memory_region_set_enabled(&bar->region.mmap_mem, enabled); 2279 if (vdev->msix && vdev->msix->table_bar == i) { 2280 memory_region_set_enabled(&vdev->msix->mmap_mem, enabled); 2281 } 2282 } 2283 } 2284 2285 static void vfio_unmap_bar(VFIOPCIDevice *vdev, int nr) 2286 { 2287 VFIOBAR *bar = &vdev->bars[nr]; 2288 2289 if (!bar->region.size) { 2290 return; 2291 } 2292 2293 vfio_bar_quirk_teardown(vdev, nr); 2294 2295 memory_region_del_subregion(&bar->region.mem, &bar->region.mmap_mem); 2296 munmap(bar->region.mmap, memory_region_size(&bar->region.mmap_mem)); 2297 2298 if (vdev->msix && vdev->msix->table_bar == nr) { 2299 memory_region_del_subregion(&bar->region.mem, &vdev->msix->mmap_mem); 2300 munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem)); 2301 } 2302 } 2303 2304 static void vfio_map_bar(VFIOPCIDevice *vdev, int nr) 2305 { 2306 VFIOBAR *bar = &vdev->bars[nr]; 2307 uint64_t size = bar->region.size; 2308 char name[64]; 2309 uint32_t pci_bar; 2310 uint8_t type; 2311 int ret; 2312 2313 /* Skip both unimplemented BARs and the upper half of 64bit BARS. 
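(vfio reports the upper half of a 64bit BAR as a zero-sized region, so both cases are caught by the size check below)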
*/ 2314 if (!size) { 2315 return; 2316 } 2317 2318 snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d", 2319 vdev->host.domain, vdev->host.bus, vdev->host.slot, 2320 vdev->host.function, nr); 2321 2322 /* Determine what type of BAR this is for registration */ 2323 ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar), 2324 vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr)); 2325 if (ret != sizeof(pci_bar)) { 2326 error_report("vfio: Failed to read BAR %d (%m)", nr); 2327 return; 2328 } 2329 2330 pci_bar = le32_to_cpu(pci_bar); 2331 bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO); 2332 bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64); 2333 type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK : 2334 ~PCI_BASE_ADDRESS_MEM_MASK); 2335 2336 /* A "slow" read/write mapping underlies all BARs */ 2337 memory_region_init_io(&bar->region.mem, OBJECT(vdev), &vfio_region_ops, 2338 bar, name, size); 2339 pci_register_bar(&vdev->pdev, nr, type, &bar->region.mem); 2340 2341 /* 2342 * We can't mmap areas overlapping the MSIX vector table, so we 2343 * potentially insert a direct-mapped subregion before and after it. 2344 */ 2345 if (vdev->msix && vdev->msix->table_bar == nr) { 2346 size = vdev->msix->table_offset & qemu_host_page_mask; 2347 } 2348 2349 strncat(name, " mmap", sizeof(name) - strlen(name) - 1); 2350 if (vfio_mmap_region(OBJECT(vdev), &bar->region, &bar->region.mem, 2351 &bar->region.mmap_mem, &bar->region.mmap, 2352 size, 0, name)) { 2353 error_report("%s unsupported. Performance may be slow", name); 2354 } 2355 2356 if (vdev->msix && vdev->msix->table_bar == nr) { 2357 uint64_t start; 2358 2359 start = HOST_PAGE_ALIGN(vdev->msix->table_offset + 2360 (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE)); 2361 2362 size = start < bar->region.size ? bar->region.size - start : 0; 2363 strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1); 2364 /* VFIOMSIXInfo contains another MemoryRegion for this mapping */ 2365 if (vfio_mmap_region(OBJECT(vdev), &bar->region, &bar->region.mem, 2366 &vdev->msix->mmap_mem, 2367 &vdev->msix->mmap, size, start, name)) { 2368 error_report("%s unsupported. 
Performance may be slow", name); 2369 } 2370 } 2371 2372 vfio_bar_quirk_setup(vdev, nr); 2373 } 2374 2375 static void vfio_map_bars(VFIOPCIDevice *vdev) 2376 { 2377 int i; 2378 2379 for (i = 0; i < PCI_ROM_SLOT; i++) { 2380 vfio_map_bar(vdev, i); 2381 } 2382 2383 if (vdev->has_vga) { 2384 memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem, 2385 OBJECT(vdev), &vfio_vga_ops, 2386 &vdev->vga.region[QEMU_PCI_VGA_MEM], 2387 "vfio-vga-mmio@0xa0000", 2388 QEMU_PCI_VGA_MEM_SIZE); 2389 memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem, 2390 OBJECT(vdev), &vfio_vga_ops, 2391 &vdev->vga.region[QEMU_PCI_VGA_IO_LO], 2392 "vfio-vga-io@0x3b0", 2393 QEMU_PCI_VGA_IO_LO_SIZE); 2394 memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem, 2395 OBJECT(vdev), &vfio_vga_ops, 2396 &vdev->vga.region[QEMU_PCI_VGA_IO_HI], 2397 "vfio-vga-io@0x3c0", 2398 QEMU_PCI_VGA_IO_HI_SIZE); 2399 2400 pci_register_vga(&vdev->pdev, &vdev->vga.region[QEMU_PCI_VGA_MEM].mem, 2401 &vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem, 2402 &vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem); 2403 vfio_vga_quirk_setup(vdev); 2404 } 2405 } 2406 2407 static void vfio_unmap_bars(VFIOPCIDevice *vdev) 2408 { 2409 int i; 2410 2411 for (i = 0; i < PCI_ROM_SLOT; i++) { 2412 vfio_unmap_bar(vdev, i); 2413 } 2414 2415 if (vdev->has_vga) { 2416 vfio_vga_quirk_teardown(vdev); 2417 pci_unregister_vga(&vdev->pdev); 2418 } 2419 } 2420 2421 /* 2422 * General setup 2423 */ 2424 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos) 2425 { 2426 uint8_t tmp, next = 0xff; 2427 2428 for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp; 2429 tmp = pdev->config[tmp + 1]) { 2430 if (tmp > pos && tmp < next) { 2431 next = tmp; 2432 } 2433 } 2434 2435 return next - pos; 2436 } 2437 2438 static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask) 2439 { 2440 pci_set_word(buf, (pci_get_word(buf) & ~mask) | val); 2441 } 2442 2443 static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos, 2444 uint16_t val, uint16_t mask) 2445 { 2446 vfio_set_word_bits(vdev->pdev.config + pos, val, mask); 2447 vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask); 2448 vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask); 2449 } 2450 2451 static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask) 2452 { 2453 pci_set_long(buf, (pci_get_long(buf) & ~mask) | val); 2454 } 2455 2456 static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos, 2457 uint32_t val, uint32_t mask) 2458 { 2459 vfio_set_long_bits(vdev->pdev.config + pos, val, mask); 2460 vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask); 2461 vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask); 2462 } 2463 2464 static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size) 2465 { 2466 uint16_t flags; 2467 uint8_t type; 2468 2469 flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS); 2470 type = (flags & PCI_EXP_FLAGS_TYPE) >> 4; 2471 2472 if (type != PCI_EXP_TYPE_ENDPOINT && 2473 type != PCI_EXP_TYPE_LEG_END && 2474 type != PCI_EXP_TYPE_RC_END) { 2475 2476 error_report("vfio: Assignment of PCIe type 0x%x " 2477 "devices is not currently supported", type); 2478 return -EINVAL; 2479 } 2480 2481 if (!pci_bus_is_express(vdev->pdev.bus)) { 2482 /* 2483 * Use express capability as-is on PCI bus. It doesn't make much 2484 * sense to even expose, but some drivers (ex. tg3) depend on it 2485 * and guests don't seem to be particular about it. 
We'll need 2486 * to revist this or force express devices to express buses if we 2487 * ever expose an IOMMU to the guest. 2488 */ 2489 } else if (pci_bus_is_root(vdev->pdev.bus)) { 2490 /* 2491 * On a Root Complex bus Endpoints become Root Complex Integrated 2492 * Endpoints, which changes the type and clears the LNK & LNK2 fields. 2493 */ 2494 if (type == PCI_EXP_TYPE_ENDPOINT) { 2495 vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS, 2496 PCI_EXP_TYPE_RC_END << 4, 2497 PCI_EXP_FLAGS_TYPE); 2498 2499 /* Link Capabilities, Status, and Control goes away */ 2500 if (size > PCI_EXP_LNKCTL) { 2501 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0); 2502 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0); 2503 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0); 2504 2505 #ifndef PCI_EXP_LNKCAP2 2506 #define PCI_EXP_LNKCAP2 44 2507 #endif 2508 #ifndef PCI_EXP_LNKSTA2 2509 #define PCI_EXP_LNKSTA2 50 2510 #endif 2511 /* Link 2 Capabilities, Status, and Control goes away */ 2512 if (size > PCI_EXP_LNKCAP2) { 2513 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0); 2514 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0); 2515 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0); 2516 } 2517 } 2518 2519 } else if (type == PCI_EXP_TYPE_LEG_END) { 2520 /* 2521 * Legacy endpoints don't belong on the root complex. Windows 2522 * seems to be happier with devices if we skip the capability. 2523 */ 2524 return 0; 2525 } 2526 2527 } else { 2528 /* 2529 * Convert Root Complex Integrated Endpoints to regular endpoints. 2530 * These devices don't support LNK/LNK2 capabilities, so make them up. 2531 */ 2532 if (type == PCI_EXP_TYPE_RC_END) { 2533 vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS, 2534 PCI_EXP_TYPE_ENDPOINT << 4, 2535 PCI_EXP_FLAGS_TYPE); 2536 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 2537 PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25, ~0); 2538 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0); 2539 } 2540 2541 /* Mark the Link Status bits as emulated to allow virtual negotiation */ 2542 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 2543 pci_get_word(vdev->pdev.config + pos + 2544 PCI_EXP_LNKSTA), 2545 PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS); 2546 } 2547 2548 pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size); 2549 if (pos >= 0) { 2550 vdev->pdev.exp.exp_cap = pos; 2551 } 2552 2553 return pos; 2554 } 2555 2556 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos) 2557 { 2558 uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP); 2559 2560 if (cap & PCI_EXP_DEVCAP_FLR) { 2561 trace_vfio_check_pcie_flr(vdev->vbasedev.name); 2562 vdev->has_flr = true; 2563 } 2564 } 2565 2566 static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos) 2567 { 2568 uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL); 2569 2570 if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) { 2571 trace_vfio_check_pm_reset(vdev->vbasedev.name); 2572 vdev->has_pm_reset = true; 2573 } 2574 } 2575 2576 static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos) 2577 { 2578 uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP); 2579 2580 if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) { 2581 trace_vfio_check_af_flr(vdev->vbasedev.name); 2582 vdev->has_flr = true; 2583 } 2584 } 2585 2586 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) 2587 { 2588 PCIDevice *pdev = &vdev->pdev; 2589 uint8_t cap_id, next, size; 2590 int ret; 2591 2592 cap_id = pdev->config[pos]; 2593 next = pdev->config[pos + 1]; 
2594
2595 /*
2596 * If it becomes important to configure capabilities to their actual
2597 * size, use this as the default when it's something we don't recognize.
2598 * Since QEMU doesn't actually handle many of the config accesses,
2599 * exact size doesn't seem worthwhile.
2600 */
2601 size = vfio_std_cap_max_size(pdev, pos);
2602
2603 /*
2604 * pci_add_capability always inserts the new capability at the head
2605 * of the chain. Therefore, to end up with a chain that matches the
2606 * physical device, we insert from the end by making this recursive.
2607 * This is also why we pre-calculate size above, as cached config space
2608 * will be changed as we unwind the stack.
2609 */
2610 if (next) {
2611 ret = vfio_add_std_cap(vdev, next);
2612 if (ret) {
2613 return ret;
2614 }
2615 } else {
2616 /* Begin the rebuild, use QEMU emulated list bits */
2617 pdev->config[PCI_CAPABILITY_LIST] = 0;
2618 vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
2619 vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
2620 }
2621
2622 /* Use emulated next pointer to allow dropping caps */
2623 pci_set_byte(vdev->emulated_config_bits + pos + 1, 0xff);
2624
2625 switch (cap_id) {
2626 case PCI_CAP_ID_MSI:
2627 ret = vfio_setup_msi(vdev, pos);
2628 break;
2629 case PCI_CAP_ID_EXP:
2630 vfio_check_pcie_flr(vdev, pos);
2631 ret = vfio_setup_pcie_cap(vdev, pos, size);
2632 break;
2633 case PCI_CAP_ID_MSIX:
2634 ret = vfio_setup_msix(vdev, pos);
2635 break;
2636 case PCI_CAP_ID_PM:
2637 vfio_check_pm_reset(vdev, pos);
2638 vdev->pm_cap = pos;
2639 ret = pci_add_capability(pdev, cap_id, pos, size);
2640 break;
2641 case PCI_CAP_ID_AF:
2642 vfio_check_af_flr(vdev, pos);
2643 ret = pci_add_capability(pdev, cap_id, pos, size);
2644 break;
2645 default:
2646 ret = pci_add_capability(pdev, cap_id, pos, size);
2647 break;
2648 }
2649
2650 if (ret < 0) {
2651 error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
2652 "0x%x[0x%x]@0x%x: %d", vdev->host.domain,
2653 vdev->host.bus, vdev->host.slot, vdev->host.function,
2654 cap_id, size, pos, ret);
2655 return ret;
2656 }
2657
2658 return 0;
2659 }
2660
2661 static int vfio_add_capabilities(VFIOPCIDevice *vdev)
2662 {
2663 PCIDevice *pdev = &vdev->pdev;
2664
2665 if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
2666 !pdev->config[PCI_CAPABILITY_LIST]) {
2667 return 0; /* Nothing to add */
2668 }
2669
2670 return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
2671 }
2672
2673 static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
2674 {
2675 PCIDevice *pdev = &vdev->pdev;
2676 uint16_t cmd;
2677
2678 vfio_disable_interrupts(vdev);
2679
2680 /* Make sure the device is in D0 */
2681 if (vdev->pm_cap) {
2682 uint16_t pmcsr;
2683 uint8_t state;
2684
2685 pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
2686 state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2687 if (state) {
2688 pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
2689 vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
2690 /* vfio handles the necessary delay here */
2691 pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
2692 state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2693 if (state) {
2694 error_report("vfio: Unable to power on device, stuck in D%d",
2695 state);
2696 }
2697 }
2698 }
2699
2700 /*
2701 * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
2702 * Also put INTx Disable in a known state.
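 * (the read-modify-write below clears PCI_COMMAND bits 0-2 and bit 10,
 * i.e. I/O, memory, bus master, and INTx Disable, in a single config write)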
2703 */ 2704 cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); 2705 cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | 2706 PCI_COMMAND_INTX_DISABLE); 2707 vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); 2708 } 2709 2710 static void vfio_pci_post_reset(VFIOPCIDevice *vdev) 2711 { 2712 vfio_enable_intx(vdev); 2713 } 2714 2715 static bool vfio_pci_host_match(PCIHostDeviceAddress *host1, 2716 PCIHostDeviceAddress *host2) 2717 { 2718 return (host1->domain == host2->domain && host1->bus == host2->bus && 2719 host1->slot == host2->slot && host1->function == host2->function); 2720 } 2721 2722 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) 2723 { 2724 VFIOGroup *group; 2725 struct vfio_pci_hot_reset_info *info; 2726 struct vfio_pci_dependent_device *devices; 2727 struct vfio_pci_hot_reset *reset; 2728 int32_t *fds; 2729 int ret, i, count; 2730 bool multi = false; 2731 2732 trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); 2733 2734 vfio_pci_pre_reset(vdev); 2735 vdev->vbasedev.needs_reset = false; 2736 2737 info = g_malloc0(sizeof(*info)); 2738 info->argsz = sizeof(*info); 2739 2740 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); 2741 if (ret && errno != ENOSPC) { 2742 ret = -errno; 2743 if (!vdev->has_pm_reset) { 2744 error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, " 2745 "no available reset mechanism.", vdev->host.domain, 2746 vdev->host.bus, vdev->host.slot, vdev->host.function); 2747 } 2748 goto out_single; 2749 } 2750 2751 count = info->count; 2752 info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices))); 2753 info->argsz = sizeof(*info) + (count * sizeof(*devices)); 2754 devices = &info->devices[0]; 2755 2756 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); 2757 if (ret) { 2758 ret = -errno; 2759 error_report("vfio: hot reset info failed: %m"); 2760 goto out_single; 2761 } 2762 2763 trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); 2764 2765 /* Verify that we have all the groups required */ 2766 for (i = 0; i < info->count; i++) { 2767 PCIHostDeviceAddress host; 2768 VFIOPCIDevice *tmp; 2769 VFIODevice *vbasedev_iter; 2770 2771 host.domain = devices[i].segment; 2772 host.bus = devices[i].bus; 2773 host.slot = PCI_SLOT(devices[i].devfn); 2774 host.function = PCI_FUNC(devices[i].devfn); 2775 2776 trace_vfio_pci_hot_reset_dep_devices(host.domain, 2777 host.bus, host.slot, host.function, devices[i].group_id); 2778 2779 if (vfio_pci_host_match(&host, &vdev->host)) { 2780 continue; 2781 } 2782 2783 QLIST_FOREACH(group, &vfio_group_list, next) { 2784 if (group->groupid == devices[i].group_id) { 2785 break; 2786 } 2787 } 2788 2789 if (!group) { 2790 if (!vdev->has_pm_reset) { 2791 error_report("vfio: Cannot reset device %s, " 2792 "depends on group %d which is not owned.", 2793 vdev->vbasedev.name, devices[i].group_id); 2794 } 2795 ret = -EPERM; 2796 goto out; 2797 } 2798 2799 /* Prep dependent devices for reset and clear our marker. 
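Any other in-use device we own forces multi mode; in single mode such a device makes us bail out with -EINVAL below.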
*/
2800 QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
2801 if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
2802 continue;
2803 }
2804 tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
2805 if (vfio_pci_host_match(&host, &tmp->host)) {
2806 if (single) {
2807 ret = -EINVAL;
2808 goto out_single;
2809 }
2810 vfio_pci_pre_reset(tmp);
2811 tmp->vbasedev.needs_reset = false;
2812 multi = true;
2813 break;
2814 }
2815 }
2816 }
2817
2818 if (!single && !multi) {
2819 ret = -EINVAL;
2820 goto out_single;
2821 }
2822
2823 /* Determine how many group fds need to be passed */
2824 count = 0;
2825 QLIST_FOREACH(group, &vfio_group_list, next) {
2826 for (i = 0; i < info->count; i++) {
2827 if (group->groupid == devices[i].group_id) {
2828 count++;
2829 break;
2830 }
2831 }
2832 }
2833
2834 reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
2835 reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
2836 fds = &reset->group_fds[0];
2837
2838 /* Fill in group fds */
2839 QLIST_FOREACH(group, &vfio_group_list, next) {
2840 for (i = 0; i < info->count; i++) {
2841 if (group->groupid == devices[i].group_id) {
2842 fds[reset->count++] = group->fd;
2843 break;
2844 }
2845 }
2846 }
2847
2848 /* Bus reset! */
2849 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
2850 g_free(reset);
2851
2852 trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
2853 ret ? "%m" : "Success");
2854
2855 out:
2856 /* Re-enable INTx on affected devices */
2857 for (i = 0; i < info->count; i++) {
2858 PCIHostDeviceAddress host;
2859 VFIOPCIDevice *tmp;
2860 VFIODevice *vbasedev_iter;
2861
2862 host.domain = devices[i].segment;
2863 host.bus = devices[i].bus;
2864 host.slot = PCI_SLOT(devices[i].devfn);
2865 host.function = PCI_FUNC(devices[i].devfn);
2866
2867 if (vfio_pci_host_match(&host, &vdev->host)) {
2868 continue;
2869 }
2870
2871 QLIST_FOREACH(group, &vfio_group_list, next) {
2872 if (group->groupid == devices[i].group_id) {
2873 break;
2874 }
2875 }
2876
2877 if (!group) {
2878 break;
2879 }
2880
2881 QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
2882 if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
2883 continue;
2884 }
2885 tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
2886 if (vfio_pci_host_match(&host, &tmp->host)) {
2887 vfio_pci_post_reset(tmp);
2888 break;
2889 }
2890 }
2891 }
2892 out_single:
2893 vfio_pci_post_reset(vdev);
2894 g_free(info);
2895
2896 return ret;
2897 }
2898
2899 /*
2900 * We want to differentiate hot reset of multiple in-use devices vs hot reset
2901 * of a single in-use device. VFIO_DEVICE_RESET will already handle the case
2902 * of doing hot resets when there is only a single device per bus. "In-use"
2903 * here refers to how many VFIODevices are affected. A hot reset that affects
2904 * multiple devices, but only a single in-use device, means that we can call
2905 * it from our bus ->reset() callback since the extent is effectively a single
2906 * device. This allows us to make use of it in the hotplug path. When there
2907 * are multiple in-use devices, we can only trigger the hot reset during a
2908 * system reset and thus from our reset handler. We separate _one vs _multi
2909 * here so that we don't overlap and do a double reset on the system reset
2910 * path where both our reset handler and ->reset() callback are used. Calling
2911 * _one() will only do a hot reset for the one in-use device case; calling
2912 * _multi() will do nothing if a _one() would have been sufficient.
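 * For example: a GPU that is alone on its bus can be reset via _one() in
 * the hotplug path, but if two in-use functions share the affected bus,
 * _one() fails with -EINVAL and only _multi(), called from the system
 * reset handler, performs the hot reset for both.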
2913 */ 2914 static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev) 2915 { 2916 return vfio_pci_hot_reset(vdev, true); 2917 } 2918 2919 static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev) 2920 { 2921 VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); 2922 return vfio_pci_hot_reset(vdev, false); 2923 } 2924 2925 static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev) 2926 { 2927 VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); 2928 if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) { 2929 vbasedev->needs_reset = true; 2930 } 2931 } 2932 2933 static VFIODeviceOps vfio_pci_ops = { 2934 .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, 2935 .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, 2936 .vfio_eoi = vfio_eoi, 2937 .vfio_populate_device = vfio_populate_device, 2938 }; 2939 2940 static int vfio_populate_device(VFIODevice *vbasedev) 2941 { 2942 VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); 2943 struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) }; 2944 struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; 2945 int i, ret = -1; 2946 2947 /* Sanity check device */ 2948 if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) { 2949 error_report("vfio: Um, this isn't a PCI device"); 2950 goto error; 2951 } 2952 2953 if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) { 2954 error_report("vfio: unexpected number of io regions %u", 2955 vbasedev->num_regions); 2956 goto error; 2957 } 2958 2959 if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) { 2960 error_report("vfio: unexpected number of irqs %u", vbasedev->num_irqs); 2961 goto error; 2962 } 2963 2964 for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) { 2965 reg_info.index = i; 2966 2967 ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); 2968 if (ret) { 2969 error_report("vfio: Error getting region %d info: %m", i); 2970 goto error; 2971 } 2972 2973 trace_vfio_populate_device_region(vbasedev->name, i, 2974 (unsigned long)reg_info.size, 2975 (unsigned long)reg_info.offset, 2976 (unsigned long)reg_info.flags); 2977 2978 vdev->bars[i].region.vbasedev = vbasedev; 2979 vdev->bars[i].region.flags = reg_info.flags; 2980 vdev->bars[i].region.size = reg_info.size; 2981 vdev->bars[i].region.fd_offset = reg_info.offset; 2982 vdev->bars[i].region.nr = i; 2983 QLIST_INIT(&vdev->bars[i].quirks); 2984 } 2985 2986 reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX; 2987 2988 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); 2989 if (ret) { 2990 error_report("vfio: Error getting config info: %m"); 2991 goto error; 2992 } 2993 2994 trace_vfio_populate_device_config(vdev->vbasedev.name, 2995 (unsigned long)reg_info.size, 2996 (unsigned long)reg_info.offset, 2997 (unsigned long)reg_info.flags); 2998 2999 vdev->config_size = reg_info.size; 3000 if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) { 3001 vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS; 3002 } 3003 vdev->config_offset = reg_info.offset; 3004 3005 if ((vdev->features & VFIO_FEATURE_ENABLE_VGA) && 3006 vbasedev->num_regions > VFIO_PCI_VGA_REGION_INDEX) { 3007 struct vfio_region_info vga_info = { 3008 .argsz = sizeof(vga_info), 3009 .index = VFIO_PCI_VGA_REGION_INDEX, 3010 }; 3011 3012 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &vga_info); 3013 if (ret) { 3014 error_report( 3015 "vfio: Device does not support requested feature x-vga"); 3016 goto error; 3017 } 3018 3019 if (!(vga_info.flags & 
VFIO_REGION_INFO_FLAG_READ) || 3020 !(vga_info.flags & VFIO_REGION_INFO_FLAG_WRITE) || 3021 vga_info.size < 0xbffff + 1) { 3022 error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx", 3023 (unsigned long)vga_info.flags, 3024 (unsigned long)vga_info.size); 3025 goto error; 3026 } 3027 3028 vdev->vga.fd_offset = vga_info.offset; 3029 vdev->vga.fd = vdev->vbasedev.fd; 3030 3031 vdev->vga.region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE; 3032 vdev->vga.region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM; 3033 QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_MEM].quirks); 3034 3035 vdev->vga.region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE; 3036 vdev->vga.region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO; 3037 QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].quirks); 3038 3039 vdev->vga.region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE; 3040 vdev->vga.region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI; 3041 QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks); 3042 3043 vdev->has_vga = true; 3044 } 3045 irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; 3046 3047 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); 3048 if (ret) { 3049 /* This can fail for an old kernel or legacy PCI dev */ 3050 trace_vfio_populate_device_get_irq_info_failure(); 3051 ret = 0; 3052 } else if (irq_info.count == 1) { 3053 vdev->pci_aer = true; 3054 } else { 3055 error_report("vfio: %s " 3056 "Could not enable error recovery for the device", 3057 vbasedev->name); 3058 } 3059 3060 error: 3061 return ret; 3062 } 3063 3064 static void vfio_put_device(VFIOPCIDevice *vdev) 3065 { 3066 g_free(vdev->vbasedev.name); 3067 if (vdev->msix) { 3068 g_free(vdev->msix); 3069 vdev->msix = NULL; 3070 } 3071 vfio_put_base_device(&vdev->vbasedev); 3072 } 3073 3074 static void vfio_err_notifier_handler(void *opaque) 3075 { 3076 VFIOPCIDevice *vdev = opaque; 3077 3078 if (!event_notifier_test_and_clear(&vdev->err_notifier)) { 3079 return; 3080 } 3081 3082 /* 3083 * TBD. Retrieve the error details and decide what action 3084 * needs to be taken. One of the actions could be to pass 3085 * the error to the guest and have the guest driver recover 3086 * from the error. This requires that PCIe capabilities be 3087 * exposed to the guest. For now, we just terminate the 3088 * guest to contain the error. 3089 */ 3090 3091 error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. " 3092 "Please collect any data possible and then kill the guest", 3093 __func__, vdev->host.domain, vdev->host.bus, 3094 vdev->host.slot, vdev->host.function); 3095 3096 vm_stop(RUN_STATE_INTERNAL_ERROR); 3097 } 3098 3099 /* 3100 * Registers error notifier for devices supporting error recovery. 3101 * If we encounter a failure in this function, we report an error 3102 * and continue after disabling error recovery support for the 3103 * device. 
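 * The registration below packs a single eventfd into a VFIO_DEVICE_SET_IRQS
 * call on VFIO_PCI_ERR_IRQ_INDEX; the kernel signals that eventfd when it
 * detects an error on the device, which in turn runs
 * vfio_err_notifier_handler() above.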
3104 */ 3105 static void vfio_register_err_notifier(VFIOPCIDevice *vdev) 3106 { 3107 int ret; 3108 int argsz; 3109 struct vfio_irq_set *irq_set; 3110 int32_t *pfd; 3111 3112 if (!vdev->pci_aer) { 3113 return; 3114 } 3115 3116 if (event_notifier_init(&vdev->err_notifier, 0)) { 3117 error_report("vfio: Unable to init event notifier for error detection"); 3118 vdev->pci_aer = false; 3119 return; 3120 } 3121 3122 argsz = sizeof(*irq_set) + sizeof(*pfd); 3123 3124 irq_set = g_malloc0(argsz); 3125 irq_set->argsz = argsz; 3126 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | 3127 VFIO_IRQ_SET_ACTION_TRIGGER; 3128 irq_set->index = VFIO_PCI_ERR_IRQ_INDEX; 3129 irq_set->start = 0; 3130 irq_set->count = 1; 3131 pfd = (int32_t *)&irq_set->data; 3132 3133 *pfd = event_notifier_get_fd(&vdev->err_notifier); 3134 qemu_set_fd_handler(*pfd, vfio_err_notifier_handler, NULL, vdev); 3135 3136 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); 3137 if (ret) { 3138 error_report("vfio: Failed to set up error notification"); 3139 qemu_set_fd_handler(*pfd, NULL, NULL, vdev); 3140 event_notifier_cleanup(&vdev->err_notifier); 3141 vdev->pci_aer = false; 3142 } 3143 g_free(irq_set); 3144 } 3145 3146 static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev) 3147 { 3148 int argsz; 3149 struct vfio_irq_set *irq_set; 3150 int32_t *pfd; 3151 int ret; 3152 3153 if (!vdev->pci_aer) { 3154 return; 3155 } 3156 3157 argsz = sizeof(*irq_set) + sizeof(*pfd); 3158 3159 irq_set = g_malloc0(argsz); 3160 irq_set->argsz = argsz; 3161 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | 3162 VFIO_IRQ_SET_ACTION_TRIGGER; 3163 irq_set->index = VFIO_PCI_ERR_IRQ_INDEX; 3164 irq_set->start = 0; 3165 irq_set->count = 1; 3166 pfd = (int32_t *)&irq_set->data; 3167 *pfd = -1; 3168 3169 ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); 3170 if (ret) { 3171 error_report("vfio: Failed to de-assign error fd: %m"); 3172 } 3173 g_free(irq_set); 3174 qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier), 3175 NULL, NULL, vdev); 3176 event_notifier_cleanup(&vdev->err_notifier); 3177 } 3178 3179 static int vfio_initfn(PCIDevice *pdev) 3180 { 3181 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); 3182 VFIODevice *vbasedev_iter; 3183 VFIOGroup *group; 3184 char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name; 3185 ssize_t len; 3186 struct stat st; 3187 int groupid; 3188 int ret; 3189 3190 /* Check that the host device exists */ 3191 snprintf(path, sizeof(path), 3192 "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/", 3193 vdev->host.domain, vdev->host.bus, vdev->host.slot, 3194 vdev->host.function); 3195 if (stat(path, &st) < 0) { 3196 error_report("vfio: error: no such host device: %s", path); 3197 return -errno; 3198 } 3199 3200 vdev->vbasedev.ops = &vfio_pci_ops; 3201 3202 vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI; 3203 vdev->vbasedev.name = g_strdup_printf("%04x:%02x:%02x.%01x", 3204 vdev->host.domain, vdev->host.bus, 3205 vdev->host.slot, vdev->host.function); 3206 3207 strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1); 3208 3209 len = readlink(path, iommu_group_path, sizeof(path)); 3210 if (len <= 0 || len >= sizeof(path)) { 3211 error_report("vfio: error no iommu_group for device"); 3212 return len < 0 ? 
-errno : -ENAMETOOLONG;
3213 }
3214
3215 iommu_group_path[len] = 0;
3216 group_name = basename(iommu_group_path);
3217
3218 if (sscanf(group_name, "%d", &groupid) != 1) {
3219 error_report("vfio: error reading %s: %m", path);
3220 return -errno;
3221 }
3222
3223 trace_vfio_initfn(vdev->vbasedev.name, groupid);
3224
3225 group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev));
3226 if (!group) {
3227 error_report("vfio: failed to get group %d", groupid);
3228 return -ENOENT;
3229 }
3230
3231 snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
3232 vdev->host.domain, vdev->host.bus, vdev->host.slot,
3233 vdev->host.function);
3234
3235 QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
3236 if (strcmp(vbasedev_iter->name, vdev->vbasedev.name) == 0) {
3237 error_report("vfio: error: device %s is already attached", path);
3238 vfio_put_group(group);
3239 return -EBUSY;
3240 }
3241 }
3242
3243 ret = vfio_get_device(group, path, &vdev->vbasedev);
3244 if (ret) {
3245 error_report("vfio: failed to get device %s", path);
3246 vfio_put_group(group);
3247 return ret;
3248 }
3249
3250 /* Get a copy of config space */
3251 ret = pread(vdev->vbasedev.fd, vdev->pdev.config,
3252 MIN(pci_config_size(&vdev->pdev), vdev->config_size),
3253 vdev->config_offset);
3254 if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
3255 ret = ret < 0 ? -errno : -EFAULT;
3256 error_report("vfio: Failed to read device config space");
3257 goto out_put;
3258 }
3259
3260 /* vfio emulates a lot for us, but some bits need extra love */
3261 vdev->emulated_config_bits = g_malloc0(vdev->config_size);
3262
3263 /* QEMU can choose to expose the ROM or not */
3264 memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
3265
3266 /* QEMU can change multi-function devices to single function, or reverse */
3267 vdev->emulated_config_bits[PCI_HEADER_TYPE] =
3268 PCI_HEADER_TYPE_MULTI_FUNCTION;
3269
3270 /* Restore or clear multifunction; this is always controlled by QEMU */
3271 if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
3272 vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
3273 } else {
3274 vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
3275 }
3276
3277 /*
3278 * Clear host resource mapping info. If we choose not to register a
3279 * BAR, such as might be the case with the option ROM, we can get
3280 * confusing, unwritable, residual addresses from the host here.
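 * (the 24-byte memset below covers the six 32-bit BARs at config offsets
 * 0x10-0x27; the expansion ROM address at 0x30 is cleared separately)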
3281 */ 3282 memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24); 3283 memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4); 3284 3285 vfio_pci_size_rom(vdev); 3286 3287 ret = vfio_early_setup_msix(vdev); 3288 if (ret) { 3289 goto out_put; 3290 } 3291 3292 vfio_map_bars(vdev); 3293 3294 ret = vfio_add_capabilities(vdev); 3295 if (ret) { 3296 goto out_teardown; 3297 } 3298 3299 /* QEMU emulates all of MSI & MSIX */ 3300 if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { 3301 memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff, 3302 MSIX_CAP_LENGTH); 3303 } 3304 3305 if (pdev->cap_present & QEMU_PCI_CAP_MSI) { 3306 memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff, 3307 vdev->msi_cap_size); 3308 } 3309 3310 if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) { 3311 vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, 3312 vfio_intx_mmap_enable, vdev); 3313 pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq); 3314 ret = vfio_enable_intx(vdev); 3315 if (ret) { 3316 goto out_teardown; 3317 } 3318 } 3319 3320 vfio_register_err_notifier(vdev); 3321 3322 return 0; 3323 3324 out_teardown: 3325 pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); 3326 vfio_teardown_msi(vdev); 3327 vfio_unmap_bars(vdev); 3328 out_put: 3329 g_free(vdev->emulated_config_bits); 3330 vfio_put_device(vdev); 3331 vfio_put_group(group); 3332 return ret; 3333 } 3334 3335 static void vfio_exitfn(PCIDevice *pdev) 3336 { 3337 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); 3338 VFIOGroup *group = vdev->vbasedev.group; 3339 3340 vfio_unregister_err_notifier(vdev); 3341 pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); 3342 vfio_disable_interrupts(vdev); 3343 if (vdev->intx.mmap_timer) { 3344 timer_free(vdev->intx.mmap_timer); 3345 } 3346 vfio_teardown_msi(vdev); 3347 vfio_unmap_bars(vdev); 3348 g_free(vdev->emulated_config_bits); 3349 g_free(vdev->rom); 3350 vfio_put_device(vdev); 3351 vfio_put_group(group); 3352 } 3353 3354 static void vfio_pci_reset(DeviceState *dev) 3355 { 3356 PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev); 3357 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); 3358 3359 trace_vfio_pci_reset(vdev->vbasedev.name); 3360 3361 vfio_pci_pre_reset(vdev); 3362 3363 if (vdev->vbasedev.reset_works && 3364 (vdev->has_flr || !vdev->has_pm_reset) && 3365 !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) { 3366 trace_vfio_pci_reset_flr(vdev->vbasedev.name); 3367 goto post_reset; 3368 } 3369 3370 /* See if we can do our own bus reset */ 3371 if (!vfio_pci_hot_reset_one(vdev)) { 3372 goto post_reset; 3373 } 3374 3375 /* If nothing else works and the device supports PM reset, use it */ 3376 if (vdev->vbasedev.reset_works && vdev->has_pm_reset && 3377 !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) { 3378 trace_vfio_pci_reset_pm(vdev->vbasedev.name); 3379 goto post_reset; 3380 } 3381 3382 post_reset: 3383 vfio_pci_post_reset(vdev); 3384 } 3385 3386 static void vfio_instance_init(Object *obj) 3387 { 3388 PCIDevice *pci_dev = PCI_DEVICE(obj); 3389 VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, PCI_DEVICE(obj)); 3390 3391 device_add_bootindex_property(obj, &vdev->bootindex, 3392 "bootindex", NULL, 3393 &pci_dev->qdev, NULL); 3394 } 3395 3396 static Property vfio_pci_dev_properties[] = { 3397 DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), 3398 DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice, 3399 intx.mmap_timeout, 1100), 3400 DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features, 3401 VFIO_FEATURE_ENABLE_VGA_BIT, false), 3402 
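/* x-vga above sets VFIO_FEATURE_ENABLE_VGA, which vfio_populate_device() requires before probing the VGA region */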
DEFINE_PROP_INT32("bootindex", VFIOPCIDevice, bootindex, -1),
3403 /*
3404 * TODO - support passed fds... is this necessary?
3405 * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
3406 * DEFINE_PROP_STRING("vfiogroupfd", VFIOPCIDevice, vfiogroupfd_name),
3407 */
3408 DEFINE_PROP_END_OF_LIST(),
3409 };
3410
3411 static const VMStateDescription vfio_pci_vmstate = {
3412 .name = "vfio-pci",
3413 .unmigratable = 1,
3414 };
3415
3416 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
3417 {
3418 DeviceClass *dc = DEVICE_CLASS(klass);
3419 PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3420
3421 dc->reset = vfio_pci_reset;
3422 dc->props = vfio_pci_dev_properties;
3423 dc->vmsd = &vfio_pci_vmstate;
3424 dc->desc = "VFIO-based PCI device assignment";
3425 set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3426 pdc->init = vfio_initfn;
3427 pdc->exit = vfio_exitfn;
3428 pdc->config_read = vfio_pci_read_config;
3429 pdc->config_write = vfio_pci_write_config;
3430 pdc->is_express = 1; /* We might be */
3431 }
3432
3433 static const TypeInfo vfio_pci_dev_info = {
3434 .name = "vfio-pci",
3435 .parent = TYPE_PCI_DEVICE,
3436 .instance_size = sizeof(VFIOPCIDevice),
3437 .class_init = vfio_pci_dev_class_init,
3438 .instance_init = vfio_instance_init,
3439 };
3440
3441 static void register_vfio_pci_dev_type(void)
3442 {
3443 type_register_static(&vfio_pci_dev_info);
3444 }
3445
3446 type_init(register_vfio_pci_dev_type)
3447
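/*
 * Usage sketch (not part of the build): once the device's IOMMU group is
 * bound to the host vfio-pci driver, it can be assigned to a guest with,
 * e.g.:
 *
 *   qemu-system-x86_64 ... -device vfio-pci,host=01:00.0,x-vga=on
 *
 * "host", "x-vga" and "bootindex" correspond to the entries in
 * vfio_pci_dev_properties above.
 */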