#include "kvm/irq.h"
#include "kvm/kvm.h"
#include "kvm/kvm-cpu.h"
#include "kvm/vfio.h"

#include <assert.h>

#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <sys/resource.h>
#include <sys/time.h>

/* Wrapper around UAPI vfio_irq_set */
union vfio_irq_eventfd {
	struct vfio_irq_set irq;
	u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)];
};

static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd)
{
	memcpy(&evfd->irq.data, &fd, sizeof(fd));
}

#define msi_is_enabled(state)	((state) & VFIO_PCI_MSI_STATE_ENABLED)
#define msi_is_masked(state)	((state) & VFIO_PCI_MSI_STATE_MASKED)
#define msi_is_empty(state)	((state) & VFIO_PCI_MSI_STATE_EMPTY)

#define msi_update_state(state, val, bit)				\
	(state) = (val) ? (state) | bit : (state) & ~bit;
#define msi_set_enabled(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
#define msi_set_masked(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
#define msi_set_empty(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)

static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);
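
/*
 * Each MSI/MSI-X capability, and each individual vector, carries two state
 * fields: virt_state reflects what the guest has written to the virtual
 * capability, while phys_state tracks what has actually been programmed into
 * the device through VFIO. The enabled/masked/empty bits of the two are
 * compared to decide when a VFIO_DEVICE_SET_IRQS ioctl is really needed.
 */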
static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
				bool msix)
{
	size_t i;
	int ret = 0;
	int *eventfds;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	union vfio_irq_eventfd single = {
		.irq = {
			.argsz = sizeof(single),
			.flags = VFIO_IRQ_SET_DATA_EVENTFD |
				 VFIO_IRQ_SET_ACTION_TRIGGER,
			.index = msis->info.index,
			.count = 1,
		},
	};

	if (!msi_is_enabled(msis->virt_state))
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		/*
		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
		 * time. Since INTx has to be enabled from the start (we don't
		 * have a reliable way to know when the guest starts using it),
		 * disable it now.
		 */
		vfio_pci_disable_intx(kvm, vdev);

	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

	/*
	 * Initial registration of the full range. This enables the physical
	 * MSI/MSI-X capability, which might have desired side effects. For
	 * instance when assigning virtio legacy devices, enabling the MSI
	 * capability modifies the config space layout!
	 *
	 * As an optimization, only update MSIs when the guest unmasks the
	 * capability. This greatly reduces the initialization time for a Linux
	 * guest with 2048+ MSIs. Linux starts by enabling the MSI-X cap
	 * masked, then fills individual vectors, then unmasks the whole
	 * function. So we only do one VFIO ioctl when enabling for the first
	 * time, and then one when unmasking.
	 *
	 * phys_state is empty when the capability is enabled but no vector has
	 * been registered via SET_IRQS yet.
	 */
	if (!msi_is_enabled(msis->phys_state) ||
	    (!msi_is_masked(msis->virt_state) &&
	     msi_is_empty(msis->phys_state))) {
		bool empty = true;

		for (i = 0; i < msis->nr_entries; i++) {
			eventfds[i] = msis->entries[i].gsi >= 0 ?
				      msis->entries[i].eventfd : -1;

			if (eventfds[i] >= 0)
				empty = false;
		}

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(multi)");
			return ret;
		}

		msi_set_enabled(msis->phys_state, true);
		msi_set_empty(msis->phys_state, empty);

		return 0;
	}

	if (msi_is_masked(msis->virt_state)) {
		/* TODO: if phys_state is not empty nor masked, mask all vectors */
		return 0;
	}

	/* Update individual vectors to avoid breaking those in use */
	for (i = 0; i < msis->nr_entries; i++) {
		struct vfio_pci_msi_entry *entry = &msis->entries[i];
		int fd = entry->gsi >= 0 ? entry->eventfd : -1;

		if (fd == eventfds[i])
			continue;

		single.irq.start = i;
		set_vfio_irq_eventd_payload(&single, fd);

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(single)");
			break;
		}

		eventfds[i] = fd;

		if (msi_is_empty(msis->phys_state) && fd >= 0)
			msi_set_empty(msis->phys_state, false);
	}

	return ret;
}

static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
				 bool msix)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	struct vfio_irq_set irq_set = {
		.argsz = sizeof(irq_set),
		.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = msis->info.index,
		.start = 0,
		.count = 0,
	};

	if (!msi_is_enabled(msis->phys_state))
		return 0;

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	if (ret < 0) {
		perror("VFIO_DEVICE_SET_IRQS(NONE)");
		return ret;
	}

	msi_set_enabled(msis->phys_state, false);
	msi_set_empty(msis->phys_state, true);

	/*
	 * This may also be called when the guest PCI driver detects an MSI
	 * failure and wants to roll back to INTx mode. In that case, re-enable
	 * INTx if the device supports it.
	 */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		ret = vfio_pci_enable_intx(kvm, vdev);

	return ret >= 0 ? 0 : ret;
}
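
/*
 * Lazily allocate the eventfd and GSI route for a vector the first time the
 * guest configures it, then keep the KVM routing entry and irqfd in sync with
 * the guest-visible message and mask bit.
 */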
static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
				     struct vfio_pci_msi_entry *entry)
{
	int ret;

	if (entry->eventfd < 0) {
		entry->eventfd = eventfd(0, 0);
		if (entry->eventfd < 0) {
			ret = -errno;
			vfio_dev_err(vdev, "cannot create eventfd");
			return ret;
		}
	}

	/* Allocate IRQ if necessary */
	if (entry->gsi < 0) {
		int ret = irq__add_msix_route(kvm, &entry->config.msg,
					      vdev->dev_hdr.dev_num << 3);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot create MSI-X route");
			return ret;
		}
		entry->gsi = ret;
	} else {
		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
	}

	/*
	 * MSI masking is unimplemented in VFIO, so we have to handle it by
	 * disabling/enabling the IRQ route instead. We do it on the KVM side
	 * rather than the VFIO side, because:
	 * - it is 8x faster
	 * - it lets us decouple masking logic from capability state.
	 * - in masked state, after removing the irqfd route, we could easily
	 *   plug the eventfd into a local handler, in order to serve Pending
	 *   Bit reads to the guest.
	 *
	 * So entry->phys_state is masked when there is no active irqfd route.
	 */
	if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
		return 0;

	if (msi_is_masked(entry->phys_state)) {
		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot setup irqfd");
			return ret;
		}
	} else {
		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
	}

	msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));

	return 0;
}

static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				     u32 len, u8 is_write, void *ptr)
{
	struct vfio_pci_device *pdev = ptr;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	u64 offset = addr - pba->guest_phys_addr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	if (is_write)
		return;

	/*
	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the
	 * PBA is completely useless here. Note that Linux doesn't use the PBA.
	 */
	if (pread(vdev->fd, data, len, pba->offset + offset) != (ssize_t)len)
		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
}

static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				       u32 len, u8 is_write, void *ptr)
{
	struct kvm *kvm = vcpu->kvm;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = ptr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	u64 offset = addr - pdev->msix_table.guest_phys_addr;

	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
	off_t field = offset % PCI_MSIX_ENTRY_SIZE;

	/*
	 * The PCI spec says that software must use aligned 4 or 8 byte
	 * accesses for the MSI-X Table.
	 */
	if ((len != 4 && len != 8) || addr & (len - 1)) {
		vfio_dev_warn(vdev, "invalid MSI-X table access");
		return;
	}

	entry = &pdev->msix.entries[vector];

	mutex_lock(&pdev->msix.mutex);

	if (!is_write) {
		memcpy(data, (void *)&entry->config + field, len);
		goto out_unlock;
	}

	memcpy((void *)&entry->config + field, data, len);

	/*
	 * Check if the access touched the Vector Control register, which is at
	 * the end of the MSI-X entry.
	 */
	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
		goto out_unlock;

	msi_set_masked(entry->virt_state, entry->config.ctrl &
		       PCI_MSIX_ENTRY_CTRL_MASKBIT);

	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
		/* Not much we can do here. */
		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");

out_unlock:
	mutex_unlock(&pdev->msix.mutex);
}
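
/*
 * The guest wrote to the MSI-X capability in config space. Only the Message
 * Control word matters here: extract the byte holding the Enable and Function
 * Mask bits and propagate the new state to the device.
 */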
static void vfio_pci_msix_cap_write(struct kvm *kvm,
				    struct vfio_device *vdev, u16 off,
				    void *data, int sz)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	off_t enable_pos = PCI_MSIX_FLAGS + 1;
	bool enable;
	u16 flags;

	off -= pdev->msix.pos;

	/* Check if access intersects with the MSI-X Enable bit */
	if (off > enable_pos || off + sz <= enable_pos)
		return;

	/* Read byte that contains the Enable bit */
	flags = *(u8 *)(data + enable_pos - off) << 8;

	mutex_lock(&pdev->msix.mutex);

	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
	enable = flags & PCI_MSIX_FLAGS_ENABLE;
	msi_set_enabled(pdev->msix.virt_state, enable);

	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");
	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot disable MSIX");

	mutex_unlock(&pdev->msix.mutex);
}
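
/*
 * Handle a guest write that overlaps the MSI Mask Bits register. Returns
 * non-zero when the access did touch the mask register, in which case the
 * caller doesn't need to process the write any further.
 */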
static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
				     u16 off, u8 *data, u32 sz)
{
	size_t i;
	u32 mask = 0;
	size_t mask_pos, start, limit;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
		return 0;

	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
		mask_pos = PCI_MSI_MASK_64;
	else
		mask_pos = PCI_MSI_MASK_32;

	if (off >= mask_pos + 4 || off + sz <= mask_pos)
		return 0;

	/* Set mask to current state */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		entry = &pdev->msi.entries[i];
		mask |= !!msi_is_masked(entry->virt_state) << i;
	}

	/* Update mask following the intersection of access and register */
	start = max_t(size_t, off, mask_pos);
	limit = min_t(size_t, off + sz, mask_pos + 4);

	memcpy((void *)&mask + start - mask_pos, data + start - off,
	       limit - start);

	/* Update states if necessary */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		bool masked = mask & (1 << i);

		entry = &pdev->msi.entries[i];
		if (masked != msi_is_masked(entry->virt_state)) {
			msi_set_masked(entry->virt_state, masked);
			vfio_pci_update_msi_entry(kvm, vdev, entry);
		}
	}

	return 1;
}

static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
				   u16 off, u8 *data, u32 sz)
{
	u8 ctrl;
	struct msi_msg msg;
	size_t i, nr_vectors;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	off -= pdev->msi.pos;

	mutex_lock(&pdev->msi.mutex);

	/* Check if the guest is trying to update mask bits */
	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
		goto out_unlock;

	/* Only modify routes when guest pokes the enable bit */
	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
		goto out_unlock;

	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);

	msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE);

	if (!msi_is_enabled(pdev->msi.virt_state)) {
		vfio_pci_disable_msis(kvm, vdev, false);
		goto out_unlock;
	}

	/* Create routes for the requested vectors */
	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);

	msg.address_lo = msi_cap_64->address_lo;
	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
		msg.address_hi = msi_cap_64->address_hi;
		msg.data = msi_cap_64->data;
	} else {
		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
		msg.address_hi = 0;
		msg.data = msi_cap_32->data;
	}

	for (i = 0; i < nr_vectors; i++) {
		entry = &pdev->msi.entries[i];

		/*
		 * Set the MSI data value as required by the PCI local
		 * bus specifications, MSI capability, "Message Data".
		 */
		msg.data &= ~(nr_vectors - 1);
		msg.data |= i;

		entry->config.msg = msg;
		vfio_pci_update_msi_entry(kvm, vdev, entry);
	}

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, false))
		vfio_dev_err(vdev, "cannot enable MSI");

out_unlock:
	mutex_unlock(&pdev->msi.mutex);
}
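
/*
 * BAR activation callback. The MSI-X Table and PBA are trapped with MMIO
 * emulation handlers; any other region is mapped or registered directly.
 */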
static int vfio_pci_bar_activate(struct kvm *kvm,
				 struct pci_device_header *pci_hdr,
				 int bar_num, void *data)
{
	struct vfio_device *vdev = data;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct vfio_region *region;
	u32 bar_addr;
	bool has_msix;
	int ret;

	assert((u32)bar_num < vdev->info.num_regions);

	region = &vdev->regions[bar_num];
	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;

	bar_addr = pci__bar_address(pci_hdr, bar_num);
	if (pci__bar_is_io(pci_hdr, bar_num))
		region->port_base = bar_addr;
	else
		region->guest_phys_addr = bar_addr;

	if (has_msix && (u32)bar_num == table->bar) {
		table->guest_phys_addr = region->guest_phys_addr;
		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
					 table->size, false,
					 vfio_pci_msix_table_access, pdev);
		/*
		 * The MSIX table and the PBA structure can share the same BAR,
		 * but for convenience we register different regions for mmio
		 * emulation. We want to update both if they share the same
		 * BAR.
		 */
		if (ret < 0 || table->bar != pba->bar)
			goto out;
	}

	if (has_msix && (u32)bar_num == pba->bar) {
		if (pba->bar == table->bar)
			pba->guest_phys_addr = table->guest_phys_addr + table->size;
		else
			pba->guest_phys_addr = region->guest_phys_addr;
		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
					 pba->size, false,
					 vfio_pci_msix_pba_access, pdev);
		goto out;
	}

	ret = vfio_map_region(kvm, vdev, region);
out:
	return ret;
}

static int vfio_pci_bar_deactivate(struct kvm *kvm,
				   struct pci_device_header *pci_hdr,
				   int bar_num, void *data)
{
	struct vfio_device *vdev = data;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct vfio_region *region;
	bool has_msix, success;
	int ret;

	assert((u32)bar_num < vdev->info.num_regions);

	region = &vdev->regions[bar_num];
	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;

	if (has_msix && (u32)bar_num == table->bar) {
		success = kvm__deregister_mmio(kvm, table->guest_phys_addr);
		/* kvm__deregister_mmio fails when the region is not found. */
		ret = (success ? 0 : -ENOENT);
		/* See vfio_pci_bar_activate(). */
		if (ret < 0 || table->bar != pba->bar)
			goto out;
	}

	if (has_msix && (u32)bar_num == pba->bar) {
		success = kvm__deregister_mmio(kvm, pba->guest_phys_addr);
		ret = (success ? 0 : -ENOENT);
		goto out;
	}

	vfio_unmap_region(kvm, region);
	ret = 0;

out:
	return ret;
}

static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
			      u16 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	char base[sz];

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	/* Dummy read in case of side-effects */
	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
			       u16 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	u32 tmp;

	/* Make sure a larger size will not overrun tmp on the stack. */
	assert(sz <= 4);

	if (offset == PCI_ROM_ADDRESS)
		return;

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
			      sz, offset);

	/* Handle MSI write now, since it might update the hardware capability */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);

	if (pread(vdev->fd, &tmp, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
{
	size_t size = 10;

	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
		size += 4;
	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
		size += 10;

	return size;
}

static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
{
	switch (cap_hdr->type) {
	case PCI_CAP_ID_MSIX:
		return PCI_CAP_MSIX_SIZEOF;
	case PCI_CAP_ID_MSI:
		return vfio_pci_msi_cap_size((void *)cap_hdr);
	default:
		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
		return 0;
	}
}
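
/*
 * Copy the capability into the virt_hdr scratch buffer and chain it onto the
 * end of the emulated capability list.
 */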
static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
			    struct pci_cap_hdr *cap, off_t pos)
{
	struct pci_cap_hdr *last;
	struct pci_device_header *hdr = &vdev->pci.hdr;

	cap->next = 0;

	if (!hdr->capabilities) {
		hdr->capabilities = pos;
		hdr->status |= PCI_STATUS_CAP_LIST;
	} else {
		last = PCI_CAP(virt_hdr, hdr->capabilities);

		while (last->next)
			last = PCI_CAP(virt_hdr, last->next);

		last->next = pos;
	}

	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));

	return 0;
}

static int vfio_pci_parse_caps(struct vfio_device *vdev)
{
	int ret;
	size_t size;
	u16 pos, next;
	struct pci_cap_hdr *cap;
	u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY];
	struct vfio_pci_device *pdev = &vdev->pci;

	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
		return 0;

	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY);

	pos = pdev->hdr.capabilities & ~3;

	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
	pdev->hdr.capabilities = 0;

	for (; pos; pos = next) {
		cap = PCI_CAP(&pdev->hdr, pos);
		next = cap->next;

		switch (cap->type) {
		case PCI_CAP_ID_MSIX:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msix.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
			break;
		case PCI_CAP_ID_MSI:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msi.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
			break;
		}
	}

	/* Wipe remaining capabilities */
	pos = PCI_STD_HEADER_SIZEOF;
	size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF;
	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);

	return 0;
}

static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
{
	ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
		vfio_dev_err(vdev, "Config Space not found");
		return -ENODEV;
	}

	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	*info = (struct vfio_region_info) {
		.argsz = sizeof(*info),
		.index = VFIO_PCI_CONFIG_REGION_INDEX,
	};

	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
	if (!info->size) {
		vfio_dev_err(vdev, "Config Space has size zero?!");
		return -EINVAL;
	}

	/* Read standard headers and capabilities */
	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
		return -EIO;
	}

	/* Strip bit 7, which indicates a multifunction device */
	pdev->hdr.header_type &= 0x7f;

	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
		vfio_dev_err(vdev, "unsupported header type %u",
			     pdev->hdr.header_type);
		return -EOPNOTSUPP;
	}

	if (pdev->hdr.irq_pin)
		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;

	vfio_pci_parse_caps(vdev);

	return 0;
}
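
/*
 * Rebuild the guest-visible config space: point the BARs at the addresses we
 * allocated, hide features we don't emulate (CardBus, expansion ROM), and
 * write the result back through the VFIO config region.
 */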
static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
{
	int i;
	u64 base;
	ssize_t hdr_sz;
	struct msix_cap *msix;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region;

	/* Initialise the BARs */
	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		if ((u32)i == vdev->info.num_regions)
			break;

		region = &vdev->regions[i];
		/* Construct a fake reg to match what we've mapped. */
		if (region->is_ioport) {
			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
				PCI_BASE_ADDRESS_SPACE_IO;
		} else {
			base = (region->guest_phys_addr &
				PCI_BASE_ADDRESS_MEM_MASK) |
				PCI_BASE_ADDRESS_SPACE_MEMORY;
		}

		pdev->hdr.bar[i] = base;

		if (!base)
			continue;

		pdev->hdr.bar_size[i] = region->info.size;
	}

	/* I really can't be bothered to support cardbus. */
	pdev->hdr.card_bus = 0;

	/*
	 * Nuke the expansion ROM for now. If we want to do this properly, we
	 * need to save its size somewhere and map it into the guest.
	 */
	pdev->hdr.exp_rom_bar = 0;

	/* Plumb in our fake MSI-X capability, if we have it. */
	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
	if (msix) {
		/* Add a shortcut to the PBA region for the MMIO handler */
		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
		pdev->msix_pba.offset = vdev->regions[pba_index].info.offset +
					(msix->pba_offset & PCI_MSIX_PBA_OFFSET);

		/* Tidy up the capability */
		msix->table_offset &= PCI_MSIX_TABLE_BIR;
		msix->pba_offset &= PCI_MSIX_PBA_BIR;
		if (pdev->msix_table.bar == pdev->msix_pba.bar)
			msix->pba_offset |= pdev->msix_table.size &
					    PCI_MSIX_PBA_OFFSET;
	}

	/* Install our fake Configuration Space */
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	/*
	 * We don't touch the extended configuration space, let's be cautious
	 * and not overwrite it all with zeros, or bad things might happen.
	 */
	hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
			     hdr_sz);
		return -EIO;
	}

	/* Register callbacks for cfg accesses */
	pdev->hdr.cfg_ops = (struct pci_config_operations) {
		.read = vfio_pci_cfg_read,
		.write = vfio_pci_cfg_write,
	};

	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;

	return 0;
}

static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index,
				    struct vfio_region_info *info)
{
	int ret;

	*info = (struct vfio_region_info) {
		.argsz = sizeof(*info),
		.index = index,
	};

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
	if (ret) {
		ret = -errno;
		vfio_dev_err(vdev, "cannot get info for BAR %u", index);
		return ret;
	}

	if (info->size && !is_power_of_two(info->size)) {
		vfio_dev_err(vdev, "region is not a power of two: 0x%llx",
			     info->size);
		return -EINVAL;
	}

	return 0;
}

static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	size_t i;
	size_t map_size;
	size_t nr_entries;
	struct vfio_pci_msi_entry *entries;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
	struct vfio_region_info info;

	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;

	/*
	 * KVM needs memory regions to be a multiple of and aligned on
	 * PAGE_SIZE.
	 */
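	/* The Table Size field encodes the number of vectors minus one. */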
	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), PAGE_SIZE);

	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
	if (!entries)
		return -ENOMEM;

	for (i = 0; i < nr_entries; i++)
		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;

	ret = vfio_pci_get_region_info(vdev, table->bar, &info);
	if (ret)
		goto out_free;
	if (!info.size) {
		ret = -EINVAL;
		goto out_free;
	}
	map_size = info.size;

	if (table->bar != pba->bar) {
		ret = vfio_pci_get_region_info(vdev, pba->bar, &info);
		if (ret)
			goto out_free;
		if (!info.size) {
			ret = -EINVAL;
			goto out_free;
		}
		map_size += info.size;
	}

	/*
	 * To ease MSI-X cap configuration in case they share the same BAR,
	 * collapse table and pending array. The size of the BAR regions must
	 * be powers of two.
	 */
	map_size = ALIGN(map_size, PAGE_SIZE);
	table->guest_phys_addr = pci_get_mmio_block(map_size);
	if (!table->guest_phys_addr) {
		pr_err("cannot allocate MMIO space");
		ret = -ENOMEM;
		goto out_free;
	}

	/*
	 * We could map the physical PBA directly into the guest, but it's
	 * likely smaller than a page, and we can only hand full pages to the
	 * guest. Even though the PCI spec disallows sharing a page used for
	 * MSI-X with any other resource, it allows the MSI-X table and PBA to
	 * share the same page. For the sake of isolation, create a virtual
	 * PBA.
	 */
	pba->guest_phys_addr = table->guest_phys_addr + table->size;

	pdev->msix.entries = entries;
	pdev->msix.nr_entries = nr_entries;

	return 0;

out_free:
	free(entries);

	return ret;
}

static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
{
	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
	pdev->msi.entries = calloc(pdev->msi.nr_entries,
				   sizeof(struct vfio_pci_msi_entry));
	if (!pdev->msi.entries)
		return -ENOMEM;

	return 0;
}
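
/*
 * Set up the guest address space for one BAR. BARs containing the MSI-X Table
 * or PBA reuse the addresses reserved in vfio_pci_create_msix_table() and are
 * trapped rather than mapped; other regions get fresh MMIO or I/O port space.
 */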
static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
				  size_t nr)
{
	int ret;
	u32 bar;
	size_t map_size;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region;

	if (nr >= vdev->info.num_regions)
		return 0;

	region = &vdev->regions[nr];
	bar = pdev->hdr.bar[nr];

	region->vdev = vdev;
	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);

	ret = vfio_pci_get_region_info(vdev, nr, &region->info);
	if (ret)
		return ret;

	/* Ignore invalid or unimplemented regions */
	if (!region->info.size)
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		/* Trap and emulate MSI-X table */
		if (nr == pdev->msix_table.bar) {
			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
			return 0;
		} else if (nr == pdev->msix_pba.bar) {
			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
			return 0;
		}
	}

	if (region->is_ioport) {
		region->port_base = pci_get_io_port_block(region->info.size);
	} else {
		/* Grab some MMIO space in the guest */
		map_size = ALIGN(region->info.size, PAGE_SIZE);
		region->guest_phys_addr = pci_get_mmio_block(map_size);
	}

	return 0;
}

static int vfio_pci_configure_dev_regions(struct kvm *kvm,
					  struct vfio_device *vdev)
{
	int ret;
	u32 bar;
	size_t i;
	bool is_64bit = false;
	struct vfio_pci_device *pdev = &vdev->pci;

	ret = vfio_pci_parse_cfg_space(vdev);
	if (ret)
		return ret;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		ret = vfio_pci_create_msix_table(kvm, vdev);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		ret = vfio_pci_create_msi_cap(kvm, pdev);
		if (ret)
			return ret;
	}

	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		/* Ignore top half of 64-bit BAR */
		if (is_64bit) {
			is_64bit = false;
			continue;
		}

		ret = vfio_pci_configure_bar(kvm, vdev, i);
		if (ret)
			return ret;

		bar = pdev->hdr.bar[i];
		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
	}

	/* We've configured the BARs, fake up a Configuration Space */
	ret = vfio_pci_fixup_cfg_space(vdev);
	if (ret)
		return ret;

	return pci__register_bar_regions(kvm, &pdev->hdr, vfio_pci_bar_activate,
					 vfio_pci_bar_deactivate, vdev);
}

/*
 * Attempt to update the FD limit if opening an eventfd for each IRQ vector
 * would hit it, which is likely to happen when a device uses 2048 MSIs.
 */
static int vfio_pci_reserve_irq_fds(size_t num)
{
	/*
	 * I counted around 27 fds under normal load. Let's add 100 for good
	 * measure.
	 */
	static size_t needed = 128;
	struct rlimit fd_limit, new_limit;

	needed += num;

	if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
		perror("getrlimit(RLIMIT_NOFILE)");
		return 0;
	}

	if (fd_limit.rlim_cur >= needed)
		return 0;

	new_limit.rlim_cur = needed;

	if (fd_limit.rlim_max < needed)
		/* Try to bump hard limit (root only) */
		new_limit.rlim_max = needed;
	else
		new_limit.rlim_max = fd_limit.rlim_max;

	if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
		perror("setrlimit(RLIMIT_NOFILE)");
		pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
			   (size_t)(needed - fd_limit.rlim_cur));
	}

	return 0;
}
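
/*
 * Query the MSI/MSI-X IRQ info from VFIO and allocate the vfio_irq_set buffer
 * used by vfio_pci_enable_msis(), with one eventfd slot per vector. All
 * vectors start out masked, with no eventfd and no GSI route.
 */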
static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
			      struct vfio_pci_msi_common *msis)
{
	int ret;
	size_t i;
	int *eventfds;
	size_t irq_set_size;
	struct vfio_pci_msi_entry *entry;
	size_t nr_entries = msis->nr_entries;

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
	if (ret || msis->info.count == 0) {
		vfio_dev_err(vdev, "no MSI reported by VFIO");
		return -ENODEV;
	}

	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
		return -EINVAL;
	}

	if (msis->info.count != nr_entries) {
		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
		return -EINVAL;
	}

	mutex_init(&msis->mutex);

	vfio_pci_reserve_irq_fds(nr_entries);

	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
	msis->irq_set = malloc(irq_set_size);
	if (!msis->irq_set)
		return -ENOMEM;

	*msis->irq_set = (struct vfio_irq_set) {
		.argsz = irq_set_size,
		.flags = VFIO_IRQ_SET_DATA_EVENTFD |
			 VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = msis->info.index,
		.start = 0,
		.count = nr_entries,
	};

	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

	for (i = 0; i < nr_entries; i++) {
		entry = &msis->entries[i];
		entry->gsi = -1;
		entry->eventfd = -1;
		msi_set_masked(entry->virt_state, true);
		msi_set_masked(entry->phys_state, true);
		eventfds[i] = -1;
	}

	return 0;
}
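
/*
 * INTx and MSI/MSI-X cannot be active at the same time: INTx is torn down when
 * the guest enables MSIs and restored by vfio_pci_disable_msis() when the
 * guest turns them off again.
 */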
static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->intx_gsi;
	struct vfio_irq_set irq_set = {
		.argsz = sizeof(irq_set),
		.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = VFIO_PCI_INTX_IRQ_INDEX,
	};

	if (pdev->intx_fd == -1)
		return;

	pr_debug("user requested MSI, disabling INTx %d", gsi);

	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	irq__del_irqfd(kvm, gsi, pdev->intx_fd);

	close(pdev->intx_fd);
	close(pdev->unmask_fd);
	pdev->intx_fd = -1;
}

static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	int trigger_fd, unmask_fd;
	union vfio_irq_eventfd trigger;
	union vfio_irq_eventfd unmask;
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->intx_gsi;

	if (pdev->intx_fd != -1)
		return 0;

	/*
	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
	 * signals an interrupt from host to guest, and unmask_fd signals the
	 * deassertion of the line from guest to host.
	 */
	trigger_fd = eventfd(0, 0);
	if (trigger_fd < 0) {
		vfio_dev_err(vdev, "failed to create trigger eventfd");
		return trigger_fd;
	}

	unmask_fd = eventfd(0, 0);
	if (unmask_fd < 0) {
		vfio_dev_err(vdev, "failed to create unmask eventfd");
		close(trigger_fd);
		return unmask_fd;
	}

	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
	if (ret)
		goto err_close;

	trigger.irq = (struct vfio_irq_set) {
		.argsz = sizeof(trigger),
		.flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = VFIO_PCI_INTX_IRQ_INDEX,
		.start = 0,
		.count = 1,
	};
	set_vfio_irq_eventd_payload(&trigger, trigger_fd);

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
		goto err_delete_line;
	}

	unmask.irq = (struct vfio_irq_set) {
		.argsz = sizeof(unmask),
		.flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
		.index = VFIO_PCI_INTX_IRQ_INDEX,
		.start = 0,
		.count = 1,
	};
	set_vfio_irq_eventd_payload(&unmask, unmask_fd);

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup unmask IRQ");
		goto err_remove_event;
	}

	pdev->intx_fd = trigger_fd;
	pdev->unmask_fd = unmask_fd;

	return 0;

err_remove_event:
	/* Remove trigger event */
	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	trigger.irq.count = 0;
	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);

err_delete_line:
	irq__del_irqfd(kvm, gsi, trigger_fd);

err_close:
	close(trigger_fd);
	close(unmask_fd);
	return ret;
}

static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_irq_info irq_info = {
		.argsz = sizeof(irq_info),
		.index = VFIO_PCI_INTX_IRQ_INDEX,
	};

	vfio_pci_reserve_irq_fds(2);

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
	if (ret || irq_info.count == 0) {
		vfio_dev_err(vdev, "no INTx reported by VFIO");
		return -ENODEV;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not eventfd capable");
		return -EINVAL;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
		return -EINVAL;
	}

	/* The guest is going to overwrite our irq_line... */
	pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;

	pdev->intx_fd = -1;

	return 0;
}

static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret = 0;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		pdev->msix.info = (struct vfio_irq_info) {
			.argsz = sizeof(pdev->msix.info),
			.index = VFIO_PCI_MSIX_IRQ_INDEX,
		};
		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		pdev->msi.info = (struct vfio_irq_info) {
			.argsz = sizeof(pdev->msi.info),
			.index = VFIO_PCI_MSI_IRQ_INDEX,
		};
		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
		pci__assign_irq(&vdev->pci.hdr);

		ret = vfio_pci_init_intx(kvm, vdev);
		if (ret)
			return ret;

		ret = vfio_pci_enable_intx(kvm, vdev);
	}

	return ret;
}

int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;

	ret = vfio_pci_configure_dev_regions(kvm, vdev);
	if (ret) {
		vfio_dev_err(vdev, "failed to configure regions");
		return ret;
	}

	vdev->dev_hdr = (struct device_header) {
		.bus_type = DEVICE_BUS_PCI,
		.data = &vdev->pci.hdr,
	};

	ret = device__register(&vdev->dev_hdr);
	if (ret) {
		vfio_dev_err(vdev, "failed to register VFIO device");
		return ret;
	}

	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
	if (ret) {
		vfio_dev_err(vdev, "failed to configure IRQs");
		return ret;
	}

	return 0;
}

void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
{
	size_t i;
	struct vfio_pci_device *pdev = &vdev->pci;

	for (i = 0; i < vdev->info.num_regions; i++)
		vfio_unmap_region(kvm, &vdev->regions[i]);

	device__unregister(&vdev->dev_hdr);

	free(pdev->msix.irq_set);
	free(pdev->msix.entries);
	free(pdev->msi.irq_set);
	free(pdev->msi.entries);
}