#include "kvm/irq.h"
#include "kvm/kvm.h"
#include "kvm/kvm-cpu.h"
#include "kvm/vfio.h"

#include <assert.h>

#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <sys/resource.h>
#include <sys/time.h>

/* Some distros don't have the define. */
#ifndef PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1
#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1 12
#endif

/* Wrapper around UAPI vfio_irq_set */
union vfio_irq_eventfd {
	struct vfio_irq_set	irq;
	u8			buffer[sizeof(struct vfio_irq_set) + sizeof(int)];
};

static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd)
{
	memcpy(&evfd->irq.data, &fd, sizeof(fd));
}

#define msi_is_enabled(state)		((state) & VFIO_PCI_MSI_STATE_ENABLED)
#define msi_is_masked(state)		((state) & VFIO_PCI_MSI_STATE_MASKED)
#define msi_is_empty(state)		((state) & VFIO_PCI_MSI_STATE_EMPTY)

#define msi_update_state(state, val, bit)				\
	(state) = (val) ? (state) | bit : (state) & ~bit;
#define msi_set_enabled(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
#define msi_set_masked(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
#define msi_set_empty(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)

static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);

static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
				bool msix)
{
	size_t i;
	int ret = 0;
	int *eventfds;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	union vfio_irq_eventfd single = {
		.irq = {
			.argsz	= sizeof(single),
			.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
				  VFIO_IRQ_SET_ACTION_TRIGGER,
			.index	= msis->info.index,
			.count	= 1,
		},
	};

	if (!msi_is_enabled(msis->virt_state))
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		/*
		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the
		 * same time. Since INTx has to be enabled from the start (we
		 * don't have a reliable way to know when the guest starts
		 * using it), disable it now.
		 */
		vfio_pci_disable_intx(kvm, vdev);

	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

	/*
	 * Initial registration of the full range. This enables the physical
	 * MSI/MSI-X capability, which might have desired side effects. For
	 * instance when assigning virtio legacy devices, enabling the MSI
	 * capability modifies the config space layout!
	 *
	 * As an optimization, only update MSIs when the guest unmasks the
	 * capability. This greatly reduces the initialization time for a
	 * Linux guest with 2048+ MSIs. A Linux guest starts by enabling the
	 * MSI-X cap masked, then fills individual vectors, then unmasks the
	 * whole function. So we only do one VFIO ioctl when enabling for the
	 * first time, and then one when unmasking.
	 *
	 * phys_state is empty when it is enabled but no vector has been
	 * registered via SET_IRQS yet.
	 */
	if (!msi_is_enabled(msis->phys_state) ||
	    (!msi_is_masked(msis->virt_state) &&
	     msi_is_empty(msis->phys_state))) {
		bool empty = true;

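		/*
		 * Pass one eventfd per vector. A value of -1 tells VFIO to
		 * leave that vector untriggered until the guest sets it up.
		 */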
		for (i = 0; i < msis->nr_entries; i++) {
			eventfds[i] = msis->entries[i].gsi >= 0 ?
				      msis->entries[i].eventfd : -1;

			if (eventfds[i] >= 0)
				empty = false;
		}

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(multi)");
			return ret;
		}

		msi_set_enabled(msis->phys_state, true);
		msi_set_empty(msis->phys_state, empty);

		return 0;
	}

	if (msi_is_masked(msis->virt_state)) {
		/* TODO: if phys_state is not empty nor masked, mask all vectors */
		return 0;
	}

	/* Update individual vectors to avoid breaking those in use */
	for (i = 0; i < msis->nr_entries; i++) {
		struct vfio_pci_msi_entry *entry = &msis->entries[i];
		int fd = entry->gsi >= 0 ? entry->eventfd : -1;

		if (fd == eventfds[i])
			continue;

		single.irq.start = i;
		set_vfio_irq_eventd_payload(&single, fd);

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(single)");
			break;
		}

		eventfds[i] = fd;

		if (msi_is_empty(msis->phys_state) && fd >= 0)
			msi_set_empty(msis->phys_state, false);
	}

	return ret;
}

static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
				 bool msix)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	struct vfio_irq_set irq_set = {
		.argsz	= sizeof(irq_set),
		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= msis->info.index,
		.start	= 0,
		.count	= 0,
	};

	if (!msi_is_enabled(msis->phys_state))
		return 0;

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	if (ret < 0) {
		perror("VFIO_DEVICE_SET_IRQS(NONE)");
		return ret;
	}

	msi_set_enabled(msis->phys_state, false);
	msi_set_empty(msis->phys_state, true);

	/*
	 * This function is also called when the guest's PCI driver detects an
	 * MSI failure and wants to roll back to INTx mode. Re-enable INTx
	 * here if the device supports it.
	 */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		ret = vfio_pci_enable_intx(kvm, vdev);

	return ret >= 0 ? 0 : ret;
}

static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
				     struct vfio_pci_msi_entry *entry)
{
	int ret;

	if (entry->eventfd < 0) {
		entry->eventfd = eventfd(0, 0);
		if (entry->eventfd < 0) {
			ret = -errno;
			vfio_dev_err(vdev, "cannot create eventfd");
			return ret;
		}
	}

	/* Allocate IRQ if necessary */
	if (entry->gsi < 0) {
		int ret = irq__add_msix_route(kvm, &entry->config.msg,
					      vdev->dev_hdr.dev_num << 3);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot create MSI-X route");
			return ret;
		}
		entry->gsi = ret;
	} else {
		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
	}

	/*
	 * MSI masking is unimplemented in VFIO, so we have to handle it by
	 * disabling/enabling the IRQ route instead. We do it on the KVM side
	 * rather than the VFIO side, because:
	 * - it is 8x faster
	 * - it allows us to decouple the masking logic from the capability
	 *   state.
	 * - in the masked state, after removing the irqfd route, we could
	 *   easily plug the eventfd into a local handler, in order to serve
	 *   Pending Bit reads to the guest.
	 *
	 * So entry->phys_state is masked when there is no active irqfd route.
	 */
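	/* Nothing to do if the physical mask state already matches the guest's view. */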
	if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
		return 0;

	if (msi_is_masked(entry->phys_state)) {
		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot setup irqfd");
			return ret;
		}
	} else {
		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
	}

	msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));

	return 0;
}

static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				     u32 len, u8 is_write, void *ptr)
{
	struct vfio_pci_device *pdev = ptr;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	u64 offset = addr - pba->guest_phys_addr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	if (offset >= pba->size) {
		vfio_dev_err(vdev, "access outside of the MSIX PBA");
		return;
	}

	if (is_write)
		return;

	/*
	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the
	 * PBA is completely useless here. Note that Linux doesn't use PBA.
	 */
	if (pread(vdev->fd, data, len, pba->fd_offset + offset) != (ssize_t)len)
		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
}

static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				       u32 len, u8 is_write, void *ptr)
{
	struct kvm *kvm = vcpu->kvm;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = ptr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	u64 offset = addr - pdev->msix_table.guest_phys_addr;
	if (offset >= pdev->msix_table.size) {
		vfio_dev_err(vdev, "access outside of the MSI-X table");
		return;
	}

	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
	off_t field = offset % PCI_MSIX_ENTRY_SIZE;

	/*
	 * The PCI spec says that software must use aligned 4 or 8 byte
	 * accesses for the MSI-X table.
	 */
	if ((len != 4 && len != 8) || addr & (len - 1)) {
		vfio_dev_warn(vdev, "invalid MSI-X table access");
		return;
	}

	entry = &pdev->msix.entries[vector];

	mutex_lock(&pdev->msix.mutex);

	if (!is_write) {
		memcpy(data, (void *)&entry->config + field, len);
		goto out_unlock;
	}

	memcpy((void *)&entry->config + field, data, len);

	/*
	 * Check if access touched the vector control register, which is at
	 * the end of the MSI-X entry.
	 */
	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
		goto out_unlock;

	msi_set_masked(entry->virt_state, entry->config.ctrl &
		       PCI_MSIX_ENTRY_CTRL_MASKBIT);

	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
		/* Not much we can do here. */
		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");

out_unlock:
	mutex_unlock(&pdev->msix.mutex);
}

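/*
 * Guest write to the MSI-X capability in config space. The raw bytes have
 * already been forwarded to the device by vfio_pci_cfg_write(); here we only
 * react to changes of the Enable and Function Mask bits in the Message
 * Control word.
 */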
static void vfio_pci_msix_cap_write(struct kvm *kvm,
				    struct vfio_device *vdev, u16 off,
				    void *data, int sz)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	off_t enable_pos = PCI_MSIX_FLAGS + 1;
	bool enable;
	u16 flags;

	off -= pdev->msix.pos;

	/* Check if access intersects with the MSI-X Enable bit */
	if (off > enable_pos || off + sz <= enable_pos)
		return;

	/* Read byte that contains the Enable bit */
	flags = *(u8 *)(data + enable_pos - off) << 8;

	mutex_lock(&pdev->msix.mutex);

	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
	enable = flags & PCI_MSIX_FLAGS_ENABLE;
	msi_set_enabled(pdev->msix.virt_state, enable);

	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");
	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot disable MSIX");

	mutex_unlock(&pdev->msix.mutex);
}

static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
				     u16 off, u8 *data, u32 sz)
{
	size_t i;
	u32 mask = 0;
	size_t mask_pos, start, limit;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
		return 0;

	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
		mask_pos = PCI_MSI_MASK_64;
	else
		mask_pos = PCI_MSI_MASK_32;

	if (off >= mask_pos + 4 || off + sz <= mask_pos)
		return 0;

	/* Set mask to current state */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		entry = &pdev->msi.entries[i];
		mask |= !!msi_is_masked(entry->virt_state) << i;
	}

	/* Update mask following the intersection of access and register */
	start = max_t(size_t, off, mask_pos);
	limit = min_t(size_t, off + sz, mask_pos + 4);

	memcpy((void *)&mask + start - mask_pos, data + start - off,
	       limit - start);

	/* Update states if necessary */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		bool masked = mask & (1 << i);

		entry = &pdev->msi.entries[i];
		if (masked != msi_is_masked(entry->virt_state)) {
			msi_set_masked(entry->virt_state, masked);
			vfio_pci_update_msi_entry(kvm, vdev, entry);
		}
	}

	return 1;
}

static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
				   u16 off, u8 *data, u32 sz)
{
	u8 ctrl;
	struct msi_msg msg;
	size_t i, nr_vectors;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	off -= pdev->msi.pos;

	mutex_lock(&pdev->msi.mutex);

	/* Check if the guest is trying to update mask bits */
	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
		goto out_unlock;

	/* Only modify routes when guest pokes the enable bit */
	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
		goto out_unlock;

	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);

	msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE);

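	/* If the guest cleared the Enable bit, tear down the physical MSI configuration. */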
	if (!msi_is_enabled(pdev->msi.virt_state)) {
		vfio_pci_disable_msis(kvm, vdev, false);
		goto out_unlock;
	}

	/* Create routes for the requested vectors */
	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);

	msg.address_lo = msi_cap_64->address_lo;
	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
		msg.address_hi = msi_cap_64->address_hi;
		msg.data = msi_cap_64->data;
	} else {
		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
		msg.address_hi = 0;
		msg.data = msi_cap_32->data;
	}

	for (i = 0; i < nr_vectors; i++) {
		entry = &pdev->msi.entries[i];

		/*
		 * Set the MSI data value as required by the PCI local
		 * bus specifications, MSI capability, "Message Data".
		 */
		msg.data &= ~(nr_vectors - 1);
		msg.data |= i;

		entry->config.msg = msg;
		vfio_pci_update_msi_entry(kvm, vdev, entry);
	}

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, false))
		vfio_dev_err(vdev, "cannot enable MSI");

out_unlock:
	mutex_unlock(&pdev->msi.mutex);
}

static int vfio_pci_bar_activate(struct kvm *kvm,
				 struct pci_device_header *pci_hdr,
				 int bar_num, void *data)
{
	struct vfio_device *vdev = data;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct vfio_region *region;
	u32 bar_addr;
	bool has_msix;
	int ret;

	assert((u32)bar_num < vdev->info.num_regions);

	region = &vdev->regions[bar_num];
	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;

	bar_addr = pci__bar_address(pci_hdr, bar_num);
	if (pci__bar_is_io(pci_hdr, bar_num))
		region->port_base = bar_addr;
	else
		region->guest_phys_addr = bar_addr;

	if (has_msix && (u32)bar_num == table->bar) {
		table->guest_phys_addr = region->guest_phys_addr;
		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
					 table->size, false,
					 vfio_pci_msix_table_access, pdev);
		/*
		 * The MSIX table and the PBA structure can share the same
		 * BAR, but for convenience we register different regions for
		 * mmio emulation. We want to update both if they share the
		 * same BAR.
		 */
		if (ret < 0 || table->bar != pba->bar)
			goto out;
	}

	if (has_msix && (u32)bar_num == pba->bar) {
		if (pba->bar == table->bar)
			pba->guest_phys_addr = table->guest_phys_addr + pba->bar_offset;
		else
			pba->guest_phys_addr = region->guest_phys_addr;

		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
					 pba->size, false,
					 vfio_pci_msix_pba_access, pdev);
		goto out;
	}

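	/* Nothing to emulate in this BAR: hand it to vfio_map_region() for direct mapping. */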
	ret = vfio_map_region(kvm, vdev, region);
out:
	return ret;
}

static int vfio_pci_bar_deactivate(struct kvm *kvm,
				   struct pci_device_header *pci_hdr,
				   int bar_num, void *data)
{
	struct vfio_device *vdev = data;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct vfio_region *region;
	bool has_msix, success;
	int ret;

	assert((u32)bar_num < vdev->info.num_regions);

	region = &vdev->regions[bar_num];
	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;

	if (has_msix && (u32)bar_num == table->bar) {
		success = kvm__deregister_mmio(kvm, table->guest_phys_addr);
		/* kvm__deregister_mmio fails when the region is not found. */
		ret = (success ? 0 : -ENOENT);
		/* See vfio_pci_bar_activate(). */
		if (ret < 0 || table->bar != pba->bar)
			goto out;
	}

	if (has_msix && (u32)bar_num == pba->bar) {
		success = kvm__deregister_mmio(kvm, pba->guest_phys_addr);
		ret = (success ? 0 : -ENOENT);
		goto out;
	}

	vfio_unmap_region(kvm, region);
	ret = 0;

out:
	return ret;
}

static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
			      u16 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	char base[sz];

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	/* Dummy read in case of side-effects */
	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
			       u16 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	u32 tmp;

	/* Make sure a larger size will not overrun tmp on the stack. */
	assert(sz <= 4);

	if (offset == PCI_ROM_ADDRESS)
		return;

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
			      sz, offset);

	/* Handle MSI write now, since it might update the hardware capability */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);

	if (pread(vdev->fd, &tmp, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

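/*
 * Size of the MSI capability as laid out in config space: 10 bytes for the
 * basic 32-bit variant, 4 more for the upper address dword, and 10 more when
 * per-vector masking is implemented.
 */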
static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
{
	size_t size = 10;

	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
		size += 4;
	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
		size += 10;

	return size;
}

static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
{
	switch (cap_hdr->type) {
	case PCI_CAP_ID_MSIX:
		return PCI_CAP_MSIX_SIZEOF;
	case PCI_CAP_ID_MSI:
		return vfio_pci_msi_cap_size((void *)cap_hdr);
	case PCI_CAP_ID_EXP:
		/*
		 * We don't emulate any of the link, slot and root complex
		 * properties, so ignore them.
		 */
		return PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1;
	default:
		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
		return 0;
	}
}

static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
			    struct pci_cap_hdr *cap, off_t pos)
{
	struct pci_cap_hdr *last;
	struct pci_device_header *hdr = &vdev->pci.hdr;

	cap->next = 0;

	if (!hdr->capabilities) {
		hdr->capabilities = pos;
		hdr->status |= PCI_STATUS_CAP_LIST;
	} else {
		last = PCI_CAP(virt_hdr, hdr->capabilities);

		while (last->next)
			last = PCI_CAP(virt_hdr, last->next);

		last->next = pos;
	}

	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));

	return 0;
}

static int vfio_pci_parse_caps(struct vfio_device *vdev)
{
	int ret;
	size_t size;
	u16 pos, next;
	struct pci_cap_hdr *cap;
	u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY];
	struct vfio_pci_device *pdev = &vdev->pci;

	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
		return 0;

	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY);

	pos = pdev->hdr.capabilities & ~3;

	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
	pdev->hdr.capabilities = 0;

	for (; pos; pos = next) {
		cap = PCI_CAP(&pdev->hdr, pos);
		next = cap->next;

		switch (cap->type) {
		case PCI_CAP_ID_MSIX:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msix.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
			break;
		case PCI_CAP_ID_MSI:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msi.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
			break;
		case PCI_CAP_ID_EXP:
			if (!arch_has_pci_exp())
				continue;
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;
			break;
		}
	}

	/* Wipe remaining capabilities */
	pos = PCI_STD_HEADER_SIZEOF;
	size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF;
	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);

	return 0;
}

static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
{
	ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
		vfio_dev_err(vdev, "Config Space not found");
		return -ENODEV;
	}

	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	*info = (struct vfio_region_info) {
		.argsz = sizeof(*info),
		.index = VFIO_PCI_CONFIG_REGION_INDEX,
	};

	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
	if (!info->size) {
		vfio_dev_err(vdev, "Config Space has size zero?!");
		return -EINVAL;
	}

	/* Read standard headers and capabilities */
	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
		return -EIO;
	}

	/* Strip bit 7, that indicates multifunction */
	pdev->hdr.header_type &= 0x7f;

	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
		vfio_dev_err(vdev, "unsupported header type %u",
			     pdev->hdr.header_type);
		return -EOPNOTSUPP;
	}

	if (pdev->hdr.irq_pin)
		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;

	vfio_pci_parse_caps(vdev);

	return 0;
}

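/*
 * Build the config space image presented to the guest: synthesized BARs, the
 * trimmed capability list, no expansion ROM, and our own config accessors.
 */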
static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
{
	int i;
	u64 base;
	ssize_t hdr_sz;
	struct msix_cap *msix;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region;

	/* Initialise the BARs */
	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		if ((u32)i == vdev->info.num_regions)
			break;

		region = &vdev->regions[i];
		/* Construct a fake reg to match what we've mapped. */
		if (region->is_ioport) {
			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
				PCI_BASE_ADDRESS_SPACE_IO;
		} else {
			base = (region->guest_phys_addr &
				PCI_BASE_ADDRESS_MEM_MASK) |
				PCI_BASE_ADDRESS_SPACE_MEMORY;
		}

		pdev->hdr.bar[i] = base;

		if (!base)
			continue;

		pdev->hdr.bar_size[i] = region->info.size;
	}

	/* I really can't be bothered to support cardbus. */
	pdev->hdr.card_bus = 0;

	/*
	 * Nuke the expansion ROM for now. If we want to do this properly,
	 * we need to save its size somewhere and map it into the guest.
	 */
	pdev->hdr.exp_rom_bar = 0;

	/* Plumb in our fake MSI-X capability, if we have it. */
	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
	if (msix) {
		/* Add a shortcut to the PBA region for the MMIO handler */
		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
		u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET;

		pdev->msix_pba.fd_offset = vdev->regions[pba_index].info.offset +
					   pba_bar_offset;

		/* Tidy up the capability */
		msix->table_offset &= PCI_MSIX_TABLE_BIR;
		if (pdev->msix_table.bar == pdev->msix_pba.bar) {
			/* Keep the same offset as the MSIX cap. */
			pdev->msix_pba.bar_offset = pba_bar_offset;
		} else {
			/* PBA is at the start of the BAR. */
			msix->pba_offset &= PCI_MSIX_PBA_BIR;
			pdev->msix_pba.bar_offset = 0;
		}
	}

	/* Install our fake Configuration Space */
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	/*
	 * We don't touch the extended configuration space, let's be cautious
	 * and not overwrite it all with zeros, or bad things might happen.
	 */
	hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
			     hdr_sz);
		return -EIO;
	}

	/* Register callbacks for cfg accesses */
	pdev->hdr.cfg_ops = (struct pci_config_operations) {
		.read	= vfio_pci_cfg_read,
		.write	= vfio_pci_cfg_write,
	};

	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;

	return 0;
}

static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index,
				    struct vfio_region_info *info)
{
	int ret;

	*info = (struct vfio_region_info) {
		.argsz = sizeof(*info),
		.index = index,
	};

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
	if (ret) {
		ret = -errno;
		vfio_dev_err(vdev, "cannot get info for BAR %u", index);
		return ret;
	}

	if (info->size && !is_power_of_two(info->size)) {
		vfio_dev_err(vdev, "region is not power of two: 0x%llx",
			     info->size);
		return -EINVAL;
	}

	return 0;
}

static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	size_t i;
	size_t map_size;
	size_t nr_entries;
	struct vfio_pci_msi_entry *entries;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
	struct vfio_region_info info;

	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;

	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;

	/* MSIX table and PBA must support QWORD accesses. */
	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, 8);
	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), 8);

	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
	if (!entries)
		return -ENOMEM;

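	/*
	 * Start with every vector masked, matching the reset state of the
	 * Vector Control Mask bit defined by the PCI spec.
	 */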
	for (i = 0; i < nr_entries; i++)
		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;

	ret = vfio_pci_get_region_info(vdev, table->bar, &info);
	if (ret)
		return ret;
	if (!info.size)
		return -EINVAL;

	map_size = ALIGN(info.size, PAGE_SIZE);
	table->guest_phys_addr = pci_get_mmio_block(map_size);
	if (!table->guest_phys_addr) {
		pr_err("cannot allocate MMIO space");
		ret = -ENOMEM;
		goto out_free;
	}

	/*
	 * We could map the physical PBA directly into the guest, but it's
	 * likely smaller than a page, and we can only hand full pages to the
	 * guest. Even though the PCI spec disallows sharing a page used for
	 * MSI-X with any other resource, it does allow the MSI-X table and
	 * the PBA to share the same page. For the sake of isolation, create
	 * a virtual PBA.
	 */
	if (table->bar == pba->bar) {
		u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET;

		/* Sanity checks. */
		if (table->size > pba_bar_offset)
			die("MSIX table overlaps with PBA");
		if (pba_bar_offset + pba->size > info.size)
			die("PBA exceeds the size of the region");
		pba->guest_phys_addr = table->guest_phys_addr + pba_bar_offset;
	} else {
		ret = vfio_pci_get_region_info(vdev, pba->bar, &info);
		if (ret)
			return ret;
		if (!info.size)
			return -EINVAL;

		map_size = ALIGN(info.size, PAGE_SIZE);
		pba->guest_phys_addr = pci_get_mmio_block(map_size);
		if (!pba->guest_phys_addr) {
			pr_err("cannot allocate MMIO space");
			ret = -ENOMEM;
			goto out_free;
		}
	}

	pdev->msix.entries = entries;
	pdev->msix.nr_entries = nr_entries;

	return 0;

out_free:
	free(entries);

	return ret;
}

static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
{
	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
	pdev->msi.entries = calloc(pdev->msi.nr_entries,
				   sizeof(struct vfio_pci_msi_entry));
	if (!pdev->msi.entries)
		return -ENOMEM;

	return 0;
}

static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
				  size_t nr)
{
	int ret;
	u32 bar;
	size_t map_size;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region;

	if (nr >= vdev->info.num_regions)
		return 0;

	region = &vdev->regions[nr];
	bar = pdev->hdr.bar[nr];

	region->vdev = vdev;
	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);

	ret = vfio_pci_get_region_info(vdev, nr, &region->info);
	if (ret)
		return ret;

	/* Ignore invalid or unimplemented regions */
	if (!region->info.size)
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		/* Trap and emulate MSI-X table */
		if (nr == pdev->msix_table.bar) {
			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
			return 0;
		} else if (nr == pdev->msix_pba.bar) {
			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
			return 0;
		}
	}

	if (region->is_ioport) {
		region->port_base = pci_get_io_port_block(region->info.size);
	} else {
		/* Grab some MMIO space in the guest */
		map_size = ALIGN(region->info.size, PAGE_SIZE);
		region->guest_phys_addr = pci_get_mmio_block(map_size);
	}

	return 0;
}

static int vfio_pci_configure_dev_regions(struct kvm *kvm,
					  struct vfio_device *vdev)
{
	int ret;
	u32 bar;
	size_t i;
	bool is_64bit = false;
	struct vfio_pci_device *pdev = &vdev->pci;

	ret = vfio_pci_parse_cfg_space(vdev);
	if (ret)
		return ret;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		ret = vfio_pci_create_msix_table(kvm, vdev);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		ret = vfio_pci_create_msi_cap(kvm, pdev);
		if (ret)
			return ret;
	}

	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		/* Ignore top half of 64-bit BAR */
		if (is_64bit) {
			is_64bit = false;
			continue;
		}

		ret = vfio_pci_configure_bar(kvm, vdev, i);
		if (ret)
			return ret;

		bar = pdev->hdr.bar[i];
		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
	}

	/* We've configured the BARs, fake up a Configuration Space */
	ret = vfio_pci_fixup_cfg_space(vdev);
	if (ret)
		return ret;

	return pci__register_bar_regions(kvm, &pdev->hdr, vfio_pci_bar_activate,
					 vfio_pci_bar_deactivate, vdev);
}

/*
 * Attempt to raise the FD limit if opening an eventfd for each IRQ vector
 * would hit it, which is likely to happen when a device uses 2048 MSIs.
 */
static int vfio_pci_reserve_irq_fds(size_t num)
{
	/*
	 * I counted around 27 fds under normal load. Let's add 100 for good
	 * measure.
	 */
	static size_t needed = 128;
	struct rlimit fd_limit, new_limit;

	needed += num;

	if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
		perror("getrlimit(RLIMIT_NOFILE)");
		return 0;
	}

	if (fd_limit.rlim_cur >= needed)
		return 0;

	new_limit.rlim_cur = needed;

	if (fd_limit.rlim_max < needed)
		/* Try to bump hard limit (root only) */
		new_limit.rlim_max = needed;
	else
		new_limit.rlim_max = fd_limit.rlim_max;

	if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
		perror("setrlimit(RLIMIT_NOFILE)");
		pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
			   (size_t)(needed - fd_limit.rlim_cur));
	}

	return 0;
}

static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
			      struct vfio_pci_msi_common *msis)
{
	int ret;
	size_t i;
	int *eventfds;
	size_t irq_set_size;
	struct vfio_pci_msi_entry *entry;
	size_t nr_entries = msis->nr_entries;

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
	if (ret || msis->info.count == 0) {
		vfio_dev_err(vdev, "no MSI reported by VFIO");
		return -ENODEV;
	}

	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
		return -EINVAL;
	}

	if (msis->info.count != nr_entries) {
		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
		return -EINVAL;
	}

	mutex_init(&msis->mutex);

	vfio_pci_reserve_irq_fds(nr_entries);

	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
	msis->irq_set = malloc(irq_set_size);
	if (!msis->irq_set)
		return -ENOMEM;

	*msis->irq_set = (struct vfio_irq_set) {
		.argsz	= irq_set_size,
		.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
			  VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= msis->info.index,
		.start	= 0,
		.count	= nr_entries,
	};

	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

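	/*
	 * All vectors start out masked with no GSI route or eventfd attached;
	 * routes and eventfds are created lazily when the guest configures
	 * and unmasks individual vectors.
	 */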
	for (i = 0; i < nr_entries; i++) {
		entry = &msis->entries[i];
		entry->gsi = -1;
		entry->eventfd = -1;
		msi_set_masked(entry->virt_state, true);
		msi_set_masked(entry->phys_state, true);
		eventfds[i] = -1;
	}

	return 0;
}

static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->intx_gsi;
	struct vfio_irq_set irq_set = {
		.argsz	= sizeof(irq_set),
		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= VFIO_PCI_INTX_IRQ_INDEX,
	};

	if (pdev->intx_fd == -1)
		return;

	pr_debug("user requested MSI, disabling INTx %d", gsi);

	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	irq__del_irqfd(kvm, gsi, pdev->intx_fd);

	close(pdev->intx_fd);
	close(pdev->unmask_fd);
	pdev->intx_fd = -1;
}

static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	int trigger_fd, unmask_fd;
	union vfio_irq_eventfd trigger;
	union vfio_irq_eventfd unmask;
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->intx_gsi;

	if (pdev->intx_fd != -1)
		return 0;

	/*
	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
	 * signals an interrupt from host to guest, and unmask_fd signals the
	 * deassertion of the line from guest to host.
	 */
	trigger_fd = eventfd(0, 0);
	if (trigger_fd < 0) {
		vfio_dev_err(vdev, "failed to create trigger eventfd");
		return trigger_fd;
	}

	unmask_fd = eventfd(0, 0);
	if (unmask_fd < 0) {
		vfio_dev_err(vdev, "failed to create unmask eventfd");
		close(trigger_fd);
		return unmask_fd;
	}

	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
	if (ret)
		goto err_close;

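	/* Ask VFIO to signal trigger_fd whenever the device asserts INTx. */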
	trigger.irq = (struct vfio_irq_set) {
		.argsz	= sizeof(trigger),
		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= VFIO_PCI_INTX_IRQ_INDEX,
		.start	= 0,
		.count	= 1,
	};
	set_vfio_irq_eventd_payload(&trigger, trigger_fd);

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
		goto err_delete_line;
	}

	unmask.irq = (struct vfio_irq_set) {
		.argsz	= sizeof(unmask),
		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
		.index	= VFIO_PCI_INTX_IRQ_INDEX,
		.start	= 0,
		.count	= 1,
	};
	set_vfio_irq_eventd_payload(&unmask, unmask_fd);

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup unmask IRQ");
		goto err_remove_event;
	}

	pdev->intx_fd = trigger_fd;
	pdev->unmask_fd = unmask_fd;

	return 0;

err_remove_event:
	/* Remove trigger event */
	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	trigger.irq.count = 0;
	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);

err_delete_line:
	irq__del_irqfd(kvm, gsi, trigger_fd);

err_close:
	close(trigger_fd);
	close(unmask_fd);
	return ret;
}

static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_irq_info irq_info = {
		.argsz = sizeof(irq_info),
		.index = VFIO_PCI_INTX_IRQ_INDEX,
	};

	vfio_pci_reserve_irq_fds(2);

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
	if (ret || irq_info.count == 0) {
		vfio_dev_err(vdev, "no INTx reported by VFIO");
		return -ENODEV;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not eventfd capable");
		return -EINVAL;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
		return -EINVAL;
	}

	/* Guest is going to overwrite our irq_line... */
	pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;

	pdev->intx_fd = -1;

	return 0;
}

static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret = 0;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		pdev->msix.info = (struct vfio_irq_info) {
			.argsz = sizeof(pdev->msix.info),
			.index = VFIO_PCI_MSIX_IRQ_INDEX,
		};
		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		pdev->msi.info = (struct vfio_irq_info) {
			.argsz = sizeof(pdev->msi.info),
			.index = VFIO_PCI_MSI_IRQ_INDEX,
		};
		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
		pci__assign_irq(&vdev->pci.hdr);

		ret = vfio_pci_init_intx(kvm, vdev);
		if (ret)
			return ret;

		ret = vfio_pci_enable_intx(kvm, vdev);
	}

	return ret;
}

int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;

	ret = vfio_pci_configure_dev_regions(kvm, vdev);
	if (ret) {
		vfio_dev_err(vdev, "failed to configure regions");
		return ret;
	}

	vdev->dev_hdr = (struct device_header) {
		.bus_type	= DEVICE_BUS_PCI,
		.data		= &vdev->pci.hdr,
	};

	ret = device__register(&vdev->dev_hdr);
	if (ret) {
		vfio_dev_err(vdev, "failed to register VFIO device");
		return ret;
	}

	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
	if (ret) {
		vfio_dev_err(vdev, "failed to configure IRQs");
		return ret;
	}

	return 0;
}

void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
{
	size_t i;
	struct vfio_pci_device *pdev = &vdev->pci;

	for (i = 0; i < vdev->info.num_regions; i++)
		vfio_unmap_region(kvm, &vdev->regions[i]);

	device__unregister(&vdev->dev_hdr);

	free(pdev->msix.irq_set);
	free(pdev->msix.entries);
	free(pdev->msi.irq_set);
	free(pdev->msi.entries);
}