#include "kvm/irq.h"
#include "kvm/kvm.h"
#include "kvm/kvm-cpu.h"
#include "kvm/vfio.h"

#include <assert.h>

#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <sys/resource.h>
#include <sys/time.h>

/* Some distros don't have the define. */
#ifndef PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1
#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1	12
#endif

/* Wrapper around UAPI vfio_irq_set */
union vfio_irq_eventfd {
	struct vfio_irq_set	irq;
	u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)];
};

static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd)
{
	memcpy(&evfd->irq.data, &fd, sizeof(fd));
}

#define msi_is_enabled(state)		((state) & VFIO_PCI_MSI_STATE_ENABLED)
#define msi_is_masked(state)		((state) & VFIO_PCI_MSI_STATE_MASKED)
#define msi_is_empty(state)		((state) & VFIO_PCI_MSI_STATE_EMPTY)

#define msi_update_state(state, val, bit)				\
	(state) = (val) ? (state) | bit : (state) & ~bit;
#define msi_set_enabled(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
#define msi_set_masked(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
#define msi_set_empty(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)

static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);

static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
				bool msix)
{
	size_t i;
	int ret = 0;
	int *eventfds;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	union vfio_irq_eventfd single = {
		.irq = {
			.argsz = sizeof(single),
			.flags = VFIO_IRQ_SET_DATA_EVENTFD |
				 VFIO_IRQ_SET_ACTION_TRIGGER,
			.index = msis->info.index,
			.count = 1,
		},
	};

	if (!msi_is_enabled(msis->virt_state))
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		/*
		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
		 * time. Since INTx has to be enabled from the start (we don't
		 * have a reliable way to know when the guest starts using it),
		 * disable it now.
		 */
		vfio_pci_disable_intx(kvm, vdev);

	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

	/*
	 * Initial registration of the full range. This enables the physical
	 * MSI/MSI-X capability, which might have desired side effects. For
	 * instance when assigning virtio legacy devices, enabling the MSI
	 * capability modifies the config space layout!
	 *
	 * As an optimization, only update MSIs when guest unmasks the
	 * capability. This greatly reduces the initialization time for Linux
	 * guest with 2048+ MSIs. Linux guest starts by enabling the MSI-X cap
	 * masked, then fills individual vectors, then unmasks the whole
	 * function. So we only do one VFIO ioctl when enabling for the first
	 * time, and then one when unmasking.
	 *
	 * phys_state is empty when it is enabled but no vector has been
	 * registered via SET_IRQS yet.
	 */
	if (!msi_is_enabled(msis->phys_state) ||
	    (!msi_is_masked(msis->virt_state) &&
	     msi_is_empty(msis->phys_state))) {
		bool empty = true;

		for (i = 0; i < msis->nr_entries; i++) {
			eventfds[i] = msis->entries[i].gsi >= 0 ?
				      msis->entries[i].eventfd : -1;

			if (eventfds[i] >= 0)
				empty = false;
		}

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(multi)");
			return ret;
		}

		msi_set_enabled(msis->phys_state, true);
		msi_set_empty(msis->phys_state, empty);

		return 0;
	}

	if (msi_is_masked(msis->virt_state)) {
		/* TODO: if phys_state is not empty nor masked, mask all vectors */
		return 0;
	}

	/* Update individual vectors to avoid breaking those in use */
	for (i = 0; i < msis->nr_entries; i++) {
		struct vfio_pci_msi_entry *entry = &msis->entries[i];
		int fd = entry->gsi >= 0 ? entry->eventfd : -1;

		if (fd == eventfds[i])
			continue;

		single.irq.start = i;
		set_vfio_irq_eventd_payload(&single, fd);

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(single)");
			break;
		}

		eventfds[i] = fd;

		if (msi_is_empty(msis->phys_state) && fd >= 0)
			msi_set_empty(msis->phys_state, false);
	}

	return ret;
}

static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
				 bool msix)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	struct vfio_irq_set irq_set = {
		.argsz = sizeof(irq_set),
		.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = msis->info.index,
		.start = 0,
		.count = 0,
	};

	if (!msi_is_enabled(msis->phys_state))
		return 0;

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	if (ret < 0) {
		perror("VFIO_DEVICE_SET_IRQS(NONE)");
		return ret;
	}

	msi_set_enabled(msis->phys_state, false);
	msi_set_empty(msis->phys_state, true);

	/*
	 * This function might be called when the guest's PCI driver detects
	 * an MSI interrupt failure and wants to roll back to INTx mode.
	 * In that case, re-enable INTx if the device supports it.
	 */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		ret = vfio_pci_enable_intx(kvm, vdev);

	return ret >= 0 ? 0 : ret;
}

static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
				     struct vfio_pci_msi_entry *entry)
{
	int ret;

	if (entry->eventfd < 0) {
		entry->eventfd = eventfd(0, 0);
		if (entry->eventfd < 0) {
			ret = -errno;
			vfio_dev_err(vdev, "cannot create eventfd");
			return ret;
		}
	}

	/* Allocate IRQ if necessary */
	if (entry->gsi < 0) {
		int ret = irq__add_msix_route(kvm, &entry->config.msg,
					      vdev->dev_hdr.dev_num << 3);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot create MSI-X route");
			return ret;
		}
		entry->gsi = ret;
	} else {
		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
	}

	/*
	 * MSI masking is unimplemented in VFIO, so we have to handle it by
	 * disabling/enabling the IRQ route instead. We do it on the KVM side
	 * rather than VFIO, because:
	 * - it is 8x faster
	 * - it lets us decouple masking logic from capability state.
	 * - in masked state, after removing the irqfd route, we could easily
	 *   plug the eventfd into a local handler, in order to serve Pending
	 *   Bit reads to the guest.
	 *
	 * So entry->phys_state is masked when there is no active irqfd route.
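	 * (That is, masking a vector amounts to deleting its irqfd route and
	 * unmasking re-installs it, which is what the code below does.)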
	 */
	if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
		return 0;

	if (msi_is_masked(entry->phys_state)) {
		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot setup irqfd");
			return ret;
		}
	} else {
		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
	}

	msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));

	return 0;
}

static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				     u32 len, u8 is_write, void *ptr)
{
	struct vfio_pci_device *pdev = ptr;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	u64 offset = addr - pba->guest_phys_addr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	if (is_write)
		return;

	/*
	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA
	 * is completely useless here. Note that Linux doesn't use the PBA.
	 */
	if (pread(vdev->fd, data, len, pba->fd_offset + offset) != (ssize_t)len)
		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
}

static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				       u32 len, u8 is_write, void *ptr)
{
	struct kvm *kvm = vcpu->kvm;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = ptr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	u64 offset = addr - pdev->msix_table.guest_phys_addr;

	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
	off_t field = offset % PCI_MSIX_ENTRY_SIZE;

	/*
	 * The PCI spec says that software must use aligned 4 or 8 byte
	 * accesses for the MSI-X table.
	 */
	if ((len != 4 && len != 8) || addr & (len - 1)) {
		vfio_dev_warn(vdev, "invalid MSI-X table access");
		return;
	}

	entry = &pdev->msix.entries[vector];

	mutex_lock(&pdev->msix.mutex);

	if (!is_write) {
		memcpy(data, (void *)&entry->config + field, len);
		goto out_unlock;
	}

	memcpy((void *)&entry->config + field, data, len);

	/*
	 * Check if access touched the vector control register, which is at the
	 * end of the MSI-X entry.
	 */
	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
		goto out_unlock;

	msi_set_masked(entry->virt_state, entry->config.ctrl &
		       PCI_MSIX_ENTRY_CTRL_MASKBIT);

	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
		/*
		 * Not much we can do here.
		 */
		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");

out_unlock:
	mutex_unlock(&pdev->msix.mutex);
}

static void vfio_pci_msix_cap_write(struct kvm *kvm,
				    struct vfio_device *vdev, u16 off,
				    void *data, int sz)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	off_t enable_pos = PCI_MSIX_FLAGS + 1;
	bool enable;
	u16 flags;

	off -= pdev->msix.pos;

	/* Check if access intersects with the MSI-X Enable bit */
	if (off > enable_pos || off + sz <= enable_pos)
		return;

	/* Read byte that contains the Enable bit */
	flags = *(u8 *)(data + enable_pos - off) << 8;

	mutex_lock(&pdev->msix.mutex);

	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
	enable = flags & PCI_MSIX_FLAGS_ENABLE;
	msi_set_enabled(pdev->msix.virt_state, enable);

	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");
	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot disable MSIX");

	mutex_unlock(&pdev->msix.mutex);
}

static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
				     u16 off, u8 *data, u32 sz)
{
	size_t i;
	u32 mask = 0;
	size_t mask_pos, start, limit;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
		return 0;

	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
		mask_pos = PCI_MSI_MASK_64;
	else
		mask_pos = PCI_MSI_MASK_32;

	if (off >= mask_pos + 4 || off + sz <= mask_pos)
		return 0;

	/* Set mask to current state */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		entry = &pdev->msi.entries[i];
		mask |= !!msi_is_masked(entry->virt_state) << i;
	}

	/* Update mask following the intersection of access and register */
	start = max_t(size_t, off, mask_pos);
	limit = min_t(size_t, off + sz, mask_pos + 4);

	memcpy((void *)&mask + start - mask_pos, data + start - off,
	       limit - start);

	/* Update states if necessary */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		bool masked = mask & (1 << i);

		entry = &pdev->msi.entries[i];
		if (masked != msi_is_masked(entry->virt_state)) {
			msi_set_masked(entry->virt_state, masked);
			vfio_pci_update_msi_entry(kvm, vdev, entry);
		}
	}

	return 1;
}

static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
				   u16 off, u8 *data, u32 sz)
{
	u8 ctrl;
	struct msi_msg msg;
	size_t i, nr_vectors;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	off -= pdev->msi.pos;

	mutex_lock(&pdev->msi.mutex);

	/* Check if the guest is trying to update mask bits */
	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
		goto out_unlock;

	/* Only modify routes when guest pokes the enable bit */
	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
		goto out_unlock;

	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);

	msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE);
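
	/*
	 * If the guest cleared the Enable bit, tear the physical MSIs down;
	 * otherwise build a route for each requested vector below.
	 */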
	if (!msi_is_enabled(pdev->msi.virt_state)) {
		vfio_pci_disable_msis(kvm, vdev, false);
		goto out_unlock;
	}

	/* Create routes for the requested vectors */
	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);

	msg.address_lo = msi_cap_64->address_lo;
	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
		msg.address_hi = msi_cap_64->address_hi;
		msg.data = msi_cap_64->data;
	} else {
		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
		msg.address_hi = 0;
		msg.data = msi_cap_32->data;
	}

	for (i = 0; i < nr_vectors; i++) {
		entry = &pdev->msi.entries[i];

		/*
		 * Set the MSI data value as required by the PCI local
		 * bus specifications, MSI capability, "Message Data".
		 */
		msg.data &= ~(nr_vectors - 1);
		msg.data |= i;

		entry->config.msg = msg;
		vfio_pci_update_msi_entry(kvm, vdev, entry);
	}

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, false))
		vfio_dev_err(vdev, "cannot enable MSI");

out_unlock:
	mutex_unlock(&pdev->msi.mutex);
}

static int vfio_pci_bar_activate(struct kvm *kvm,
				 struct pci_device_header *pci_hdr,
				 int bar_num, void *data)
{
	struct vfio_device *vdev = data;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct vfio_region *region;
	u32 bar_addr;
	bool has_msix;
	int ret;

	assert((u32)bar_num < vdev->info.num_regions);

	region = &vdev->regions[bar_num];
	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;

	bar_addr = pci__bar_address(pci_hdr, bar_num);
	if (pci__bar_is_io(pci_hdr, bar_num))
		region->port_base = bar_addr;
	else
		region->guest_phys_addr = bar_addr;

	if (has_msix && (u32)bar_num == table->bar) {
		table->guest_phys_addr = region->guest_phys_addr;
		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
					 table->size, false,
					 vfio_pci_msix_table_access, pdev);
		/*
		 * The MSIX table and the PBA structure can share the same BAR,
		 * but for convenience we register different regions for mmio
		 * emulation. We want to update both if they share the same
		 * BAR.
		 */
		if (ret < 0 || table->bar != pba->bar)
			goto out;
	}

	if (has_msix && (u32)bar_num == pba->bar) {
		if (pba->bar == table->bar)
			pba->guest_phys_addr = table->guest_phys_addr + pba->bar_offset;
		else
			pba->guest_phys_addr = region->guest_phys_addr;
		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
					 pba->size, false,
					 vfio_pci_msix_pba_access, pdev);
		goto out;
	}

	ret = vfio_map_region(kvm, vdev, region);
out:
	return ret;
}

static int vfio_pci_bar_deactivate(struct kvm *kvm,
				   struct pci_device_header *pci_hdr,
				   int bar_num, void *data)
{
	struct vfio_device *vdev = data;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct vfio_region *region;
	bool has_msix, success;
	int ret;

	assert((u32)bar_num < vdev->info.num_regions);

	region = &vdev->regions[bar_num];
	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;

	if (has_msix && (u32)bar_num == table->bar) {
		success = kvm__deregister_mmio(kvm, table->guest_phys_addr);
		/*
		 * kvm__deregister_mmio fails when the region is not found.
		 */
		ret = (success ? 0 : -ENOENT);
		/* See vfio_pci_bar_activate(). */
		if (ret < 0 || table->bar != pba->bar)
			goto out;
	}

	if (has_msix && (u32)bar_num == pba->bar) {
		success = kvm__deregister_mmio(kvm, pba->guest_phys_addr);
		ret = (success ? 0 : -ENOENT);
		goto out;
	}

	vfio_unmap_region(kvm, region);
	ret = 0;

out:
	return ret;
}

static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
			      u16 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	char base[sz];

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	/* Dummy read in case of side-effects */
	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
			       u16 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	u32 tmp;

	/* Make sure a larger size will not overrun tmp on the stack. */
	assert(sz <= 4);

	if (offset == PCI_ROM_ADDRESS)
		return;

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
			      sz, offset);

	/* Handle MSI write now, since it might update the hardware capability */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);

	if (pread(vdev->fd, &tmp, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
{
	size_t size = 10;

	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
		size += 4;
	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
		size += 10;

	return size;
}

static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
{
	switch (cap_hdr->type) {
	case PCI_CAP_ID_MSIX:
		return PCI_CAP_MSIX_SIZEOF;
	case PCI_CAP_ID_MSI:
		return vfio_pci_msi_cap_size((void *)cap_hdr);
	case PCI_CAP_ID_EXP:
		/*
		 * We don't emulate any of the link, slot and root complex
		 * properties, so ignore them.
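		 * Only the first 12 bytes of the capability (up to and
		 * including the Device Control/Status registers) are copied.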
		 */
		return PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1;
	default:
		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
		return 0;
	}
}

static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
			    struct pci_cap_hdr *cap, off_t pos)
{
	struct pci_cap_hdr *last;
	struct pci_device_header *hdr = &vdev->pci.hdr;

	cap->next = 0;

	if (!hdr->capabilities) {
		hdr->capabilities = pos;
		hdr->status |= PCI_STATUS_CAP_LIST;
	} else {
		last = PCI_CAP(virt_hdr, hdr->capabilities);

		while (last->next)
			last = PCI_CAP(virt_hdr, last->next);

		last->next = pos;
	}

	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));

	return 0;
}

static int vfio_pci_parse_caps(struct vfio_device *vdev)
{
	int ret;
	size_t size;
	u16 pos, next;
	struct pci_cap_hdr *cap;
	u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY];
	struct vfio_pci_device *pdev = &vdev->pci;

	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
		return 0;

	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY);

	pos = pdev->hdr.capabilities & ~3;

	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
	pdev->hdr.capabilities = 0;

	for (; pos; pos = next) {
		cap = PCI_CAP(&pdev->hdr, pos);
		next = cap->next;

		switch (cap->type) {
		case PCI_CAP_ID_MSIX:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msix.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
			break;
		case PCI_CAP_ID_MSI:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msi.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
			break;
		case PCI_CAP_ID_EXP:
			if (!arch_has_pci_exp())
				continue;
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;
			break;
		}
	}

	/* Wipe remaining capabilities */
	pos = PCI_STD_HEADER_SIZEOF;
	size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF;
	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);

	return 0;
}

static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
{
	ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
		vfio_dev_err(vdev, "Config Space not found");
		return -ENODEV;
	}

	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	*info = (struct vfio_region_info) {
		.argsz = sizeof(*info),
		.index = VFIO_PCI_CONFIG_REGION_INDEX,
	};

	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
	if (!info->size) {
		vfio_dev_err(vdev, "Config Space has size zero?!");
		return -EINVAL;
	}

	/* Read standard headers and capabilities */
	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
		return -EIO;
	}

	/* Strip bit 7, that indicates multifunction */
	pdev->hdr.header_type &= 0x7f;

	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
		vfio_dev_err(vdev, "unsupported header type %u",
			     pdev->hdr.header_type);
		return -EOPNOTSUPP;
	}

	if (pdev->hdr.irq_pin)
		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;

	vfio_pci_parse_caps(vdev);

	return 0;
}

static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
{
	int i;
	u64 base;
	ssize_t hdr_sz;
	struct msix_cap *msix;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region;

	/* Initialise the BARs */
	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		if ((u32)i == vdev->info.num_regions)
			break;

		region = &vdev->regions[i];
		/* Construct a fake reg to match what we've mapped. */
		if (region->is_ioport) {
			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
				PCI_BASE_ADDRESS_SPACE_IO;
		} else {
			base = (region->guest_phys_addr &
				PCI_BASE_ADDRESS_MEM_MASK) |
				PCI_BASE_ADDRESS_SPACE_MEMORY;
		}

		pdev->hdr.bar[i] = base;

		if (!base)
			continue;

		pdev->hdr.bar_size[i] = region->info.size;
	}

	/* I really can't be bothered to support cardbus. */
	pdev->hdr.card_bus = 0;

	/*
	 * Nuke the expansion ROM for now. If we want to do this properly,
	 * we need to save its size somewhere and map into the guest.
	 */
	pdev->hdr.exp_rom_bar = 0;

	/* Plumb in our fake MSI-X capability, if we have it. */
	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
	if (msix) {
		/* Add a shortcut to the PBA region for the MMIO handler */
		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
		u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET;

		pdev->msix_pba.fd_offset = vdev->regions[pba_index].info.offset +
					   pba_bar_offset;

		/* Tidy up the capability */
		msix->table_offset &= PCI_MSIX_TABLE_BIR;
		if (pdev->msix_table.bar == pdev->msix_pba.bar) {
			/* Keep the same offset as the MSIX cap. */
			pdev->msix_pba.bar_offset = pba_bar_offset;
		} else {
			/* PBA is at the start of the BAR. */
			msix->pba_offset &= PCI_MSIX_PBA_BIR;
			pdev->msix_pba.bar_offset = 0;
		}
	}

	/* Install our fake Configuration Space */
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	/*
	 * We don't touch the extended configuration space, let's be cautious
	 * and not overwrite it all with zeros, or bad things might happen.
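	 * Only the legacy 256-byte configuration space is written back here.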
	 */
	hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
			     hdr_sz);
		return -EIO;
	}

	/* Register callbacks for cfg accesses */
	pdev->hdr.cfg_ops = (struct pci_config_operations) {
		.read	= vfio_pci_cfg_read,
		.write	= vfio_pci_cfg_write,
	};

	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;

	return 0;
}

static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index,
				    struct vfio_region_info *info)
{
	int ret;

	*info = (struct vfio_region_info) {
		.argsz = sizeof(*info),
		.index = index,
	};

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
	if (ret) {
		ret = -errno;
		vfio_dev_err(vdev, "cannot get info for BAR %u", index);
		return ret;
	}

	if (info->size && !is_power_of_two(info->size)) {
		vfio_dev_err(vdev, "region is not power of two: 0x%llx",
			     info->size);
		return -EINVAL;
	}

	return 0;
}

static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	size_t i;
	size_t map_size;
	size_t nr_entries;
	struct vfio_pci_msi_entry *entries;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
	struct vfio_region_info info;

	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;

	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;

	/* MSIX table and PBA must support QWORD accesses. */
	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, 8);
	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), 8);

	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
	if (!entries)
		return -ENOMEM;

	for (i = 0; i < nr_entries; i++)
		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;

	ret = vfio_pci_get_region_info(vdev, table->bar, &info);
	if (ret)
		return ret;
	if (!info.size)
		return -EINVAL;

	map_size = ALIGN(info.size, PAGE_SIZE);
	table->guest_phys_addr = pci_get_mmio_block(map_size);
	if (!table->guest_phys_addr) {
		pr_err("cannot allocate MMIO space");
		ret = -ENOMEM;
		goto out_free;
	}

	/*
	 * We could map the physical PBA directly into the guest, but it's
	 * likely smaller than a page, and we can only hand full pages to the
	 * guest. Even though the PCI spec disallows sharing a page used for
	 * MSI-X with any other resource, it allows sharing the same page
	 * between the MSI-X table and the PBA. For the sake of isolation,
	 * create a virtual PBA.
	 */
	if (table->bar == pba->bar) {
		u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET;

		/*
		 * Sanity checks.
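		 * The table must end before the PBA begins, and the PBA must
		 * fit within the region.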
		 */
		if (table->size > pba_bar_offset)
			die("MSIX table overlaps with PBA");
		if (pba_bar_offset + pba->size > info.size)
			die("PBA exceeds the size of the region");
		pba->guest_phys_addr = table->guest_phys_addr + pba_bar_offset;
	} else {
		ret = vfio_pci_get_region_info(vdev, pba->bar, &info);
		if (ret)
			return ret;
		if (!info.size)
			return -EINVAL;

		map_size = ALIGN(info.size, PAGE_SIZE);
		pba->guest_phys_addr = pci_get_mmio_block(map_size);
		if (!pba->guest_phys_addr) {
			pr_err("cannot allocate MMIO space");
			ret = -ENOMEM;
			goto out_free;
		}
	}

	pdev->msix.entries = entries;
	pdev->msix.nr_entries = nr_entries;

	return 0;

out_free:
	free(entries);

	return ret;
}

static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
{
	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
	pdev->msi.entries = calloc(pdev->msi.nr_entries,
				   sizeof(struct vfio_pci_msi_entry));
	if (!pdev->msi.entries)
		return -ENOMEM;

	return 0;
}

static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
				  size_t nr)
{
	int ret;
	u32 bar;
	size_t map_size;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region;

	if (nr >= vdev->info.num_regions)
		return 0;

	region = &vdev->regions[nr];
	bar = pdev->hdr.bar[nr];

	region->vdev = vdev;
	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);

	ret = vfio_pci_get_region_info(vdev, nr, &region->info);
	if (ret)
		return ret;

	/* Ignore invalid or unimplemented regions */
	if (!region->info.size)
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		/* Trap and emulate MSI-X table */
		if (nr == pdev->msix_table.bar) {
			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
			return 0;
		} else if (nr == pdev->msix_pba.bar) {
			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
			return 0;
		}
	}

	if (region->is_ioport) {
		region->port_base = pci_get_io_port_block(region->info.size);
	} else {
		/* Grab some MMIO space in the guest */
		map_size = ALIGN(region->info.size, PAGE_SIZE);
		region->guest_phys_addr = pci_get_mmio_block(map_size);
	}

	return 0;
}

static int vfio_pci_configure_dev_regions(struct kvm *kvm,
					  struct vfio_device *vdev)
{
	int ret;
	u32 bar;
	size_t i;
	bool is_64bit = false;
	struct vfio_pci_device *pdev = &vdev->pci;

	ret = vfio_pci_parse_cfg_space(vdev);
	if (ret)
		return ret;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		ret = vfio_pci_create_msix_table(kvm, vdev);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		ret = vfio_pci_create_msi_cap(kvm, pdev);
		if (ret)
			return ret;
	}

	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		/* Ignore top half of 64-bit BAR */
		if (is_64bit) {
			is_64bit = false;
			continue;
		}

		ret = vfio_pci_configure_bar(kvm, vdev, i);
		if (ret)
			return ret;

		bar = pdev->hdr.bar[i];
		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
	}

	/* We've configured the BARs, fake up a Configuration Space */
	ret = vfio_pci_fixup_cfg_space(vdev);
	if (ret)
		return ret;

	return pci__register_bar_regions(kvm, &pdev->hdr, vfio_pci_bar_activate,
					 vfio_pci_bar_deactivate, vdev);
}

/*
 * Attempt to update the FD limit, if opening an eventfd for each IRQ vector
 * would hit the limit, which is likely to happen when a device uses 2048 MSIs.
 */
static int vfio_pci_reserve_irq_fds(size_t num)
{
	/*
	 * I counted around 27 fds under normal load. Let's add 100 for good
	 * measure.
	 */
	static size_t needed = 128;
	struct rlimit fd_limit, new_limit;

	needed += num;

	if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
		perror("getrlimit(RLIMIT_NOFILE)");
		return 0;
	}

	if (fd_limit.rlim_cur >= needed)
		return 0;

	new_limit.rlim_cur = needed;

	if (fd_limit.rlim_max < needed)
		/* Try to bump hard limit (root only) */
		new_limit.rlim_max = needed;
	else
		new_limit.rlim_max = fd_limit.rlim_max;

	if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
		perror("setrlimit(RLIMIT_NOFILE)");
		pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
			   (size_t)(needed - fd_limit.rlim_cur));
	}

	return 0;
}

static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
			      struct vfio_pci_msi_common *msis)
{
	int ret;
	size_t i;
	int *eventfds;
	size_t irq_set_size;
	struct vfio_pci_msi_entry *entry;
	size_t nr_entries = msis->nr_entries;

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
	if (ret || msis->info.count == 0) {
		vfio_dev_err(vdev, "no MSI reported by VFIO");
		return -ENODEV;
	}

	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
		return -EINVAL;
	}

	if (msis->info.count != nr_entries) {
		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
		return -EINVAL;
	}

	mutex_init(&msis->mutex);

	vfio_pci_reserve_irq_fds(nr_entries);

	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
	msis->irq_set = malloc(irq_set_size);
	if (!msis->irq_set)
		return -ENOMEM;

	*msis->irq_set = (struct vfio_irq_set) {
		.argsz = irq_set_size,
		.flags = VFIO_IRQ_SET_DATA_EVENTFD |
			 VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = msis->info.index,
		.start = 0,
		.count = nr_entries,
	};

	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

	for (i = 0; i < nr_entries; i++) {
		entry = &msis->entries[i];
		entry->gsi = -1;
		entry->eventfd = -1;
		msi_set_masked(entry->virt_state, true);
		msi_set_masked(entry->phys_state, true);
		eventfds[i] = -1;
	}

	return 0;
}

static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->intx_gsi;
	struct vfio_irq_set irq_set = {
		.argsz = sizeof(irq_set),
		.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = VFIO_PCI_INTX_IRQ_INDEX,
	};

	if (pdev->intx_fd == -1)
		return;

	pr_debug("user requested MSI, disabling INTx %d", gsi);

	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	irq__del_irqfd(kvm, gsi, pdev->intx_fd);

	close(pdev->intx_fd);
	close(pdev->unmask_fd);
	pdev->intx_fd = -1;
}

static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	int trigger_fd, unmask_fd;
	union vfio_irq_eventfd trigger;
	union vfio_irq_eventfd unmask;
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->intx_gsi;

	if (pdev->intx_fd != -1)
		return 0;

	/*
	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
	 * signals an interrupt from host to guest, and unmask_fd signals the
	 * deassertion of the line from guest to host.
	 */
	trigger_fd = eventfd(0, 0);
	if (trigger_fd < 0) {
		vfio_dev_err(vdev, "failed to create trigger eventfd");
		return trigger_fd;
	}

	unmask_fd = eventfd(0, 0);
	if (unmask_fd < 0) {
		vfio_dev_err(vdev, "failed to create unmask eventfd");
		close(trigger_fd);
		return unmask_fd;
	}

	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
	if (ret)
		goto err_close;

	trigger.irq = (struct vfio_irq_set) {
		.argsz = sizeof(trigger),
		.flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = VFIO_PCI_INTX_IRQ_INDEX,
		.start = 0,
		.count = 1,
	};
	set_vfio_irq_eventd_payload(&trigger, trigger_fd);

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
		goto err_delete_line;
	}

	unmask.irq = (struct vfio_irq_set) {
		.argsz = sizeof(unmask),
		.flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
		.index = VFIO_PCI_INTX_IRQ_INDEX,
		.start = 0,
		.count = 1,
	};
	set_vfio_irq_eventd_payload(&unmask, unmask_fd);

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup unmask IRQ");
		goto err_remove_event;
	}

	pdev->intx_fd = trigger_fd;
	pdev->unmask_fd = unmask_fd;

	return 0;

err_remove_event:
	/* Remove trigger event */
	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	trigger.irq.count = 0;
	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);

err_delete_line:
	irq__del_irqfd(kvm, gsi, trigger_fd);

err_close:
	close(trigger_fd);
	close(unmask_fd);
	return ret;
}

static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_irq_info irq_info = {
		.argsz = sizeof(irq_info),
		.index = VFIO_PCI_INTX_IRQ_INDEX,
	};

	vfio_pci_reserve_irq_fds(2);

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
	if (ret || irq_info.count == 0) {
		vfio_dev_err(vdev, "no INTx reported by VFIO");
		return -ENODEV;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not eventfd capable");
		return -EINVAL;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
		return -EINVAL;
	}

	/* Guest is going to overwrite our irq_line... */
	pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;

	pdev->intx_fd = -1;

	return 0;
}

static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret = 0;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		pdev->msix.info = (struct vfio_irq_info) {
			.argsz = sizeof(pdev->msix.info),
			.index = VFIO_PCI_MSIX_IRQ_INDEX,
		};
		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		pdev->msi.info = (struct vfio_irq_info) {
			.argsz = sizeof(pdev->msi.info),
			.index = VFIO_PCI_MSI_IRQ_INDEX,
		};
		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
		pci__assign_irq(&vdev->pci.hdr);

		ret = vfio_pci_init_intx(kvm, vdev);
		if (ret)
			return ret;

		ret = vfio_pci_enable_intx(kvm, vdev);
	}

	return ret;
}

int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;

	ret = vfio_pci_configure_dev_regions(kvm, vdev);
	if (ret) {
		vfio_dev_err(vdev, "failed to configure regions");
		return ret;
	}

	vdev->dev_hdr = (struct device_header) {
		.bus_type	= DEVICE_BUS_PCI,
		.data		= &vdev->pci.hdr,
	};

	ret = device__register(&vdev->dev_hdr);
	if (ret) {
		vfio_dev_err(vdev, "failed to register VFIO device");
		return ret;
	}

	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
	if (ret) {
		vfio_dev_err(vdev, "failed to configure IRQs");
		return ret;
	}

	return 0;
}

void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
{
	size_t i;
	struct vfio_pci_device *pdev = &vdev->pci;

	for (i = 0; i < vdev->info.num_regions; i++)
		vfio_unmap_region(kvm, &vdev->regions[i]);

	device__unregister(&vdev->dev_hdr);

	free(pdev->msix.irq_set);
	free(pdev->msix.entries);
	free(pdev->msi.irq_set);
	free(pdev->msi.entries);
}