#include "kvm/irq.h"
#include "kvm/kvm.h"
#include "kvm/kvm-cpu.h"
#include "kvm/vfio.h"

#include <assert.h>

#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <sys/resource.h>
#include <sys/time.h>

/* Some distros don't have the define. */
#ifndef PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1
#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1 12
#endif

/* Wrapper around UAPI vfio_irq_set */
union vfio_irq_eventfd {
	struct vfio_irq_set irq;
	u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)];
};

static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd)
{
	memcpy(&evfd->irq.data, &fd, sizeof(fd));
}

#define msi_is_enabled(state)	((state) & VFIO_PCI_MSI_STATE_ENABLED)
#define msi_is_masked(state)	((state) & VFIO_PCI_MSI_STATE_MASKED)
#define msi_is_empty(state)	((state) & VFIO_PCI_MSI_STATE_EMPTY)

#define msi_update_state(state, val, bit) \
	(state) = (val) ? (state) | bit : (state) & ~bit;
#define msi_set_enabled(state, val) \
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
#define msi_set_masked(state, val) \
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
#define msi_set_empty(state, val) \
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)

static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);

static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
				bool msix)
{
	size_t i;
	int ret = 0;
	int *eventfds;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	union vfio_irq_eventfd single = {
		.irq = {
			.argsz = sizeof(single),
			.flags = VFIO_IRQ_SET_DATA_EVENTFD |
				 VFIO_IRQ_SET_ACTION_TRIGGER,
			.index = msis->info.index,
			.count = 1,
		},
	};

	if (!msi_is_enabled(msis->virt_state))
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		/*
		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the
		 * same time. Since INTx has to be enabled from the start (we
		 * don't have a reliable way to know when the guest starts
		 * using it), disable it now.
		 */
		vfio_pci_disable_intx(kvm, vdev);

	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

	/*
	 * Initial registration of the full range. This enables the physical
	 * MSI/MSI-X capability, which might have desired side effects. For
	 * instance when assigning virtio legacy devices, enabling the MSI
	 * capability modifies the config space layout!
	 *
	 * As an optimization, only update MSIs when the guest unmasks the
	 * capability. This greatly reduces the initialization time for a
	 * Linux guest with 2048+ MSIs. A Linux guest starts by enabling the
	 * MSI-X cap masked, then fills individual vectors, then unmasks the
	 * whole function. So we only do one VFIO ioctl when enabling for the
	 * first time, and then one when unmasking.
	 *
	 * phys_state is empty when the capability is enabled but no vector
	 * has been registered via SET_IRQS yet.
	 */
	if (!msi_is_enabled(msis->phys_state) ||
	    (!msi_is_masked(msis->virt_state) &&
	     msi_is_empty(msis->phys_state))) {
		bool empty = true;

		for (i = 0; i < msis->nr_entries; i++) {
			eventfds[i] = msis->entries[i].gsi >= 0 ?
				      msis->entries[i].eventfd : -1;

			if (eventfds[i] >= 0)
				empty = false;
		}

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(multi)");
			return ret;
		}

		msi_set_enabled(msis->phys_state, true);
		msi_set_empty(msis->phys_state, empty);

		return 0;
	}

	if (msi_is_masked(msis->virt_state)) {
		/* TODO: if phys_state is neither empty nor masked, mask all vectors */
		return 0;
	}

	/* Update individual vectors to avoid breaking those in use */
	for (i = 0; i < msis->nr_entries; i++) {
		struct vfio_pci_msi_entry *entry = &msis->entries[i];
		int fd = entry->gsi >= 0 ? entry->eventfd : -1;

		if (fd == eventfds[i])
			continue;

		single.irq.start = i;
		set_vfio_irq_eventd_payload(&single, fd);

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(single)");
			break;
		}

		eventfds[i] = fd;

		if (msi_is_empty(msis->phys_state) && fd >= 0)
			msi_set_empty(msis->phys_state, false);
	}

	return ret;
}

static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
				 bool msix)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	struct vfio_irq_set irq_set = {
		.argsz = sizeof(irq_set),
		.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = msis->info.index,
		.start = 0,
		.count = 0,
	};

	if (!msi_is_enabled(msis->phys_state))
		return 0;

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	if (ret < 0) {
		perror("VFIO_DEVICE_SET_IRQS(NONE)");
		return ret;
	}

	msi_set_enabled(msis->phys_state, false);
	msi_set_empty(msis->phys_state, true);

	/*
	 * This might be called because the PCI driver detected an MSI
	 * interrupt failure and wants to roll back to INTx mode. Thus enable
	 * INTx if the device supports it.
	 */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		ret = vfio_pci_enable_intx(kvm, vdev);

	return ret >= 0 ? 0 : ret;
}

static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
				     struct vfio_pci_msi_entry *entry)
{
	int ret;

	if (entry->eventfd < 0) {
		entry->eventfd = eventfd(0, 0);
		if (entry->eventfd < 0) {
			ret = -errno;
			vfio_dev_err(vdev, "cannot create eventfd");
			return ret;
		}
	}

	/* Allocate IRQ if necessary */
	if (entry->gsi < 0) {
		int ret = irq__add_msix_route(kvm, &entry->config.msg,
					      vdev->dev_hdr.dev_num << 3);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot create MSI-X route");
			return ret;
		}
		entry->gsi = ret;
	} else {
		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
	}

	/*
	 * MSI masking is unimplemented in VFIO, so we have to handle it by
	 * disabling/enabling the IRQ route instead. We do it on the KVM side
	 * rather than the VFIO side, because:
	 * - it is 8x faster;
	 * - it allows us to decouple masking logic from capability state;
	 * - in masked state, after removing the irqfd route, we could easily
	 *   plug the eventfd into a local handler, in order to serve Pending
	 *   Bit reads to the guest.
	 *
	 * So entry->phys_state is masked when there is no active irqfd route.
	 */
	if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
		return 0;

	if (msi_is_masked(entry->phys_state)) {
		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot setup irqfd");
			return ret;
		}
	} else {
		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
	}

	msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));

	return 0;
}

static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				     u32 len, u8 is_write, void *ptr)
{
	struct vfio_pci_device *pdev = ptr;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	u64 offset = addr - pba->guest_phys_addr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	if (is_write)
		return;

	/*
	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the
	 * PBA is completely useless here. Note that Linux doesn't use the PBA.
	 */
	if (pread(vdev->fd, data, len, pba->offset + offset) != (ssize_t)len)
		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
}

static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				       u32 len, u8 is_write, void *ptr)
{
	struct kvm *kvm = vcpu->kvm;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = ptr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	u64 offset = addr - pdev->msix_table.guest_phys_addr;

	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
	off_t field = offset % PCI_MSIX_ENTRY_SIZE;

	/*
	 * The PCI spec says that software must use aligned 4- or 8-byte
	 * accesses for the MSI-X tables.
	 */
	if ((len != 4 && len != 8) || addr & (len - 1)) {
		vfio_dev_warn(vdev, "invalid MSI-X table access");
		return;
	}

	entry = &pdev->msix.entries[vector];

	mutex_lock(&pdev->msix.mutex);

	if (!is_write) {
		memcpy(data, (void *)&entry->config + field, len);
		goto out_unlock;
	}

	memcpy((void *)&entry->config + field, data, len);

	/*
	 * Check if the access touched the vector control register, which is
	 * at the end of the MSI-X entry.
	 */
	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
		goto out_unlock;

	msi_set_masked(entry->virt_state, entry->config.ctrl &
		       PCI_MSIX_ENTRY_CTRL_MASKBIT);

	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
		/* Not much we can do here. */
		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");

out_unlock:
	mutex_unlock(&pdev->msix.mutex);
}

static void vfio_pci_msix_cap_write(struct kvm *kvm,
				    struct vfio_device *vdev, u16 off,
				    void *data, int sz)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	off_t enable_pos = PCI_MSIX_FLAGS + 1;
	bool enable;
	u16 flags;

	off -= pdev->msix.pos;

	/* Check if access intersects with the MSI-X Enable bit */
	if (off > enable_pos || off + sz <= enable_pos)
		return;

	/* Read byte that contains the Enable bit */
	flags = *(u8 *)(data + enable_pos - off) << 8;

	mutex_lock(&pdev->msix.mutex);

	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
	enable = flags & PCI_MSIX_FLAGS_ENABLE;
	msi_set_enabled(pdev->msix.virt_state, enable);

	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");
	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot disable MSIX");

	mutex_unlock(&pdev->msix.mutex);
}

static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
				     u16 off, u8 *data, u32 sz)
{
	size_t i;
	u32 mask = 0;
	size_t mask_pos, start, limit;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
		return 0;

	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
		mask_pos = PCI_MSI_MASK_64;
	else
		mask_pos = PCI_MSI_MASK_32;

	if (off >= mask_pos + 4 || off + sz <= mask_pos)
		return 0;

	/* Set mask to current state */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		entry = &pdev->msi.entries[i];
		mask |= !!msi_is_masked(entry->virt_state) << i;
	}

	/* Update mask following the intersection of access and register */
	start = max_t(size_t, off, mask_pos);
	limit = min_t(size_t, off + sz, mask_pos + 4);

	memcpy((void *)&mask + start - mask_pos, data + start - off,
	       limit - start);

	/* Update states if necessary */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		bool masked = mask & (1 << i);

		entry = &pdev->msi.entries[i];
		if (masked != msi_is_masked(entry->virt_state)) {
			msi_set_masked(entry->virt_state, masked);
			vfio_pci_update_msi_entry(kvm, vdev, entry);
		}
	}

	return 1;
}

static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
				   u16 off, u8 *data, u32 sz)
{
	u8 ctrl;
	struct msi_msg msg;
	size_t i, nr_vectors;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	off -= pdev->msi.pos;

	mutex_lock(&pdev->msi.mutex);

	/* Check if the guest is trying to update mask bits */
	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
		goto out_unlock;

	/* Only modify routes when guest pokes the enable bit */
	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
		goto out_unlock;

	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);

	msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE);

	if (!msi_is_enabled(pdev->msi.virt_state)) {
		vfio_pci_disable_msis(kvm, vdev, false);
		goto out_unlock;
	}

	/* Create routes for the requested vectors */
	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);

	msg.address_lo = msi_cap_64->address_lo;
	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
		msg.address_hi = msi_cap_64->address_hi;
		msg.data = msi_cap_64->data;
	} else {
		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
		msg.address_hi = 0;
		msg.data = msi_cap_32->data;
	}

	for (i = 0; i < nr_vectors; i++) {
		entry = &pdev->msi.entries[i];

		/*
		 * Set the MSI data value as required by the PCI local
		 * bus specifications, MSI capability, "Message Data".
		 */
		msg.data &= ~(nr_vectors - 1);
		msg.data |= i;

		entry->config.msg = msg;
		vfio_pci_update_msi_entry(kvm, vdev, entry);
	}

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, false))
		vfio_dev_err(vdev, "cannot enable MSI");

out_unlock:
	mutex_unlock(&pdev->msi.mutex);
}

static int vfio_pci_bar_activate(struct kvm *kvm,
				 struct pci_device_header *pci_hdr,
				 int bar_num, void *data)
{
	struct vfio_device *vdev = data;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct vfio_region *region;
	u32 bar_addr;
	bool has_msix;
	int ret;

	assert((u32)bar_num < vdev->info.num_regions);

	region = &vdev->regions[bar_num];
	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;

	bar_addr = pci__bar_address(pci_hdr, bar_num);
	if (pci__bar_is_io(pci_hdr, bar_num))
		region->port_base = bar_addr;
	else
		region->guest_phys_addr = bar_addr;

	if (has_msix && (u32)bar_num == table->bar) {
		table->guest_phys_addr = region->guest_phys_addr;
		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
					 table->size, false,
					 vfio_pci_msix_table_access, pdev);
		/*
		 * The MSIX table and the PBA structure can share the same BAR,
		 * but for convenience we register different regions for mmio
		 * emulation. We want to update both if they share the same
		 * BAR.
		 */
		if (ret < 0 || table->bar != pba->bar)
			goto out;
	}

	if (has_msix && (u32)bar_num == pba->bar) {
		if (pba->bar == table->bar)
			pba->guest_phys_addr = table->guest_phys_addr + table->size;
		else
			pba->guest_phys_addr = region->guest_phys_addr;
		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
					 pba->size, false,
					 vfio_pci_msix_pba_access, pdev);
		goto out;
	}

	ret = vfio_map_region(kvm, vdev, region);
out:
	return ret;
}

static int vfio_pci_bar_deactivate(struct kvm *kvm,
				   struct pci_device_header *pci_hdr,
				   int bar_num, void *data)
{
	struct vfio_device *vdev = data;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct vfio_region *region;
	bool has_msix, success;
	int ret;

	assert((u32)bar_num < vdev->info.num_regions);

	region = &vdev->regions[bar_num];
	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;

	if (has_msix && (u32)bar_num == table->bar) {
		success = kvm__deregister_mmio(kvm, table->guest_phys_addr);
		/* kvm__deregister_mmio fails when the region is not found. */
		ret = (success ? 0 : -ENOENT);
		/* See vfio_pci_bar_activate(). */
		if (ret < 0 || table->bar != pba->bar)
			goto out;
	}

	if (has_msix && (u32)bar_num == pba->bar) {
		success = kvm__deregister_mmio(kvm, pba->guest_phys_addr);
		ret = (success ? 0 : -ENOENT);
		goto out;
	}

	vfio_unmap_region(kvm, region);
	ret = 0;

out:
	return ret;
}

static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
			      u16 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	char base[sz];

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	/* Dummy read in case of side-effects */
	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
			       u16 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	u32 tmp;

	/* Make sure a larger size will not overrun tmp on the stack. */
	assert(sz <= 4);

	if (offset == PCI_ROM_ADDRESS)
		return;

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
			      sz, offset);

	/* Handle MSI write now, since it might update the hardware capability */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);

	if (pread(vdev->fd, &tmp, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
{
	size_t size = 10;

	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
		size += 4;
	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
		size += 10;

	return size;
}

static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
{
	switch (cap_hdr->type) {
	case PCI_CAP_ID_MSIX:
		return PCI_CAP_MSIX_SIZEOF;
	case PCI_CAP_ID_MSI:
		return vfio_pci_msi_cap_size((void *)cap_hdr);
	case PCI_CAP_ID_EXP:
		/*
		 * We don't emulate any of the link, slot and root complex
		 * properties, so ignore them.
		 */
		return PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1;
	default:
		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
		return 0;
	}
}

static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
			    struct pci_cap_hdr *cap, off_t pos)
{
	struct pci_cap_hdr *last;
	struct pci_device_header *hdr = &vdev->pci.hdr;

	cap->next = 0;

	if (!hdr->capabilities) {
		hdr->capabilities = pos;
		hdr->status |= PCI_STATUS_CAP_LIST;
	} else {
		last = PCI_CAP(virt_hdr, hdr->capabilities);

		while (last->next)
			last = PCI_CAP(virt_hdr, last->next);

		last->next = pos;
	}

	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));

	return 0;
}

static int vfio_pci_parse_caps(struct vfio_device *vdev)
{
	int ret;
	size_t size;
	u16 pos, next;
	struct pci_cap_hdr *cap;
	u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY];
	struct vfio_pci_device *pdev = &vdev->pci;

	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
		return 0;

	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY);

	pos = pdev->hdr.capabilities & ~3;

	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
	pdev->hdr.capabilities = 0;

	for (; pos; pos = next) {
		cap = PCI_CAP(&pdev->hdr, pos);
		next = cap->next;

		switch (cap->type) {
		case PCI_CAP_ID_MSIX:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msix.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
			break;
		case PCI_CAP_ID_MSI:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msi.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
			break;
		case PCI_CAP_ID_EXP:
			if (!arch_has_pci_exp())
				continue;
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;
			break;
		}
	}

	/* Wipe remaining capabilities */
	pos = PCI_STD_HEADER_SIZEOF;
	size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF;
	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);

	return 0;
}

static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
{
	ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
		vfio_dev_err(vdev, "Config Space not found");
		return -ENODEV;
	}

	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	*info = (struct vfio_region_info) {
		.argsz = sizeof(*info),
		.index = VFIO_PCI_CONFIG_REGION_INDEX,
	};

	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
	if (!info->size) {
		vfio_dev_err(vdev, "Config Space has size zero?!");
		return -EINVAL;
	}

	/* Read standard headers and capabilities */
	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
		return -EIO;
	}

	/* Strip bit 7, which indicates multifunction */
	pdev->hdr.header_type &= 0x7f;

	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
		vfio_dev_err(vdev, "unsupported header type %u",
			     pdev->hdr.header_type);
		return -EOPNOTSUPP;
	}

	if (pdev->hdr.irq_pin)
		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;

	vfio_pci_parse_caps(vdev);

	return 0;
}

static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
{
	int i;
	u64 base;
	ssize_t hdr_sz;
	struct msix_cap *msix;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region;

	/* Initialise the BARs */
	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		if ((u32)i == vdev->info.num_regions)
			break;

		region = &vdev->regions[i];
		/* Construct a fake reg to match what we've mapped. */
		if (region->is_ioport) {
			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
				PCI_BASE_ADDRESS_SPACE_IO;
		} else {
			base = (region->guest_phys_addr &
				PCI_BASE_ADDRESS_MEM_MASK) |
				PCI_BASE_ADDRESS_SPACE_MEMORY;
		}

		pdev->hdr.bar[i] = base;

		if (!base)
			continue;

		pdev->hdr.bar_size[i] = region->info.size;
	}

	/* I really can't be bothered to support cardbus. */
	pdev->hdr.card_bus = 0;

	/*
	 * Nuke the expansion ROM for now. If we want to do this properly,
	 * we need to save its size somewhere and map into the guest.
	 */
	pdev->hdr.exp_rom_bar = 0;

	/* Plumb in our fake MSI-X capability, if we have it. */
	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
	if (msix) {
		/* Add a shortcut to the PBA region for the MMIO handler */
		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
		pdev->msix_pba.offset = vdev->regions[pba_index].info.offset +
					(msix->pba_offset & PCI_MSIX_PBA_OFFSET);

		/* Tidy up the capability */
		msix->table_offset &= PCI_MSIX_TABLE_BIR;
		msix->pba_offset &= PCI_MSIX_PBA_BIR;
		if (pdev->msix_table.bar == pdev->msix_pba.bar)
			msix->pba_offset |= pdev->msix_table.size &
					    PCI_MSIX_PBA_OFFSET;
	}

	/* Install our fake Configuration Space */
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	/*
	 * We don't touch the extended configuration space, let's be cautious
	 * and not overwrite it all with zeros, or bad things might happen.
	 */
	hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
			     hdr_sz);
		return -EIO;
	}

	/* Register callbacks for cfg accesses */
	pdev->hdr.cfg_ops = (struct pci_config_operations) {
		.read	= vfio_pci_cfg_read,
		.write	= vfio_pci_cfg_write,
	};

	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;

	return 0;
}

static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index,
				    struct vfio_region_info *info)
{
	int ret;

	*info = (struct vfio_region_info) {
		.argsz = sizeof(*info),
		.index = index,
	};

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
	if (ret) {
		ret = -errno;
		vfio_dev_err(vdev, "cannot get info for BAR %u", index);
		return ret;
	}

	if (info->size && !is_power_of_two(info->size)) {
		vfio_dev_err(vdev, "region is not a power of two: 0x%llx",
			     info->size);
		return -EINVAL;
	}

	return 0;
}

static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	size_t i;
	size_t map_size;
	size_t nr_entries;
	struct vfio_pci_msi_entry *entries;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
	struct vfio_region_info info;

	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;

	/*
	 * KVM needs memory regions to be a multiple of, and aligned on,
	 * PAGE_SIZE.
	 */
	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), PAGE_SIZE);

	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
	if (!entries)
		return -ENOMEM;

	for (i = 0; i < nr_entries; i++)
		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;

	ret = vfio_pci_get_region_info(vdev, table->bar, &info);
	if (ret)
		goto out_free;
	if (!info.size) {
		ret = -EINVAL;
		goto out_free;
	}
	map_size = info.size;

	if (table->bar != pba->bar) {
		ret = vfio_pci_get_region_info(vdev, pba->bar, &info);
		if (ret)
			goto out_free;
		if (!info.size) {
			ret = -EINVAL;
			goto out_free;
		}
		map_size += info.size;
	}

	/*
	 * To ease MSI-X cap configuration in case they share the same BAR,
	 * collapse table and pending array. The size of the BAR regions must
	 * be powers of two.
	 */
	map_size = ALIGN(map_size, PAGE_SIZE);
	table->guest_phys_addr = pci_get_mmio_block(map_size);
	if (!table->guest_phys_addr) {
		pr_err("cannot allocate MMIO space");
		ret = -ENOMEM;
		goto out_free;
	}

	/*
	 * We could map the physical PBA directly into the guest, but it's
	 * likely smaller than a page, and we can only hand full pages to the
	 * guest. Even though the PCI spec disallows sharing a page used for
	 * MSI-X with any other resource, it allows sharing the same page
	 * between the MSI-X table and the PBA. For the sake of isolation,
	 * create a virtual PBA.
	 */
	pba->guest_phys_addr = table->guest_phys_addr + table->size;

	pdev->msix.entries = entries;
	pdev->msix.nr_entries = nr_entries;

	return 0;

out_free:
	free(entries);

	return ret;
}

static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
{
	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
	pdev->msi.entries = calloc(pdev->msi.nr_entries,
				   sizeof(struct vfio_pci_msi_entry));
	if (!pdev->msi.entries)
		return -ENOMEM;

	return 0;
}

static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
				  size_t nr)
{
	int ret;
	u32 bar;
	size_t map_size;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region;

	if (nr >= vdev->info.num_regions)
		return 0;

	region = &vdev->regions[nr];
	bar = pdev->hdr.bar[nr];

	region->vdev = vdev;
	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);

	ret = vfio_pci_get_region_info(vdev, nr, &region->info);
	if (ret)
		return ret;

	/* Ignore invalid or unimplemented regions */
	if (!region->info.size)
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		/* Trap and emulate MSI-X table */
		if (nr == pdev->msix_table.bar) {
			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
			return 0;
		} else if (nr == pdev->msix_pba.bar) {
			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
			return 0;
		}
	}

	if (region->is_ioport) {
		region->port_base = pci_get_io_port_block(region->info.size);
	} else {
		/* Grab some MMIO space in the guest */
		map_size = ALIGN(region->info.size, PAGE_SIZE);
		region->guest_phys_addr = pci_get_mmio_block(map_size);
	}

	return 0;
}

static int vfio_pci_configure_dev_regions(struct kvm *kvm,
					  struct vfio_device *vdev)
{
	int ret;
	u32 bar;
	size_t i;
	bool is_64bit = false;
	struct vfio_pci_device *pdev = &vdev->pci;

	ret = vfio_pci_parse_cfg_space(vdev);
	if (ret)
		return ret;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		ret = vfio_pci_create_msix_table(kvm, vdev);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		ret = vfio_pci_create_msi_cap(kvm, pdev);
		if (ret)
			return ret;
	}

	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		/* Ignore top half of 64-bit BAR */
		if (is_64bit) {
			is_64bit = false;
			continue;
		}

		ret = vfio_pci_configure_bar(kvm, vdev, i);
		if (ret)
			return ret;

		bar = pdev->hdr.bar[i];
		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
	}

	/* We've configured the BARs, fake up a Configuration Space */
	ret = vfio_pci_fixup_cfg_space(vdev);
	if (ret)
		return ret;

	return pci__register_bar_regions(kvm, &pdev->hdr, vfio_pci_bar_activate,
					 vfio_pci_bar_deactivate, vdev);
}

/*
 * Attempt to update the FD limit, if opening an eventfd for each IRQ vector
 * would hit the limit, which is likely to happen when a device uses 2048 MSIs.
 */
static int vfio_pci_reserve_irq_fds(size_t num)
{
	/*
	 * I counted around 27 fds under normal load. Let's add 100 for good
	 * measure.
	 */
	static size_t needed = 128;
	struct rlimit fd_limit, new_limit;

	needed += num;

	if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
		perror("getrlimit(RLIMIT_NOFILE)");
		return 0;
	}

	if (fd_limit.rlim_cur >= needed)
		return 0;

	new_limit.rlim_cur = needed;

	if (fd_limit.rlim_max < needed)
		/* Try to bump hard limit (root only) */
		new_limit.rlim_max = needed;
	else
		new_limit.rlim_max = fd_limit.rlim_max;

	if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
		perror("setrlimit(RLIMIT_NOFILE)");
		pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
			   (size_t)(needed - fd_limit.rlim_cur));
	}

	return 0;
}

static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
			      struct vfio_pci_msi_common *msis)
{
	int ret;
	size_t i;
	int *eventfds;
	size_t irq_set_size;
	struct vfio_pci_msi_entry *entry;
	size_t nr_entries = msis->nr_entries;

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
	if (ret || msis->info.count == 0) {
		vfio_dev_err(vdev, "no MSI reported by VFIO");
		return -ENODEV;
	}

	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
		return -EINVAL;
	}

	if (msis->info.count != nr_entries) {
		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
		return -EINVAL;
	}

	mutex_init(&msis->mutex);

	vfio_pci_reserve_irq_fds(nr_entries);

	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
	msis->irq_set = malloc(irq_set_size);
	if (!msis->irq_set)
		return -ENOMEM;

	*msis->irq_set = (struct vfio_irq_set) {
		.argsz = irq_set_size,
		.flags = VFIO_IRQ_SET_DATA_EVENTFD |
			 VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = msis->info.index,
		.start = 0,
		.count = nr_entries,
	};

	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

	for (i = 0; i < nr_entries; i++) {
		entry = &msis->entries[i];
		entry->gsi = -1;
		entry->eventfd = -1;
		msi_set_masked(entry->virt_state, true);
		msi_set_masked(entry->phys_state, true);
		eventfds[i] = -1;
	}

	return 0;
}

static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->intx_gsi;
	struct vfio_irq_set irq_set = {
		.argsz = sizeof(irq_set),
		.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = VFIO_PCI_INTX_IRQ_INDEX,
	};

	if (pdev->intx_fd == -1)
		return;

	pr_debug("user requested MSI, disabling INTx %d", gsi);

	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	irq__del_irqfd(kvm, gsi, pdev->intx_fd);

	close(pdev->intx_fd);
	close(pdev->unmask_fd);
	pdev->intx_fd = -1;
}

static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	int trigger_fd, unmask_fd;
	union vfio_irq_eventfd trigger;
	union vfio_irq_eventfd unmask;
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->intx_gsi;

	if (pdev->intx_fd != -1)
		return 0;

	/*
	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
	 * signals an interrupt from host to guest, and unmask_fd signals the
	 * deassertion of the line from guest to host.
	 */
	trigger_fd = eventfd(0, 0);
	if (trigger_fd < 0) {
		vfio_dev_err(vdev, "failed to create trigger eventfd");
		return trigger_fd;
	}

	unmask_fd = eventfd(0, 0);
	if (unmask_fd < 0) {
		vfio_dev_err(vdev, "failed to create unmask eventfd");
		close(trigger_fd);
		return unmask_fd;
	}

	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
	if (ret)
		goto err_close;

	trigger.irq = (struct vfio_irq_set) {
		.argsz = sizeof(trigger),
		.flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = VFIO_PCI_INTX_IRQ_INDEX,
		.start = 0,
		.count = 1,
	};
	set_vfio_irq_eventd_payload(&trigger, trigger_fd);

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
		goto err_delete_line;
	}

	unmask.irq = (struct vfio_irq_set) {
		.argsz = sizeof(unmask),
		.flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
		.index = VFIO_PCI_INTX_IRQ_INDEX,
		.start = 0,
		.count = 1,
	};
	set_vfio_irq_eventd_payload(&unmask, unmask_fd);

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup unmask IRQ");
		goto err_remove_event;
	}

	pdev->intx_fd = trigger_fd;
	pdev->unmask_fd = unmask_fd;

	return 0;

err_remove_event:
	/* Remove trigger event */
	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	trigger.irq.count = 0;
	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);

err_delete_line:
	irq__del_irqfd(kvm, gsi, trigger_fd);

err_close:
	close(trigger_fd);
	close(unmask_fd);
	return ret;
}

static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_irq_info irq_info = {
		.argsz = sizeof(irq_info),
		.index = VFIO_PCI_INTX_IRQ_INDEX,
	};

	vfio_pci_reserve_irq_fds(2);

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
	if (ret || irq_info.count == 0) {
		vfio_dev_err(vdev, "no INTx reported by VFIO");
		return -ENODEV;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not eventfd capable");
		return -EINVAL;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
		return -EINVAL;
	}

	/* Guest is going to overwrite our irq_line... */
	pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;

	pdev->intx_fd = -1;

	return 0;
}

static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret = 0;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		pdev->msix.info = (struct vfio_irq_info) {
			.argsz = sizeof(pdev->msix.info),
			.index = VFIO_PCI_MSIX_IRQ_INDEX,
		};
		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		pdev->msi.info = (struct vfio_irq_info) {
			.argsz = sizeof(pdev->msi.info),
			.index = VFIO_PCI_MSI_IRQ_INDEX,
		};
		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
		pci__assign_irq(&vdev->pci.hdr);

		ret = vfio_pci_init_intx(kvm, vdev);
		if (ret)
			return ret;

		ret = vfio_pci_enable_intx(kvm, vdev);
	}

	return ret;
}

int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;

	ret = vfio_pci_configure_dev_regions(kvm, vdev);
	if (ret) {
		vfio_dev_err(vdev, "failed to configure regions");
		return ret;
	}

	vdev->dev_hdr = (struct device_header) {
		.bus_type = DEVICE_BUS_PCI,
		.data = &vdev->pci.hdr,
	};

	ret = device__register(&vdev->dev_hdr);
	if (ret) {
		vfio_dev_err(vdev, "failed to register VFIO device");
		return ret;
	}

	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
	if (ret) {
		vfio_dev_err(vdev, "failed to configure IRQs");
		return ret;
	}

	return 0;
}

void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
{
	size_t i;
	struct vfio_pci_device *pdev = &vdev->pci;

	for (i = 0; i < vdev->info.num_regions; i++)
		vfio_unmap_region(kvm, &vdev->regions[i]);

	device__unregister(&vdev->dev_hdr);

	free(pdev->msix.irq_set);
	free(pdev->msix.entries);
	free(pdev->msi.irq_set);
	free(pdev->msi.entries);
}