#include "kvm/irq.h"
#include "kvm/kvm.h"
#include "kvm/kvm-cpu.h"
#include "kvm/vfio.h"

#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <sys/resource.h>
#include <sys/time.h>

/* Wrapper around UAPI vfio_irq_set */
union vfio_irq_eventfd {
	struct vfio_irq_set	irq;
	u8			buffer[sizeof(struct vfio_irq_set) + sizeof(int)];
};

static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd)
{
	memcpy(&evfd->irq.data, &fd, sizeof(fd));
}

#define msi_is_enabled(state)		((state) & VFIO_PCI_MSI_STATE_ENABLED)
#define msi_is_masked(state)		((state) & VFIO_PCI_MSI_STATE_MASKED)
#define msi_is_empty(state)		((state) & VFIO_PCI_MSI_STATE_EMPTY)

#define msi_update_state(state, val, bit)				\
	(state) = (val) ? (state) | bit : (state) & ~bit;
#define msi_set_enabled(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
#define msi_set_masked(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
#define msi_set_empty(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)

static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);

static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
				bool msix)
{
	size_t i;
	int ret = 0;
	int *eventfds;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	union vfio_irq_eventfd single = {
		.irq = {
			.argsz	= sizeof(single),
			.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
				  VFIO_IRQ_SET_ACTION_TRIGGER,
			.index	= msis->info.index,
			.count	= 1,
		},
	};

	if (!msi_is_enabled(msis->virt_state))
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		/*
		 * PCI (and VFIO) forbids enabling INTx, MSI or MSI-X at the
		 * same time. Since INTx has to be enabled from the start (we
		 * don't have a reliable way to know when the guest starts
		 * using it), disable it now.
		 */
		vfio_pci_disable_intx(kvm, vdev);

	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

	/*
	 * Initial registration of the full range. This enables the physical
	 * MSI/MSI-X capability, which might have desired side effects. For
	 * instance when assigning virtio legacy devices, enabling the MSI
	 * capability modifies the config space layout!
	 *
	 * As an optimization, only update MSIs when the guest unmasks the
	 * capability. This greatly reduces the initialization time for a
	 * Linux guest with 2048+ MSIs. Such a guest starts by enabling the
	 * MSI-X cap masked, then fills individual vectors, then unmasks the
	 * whole function. So we only do one VFIO ioctl when enabling for the
	 * first time, and then one when unmasking.
	 *
	 * phys_state is empty when the capability is enabled but no vector
	 * has been registered via SET_IRQS yet.
	 */
	if (!msi_is_enabled(msis->phys_state) ||
	    (!msi_is_masked(msis->virt_state) &&
	     msi_is_empty(msis->phys_state))) {
		bool empty = true;

		for (i = 0; i < msis->nr_entries; i++) {
			eventfds[i] = msis->entries[i].gsi >= 0 ?
				      msis->entries[i].eventfd : -1;

			if (eventfds[i] >= 0)
				empty = false;
		}

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(multi)");
			return ret;
		}

		msi_set_enabled(msis->phys_state, true);
		msi_set_empty(msis->phys_state, empty);

		return 0;
	}

	if (msi_is_masked(msis->virt_state)) {
		/* TODO: if phys_state is not empty nor masked, mask all vectors */
		return 0;
	}

	/* Update individual vectors to avoid breaking those in use */
	for (i = 0; i < msis->nr_entries; i++) {
		struct vfio_pci_msi_entry *entry = &msis->entries[i];
		int fd = entry->gsi >= 0 ? entry->eventfd : -1;

		if (fd == eventfds[i])
			continue;

		single.irq.start = i;
		set_vfio_irq_eventd_payload(&single, fd);

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(single)");
			break;
		}

		eventfds[i] = fd;

		if (msi_is_empty(msis->phys_state) && fd >= 0)
			msi_set_empty(msis->phys_state, false);
	}

	return ret;
}
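
/*
 * Tear down the physical MSI/MSI-X capability with a DATA_NONE, count = 0
 * SET_IRQS ioctl, then fall back to INTx if the device supports it.
 */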
static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
				 bool msix)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	struct vfio_irq_set irq_set = {
		.argsz	= sizeof(irq_set),
		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= msis->info.index,
		.start	= 0,
		.count	= 0,
	};

	if (!msi_is_enabled(msis->phys_state))
		return 0;

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	if (ret < 0) {
		perror("VFIO_DEVICE_SET_IRQS(NONE)");
		return ret;
	}

	msi_set_enabled(msis->phys_state, false);
	msi_set_empty(msis->phys_state, true);

	/*
	 * MSI or MSI-X may be disabled because the PCI driver detected an MSI
	 * interrupt failure and wants to roll back to INTx mode. Re-enable
	 * INTx in that case, if the device supports it.
	 */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		ret = vfio_pci_enable_intx(kvm, vdev);

	return ret >= 0 ? 0 : ret;
}

static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
				     struct vfio_pci_msi_entry *entry)
{
	int ret;

	if (entry->eventfd < 0) {
		entry->eventfd = eventfd(0, 0);
		if (entry->eventfd < 0) {
			ret = -errno;
			vfio_dev_err(vdev, "cannot create eventfd");
			return ret;
		}
	}

	/* Allocate IRQ if necessary */
	if (entry->gsi < 0) {
		int ret = irq__add_msix_route(kvm, &entry->config.msg,
					      vdev->dev_hdr.dev_num << 3);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot create MSI-X route");
			return ret;
		}
		entry->gsi = ret;
	} else {
		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
	}

	/*
	 * MSI masking is unimplemented in VFIO, so we have to handle it by
	 * disabling/enabling the IRQ route instead. We do it on the KVM side
	 * rather than the VFIO side, because:
	 * - it is 8x faster,
	 * - it lets us decouple the masking logic from the capability state,
	 * - in the masked state, after removing the irqfd route, we could
	 *   easily plug the eventfd into a local handler, in order to serve
	 *   Pending Bit reads to the guest.
	 *
	 * So entry->phys_state is masked when there is no active irqfd route.
	 */
	if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
		return 0;

	if (msi_is_masked(entry->phys_state)) {
		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot setup irqfd");
			return ret;
		}
	} else {
		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
	}

	msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));

	return 0;
}
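
/*
 * MMIO handler for the MSI-X Pending Bit Array. Reads are forwarded to the
 * physical PBA through the VFIO file descriptor; writes are ignored.
 */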
static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				     u32 len, u8 is_write, void *ptr)
{
	struct vfio_pci_device *pdev = ptr;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	u64 offset = addr - pba->guest_phys_addr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	if (is_write)
		return;

	/*
	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the
	 * PBA is completely useless here. Note that Linux doesn't use the PBA.
	 */
	if (pread(vdev->fd, data, len, pba->offset + offset) != (ssize_t)len)
		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
}

static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				       u32 len, u8 is_write, void *ptr)
{
	struct kvm *kvm = vcpu->kvm;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = ptr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	u64 offset = addr - pdev->msix_table.guest_phys_addr;

	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
	off_t field = offset % PCI_MSIX_ENTRY_SIZE;

	/*
	 * The PCI spec says that software must use aligned 4- or 8-byte
	 * accesses for the MSI-X tables.
	 */
	if ((len != 4 && len != 8) || addr & (len - 1)) {
		vfio_dev_warn(vdev, "invalid MSI-X table access");
		return;
	}

	entry = &pdev->msix.entries[vector];

	mutex_lock(&pdev->msix.mutex);

	if (!is_write) {
		memcpy(data, (void *)&entry->config + field, len);
		goto out_unlock;
	}

	memcpy((void *)&entry->config + field, data, len);

	/*
	 * Check whether the access touched the vector control register, which
	 * is at the end of the MSI-X entry.
	 */
	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
		goto out_unlock;

	msi_set_masked(entry->virt_state, entry->config.ctrl &
		       PCI_MSIX_ENTRY_CTRL_MASKBIT);

	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
		/* Not much we can do here. */
		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");

out_unlock:
	mutex_unlock(&pdev->msix.mutex);
}
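
/*
 * Handle a guest write to the MSI-X capability in config space: track the
 * virtual Enable and Function Mask bits, and propagate them to the physical
 * capability via SET_IRQS.
 */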
static void vfio_pci_msix_cap_write(struct kvm *kvm,
				    struct vfio_device *vdev, u8 off,
				    void *data, int sz)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	off_t enable_pos = PCI_MSIX_FLAGS + 1;
	bool enable;
	u16 flags;

	off -= pdev->msix.pos;

	/* Check whether the access intersects with the MSI-X Enable bit */
	if (off > enable_pos || off + sz <= enable_pos)
		return;

	/* Read the byte that contains the Enable bit */
	flags = *(u8 *)(data + enable_pos - off) << 8;

	mutex_lock(&pdev->msix.mutex);

	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
	enable = flags & PCI_MSIX_FLAGS_ENABLE;
	msi_set_enabled(pdev->msix.virt_state, enable);

	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");
	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot disable MSIX");

	mutex_unlock(&pdev->msix.mutex);
}

static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
				     u8 off, u8 *data, u32 sz)
{
	size_t i;
	u32 mask = 0;
	size_t mask_pos, start, limit;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
		return 0;

	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
		mask_pos = PCI_MSI_MASK_64;
	else
		mask_pos = PCI_MSI_MASK_32;

	if (off >= mask_pos + 4 || off + sz <= mask_pos)
		return 0;

	/* Set mask to the current state */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		entry = &pdev->msi.entries[i];
		mask |= !!msi_is_masked(entry->virt_state) << i;
	}

	/* Update the mask over the intersection of the access and the register */
	start = max_t(size_t, off, mask_pos);
	limit = min_t(size_t, off + sz, mask_pos + 4);

	memcpy((void *)&mask + start - mask_pos, data + start - off,
	       limit - start);

	/* Update states if necessary */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		bool masked = mask & (1 << i);

		entry = &pdev->msi.entries[i];
		if (masked != msi_is_masked(entry->virt_state)) {
			msi_set_masked(entry->virt_state, masked);
			vfio_pci_update_msi_entry(kvm, vdev, entry);
		}
	}

	return 1;
}
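
/*
 * Handle a guest write to the MSI capability: update the per-vector mask
 * bits, then mirror the Enable bit and message address/data into the KVM MSI
 * routes before updating the physical capability.
 */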
static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
				   u8 off, u8 *data, u32 sz)
{
	u8 ctrl;
	struct msi_msg msg;
	size_t i, nr_vectors;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	off -= pdev->msi.pos;

	mutex_lock(&pdev->msi.mutex);

	/* Check whether the guest is trying to update mask bits */
	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
		goto out_unlock;

	/* Only modify routes when the guest pokes the enable bit */
	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
		goto out_unlock;

	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);

	msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE);

	if (!msi_is_enabled(pdev->msi.virt_state)) {
		vfio_pci_disable_msis(kvm, vdev, false);
		goto out_unlock;
	}

	/* Create routes for the requested vectors */
	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);

	msg.address_lo = msi_cap_64->address_lo;
	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
		msg.address_hi = msi_cap_64->address_hi;
		msg.data = msi_cap_64->data;
	} else {
		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
		msg.address_hi = 0;
		msg.data = msi_cap_32->data;
	}

	for (i = 0; i < nr_vectors; i++) {
		entry = &pdev->msi.entries[i];

		/*
		 * Set the MSI data value as required by the PCI local
		 * bus specifications, MSI capability, "Message Data".
		 */
		msg.data &= ~(nr_vectors - 1);
		msg.data |= i;

		entry->config.msg = msg;
		vfio_pci_update_msi_entry(kvm, vdev, entry);
	}

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, false))
		vfio_dev_err(vdev, "cannot enable MSI");

out_unlock:
	mutex_unlock(&pdev->msi.mutex);
}

static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
			      u8 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	char base[sz];

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	/* Dummy read in case of side-effects */
	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
			       u8 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	void *base = pci_hdr;

	if (offset == PCI_ROM_ADDRESS)
		return;

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "failed to write %d bytes to Configuration Space at 0x%x",
			      sz, offset);

	/* Handle MSI write now, since it might update the hardware capability */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);

	if (pread(vdev->fd, base + offset, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
{
	size_t size = 10;

	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
		size += 4;
	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
		size += 10;

	return size;
}

static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
{
	switch (cap_hdr->type) {
	case PCI_CAP_ID_MSIX:
		return PCI_CAP_MSIX_SIZEOF;
	case PCI_CAP_ID_MSI:
		return vfio_pci_msi_cap_size((void *)cap_hdr);
	default:
		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
		return 0;
	}
}
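
/*
 * Append a capability to the virtual capability list in virt_hdr, linking it
 * from the previous entry (or from the header's capabilities pointer when the
 * list is empty).
 */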
static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
			    struct pci_cap_hdr *cap, off_t pos)
{
	struct pci_cap_hdr *last;
	struct pci_device_header *hdr = &vdev->pci.hdr;

	cap->next = 0;

	if (!hdr->capabilities) {
		hdr->capabilities = pos;
		hdr->status |= PCI_STATUS_CAP_LIST;
	} else {
		last = PCI_CAP(virt_hdr, hdr->capabilities);

		while (last->next)
			last = PCI_CAP(virt_hdr, last->next);

		last->next = pos;
	}

	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));

	return 0;
}

static int vfio_pci_parse_caps(struct vfio_device *vdev)
{
	int ret;
	size_t size;
	u8 pos, next;
	struct pci_cap_hdr *cap;
	u8 virt_hdr[PCI_DEV_CFG_SIZE];
	struct vfio_pci_device *pdev = &vdev->pci;

	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
		return 0;

	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);

	pos = pdev->hdr.capabilities & ~3;

	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
	pdev->hdr.capabilities = 0;

	for (; pos; pos = next) {
		cap = PCI_CAP(&pdev->hdr, pos);
		next = cap->next;

		switch (cap->type) {
		case PCI_CAP_ID_MSIX:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msix.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
			break;
		case PCI_CAP_ID_MSI:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msi.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
			break;
		}
	}

	/* Wipe remaining capabilities */
	pos = PCI_STD_HEADER_SIZEOF;
	size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF;
	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);

	return 0;
}

static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
{
	ssize_t sz = PCI_DEV_CFG_SIZE;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
		vfio_dev_err(vdev, "Config Space not found");
		return -ENODEV;
	}

	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	*info = (struct vfio_region_info) {
		.argsz = sizeof(*info),
		.index = VFIO_PCI_CONFIG_REGION_INDEX,
	};

	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
	if (!info->size) {
		vfio_dev_err(vdev, "Config Space has size zero?!");
		return -EINVAL;
	}

	/* Read the standard header and capabilities */
	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
		return -EIO;
	}

	/* Strip bit 7, which indicates multifunction */
	pdev->hdr.header_type &= 0x7f;

	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
		vfio_dev_err(vdev, "unsupported header type %u",
			     pdev->hdr.header_type);
		return -EOPNOTSUPP;
	}

	if (pdev->hdr.irq_pin)
		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;

	vfio_pci_parse_caps(vdev);

	return 0;
}
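
/*
 * Rewrite the virtual Configuration Space to match what we expose to the
 * guest: fake BARs pointing at the regions we mapped, no expansion ROM or
 * CardBus pointer, and a tidied-up MSI-X capability. The fixed-up header is
 * then written back to the device's VFIO config region.
 */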
static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
{
	int i;
	u64 base;
	ssize_t hdr_sz;
	struct msix_cap *msix;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region;

	/* Initialise the BARs */
	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		if ((u32)i == vdev->info.num_regions)
			break;

		region = &vdev->regions[i];
		/* Construct a fake reg to match what we've mapped. */
		if (region->is_ioport) {
			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
				PCI_BASE_ADDRESS_SPACE_IO;
		} else {
			base = (region->guest_phys_addr &
				PCI_BASE_ADDRESS_MEM_MASK) |
				PCI_BASE_ADDRESS_SPACE_MEMORY;
		}

		pdev->hdr.bar[i] = base;

		if (!base)
			continue;

		pdev->hdr.bar_size[i] = region->info.size;
	}

	/* I really can't be bothered to support cardbus. */
	pdev->hdr.card_bus = 0;

	/*
	 * Nuke the expansion ROM for now. If we want to do this properly,
	 * we need to save its size somewhere and map it into the guest.
	 */
	pdev->hdr.exp_rom_bar = 0;

	/* Plumb in our fake MSI-X capability, if we have it. */
	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
	if (msix) {
		/* Add a shortcut to the PBA region for the MMIO handler */
		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
		pdev->msix_pba.offset = vdev->regions[pba_index].info.offset +
					(msix->pba_offset & PCI_MSIX_PBA_OFFSET);

		/* Tidy up the capability */
		msix->table_offset &= PCI_MSIX_TABLE_BIR;
		msix->pba_offset &= PCI_MSIX_PBA_BIR;
		if (pdev->msix_table.bar == pdev->msix_pba.bar)
			msix->pba_offset |= pdev->msix_table.size &
					    PCI_MSIX_PBA_OFFSET;
	}

	/* Install our fake Configuration Space */
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	hdr_sz = PCI_DEV_CFG_SIZE;
	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
			     hdr_sz);
		return -EIO;
	}

	/* Register callbacks for cfg accesses */
	pdev->hdr.cfg_ops = (struct pci_config_operations) {
		.read	= vfio_pci_cfg_read,
		.write	= vfio_pci_cfg_write,
	};

	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;

	return 0;
}

static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index,
				    struct vfio_region_info *info)
{
	int ret;

	*info = (struct vfio_region_info) {
		.argsz = sizeof(*info),
		.index = index,
	};

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
	if (ret) {
		ret = -errno;
		vfio_dev_err(vdev, "cannot get info for BAR %u", index);
		return ret;
	}

	if (info->size && !is_power_of_two(info->size)) {
		vfio_dev_err(vdev, "region is not power of two: 0x%llx",
			     info->size);
		return -EINVAL;
	}

	return 0;
}
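
/*
 * Reserve guest MMIO space for the MSI-X table and PBA, and register the
 * trap-and-emulate handlers for both. The physical table is never mapped into
 * the guest; accesses go through vfio_pci_msix_table_access() instead.
 */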
static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	size_t i;
	size_t map_size;
	size_t nr_entries;
	struct vfio_pci_msi_entry *entries;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
	struct vfio_region_info info;

	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;

	/*
	 * KVM needs memory regions to be multiples of and aligned on
	 * PAGE_SIZE.
	 */
	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), PAGE_SIZE);

	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
	if (!entries)
		return -ENOMEM;

	for (i = 0; i < nr_entries; i++)
		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;

	ret = vfio_pci_get_region_info(vdev, table->bar, &info);
	if (ret)
		goto out_free;
	if (!info.size) {
		ret = -EINVAL;
		goto out_free;
	}
	map_size = info.size;

	if (table->bar != pba->bar) {
		ret = vfio_pci_get_region_info(vdev, pba->bar, &info);
		if (ret)
			goto out_free;
		if (!info.size) {
			ret = -EINVAL;
			goto out_free;
		}
		map_size += info.size;
	}

	/*
	 * To ease MSI-X cap configuration in case they share the same BAR,
	 * collapse the table and pending array. The size of the BAR regions
	 * must be powers of two.
	 */
	map_size = ALIGN(map_size, PAGE_SIZE);
	table->guest_phys_addr = pci_get_mmio_block(map_size);
	if (!table->guest_phys_addr) {
		pr_err("cannot allocate MMIO space");
		ret = -ENOMEM;
		goto out_free;
	}
	pba->guest_phys_addr = table->guest_phys_addr + table->size;

	ret = kvm__register_mmio(kvm, table->guest_phys_addr, table->size,
				 false, vfio_pci_msix_table_access, pdev);
	if (ret < 0)
		goto out_free;

	/*
	 * We could map the physical PBA directly into the guest, but it's
	 * likely smaller than a page, and we can only hand full pages to the
	 * guest. Even though the PCI spec disallows sharing a page used for
	 * MSI-X with any other resource, it allows sharing the same page
	 * between the MSI-X table and the PBA. For the sake of isolation,
	 * create a virtual PBA.
	 */
	ret = kvm__register_mmio(kvm, pba->guest_phys_addr, pba->size, false,
				 vfio_pci_msix_pba_access, pdev);
	if (ret < 0)
		goto out_free;

	pdev->msix.entries = entries;
	pdev->msix.nr_entries = nr_entries;

	return 0;

out_free:
	free(entries);

	return ret;
}

static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
{
	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
	pdev->msi.entries = calloc(pdev->msi.nr_entries,
				   sizeof(struct vfio_pci_msi_entry));
	if (!pdev->msi.entries)
		return -ENOMEM;

	return 0;
}
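
/*
 * Set up one BAR region: query its info from VFIO, allocate guest MMIO space
 * for memory BARs, then map it into the guest or set up a trap region. BARs
 * holding the MSI-X table or PBA are skipped here, since they are handled by
 * the dedicated MMIO emulation above.
 */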
static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
				  size_t nr)
{
	int ret;
	u32 bar;
	size_t map_size;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region;

	if (nr >= vdev->info.num_regions)
		return 0;

	region = &vdev->regions[nr];
	bar = pdev->hdr.bar[nr];

	region->vdev = vdev;
	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);

	ret = vfio_pci_get_region_info(vdev, nr, &region->info);
	if (ret)
		return ret;

	/* Ignore invalid or unimplemented regions */
	if (!region->info.size)
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		/* Trap and emulate MSI-X table */
		if (nr == pdev->msix_table.bar) {
			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
			return 0;
		} else if (nr == pdev->msix_pba.bar) {
			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
			return 0;
		}
	}

	if (!region->is_ioport) {
		/* Grab some MMIO space in the guest */
		map_size = ALIGN(region->info.size, PAGE_SIZE);
		region->guest_phys_addr = pci_get_mmio_block(map_size);
	}

	/* Map the BARs into the guest or set up a trap region. */
	ret = vfio_map_region(kvm, vdev, region);
	if (ret)
		return ret;

	return 0;
}

static int vfio_pci_configure_dev_regions(struct kvm *kvm,
					  struct vfio_device *vdev)
{
	int ret;
	u32 bar;
	size_t i;
	bool is_64bit = false;
	struct vfio_pci_device *pdev = &vdev->pci;

	ret = vfio_pci_parse_cfg_space(vdev);
	if (ret)
		return ret;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		ret = vfio_pci_create_msix_table(kvm, vdev);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		ret = vfio_pci_create_msi_cap(kvm, pdev);
		if (ret)
			return ret;
	}

	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		/* Ignore the top half of a 64-bit BAR */
		if (is_64bit) {
			is_64bit = false;
			continue;
		}

		ret = vfio_pci_configure_bar(kvm, vdev, i);
		if (ret)
			return ret;

		bar = pdev->hdr.bar[i];
		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
	}

	/* We've configured the BARs, fake up a Configuration Space */
	return vfio_pci_fixup_cfg_space(vdev);
}

/*
 * Attempt to raise the FD limit if opening an eventfd for each IRQ vector
 * would hit it, which is likely to happen when a device uses 2048 MSIs.
 */
static int vfio_pci_reserve_irq_fds(size_t num)
{
	/*
	 * I counted around 27 fds under normal load. Let's add 100 for good
	 * measure.
	 */
	static size_t needed = 128;
	struct rlimit fd_limit, new_limit;

	needed += num;

	if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
		perror("getrlimit(RLIMIT_NOFILE)");
		return 0;
	}

	if (fd_limit.rlim_cur >= needed)
		return 0;

	new_limit.rlim_cur = needed;

	if (fd_limit.rlim_max < needed)
		/* Try to bump the hard limit (root only) */
		new_limit.rlim_max = needed;
	else
		new_limit.rlim_max = fd_limit.rlim_max;

	if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
		perror("setrlimit(RLIMIT_NOFILE)");
		pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
			   (size_t)(needed - fd_limit.rlim_cur));
	}

	return 0;
}
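
/*
 * Query the MSI/MSI-X IRQ info from VFIO, sanity-check it against the parsed
 * capability, and pre-allocate the vfio_irq_set buffer and per-vector state.
 * All vectors start out masked, with no GSI or eventfd assigned.
 */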
static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
			      struct vfio_pci_msi_common *msis)
{
	int ret;
	size_t i;
	int *eventfds;
	size_t irq_set_size;
	struct vfio_pci_msi_entry *entry;
	size_t nr_entries = msis->nr_entries;

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
	if (ret || msis->info.count == 0) {
		vfio_dev_err(vdev, "no MSI reported by VFIO");
		return -ENODEV;
	}

	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
		return -EINVAL;
	}

	if (msis->info.count != nr_entries) {
		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
		return -EINVAL;
	}

	mutex_init(&msis->mutex);

	vfio_pci_reserve_irq_fds(nr_entries);

	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
	msis->irq_set = malloc(irq_set_size);
	if (!msis->irq_set)
		return -ENOMEM;

	*msis->irq_set = (struct vfio_irq_set) {
		.argsz	= irq_set_size,
		.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
			  VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= msis->info.index,
		.start	= 0,
		.count	= nr_entries,
	};

	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

	for (i = 0; i < nr_entries; i++) {
		entry = &msis->entries[i];
		entry->gsi = -1;
		entry->eventfd = -1;
		msi_set_masked(entry->virt_state, true);
		msi_set_masked(entry->phys_state, true);
		eventfds[i] = -1;
	}

	return 0;
}

static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->intx_gsi;
	struct vfio_irq_set irq_set = {
		.argsz	= sizeof(irq_set),
		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= VFIO_PCI_INTX_IRQ_INDEX,
	};

	if (pdev->intx_fd == -1)
		return;

	pr_debug("user requested MSI, disabling INTx %d", gsi);

	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	irq__del_irqfd(kvm, gsi, pdev->intx_fd);

	close(pdev->intx_fd);
	close(pdev->unmask_fd);
	pdev->intx_fd = -1;
}
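
/*
 * Wire up level-triggered INTx: create the trigger and unmask eventfds,
 * register them with KVM through irq__add_irqfd(), and hand both to VFIO via
 * SET_IRQS (TRIGGER and UNMASK actions).
 */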
static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	int trigger_fd, unmask_fd;
	union vfio_irq_eventfd trigger;
	union vfio_irq_eventfd unmask;
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->intx_gsi;

	if (pdev->intx_fd != -1)
		return 0;

	/*
	 * The PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
	 * signals an interrupt from host to guest, and unmask_fd signals the
	 * deassertion of the line from guest to host.
	 */
	trigger_fd = eventfd(0, 0);
	if (trigger_fd < 0) {
		vfio_dev_err(vdev, "failed to create trigger eventfd");
		return trigger_fd;
	}

	unmask_fd = eventfd(0, 0);
	if (unmask_fd < 0) {
		vfio_dev_err(vdev, "failed to create unmask eventfd");
		close(trigger_fd);
		return unmask_fd;
	}

	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
	if (ret)
		goto err_close;

	trigger.irq = (struct vfio_irq_set) {
		.argsz	= sizeof(trigger),
		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= VFIO_PCI_INTX_IRQ_INDEX,
		.start	= 0,
		.count	= 1,
	};
	set_vfio_irq_eventd_payload(&trigger, trigger_fd);

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
		goto err_delete_line;
	}

	unmask.irq = (struct vfio_irq_set) {
		.argsz	= sizeof(unmask),
		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
		.index	= VFIO_PCI_INTX_IRQ_INDEX,
		.start	= 0,
		.count	= 1,
	};
	set_vfio_irq_eventd_payload(&unmask, unmask_fd);

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup unmask IRQ");
		goto err_remove_event;
	}

	pdev->intx_fd = trigger_fd;
	pdev->unmask_fd = unmask_fd;

	return 0;

err_remove_event:
	/* Remove the trigger event */
	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	trigger.irq.count = 0;
	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);

err_delete_line:
	irq__del_irqfd(kvm, gsi, trigger_fd);

err_close:
	close(trigger_fd);
	close(unmask_fd);
	return ret;
}

static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_irq_info irq_info = {
		.argsz = sizeof(irq_info),
		.index = VFIO_PCI_INTX_IRQ_INDEX,
	};

	vfio_pci_reserve_irq_fds(2);

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
	if (ret || irq_info.count == 0) {
		vfio_dev_err(vdev, "no INTx reported by VFIO");
		return -ENODEV;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not eventfd capable");
		return -EINVAL;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
		return -EINVAL;
	}

	/* The guest is going to overwrite our irq_line... */
	pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;

	pdev->intx_fd = -1;

	return 0;
}
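
/*
 * Probe and initialise every interrupt mode the device advertises (MSI-X,
 * MSI, INTx). INTx is enabled right away; MSI and MSI-X are only enabled
 * once the guest programs the corresponding capability.
 */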
static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret = 0;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		pdev->msix.info = (struct vfio_irq_info) {
			.argsz = sizeof(pdev->msix.info),
			.index = VFIO_PCI_MSIX_IRQ_INDEX,
		};
		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		pdev->msi.info = (struct vfio_irq_info) {
			.argsz = sizeof(pdev->msi.info),
			.index = VFIO_PCI_MSI_IRQ_INDEX,
		};
		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
		pci__assign_irq(&vdev->pci.hdr);

		ret = vfio_pci_init_intx(kvm, vdev);
		if (ret)
			return ret;

		ret = vfio_pci_enable_intx(kvm, vdev);
	}

	return ret;
}

int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;

	ret = vfio_pci_configure_dev_regions(kvm, vdev);
	if (ret) {
		vfio_dev_err(vdev, "failed to configure regions");
		return ret;
	}

	vdev->dev_hdr = (struct device_header) {
		.bus_type	= DEVICE_BUS_PCI,
		.data		= &vdev->pci.hdr,
	};

	ret = device__register(&vdev->dev_hdr);
	if (ret) {
		vfio_dev_err(vdev, "failed to register VFIO device");
		return ret;
	}

	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
	if (ret) {
		vfio_dev_err(vdev, "failed to configure IRQs");
		return ret;
	}

	return 0;
}

void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
{
	size_t i;
	struct vfio_pci_device *pdev = &vdev->pci;

	for (i = 0; i < vdev->info.num_regions; i++)
		vfio_unmap_region(kvm, &vdev->regions[i]);

	device__unregister(&vdev->dev_hdr);

	free(pdev->msix.irq_set);
	free(pdev->msix.entries);
	free(pdev->msi.irq_set);
	free(pdev->msi.entries);
}