#include "kvm/irq.h"
#include "kvm/kvm.h"
#include "kvm/kvm-cpu.h"
#include "kvm/vfio.h"

#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <sys/resource.h>
#include <sys/time.h>

/* Wrapper around UAPI vfio_irq_set */
struct vfio_irq_eventfd {
	struct vfio_irq_set	irq;
	int			fd;
};

#define msi_is_enabled(state)	((state) & VFIO_PCI_MSI_STATE_ENABLED)
#define msi_is_masked(state)	((state) & VFIO_PCI_MSI_STATE_MASKED)
#define msi_is_empty(state)	((state) & VFIO_PCI_MSI_STATE_EMPTY)

#define msi_update_state(state, val, bit)				\
	(state) = (val) ? (state) | bit : (state) & ~bit;
#define msi_set_enabled(state, val) \
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
#define msi_set_masked(state, val) \
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
#define msi_set_empty(state, val) \
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)

static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);

static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
				bool msix)
{
	size_t i;
	int ret = 0;
	int *eventfds;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	struct vfio_irq_eventfd single = {
		.irq = {
			.argsz	= sizeof(single),
			.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
				  VFIO_IRQ_SET_ACTION_TRIGGER,
			.index	= msis->info.index,
			.count	= 1,
		},
	};

	if (!msi_is_enabled(msis->virt_state))
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		/*
		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
		 * time. Since INTx has to be enabled from the start (we don't
		 * have a reliable way to know when the guest starts using it),
		 * disable it now.
		 */
		vfio_pci_disable_intx(kvm, vdev);

	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

	/*
	 * Initial registration of the full range. This enables the physical
	 * MSI/MSI-X capability, which might have desired side effects. For
	 * instance when assigning virtio legacy devices, enabling the MSI
	 * capability modifies the config space layout!
	 *
	 * As an optimization, only update MSIs when the guest unmasks the
	 * capability. This greatly reduces the initialization time for a Linux
	 * guest with 2048+ MSIs. A Linux guest starts by enabling the MSI-X cap
	 * masked, then fills individual vectors, then unmasks the whole
	 * function. So we only do one VFIO ioctl when enabling for the first
	 * time, and then one when unmasking.
	 *
	 * phys_state is empty when it is enabled but no vector has been
	 * registered via SET_IRQS yet.
	 */
	if (!msi_is_enabled(msis->phys_state) ||
	    (!msi_is_masked(msis->virt_state) &&
	     msi_is_empty(msis->phys_state))) {
		bool empty = true;

		for (i = 0; i < msis->nr_entries; i++) {
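			/* Vectors without a GSI yet get -1, i.e. no eventfd to trigger */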
			eventfds[i] = msis->entries[i].gsi >= 0 ?
				      msis->entries[i].eventfd : -1;

			if (eventfds[i] >= 0)
				empty = false;
		}

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(multi)");
			return ret;
		}

		msi_set_enabled(msis->phys_state, true);
		msi_set_empty(msis->phys_state, empty);

		return 0;
	}

	if (msi_is_masked(msis->virt_state)) {
		/* TODO: if phys_state is not empty nor masked, mask all vectors */
		return 0;
	}

	/* Update individual vectors to avoid breaking those in use */
	for (i = 0; i < msis->nr_entries; i++) {
		struct vfio_pci_msi_entry *entry = &msis->entries[i];
		int fd = entry->gsi >= 0 ? entry->eventfd : -1;

		if (fd == eventfds[i])
			continue;

		single.irq.start = i;
		single.fd = fd;

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(single)");
			break;
		}

		eventfds[i] = fd;

		if (msi_is_empty(msis->phys_state) && fd >= 0)
			msi_set_empty(msis->phys_state, false);
	}

	return ret;
}

static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
				 bool msix)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	struct vfio_irq_set irq_set = {
		.argsz	= sizeof(irq_set),
		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= msis->info.index,
		.start	= 0,
		.count	= 0,
	};

	if (!msi_is_enabled(msis->phys_state))
		return 0;

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	if (ret < 0) {
		perror("VFIO_DEVICE_SET_IRQS(NONE)");
		return ret;
	}

	msi_set_enabled(msis->phys_state, false);
	msi_set_empty(msis->phys_state, true);

	/*
	 * Disabling MSI or MSI-X may happen because the PCI driver detected an
	 * MSI interrupt failure and wants to roll back to INTx mode. Thus
	 * re-enable INTx here if the device supports it.
	 */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		ret = vfio_pci_enable_intx(kvm, vdev);

	return ret >= 0 ? 0 : ret;
}

static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
				     struct vfio_pci_msi_entry *entry)
{
	int ret;

	if (entry->eventfd < 0) {
		entry->eventfd = eventfd(0, 0);
		if (entry->eventfd < 0) {
			ret = -errno;
			vfio_dev_err(vdev, "cannot create eventfd");
			return ret;
		}
	}

	/* Allocate IRQ if necessary */
	if (entry->gsi < 0) {
		int ret = irq__add_msix_route(kvm, &entry->config.msg,
					      vdev->dev_hdr.dev_num << 3);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot create MSI-X route");
			return ret;
		}
		entry->gsi = ret;
	} else {
		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
	}

	/*
	 * MSI masking is unimplemented in VFIO, so we have to handle it by
	 * disabling/enabling the IRQ route instead. We do it on the KVM side
	 * rather than VFIO, because:
	 * - it is 8x faster
	 * - it decouples the masking logic from the capability state.
	 * - in the masked state, after removing the irqfd route, we could
	 *   easily plug the eventfd in a local handler, in order to serve
	 *   Pending Bit reads to the guest.
	 *
	 * So entry->phys_state is masked when there is no active irqfd route.
	 */
	if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
		return 0;

	if (msi_is_masked(entry->phys_state)) {
		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot setup irqfd");
			return ret;
		}
	} else {
		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
	}

	msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));

	return 0;
}

static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				     u32 len, u8 is_write, void *ptr)
{
	struct vfio_pci_device *pdev = ptr;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	u64 offset = addr - pba->guest_phys_addr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	if (is_write)
		return;

	/*
	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA
	 * is completely useless here. Note that Linux doesn't use PBA.
	 */
	if (pread(vdev->fd, data, len, pba->offset + offset) != (ssize_t)len)
		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
}

static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				       u32 len, u8 is_write, void *ptr)
{
	struct kvm *kvm = vcpu->kvm;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = ptr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	u64 offset = addr - pdev->msix_table.guest_phys_addr;

	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
	off_t field = offset % PCI_MSIX_ENTRY_SIZE;

	/*
	 * The PCI spec says that software must use aligned 4- or 8-byte
	 * accesses for the MSI-X tables.
	 */
	if ((len != 4 && len != 8) || addr & (len - 1)) {
		vfio_dev_warn(vdev, "invalid MSI-X table access");
		return;
	}

	entry = &pdev->msix.entries[vector];

	mutex_lock(&pdev->msix.mutex);

	if (!is_write) {
		memcpy(data, (void *)&entry->config + field, len);
		goto out_unlock;
	}

	memcpy((void *)&entry->config + field, data, len);

	/*
	 * Check if access touched the vector control register, which is at the
	 * end of the MSI-X entry.
	 */
	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
		goto out_unlock;

	msi_set_masked(entry->virt_state, entry->config.ctrl &
		       PCI_MSIX_ENTRY_CTRL_MASKBIT);

	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
		/* Not much we can do here. */
		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");

out_unlock:
	mutex_unlock(&pdev->msix.mutex);
}

static void vfio_pci_msix_cap_write(struct kvm *kvm,
				    struct vfio_device *vdev, u8 off,
				    void *data, int sz)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	off_t enable_pos = PCI_MSIX_FLAGS + 1;
	bool enable;
	u16 flags;

	off -= pdev->msix.pos;

	/* Check if access intersects with the MSI-X Enable bit */
	if (off > enable_pos || off + sz <= enable_pos)
		return;

	/* Read byte that contains the Enable bit */
	flags = *(u8 *)(data + enable_pos - off) << 8;

	mutex_lock(&pdev->msix.mutex);

	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
	enable = flags & PCI_MSIX_FLAGS_ENABLE;
	msi_set_enabled(pdev->msix.virt_state, enable);

	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");
	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot disable MSIX");

	mutex_unlock(&pdev->msix.mutex);
}

static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
				     u8 off, u8 *data, u32 sz)
{
	size_t i;
	u32 mask = 0;
	size_t mask_pos, start, limit;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
		return 0;

	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
		mask_pos = PCI_MSI_MASK_64;
	else
		mask_pos = PCI_MSI_MASK_32;

	if (off >= mask_pos + 4 || off + sz <= mask_pos)
		return 0;

	/* Set mask to current state */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		entry = &pdev->msi.entries[i];
		mask |= !!msi_is_masked(entry->virt_state) << i;
	}

	/* Update mask following the intersection of access and register */
	start = max_t(size_t, off, mask_pos);
	limit = min_t(size_t, off + sz, mask_pos + 4);

	memcpy((void *)&mask + start - mask_pos, data + start - off,
	       limit - start);

	/* Update states if necessary */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		bool masked = mask & (1 << i);

		entry = &pdev->msi.entries[i];
		if (masked != msi_is_masked(entry->virt_state)) {
			msi_set_masked(entry->virt_state, masked);
			vfio_pci_update_msi_entry(kvm, vdev, entry);
		}
	}

	return 1;
}

static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
				   u8 off, u8 *data, u32 sz)
{
	u8 ctrl;
	struct msi_msg msg;
	size_t i, nr_vectors;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	off -= pdev->msi.pos;

	mutex_lock(&pdev->msi.mutex);

	/* Check if the guest is trying to update mask bits */
	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
		goto out_unlock;

	/* Only modify routes when the guest pokes the enable bit */
	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
		goto out_unlock;

	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);

	msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE);

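	/* If the guest is disabling MSI, disable the physical capability too */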
	if (!msi_is_enabled(pdev->msi.virt_state)) {
		vfio_pci_disable_msis(kvm, vdev, false);
		goto out_unlock;
	}

	/* Create routes for the requested vectors */
	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);

	msg.address_lo = msi_cap_64->address_lo;
	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
		msg.address_hi = msi_cap_64->address_hi;
		msg.data = msi_cap_64->data;
	} else {
		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
		msg.address_hi = 0;
		msg.data = msi_cap_32->data;
	}

	for (i = 0; i < nr_vectors; i++) {
		entry = &pdev->msi.entries[i];
		entry->config.msg = msg;
		vfio_pci_update_msi_entry(kvm, vdev, entry);
	}

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, false))
		vfio_dev_err(vdev, "cannot enable MSI");

out_unlock:
	mutex_unlock(&pdev->msi.mutex);
}

static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
			      u8 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	char base[sz];

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	/* Dummy read in case of side-effects */
	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
			       u8 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	void *base = pci_hdr;

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
			      sz, offset);

	/* Handle MSI write now, since it might update the hardware capability */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);

	if (pread(vdev->fd, base + offset, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
{
	size_t size = 10;

	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
		size += 4;
	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
		size += 10;

	return size;
}

static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
{
	switch (cap_hdr->type) {
	case PCI_CAP_ID_MSIX:
		return PCI_CAP_MSIX_SIZEOF;
	case PCI_CAP_ID_MSI:
		return vfio_pci_msi_cap_size((void *)cap_hdr);
	default:
		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
		return 0;
	}
}

static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
			    struct pci_cap_hdr *cap, off_t pos)
{
	struct pci_cap_hdr *last;
	struct pci_device_header *hdr = &vdev->pci.hdr;

	cap->next = 0;

	if (!hdr->capabilities) {
		hdr->capabilities = pos;
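		/* First capability: advertise the list in the Status register */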
		hdr->status |= PCI_STATUS_CAP_LIST;
	} else {
		last = PCI_CAP(virt_hdr, hdr->capabilities);

		while (last->next)
			last = PCI_CAP(virt_hdr, last->next);

		last->next = pos;
	}

	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));

	return 0;
}

static int vfio_pci_parse_caps(struct vfio_device *vdev)
{
	int ret;
	size_t size;
	u8 pos, next;
	struct pci_cap_hdr *cap;
	u8 virt_hdr[PCI_DEV_CFG_SIZE];
	struct vfio_pci_device *pdev = &vdev->pci;

	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
		return 0;

	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);

	pos = pdev->hdr.capabilities & ~3;

	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
	pdev->hdr.capabilities = 0;

	for (; pos; pos = next) {
		if (pos >= PCI_DEV_CFG_SIZE) {
			vfio_dev_warn(vdev, "ignoring cap outside of config space");
			return -EINVAL;
		}

		cap = PCI_CAP(&pdev->hdr, pos);
		next = cap->next;

		switch (cap->type) {
		case PCI_CAP_ID_MSIX:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msix.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
			break;
		case PCI_CAP_ID_MSI:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msi.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
			break;
		}
	}

	/* Wipe remaining capabilities */
	pos = PCI_STD_HEADER_SIZEOF;
	size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF;
	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);

	return 0;
}

static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
{
	ssize_t sz = PCI_DEV_CFG_SIZE;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
		vfio_dev_err(vdev, "Config Space not found");
		return -ENODEV;
	}

	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	*info = (struct vfio_region_info) {
		.argsz = sizeof(*info),
		.index = VFIO_PCI_CONFIG_REGION_INDEX,
	};

	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
	if (!info->size) {
		vfio_dev_err(vdev, "Config Space has size zero?!");
		return -EINVAL;
	}

	/* Read standard headers and capabilities */
	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
		return -EIO;
	}

	/* Strip bit 7, which indicates a multifunction device */
	pdev->hdr.header_type &= 0x7f;

	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
		vfio_dev_err(vdev, "unsupported header type %u",
			     pdev->hdr.header_type);
		return -EOPNOTSUPP;
	}

	if (pdev->hdr.irq_pin)
		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;

	vfio_pci_parse_caps(vdev);

	return 0;
}

static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
{
	int i;
	ssize_t hdr_sz;
	struct msix_cap *msix;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;

	/* Initialise the BARs */
	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		u64 base;
		struct vfio_region *region = &vdev->regions[i];

		/* Construct a fake reg to match what we've mapped. */
		if (region->is_ioport) {
			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
				PCI_BASE_ADDRESS_SPACE_IO;
		} else {
			base = (region->guest_phys_addr &
				PCI_BASE_ADDRESS_MEM_MASK) |
				PCI_BASE_ADDRESS_SPACE_MEMORY;
		}

		pdev->hdr.bar[i] = base;

		if (!base)
			continue;

		pdev->hdr.bar_size[i] = region->info.size;
	}

	/* I really can't be bothered to support cardbus. */
	pdev->hdr.card_bus = 0;

	/*
	 * Nuke the expansion ROM for now. If we want to do this properly,
	 * we need to save its size somewhere and map it into the guest.
	 */
	pdev->hdr.exp_rom_bar = 0;

	/* Plumb in our fake MSI-X capability, if we have it. */
	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
	if (msix) {
		/* Add a shortcut to the PBA region for the MMIO handler */
		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
		pdev->msix_pba.offset = vdev->regions[pba_index].info.offset +
					(msix->pba_offset & PCI_MSIX_PBA_OFFSET);

		/* Tidy up the capability */
		msix->table_offset &= PCI_MSIX_TABLE_BIR;
		msix->pba_offset &= PCI_MSIX_PBA_BIR;
		if (pdev->msix_table.bar == pdev->msix_pba.bar)
			msix->pba_offset |= pdev->msix_table.size &
					    PCI_MSIX_PBA_OFFSET;
	}

	/* Install our fake Configuration Space */
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	hdr_sz = PCI_DEV_CFG_SIZE;
	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
			     hdr_sz);
		return -EIO;
	}

	/* Register callbacks for cfg accesses */
	pdev->hdr.cfg_ops = (struct pci_config_operations) {
		.read	= vfio_pci_cfg_read,
		.write	= vfio_pci_cfg_write,
	};

	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;

	return 0;
}

static int vfio_pci_create_msix_table(struct kvm *kvm,
				      struct vfio_pci_device *pdev)
{
	int ret;
	size_t i;
	size_t mmio_size;
	size_t nr_entries;
	struct vfio_pci_msi_entry *entries;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);

	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;

	/*
	 * KVM needs memory regions to be a multiple of PAGE_SIZE, and
	 * PAGE_SIZE aligned.
	 */
	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), PAGE_SIZE);

	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
	if (!entries)
		return -ENOMEM;

	for (i = 0; i < nr_entries; i++)
		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;

	/*
	 * To ease MSI-X cap configuration in case they share the same BAR,
	 * collapse table and pending array. The size of the BAR regions must be
	 * powers of two.
	 */
	mmio_size = roundup_pow_of_two(table->size + pba->size);
	table->guest_phys_addr = pci_get_io_space_block(mmio_size);
	if (!table->guest_phys_addr) {
		pr_err("cannot allocate IO space");
		ret = -ENOMEM;
		goto out_free;
	}
	pba->guest_phys_addr = table->guest_phys_addr + table->size;

	ret = kvm__register_mmio(kvm, table->guest_phys_addr, table->size,
				 false, vfio_pci_msix_table_access, pdev);
	if (ret < 0)
		goto out_free;

	/*
	 * We could map the physical PBA directly into the guest, but it's
	 * likely smaller than a page, and we can only hand full pages to the
	 * guest. Even though the PCI spec disallows sharing a page used for
	 * MSI-X with any other resource, it does allow the MSI-X table and the
	 * PBA to share the same page. For the sake of isolation, create a
	 * virtual PBA.
	 */
	ret = kvm__register_mmio(kvm, pba->guest_phys_addr, pba->size, false,
				 vfio_pci_msix_pba_access, pdev);
	if (ret < 0)
		goto out_free;

	pdev->msix.entries = entries;
	pdev->msix.nr_entries = nr_entries;

	return 0;

out_free:
	free(entries);

	return ret;
}

static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
{
	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
	pdev->msi.entries = calloc(pdev->msi.nr_entries,
				   sizeof(struct vfio_pci_msi_entry));
	if (!pdev->msi.entries)
		return -ENOMEM;

	return 0;
}

static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
				  size_t nr)
{
	int ret;
	u32 bar;
	size_t map_size;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region = &vdev->regions[nr];

	if (nr >= vdev->info.num_regions)
		return 0;

	bar = pdev->hdr.bar[nr];

	region->vdev = vdev;
	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);
	region->info = (struct vfio_region_info) {
		.argsz = sizeof(region->info),
		.index = nr,
	};

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &region->info);
	if (ret) {
		ret = -errno;
		vfio_dev_err(vdev, "cannot get info for BAR %zu", nr);
		return ret;
	}

	/* Ignore invalid or unimplemented regions */
	if (!region->info.size)
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		/* Trap and emulate MSI-X table */
		if (nr == pdev->msix_table.bar) {
			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
			return 0;
		} else if (nr == pdev->msix_pba.bar) {
			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
			return 0;
		}
	}

	if (!region->is_ioport) {
		/* Grab some MMIO space in the guest */
		map_size = ALIGN(region->info.size, PAGE_SIZE);
		region->guest_phys_addr = pci_get_io_space_block(map_size);
	}

	/* Map the BARs into the guest or setup a trap region. */
	ret = vfio_map_region(kvm, vdev, region);
	if (ret)
		return ret;

	return 0;
}

static int vfio_pci_configure_dev_regions(struct kvm *kvm,
					  struct vfio_device *vdev)
{
	int ret;
	u32 bar;
	size_t i;
	bool is_64bit = false;
	struct vfio_pci_device *pdev = &vdev->pci;

	ret = vfio_pci_parse_cfg_space(vdev);
	if (ret)
		return ret;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		ret = vfio_pci_create_msix_table(kvm, pdev);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		ret = vfio_pci_create_msi_cap(kvm, pdev);
		if (ret)
			return ret;
	}

	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		/* Ignore top half of 64-bit BAR */
		if (i % 2 && is_64bit)
			continue;

		ret = vfio_pci_configure_bar(kvm, vdev, i);
		if (ret)
			return ret;

		bar = pdev->hdr.bar[i];
		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
	}

	/* We've configured the BARs, fake up a Configuration Space */
	return vfio_pci_fixup_cfg_space(vdev);
}

/*
 * Attempt to update the FD limit if opening an eventfd for each IRQ vector
 * would hit it, which is likely to happen when a device uses 2048 MSIs.
 */
static int vfio_pci_reserve_irq_fds(size_t num)
{
	/*
	 * I counted around 27 fds under normal load. Let's add 100 for good
	 * measure.
	 */
	static size_t needed = 128;
	struct rlimit fd_limit, new_limit;

	needed += num;

	if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
		perror("getrlimit(RLIMIT_NOFILE)");
		return 0;
	}

	if (fd_limit.rlim_cur >= needed)
		return 0;

	new_limit.rlim_cur = needed;

	if (fd_limit.rlim_max < needed)
		/* Try to bump hard limit (root only) */
		new_limit.rlim_max = needed;
	else
		new_limit.rlim_max = fd_limit.rlim_max;

	if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
		perror("setrlimit(RLIMIT_NOFILE)");
		pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
			   (size_t)(needed - fd_limit.rlim_cur));
	}

	return 0;
}

static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
			      struct vfio_pci_msi_common *msis)
{
	int ret;
	size_t i;
	int *eventfds;
	size_t irq_set_size;
	struct vfio_pci_msi_entry *entry;
	size_t nr_entries = msis->nr_entries;

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
	if (ret || msis->info.count == 0) {
		vfio_dev_err(vdev, "no MSI reported by VFIO");
		return -ENODEV;
	}

	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
		return -EINVAL;
	}

	if (msis->info.count != nr_entries) {
		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
		return -EINVAL;
	}

	mutex_init(&msis->mutex);

	vfio_pci_reserve_irq_fds(nr_entries);

	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
	msis->irq_set = malloc(irq_set_size);
	if (!msis->irq_set)
		return -ENOMEM;

	*msis->irq_set = (struct vfio_irq_set) {
		.argsz	= irq_set_size,
		.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
			  VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= msis->info.index,
		.start	= 0,
		.count	= nr_entries,
	};

	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

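	/* Start with every vector masked, with no eventfd or GSI allocated yet */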
	for (i = 0; i < nr_entries; i++) {
		entry = &msis->entries[i];
		entry->gsi = -1;
		entry->eventfd = -1;
		msi_set_masked(entry->virt_state, true);
		msi_set_masked(entry->phys_state, true);
		eventfds[i] = -1;
	}

	return 0;
}

static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->intx_gsi;
	struct vfio_irq_set irq_set = {
		.argsz	= sizeof(irq_set),
		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= VFIO_PCI_INTX_IRQ_INDEX,
	};

	if (pdev->intx_fd == -1)
		return;

	pr_debug("user requested MSI, disabling INTx %d", gsi);

	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	irq__del_irqfd(kvm, gsi, pdev->intx_fd);

	close(pdev->intx_fd);
	close(pdev->unmask_fd);
	pdev->intx_fd = -1;
}

static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	int trigger_fd, unmask_fd;
	struct vfio_irq_eventfd trigger;
	struct vfio_irq_eventfd unmask;
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->intx_gsi;

	if (pdev->intx_fd != -1)
		return 0;

	/*
	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
	 * signals an interrupt from host to guest, and unmask_fd signals the
	 * deassertion of the line from guest to host.
	 */
	trigger_fd = eventfd(0, 0);
	if (trigger_fd < 0) {
		vfio_dev_err(vdev, "failed to create trigger eventfd");
		return trigger_fd;
	}

	unmask_fd = eventfd(0, 0);
	if (unmask_fd < 0) {
		vfio_dev_err(vdev, "failed to create unmask eventfd");
		close(trigger_fd);
		return unmask_fd;
	}

	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
	if (ret)
		goto err_close;

	trigger.irq = (struct vfio_irq_set) {
		.argsz	= sizeof(trigger),
		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= VFIO_PCI_INTX_IRQ_INDEX,
		.start	= 0,
		.count	= 1,
	};
	trigger.fd = trigger_fd;

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
		goto err_delete_line;
	}

	unmask.irq = (struct vfio_irq_set) {
		.argsz	= sizeof(unmask),
		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
		.index	= VFIO_PCI_INTX_IRQ_INDEX,
		.start	= 0,
		.count	= 1,
	};
	unmask.fd = unmask_fd;

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup unmask IRQ");
		goto err_remove_event;
	}

	pdev->intx_fd = trigger_fd;
	pdev->unmask_fd = unmask_fd;

	return 0;

err_remove_event:
	/* Remove trigger event */
	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	trigger.irq.count = 0;
	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);

err_delete_line:
	irq__del_irqfd(kvm, gsi, trigger_fd);

err_close:
	close(trigger_fd);
	close(unmask_fd);
	return ret;
}

static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_irq_info irq_info = {
		.argsz = sizeof(irq_info),
		.index = VFIO_PCI_INTX_IRQ_INDEX,
	};

	vfio_pci_reserve_irq_fds(2);
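	/* The two fds reserved above are the INTx trigger and unmask eventfds */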

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
	if (ret || irq_info.count == 0) {
		vfio_dev_err(vdev, "no INTx reported by VFIO");
		return -ENODEV;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not eventfd capable");
		return -EINVAL;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
		return -EINVAL;
	}

	/* Guest is going to overwrite our irq_line... */
	pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;

	pdev->intx_fd = -1;

	return 0;
}

static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret = 0;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		pdev->msix.info = (struct vfio_irq_info) {
			.argsz = sizeof(pdev->msix.info),
			.index = VFIO_PCI_MSIX_IRQ_INDEX,
		};
		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		pdev->msi.info = (struct vfio_irq_info) {
			.argsz = sizeof(pdev->msi.info),
			.index = VFIO_PCI_MSI_IRQ_INDEX,
		};
		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
		ret = vfio_pci_init_intx(kvm, vdev);
		if (ret)
			return ret;

		ret = vfio_pci_enable_intx(kvm, vdev);
	}

	return ret;
}

int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;

	ret = vfio_pci_configure_dev_regions(kvm, vdev);
	if (ret) {
		vfio_dev_err(vdev, "failed to configure regions");
		return ret;
	}

	vdev->dev_hdr = (struct device_header) {
		.bus_type	= DEVICE_BUS_PCI,
		.data		= &vdev->pci.hdr,
	};

	ret = device__register(&vdev->dev_hdr);
	if (ret) {
		vfio_dev_err(vdev, "failed to register VFIO device");
		return ret;
	}

	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
	if (ret) {
		vfio_dev_err(vdev, "failed to configure IRQs");
		return ret;
	}

	return 0;
}

void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
{
	size_t i;
	struct vfio_pci_device *pdev = &vdev->pci;

	for (i = 0; i < vdev->info.num_regions; i++)
		vfio_unmap_region(kvm, &vdev->regions[i]);

	device__unregister(&vdev->dev_hdr);

	free(pdev->msix.irq_set);
	free(pdev->msix.entries);
	free(pdev->msi.irq_set);
	free(pdev->msi.entries);
}