16078a454SJean-Philippe Brucker #include "kvm/irq.h" 26078a454SJean-Philippe Brucker #include "kvm/kvm.h" 36078a454SJean-Philippe Brucker #include "kvm/kvm-cpu.h" 46078a454SJean-Philippe Brucker #include "kvm/vfio.h" 56078a454SJean-Philippe Brucker 6e1d0285cSAlexandru Elisei #include <assert.h> 7e1d0285cSAlexandru Elisei 86078a454SJean-Philippe Brucker #include <sys/ioctl.h> 96078a454SJean-Philippe Brucker #include <sys/eventfd.h> 10c9888d95SJean-Philippe Brucker #include <sys/resource.h> 11c9888d95SJean-Philippe Brucker #include <sys/time.h> 126078a454SJean-Philippe Brucker 1325c1dc6cSAlexandru Elisei /* Some distros don't have the define. */ 1425c1dc6cSAlexandru Elisei #ifndef PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1 1525c1dc6cSAlexandru Elisei #define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1 12 1625c1dc6cSAlexandru Elisei #endif 1725c1dc6cSAlexandru Elisei 186078a454SJean-Philippe Brucker /* Wrapper around UAPI vfio_irq_set */ 19a3704b91SAndre Przywara union vfio_irq_eventfd { 206078a454SJean-Philippe Brucker struct vfio_irq_set irq; 21a3704b91SAndre Przywara u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)]; 226078a454SJean-Philippe Brucker }; 236078a454SJean-Philippe Brucker 24a3704b91SAndre Przywara static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd) 25a3704b91SAndre Przywara { 26a3704b91SAndre Przywara memcpy(&evfd->irq.data, &fd, sizeof(fd)); 27a3704b91SAndre Przywara } 28a3704b91SAndre Przywara 29c9888d95SJean-Philippe Brucker #define msi_is_enabled(state) ((state) & VFIO_PCI_MSI_STATE_ENABLED) 30c9888d95SJean-Philippe Brucker #define msi_is_masked(state) ((state) & VFIO_PCI_MSI_STATE_MASKED) 31c9888d95SJean-Philippe Brucker #define msi_is_empty(state) ((state) & VFIO_PCI_MSI_STATE_EMPTY) 32c9888d95SJean-Philippe Brucker 33c9888d95SJean-Philippe Brucker #define msi_update_state(state, val, bit) \ 34c9888d95SJean-Philippe Brucker (state) = (val) ? (state) | bit : (state) & ~bit; 35c9888d95SJean-Philippe Brucker #define msi_set_enabled(state, val) \ 36c9888d95SJean-Philippe Brucker msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED) 37c9888d95SJean-Philippe Brucker #define msi_set_masked(state, val) \ 38c9888d95SJean-Philippe Brucker msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED) 39c9888d95SJean-Philippe Brucker #define msi_set_empty(state, val) \ 40c9888d95SJean-Philippe Brucker msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY) 41c9888d95SJean-Philippe Brucker 42c9888d95SJean-Philippe Brucker static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev); 437302327aSLeo Yan static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev); 44c9888d95SJean-Philippe Brucker 458dd28afeSJean-Philippe Brucker static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev, 468dd28afeSJean-Philippe Brucker bool msix) 47c9888d95SJean-Philippe Brucker { 48c9888d95SJean-Philippe Brucker size_t i; 49c9888d95SJean-Philippe Brucker int ret = 0; 50c9888d95SJean-Philippe Brucker int *eventfds; 51c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 528dd28afeSJean-Philippe Brucker struct vfio_pci_msi_common *msis = msix ? 
&pdev->msix : &pdev->msi; 53a3704b91SAndre Przywara union vfio_irq_eventfd single = { 54c9888d95SJean-Philippe Brucker .irq = { 55c9888d95SJean-Philippe Brucker .argsz = sizeof(single), 56c9888d95SJean-Philippe Brucker .flags = VFIO_IRQ_SET_DATA_EVENTFD | 57c9888d95SJean-Philippe Brucker VFIO_IRQ_SET_ACTION_TRIGGER, 58c9888d95SJean-Philippe Brucker .index = msis->info.index, 59c9888d95SJean-Philippe Brucker .count = 1, 60c9888d95SJean-Philippe Brucker }, 61c9888d95SJean-Philippe Brucker }; 62c9888d95SJean-Philippe Brucker 63c9888d95SJean-Philippe Brucker if (!msi_is_enabled(msis->virt_state)) 64c9888d95SJean-Philippe Brucker return 0; 65c9888d95SJean-Philippe Brucker 667302327aSLeo Yan if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) 67c9888d95SJean-Philippe Brucker /* 68c9888d95SJean-Philippe Brucker * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same 69c9888d95SJean-Philippe Brucker * time. Since INTx has to be enabled from the start (we don't 707302327aSLeo Yan * have a reliable way to know when the guest starts using it), 71c9888d95SJean-Philippe Brucker * disable it now. 72c9888d95SJean-Philippe Brucker */ 73c9888d95SJean-Philippe Brucker vfio_pci_disable_intx(kvm, vdev); 74c9888d95SJean-Philippe Brucker 75c9888d95SJean-Philippe Brucker eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set); 76c9888d95SJean-Philippe Brucker 77c9888d95SJean-Philippe Brucker /* 78c9888d95SJean-Philippe Brucker * Initial registration of the full range. This enables the physical 79c9888d95SJean-Philippe Brucker * MSI/MSI-X capability, which might have desired side effects. For 80c9888d95SJean-Philippe Brucker * instance when assigning virtio legacy devices, enabling the MSI 81c9888d95SJean-Philippe Brucker * capability modifies the config space layout! 82c9888d95SJean-Philippe Brucker * 83c9888d95SJean-Philippe Brucker * As an optimization, only update MSIs when guest unmasks the 84c9888d95SJean-Philippe Brucker * capability. This greatly reduces the initialization time for Linux 85c9888d95SJean-Philippe Brucker * guest with 2048+ MSIs. Linux guest starts by enabling the MSI-X cap 86c9888d95SJean-Philippe Brucker * masked, then fills individual vectors, then unmasks the whole 87c9888d95SJean-Philippe Brucker * function. So we only do one VFIO ioctl when enabling for the first 88c9888d95SJean-Philippe Brucker * time, and then one when unmasking. 89c9888d95SJean-Philippe Brucker * 90c9888d95SJean-Philippe Brucker * phys_state is empty when it is enabled but no vector has been 91c9888d95SJean-Philippe Brucker * registered via SET_IRQS yet. 92c9888d95SJean-Philippe Brucker */ 93c9888d95SJean-Philippe Brucker if (!msi_is_enabled(msis->phys_state) || 94c9888d95SJean-Philippe Brucker (!msi_is_masked(msis->virt_state) && 95c9888d95SJean-Philippe Brucker msi_is_empty(msis->phys_state))) { 96c9888d95SJean-Philippe Brucker bool empty = true; 97c9888d95SJean-Philippe Brucker 98c9888d95SJean-Philippe Brucker for (i = 0; i < msis->nr_entries; i++) { 99c9888d95SJean-Philippe Brucker eventfds[i] = msis->entries[i].gsi >= 0 ? 
100c9888d95SJean-Philippe Brucker msis->entries[i].eventfd : -1; 101c9888d95SJean-Philippe Brucker 102c9888d95SJean-Philippe Brucker if (eventfds[i] >= 0) 103c9888d95SJean-Philippe Brucker empty = false; 104c9888d95SJean-Philippe Brucker } 105c9888d95SJean-Philippe Brucker 106c9888d95SJean-Philippe Brucker ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set); 107c9888d95SJean-Philippe Brucker if (ret < 0) { 108c9888d95SJean-Philippe Brucker perror("VFIO_DEVICE_SET_IRQS(multi)"); 109c9888d95SJean-Philippe Brucker return ret; 110c9888d95SJean-Philippe Brucker } 111c9888d95SJean-Philippe Brucker 112c9888d95SJean-Philippe Brucker msi_set_enabled(msis->phys_state, true); 113c9888d95SJean-Philippe Brucker msi_set_empty(msis->phys_state, empty); 114c9888d95SJean-Philippe Brucker 115c9888d95SJean-Philippe Brucker return 0; 116c9888d95SJean-Philippe Brucker } 117c9888d95SJean-Philippe Brucker 118c9888d95SJean-Philippe Brucker if (msi_is_masked(msis->virt_state)) { 119c9888d95SJean-Philippe Brucker /* TODO: if phys_state is not empty nor masked, mask all vectors */ 120c9888d95SJean-Philippe Brucker return 0; 121c9888d95SJean-Philippe Brucker } 122c9888d95SJean-Philippe Brucker 123c9888d95SJean-Philippe Brucker /* Update individual vectors to avoid breaking those in use */ 124c9888d95SJean-Philippe Brucker for (i = 0; i < msis->nr_entries; i++) { 125c9888d95SJean-Philippe Brucker struct vfio_pci_msi_entry *entry = &msis->entries[i]; 126c9888d95SJean-Philippe Brucker int fd = entry->gsi >= 0 ? entry->eventfd : -1; 127c9888d95SJean-Philippe Brucker 128c9888d95SJean-Philippe Brucker if (fd == eventfds[i]) 129c9888d95SJean-Philippe Brucker continue; 130c9888d95SJean-Philippe Brucker 131c9888d95SJean-Philippe Brucker single.irq.start = i; 132a3704b91SAndre Przywara set_vfio_irq_eventd_payload(&single, fd); 133c9888d95SJean-Philippe Brucker 134c9888d95SJean-Philippe Brucker ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single); 135c9888d95SJean-Philippe Brucker if (ret < 0) { 136c9888d95SJean-Philippe Brucker perror("VFIO_DEVICE_SET_IRQS(single)"); 137c9888d95SJean-Philippe Brucker break; 138c9888d95SJean-Philippe Brucker } 139c9888d95SJean-Philippe Brucker 140c9888d95SJean-Philippe Brucker eventfds[i] = fd; 141c9888d95SJean-Philippe Brucker 142c9888d95SJean-Philippe Brucker if (msi_is_empty(msis->phys_state) && fd >= 0) 143c9888d95SJean-Philippe Brucker msi_set_empty(msis->phys_state, false); 144c9888d95SJean-Philippe Brucker } 145c9888d95SJean-Philippe Brucker 146c9888d95SJean-Philippe Brucker return ret; 147c9888d95SJean-Philippe Brucker } 148c9888d95SJean-Philippe Brucker 1498dd28afeSJean-Philippe Brucker static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev, 1508dd28afeSJean-Philippe Brucker bool msix) 151c9888d95SJean-Philippe Brucker { 152c9888d95SJean-Philippe Brucker int ret; 153c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 1548dd28afeSJean-Philippe Brucker struct vfio_pci_msi_common *msis = msix ? 
&pdev->msix : &pdev->msi; 155c9888d95SJean-Philippe Brucker struct vfio_irq_set irq_set = { 156c9888d95SJean-Philippe Brucker .argsz = sizeof(irq_set), 157c9888d95SJean-Philippe Brucker .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, 158c9888d95SJean-Philippe Brucker .index = msis->info.index, 159c9888d95SJean-Philippe Brucker .start = 0, 160c9888d95SJean-Philippe Brucker .count = 0, 161c9888d95SJean-Philippe Brucker }; 162c9888d95SJean-Philippe Brucker 163c9888d95SJean-Philippe Brucker if (!msi_is_enabled(msis->phys_state)) 164c9888d95SJean-Philippe Brucker return 0; 165c9888d95SJean-Philippe Brucker 166c9888d95SJean-Philippe Brucker ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); 167c9888d95SJean-Philippe Brucker if (ret < 0) { 168c9888d95SJean-Philippe Brucker perror("VFIO_DEVICE_SET_IRQS(NONE)"); 169c9888d95SJean-Philippe Brucker return ret; 170c9888d95SJean-Philippe Brucker } 171c9888d95SJean-Philippe Brucker 172c9888d95SJean-Philippe Brucker msi_set_enabled(msis->phys_state, false); 173c9888d95SJean-Philippe Brucker msi_set_empty(msis->phys_state, true); 174c9888d95SJean-Philippe Brucker 1757302327aSLeo Yan /* 1767302327aSLeo Yan * When MSI or MSIX is disabled, this might be called when 1777302327aSLeo Yan * PCI driver detects the MSI interrupt failure and wants to 1787302327aSLeo Yan * rollback to INTx mode. Thus enable INTx if the device 1797302327aSLeo Yan * supports INTx mode in this case. 1807302327aSLeo Yan */ 1817302327aSLeo Yan if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) 1827302327aSLeo Yan ret = vfio_pci_enable_intx(kvm, vdev); 1837302327aSLeo Yan 1847302327aSLeo Yan return ret >= 0 ? 0 : ret; 185c9888d95SJean-Philippe Brucker } 186c9888d95SJean-Philippe Brucker 187c9888d95SJean-Philippe Brucker static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev, 188c9888d95SJean-Philippe Brucker struct vfio_pci_msi_entry *entry) 189c9888d95SJean-Philippe Brucker { 190c9888d95SJean-Philippe Brucker int ret; 191c9888d95SJean-Philippe Brucker 192c9888d95SJean-Philippe Brucker if (entry->eventfd < 0) { 193c9888d95SJean-Philippe Brucker entry->eventfd = eventfd(0, 0); 194c9888d95SJean-Philippe Brucker if (entry->eventfd < 0) { 195c9888d95SJean-Philippe Brucker ret = -errno; 196c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "cannot create eventfd"); 197c9888d95SJean-Philippe Brucker return ret; 198c9888d95SJean-Philippe Brucker } 199c9888d95SJean-Philippe Brucker } 200c9888d95SJean-Philippe Brucker 201c9888d95SJean-Philippe Brucker /* Allocate IRQ if necessary */ 202c9888d95SJean-Philippe Brucker if (entry->gsi < 0) { 203c9888d95SJean-Philippe Brucker int ret = irq__add_msix_route(kvm, &entry->config.msg, 204c9888d95SJean-Philippe Brucker vdev->dev_hdr.dev_num << 3); 205c9888d95SJean-Philippe Brucker if (ret < 0) { 206c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "cannot create MSI-X route"); 207c9888d95SJean-Philippe Brucker return ret; 208c9888d95SJean-Philippe Brucker } 209c9888d95SJean-Philippe Brucker entry->gsi = ret; 210c9888d95SJean-Philippe Brucker } else { 211c9888d95SJean-Philippe Brucker irq__update_msix_route(kvm, entry->gsi, &entry->config.msg); 212c9888d95SJean-Philippe Brucker } 213c9888d95SJean-Philippe Brucker 214c9888d95SJean-Philippe Brucker /* 215c9888d95SJean-Philippe Brucker * MSI masking is unimplemented in VFIO, so we have to handle it by 216c9888d95SJean-Philippe Brucker * disabling/enabling IRQ route instead. 
We do it on the KVM side rather 217c9888d95SJean-Philippe Brucker * than VFIO, because: 218c9888d95SJean-Philippe Brucker * - it is 8x faster 219c9888d95SJean-Philippe Brucker * - it allows to decouple masking logic from capability state. 220c9888d95SJean-Philippe Brucker * - in masked state, after removing irqfd route, we could easily plug 221c9888d95SJean-Philippe Brucker * the eventfd in a local handler, in order to serve Pending Bit reads 222c9888d95SJean-Philippe Brucker * to the guest. 223c9888d95SJean-Philippe Brucker * 224c9888d95SJean-Philippe Brucker * So entry->phys_state is masked when there is no active irqfd route. 225c9888d95SJean-Philippe Brucker */ 226c9888d95SJean-Philippe Brucker if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state)) 227c9888d95SJean-Philippe Brucker return 0; 228c9888d95SJean-Philippe Brucker 229c9888d95SJean-Philippe Brucker if (msi_is_masked(entry->phys_state)) { 230c9888d95SJean-Philippe Brucker ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1); 231c9888d95SJean-Philippe Brucker if (ret < 0) { 232c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "cannot setup irqfd"); 233c9888d95SJean-Philippe Brucker return ret; 234c9888d95SJean-Philippe Brucker } 235c9888d95SJean-Philippe Brucker } else { 236c9888d95SJean-Philippe Brucker irq__del_irqfd(kvm, entry->gsi, entry->eventfd); 237c9888d95SJean-Philippe Brucker } 238c9888d95SJean-Philippe Brucker 239c9888d95SJean-Philippe Brucker msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state)); 240c9888d95SJean-Philippe Brucker 241c9888d95SJean-Philippe Brucker return 0; 242c9888d95SJean-Philippe Brucker } 243c9888d95SJean-Philippe Brucker 244c9888d95SJean-Philippe Brucker static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data, 245c9888d95SJean-Philippe Brucker u32 len, u8 is_write, void *ptr) 246c9888d95SJean-Philippe Brucker { 247c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = ptr; 248c9888d95SJean-Philippe Brucker struct vfio_pci_msix_pba *pba = &pdev->msix_pba; 249c9888d95SJean-Philippe Brucker u64 offset = addr - pba->guest_phys_addr; 250c9888d95SJean-Philippe Brucker struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci); 251c9888d95SJean-Philippe Brucker 252*b20d6e30SAlexandru Elisei if (offset >= pba->size) { 253*b20d6e30SAlexandru Elisei vfio_dev_err(vdev, "access outside of the MSIX PBA"); 254*b20d6e30SAlexandru Elisei return; 255*b20d6e30SAlexandru Elisei } 256*b20d6e30SAlexandru Elisei 257c9888d95SJean-Philippe Brucker if (is_write) 258c9888d95SJean-Philippe Brucker return; 259c9888d95SJean-Philippe Brucker 260c9888d95SJean-Philippe Brucker /* 261c9888d95SJean-Philippe Brucker * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA 262c9888d95SJean-Philippe Brucker * is completely useless here. Note that Linux doesn't use PBA. 
263c9888d95SJean-Philippe Brucker */ 2645f44d5d6SAlexandru Elisei if (pread(vdev->fd, data, len, pba->fd_offset + offset) != (ssize_t)len) 265c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "cannot access MSIX PBA\n"); 266c9888d95SJean-Philippe Brucker } 267c9888d95SJean-Philippe Brucker 268c9888d95SJean-Philippe Brucker static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data, 269c9888d95SJean-Philippe Brucker u32 len, u8 is_write, void *ptr) 270c9888d95SJean-Philippe Brucker { 271c9888d95SJean-Philippe Brucker struct kvm *kvm = vcpu->kvm; 272c9888d95SJean-Philippe Brucker struct vfio_pci_msi_entry *entry; 273c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = ptr; 274c9888d95SJean-Philippe Brucker struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci); 275c9888d95SJean-Philippe Brucker 276c9888d95SJean-Philippe Brucker u64 offset = addr - pdev->msix_table.guest_phys_addr; 277*b20d6e30SAlexandru Elisei if (offset >= pdev->msix_table.size) { 278*b20d6e30SAlexandru Elisei vfio_dev_err(vdev, "access outside of the MSI-X table"); 279*b20d6e30SAlexandru Elisei return; 280*b20d6e30SAlexandru Elisei } 281c9888d95SJean-Philippe Brucker 282c9888d95SJean-Philippe Brucker size_t vector = offset / PCI_MSIX_ENTRY_SIZE; 283c9888d95SJean-Philippe Brucker off_t field = offset % PCI_MSIX_ENTRY_SIZE; 284c9888d95SJean-Philippe Brucker 285c9888d95SJean-Philippe Brucker /* 286c9888d95SJean-Philippe Brucker * PCI spec says that software must use aligned 4 or 8 bytes accesses 287c9888d95SJean-Philippe Brucker * for the MSI-X tables. 288c9888d95SJean-Philippe Brucker */ 289c9888d95SJean-Philippe Brucker if ((len != 4 && len != 8) || addr & (len - 1)) { 290c9888d95SJean-Philippe Brucker vfio_dev_warn(vdev, "invalid MSI-X table access"); 291c9888d95SJean-Philippe Brucker return; 292c9888d95SJean-Philippe Brucker } 293c9888d95SJean-Philippe Brucker 294c9888d95SJean-Philippe Brucker entry = &pdev->msix.entries[vector]; 295c9888d95SJean-Philippe Brucker 296c9888d95SJean-Philippe Brucker mutex_lock(&pdev->msix.mutex); 297c9888d95SJean-Philippe Brucker 298c9888d95SJean-Philippe Brucker if (!is_write) { 299c9888d95SJean-Philippe Brucker memcpy(data, (void *)&entry->config + field, len); 300c9888d95SJean-Philippe Brucker goto out_unlock; 301c9888d95SJean-Philippe Brucker } 302c9888d95SJean-Philippe Brucker 303c9888d95SJean-Philippe Brucker memcpy((void *)&entry->config + field, data, len); 304c9888d95SJean-Philippe Brucker 305c9888d95SJean-Philippe Brucker /* 306c9888d95SJean-Philippe Brucker * Check if access touched the vector control register, which is at the 307c9888d95SJean-Philippe Brucker * end of the MSI-X entry. 308c9888d95SJean-Philippe Brucker */ 309c9888d95SJean-Philippe Brucker if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL) 310c9888d95SJean-Philippe Brucker goto out_unlock; 311c9888d95SJean-Philippe Brucker 312c9888d95SJean-Philippe Brucker msi_set_masked(entry->virt_state, entry->config.ctrl & 313c9888d95SJean-Philippe Brucker PCI_MSIX_ENTRY_CTRL_MASKBIT); 314c9888d95SJean-Philippe Brucker 315c9888d95SJean-Philippe Brucker if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0) 316c9888d95SJean-Philippe Brucker /* Not much we can do here. 
*/ 317c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector); 318c9888d95SJean-Philippe Brucker 319c9888d95SJean-Philippe Brucker /* Update the physical capability if necessary */ 3208dd28afeSJean-Philippe Brucker if (vfio_pci_enable_msis(kvm, vdev, true)) 321c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "cannot enable MSIX"); 322c9888d95SJean-Philippe Brucker 323c9888d95SJean-Philippe Brucker out_unlock: 324c9888d95SJean-Philippe Brucker mutex_unlock(&pdev->msix.mutex); 325c9888d95SJean-Philippe Brucker } 326c9888d95SJean-Philippe Brucker 327c9888d95SJean-Philippe Brucker static void vfio_pci_msix_cap_write(struct kvm *kvm, 328e69b7663SAlexandru Elisei struct vfio_device *vdev, u16 off, 329c9888d95SJean-Philippe Brucker void *data, int sz) 330c9888d95SJean-Philippe Brucker { 331c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 332c9888d95SJean-Philippe Brucker off_t enable_pos = PCI_MSIX_FLAGS + 1; 333c9888d95SJean-Philippe Brucker bool enable; 334c9888d95SJean-Philippe Brucker u16 flags; 335c9888d95SJean-Philippe Brucker 336c9888d95SJean-Philippe Brucker off -= pdev->msix.pos; 337c9888d95SJean-Philippe Brucker 338c9888d95SJean-Philippe Brucker /* Check if access intersects with the MSI-X Enable bit */ 339c9888d95SJean-Philippe Brucker if (off > enable_pos || off + sz <= enable_pos) 340c9888d95SJean-Philippe Brucker return; 341c9888d95SJean-Philippe Brucker 342c9888d95SJean-Philippe Brucker /* Read byte that contains the Enable bit */ 343c9888d95SJean-Philippe Brucker flags = *(u8 *)(data + enable_pos - off) << 8; 344c9888d95SJean-Philippe Brucker 345c9888d95SJean-Philippe Brucker mutex_lock(&pdev->msix.mutex); 346c9888d95SJean-Philippe Brucker 347c9888d95SJean-Philippe Brucker msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL); 348c9888d95SJean-Philippe Brucker enable = flags & PCI_MSIX_FLAGS_ENABLE; 349c9888d95SJean-Philippe Brucker msi_set_enabled(pdev->msix.virt_state, enable); 350c9888d95SJean-Philippe Brucker 3518dd28afeSJean-Philippe Brucker if (enable && vfio_pci_enable_msis(kvm, vdev, true)) 352c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "cannot enable MSIX"); 3538dd28afeSJean-Philippe Brucker else if (!enable && vfio_pci_disable_msis(kvm, vdev, true)) 354c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "cannot disable MSIX"); 355c9888d95SJean-Philippe Brucker 356c9888d95SJean-Philippe Brucker mutex_unlock(&pdev->msix.mutex); 357c9888d95SJean-Philippe Brucker } 358c9888d95SJean-Philippe Brucker 3598dd28afeSJean-Philippe Brucker static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev, 360e69b7663SAlexandru Elisei u16 off, u8 *data, u32 sz) 3618dd28afeSJean-Philippe Brucker { 3628dd28afeSJean-Philippe Brucker size_t i; 3638dd28afeSJean-Philippe Brucker u32 mask = 0; 3648dd28afeSJean-Philippe Brucker size_t mask_pos, start, limit; 3658dd28afeSJean-Philippe Brucker struct vfio_pci_msi_entry *entry; 3668dd28afeSJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 3678dd28afeSJean-Philippe Brucker struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos); 3688dd28afeSJean-Philippe Brucker 3698dd28afeSJean-Philippe Brucker if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT)) 3708dd28afeSJean-Philippe Brucker return 0; 3718dd28afeSJean-Philippe Brucker 3728dd28afeSJean-Philippe Brucker if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) 3738dd28afeSJean-Philippe Brucker mask_pos = PCI_MSI_MASK_64; 3748dd28afeSJean-Philippe Brucker else 3758dd28afeSJean-Philippe 
Brucker mask_pos = PCI_MSI_MASK_32; 3768dd28afeSJean-Philippe Brucker 3778dd28afeSJean-Philippe Brucker if (off >= mask_pos + 4 || off + sz <= mask_pos) 3788dd28afeSJean-Philippe Brucker return 0; 3798dd28afeSJean-Philippe Brucker 3808dd28afeSJean-Philippe Brucker /* Set mask to current state */ 3818dd28afeSJean-Philippe Brucker for (i = 0; i < pdev->msi.nr_entries; i++) { 3828dd28afeSJean-Philippe Brucker entry = &pdev->msi.entries[i]; 3838dd28afeSJean-Philippe Brucker mask |= !!msi_is_masked(entry->virt_state) << i; 3848dd28afeSJean-Philippe Brucker } 3858dd28afeSJean-Philippe Brucker 3868dd28afeSJean-Philippe Brucker /* Update mask following the intersection of access and register */ 3878dd28afeSJean-Philippe Brucker start = max_t(size_t, off, mask_pos); 3888dd28afeSJean-Philippe Brucker limit = min_t(size_t, off + sz, mask_pos + 4); 3898dd28afeSJean-Philippe Brucker 3908dd28afeSJean-Philippe Brucker memcpy((void *)&mask + start - mask_pos, data + start - off, 3918dd28afeSJean-Philippe Brucker limit - start); 3928dd28afeSJean-Philippe Brucker 3938dd28afeSJean-Philippe Brucker /* Update states if necessary */ 3948dd28afeSJean-Philippe Brucker for (i = 0; i < pdev->msi.nr_entries; i++) { 3958dd28afeSJean-Philippe Brucker bool masked = mask & (1 << i); 3968dd28afeSJean-Philippe Brucker 3978dd28afeSJean-Philippe Brucker entry = &pdev->msi.entries[i]; 3988dd28afeSJean-Philippe Brucker if (masked != msi_is_masked(entry->virt_state)) { 3998dd28afeSJean-Philippe Brucker msi_set_masked(entry->virt_state, masked); 4008dd28afeSJean-Philippe Brucker vfio_pci_update_msi_entry(kvm, vdev, entry); 4018dd28afeSJean-Philippe Brucker } 4028dd28afeSJean-Philippe Brucker } 4038dd28afeSJean-Philippe Brucker 4048dd28afeSJean-Philippe Brucker return 1; 4058dd28afeSJean-Philippe Brucker } 4068dd28afeSJean-Philippe Brucker 4078dd28afeSJean-Philippe Brucker static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev, 408e69b7663SAlexandru Elisei u16 off, u8 *data, u32 sz) 4098dd28afeSJean-Philippe Brucker { 4108dd28afeSJean-Philippe Brucker u8 ctrl; 4118dd28afeSJean-Philippe Brucker struct msi_msg msg; 4128dd28afeSJean-Philippe Brucker size_t i, nr_vectors; 4138dd28afeSJean-Philippe Brucker struct vfio_pci_msi_entry *entry; 4148dd28afeSJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 4158dd28afeSJean-Philippe Brucker struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos); 4168dd28afeSJean-Philippe Brucker 4178dd28afeSJean-Philippe Brucker off -= pdev->msi.pos; 4188dd28afeSJean-Philippe Brucker 4198dd28afeSJean-Philippe Brucker mutex_lock(&pdev->msi.mutex); 4208dd28afeSJean-Philippe Brucker 4218dd28afeSJean-Philippe Brucker /* Check if the guest is trying to update mask bits */ 4228dd28afeSJean-Philippe Brucker if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz)) 4238dd28afeSJean-Philippe Brucker goto out_unlock; 4248dd28afeSJean-Philippe Brucker 4258dd28afeSJean-Philippe Brucker /* Only modify routes when guest pokes the enable bit */ 4268dd28afeSJean-Philippe Brucker if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS) 4278dd28afeSJean-Philippe Brucker goto out_unlock; 4288dd28afeSJean-Philippe Brucker 4298dd28afeSJean-Philippe Brucker ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off); 4308dd28afeSJean-Philippe Brucker 4318dd28afeSJean-Philippe Brucker msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE); 4328dd28afeSJean-Philippe Brucker 4338dd28afeSJean-Philippe Brucker if (!msi_is_enabled(pdev->msi.virt_state)) { 4348dd28afeSJean-Philippe Brucker 
vfio_pci_disable_msis(kvm, vdev, false); 4358dd28afeSJean-Philippe Brucker goto out_unlock; 4368dd28afeSJean-Philippe Brucker } 4378dd28afeSJean-Philippe Brucker 4388dd28afeSJean-Philippe Brucker /* Create routes for the requested vectors */ 4398dd28afeSJean-Philippe Brucker nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4); 4408dd28afeSJean-Philippe Brucker 4418dd28afeSJean-Philippe Brucker msg.address_lo = msi_cap_64->address_lo; 4428dd28afeSJean-Philippe Brucker if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) { 4438dd28afeSJean-Philippe Brucker msg.address_hi = msi_cap_64->address_hi; 4448dd28afeSJean-Philippe Brucker msg.data = msi_cap_64->data; 4458dd28afeSJean-Philippe Brucker } else { 4468dd28afeSJean-Philippe Brucker struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64; 4478dd28afeSJean-Philippe Brucker msg.address_hi = 0; 4488dd28afeSJean-Philippe Brucker msg.data = msi_cap_32->data; 4498dd28afeSJean-Philippe Brucker } 4508dd28afeSJean-Philippe Brucker 4518dd28afeSJean-Philippe Brucker for (i = 0; i < nr_vectors; i++) { 4528dd28afeSJean-Philippe Brucker entry = &pdev->msi.entries[i]; 453e554aefdSLorenzo Pieralisi 454e554aefdSLorenzo Pieralisi /* 455e554aefdSLorenzo Pieralisi * Set the MSI data value as required by the PCI local 456e554aefdSLorenzo Pieralisi * bus specifications, MSI capability, "Message Data". 457e554aefdSLorenzo Pieralisi */ 458e554aefdSLorenzo Pieralisi msg.data &= ~(nr_vectors - 1); 459e554aefdSLorenzo Pieralisi msg.data |= i; 460e554aefdSLorenzo Pieralisi 4618dd28afeSJean-Philippe Brucker entry->config.msg = msg; 4628dd28afeSJean-Philippe Brucker vfio_pci_update_msi_entry(kvm, vdev, entry); 4638dd28afeSJean-Philippe Brucker } 4648dd28afeSJean-Philippe Brucker 4658dd28afeSJean-Philippe Brucker /* Update the physical capability if necessary */ 4668dd28afeSJean-Philippe Brucker if (vfio_pci_enable_msis(kvm, vdev, false)) 4678dd28afeSJean-Philippe Brucker vfio_dev_err(vdev, "cannot enable MSI"); 4688dd28afeSJean-Philippe Brucker 4698dd28afeSJean-Philippe Brucker out_unlock: 4708dd28afeSJean-Philippe Brucker mutex_unlock(&pdev->msi.mutex); 4718dd28afeSJean-Philippe Brucker } 4728dd28afeSJean-Philippe Brucker 4735a8e4f25SAlexandru Elisei static int vfio_pci_bar_activate(struct kvm *kvm, 4745a8e4f25SAlexandru Elisei struct pci_device_header *pci_hdr, 4755a8e4f25SAlexandru Elisei int bar_num, void *data) 4765a8e4f25SAlexandru Elisei { 4775a8e4f25SAlexandru Elisei struct vfio_device *vdev = data; 4785a8e4f25SAlexandru Elisei struct vfio_pci_device *pdev = &vdev->pci; 4795a8e4f25SAlexandru Elisei struct vfio_pci_msix_pba *pba = &pdev->msix_pba; 4805a8e4f25SAlexandru Elisei struct vfio_pci_msix_table *table = &pdev->msix_table; 4815a8e4f25SAlexandru Elisei struct vfio_region *region; 482465edc9dSAlexandru Elisei u32 bar_addr; 4835a8e4f25SAlexandru Elisei bool has_msix; 4845a8e4f25SAlexandru Elisei int ret; 4855a8e4f25SAlexandru Elisei 4865a8e4f25SAlexandru Elisei assert((u32)bar_num < vdev->info.num_regions); 4875a8e4f25SAlexandru Elisei 4885a8e4f25SAlexandru Elisei region = &vdev->regions[bar_num]; 4895a8e4f25SAlexandru Elisei has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX; 4905a8e4f25SAlexandru Elisei 491465edc9dSAlexandru Elisei bar_addr = pci__bar_address(pci_hdr, bar_num); 492465edc9dSAlexandru Elisei if (pci__bar_is_io(pci_hdr, bar_num)) 493465edc9dSAlexandru Elisei region->port_base = bar_addr; 494465edc9dSAlexandru Elisei else 495465edc9dSAlexandru Elisei region->guest_phys_addr = bar_addr; 496465edc9dSAlexandru Elisei 4975a8e4f25SAlexandru Elisei if (has_msix 
&& (u32)bar_num == table->bar) { 498465edc9dSAlexandru Elisei table->guest_phys_addr = region->guest_phys_addr; 4995a8e4f25SAlexandru Elisei ret = kvm__register_mmio(kvm, table->guest_phys_addr, 5005a8e4f25SAlexandru Elisei table->size, false, 5015a8e4f25SAlexandru Elisei vfio_pci_msix_table_access, pdev); 5025a8e4f25SAlexandru Elisei /* 5035a8e4f25SAlexandru Elisei * The MSIX table and the PBA structure can share the same BAR, 5045a8e4f25SAlexandru Elisei * but for convenience we register different regions for mmio 5055a8e4f25SAlexandru Elisei * emulation. We want to we update both if they share the same 5065a8e4f25SAlexandru Elisei * BAR. 5075a8e4f25SAlexandru Elisei */ 5085a8e4f25SAlexandru Elisei if (ret < 0 || table->bar != pba->bar) 5095a8e4f25SAlexandru Elisei goto out; 5105a8e4f25SAlexandru Elisei } 5115a8e4f25SAlexandru Elisei 5125a8e4f25SAlexandru Elisei if (has_msix && (u32)bar_num == pba->bar) { 513465edc9dSAlexandru Elisei if (pba->bar == table->bar) 514f93acc04SAlexandru Elisei pba->guest_phys_addr = table->guest_phys_addr + pba->bar_offset; 515465edc9dSAlexandru Elisei else 516465edc9dSAlexandru Elisei pba->guest_phys_addr = region->guest_phys_addr; 5175a8e4f25SAlexandru Elisei ret = kvm__register_mmio(kvm, pba->guest_phys_addr, 5185a8e4f25SAlexandru Elisei pba->size, false, 5195a8e4f25SAlexandru Elisei vfio_pci_msix_pba_access, pdev); 5205a8e4f25SAlexandru Elisei goto out; 5215a8e4f25SAlexandru Elisei } 5225a8e4f25SAlexandru Elisei 5235a8e4f25SAlexandru Elisei ret = vfio_map_region(kvm, vdev, region); 5245a8e4f25SAlexandru Elisei out: 5255a8e4f25SAlexandru Elisei return ret; 5265a8e4f25SAlexandru Elisei } 5275a8e4f25SAlexandru Elisei 5285a8e4f25SAlexandru Elisei static int vfio_pci_bar_deactivate(struct kvm *kvm, 5295a8e4f25SAlexandru Elisei struct pci_device_header *pci_hdr, 5305a8e4f25SAlexandru Elisei int bar_num, void *data) 5315a8e4f25SAlexandru Elisei { 5325a8e4f25SAlexandru Elisei struct vfio_device *vdev = data; 5335a8e4f25SAlexandru Elisei struct vfio_pci_device *pdev = &vdev->pci; 5345a8e4f25SAlexandru Elisei struct vfio_pci_msix_pba *pba = &pdev->msix_pba; 5355a8e4f25SAlexandru Elisei struct vfio_pci_msix_table *table = &pdev->msix_table; 5365a8e4f25SAlexandru Elisei struct vfio_region *region; 5375a8e4f25SAlexandru Elisei bool has_msix, success; 5385a8e4f25SAlexandru Elisei int ret; 5395a8e4f25SAlexandru Elisei 5405a8e4f25SAlexandru Elisei assert((u32)bar_num < vdev->info.num_regions); 5415a8e4f25SAlexandru Elisei 5425a8e4f25SAlexandru Elisei region = &vdev->regions[bar_num]; 5435a8e4f25SAlexandru Elisei has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX; 5445a8e4f25SAlexandru Elisei 5455a8e4f25SAlexandru Elisei if (has_msix && (u32)bar_num == table->bar) { 5465a8e4f25SAlexandru Elisei success = kvm__deregister_mmio(kvm, table->guest_phys_addr); 5475a8e4f25SAlexandru Elisei /* kvm__deregister_mmio fails when the region is not found. */ 5485a8e4f25SAlexandru Elisei ret = (success ? 0 : -ENOENT); 5495a8e4f25SAlexandru Elisei /* See vfio_pci_bar_activate(). */ 5505a8e4f25SAlexandru Elisei if (ret < 0 || table->bar!= pba->bar) 5515a8e4f25SAlexandru Elisei goto out; 5525a8e4f25SAlexandru Elisei } 5535a8e4f25SAlexandru Elisei 5545a8e4f25SAlexandru Elisei if (has_msix && (u32)bar_num == pba->bar) { 5555a8e4f25SAlexandru Elisei success = kvm__deregister_mmio(kvm, pba->guest_phys_addr); 5565a8e4f25SAlexandru Elisei ret = (success ? 
0 : -ENOENT); 5575a8e4f25SAlexandru Elisei goto out; 5585a8e4f25SAlexandru Elisei } 5595a8e4f25SAlexandru Elisei 5605a8e4f25SAlexandru Elisei vfio_unmap_region(kvm, region); 5615a8e4f25SAlexandru Elisei ret = 0; 5625a8e4f25SAlexandru Elisei 5635a8e4f25SAlexandru Elisei out: 5645a8e4f25SAlexandru Elisei return ret; 5655a8e4f25SAlexandru Elisei } 5665a8e4f25SAlexandru Elisei 5676078a454SJean-Philippe Brucker static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr, 568e69b7663SAlexandru Elisei u16 offset, void *data, int sz) 5696078a454SJean-Philippe Brucker { 5706078a454SJean-Philippe Brucker struct vfio_region_info *info; 5716078a454SJean-Philippe Brucker struct vfio_pci_device *pdev; 5726078a454SJean-Philippe Brucker struct vfio_device *vdev; 5736078a454SJean-Philippe Brucker char base[sz]; 5746078a454SJean-Philippe Brucker 5756078a454SJean-Philippe Brucker pdev = container_of(pci_hdr, struct vfio_pci_device, hdr); 5766078a454SJean-Philippe Brucker vdev = container_of(pdev, struct vfio_device, pci); 5776078a454SJean-Philippe Brucker info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info; 5786078a454SJean-Philippe Brucker 5796078a454SJean-Philippe Brucker /* Dummy read in case of side-effects */ 5806078a454SJean-Philippe Brucker if (pread(vdev->fd, base, sz, info->offset + offset) != sz) 5816078a454SJean-Philippe Brucker vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x", 5826078a454SJean-Philippe Brucker sz, offset); 5836078a454SJean-Philippe Brucker } 5846078a454SJean-Philippe Brucker 5856078a454SJean-Philippe Brucker static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr, 586e69b7663SAlexandru Elisei u16 offset, void *data, int sz) 5876078a454SJean-Philippe Brucker { 5886078a454SJean-Philippe Brucker struct vfio_region_info *info; 5896078a454SJean-Philippe Brucker struct vfio_pci_device *pdev; 5906078a454SJean-Philippe Brucker struct vfio_device *vdev; 591e1d0285cSAlexandru Elisei u32 tmp; 592e1d0285cSAlexandru Elisei 593e1d0285cSAlexandru Elisei /* Make sure a larger size will not overrun tmp on the stack. 
*/ 594e1d0285cSAlexandru Elisei assert(sz <= 4); 5956078a454SJean-Philippe Brucker 5965b7fef16SAlexandru Elisei if (offset == PCI_ROM_ADDRESS) 5975b7fef16SAlexandru Elisei return; 5985b7fef16SAlexandru Elisei 5996078a454SJean-Philippe Brucker pdev = container_of(pci_hdr, struct vfio_pci_device, hdr); 6006078a454SJean-Philippe Brucker vdev = container_of(pdev, struct vfio_device, pci); 6016078a454SJean-Philippe Brucker info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info; 6026078a454SJean-Philippe Brucker 6036078a454SJean-Philippe Brucker if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz) 6046078a454SJean-Philippe Brucker vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x", 6056078a454SJean-Philippe Brucker sz, offset); 6066078a454SJean-Philippe Brucker 607c9888d95SJean-Philippe Brucker /* Handle MSI write now, since it might update the hardware capability */ 608c9888d95SJean-Philippe Brucker if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) 609c9888d95SJean-Philippe Brucker vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz); 610c9888d95SJean-Philippe Brucker 6118dd28afeSJean-Philippe Brucker if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) 6128dd28afeSJean-Philippe Brucker vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz); 6138dd28afeSJean-Philippe Brucker 614e1d0285cSAlexandru Elisei if (pread(vdev->fd, &tmp, sz, info->offset + offset) != sz) 6156078a454SJean-Philippe Brucker vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x", 6166078a454SJean-Philippe Brucker sz, offset); 6176078a454SJean-Philippe Brucker } 6186078a454SJean-Philippe Brucker 6198dd28afeSJean-Philippe Brucker static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr) 6208dd28afeSJean-Philippe Brucker { 6218dd28afeSJean-Philippe Brucker size_t size = 10; 6228dd28afeSJean-Philippe Brucker 6238dd28afeSJean-Philippe Brucker if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT) 6248dd28afeSJean-Philippe Brucker size += 4; 6258dd28afeSJean-Philippe Brucker if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT) 6268dd28afeSJean-Philippe Brucker size += 10; 6278dd28afeSJean-Philippe Brucker 6288dd28afeSJean-Philippe Brucker return size; 6298dd28afeSJean-Philippe Brucker } 6308dd28afeSJean-Philippe Brucker 631c9888d95SJean-Philippe Brucker static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr) 632c9888d95SJean-Philippe Brucker { 633c9888d95SJean-Philippe Brucker switch (cap_hdr->type) { 634c9888d95SJean-Philippe Brucker case PCI_CAP_ID_MSIX: 635c9888d95SJean-Philippe Brucker return PCI_CAP_MSIX_SIZEOF; 6368dd28afeSJean-Philippe Brucker case PCI_CAP_ID_MSI: 6378dd28afeSJean-Philippe Brucker return vfio_pci_msi_cap_size((void *)cap_hdr); 63825c1dc6cSAlexandru Elisei case PCI_CAP_ID_EXP: 63925c1dc6cSAlexandru Elisei /* 64025c1dc6cSAlexandru Elisei * We don't emulate any of the link, slot and root complex 64125c1dc6cSAlexandru Elisei * properties, so ignore them. 
64225c1dc6cSAlexandru Elisei */ 64325c1dc6cSAlexandru Elisei return PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1; 644c9888d95SJean-Philippe Brucker default: 645c9888d95SJean-Philippe Brucker pr_err("unknown PCI capability 0x%x", cap_hdr->type); 646c9888d95SJean-Philippe Brucker return 0; 647c9888d95SJean-Philippe Brucker } 648c9888d95SJean-Philippe Brucker } 649c9888d95SJean-Philippe Brucker 650c9888d95SJean-Philippe Brucker static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr, 651c9888d95SJean-Philippe Brucker struct pci_cap_hdr *cap, off_t pos) 652c9888d95SJean-Philippe Brucker { 653c9888d95SJean-Philippe Brucker struct pci_cap_hdr *last; 654c9888d95SJean-Philippe Brucker struct pci_device_header *hdr = &vdev->pci.hdr; 655c9888d95SJean-Philippe Brucker 656c9888d95SJean-Philippe Brucker cap->next = 0; 657c9888d95SJean-Philippe Brucker 658c9888d95SJean-Philippe Brucker if (!hdr->capabilities) { 659c9888d95SJean-Philippe Brucker hdr->capabilities = pos; 660c9888d95SJean-Philippe Brucker hdr->status |= PCI_STATUS_CAP_LIST; 661c9888d95SJean-Philippe Brucker } else { 662c9888d95SJean-Philippe Brucker last = PCI_CAP(virt_hdr, hdr->capabilities); 663c9888d95SJean-Philippe Brucker 664c9888d95SJean-Philippe Brucker while (last->next) 665c9888d95SJean-Philippe Brucker last = PCI_CAP(virt_hdr, last->next); 666c9888d95SJean-Philippe Brucker 667c9888d95SJean-Philippe Brucker last->next = pos; 668c9888d95SJean-Philippe Brucker } 669c9888d95SJean-Philippe Brucker 670c9888d95SJean-Philippe Brucker memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap)); 671c9888d95SJean-Philippe Brucker 672c9888d95SJean-Philippe Brucker return 0; 673c9888d95SJean-Philippe Brucker } 674c9888d95SJean-Philippe Brucker 6756078a454SJean-Philippe Brucker static int vfio_pci_parse_caps(struct vfio_device *vdev) 6766078a454SJean-Philippe Brucker { 677c9888d95SJean-Philippe Brucker int ret; 678c9888d95SJean-Philippe Brucker size_t size; 679e69b7663SAlexandru Elisei u16 pos, next; 680c9888d95SJean-Philippe Brucker struct pci_cap_hdr *cap; 681e69b7663SAlexandru Elisei u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY]; 6826078a454SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 6836078a454SJean-Philippe Brucker 6846078a454SJean-Philippe Brucker if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST)) 6856078a454SJean-Philippe Brucker return 0; 6866078a454SJean-Philippe Brucker 687e69b7663SAlexandru Elisei memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY); 688c9888d95SJean-Philippe Brucker 689c9888d95SJean-Philippe Brucker pos = pdev->hdr.capabilities & ~3; 690c9888d95SJean-Philippe Brucker 6916078a454SJean-Philippe Brucker pdev->hdr.status &= ~PCI_STATUS_CAP_LIST; 6926078a454SJean-Philippe Brucker pdev->hdr.capabilities = 0; 6936078a454SJean-Philippe Brucker 694c9888d95SJean-Philippe Brucker for (; pos; pos = next) { 695c9888d95SJean-Philippe Brucker cap = PCI_CAP(&pdev->hdr, pos); 696c9888d95SJean-Philippe Brucker next = cap->next; 697c9888d95SJean-Philippe Brucker 698c9888d95SJean-Philippe Brucker switch (cap->type) { 699c9888d95SJean-Philippe Brucker case PCI_CAP_ID_MSIX: 700c9888d95SJean-Philippe Brucker ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos); 701c9888d95SJean-Philippe Brucker if (ret) 702c9888d95SJean-Philippe Brucker return ret; 703c9888d95SJean-Philippe Brucker 704c9888d95SJean-Philippe Brucker pdev->msix.pos = pos; 705c9888d95SJean-Philippe Brucker pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX; 706c9888d95SJean-Philippe Brucker break; 7078dd28afeSJean-Philippe Brucker case PCI_CAP_ID_MSI: 7088dd28afeSJean-Philippe Brucker ret = 
vfio_pci_add_cap(vdev, virt_hdr, cap, pos); 7098dd28afeSJean-Philippe Brucker if (ret) 7108dd28afeSJean-Philippe Brucker return ret; 7118dd28afeSJean-Philippe Brucker 7128dd28afeSJean-Philippe Brucker pdev->msi.pos = pos; 7138dd28afeSJean-Philippe Brucker pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI; 7148dd28afeSJean-Philippe Brucker break; 71525c1dc6cSAlexandru Elisei case PCI_CAP_ID_EXP: 71625c1dc6cSAlexandru Elisei if (!arch_has_pci_exp()) 71725c1dc6cSAlexandru Elisei continue; 71825c1dc6cSAlexandru Elisei ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos); 71925c1dc6cSAlexandru Elisei if (ret) 72025c1dc6cSAlexandru Elisei return ret; 72125c1dc6cSAlexandru Elisei break; 722c9888d95SJean-Philippe Brucker } 723c9888d95SJean-Philippe Brucker } 724c9888d95SJean-Philippe Brucker 725c9888d95SJean-Philippe Brucker /* Wipe remaining capabilities */ 726c9888d95SJean-Philippe Brucker pos = PCI_STD_HEADER_SIZEOF; 727e69b7663SAlexandru Elisei size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF; 728c9888d95SJean-Philippe Brucker memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size); 7296078a454SJean-Philippe Brucker 7306078a454SJean-Philippe Brucker return 0; 7316078a454SJean-Philippe Brucker } 7326078a454SJean-Philippe Brucker 7336078a454SJean-Philippe Brucker static int vfio_pci_parse_cfg_space(struct vfio_device *vdev) 7346078a454SJean-Philippe Brucker { 735e69b7663SAlexandru Elisei ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY; 7366078a454SJean-Philippe Brucker struct vfio_region_info *info; 7376078a454SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 7386078a454SJean-Philippe Brucker 7396078a454SJean-Philippe Brucker if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) { 7406078a454SJean-Philippe Brucker vfio_dev_err(vdev, "Config Space not found"); 7416078a454SJean-Philippe Brucker return -ENODEV; 7426078a454SJean-Philippe Brucker } 7436078a454SJean-Philippe Brucker 7446078a454SJean-Philippe Brucker info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info; 7456078a454SJean-Philippe Brucker *info = (struct vfio_region_info) { 7466078a454SJean-Philippe Brucker .argsz = sizeof(*info), 7476078a454SJean-Philippe Brucker .index = VFIO_PCI_CONFIG_REGION_INDEX, 7486078a454SJean-Philippe Brucker }; 7496078a454SJean-Philippe Brucker 7506078a454SJean-Philippe Brucker ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info); 7516078a454SJean-Philippe Brucker if (!info->size) { 7526078a454SJean-Philippe Brucker vfio_dev_err(vdev, "Config Space has size zero?!"); 7536078a454SJean-Philippe Brucker return -EINVAL; 7546078a454SJean-Philippe Brucker } 7556078a454SJean-Philippe Brucker 756c9888d95SJean-Philippe Brucker /* Read standard headers and capabilities */ 7576078a454SJean-Philippe Brucker if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) { 7586078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz); 7596078a454SJean-Philippe Brucker return -EIO; 7606078a454SJean-Philippe Brucker } 7616078a454SJean-Philippe Brucker 7626078a454SJean-Philippe Brucker /* Strip bit 7, that indicates multifunction */ 7636078a454SJean-Philippe Brucker pdev->hdr.header_type &= 0x7f; 7646078a454SJean-Philippe Brucker 7656078a454SJean-Philippe Brucker if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) { 7666078a454SJean-Philippe Brucker vfio_dev_err(vdev, "unsupported header type %u", 7676078a454SJean-Philippe Brucker pdev->hdr.header_type); 7686078a454SJean-Philippe Brucker return -EOPNOTSUPP; 7696078a454SJean-Philippe Brucker } 7706078a454SJean-Philippe Brucker 
771c9888d95SJean-Philippe Brucker if (pdev->hdr.irq_pin) 772c9888d95SJean-Philippe Brucker pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX; 773c9888d95SJean-Philippe Brucker 7746078a454SJean-Philippe Brucker vfio_pci_parse_caps(vdev); 7756078a454SJean-Philippe Brucker 7766078a454SJean-Philippe Brucker return 0; 7776078a454SJean-Philippe Brucker } 7786078a454SJean-Philippe Brucker 7796078a454SJean-Philippe Brucker static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev) 7806078a454SJean-Philippe Brucker { 7816078a454SJean-Philippe Brucker int i; 7823665392aSAlexandru Elisei u64 base; 7836078a454SJean-Philippe Brucker ssize_t hdr_sz; 784c9888d95SJean-Philippe Brucker struct msix_cap *msix; 7856078a454SJean-Philippe Brucker struct vfio_region_info *info; 7866078a454SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 7873665392aSAlexandru Elisei struct vfio_region *region; 7886078a454SJean-Philippe Brucker 7896078a454SJean-Philippe Brucker /* Initialise the BARs */ 7906078a454SJean-Philippe Brucker for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) { 7913665392aSAlexandru Elisei if ((u32)i == vdev->info.num_regions) 7923665392aSAlexandru Elisei break; 79382caa882SJean-Philippe Brucker 7943665392aSAlexandru Elisei region = &vdev->regions[i]; 79582caa882SJean-Philippe Brucker /* Construct a fake reg to match what we've mapped. */ 79682caa882SJean-Philippe Brucker if (region->is_ioport) { 79782caa882SJean-Philippe Brucker base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) | 79882caa882SJean-Philippe Brucker PCI_BASE_ADDRESS_SPACE_IO; 79982caa882SJean-Philippe Brucker } else { 80082caa882SJean-Philippe Brucker base = (region->guest_phys_addr & 80182caa882SJean-Philippe Brucker PCI_BASE_ADDRESS_MEM_MASK) | 80282caa882SJean-Philippe Brucker PCI_BASE_ADDRESS_SPACE_MEMORY; 80382caa882SJean-Philippe Brucker } 80482caa882SJean-Philippe Brucker 80582caa882SJean-Philippe Brucker pdev->hdr.bar[i] = base; 8066078a454SJean-Philippe Brucker 8076078a454SJean-Philippe Brucker if (!base) 8086078a454SJean-Philippe Brucker continue; 8096078a454SJean-Philippe Brucker 8106078a454SJean-Philippe Brucker pdev->hdr.bar_size[i] = region->info.size; 8116078a454SJean-Philippe Brucker } 8126078a454SJean-Philippe Brucker 8136078a454SJean-Philippe Brucker /* I really can't be bothered to support cardbus. */ 8146078a454SJean-Philippe Brucker pdev->hdr.card_bus = 0; 8156078a454SJean-Philippe Brucker 8166078a454SJean-Philippe Brucker /* 8176078a454SJean-Philippe Brucker * Nuke the expansion ROM for now. If we want to do this properly, 8186078a454SJean-Philippe Brucker * we need to save its size somewhere and map into the guest. 8196078a454SJean-Philippe Brucker */ 8206078a454SJean-Philippe Brucker pdev->hdr.exp_rom_bar = 0; 8216078a454SJean-Philippe Brucker 822c9888d95SJean-Philippe Brucker /* Plumb in our fake MSI-X capability, if we have it. 
*/ 823c9888d95SJean-Philippe Brucker msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX); 824c9888d95SJean-Philippe Brucker if (msix) { 825c9888d95SJean-Philippe Brucker /* Add a shortcut to the PBA region for the MMIO handler */ 826c9888d95SJean-Philippe Brucker int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar; 827f93acc04SAlexandru Elisei u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET; 828f93acc04SAlexandru Elisei 8295f44d5d6SAlexandru Elisei pdev->msix_pba.fd_offset = vdev->regions[pba_index].info.offset + 830f93acc04SAlexandru Elisei pba_bar_offset; 831c9888d95SJean-Philippe Brucker 832c9888d95SJean-Philippe Brucker /* Tidy up the capability */ 833c9888d95SJean-Philippe Brucker msix->table_offset &= PCI_MSIX_TABLE_BIR; 834f93acc04SAlexandru Elisei if (pdev->msix_table.bar == pdev->msix_pba.bar) { 835f93acc04SAlexandru Elisei /* Keep the same offset as the MSIX cap. */ 836f93acc04SAlexandru Elisei pdev->msix_pba.bar_offset = pba_bar_offset; 837f93acc04SAlexandru Elisei } else { 838f93acc04SAlexandru Elisei /* PBA is at the start of the BAR. */ 839c9888d95SJean-Philippe Brucker msix->pba_offset &= PCI_MSIX_PBA_BIR; 840f93acc04SAlexandru Elisei pdev->msix_pba.bar_offset = 0; 841f93acc04SAlexandru Elisei } 842c9888d95SJean-Philippe Brucker } 843c9888d95SJean-Philippe Brucker 8446078a454SJean-Philippe Brucker /* Install our fake Configuration Space */ 8456078a454SJean-Philippe Brucker info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info; 846e69b7663SAlexandru Elisei /* 847e69b7663SAlexandru Elisei * We don't touch the extended configuration space, let's be cautious 848e69b7663SAlexandru Elisei * and not overwrite it all with zeros, or bad things might happen. 849e69b7663SAlexandru Elisei */ 850e69b7663SAlexandru Elisei hdr_sz = PCI_DEV_CFG_SIZE_LEGACY; 8516078a454SJean-Philippe Brucker if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) { 8526078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to write %zd bytes to Config Space", 8536078a454SJean-Philippe Brucker hdr_sz); 8546078a454SJean-Philippe Brucker return -EIO; 8556078a454SJean-Philippe Brucker } 8566078a454SJean-Philippe Brucker 8576078a454SJean-Philippe Brucker /* Register callbacks for cfg accesses */ 8586078a454SJean-Philippe Brucker pdev->hdr.cfg_ops = (struct pci_config_operations) { 8596078a454SJean-Philippe Brucker .read = vfio_pci_cfg_read, 8606078a454SJean-Philippe Brucker .write = vfio_pci_cfg_write, 8616078a454SJean-Philippe Brucker }; 8626078a454SJean-Philippe Brucker 8636078a454SJean-Philippe Brucker pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH; 8646078a454SJean-Philippe Brucker 8656078a454SJean-Philippe Brucker return 0; 8666078a454SJean-Philippe Brucker } 8676078a454SJean-Philippe Brucker 868ed01a603SAlexandru Elisei static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index, 869ed01a603SAlexandru Elisei struct vfio_region_info *info) 870ed01a603SAlexandru Elisei { 871ed01a603SAlexandru Elisei int ret; 872ed01a603SAlexandru Elisei 873ed01a603SAlexandru Elisei *info = (struct vfio_region_info) { 874ed01a603SAlexandru Elisei .argsz = sizeof(*info), 875ed01a603SAlexandru Elisei .index = index, 876ed01a603SAlexandru Elisei }; 877ed01a603SAlexandru Elisei 878ed01a603SAlexandru Elisei ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info); 879ed01a603SAlexandru Elisei if (ret) { 880ed01a603SAlexandru Elisei ret = -errno; 881ed01a603SAlexandru Elisei vfio_dev_err(vdev, "cannot get info for BAR %u", index); 882ed01a603SAlexandru Elisei return ret; 
883ed01a603SAlexandru Elisei } 884ed01a603SAlexandru Elisei 885ed01a603SAlexandru Elisei if (info->size && !is_power_of_two(info->size)) { 886ed01a603SAlexandru Elisei vfio_dev_err(vdev, "region is not power of two: 0x%llx", 887ed01a603SAlexandru Elisei info->size); 888ed01a603SAlexandru Elisei return -EINVAL; 889ed01a603SAlexandru Elisei } 890ed01a603SAlexandru Elisei 891ed01a603SAlexandru Elisei return 0; 892ed01a603SAlexandru Elisei } 893ed01a603SAlexandru Elisei 894ed01a603SAlexandru Elisei static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev) 895c9888d95SJean-Philippe Brucker { 896c9888d95SJean-Philippe Brucker int ret; 897c9888d95SJean-Philippe Brucker size_t i; 898ed01a603SAlexandru Elisei size_t map_size; 899c9888d95SJean-Philippe Brucker size_t nr_entries; 900c9888d95SJean-Philippe Brucker struct vfio_pci_msi_entry *entries; 901ed01a603SAlexandru Elisei struct vfio_pci_device *pdev = &vdev->pci; 902c9888d95SJean-Philippe Brucker struct vfio_pci_msix_pba *pba = &pdev->msix_pba; 903c9888d95SJean-Philippe Brucker struct vfio_pci_msix_table *table = &pdev->msix_table; 904c9888d95SJean-Philippe Brucker struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos); 905ed01a603SAlexandru Elisei struct vfio_region_info info; 906c9888d95SJean-Philippe Brucker 907c9888d95SJean-Philippe Brucker table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR; 908c9888d95SJean-Philippe Brucker pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR; 909c9888d95SJean-Philippe Brucker 910c9888d95SJean-Philippe Brucker nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; 911f93acc04SAlexandru Elisei 912f93acc04SAlexandru Elisei /* MSIX table and PBA must support QWORD accesses. */ 913f93acc04SAlexandru Elisei table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, 8); 914f93acc04SAlexandru Elisei pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), 8); 915c9888d95SJean-Philippe Brucker 916c9888d95SJean-Philippe Brucker entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry)); 917c9888d95SJean-Philippe Brucker if (!entries) 918c9888d95SJean-Philippe Brucker return -ENOMEM; 919c9888d95SJean-Philippe Brucker 920c9888d95SJean-Philippe Brucker for (i = 0; i < nr_entries; i++) 921c9888d95SJean-Philippe Brucker entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT; 922c9888d95SJean-Philippe Brucker 923ed01a603SAlexandru Elisei ret = vfio_pci_get_region_info(vdev, table->bar, &info); 924ed01a603SAlexandru Elisei if (ret) 925ed01a603SAlexandru Elisei return ret; 926ed01a603SAlexandru Elisei if (!info.size) 927ed01a603SAlexandru Elisei return -EINVAL; 928ed01a603SAlexandru Elisei 929f93acc04SAlexandru Elisei map_size = ALIGN(info.size, PAGE_SIZE); 930ed01a603SAlexandru Elisei table->guest_phys_addr = pci_get_mmio_block(map_size); 931c9888d95SJean-Philippe Brucker if (!table->guest_phys_addr) { 932ed01a603SAlexandru Elisei pr_err("cannot allocate MMIO space"); 933c9888d95SJean-Philippe Brucker ret = -ENOMEM; 934c9888d95SJean-Philippe Brucker goto out_free; 935c9888d95SJean-Philippe Brucker } 936c9888d95SJean-Philippe Brucker 937c9888d95SJean-Philippe Brucker /* 938c9888d95SJean-Philippe Brucker * We could map the physical PBA directly into the guest, but it's 939c9888d95SJean-Philippe Brucker * likely smaller than a page, and we can only hand full pages to the 940c9888d95SJean-Philippe Brucker * guest. 
Even though the PCI spec disallows sharing a page used for 941c9888d95SJean-Philippe Brucker * MSI-X with any other resource, it allows to share the same page 942c9888d95SJean-Philippe Brucker * between MSI-X table and PBA. For the sake of isolation, create a 943c9888d95SJean-Philippe Brucker * virtual PBA. 944c9888d95SJean-Philippe Brucker */ 945f93acc04SAlexandru Elisei if (table->bar == pba->bar) { 946f93acc04SAlexandru Elisei u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET; 947f93acc04SAlexandru Elisei 948f93acc04SAlexandru Elisei /* Sanity checks. */ 949f93acc04SAlexandru Elisei if (table->size > pba_bar_offset) 950f93acc04SAlexandru Elisei die("MSIX table overlaps with PBA"); 951f93acc04SAlexandru Elisei if (pba_bar_offset + pba->size > info.size) 952f93acc04SAlexandru Elisei die("PBA exceeds the size of the region"); 953f93acc04SAlexandru Elisei pba->guest_phys_addr = table->guest_phys_addr + pba_bar_offset; 954f93acc04SAlexandru Elisei } else { 955f93acc04SAlexandru Elisei ret = vfio_pci_get_region_info(vdev, pba->bar, &info); 956f93acc04SAlexandru Elisei if (ret) 957f93acc04SAlexandru Elisei return ret; 958f93acc04SAlexandru Elisei if (!info.size) 959f93acc04SAlexandru Elisei return -EINVAL; 960f93acc04SAlexandru Elisei 961f93acc04SAlexandru Elisei map_size = ALIGN(info.size, PAGE_SIZE); 962f93acc04SAlexandru Elisei pba->guest_phys_addr = pci_get_mmio_block(map_size); 963f93acc04SAlexandru Elisei if (!pba->guest_phys_addr) { 964f93acc04SAlexandru Elisei pr_err("cannot allocate MMIO space"); 965f93acc04SAlexandru Elisei ret = -ENOMEM; 966f93acc04SAlexandru Elisei goto out_free; 967f93acc04SAlexandru Elisei } 968f93acc04SAlexandru Elisei } 969c9888d95SJean-Philippe Brucker 970c9888d95SJean-Philippe Brucker pdev->msix.entries = entries; 971c9888d95SJean-Philippe Brucker pdev->msix.nr_entries = nr_entries; 972c9888d95SJean-Philippe Brucker 973c9888d95SJean-Philippe Brucker return 0; 974c9888d95SJean-Philippe Brucker 975c9888d95SJean-Philippe Brucker out_free: 976c9888d95SJean-Philippe Brucker free(entries); 977c9888d95SJean-Philippe Brucker 978c9888d95SJean-Philippe Brucker return ret; 979c9888d95SJean-Philippe Brucker } 980c9888d95SJean-Philippe Brucker 9818dd28afeSJean-Philippe Brucker static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev) 9828dd28afeSJean-Philippe Brucker { 9838dd28afeSJean-Philippe Brucker struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos); 9848dd28afeSJean-Philippe Brucker 9858dd28afeSJean-Philippe Brucker pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1), 9868dd28afeSJean-Philippe Brucker pdev->msi.entries = calloc(pdev->msi.nr_entries, 9878dd28afeSJean-Philippe Brucker sizeof(struct vfio_pci_msi_entry)); 9888dd28afeSJean-Philippe Brucker if (!pdev->msi.entries) 9898dd28afeSJean-Philippe Brucker return -ENOMEM; 9908dd28afeSJean-Philippe Brucker 9918dd28afeSJean-Philippe Brucker return 0; 9928dd28afeSJean-Philippe Brucker } 9938dd28afeSJean-Philippe Brucker 9946078a454SJean-Philippe Brucker static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev, 9956078a454SJean-Philippe Brucker size_t nr) 9966078a454SJean-Philippe Brucker { 9976078a454SJean-Philippe Brucker int ret; 99882caa882SJean-Philippe Brucker u32 bar; 9996078a454SJean-Philippe Brucker size_t map_size; 1000c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 10013665392aSAlexandru Elisei struct vfio_region *region; 10026078a454SJean-Philippe Brucker 10036078a454SJean-Philippe Brucker if (nr >= 

static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
				  size_t nr)
{
	int ret;
	u32 bar;
	size_t map_size;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region;

	if (nr >= vdev->info.num_regions)
		return 0;

	region = &vdev->regions[nr];
	bar = pdev->hdr.bar[nr];

	region->vdev = vdev;
	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);

	ret = vfio_pci_get_region_info(vdev, nr, &region->info);
	if (ret)
		return ret;

	/* Ignore invalid or unimplemented regions */
	if (!region->info.size)
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		/* Trap and emulate MSI-X table */
		if (nr == pdev->msix_table.bar) {
			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
			return 0;
		} else if (nr == pdev->msix_pba.bar) {
			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
			return 0;
		}
	}

	if (region->is_ioport) {
		region->port_base = pci_get_io_port_block(region->info.size);
	} else {
		/* Grab some MMIO space in the guest */
		map_size = ALIGN(region->info.size, PAGE_SIZE);
		region->guest_phys_addr = pci_get_mmio_block(map_size);
	}

	return 0;
}

static int vfio_pci_configure_dev_regions(struct kvm *kvm,
					  struct vfio_device *vdev)
{
	int ret;
	u32 bar;
	size_t i;
	bool is_64bit = false;
	struct vfio_pci_device *pdev = &vdev->pci;

	ret = vfio_pci_parse_cfg_space(vdev);
	if (ret)
		return ret;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		ret = vfio_pci_create_msix_table(kvm, vdev);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		ret = vfio_pci_create_msi_cap(kvm, pdev);
		if (ret)
			return ret;
	}

	for (i = 
VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) { 10686078a454SJean-Philippe Brucker /* Ignore top half of 64-bit BAR */ 106984998f21SAlexandru Elisei if (is_64bit) { 107084998f21SAlexandru Elisei is_64bit = false; 10716078a454SJean-Philippe Brucker continue; 107284998f21SAlexandru Elisei } 10736078a454SJean-Philippe Brucker 10746078a454SJean-Philippe Brucker ret = vfio_pci_configure_bar(kvm, vdev, i); 10756078a454SJean-Philippe Brucker if (ret) 10766078a454SJean-Philippe Brucker return ret; 10776078a454SJean-Philippe Brucker 10786078a454SJean-Philippe Brucker bar = pdev->hdr.bar[i]; 10796078a454SJean-Philippe Brucker is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) == 10806078a454SJean-Philippe Brucker PCI_BASE_ADDRESS_SPACE_MEMORY && 10816078a454SJean-Philippe Brucker bar & PCI_BASE_ADDRESS_MEM_TYPE_64; 10826078a454SJean-Philippe Brucker } 10836078a454SJean-Philippe Brucker 10846078a454SJean-Philippe Brucker /* We've configured the BARs, fake up a Configuration Space */ 10855a8e4f25SAlexandru Elisei ret = vfio_pci_fixup_cfg_space(vdev); 10865a8e4f25SAlexandru Elisei if (ret) 10875a8e4f25SAlexandru Elisei return ret; 10885a8e4f25SAlexandru Elisei 10895a8e4f25SAlexandru Elisei return pci__register_bar_regions(kvm, &pdev->hdr, vfio_pci_bar_activate, 10905a8e4f25SAlexandru Elisei vfio_pci_bar_deactivate, vdev); 10916078a454SJean-Philippe Brucker } 10926078a454SJean-Philippe Brucker 1093c9888d95SJean-Philippe Brucker /* 1094c9888d95SJean-Philippe Brucker * Attempt to update the FD limit, if opening an eventfd for each IRQ vector 1095c9888d95SJean-Philippe Brucker * would hit the limit. Which is likely to happen when a device uses 2048 MSIs. 1096c9888d95SJean-Philippe Brucker */ 1097c9888d95SJean-Philippe Brucker static int vfio_pci_reserve_irq_fds(size_t num) 1098c9888d95SJean-Philippe Brucker { 1099c9888d95SJean-Philippe Brucker /* 1100c9888d95SJean-Philippe Brucker * I counted around 27 fds under normal load. Let's add 100 for good 1101c9888d95SJean-Philippe Brucker * measure. 
1102c9888d95SJean-Philippe Brucker */ 1103c9888d95SJean-Philippe Brucker static size_t needed = 128; 1104c9888d95SJean-Philippe Brucker struct rlimit fd_limit, new_limit; 1105c9888d95SJean-Philippe Brucker 1106c9888d95SJean-Philippe Brucker needed += num; 1107c9888d95SJean-Philippe Brucker 1108c9888d95SJean-Philippe Brucker if (getrlimit(RLIMIT_NOFILE, &fd_limit)) { 1109c9888d95SJean-Philippe Brucker perror("getrlimit(RLIMIT_NOFILE)"); 1110c9888d95SJean-Philippe Brucker return 0; 1111c9888d95SJean-Philippe Brucker } 1112c9888d95SJean-Philippe Brucker 1113c9888d95SJean-Philippe Brucker if (fd_limit.rlim_cur >= needed) 1114c9888d95SJean-Philippe Brucker return 0; 1115c9888d95SJean-Philippe Brucker 1116c9888d95SJean-Philippe Brucker new_limit.rlim_cur = needed; 1117c9888d95SJean-Philippe Brucker 1118c9888d95SJean-Philippe Brucker if (fd_limit.rlim_max < needed) 1119c9888d95SJean-Philippe Brucker /* Try to bump hard limit (root only) */ 1120c9888d95SJean-Philippe Brucker new_limit.rlim_max = needed; 1121c9888d95SJean-Philippe Brucker else 1122c9888d95SJean-Philippe Brucker new_limit.rlim_max = fd_limit.rlim_max; 1123c9888d95SJean-Philippe Brucker 1124c9888d95SJean-Philippe Brucker if (setrlimit(RLIMIT_NOFILE, &new_limit)) { 1125c9888d95SJean-Philippe Brucker perror("setrlimit(RLIMIT_NOFILE)"); 1126c9888d95SJean-Philippe Brucker pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)", 1127c9888d95SJean-Philippe Brucker (size_t)(needed - fd_limit.rlim_cur)); 1128c9888d95SJean-Philippe Brucker } 1129c9888d95SJean-Philippe Brucker 1130c9888d95SJean-Philippe Brucker return 0; 1131c9888d95SJean-Philippe Brucker } 1132c9888d95SJean-Philippe Brucker 1133c9888d95SJean-Philippe Brucker static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev, 1134c9888d95SJean-Philippe Brucker struct vfio_pci_msi_common *msis) 1135c9888d95SJean-Philippe Brucker { 1136c9888d95SJean-Philippe Brucker int ret; 1137c9888d95SJean-Philippe Brucker size_t i; 1138c9888d95SJean-Philippe Brucker int *eventfds; 1139c9888d95SJean-Philippe Brucker size_t irq_set_size; 1140c9888d95SJean-Philippe Brucker struct vfio_pci_msi_entry *entry; 1141c9888d95SJean-Philippe Brucker size_t nr_entries = msis->nr_entries; 1142c9888d95SJean-Philippe Brucker 1143c9888d95SJean-Philippe Brucker ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info); 114409533d3cSAndre Przywara if (ret || msis->info.count == 0) { 1145c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "no MSI reported by VFIO"); 1146c9888d95SJean-Philippe Brucker return -ENODEV; 1147c9888d95SJean-Philippe Brucker } 1148c9888d95SJean-Philippe Brucker 1149c9888d95SJean-Philippe Brucker if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) { 1150c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "interrupt not EVENTFD capable"); 1151c9888d95SJean-Philippe Brucker return -EINVAL; 1152c9888d95SJean-Philippe Brucker } 1153c9888d95SJean-Philippe Brucker 1154c9888d95SJean-Philippe Brucker if (msis->info.count != nr_entries) { 1155c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO"); 1156c9888d95SJean-Philippe Brucker return -EINVAL; 1157c9888d95SJean-Philippe Brucker } 1158c9888d95SJean-Philippe Brucker 1159c9888d95SJean-Philippe Brucker mutex_init(&msis->mutex); 1160c9888d95SJean-Philippe Brucker 1161c9888d95SJean-Philippe Brucker vfio_pci_reserve_irq_fds(nr_entries); 1162c9888d95SJean-Philippe Brucker 1163c9888d95SJean-Philippe Brucker irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int); 
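	/*
	 * Layout sketch (illustrative, not part of the original source): the
	 * VFIO_DEVICE_SET_IRQS argument is variable-length, with one eventfd
	 * per vector packed right after the header:
	 *
	 *	| struct vfio_irq_set | int eventfds[nr_entries] |
	 *
	 * e.g. four MSI vectors need sizeof(struct vfio_irq_set) + 4 * sizeof(int)
	 * bytes; the eventfds pointer below addresses the payload area.
	 */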
1164c9888d95SJean-Philippe Brucker msis->irq_set = malloc(irq_set_size); 1165c9888d95SJean-Philippe Brucker if (!msis->irq_set) 1166c9888d95SJean-Philippe Brucker return -ENOMEM; 1167c9888d95SJean-Philippe Brucker 1168c9888d95SJean-Philippe Brucker *msis->irq_set = (struct vfio_irq_set) { 1169c9888d95SJean-Philippe Brucker .argsz = irq_set_size, 1170c9888d95SJean-Philippe Brucker .flags = VFIO_IRQ_SET_DATA_EVENTFD | 1171c9888d95SJean-Philippe Brucker VFIO_IRQ_SET_ACTION_TRIGGER, 1172c9888d95SJean-Philippe Brucker .index = msis->info.index, 1173c9888d95SJean-Philippe Brucker .start = 0, 1174c9888d95SJean-Philippe Brucker .count = nr_entries, 1175c9888d95SJean-Philippe Brucker }; 1176c9888d95SJean-Philippe Brucker 1177c9888d95SJean-Philippe Brucker eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set); 1178c9888d95SJean-Philippe Brucker 1179c9888d95SJean-Philippe Brucker for (i = 0; i < nr_entries; i++) { 1180c9888d95SJean-Philippe Brucker entry = &msis->entries[i]; 1181c9888d95SJean-Philippe Brucker entry->gsi = -1; 1182c9888d95SJean-Philippe Brucker entry->eventfd = -1; 1183c9888d95SJean-Philippe Brucker msi_set_masked(entry->virt_state, true); 1184c9888d95SJean-Philippe Brucker msi_set_masked(entry->phys_state, true); 1185c9888d95SJean-Philippe Brucker eventfds[i] = -1; 1186c9888d95SJean-Philippe Brucker } 1187c9888d95SJean-Philippe Brucker 1188c9888d95SJean-Philippe Brucker return 0; 1189c9888d95SJean-Philippe Brucker } 1190c9888d95SJean-Philippe Brucker 1191c9888d95SJean-Philippe Brucker static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev) 1192c9888d95SJean-Philippe Brucker { 1193c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 1194c9888d95SJean-Philippe Brucker int gsi = pdev->intx_gsi; 1195c9888d95SJean-Philippe Brucker struct vfio_irq_set irq_set = { 1196c9888d95SJean-Philippe Brucker .argsz = sizeof(irq_set), 1197c9888d95SJean-Philippe Brucker .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, 1198c9888d95SJean-Philippe Brucker .index = VFIO_PCI_INTX_IRQ_INDEX, 1199c9888d95SJean-Philippe Brucker }; 1200c9888d95SJean-Philippe Brucker 12017302327aSLeo Yan if (pdev->intx_fd == -1) 12027302327aSLeo Yan return; 12037302327aSLeo Yan 1204c9888d95SJean-Philippe Brucker pr_debug("user requested MSI, disabling INTx %d", gsi); 1205c9888d95SJean-Philippe Brucker 1206c9888d95SJean-Philippe Brucker ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); 1207c9888d95SJean-Philippe Brucker irq__del_irqfd(kvm, gsi, pdev->intx_fd); 1208c9888d95SJean-Philippe Brucker 1209c9888d95SJean-Philippe Brucker close(pdev->intx_fd); 1210a1ff6f87SLeo Yan close(pdev->unmask_fd); 12117302327aSLeo Yan pdev->intx_fd = -1; 1212c9888d95SJean-Philippe Brucker } 1213c9888d95SJean-Philippe Brucker 12146078a454SJean-Philippe Brucker static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev) 12156078a454SJean-Philippe Brucker { 12166078a454SJean-Philippe Brucker int ret; 12176078a454SJean-Philippe Brucker int trigger_fd, unmask_fd; 1218a3704b91SAndre Przywara union vfio_irq_eventfd trigger; 1219a3704b91SAndre Przywara union vfio_irq_eventfd unmask; 12206078a454SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 122112bd7a16SLeo Yan int gsi = pdev->intx_gsi; 12226078a454SJean-Philippe Brucker 12237302327aSLeo Yan if (pdev->intx_fd != -1) 12247302327aSLeo Yan return 0; 12257302327aSLeo Yan 12266078a454SJean-Philippe Brucker /* 12276078a454SJean-Philippe Brucker * PCI IRQ is level-triggered, so we use two eventfds. 
trigger_fd 12286078a454SJean-Philippe Brucker * signals an interrupt from host to guest, and unmask_fd signals the 12296078a454SJean-Philippe Brucker * deassertion of the line from guest to host. 12306078a454SJean-Philippe Brucker */ 12316078a454SJean-Philippe Brucker trigger_fd = eventfd(0, 0); 12326078a454SJean-Philippe Brucker if (trigger_fd < 0) { 12336078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to create trigger eventfd"); 12346078a454SJean-Philippe Brucker return trigger_fd; 12356078a454SJean-Philippe Brucker } 12366078a454SJean-Philippe Brucker 12376078a454SJean-Philippe Brucker unmask_fd = eventfd(0, 0); 12386078a454SJean-Philippe Brucker if (unmask_fd < 0) { 12396078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to create unmask eventfd"); 12406078a454SJean-Philippe Brucker close(trigger_fd); 12416078a454SJean-Philippe Brucker return unmask_fd; 12426078a454SJean-Philippe Brucker } 12436078a454SJean-Philippe Brucker 12446078a454SJean-Philippe Brucker ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd); 12456078a454SJean-Philippe Brucker if (ret) 12466078a454SJean-Philippe Brucker goto err_close; 12476078a454SJean-Philippe Brucker 12486078a454SJean-Philippe Brucker trigger.irq = (struct vfio_irq_set) { 12496078a454SJean-Philippe Brucker .argsz = sizeof(trigger), 12506078a454SJean-Philippe Brucker .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER, 12516078a454SJean-Philippe Brucker .index = VFIO_PCI_INTX_IRQ_INDEX, 12526078a454SJean-Philippe Brucker .start = 0, 12536078a454SJean-Philippe Brucker .count = 1, 12546078a454SJean-Philippe Brucker }; 1255a3704b91SAndre Przywara set_vfio_irq_eventd_payload(&trigger, trigger_fd); 12566078a454SJean-Philippe Brucker 12576078a454SJean-Philippe Brucker ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger); 12586078a454SJean-Philippe Brucker if (ret < 0) { 12596078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to setup VFIO IRQ"); 12606078a454SJean-Philippe Brucker goto err_delete_line; 12616078a454SJean-Philippe Brucker } 12626078a454SJean-Philippe Brucker 12636078a454SJean-Philippe Brucker unmask.irq = (struct vfio_irq_set) { 12646078a454SJean-Philippe Brucker .argsz = sizeof(unmask), 12656078a454SJean-Philippe Brucker .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK, 12666078a454SJean-Philippe Brucker .index = VFIO_PCI_INTX_IRQ_INDEX, 12676078a454SJean-Philippe Brucker .start = 0, 12686078a454SJean-Philippe Brucker .count = 1, 12696078a454SJean-Philippe Brucker }; 1270a3704b91SAndre Przywara set_vfio_irq_eventd_payload(&unmask, unmask_fd); 12716078a454SJean-Philippe Brucker 12726078a454SJean-Philippe Brucker ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask); 12736078a454SJean-Philippe Brucker if (ret < 0) { 12746078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to setup unmask IRQ"); 12756078a454SJean-Philippe Brucker goto err_remove_event; 12766078a454SJean-Philippe Brucker } 12776078a454SJean-Philippe Brucker 1278c9888d95SJean-Philippe Brucker pdev->intx_fd = trigger_fd; 1279a1ff6f87SLeo Yan pdev->unmask_fd = unmask_fd; 1280c9888d95SJean-Philippe Brucker 12816078a454SJean-Philippe Brucker return 0; 12826078a454SJean-Philippe Brucker 12836078a454SJean-Philippe Brucker err_remove_event: 12846078a454SJean-Philippe Brucker /* Remove trigger event */ 12856078a454SJean-Philippe Brucker trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; 12866078a454SJean-Philippe Brucker trigger.irq.count = 0; 12876078a454SJean-Philippe Brucker ioctl(vdev->fd, 
VFIO_DEVICE_SET_IRQS, &trigger);

err_delete_line:
	irq__del_irqfd(kvm, gsi, trigger_fd);

err_close:
	close(trigger_fd);
	close(unmask_fd);
	return ret;
}

static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_irq_info irq_info = {
		.argsz = sizeof(irq_info),
		.index = VFIO_PCI_INTX_IRQ_INDEX,
	};

	vfio_pci_reserve_irq_fds(2);

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
	if (ret || irq_info.count == 0) {
		vfio_dev_err(vdev, "no INTx reported by VFIO");
		return -ENODEV;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not eventfd capable");
		return -EINVAL;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
		return -EINVAL;
	}

	/* Guest is going to overwrite our irq_line...
*/ 132612bd7a16SLeo Yan pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET; 132712bd7a16SLeo Yan 13287302327aSLeo Yan pdev->intx_fd = -1; 13297302327aSLeo Yan 133012bd7a16SLeo Yan return 0; 133112bd7a16SLeo Yan } 133212bd7a16SLeo Yan 13336078a454SJean-Philippe Brucker static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev) 13346078a454SJean-Philippe Brucker { 1335c9888d95SJean-Philippe Brucker int ret = 0; 13366078a454SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 13376078a454SJean-Philippe Brucker 1338c9888d95SJean-Philippe Brucker if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) { 1339c9888d95SJean-Philippe Brucker pdev->msix.info = (struct vfio_irq_info) { 1340c9888d95SJean-Philippe Brucker .argsz = sizeof(pdev->msix.info), 1341c9888d95SJean-Philippe Brucker .index = VFIO_PCI_MSIX_IRQ_INDEX, 13426078a454SJean-Philippe Brucker }; 1343c9888d95SJean-Philippe Brucker ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix); 1344c9888d95SJean-Philippe Brucker if (ret) 1345c9888d95SJean-Philippe Brucker return ret; 13466078a454SJean-Philippe Brucker } 13476078a454SJean-Philippe Brucker 13488dd28afeSJean-Philippe Brucker if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) { 13498dd28afeSJean-Philippe Brucker pdev->msi.info = (struct vfio_irq_info) { 13508dd28afeSJean-Philippe Brucker .argsz = sizeof(pdev->msi.info), 13518dd28afeSJean-Philippe Brucker .index = VFIO_PCI_MSI_IRQ_INDEX, 13528dd28afeSJean-Philippe Brucker }; 13538dd28afeSJean-Philippe Brucker ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi); 13548dd28afeSJean-Philippe Brucker if (ret) 13558dd28afeSJean-Philippe Brucker return ret; 13568dd28afeSJean-Philippe Brucker } 13578dd28afeSJean-Philippe Brucker 135812bd7a16SLeo Yan if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) { 1359c0c45eedSAndre Przywara pci__assign_irq(&vdev->pci.hdr); 1360c0c45eedSAndre Przywara 136112bd7a16SLeo Yan ret = vfio_pci_init_intx(kvm, vdev); 136212bd7a16SLeo Yan if (ret) 136312bd7a16SLeo Yan return ret; 136412bd7a16SLeo Yan 1365c9888d95SJean-Philippe Brucker ret = vfio_pci_enable_intx(kvm, vdev); 136612bd7a16SLeo Yan } 1367c9888d95SJean-Philippe Brucker 1368c9888d95SJean-Philippe Brucker return ret; 13696078a454SJean-Philippe Brucker } 13706078a454SJean-Philippe Brucker 13716078a454SJean-Philippe Brucker int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev) 13726078a454SJean-Philippe Brucker { 13736078a454SJean-Philippe Brucker int ret; 13746078a454SJean-Philippe Brucker 13756078a454SJean-Philippe Brucker ret = vfio_pci_configure_dev_regions(kvm, vdev); 13766078a454SJean-Philippe Brucker if (ret) { 13776078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to configure regions"); 13786078a454SJean-Philippe Brucker return ret; 13796078a454SJean-Philippe Brucker } 13806078a454SJean-Philippe Brucker 13816078a454SJean-Philippe Brucker vdev->dev_hdr = (struct device_header) { 13826078a454SJean-Philippe Brucker .bus_type = DEVICE_BUS_PCI, 13836078a454SJean-Philippe Brucker .data = &vdev->pci.hdr, 13846078a454SJean-Philippe Brucker }; 13856078a454SJean-Philippe Brucker 13866078a454SJean-Philippe Brucker ret = device__register(&vdev->dev_hdr); 13876078a454SJean-Philippe Brucker if (ret) { 13886078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to register VFIO device"); 13896078a454SJean-Philippe Brucker return ret; 13906078a454SJean-Philippe Brucker } 13916078a454SJean-Philippe Brucker 13926078a454SJean-Philippe Brucker ret = vfio_pci_configure_dev_irqs(kvm, vdev); 13936078a454SJean-Philippe Brucker if (ret) { 
13946078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to configure IRQs"); 13956078a454SJean-Philippe Brucker return ret; 13966078a454SJean-Philippe Brucker } 13976078a454SJean-Philippe Brucker 13986078a454SJean-Philippe Brucker return 0; 13996078a454SJean-Philippe Brucker } 14006078a454SJean-Philippe Brucker 14016078a454SJean-Philippe Brucker void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev) 14026078a454SJean-Philippe Brucker { 14036078a454SJean-Philippe Brucker size_t i; 1404c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 14056078a454SJean-Philippe Brucker 14066078a454SJean-Philippe Brucker for (i = 0; i < vdev->info.num_regions; i++) 14076078a454SJean-Philippe Brucker vfio_unmap_region(kvm, &vdev->regions[i]); 14086078a454SJean-Philippe Brucker 14096078a454SJean-Philippe Brucker device__unregister(&vdev->dev_hdr); 1410c9888d95SJean-Philippe Brucker 1411c9888d95SJean-Philippe Brucker free(pdev->msix.irq_set); 1412c9888d95SJean-Philippe Brucker free(pdev->msix.entries); 14138dd28afeSJean-Philippe Brucker free(pdev->msi.irq_set); 14148dd28afeSJean-Philippe Brucker free(pdev->msi.entries); 14156078a454SJean-Philippe Brucker } 1416
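
/*
 * Usage sketch (illustrative, not part of the original source; the function
 * name is ours): the two entry points above are expected to pair up over the
 * lifetime of an assigned device. VFIO group/container setup and the error
 * handling around it are omitted.
 */
static int vfio_pci_example_lifecycle(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;

	/* Configure regions, register the device, then wire up its IRQs. */
	ret = vfio_pci_setup_device(kvm, vdev);
	if (ret)
		return ret;

	/* ... the guest runs and drives the device through VFIO ... */

	/* Unmap the regions and unregister the device at shutdown. */
	vfio_pci_teardown_device(kvm, vdev);

	return 0;
}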