16078a454SJean-Philippe Brucker #include "kvm/irq.h" 26078a454SJean-Philippe Brucker #include "kvm/kvm.h" 36078a454SJean-Philippe Brucker #include "kvm/kvm-cpu.h" 46078a454SJean-Philippe Brucker #include "kvm/vfio.h" 56078a454SJean-Philippe Brucker 66078a454SJean-Philippe Brucker #include <sys/ioctl.h> 76078a454SJean-Philippe Brucker #include <sys/eventfd.h> 8c9888d95SJean-Philippe Brucker #include <sys/resource.h> 9c9888d95SJean-Philippe Brucker #include <sys/time.h> 106078a454SJean-Philippe Brucker 116078a454SJean-Philippe Brucker /* Wrapper around UAPI vfio_irq_set */ 12a3704b91SAndre Przywara union vfio_irq_eventfd { 136078a454SJean-Philippe Brucker struct vfio_irq_set irq; 14a3704b91SAndre Przywara u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)]; 156078a454SJean-Philippe Brucker }; 166078a454SJean-Philippe Brucker 17a3704b91SAndre Przywara static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd) 18a3704b91SAndre Przywara { 19a3704b91SAndre Przywara memcpy(&evfd->irq.data, &fd, sizeof(fd)); 20a3704b91SAndre Przywara } 21a3704b91SAndre Przywara 22c9888d95SJean-Philippe Brucker #define msi_is_enabled(state) ((state) & VFIO_PCI_MSI_STATE_ENABLED) 23c9888d95SJean-Philippe Brucker #define msi_is_masked(state) ((state) & VFIO_PCI_MSI_STATE_MASKED) 24c9888d95SJean-Philippe Brucker #define msi_is_empty(state) ((state) & VFIO_PCI_MSI_STATE_EMPTY) 25c9888d95SJean-Philippe Brucker 26c9888d95SJean-Philippe Brucker #define msi_update_state(state, val, bit) \ 27c9888d95SJean-Philippe Brucker (state) = (val) ? (state) | bit : (state) & ~bit; 28c9888d95SJean-Philippe Brucker #define msi_set_enabled(state, val) \ 29c9888d95SJean-Philippe Brucker msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED) 30c9888d95SJean-Philippe Brucker #define msi_set_masked(state, val) \ 31c9888d95SJean-Philippe Brucker msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED) 32c9888d95SJean-Philippe Brucker #define msi_set_empty(state, val) \ 33c9888d95SJean-Philippe Brucker msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY) 34c9888d95SJean-Philippe Brucker 35c9888d95SJean-Philippe Brucker static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev); 367302327aSLeo Yan static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev); 37c9888d95SJean-Philippe Brucker 388dd28afeSJean-Philippe Brucker static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev, 398dd28afeSJean-Philippe Brucker bool msix) 40c9888d95SJean-Philippe Brucker { 41c9888d95SJean-Philippe Brucker size_t i; 42c9888d95SJean-Philippe Brucker int ret = 0; 43c9888d95SJean-Philippe Brucker int *eventfds; 44c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 458dd28afeSJean-Philippe Brucker struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi; 46a3704b91SAndre Przywara union vfio_irq_eventfd single = { 47c9888d95SJean-Philippe Brucker .irq = { 48c9888d95SJean-Philippe Brucker .argsz = sizeof(single), 49c9888d95SJean-Philippe Brucker .flags = VFIO_IRQ_SET_DATA_EVENTFD | 50c9888d95SJean-Philippe Brucker VFIO_IRQ_SET_ACTION_TRIGGER, 51c9888d95SJean-Philippe Brucker .index = msis->info.index, 52c9888d95SJean-Philippe Brucker .count = 1, 53c9888d95SJean-Philippe Brucker }, 54c9888d95SJean-Philippe Brucker }; 55c9888d95SJean-Philippe Brucker 56c9888d95SJean-Philippe Brucker if (!msi_is_enabled(msis->virt_state)) 57c9888d95SJean-Philippe Brucker return 0; 58c9888d95SJean-Philippe Brucker 597302327aSLeo Yan if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) 60c9888d95SJean-Philippe Brucker /* 61c9888d95SJean-Philippe Brucker * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same 62c9888d95SJean-Philippe Brucker * time. Since INTx has to be enabled from the start (we don't 637302327aSLeo Yan * have a reliable way to know when the guest starts using it), 64c9888d95SJean-Philippe Brucker * disable it now. 65c9888d95SJean-Philippe Brucker */ 66c9888d95SJean-Philippe Brucker vfio_pci_disable_intx(kvm, vdev); 67c9888d95SJean-Philippe Brucker 68c9888d95SJean-Philippe Brucker eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set); 69c9888d95SJean-Philippe Brucker 70c9888d95SJean-Philippe Brucker /* 71c9888d95SJean-Philippe Brucker * Initial registration of the full range. This enables the physical 72c9888d95SJean-Philippe Brucker * MSI/MSI-X capability, which might have desired side effects. For 73c9888d95SJean-Philippe Brucker * instance when assigning virtio legacy devices, enabling the MSI 74c9888d95SJean-Philippe Brucker * capability modifies the config space layout! 75c9888d95SJean-Philippe Brucker * 76c9888d95SJean-Philippe Brucker * As an optimization, only update MSIs when guest unmasks the 77c9888d95SJean-Philippe Brucker * capability. This greatly reduces the initialization time for Linux 78c9888d95SJean-Philippe Brucker * guest with 2048+ MSIs. Linux guest starts by enabling the MSI-X cap 79c9888d95SJean-Philippe Brucker * masked, then fills individual vectors, then unmasks the whole 80c9888d95SJean-Philippe Brucker * function. So we only do one VFIO ioctl when enabling for the first 81c9888d95SJean-Philippe Brucker * time, and then one when unmasking. 82c9888d95SJean-Philippe Brucker * 83c9888d95SJean-Philippe Brucker * phys_state is empty when it is enabled but no vector has been 84c9888d95SJean-Philippe Brucker * registered via SET_IRQS yet. 85c9888d95SJean-Philippe Brucker */ 86c9888d95SJean-Philippe Brucker if (!msi_is_enabled(msis->phys_state) || 87c9888d95SJean-Philippe Brucker (!msi_is_masked(msis->virt_state) && 88c9888d95SJean-Philippe Brucker msi_is_empty(msis->phys_state))) { 89c9888d95SJean-Philippe Brucker bool empty = true; 90c9888d95SJean-Philippe Brucker 91c9888d95SJean-Philippe Brucker for (i = 0; i < msis->nr_entries; i++) { 92c9888d95SJean-Philippe Brucker eventfds[i] = msis->entries[i].gsi >= 0 ? 93c9888d95SJean-Philippe Brucker msis->entries[i].eventfd : -1; 94c9888d95SJean-Philippe Brucker 95c9888d95SJean-Philippe Brucker if (eventfds[i] >= 0) 96c9888d95SJean-Philippe Brucker empty = false; 97c9888d95SJean-Philippe Brucker } 98c9888d95SJean-Philippe Brucker 99c9888d95SJean-Philippe Brucker ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set); 100c9888d95SJean-Philippe Brucker if (ret < 0) { 101c9888d95SJean-Philippe Brucker perror("VFIO_DEVICE_SET_IRQS(multi)"); 102c9888d95SJean-Philippe Brucker return ret; 103c9888d95SJean-Philippe Brucker } 104c9888d95SJean-Philippe Brucker 105c9888d95SJean-Philippe Brucker msi_set_enabled(msis->phys_state, true); 106c9888d95SJean-Philippe Brucker msi_set_empty(msis->phys_state, empty); 107c9888d95SJean-Philippe Brucker 108c9888d95SJean-Philippe Brucker return 0; 109c9888d95SJean-Philippe Brucker } 110c9888d95SJean-Philippe Brucker 111c9888d95SJean-Philippe Brucker if (msi_is_masked(msis->virt_state)) { 112c9888d95SJean-Philippe Brucker /* TODO: if phys_state is not empty nor masked, mask all vectors */ 113c9888d95SJean-Philippe Brucker return 0; 114c9888d95SJean-Philippe Brucker } 115c9888d95SJean-Philippe Brucker 116c9888d95SJean-Philippe Brucker /* Update individual vectors to avoid breaking those in use */ 117c9888d95SJean-Philippe Brucker for (i = 0; i < msis->nr_entries; i++) { 118c9888d95SJean-Philippe Brucker struct vfio_pci_msi_entry *entry = &msis->entries[i]; 119c9888d95SJean-Philippe Brucker int fd = entry->gsi >= 0 ? entry->eventfd : -1; 120c9888d95SJean-Philippe Brucker 121c9888d95SJean-Philippe Brucker if (fd == eventfds[i]) 122c9888d95SJean-Philippe Brucker continue; 123c9888d95SJean-Philippe Brucker 124c9888d95SJean-Philippe Brucker single.irq.start = i; 125a3704b91SAndre Przywara set_vfio_irq_eventd_payload(&single, fd); 126c9888d95SJean-Philippe Brucker 127c9888d95SJean-Philippe Brucker ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single); 128c9888d95SJean-Philippe Brucker if (ret < 0) { 129c9888d95SJean-Philippe Brucker perror("VFIO_DEVICE_SET_IRQS(single)"); 130c9888d95SJean-Philippe Brucker break; 131c9888d95SJean-Philippe Brucker } 132c9888d95SJean-Philippe Brucker 133c9888d95SJean-Philippe Brucker eventfds[i] = fd; 134c9888d95SJean-Philippe Brucker 135c9888d95SJean-Philippe Brucker if (msi_is_empty(msis->phys_state) && fd >= 0) 136c9888d95SJean-Philippe Brucker msi_set_empty(msis->phys_state, false); 137c9888d95SJean-Philippe Brucker } 138c9888d95SJean-Philippe Brucker 139c9888d95SJean-Philippe Brucker return ret; 140c9888d95SJean-Philippe Brucker } 141c9888d95SJean-Philippe Brucker 1428dd28afeSJean-Philippe Brucker static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev, 1438dd28afeSJean-Philippe Brucker bool msix) 144c9888d95SJean-Philippe Brucker { 145c9888d95SJean-Philippe Brucker int ret; 146c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 1478dd28afeSJean-Philippe Brucker struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi; 148c9888d95SJean-Philippe Brucker struct vfio_irq_set irq_set = { 149c9888d95SJean-Philippe Brucker .argsz = sizeof(irq_set), 150c9888d95SJean-Philippe Brucker .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, 151c9888d95SJean-Philippe Brucker .index = msis->info.index, 152c9888d95SJean-Philippe Brucker .start = 0, 153c9888d95SJean-Philippe Brucker .count = 0, 154c9888d95SJean-Philippe Brucker }; 155c9888d95SJean-Philippe Brucker 156c9888d95SJean-Philippe Brucker if (!msi_is_enabled(msis->phys_state)) 157c9888d95SJean-Philippe Brucker return 0; 158c9888d95SJean-Philippe Brucker 159c9888d95SJean-Philippe Brucker ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); 160c9888d95SJean-Philippe Brucker if (ret < 0) { 161c9888d95SJean-Philippe Brucker perror("VFIO_DEVICE_SET_IRQS(NONE)"); 162c9888d95SJean-Philippe Brucker return ret; 163c9888d95SJean-Philippe Brucker } 164c9888d95SJean-Philippe Brucker 165c9888d95SJean-Philippe Brucker msi_set_enabled(msis->phys_state, false); 166c9888d95SJean-Philippe Brucker msi_set_empty(msis->phys_state, true); 167c9888d95SJean-Philippe Brucker 1687302327aSLeo Yan /* 1697302327aSLeo Yan * When MSI or MSIX is disabled, this might be called when 1707302327aSLeo Yan * PCI driver detects the MSI interrupt failure and wants to 1717302327aSLeo Yan * rollback to INTx mode. Thus enable INTx if the device 1727302327aSLeo Yan * supports INTx mode in this case. 1737302327aSLeo Yan */ 1747302327aSLeo Yan if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) 1757302327aSLeo Yan ret = vfio_pci_enable_intx(kvm, vdev); 1767302327aSLeo Yan 1777302327aSLeo Yan return ret >= 0 ? 0 : ret; 178c9888d95SJean-Philippe Brucker } 179c9888d95SJean-Philippe Brucker 180c9888d95SJean-Philippe Brucker static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev, 181c9888d95SJean-Philippe Brucker struct vfio_pci_msi_entry *entry) 182c9888d95SJean-Philippe Brucker { 183c9888d95SJean-Philippe Brucker int ret; 184c9888d95SJean-Philippe Brucker 185c9888d95SJean-Philippe Brucker if (entry->eventfd < 0) { 186c9888d95SJean-Philippe Brucker entry->eventfd = eventfd(0, 0); 187c9888d95SJean-Philippe Brucker if (entry->eventfd < 0) { 188c9888d95SJean-Philippe Brucker ret = -errno; 189c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "cannot create eventfd"); 190c9888d95SJean-Philippe Brucker return ret; 191c9888d95SJean-Philippe Brucker } 192c9888d95SJean-Philippe Brucker } 193c9888d95SJean-Philippe Brucker 194c9888d95SJean-Philippe Brucker /* Allocate IRQ if necessary */ 195c9888d95SJean-Philippe Brucker if (entry->gsi < 0) { 196c9888d95SJean-Philippe Brucker int ret = irq__add_msix_route(kvm, &entry->config.msg, 197c9888d95SJean-Philippe Brucker vdev->dev_hdr.dev_num << 3); 198c9888d95SJean-Philippe Brucker if (ret < 0) { 199c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "cannot create MSI-X route"); 200c9888d95SJean-Philippe Brucker return ret; 201c9888d95SJean-Philippe Brucker } 202c9888d95SJean-Philippe Brucker entry->gsi = ret; 203c9888d95SJean-Philippe Brucker } else { 204c9888d95SJean-Philippe Brucker irq__update_msix_route(kvm, entry->gsi, &entry->config.msg); 205c9888d95SJean-Philippe Brucker } 206c9888d95SJean-Philippe Brucker 207c9888d95SJean-Philippe Brucker /* 208c9888d95SJean-Philippe Brucker * MSI masking is unimplemented in VFIO, so we have to handle it by 209c9888d95SJean-Philippe Brucker * disabling/enabling IRQ route instead. We do it on the KVM side rather 210c9888d95SJean-Philippe Brucker * than VFIO, because: 211c9888d95SJean-Philippe Brucker * - it is 8x faster 212c9888d95SJean-Philippe Brucker * - it allows to decouple masking logic from capability state. 213c9888d95SJean-Philippe Brucker * - in masked state, after removing irqfd route, we could easily plug 214c9888d95SJean-Philippe Brucker * the eventfd in a local handler, in order to serve Pending Bit reads 215c9888d95SJean-Philippe Brucker * to the guest. 216c9888d95SJean-Philippe Brucker * 217c9888d95SJean-Philippe Brucker * So entry->phys_state is masked when there is no active irqfd route. 218c9888d95SJean-Philippe Brucker */ 219c9888d95SJean-Philippe Brucker if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state)) 220c9888d95SJean-Philippe Brucker return 0; 221c9888d95SJean-Philippe Brucker 222c9888d95SJean-Philippe Brucker if (msi_is_masked(entry->phys_state)) { 223c9888d95SJean-Philippe Brucker ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1); 224c9888d95SJean-Philippe Brucker if (ret < 0) { 225c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "cannot setup irqfd"); 226c9888d95SJean-Philippe Brucker return ret; 227c9888d95SJean-Philippe Brucker } 228c9888d95SJean-Philippe Brucker } else { 229c9888d95SJean-Philippe Brucker irq__del_irqfd(kvm, entry->gsi, entry->eventfd); 230c9888d95SJean-Philippe Brucker } 231c9888d95SJean-Philippe Brucker 232c9888d95SJean-Philippe Brucker msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state)); 233c9888d95SJean-Philippe Brucker 234c9888d95SJean-Philippe Brucker return 0; 235c9888d95SJean-Philippe Brucker } 236c9888d95SJean-Philippe Brucker 237c9888d95SJean-Philippe Brucker static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data, 238c9888d95SJean-Philippe Brucker u32 len, u8 is_write, void *ptr) 239c9888d95SJean-Philippe Brucker { 240c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = ptr; 241c9888d95SJean-Philippe Brucker struct vfio_pci_msix_pba *pba = &pdev->msix_pba; 242c9888d95SJean-Philippe Brucker u64 offset = addr - pba->guest_phys_addr; 243c9888d95SJean-Philippe Brucker struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci); 244c9888d95SJean-Philippe Brucker 245c9888d95SJean-Philippe Brucker if (is_write) 246c9888d95SJean-Philippe Brucker return; 247c9888d95SJean-Philippe Brucker 248c9888d95SJean-Philippe Brucker /* 249c9888d95SJean-Philippe Brucker * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA 250c9888d95SJean-Philippe Brucker * is completely useless here. Note that Linux doesn't use PBA. 251c9888d95SJean-Philippe Brucker */ 252c9888d95SJean-Philippe Brucker if (pread(vdev->fd, data, len, pba->offset + offset) != (ssize_t)len) 253c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "cannot access MSIX PBA\n"); 254c9888d95SJean-Philippe Brucker } 255c9888d95SJean-Philippe Brucker 256c9888d95SJean-Philippe Brucker static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data, 257c9888d95SJean-Philippe Brucker u32 len, u8 is_write, void *ptr) 258c9888d95SJean-Philippe Brucker { 259c9888d95SJean-Philippe Brucker struct kvm *kvm = vcpu->kvm; 260c9888d95SJean-Philippe Brucker struct vfio_pci_msi_entry *entry; 261c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = ptr; 262c9888d95SJean-Philippe Brucker struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci); 263c9888d95SJean-Philippe Brucker 264c9888d95SJean-Philippe Brucker u64 offset = addr - pdev->msix_table.guest_phys_addr; 265c9888d95SJean-Philippe Brucker 266c9888d95SJean-Philippe Brucker size_t vector = offset / PCI_MSIX_ENTRY_SIZE; 267c9888d95SJean-Philippe Brucker off_t field = offset % PCI_MSIX_ENTRY_SIZE; 268c9888d95SJean-Philippe Brucker 269c9888d95SJean-Philippe Brucker /* 270c9888d95SJean-Philippe Brucker * PCI spec says that software must use aligned 4 or 8 bytes accesses 271c9888d95SJean-Philippe Brucker * for the MSI-X tables. 272c9888d95SJean-Philippe Brucker */ 273c9888d95SJean-Philippe Brucker if ((len != 4 && len != 8) || addr & (len - 1)) { 274c9888d95SJean-Philippe Brucker vfio_dev_warn(vdev, "invalid MSI-X table access"); 275c9888d95SJean-Philippe Brucker return; 276c9888d95SJean-Philippe Brucker } 277c9888d95SJean-Philippe Brucker 278c9888d95SJean-Philippe Brucker entry = &pdev->msix.entries[vector]; 279c9888d95SJean-Philippe Brucker 280c9888d95SJean-Philippe Brucker mutex_lock(&pdev->msix.mutex); 281c9888d95SJean-Philippe Brucker 282c9888d95SJean-Philippe Brucker if (!is_write) { 283c9888d95SJean-Philippe Brucker memcpy(data, (void *)&entry->config + field, len); 284c9888d95SJean-Philippe Brucker goto out_unlock; 285c9888d95SJean-Philippe Brucker } 286c9888d95SJean-Philippe Brucker 287c9888d95SJean-Philippe Brucker memcpy((void *)&entry->config + field, data, len); 288c9888d95SJean-Philippe Brucker 289c9888d95SJean-Philippe Brucker /* 290c9888d95SJean-Philippe Brucker * Check if access touched the vector control register, which is at the 291c9888d95SJean-Philippe Brucker * end of the MSI-X entry. 292c9888d95SJean-Philippe Brucker */ 293c9888d95SJean-Philippe Brucker if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL) 294c9888d95SJean-Philippe Brucker goto out_unlock; 295c9888d95SJean-Philippe Brucker 296c9888d95SJean-Philippe Brucker msi_set_masked(entry->virt_state, entry->config.ctrl & 297c9888d95SJean-Philippe Brucker PCI_MSIX_ENTRY_CTRL_MASKBIT); 298c9888d95SJean-Philippe Brucker 299c9888d95SJean-Philippe Brucker if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0) 300c9888d95SJean-Philippe Brucker /* Not much we can do here. */ 301c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector); 302c9888d95SJean-Philippe Brucker 303c9888d95SJean-Philippe Brucker /* Update the physical capability if necessary */ 3048dd28afeSJean-Philippe Brucker if (vfio_pci_enable_msis(kvm, vdev, true)) 305c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "cannot enable MSIX"); 306c9888d95SJean-Philippe Brucker 307c9888d95SJean-Philippe Brucker out_unlock: 308c9888d95SJean-Philippe Brucker mutex_unlock(&pdev->msix.mutex); 309c9888d95SJean-Philippe Brucker } 310c9888d95SJean-Philippe Brucker 311c9888d95SJean-Philippe Brucker static void vfio_pci_msix_cap_write(struct kvm *kvm, 312c9888d95SJean-Philippe Brucker struct vfio_device *vdev, u8 off, 313c9888d95SJean-Philippe Brucker void *data, int sz) 314c9888d95SJean-Philippe Brucker { 315c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 316c9888d95SJean-Philippe Brucker off_t enable_pos = PCI_MSIX_FLAGS + 1; 317c9888d95SJean-Philippe Brucker bool enable; 318c9888d95SJean-Philippe Brucker u16 flags; 319c9888d95SJean-Philippe Brucker 320c9888d95SJean-Philippe Brucker off -= pdev->msix.pos; 321c9888d95SJean-Philippe Brucker 322c9888d95SJean-Philippe Brucker /* Check if access intersects with the MSI-X Enable bit */ 323c9888d95SJean-Philippe Brucker if (off > enable_pos || off + sz <= enable_pos) 324c9888d95SJean-Philippe Brucker return; 325c9888d95SJean-Philippe Brucker 326c9888d95SJean-Philippe Brucker /* Read byte that contains the Enable bit */ 327c9888d95SJean-Philippe Brucker flags = *(u8 *)(data + enable_pos - off) << 8; 328c9888d95SJean-Philippe Brucker 329c9888d95SJean-Philippe Brucker mutex_lock(&pdev->msix.mutex); 330c9888d95SJean-Philippe Brucker 331c9888d95SJean-Philippe Brucker msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL); 332c9888d95SJean-Philippe Brucker enable = flags & PCI_MSIX_FLAGS_ENABLE; 333c9888d95SJean-Philippe Brucker msi_set_enabled(pdev->msix.virt_state, enable); 334c9888d95SJean-Philippe Brucker 3358dd28afeSJean-Philippe Brucker if (enable && vfio_pci_enable_msis(kvm, vdev, true)) 336c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "cannot enable MSIX"); 3378dd28afeSJean-Philippe Brucker else if (!enable && vfio_pci_disable_msis(kvm, vdev, true)) 338c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "cannot disable MSIX"); 339c9888d95SJean-Philippe Brucker 340c9888d95SJean-Philippe Brucker mutex_unlock(&pdev->msix.mutex); 341c9888d95SJean-Philippe Brucker } 342c9888d95SJean-Philippe Brucker 3438dd28afeSJean-Philippe Brucker static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev, 3448dd28afeSJean-Philippe Brucker u8 off, u8 *data, u32 sz) 3458dd28afeSJean-Philippe Brucker { 3468dd28afeSJean-Philippe Brucker size_t i; 3478dd28afeSJean-Philippe Brucker u32 mask = 0; 3488dd28afeSJean-Philippe Brucker size_t mask_pos, start, limit; 3498dd28afeSJean-Philippe Brucker struct vfio_pci_msi_entry *entry; 3508dd28afeSJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 3518dd28afeSJean-Philippe Brucker struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos); 3528dd28afeSJean-Philippe Brucker 3538dd28afeSJean-Philippe Brucker if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT)) 3548dd28afeSJean-Philippe Brucker return 0; 3558dd28afeSJean-Philippe Brucker 3568dd28afeSJean-Philippe Brucker if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) 3578dd28afeSJean-Philippe Brucker mask_pos = PCI_MSI_MASK_64; 3588dd28afeSJean-Philippe Brucker else 3598dd28afeSJean-Philippe Brucker mask_pos = PCI_MSI_MASK_32; 3608dd28afeSJean-Philippe Brucker 3618dd28afeSJean-Philippe Brucker if (off >= mask_pos + 4 || off + sz <= mask_pos) 3628dd28afeSJean-Philippe Brucker return 0; 3638dd28afeSJean-Philippe Brucker 3648dd28afeSJean-Philippe Brucker /* Set mask to current state */ 3658dd28afeSJean-Philippe Brucker for (i = 0; i < pdev->msi.nr_entries; i++) { 3668dd28afeSJean-Philippe Brucker entry = &pdev->msi.entries[i]; 3678dd28afeSJean-Philippe Brucker mask |= !!msi_is_masked(entry->virt_state) << i; 3688dd28afeSJean-Philippe Brucker } 3698dd28afeSJean-Philippe Brucker 3708dd28afeSJean-Philippe Brucker /* Update mask following the intersection of access and register */ 3718dd28afeSJean-Philippe Brucker start = max_t(size_t, off, mask_pos); 3728dd28afeSJean-Philippe Brucker limit = min_t(size_t, off + sz, mask_pos + 4); 3738dd28afeSJean-Philippe Brucker 3748dd28afeSJean-Philippe Brucker memcpy((void *)&mask + start - mask_pos, data + start - off, 3758dd28afeSJean-Philippe Brucker limit - start); 3768dd28afeSJean-Philippe Brucker 3778dd28afeSJean-Philippe Brucker /* Update states if necessary */ 3788dd28afeSJean-Philippe Brucker for (i = 0; i < pdev->msi.nr_entries; i++) { 3798dd28afeSJean-Philippe Brucker bool masked = mask & (1 << i); 3808dd28afeSJean-Philippe Brucker 3818dd28afeSJean-Philippe Brucker entry = &pdev->msi.entries[i]; 3828dd28afeSJean-Philippe Brucker if (masked != msi_is_masked(entry->virt_state)) { 3838dd28afeSJean-Philippe Brucker msi_set_masked(entry->virt_state, masked); 3848dd28afeSJean-Philippe Brucker vfio_pci_update_msi_entry(kvm, vdev, entry); 3858dd28afeSJean-Philippe Brucker } 3868dd28afeSJean-Philippe Brucker } 3878dd28afeSJean-Philippe Brucker 3888dd28afeSJean-Philippe Brucker return 1; 3898dd28afeSJean-Philippe Brucker } 3908dd28afeSJean-Philippe Brucker 3918dd28afeSJean-Philippe Brucker static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev, 3928dd28afeSJean-Philippe Brucker u8 off, u8 *data, u32 sz) 3938dd28afeSJean-Philippe Brucker { 3948dd28afeSJean-Philippe Brucker u8 ctrl; 3958dd28afeSJean-Philippe Brucker struct msi_msg msg; 3968dd28afeSJean-Philippe Brucker size_t i, nr_vectors; 3978dd28afeSJean-Philippe Brucker struct vfio_pci_msi_entry *entry; 3988dd28afeSJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 3998dd28afeSJean-Philippe Brucker struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos); 4008dd28afeSJean-Philippe Brucker 4018dd28afeSJean-Philippe Brucker off -= pdev->msi.pos; 4028dd28afeSJean-Philippe Brucker 4038dd28afeSJean-Philippe Brucker mutex_lock(&pdev->msi.mutex); 4048dd28afeSJean-Philippe Brucker 4058dd28afeSJean-Philippe Brucker /* Check if the guest is trying to update mask bits */ 4068dd28afeSJean-Philippe Brucker if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz)) 4078dd28afeSJean-Philippe Brucker goto out_unlock; 4088dd28afeSJean-Philippe Brucker 4098dd28afeSJean-Philippe Brucker /* Only modify routes when guest pokes the enable bit */ 4108dd28afeSJean-Philippe Brucker if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS) 4118dd28afeSJean-Philippe Brucker goto out_unlock; 4128dd28afeSJean-Philippe Brucker 4138dd28afeSJean-Philippe Brucker ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off); 4148dd28afeSJean-Philippe Brucker 4158dd28afeSJean-Philippe Brucker msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE); 4168dd28afeSJean-Philippe Brucker 4178dd28afeSJean-Philippe Brucker if (!msi_is_enabled(pdev->msi.virt_state)) { 4188dd28afeSJean-Philippe Brucker vfio_pci_disable_msis(kvm, vdev, false); 4198dd28afeSJean-Philippe Brucker goto out_unlock; 4208dd28afeSJean-Philippe Brucker } 4218dd28afeSJean-Philippe Brucker 4228dd28afeSJean-Philippe Brucker /* Create routes for the requested vectors */ 4238dd28afeSJean-Philippe Brucker nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4); 4248dd28afeSJean-Philippe Brucker 4258dd28afeSJean-Philippe Brucker msg.address_lo = msi_cap_64->address_lo; 4268dd28afeSJean-Philippe Brucker if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) { 4278dd28afeSJean-Philippe Brucker msg.address_hi = msi_cap_64->address_hi; 4288dd28afeSJean-Philippe Brucker msg.data = msi_cap_64->data; 4298dd28afeSJean-Philippe Brucker } else { 4308dd28afeSJean-Philippe Brucker struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64; 4318dd28afeSJean-Philippe Brucker msg.address_hi = 0; 4328dd28afeSJean-Philippe Brucker msg.data = msi_cap_32->data; 4338dd28afeSJean-Philippe Brucker } 4348dd28afeSJean-Philippe Brucker 4358dd28afeSJean-Philippe Brucker for (i = 0; i < nr_vectors; i++) { 4368dd28afeSJean-Philippe Brucker entry = &pdev->msi.entries[i]; 4378dd28afeSJean-Philippe Brucker entry->config.msg = msg; 4388dd28afeSJean-Philippe Brucker vfio_pci_update_msi_entry(kvm, vdev, entry); 4398dd28afeSJean-Philippe Brucker } 4408dd28afeSJean-Philippe Brucker 4418dd28afeSJean-Philippe Brucker /* Update the physical capability if necessary */ 4428dd28afeSJean-Philippe Brucker if (vfio_pci_enable_msis(kvm, vdev, false)) 4438dd28afeSJean-Philippe Brucker vfio_dev_err(vdev, "cannot enable MSI"); 4448dd28afeSJean-Philippe Brucker 4458dd28afeSJean-Philippe Brucker out_unlock: 4468dd28afeSJean-Philippe Brucker mutex_unlock(&pdev->msi.mutex); 4478dd28afeSJean-Philippe Brucker } 4488dd28afeSJean-Philippe Brucker 4496078a454SJean-Philippe Brucker static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr, 4506078a454SJean-Philippe Brucker u8 offset, void *data, int sz) 4516078a454SJean-Philippe Brucker { 4526078a454SJean-Philippe Brucker struct vfio_region_info *info; 4536078a454SJean-Philippe Brucker struct vfio_pci_device *pdev; 4546078a454SJean-Philippe Brucker struct vfio_device *vdev; 4556078a454SJean-Philippe Brucker char base[sz]; 4566078a454SJean-Philippe Brucker 4576078a454SJean-Philippe Brucker pdev = container_of(pci_hdr, struct vfio_pci_device, hdr); 4586078a454SJean-Philippe Brucker vdev = container_of(pdev, struct vfio_device, pci); 4596078a454SJean-Philippe Brucker info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info; 4606078a454SJean-Philippe Brucker 4616078a454SJean-Philippe Brucker /* Dummy read in case of side-effects */ 4626078a454SJean-Philippe Brucker if (pread(vdev->fd, base, sz, info->offset + offset) != sz) 4636078a454SJean-Philippe Brucker vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x", 4646078a454SJean-Philippe Brucker sz, offset); 4656078a454SJean-Philippe Brucker } 4666078a454SJean-Philippe Brucker 4676078a454SJean-Philippe Brucker static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr, 4686078a454SJean-Philippe Brucker u8 offset, void *data, int sz) 4696078a454SJean-Philippe Brucker { 4706078a454SJean-Philippe Brucker struct vfio_region_info *info; 4716078a454SJean-Philippe Brucker struct vfio_pci_device *pdev; 4726078a454SJean-Philippe Brucker struct vfio_device *vdev; 4736078a454SJean-Philippe Brucker void *base = pci_hdr; 4746078a454SJean-Philippe Brucker 4756078a454SJean-Philippe Brucker pdev = container_of(pci_hdr, struct vfio_pci_device, hdr); 4766078a454SJean-Philippe Brucker vdev = container_of(pdev, struct vfio_device, pci); 4776078a454SJean-Philippe Brucker info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info; 4786078a454SJean-Philippe Brucker 4796078a454SJean-Philippe Brucker if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz) 4806078a454SJean-Philippe Brucker vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x", 4816078a454SJean-Philippe Brucker sz, offset); 4826078a454SJean-Philippe Brucker 483c9888d95SJean-Philippe Brucker /* Handle MSI write now, since it might update the hardware capability */ 484c9888d95SJean-Philippe Brucker if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) 485c9888d95SJean-Philippe Brucker vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz); 486c9888d95SJean-Philippe Brucker 4878dd28afeSJean-Philippe Brucker if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) 4888dd28afeSJean-Philippe Brucker vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz); 4898dd28afeSJean-Philippe Brucker 4906078a454SJean-Philippe Brucker if (pread(vdev->fd, base + offset, sz, info->offset + offset) != sz) 4916078a454SJean-Philippe Brucker vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x", 4926078a454SJean-Philippe Brucker sz, offset); 4936078a454SJean-Philippe Brucker } 4946078a454SJean-Philippe Brucker 4958dd28afeSJean-Philippe Brucker static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr) 4968dd28afeSJean-Philippe Brucker { 4978dd28afeSJean-Philippe Brucker size_t size = 10; 4988dd28afeSJean-Philippe Brucker 4998dd28afeSJean-Philippe Brucker if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT) 5008dd28afeSJean-Philippe Brucker size += 4; 5018dd28afeSJean-Philippe Brucker if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT) 5028dd28afeSJean-Philippe Brucker size += 10; 5038dd28afeSJean-Philippe Brucker 5048dd28afeSJean-Philippe Brucker return size; 5058dd28afeSJean-Philippe Brucker } 5068dd28afeSJean-Philippe Brucker 507c9888d95SJean-Philippe Brucker static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr) 508c9888d95SJean-Philippe Brucker { 509c9888d95SJean-Philippe Brucker switch (cap_hdr->type) { 510c9888d95SJean-Philippe Brucker case PCI_CAP_ID_MSIX: 511c9888d95SJean-Philippe Brucker return PCI_CAP_MSIX_SIZEOF; 5128dd28afeSJean-Philippe Brucker case PCI_CAP_ID_MSI: 5138dd28afeSJean-Philippe Brucker return vfio_pci_msi_cap_size((void *)cap_hdr); 514c9888d95SJean-Philippe Brucker default: 515c9888d95SJean-Philippe Brucker pr_err("unknown PCI capability 0x%x", cap_hdr->type); 516c9888d95SJean-Philippe Brucker return 0; 517c9888d95SJean-Philippe Brucker } 518c9888d95SJean-Philippe Brucker } 519c9888d95SJean-Philippe Brucker 520c9888d95SJean-Philippe Brucker static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr, 521c9888d95SJean-Philippe Brucker struct pci_cap_hdr *cap, off_t pos) 522c9888d95SJean-Philippe Brucker { 523c9888d95SJean-Philippe Brucker struct pci_cap_hdr *last; 524c9888d95SJean-Philippe Brucker struct pci_device_header *hdr = &vdev->pci.hdr; 525c9888d95SJean-Philippe Brucker 526c9888d95SJean-Philippe Brucker cap->next = 0; 527c9888d95SJean-Philippe Brucker 528c9888d95SJean-Philippe Brucker if (!hdr->capabilities) { 529c9888d95SJean-Philippe Brucker hdr->capabilities = pos; 530c9888d95SJean-Philippe Brucker hdr->status |= PCI_STATUS_CAP_LIST; 531c9888d95SJean-Philippe Brucker } else { 532c9888d95SJean-Philippe Brucker last = PCI_CAP(virt_hdr, hdr->capabilities); 533c9888d95SJean-Philippe Brucker 534c9888d95SJean-Philippe Brucker while (last->next) 535c9888d95SJean-Philippe Brucker last = PCI_CAP(virt_hdr, last->next); 536c9888d95SJean-Philippe Brucker 537c9888d95SJean-Philippe Brucker last->next = pos; 538c9888d95SJean-Philippe Brucker } 539c9888d95SJean-Philippe Brucker 540c9888d95SJean-Philippe Brucker memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap)); 541c9888d95SJean-Philippe Brucker 542c9888d95SJean-Philippe Brucker return 0; 543c9888d95SJean-Philippe Brucker } 544c9888d95SJean-Philippe Brucker 5456078a454SJean-Philippe Brucker static int vfio_pci_parse_caps(struct vfio_device *vdev) 5466078a454SJean-Philippe Brucker { 547c9888d95SJean-Philippe Brucker int ret; 548c9888d95SJean-Philippe Brucker size_t size; 549c9888d95SJean-Philippe Brucker u8 pos, next; 550c9888d95SJean-Philippe Brucker struct pci_cap_hdr *cap; 551c9888d95SJean-Philippe Brucker u8 virt_hdr[PCI_DEV_CFG_SIZE]; 5526078a454SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 5536078a454SJean-Philippe Brucker 5546078a454SJean-Philippe Brucker if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST)) 5556078a454SJean-Philippe Brucker return 0; 5566078a454SJean-Philippe Brucker 557c9888d95SJean-Philippe Brucker memset(virt_hdr, 0, PCI_DEV_CFG_SIZE); 558c9888d95SJean-Philippe Brucker 559c9888d95SJean-Philippe Brucker pos = pdev->hdr.capabilities & ~3; 560c9888d95SJean-Philippe Brucker 5616078a454SJean-Philippe Brucker pdev->hdr.status &= ~PCI_STATUS_CAP_LIST; 5626078a454SJean-Philippe Brucker pdev->hdr.capabilities = 0; 5636078a454SJean-Philippe Brucker 564c9888d95SJean-Philippe Brucker for (; pos; pos = next) { 565c9888d95SJean-Philippe Brucker cap = PCI_CAP(&pdev->hdr, pos); 566c9888d95SJean-Philippe Brucker next = cap->next; 567c9888d95SJean-Philippe Brucker 568c9888d95SJean-Philippe Brucker switch (cap->type) { 569c9888d95SJean-Philippe Brucker case PCI_CAP_ID_MSIX: 570c9888d95SJean-Philippe Brucker ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos); 571c9888d95SJean-Philippe Brucker if (ret) 572c9888d95SJean-Philippe Brucker return ret; 573c9888d95SJean-Philippe Brucker 574c9888d95SJean-Philippe Brucker pdev->msix.pos = pos; 575c9888d95SJean-Philippe Brucker pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX; 576c9888d95SJean-Philippe Brucker break; 5778dd28afeSJean-Philippe Brucker case PCI_CAP_ID_MSI: 5788dd28afeSJean-Philippe Brucker ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos); 5798dd28afeSJean-Philippe Brucker if (ret) 5808dd28afeSJean-Philippe Brucker return ret; 5818dd28afeSJean-Philippe Brucker 5828dd28afeSJean-Philippe Brucker pdev->msi.pos = pos; 5838dd28afeSJean-Philippe Brucker pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI; 5848dd28afeSJean-Philippe Brucker break; 585c9888d95SJean-Philippe Brucker } 586c9888d95SJean-Philippe Brucker } 587c9888d95SJean-Philippe Brucker 588c9888d95SJean-Philippe Brucker /* Wipe remaining capabilities */ 589c9888d95SJean-Philippe Brucker pos = PCI_STD_HEADER_SIZEOF; 590c9888d95SJean-Philippe Brucker size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF; 591c9888d95SJean-Philippe Brucker memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size); 5926078a454SJean-Philippe Brucker 5936078a454SJean-Philippe Brucker return 0; 5946078a454SJean-Philippe Brucker } 5956078a454SJean-Philippe Brucker 5966078a454SJean-Philippe Brucker static int vfio_pci_parse_cfg_space(struct vfio_device *vdev) 5976078a454SJean-Philippe Brucker { 598c9888d95SJean-Philippe Brucker ssize_t sz = PCI_DEV_CFG_SIZE; 5996078a454SJean-Philippe Brucker struct vfio_region_info *info; 6006078a454SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 6016078a454SJean-Philippe Brucker 6026078a454SJean-Philippe Brucker if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) { 6036078a454SJean-Philippe Brucker vfio_dev_err(vdev, "Config Space not found"); 6046078a454SJean-Philippe Brucker return -ENODEV; 6056078a454SJean-Philippe Brucker } 6066078a454SJean-Philippe Brucker 6076078a454SJean-Philippe Brucker info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info; 6086078a454SJean-Philippe Brucker *info = (struct vfio_region_info) { 6096078a454SJean-Philippe Brucker .argsz = sizeof(*info), 6106078a454SJean-Philippe Brucker .index = VFIO_PCI_CONFIG_REGION_INDEX, 6116078a454SJean-Philippe Brucker }; 6126078a454SJean-Philippe Brucker 6136078a454SJean-Philippe Brucker ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info); 6146078a454SJean-Philippe Brucker if (!info->size) { 6156078a454SJean-Philippe Brucker vfio_dev_err(vdev, "Config Space has size zero?!"); 6166078a454SJean-Philippe Brucker return -EINVAL; 6176078a454SJean-Philippe Brucker } 6186078a454SJean-Philippe Brucker 619c9888d95SJean-Philippe Brucker /* Read standard headers and capabilities */ 6206078a454SJean-Philippe Brucker if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) { 6216078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz); 6226078a454SJean-Philippe Brucker return -EIO; 6236078a454SJean-Philippe Brucker } 6246078a454SJean-Philippe Brucker 6256078a454SJean-Philippe Brucker /* Strip bit 7, that indicates multifunction */ 6266078a454SJean-Philippe Brucker pdev->hdr.header_type &= 0x7f; 6276078a454SJean-Philippe Brucker 6286078a454SJean-Philippe Brucker if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) { 6296078a454SJean-Philippe Brucker vfio_dev_err(vdev, "unsupported header type %u", 6306078a454SJean-Philippe Brucker pdev->hdr.header_type); 6316078a454SJean-Philippe Brucker return -EOPNOTSUPP; 6326078a454SJean-Philippe Brucker } 6336078a454SJean-Philippe Brucker 634c9888d95SJean-Philippe Brucker if (pdev->hdr.irq_pin) 635c9888d95SJean-Philippe Brucker pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX; 636c9888d95SJean-Philippe Brucker 6376078a454SJean-Philippe Brucker vfio_pci_parse_caps(vdev); 6386078a454SJean-Philippe Brucker 6396078a454SJean-Philippe Brucker return 0; 6406078a454SJean-Philippe Brucker } 6416078a454SJean-Philippe Brucker 6426078a454SJean-Philippe Brucker static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev) 6436078a454SJean-Philippe Brucker { 6446078a454SJean-Philippe Brucker int i; 6456078a454SJean-Philippe Brucker ssize_t hdr_sz; 646c9888d95SJean-Philippe Brucker struct msix_cap *msix; 6476078a454SJean-Philippe Brucker struct vfio_region_info *info; 6486078a454SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 6496078a454SJean-Philippe Brucker 6506078a454SJean-Philippe Brucker /* Initialise the BARs */ 6516078a454SJean-Philippe Brucker for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) { 65282caa882SJean-Philippe Brucker u64 base; 6536078a454SJean-Philippe Brucker struct vfio_region *region = &vdev->regions[i]; 65482caa882SJean-Philippe Brucker 65582caa882SJean-Philippe Brucker /* Construct a fake reg to match what we've mapped. */ 65682caa882SJean-Philippe Brucker if (region->is_ioport) { 65782caa882SJean-Philippe Brucker base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) | 65882caa882SJean-Philippe Brucker PCI_BASE_ADDRESS_SPACE_IO; 65982caa882SJean-Philippe Brucker } else { 66082caa882SJean-Philippe Brucker base = (region->guest_phys_addr & 66182caa882SJean-Philippe Brucker PCI_BASE_ADDRESS_MEM_MASK) | 66282caa882SJean-Philippe Brucker PCI_BASE_ADDRESS_SPACE_MEMORY; 66382caa882SJean-Philippe Brucker } 66482caa882SJean-Philippe Brucker 66582caa882SJean-Philippe Brucker pdev->hdr.bar[i] = base; 6666078a454SJean-Philippe Brucker 6676078a454SJean-Philippe Brucker if (!base) 6686078a454SJean-Philippe Brucker continue; 6696078a454SJean-Philippe Brucker 6706078a454SJean-Philippe Brucker pdev->hdr.bar_size[i] = region->info.size; 6716078a454SJean-Philippe Brucker } 6726078a454SJean-Philippe Brucker 6736078a454SJean-Philippe Brucker /* I really can't be bothered to support cardbus. */ 6746078a454SJean-Philippe Brucker pdev->hdr.card_bus = 0; 6756078a454SJean-Philippe Brucker 6766078a454SJean-Philippe Brucker /* 6776078a454SJean-Philippe Brucker * Nuke the expansion ROM for now. If we want to do this properly, 6786078a454SJean-Philippe Brucker * we need to save its size somewhere and map into the guest. 6796078a454SJean-Philippe Brucker */ 6806078a454SJean-Philippe Brucker pdev->hdr.exp_rom_bar = 0; 6816078a454SJean-Philippe Brucker 682c9888d95SJean-Philippe Brucker /* Plumb in our fake MSI-X capability, if we have it. */ 683c9888d95SJean-Philippe Brucker msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX); 684c9888d95SJean-Philippe Brucker if (msix) { 685c9888d95SJean-Philippe Brucker /* Add a shortcut to the PBA region for the MMIO handler */ 686c9888d95SJean-Philippe Brucker int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar; 687c9888d95SJean-Philippe Brucker pdev->msix_pba.offset = vdev->regions[pba_index].info.offset + 688c9888d95SJean-Philippe Brucker (msix->pba_offset & PCI_MSIX_PBA_OFFSET); 689c9888d95SJean-Philippe Brucker 690c9888d95SJean-Philippe Brucker /* Tidy up the capability */ 691c9888d95SJean-Philippe Brucker msix->table_offset &= PCI_MSIX_TABLE_BIR; 692c9888d95SJean-Philippe Brucker msix->pba_offset &= PCI_MSIX_PBA_BIR; 693c9888d95SJean-Philippe Brucker if (pdev->msix_table.bar == pdev->msix_pba.bar) 694c9888d95SJean-Philippe Brucker msix->pba_offset |= pdev->msix_table.size & 695c9888d95SJean-Philippe Brucker PCI_MSIX_PBA_OFFSET; 696c9888d95SJean-Philippe Brucker } 697c9888d95SJean-Philippe Brucker 6986078a454SJean-Philippe Brucker /* Install our fake Configuration Space */ 6996078a454SJean-Philippe Brucker info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info; 7006078a454SJean-Philippe Brucker hdr_sz = PCI_DEV_CFG_SIZE; 7016078a454SJean-Philippe Brucker if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) { 7026078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to write %zd bytes to Config Space", 7036078a454SJean-Philippe Brucker hdr_sz); 7046078a454SJean-Philippe Brucker return -EIO; 7056078a454SJean-Philippe Brucker } 7066078a454SJean-Philippe Brucker 7076078a454SJean-Philippe Brucker /* Register callbacks for cfg accesses */ 7086078a454SJean-Philippe Brucker pdev->hdr.cfg_ops = (struct pci_config_operations) { 7096078a454SJean-Philippe Brucker .read = vfio_pci_cfg_read, 7106078a454SJean-Philippe Brucker .write = vfio_pci_cfg_write, 7116078a454SJean-Philippe Brucker }; 7126078a454SJean-Philippe Brucker 7136078a454SJean-Philippe Brucker pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH; 7146078a454SJean-Philippe Brucker 7156078a454SJean-Philippe Brucker return 0; 7166078a454SJean-Philippe Brucker } 7176078a454SJean-Philippe Brucker 718ed01a603SAlexandru Elisei static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index, 719ed01a603SAlexandru Elisei struct vfio_region_info *info) 720ed01a603SAlexandru Elisei { 721ed01a603SAlexandru Elisei int ret; 722ed01a603SAlexandru Elisei 723ed01a603SAlexandru Elisei *info = (struct vfio_region_info) { 724ed01a603SAlexandru Elisei .argsz = sizeof(*info), 725ed01a603SAlexandru Elisei .index = index, 726ed01a603SAlexandru Elisei }; 727ed01a603SAlexandru Elisei 728ed01a603SAlexandru Elisei ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info); 729ed01a603SAlexandru Elisei if (ret) { 730ed01a603SAlexandru Elisei ret = -errno; 731ed01a603SAlexandru Elisei vfio_dev_err(vdev, "cannot get info for BAR %u", index); 732ed01a603SAlexandru Elisei return ret; 733ed01a603SAlexandru Elisei } 734ed01a603SAlexandru Elisei 735ed01a603SAlexandru Elisei if (info->size && !is_power_of_two(info->size)) { 736ed01a603SAlexandru Elisei vfio_dev_err(vdev, "region is not power of two: 0x%llx", 737ed01a603SAlexandru Elisei info->size); 738ed01a603SAlexandru Elisei return -EINVAL; 739ed01a603SAlexandru Elisei } 740ed01a603SAlexandru Elisei 741ed01a603SAlexandru Elisei return 0; 742ed01a603SAlexandru Elisei } 743ed01a603SAlexandru Elisei 744ed01a603SAlexandru Elisei static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev) 745c9888d95SJean-Philippe Brucker { 746c9888d95SJean-Philippe Brucker int ret; 747c9888d95SJean-Philippe Brucker size_t i; 748ed01a603SAlexandru Elisei size_t map_size; 749c9888d95SJean-Philippe Brucker size_t nr_entries; 750c9888d95SJean-Philippe Brucker struct vfio_pci_msi_entry *entries; 751ed01a603SAlexandru Elisei struct vfio_pci_device *pdev = &vdev->pci; 752c9888d95SJean-Philippe Brucker struct vfio_pci_msix_pba *pba = &pdev->msix_pba; 753c9888d95SJean-Philippe Brucker struct vfio_pci_msix_table *table = &pdev->msix_table; 754c9888d95SJean-Philippe Brucker struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos); 755ed01a603SAlexandru Elisei struct vfio_region_info info; 756c9888d95SJean-Philippe Brucker 757c9888d95SJean-Philippe Brucker table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR; 758c9888d95SJean-Philippe Brucker pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR; 759c9888d95SJean-Philippe Brucker 760c9888d95SJean-Philippe Brucker /* 761c9888d95SJean-Philippe Brucker * KVM needs memory regions to be multiple of and aligned on PAGE_SIZE. 762c9888d95SJean-Philippe Brucker */ 763c9888d95SJean-Philippe Brucker nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; 764c9888d95SJean-Philippe Brucker table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE); 765c9888d95SJean-Philippe Brucker pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), PAGE_SIZE); 766c9888d95SJean-Philippe Brucker 767c9888d95SJean-Philippe Brucker entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry)); 768c9888d95SJean-Philippe Brucker if (!entries) 769c9888d95SJean-Philippe Brucker return -ENOMEM; 770c9888d95SJean-Philippe Brucker 771c9888d95SJean-Philippe Brucker for (i = 0; i < nr_entries; i++) 772c9888d95SJean-Philippe Brucker entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT; 773c9888d95SJean-Philippe Brucker 774ed01a603SAlexandru Elisei ret = vfio_pci_get_region_info(vdev, table->bar, &info); 775ed01a603SAlexandru Elisei if (ret) 776ed01a603SAlexandru Elisei return ret; 777ed01a603SAlexandru Elisei if (!info.size) 778ed01a603SAlexandru Elisei return -EINVAL; 779ed01a603SAlexandru Elisei map_size = info.size; 780ed01a603SAlexandru Elisei 781ed01a603SAlexandru Elisei if (table->bar != pba->bar) { 782ed01a603SAlexandru Elisei ret = vfio_pci_get_region_info(vdev, pba->bar, &info); 783ed01a603SAlexandru Elisei if (ret) 784ed01a603SAlexandru Elisei return ret; 785ed01a603SAlexandru Elisei if (!info.size) 786ed01a603SAlexandru Elisei return -EINVAL; 787ed01a603SAlexandru Elisei map_size += info.size; 788ed01a603SAlexandru Elisei } 789ed01a603SAlexandru Elisei 790c9888d95SJean-Philippe Brucker /* 791c9888d95SJean-Philippe Brucker * To ease MSI-X cap configuration in case they share the same BAR, 792c9888d95SJean-Philippe Brucker * collapse table and pending array. The size of the BAR regions must be 793c9888d95SJean-Philippe Brucker * powers of two. 794c9888d95SJean-Philippe Brucker */ 795ed01a603SAlexandru Elisei map_size = ALIGN(map_size, PAGE_SIZE); 796ed01a603SAlexandru Elisei table->guest_phys_addr = pci_get_mmio_block(map_size); 797c9888d95SJean-Philippe Brucker if (!table->guest_phys_addr) { 798ed01a603SAlexandru Elisei pr_err("cannot allocate MMIO space"); 799c9888d95SJean-Philippe Brucker ret = -ENOMEM; 800c9888d95SJean-Philippe Brucker goto out_free; 801c9888d95SJean-Philippe Brucker } 802c9888d95SJean-Philippe Brucker pba->guest_phys_addr = table->guest_phys_addr + table->size; 803c9888d95SJean-Philippe Brucker 804c9888d95SJean-Philippe Brucker ret = kvm__register_mmio(kvm, table->guest_phys_addr, table->size, 805c9888d95SJean-Philippe Brucker false, vfio_pci_msix_table_access, pdev); 806c9888d95SJean-Philippe Brucker if (ret < 0) 807c9888d95SJean-Philippe Brucker goto out_free; 808c9888d95SJean-Philippe Brucker 809c9888d95SJean-Philippe Brucker /* 810c9888d95SJean-Philippe Brucker * We could map the physical PBA directly into the guest, but it's 811c9888d95SJean-Philippe Brucker * likely smaller than a page, and we can only hand full pages to the 812c9888d95SJean-Philippe Brucker * guest. Even though the PCI spec disallows sharing a page used for 813c9888d95SJean-Philippe Brucker * MSI-X with any other resource, it allows to share the same page 814c9888d95SJean-Philippe Brucker * between MSI-X table and PBA. For the sake of isolation, create a 815c9888d95SJean-Philippe Brucker * virtual PBA. 816c9888d95SJean-Philippe Brucker */ 817c9888d95SJean-Philippe Brucker ret = kvm__register_mmio(kvm, pba->guest_phys_addr, pba->size, false, 818c9888d95SJean-Philippe Brucker vfio_pci_msix_pba_access, pdev); 819c9888d95SJean-Philippe Brucker if (ret < 0) 820c9888d95SJean-Philippe Brucker goto out_free; 821c9888d95SJean-Philippe Brucker 822c9888d95SJean-Philippe Brucker pdev->msix.entries = entries; 823c9888d95SJean-Philippe Brucker pdev->msix.nr_entries = nr_entries; 824c9888d95SJean-Philippe Brucker 825c9888d95SJean-Philippe Brucker return 0; 826c9888d95SJean-Philippe Brucker 827c9888d95SJean-Philippe Brucker out_free: 828c9888d95SJean-Philippe Brucker free(entries); 829c9888d95SJean-Philippe Brucker 830c9888d95SJean-Philippe Brucker return ret; 831c9888d95SJean-Philippe Brucker } 832c9888d95SJean-Philippe Brucker 8338dd28afeSJean-Philippe Brucker static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev) 8348dd28afeSJean-Philippe Brucker { 8358dd28afeSJean-Philippe Brucker struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos); 8368dd28afeSJean-Philippe Brucker 8378dd28afeSJean-Philippe Brucker pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1), 8388dd28afeSJean-Philippe Brucker pdev->msi.entries = calloc(pdev->msi.nr_entries, 8398dd28afeSJean-Philippe Brucker sizeof(struct vfio_pci_msi_entry)); 8408dd28afeSJean-Philippe Brucker if (!pdev->msi.entries) 8418dd28afeSJean-Philippe Brucker return -ENOMEM; 8428dd28afeSJean-Philippe Brucker 8438dd28afeSJean-Philippe Brucker return 0; 8448dd28afeSJean-Philippe Brucker } 8458dd28afeSJean-Philippe Brucker 8466078a454SJean-Philippe Brucker static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev, 8476078a454SJean-Philippe Brucker size_t nr) 8486078a454SJean-Philippe Brucker { 8496078a454SJean-Philippe Brucker int ret; 85082caa882SJean-Philippe Brucker u32 bar; 8516078a454SJean-Philippe Brucker size_t map_size; 852c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 8536078a454SJean-Philippe Brucker struct vfio_region *region = &vdev->regions[nr]; 8546078a454SJean-Philippe Brucker 8556078a454SJean-Philippe Brucker if (nr >= vdev->info.num_regions) 8566078a454SJean-Philippe Brucker return 0; 8576078a454SJean-Philippe Brucker 85882caa882SJean-Philippe Brucker bar = pdev->hdr.bar[nr]; 85982caa882SJean-Philippe Brucker 86082caa882SJean-Philippe Brucker region->vdev = vdev; 86182caa882SJean-Philippe Brucker region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO); 8626078a454SJean-Philippe Brucker 863ed01a603SAlexandru Elisei ret = vfio_pci_get_region_info(vdev, nr, ®ion->info); 864ed01a603SAlexandru Elisei if (ret) 8656078a454SJean-Philippe Brucker return ret; 8666078a454SJean-Philippe Brucker 8676078a454SJean-Philippe Brucker /* Ignore invalid or unimplemented regions */ 8686078a454SJean-Philippe Brucker if (!region->info.size) 8696078a454SJean-Philippe Brucker return 0; 8706078a454SJean-Philippe Brucker 871c9888d95SJean-Philippe Brucker if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) { 872c9888d95SJean-Philippe Brucker /* Trap and emulate MSI-X table */ 873c9888d95SJean-Philippe Brucker if (nr == pdev->msix_table.bar) { 874c9888d95SJean-Philippe Brucker region->guest_phys_addr = pdev->msix_table.guest_phys_addr; 875c9888d95SJean-Philippe Brucker return 0; 876c9888d95SJean-Philippe Brucker } else if (nr == pdev->msix_pba.bar) { 877c9888d95SJean-Philippe Brucker region->guest_phys_addr = pdev->msix_pba.guest_phys_addr; 878c9888d95SJean-Philippe Brucker return 0; 879c9888d95SJean-Philippe Brucker } 880c9888d95SJean-Philippe Brucker } 881c9888d95SJean-Philippe Brucker 88282caa882SJean-Philippe Brucker if (!region->is_ioport) { 8836078a454SJean-Philippe Brucker /* Grab some MMIO space in the guest */ 8846078a454SJean-Philippe Brucker map_size = ALIGN(region->info.size, PAGE_SIZE); 885854aa2efSJulien Thierry region->guest_phys_addr = pci_get_mmio_block(map_size); 88682caa882SJean-Philippe Brucker } 8876078a454SJean-Philippe Brucker 88882caa882SJean-Philippe Brucker /* Map the BARs into the guest or setup a trap region. */ 8896078a454SJean-Philippe Brucker ret = vfio_map_region(kvm, vdev, region); 8906078a454SJean-Philippe Brucker if (ret) 8916078a454SJean-Philippe Brucker return ret; 8926078a454SJean-Philippe Brucker 8936078a454SJean-Philippe Brucker return 0; 8946078a454SJean-Philippe Brucker } 8956078a454SJean-Philippe Brucker 8966078a454SJean-Philippe Brucker static int vfio_pci_configure_dev_regions(struct kvm *kvm, 8976078a454SJean-Philippe Brucker struct vfio_device *vdev) 8986078a454SJean-Philippe Brucker { 8996078a454SJean-Philippe Brucker int ret; 9006078a454SJean-Philippe Brucker u32 bar; 9016078a454SJean-Philippe Brucker size_t i; 9026078a454SJean-Philippe Brucker bool is_64bit = false; 9036078a454SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 9046078a454SJean-Philippe Brucker 9056078a454SJean-Philippe Brucker ret = vfio_pci_parse_cfg_space(vdev); 9066078a454SJean-Philippe Brucker if (ret) 9076078a454SJean-Philippe Brucker return ret; 9086078a454SJean-Philippe Brucker 909c9888d95SJean-Philippe Brucker if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) { 910ed01a603SAlexandru Elisei ret = vfio_pci_create_msix_table(kvm, vdev); 911c9888d95SJean-Philippe Brucker if (ret) 912c9888d95SJean-Philippe Brucker return ret; 913c9888d95SJean-Philippe Brucker } 914c9888d95SJean-Philippe Brucker 9158dd28afeSJean-Philippe Brucker if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) { 9168dd28afeSJean-Philippe Brucker ret = vfio_pci_create_msi_cap(kvm, pdev); 9178dd28afeSJean-Philippe Brucker if (ret) 9188dd28afeSJean-Philippe Brucker return ret; 9198dd28afeSJean-Philippe Brucker } 9208dd28afeSJean-Philippe Brucker 9216078a454SJean-Philippe Brucker for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) { 9226078a454SJean-Philippe Brucker /* Ignore top half of 64-bit BAR */ 923*84998f21SAlexandru Elisei if (is_64bit) { 924*84998f21SAlexandru Elisei is_64bit = false; 9256078a454SJean-Philippe Brucker continue; 926*84998f21SAlexandru Elisei } 9276078a454SJean-Philippe Brucker 9286078a454SJean-Philippe Brucker ret = vfio_pci_configure_bar(kvm, vdev, i); 9296078a454SJean-Philippe Brucker if (ret) 9306078a454SJean-Philippe Brucker return ret; 9316078a454SJean-Philippe Brucker 9326078a454SJean-Philippe Brucker bar = pdev->hdr.bar[i]; 9336078a454SJean-Philippe Brucker is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) == 9346078a454SJean-Philippe Brucker PCI_BASE_ADDRESS_SPACE_MEMORY && 9356078a454SJean-Philippe Brucker bar & PCI_BASE_ADDRESS_MEM_TYPE_64; 9366078a454SJean-Philippe Brucker } 9376078a454SJean-Philippe Brucker 9386078a454SJean-Philippe Brucker /* We've configured the BARs, fake up a Configuration Space */ 9396078a454SJean-Philippe Brucker return vfio_pci_fixup_cfg_space(vdev); 9406078a454SJean-Philippe Brucker } 9416078a454SJean-Philippe Brucker 942c9888d95SJean-Philippe Brucker /* 943c9888d95SJean-Philippe Brucker * Attempt to update the FD limit, if opening an eventfd for each IRQ vector 944c9888d95SJean-Philippe Brucker * would hit the limit. Which is likely to happen when a device uses 2048 MSIs. 945c9888d95SJean-Philippe Brucker */ 946c9888d95SJean-Philippe Brucker static int vfio_pci_reserve_irq_fds(size_t num) 947c9888d95SJean-Philippe Brucker { 948c9888d95SJean-Philippe Brucker /* 949c9888d95SJean-Philippe Brucker * I counted around 27 fds under normal load. Let's add 100 for good 950c9888d95SJean-Philippe Brucker * measure. 951c9888d95SJean-Philippe Brucker */ 952c9888d95SJean-Philippe Brucker static size_t needed = 128; 953c9888d95SJean-Philippe Brucker struct rlimit fd_limit, new_limit; 954c9888d95SJean-Philippe Brucker 955c9888d95SJean-Philippe Brucker needed += num; 956c9888d95SJean-Philippe Brucker 957c9888d95SJean-Philippe Brucker if (getrlimit(RLIMIT_NOFILE, &fd_limit)) { 958c9888d95SJean-Philippe Brucker perror("getrlimit(RLIMIT_NOFILE)"); 959c9888d95SJean-Philippe Brucker return 0; 960c9888d95SJean-Philippe Brucker } 961c9888d95SJean-Philippe Brucker 962c9888d95SJean-Philippe Brucker if (fd_limit.rlim_cur >= needed) 963c9888d95SJean-Philippe Brucker return 0; 964c9888d95SJean-Philippe Brucker 965c9888d95SJean-Philippe Brucker new_limit.rlim_cur = needed; 966c9888d95SJean-Philippe Brucker 967c9888d95SJean-Philippe Brucker if (fd_limit.rlim_max < needed) 968c9888d95SJean-Philippe Brucker /* Try to bump hard limit (root only) */ 969c9888d95SJean-Philippe Brucker new_limit.rlim_max = needed; 970c9888d95SJean-Philippe Brucker else 971c9888d95SJean-Philippe Brucker new_limit.rlim_max = fd_limit.rlim_max; 972c9888d95SJean-Philippe Brucker 973c9888d95SJean-Philippe Brucker if (setrlimit(RLIMIT_NOFILE, &new_limit)) { 974c9888d95SJean-Philippe Brucker perror("setrlimit(RLIMIT_NOFILE)"); 975c9888d95SJean-Philippe Brucker pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)", 976c9888d95SJean-Philippe Brucker (size_t)(needed - fd_limit.rlim_cur)); 977c9888d95SJean-Philippe Brucker } 978c9888d95SJean-Philippe Brucker 979c9888d95SJean-Philippe Brucker return 0; 980c9888d95SJean-Philippe Brucker } 981c9888d95SJean-Philippe Brucker 982c9888d95SJean-Philippe Brucker static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev, 983c9888d95SJean-Philippe Brucker struct vfio_pci_msi_common *msis) 984c9888d95SJean-Philippe Brucker { 985c9888d95SJean-Philippe Brucker int ret; 986c9888d95SJean-Philippe Brucker size_t i; 987c9888d95SJean-Philippe Brucker int *eventfds; 988c9888d95SJean-Philippe Brucker size_t irq_set_size; 989c9888d95SJean-Philippe Brucker struct vfio_pci_msi_entry *entry; 990c9888d95SJean-Philippe Brucker size_t nr_entries = msis->nr_entries; 991c9888d95SJean-Philippe Brucker 992c9888d95SJean-Philippe Brucker ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info); 99309533d3cSAndre Przywara if (ret || msis->info.count == 0) { 994c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "no MSI reported by VFIO"); 995c9888d95SJean-Philippe Brucker return -ENODEV; 996c9888d95SJean-Philippe Brucker } 997c9888d95SJean-Philippe Brucker 998c9888d95SJean-Philippe Brucker if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) { 999c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "interrupt not EVENTFD capable"); 1000c9888d95SJean-Philippe Brucker return -EINVAL; 1001c9888d95SJean-Philippe Brucker } 1002c9888d95SJean-Philippe Brucker 1003c9888d95SJean-Philippe Brucker if (msis->info.count != nr_entries) { 1004c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO"); 1005c9888d95SJean-Philippe Brucker return -EINVAL; 1006c9888d95SJean-Philippe Brucker } 1007c9888d95SJean-Philippe Brucker 1008c9888d95SJean-Philippe Brucker mutex_init(&msis->mutex); 1009c9888d95SJean-Philippe Brucker 1010c9888d95SJean-Philippe Brucker vfio_pci_reserve_irq_fds(nr_entries); 1011c9888d95SJean-Philippe Brucker 1012c9888d95SJean-Philippe Brucker irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int); 1013c9888d95SJean-Philippe Brucker msis->irq_set = malloc(irq_set_size); 1014c9888d95SJean-Philippe Brucker if (!msis->irq_set) 1015c9888d95SJean-Philippe Brucker return -ENOMEM; 1016c9888d95SJean-Philippe Brucker 1017c9888d95SJean-Philippe Brucker *msis->irq_set = (struct vfio_irq_set) { 1018c9888d95SJean-Philippe Brucker .argsz = irq_set_size, 1019c9888d95SJean-Philippe Brucker .flags = VFIO_IRQ_SET_DATA_EVENTFD | 1020c9888d95SJean-Philippe Brucker VFIO_IRQ_SET_ACTION_TRIGGER, 1021c9888d95SJean-Philippe Brucker .index = msis->info.index, 1022c9888d95SJean-Philippe Brucker .start = 0, 1023c9888d95SJean-Philippe Brucker .count = nr_entries, 1024c9888d95SJean-Philippe Brucker }; 1025c9888d95SJean-Philippe Brucker 1026c9888d95SJean-Philippe Brucker eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set); 1027c9888d95SJean-Philippe Brucker 1028c9888d95SJean-Philippe Brucker for (i = 0; i < nr_entries; i++) { 1029c9888d95SJean-Philippe Brucker entry = &msis->entries[i]; 1030c9888d95SJean-Philippe Brucker entry->gsi = -1; 1031c9888d95SJean-Philippe Brucker entry->eventfd = -1; 1032c9888d95SJean-Philippe Brucker msi_set_masked(entry->virt_state, true); 1033c9888d95SJean-Philippe Brucker msi_set_masked(entry->phys_state, true); 1034c9888d95SJean-Philippe Brucker eventfds[i] = -1; 1035c9888d95SJean-Philippe Brucker } 1036c9888d95SJean-Philippe Brucker 1037c9888d95SJean-Philippe Brucker return 0; 1038c9888d95SJean-Philippe Brucker } 1039c9888d95SJean-Philippe Brucker 1040c9888d95SJean-Philippe Brucker static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev) 1041c9888d95SJean-Philippe Brucker { 1042c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 1043c9888d95SJean-Philippe Brucker int gsi = pdev->intx_gsi; 1044c9888d95SJean-Philippe Brucker struct vfio_irq_set irq_set = { 1045c9888d95SJean-Philippe Brucker .argsz = sizeof(irq_set), 1046c9888d95SJean-Philippe Brucker .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, 1047c9888d95SJean-Philippe Brucker .index = VFIO_PCI_INTX_IRQ_INDEX, 1048c9888d95SJean-Philippe Brucker }; 1049c9888d95SJean-Philippe Brucker 10507302327aSLeo Yan if (pdev->intx_fd == -1) 10517302327aSLeo Yan return; 10527302327aSLeo Yan 1053c9888d95SJean-Philippe Brucker pr_debug("user requested MSI, disabling INTx %d", gsi); 1054c9888d95SJean-Philippe Brucker 1055c9888d95SJean-Philippe Brucker ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); 1056c9888d95SJean-Philippe Brucker irq__del_irqfd(kvm, gsi, pdev->intx_fd); 1057c9888d95SJean-Philippe Brucker 1058c9888d95SJean-Philippe Brucker close(pdev->intx_fd); 1059a1ff6f87SLeo Yan close(pdev->unmask_fd); 10607302327aSLeo Yan pdev->intx_fd = -1; 1061c9888d95SJean-Philippe Brucker } 1062c9888d95SJean-Philippe Brucker 10636078a454SJean-Philippe Brucker static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev) 10646078a454SJean-Philippe Brucker { 10656078a454SJean-Philippe Brucker int ret; 10666078a454SJean-Philippe Brucker int trigger_fd, unmask_fd; 1067a3704b91SAndre Przywara union vfio_irq_eventfd trigger; 1068a3704b91SAndre Przywara union vfio_irq_eventfd unmask; 10696078a454SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 107012bd7a16SLeo Yan int gsi = pdev->intx_gsi; 10716078a454SJean-Philippe Brucker 10727302327aSLeo Yan if (pdev->intx_fd != -1) 10737302327aSLeo Yan return 0; 10747302327aSLeo Yan 10756078a454SJean-Philippe Brucker /* 10766078a454SJean-Philippe Brucker * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd 10776078a454SJean-Philippe Brucker * signals an interrupt from host to guest, and unmask_fd signals the 10786078a454SJean-Philippe Brucker * deassertion of the line from guest to host. 10796078a454SJean-Philippe Brucker */ 10806078a454SJean-Philippe Brucker trigger_fd = eventfd(0, 0); 10816078a454SJean-Philippe Brucker if (trigger_fd < 0) { 10826078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to create trigger eventfd"); 10836078a454SJean-Philippe Brucker return trigger_fd; 10846078a454SJean-Philippe Brucker } 10856078a454SJean-Philippe Brucker 10866078a454SJean-Philippe Brucker unmask_fd = eventfd(0, 0); 10876078a454SJean-Philippe Brucker if (unmask_fd < 0) { 10886078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to create unmask eventfd"); 10896078a454SJean-Philippe Brucker close(trigger_fd); 10906078a454SJean-Philippe Brucker return unmask_fd; 10916078a454SJean-Philippe Brucker } 10926078a454SJean-Philippe Brucker 10936078a454SJean-Philippe Brucker ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd); 10946078a454SJean-Philippe Brucker if (ret) 10956078a454SJean-Philippe Brucker goto err_close; 10966078a454SJean-Philippe Brucker 10976078a454SJean-Philippe Brucker trigger.irq = (struct vfio_irq_set) { 10986078a454SJean-Philippe Brucker .argsz = sizeof(trigger), 10996078a454SJean-Philippe Brucker .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER, 11006078a454SJean-Philippe Brucker .index = VFIO_PCI_INTX_IRQ_INDEX, 11016078a454SJean-Philippe Brucker .start = 0, 11026078a454SJean-Philippe Brucker .count = 1, 11036078a454SJean-Philippe Brucker }; 1104a3704b91SAndre Przywara set_vfio_irq_eventd_payload(&trigger, trigger_fd); 11056078a454SJean-Philippe Brucker 11066078a454SJean-Philippe Brucker ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger); 11076078a454SJean-Philippe Brucker if (ret < 0) { 11086078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to setup VFIO IRQ"); 11096078a454SJean-Philippe Brucker goto err_delete_line; 11106078a454SJean-Philippe Brucker } 11116078a454SJean-Philippe Brucker 11126078a454SJean-Philippe Brucker unmask.irq = (struct vfio_irq_set) { 11136078a454SJean-Philippe Brucker .argsz = sizeof(unmask), 11146078a454SJean-Philippe Brucker .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK, 11156078a454SJean-Philippe Brucker .index = VFIO_PCI_INTX_IRQ_INDEX, 11166078a454SJean-Philippe Brucker .start = 0, 11176078a454SJean-Philippe Brucker .count = 1, 11186078a454SJean-Philippe Brucker }; 1119a3704b91SAndre Przywara set_vfio_irq_eventd_payload(&unmask, unmask_fd); 11206078a454SJean-Philippe Brucker 11216078a454SJean-Philippe Brucker ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask); 11226078a454SJean-Philippe Brucker if (ret < 0) { 11236078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to setup unmask IRQ"); 11246078a454SJean-Philippe Brucker goto err_remove_event; 11256078a454SJean-Philippe Brucker } 11266078a454SJean-Philippe Brucker 1127c9888d95SJean-Philippe Brucker pdev->intx_fd = trigger_fd; 1128a1ff6f87SLeo Yan pdev->unmask_fd = unmask_fd; 1129c9888d95SJean-Philippe Brucker 11306078a454SJean-Philippe Brucker return 0; 11316078a454SJean-Philippe Brucker 11326078a454SJean-Philippe Brucker err_remove_event: 11336078a454SJean-Philippe Brucker /* Remove trigger event */ 11346078a454SJean-Philippe Brucker trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; 11356078a454SJean-Philippe Brucker trigger.irq.count = 0; 11366078a454SJean-Philippe Brucker ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger); 11376078a454SJean-Philippe Brucker 11386078a454SJean-Philippe Brucker err_delete_line: 11396078a454SJean-Philippe Brucker irq__del_irqfd(kvm, gsi, trigger_fd); 11406078a454SJean-Philippe Brucker 11416078a454SJean-Philippe Brucker err_close: 11426078a454SJean-Philippe Brucker close(trigger_fd); 11436078a454SJean-Philippe Brucker close(unmask_fd); 11446078a454SJean-Philippe Brucker return ret; 11456078a454SJean-Philippe Brucker } 11466078a454SJean-Philippe Brucker 114712bd7a16SLeo Yan static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev) 114812bd7a16SLeo Yan { 114912bd7a16SLeo Yan int ret; 115012bd7a16SLeo Yan struct vfio_pci_device *pdev = &vdev->pci; 115112bd7a16SLeo Yan struct vfio_irq_info irq_info = { 115212bd7a16SLeo Yan .argsz = sizeof(irq_info), 115312bd7a16SLeo Yan .index = VFIO_PCI_INTX_IRQ_INDEX, 115412bd7a16SLeo Yan }; 115512bd7a16SLeo Yan 115612bd7a16SLeo Yan vfio_pci_reserve_irq_fds(2); 115712bd7a16SLeo Yan 115812bd7a16SLeo Yan ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); 115912bd7a16SLeo Yan if (ret || irq_info.count == 0) { 116012bd7a16SLeo Yan vfio_dev_err(vdev, "no INTx reported by VFIO"); 116112bd7a16SLeo Yan return -ENODEV; 116212bd7a16SLeo Yan } 116312bd7a16SLeo Yan 116412bd7a16SLeo Yan if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) { 116512bd7a16SLeo Yan vfio_dev_err(vdev, "interrupt not eventfd capable"); 116612bd7a16SLeo Yan return -EINVAL; 116712bd7a16SLeo Yan } 116812bd7a16SLeo Yan 116912bd7a16SLeo Yan if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) { 117012bd7a16SLeo Yan vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED"); 117112bd7a16SLeo Yan return -EINVAL; 117212bd7a16SLeo Yan } 117312bd7a16SLeo Yan 117412bd7a16SLeo Yan /* Guest is going to ovewrite our irq_line... */ 117512bd7a16SLeo Yan pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET; 117612bd7a16SLeo Yan 11777302327aSLeo Yan pdev->intx_fd = -1; 11787302327aSLeo Yan 117912bd7a16SLeo Yan return 0; 118012bd7a16SLeo Yan } 118112bd7a16SLeo Yan 11826078a454SJean-Philippe Brucker static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev) 11836078a454SJean-Philippe Brucker { 1184c9888d95SJean-Philippe Brucker int ret = 0; 11856078a454SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 11866078a454SJean-Philippe Brucker 1187c9888d95SJean-Philippe Brucker if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) { 1188c9888d95SJean-Philippe Brucker pdev->msix.info = (struct vfio_irq_info) { 1189c9888d95SJean-Philippe Brucker .argsz = sizeof(pdev->msix.info), 1190c9888d95SJean-Philippe Brucker .index = VFIO_PCI_MSIX_IRQ_INDEX, 11916078a454SJean-Philippe Brucker }; 1192c9888d95SJean-Philippe Brucker ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix); 1193c9888d95SJean-Philippe Brucker if (ret) 1194c9888d95SJean-Philippe Brucker return ret; 11956078a454SJean-Philippe Brucker } 11966078a454SJean-Philippe Brucker 11978dd28afeSJean-Philippe Brucker if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) { 11988dd28afeSJean-Philippe Brucker pdev->msi.info = (struct vfio_irq_info) { 11998dd28afeSJean-Philippe Brucker .argsz = sizeof(pdev->msi.info), 12008dd28afeSJean-Philippe Brucker .index = VFIO_PCI_MSI_IRQ_INDEX, 12018dd28afeSJean-Philippe Brucker }; 12028dd28afeSJean-Philippe Brucker ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi); 12038dd28afeSJean-Philippe Brucker if (ret) 12048dd28afeSJean-Philippe Brucker return ret; 12058dd28afeSJean-Philippe Brucker } 12068dd28afeSJean-Philippe Brucker 120712bd7a16SLeo Yan if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) { 120812bd7a16SLeo Yan ret = vfio_pci_init_intx(kvm, vdev); 120912bd7a16SLeo Yan if (ret) 121012bd7a16SLeo Yan return ret; 121112bd7a16SLeo Yan 1212c9888d95SJean-Philippe Brucker ret = vfio_pci_enable_intx(kvm, vdev); 121312bd7a16SLeo Yan } 1214c9888d95SJean-Philippe Brucker 1215c9888d95SJean-Philippe Brucker return ret; 12166078a454SJean-Philippe Brucker } 12176078a454SJean-Philippe Brucker 12186078a454SJean-Philippe Brucker int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev) 12196078a454SJean-Philippe Brucker { 12206078a454SJean-Philippe Brucker int ret; 12216078a454SJean-Philippe Brucker 12226078a454SJean-Philippe Brucker ret = vfio_pci_configure_dev_regions(kvm, vdev); 12236078a454SJean-Philippe Brucker if (ret) { 12246078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to configure regions"); 12256078a454SJean-Philippe Brucker return ret; 12266078a454SJean-Philippe Brucker } 12276078a454SJean-Philippe Brucker 12286078a454SJean-Philippe Brucker vdev->dev_hdr = (struct device_header) { 12296078a454SJean-Philippe Brucker .bus_type = DEVICE_BUS_PCI, 12306078a454SJean-Philippe Brucker .data = &vdev->pci.hdr, 12316078a454SJean-Philippe Brucker }; 12326078a454SJean-Philippe Brucker 12336078a454SJean-Philippe Brucker ret = device__register(&vdev->dev_hdr); 12346078a454SJean-Philippe Brucker if (ret) { 12356078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to register VFIO device"); 12366078a454SJean-Philippe Brucker return ret; 12376078a454SJean-Philippe Brucker } 12386078a454SJean-Philippe Brucker 12396078a454SJean-Philippe Brucker ret = vfio_pci_configure_dev_irqs(kvm, vdev); 12406078a454SJean-Philippe Brucker if (ret) { 12416078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to configure IRQs"); 12426078a454SJean-Philippe Brucker return ret; 12436078a454SJean-Philippe Brucker } 12446078a454SJean-Philippe Brucker 12456078a454SJean-Philippe Brucker return 0; 12466078a454SJean-Philippe Brucker } 12476078a454SJean-Philippe Brucker 12486078a454SJean-Philippe Brucker void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev) 12496078a454SJean-Philippe Brucker { 12506078a454SJean-Philippe Brucker size_t i; 1251c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci; 12526078a454SJean-Philippe Brucker 12536078a454SJean-Philippe Brucker for (i = 0; i < vdev->info.num_regions; i++) 12546078a454SJean-Philippe Brucker vfio_unmap_region(kvm, &vdev->regions[i]); 12556078a454SJean-Philippe Brucker 12566078a454SJean-Philippe Brucker device__unregister(&vdev->dev_hdr); 1257c9888d95SJean-Philippe Brucker 1258c9888d95SJean-Philippe Brucker free(pdev->msix.irq_set); 1259c9888d95SJean-Philippe Brucker free(pdev->msix.entries); 12608dd28afeSJean-Philippe Brucker free(pdev->msi.irq_set); 12618dd28afeSJean-Philippe Brucker free(pdev->msi.entries); 12626078a454SJean-Philippe Brucker } 1263