#include "linux/sizes.h"

#include "kvm/irq.h"
#include "kvm/kvm.h"
#include "kvm/kvm-cpu.h"
#include "kvm/vfio.h"

#include <assert.h>

#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <sys/resource.h>
#include <sys/time.h>

/* Some distros don't have the define. */
#ifndef PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1
#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1 12
#endif

/* Wrapper around UAPI vfio_irq_set */
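/*
 * struct vfio_irq_set ends with a flexible data[] array; with
 * VFIO_IRQ_SET_DATA_EVENTFD the payload is one file descriptor per vector,
 * so reserve room for a single eventfd here.
 */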
union vfio_irq_eventfd {
	struct vfio_irq_set irq;
	u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)];
};

static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd)
{
	memcpy(&evfd->irq.data, &fd, sizeof(fd));
}

/*
 * To support MSI and MSI-X with common code, track the host and guest states
 * of the MSI/MSI-X capability, and of individual vectors.
 *
 * Both MSI and MSI-X capabilities are enabled and disabled through registers.
 * Vectors cannot be individually disabled.
 */
#define msi_is_enabled(state)	((state) & VFIO_PCI_MSI_STATE_ENABLED)

/*
 * MSI-X: the control register allows masking all vectors, and the table allows
 * masking each vector individually.
 *
 * MSI: if the capability supports Per-Vector Masking, then the Mask Bit
 * register allows masking each vector individually. Otherwise there is no
 * masking for MSI.
 */
#define msi_is_masked(state)	((state) & VFIO_PCI_MSI_STATE_MASKED)

/*
 * A capability is empty when no vector has been registered with SET_IRQS
 * yet. It's an optimization specific to kvmtool to avoid issuing lots of
 * SET_IRQS ioctls when the guest configures the MSI-X table while the
 * capability is masked.
 */
#define msi_is_empty(state)	((state) & VFIO_PCI_MSI_STATE_EMPTY)

#define msi_update_state(state, val, bit)				\
	(state) = (val) ? (state) | bit : (state) & ~bit;
#define msi_set_enabled(state, val) \
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
#define msi_set_masked(state, val) \
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
#define msi_set_empty(state, val) \
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)

static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);

static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
				bool msix)
{
	size_t i;
	int ret = 0;
	int *eventfds;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	union vfio_irq_eventfd single = {
		.irq = {
			.argsz = sizeof(single),
			.flags = VFIO_IRQ_SET_DATA_EVENTFD |
				 VFIO_IRQ_SET_ACTION_TRIGGER,
			.index = msis->info.index,
			.count = 1,
		},
	};

	if (!msi_is_enabled(msis->guest_state))
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		/*
		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
		 * time. Since INTx has to be enabled from the start (we don't
		 * have a reliable way to know when the guest starts using it),
		 * disable it now.
		 */
		vfio_pci_disable_intx(kvm, vdev);

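	/*
	 * The eventfd payload lives right after the vfio_irq_set header;
	 * msis->irq_set is sized for one eventfd per vector.
	 */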
	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

	/*
	 * Initial registration of the full range. This enables the physical
	 * MSI/MSI-X capability, which might have side effects. For instance
	 * when assigning virtio legacy devices, enabling the MSI capability
	 * modifies the config space layout!
	 *
	 * As an optimization, only update MSIs when the guest unmasks the
	 * capability. This greatly reduces the initialization time for a Linux
	 * guest with 2048+ MSIs. A Linux guest starts by enabling the MSI-X cap
	 * masked, then fills individual vectors, then unmasks the whole
	 * function. So we only do one VFIO ioctl when enabling for the first
	 * time, and then one when unmasking.
	 */
	if (!msi_is_enabled(msis->host_state) ||
	    (!msi_is_masked(msis->guest_state) &&
	     msi_is_empty(msis->host_state))) {
		bool empty = true;

		for (i = 0; i < msis->nr_entries; i++) {
			eventfds[i] = msis->entries[i].gsi >= 0 ?
				      msis->entries[i].eventfd : -1;

			if (eventfds[i] >= 0)
				empty = false;
		}

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(multi)");
			return ret;
		}

		msi_set_enabled(msis->host_state, true);
		msi_set_empty(msis->host_state, empty);

		return 0;
	}

	if (msi_is_masked(msis->guest_state)) {
		/* TODO: if host_state is neither empty nor masked, mask all vectors */
		return 0;
	}

	/* Update individual vectors to avoid breaking those in use */
	for (i = 0; i < msis->nr_entries; i++) {
		struct vfio_pci_msi_entry *entry = &msis->entries[i];
		int fd = entry->gsi >= 0 ? entry->eventfd : -1;

		if (fd == eventfds[i])
			continue;

		single.irq.start = i;
		set_vfio_irq_eventd_payload(&single, fd);

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(single)");
			break;
		}

		eventfds[i] = fd;

		if (msi_is_empty(msis->host_state) && fd >= 0)
			msi_set_empty(msis->host_state, false);
	}

	return ret;
}

static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
				 bool msix)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
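	/*
	 * DATA_NONE/ACTION_TRIGGER with count == 0 tells VFIO to disable all
	 * vectors of this MSI/MSI-X index on the host.
	 */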
	struct vfio_irq_set irq_set = {
		.argsz = sizeof(irq_set),
		.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = msis->info.index,
		.start = 0,
		.count = 0,
	};

	if (!msi_is_enabled(msis->host_state))
		return 0;

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	if (ret < 0) {
		perror("VFIO_DEVICE_SET_IRQS(NONE)");
		return ret;
	}

	msi_set_enabled(msis->host_state, false);
	msi_set_empty(msis->host_state, true);

	/*
	 * When MSI or MSIX is disabled, this might be called because the PCI
	 * driver has detected an MSI interrupt failure and wants to roll back
	 * to INTx mode. Thus enable INTx in this case, if the device supports
	 * it.
	 */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		ret = vfio_pci_enable_intx(kvm, vdev);

	return ret >= 0 ? 0 : ret;
}

static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
				     struct vfio_pci_msi_entry *entry)
{
	int ret;

	if (entry->eventfd < 0) {
		entry->eventfd = eventfd(0, 0);
		if (entry->eventfd < 0) {
			ret = -errno;
			vfio_dev_err(vdev, "cannot create eventfd");
			return ret;
		}
	}

	/* Allocate IRQ if necessary */
	if (entry->gsi < 0) {
		int ret = irq__add_msix_route(kvm, &entry->config.msg,
					      vdev->dev_hdr.dev_num << 3);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot create MSI-X route");
			return ret;
		}
		entry->gsi = ret;
	} else {
		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
	}

	/*
	 * MSI masking is unimplemented in VFIO, so we have to handle it by
	 * disabling/enabling the IRQ route instead. We do it on the KVM side
	 * rather than the VFIO side, because:
	 * - it is 8x faster,
	 * - it decouples the masking logic from the capability state,
	 * - in the masked state, after removing the irqfd route, we could
	 *   easily plug the eventfd into a local handler, in order to serve
	 *   Pending Bit reads to the guest.
	 *
	 * So entry->host_state is masked when there is no active irqfd route.
	 */
	if (msi_is_masked(entry->guest_state) == msi_is_masked(entry->host_state))
		return 0;

	if (msi_is_masked(entry->host_state)) {
		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot setup irqfd");
			return ret;
		}
	} else {
		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
	}

	msi_set_masked(entry->host_state, msi_is_masked(entry->guest_state));

	return 0;
}

static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				     u32 len, u8 is_write, void *ptr)
{
	struct vfio_pci_device *pdev = ptr;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	u64 offset = addr - pba->guest_phys_addr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	if (offset >= pba->size) {
		vfio_dev_err(vdev, "access outside of the MSIX PBA");
		return;
	}

	if (is_write)
		return;

	/*
	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA
	 * is completely useless here. Note that Linux doesn't use the PBA.
	 */
	if (pread(vdev->fd, data, len, pba->fd_offset + offset) != (ssize_t)len)
		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
}

static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				       u32 len, u8 is_write, void *ptr)
{
	struct kvm *kvm = vcpu->kvm;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = ptr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	u64 offset = addr - pdev->msix_table.guest_phys_addr;
	if (offset >= pdev->msix_table.size) {
		vfio_dev_err(vdev, "access outside of the MSI-X table");
		return;
	}

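	/*
	 * Each MSI-X table entry is 16 bytes: Message Address low/high,
	 * Message Data and Vector Control.
	 */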
	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
	off_t field = offset % PCI_MSIX_ENTRY_SIZE;

	/*
	 * The PCI spec says that software must use aligned 4 or 8 byte
	 * accesses for the MSI-X tables.
	 */
	if ((len != 4 && len != 8) || addr & (len - 1)) {
		vfio_dev_warn(vdev, "invalid MSI-X table access");
		return;
	}

	entry = &pdev->msix.entries[vector];

	mutex_lock(&pdev->msix.mutex);

	if (!is_write) {
		memcpy(data, (void *)&entry->config + field, len);
		goto out_unlock;
	}

	memcpy((void *)&entry->config + field, data, len);

	/*
	 * Check if the access touched the vector control register, which is at
	 * the end of the MSI-X entry.
	 */
	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
		goto out_unlock;

	msi_set_masked(entry->guest_state, entry->config.ctrl &
		       PCI_MSIX_ENTRY_CTRL_MASKBIT);

	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
		/* Not much we can do here. */
		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");

out_unlock:
	mutex_unlock(&pdev->msix.mutex);
}

static void vfio_pci_msix_cap_write(struct kvm *kvm,
				    struct vfio_device *vdev, u16 off,
				    void *data, int sz)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	off_t enable_pos = PCI_MSIX_FLAGS + 1;
	bool enable;
	u16 flags;

	off -= pdev->msix.pos;

	/* Check if the access intersects with the MSI-X Enable bit */
	if (off > enable_pos || off + sz <= enable_pos)
		return;

	/* Read the byte that contains the Enable bit */
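	/*
	 * That byte is the upper half of the Message Control word, so shift it
	 * into place to compare against the PCI_MSIX_FLAGS_* definitions.
	 */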
	flags = *(u8 *)(data + enable_pos - off) << 8;

	mutex_lock(&pdev->msix.mutex);

	msi_set_masked(pdev->msix.guest_state, flags & PCI_MSIX_FLAGS_MASKALL);
	enable = flags & PCI_MSIX_FLAGS_ENABLE;
	msi_set_enabled(pdev->msix.guest_state, enable);

	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");
	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot disable MSIX");

	mutex_unlock(&pdev->msix.mutex);
}

static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
				     u16 off, u8 *data, u32 sz)
{
	size_t i;
	u32 mask = 0;
	size_t mask_pos, start, limit;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
		return 0;

	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
		mask_pos = PCI_MSI_MASK_64;
	else
		mask_pos = PCI_MSI_MASK_32;

	if (off >= mask_pos + 4 || off + sz <= mask_pos)
		return 0;

	/* Set mask to the current state */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		entry = &pdev->msi.entries[i];
		mask |= !!msi_is_masked(entry->guest_state) << i;
	}

	/* Update the mask over the intersection of the access and the register */
	start = max_t(size_t, off, mask_pos);
	limit = min_t(size_t, off + sz, mask_pos + 4);

	memcpy((void *)&mask + start - mask_pos, data + start - off,
	       limit - start);

	/* Update states if necessary */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		bool masked = mask & (1 << i);

		entry = &pdev->msi.entries[i];
		if (masked != msi_is_masked(entry->guest_state)) {
			msi_set_masked(entry->guest_state, masked);
			vfio_pci_update_msi_entry(kvm, vdev, entry);
		}
	}

	return 1;
}

static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
				   u16 off, u8 *data, u32 sz)
{
	u8 ctrl;
	struct msi_msg msg;
	size_t i, nr_vectors;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	off -= pdev->msi.pos;

	mutex_lock(&pdev->msi.mutex);

	/* Check if the guest is trying to update the mask bits */
	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
		goto out_unlock;

	/* Only modify routes when the guest pokes the enable bit */
	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
		goto out_unlock;

	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);

	msi_set_enabled(pdev->msi.guest_state, ctrl & PCI_MSI_FLAGS_ENABLE);

	if (!msi_is_enabled(pdev->msi.guest_state)) {
		vfio_pci_disable_msis(kvm, vdev, false);
		goto out_unlock;
	}

	/* Create routes for the requested vectors */
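	/*
	 * The Multiple Message Enable field (bits 6:4 of Message Control)
	 * encodes log2 of the number of vectors the guest requested.
	 */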
	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);

	msg.address_lo = msi_cap_64->address_lo;
	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
		msg.address_hi = msi_cap_64->address_hi;
		msg.data = msi_cap_64->data;
	} else {
		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
		msg.address_hi = 0;
		msg.data = msi_cap_32->data;
	}

	for (i = 0; i < nr_vectors; i++) {
		entry = &pdev->msi.entries[i];

		/*
		 * Set the MSI data value as required by the PCI Local Bus
		 * Specification, MSI capability, "Message Data".
		 */
		msg.data &= ~(nr_vectors - 1);
		msg.data |= i;

		entry->config.msg = msg;
		vfio_pci_update_msi_entry(kvm, vdev, entry);
	}

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, false))
		vfio_dev_err(vdev, "cannot enable MSI");

out_unlock:
	mutex_unlock(&pdev->msi.mutex);
}

static int vfio_pci_bar_activate(struct kvm *kvm,
				 struct pci_device_header *pci_hdr,
				 int bar_num, void *data)
{
	struct vfio_device *vdev = data;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct vfio_region *region;
	u32 bar_addr;
	bool has_msix;
	int ret;

	assert((u32)bar_num < vdev->info.num_regions);

	region = &vdev->regions[bar_num];
	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;

	bar_addr = pci__bar_address(pci_hdr, bar_num);
	if (pci__bar_is_io(pci_hdr, bar_num))
		region->port_base = bar_addr;
	else
		region->guest_phys_addr = bar_addr;

	if (has_msix && (u32)bar_num == table->bar) {
		table->guest_phys_addr = region->guest_phys_addr;
		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
					 table->size, false,
					 vfio_pci_msix_table_access, pdev);
		/*
		 * The MSIX table and the PBA structure can share the same BAR,
		 * but for convenience we register different regions for mmio
		 * emulation. We want to update both if they share the same
		 * BAR.
		 */
		if (ret < 0 || table->bar != pba->bar)
			goto out;
	}

	if (has_msix && (u32)bar_num == pba->bar) {
		if (pba->bar == table->bar)
			pba->guest_phys_addr = table->guest_phys_addr + pba->bar_offset;
		else
			pba->guest_phys_addr = region->guest_phys_addr;
		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
					 pba->size, false,
					 vfio_pci_msix_pba_access, pdev);
		goto out;
	}

	ret = vfio_map_region(kvm, vdev, region);
out:
	return ret;
}

static int vfio_pci_bar_deactivate(struct kvm *kvm,
				   struct pci_device_header *pci_hdr,
				   int bar_num, void *data)
{
	struct vfio_device *vdev = data;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct vfio_region *region;
	bool has_msix, success;
	int ret;

	assert((u32)bar_num < vdev->info.num_regions);

	region = &vdev->regions[bar_num];
	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;

	if (has_msix && (u32)bar_num == table->bar) {
		success = kvm__deregister_mmio(kvm, table->guest_phys_addr);
		/* kvm__deregister_mmio fails when the region is not found. */
		ret = (success ? 0 : -ENOENT);
		/* See vfio_pci_bar_activate(). */
		if (ret < 0 || table->bar != pba->bar)
			goto out;
	}

	if (has_msix && (u32)bar_num == pba->bar) {
		success = kvm__deregister_mmio(kvm, pba->guest_phys_addr);
		ret = (success ? 0 : -ENOENT);
		goto out;
	}

	vfio_unmap_region(kvm, region);
	ret = 0;

out:
	return ret;
}

static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
			      u16 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	char base[sz];

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	/* Dummy read in case of side-effects */
	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
			       u16 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	u32 tmp;

	/* Make sure a larger size will not overrun tmp on the stack. */
	assert(sz <= 4);

	if (offset == PCI_ROM_ADDRESS)
		return;

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
			      sz, offset);

	/* Handle the MSI write now, since it might update the hardware capability */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);

	if (pread(vdev->fd, &tmp, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
{
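	/*
	 * Base layout: cap ID, next pointer, Message Control, 32-bit Message
	 * Address and 16-bit Message Data, i.e. 10 bytes.
	 */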
	size_t size = 10;

	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
		size += 4;
	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
		size += 10;

	return size;
}

static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
{
	switch (cap_hdr->type) {
	case PCI_CAP_ID_MSIX:
		return PCI_CAP_MSIX_SIZEOF;
	case PCI_CAP_ID_MSI:
		return vfio_pci_msi_cap_size((void *)cap_hdr);
	case PCI_CAP_ID_EXP:
		/*
		 * We don't emulate any of the link, slot and root complex
		 * properties, so ignore them.
		 */
		return PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1;
	default:
		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
		return 0;
	}
}

static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
			    struct pci_cap_hdr *cap, off_t pos)
{
	struct pci_cap_hdr *last;
	struct pci_device_header *hdr = &vdev->pci.hdr;

	cap->next = 0;

	if (!hdr->capabilities) {
		hdr->capabilities = pos;
		hdr->status |= PCI_STATUS_CAP_LIST;
	} else {
		last = PCI_CAP(virt_hdr, hdr->capabilities);

		while (last->next)
			last = PCI_CAP(virt_hdr, last->next);

		last->next = pos;
	}

	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));

	return 0;
}

static int vfio_pci_parse_caps(struct vfio_device *vdev)
{
	int ret;
	size_t size;
	u16 pos, next;
	struct pci_cap_hdr *cap;
	u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY];
	struct vfio_pci_device *pdev = &vdev->pci;

	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
		return 0;

	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY);

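	/* The bottom two bits of the Capabilities Pointer are reserved. */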
	pos = pdev->hdr.capabilities & ~3;

	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
	pdev->hdr.capabilities = 0;

	for (; pos; pos = next) {
		cap = PCI_CAP(&pdev->hdr, pos);
		next = cap->next;

		switch (cap->type) {
		case PCI_CAP_ID_MSIX:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msix.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
			break;
		case PCI_CAP_ID_MSI:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msi.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
			break;
		case PCI_CAP_ID_EXP:
			if (!arch_has_pci_exp())
				continue;
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;
			break;
		}
	}

	/* Wipe remaining capabilities */
	pos = PCI_STD_HEADER_SIZEOF;
	size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF;
	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);

	return 0;
}

static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
{
	ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
		vfio_dev_err(vdev, "Config Space not found");
		return -ENODEV;
	}

	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	*info = (struct vfio_region_info) {
		.argsz = sizeof(*info),
		.index = VFIO_PCI_CONFIG_REGION_INDEX,
	};

	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
	if (!info->size) {
		vfio_dev_err(vdev, "Config Space has size zero?!");
		return -EINVAL;
	}

	/* Read standard headers and capabilities */
	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
		return -EIO;
	}

	/* Strip bit 7, which indicates a multifunction device */
	pdev->hdr.header_type &= 0x7f;

	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
		vfio_dev_err(vdev, "unsupported header type %u",
			     pdev->hdr.header_type);
		return -EOPNOTSUPP;
	}

	if (pdev->hdr.irq_pin)
		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;

	vfio_pci_parse_caps(vdev);

	return 0;
}

static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
{
	int i;
	u64 base;
	ssize_t hdr_sz;
	struct msix_cap *msix;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region;

	/* Initialise the BARs */
	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		if ((u32)i == vdev->info.num_regions)
			break;

		region = &vdev->regions[i];
		/* Construct a fake reg to match what we've mapped. */
		if (region->is_ioport) {
			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
				PCI_BASE_ADDRESS_SPACE_IO;
		} else {
			base = (region->guest_phys_addr &
				PCI_BASE_ADDRESS_MEM_MASK) |
				PCI_BASE_ADDRESS_SPACE_MEMORY;
		}

		pdev->hdr.bar[i] = base;

		if (!base)
			continue;

		pdev->hdr.bar_size[i] = region->info.size;
	}

	/* I really can't be bothered to support cardbus. */
	pdev->hdr.card_bus = 0;

	/*
	 * Nuke the expansion ROM for now. If we want to do this properly,
	 * we need to save its size somewhere and map it into the guest.
	 */
	pdev->hdr.exp_rom_bar = 0;

	/* Plumb in our fake MSI-X capability, if we have it. */
	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
	if (msix) {
		/* Add a shortcut to the PBA region for the MMIO handler */
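		/*
		 * VFIO indexes BAR regions starting at
		 * VFIO_PCI_BAR0_REGION_INDEX, hence the offset by the PBA BIR.
		 */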
848c9888d95SJean-Philippe Brucker int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
849f93acc04SAlexandru Elisei u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET;
850f93acc04SAlexandru Elisei
8515f44d5d6SAlexandru Elisei pdev->msix_pba.fd_offset = vdev->regions[pba_index].info.offset +
852f93acc04SAlexandru Elisei pba_bar_offset;
853c9888d95SJean-Philippe Brucker
854c9888d95SJean-Philippe Brucker /* Tidy up the capability */
855c9888d95SJean-Philippe Brucker msix->table_offset &= PCI_MSIX_TABLE_BIR;
856f93acc04SAlexandru Elisei if (pdev->msix_table.bar == pdev->msix_pba.bar) {
857f93acc04SAlexandru Elisei /* Keep the same offset as the MSIX cap. */
858f93acc04SAlexandru Elisei pdev->msix_pba.bar_offset = pba_bar_offset;
859f93acc04SAlexandru Elisei } else {
860f93acc04SAlexandru Elisei /* PBA is at the start of the BAR. */
861c9888d95SJean-Philippe Brucker msix->pba_offset &= PCI_MSIX_PBA_BIR;
862f93acc04SAlexandru Elisei pdev->msix_pba.bar_offset = 0;
863f93acc04SAlexandru Elisei }
864c9888d95SJean-Philippe Brucker }
865c9888d95SJean-Philippe Brucker
8666078a454SJean-Philippe Brucker /* Install our fake Configuration Space */
8676078a454SJean-Philippe Brucker info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
868e69b7663SAlexandru Elisei /*
869e69b7663SAlexandru Elisei * We don't touch the extended configuration space, let's be cautious
870e69b7663SAlexandru Elisei * and not overwrite it all with zeros, or bad things might happen.
871e69b7663SAlexandru Elisei */
872e69b7663SAlexandru Elisei hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
8736078a454SJean-Philippe Brucker if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
8746078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
8756078a454SJean-Philippe Brucker hdr_sz);
8766078a454SJean-Philippe Brucker return -EIO;
8776078a454SJean-Philippe Brucker }
8786078a454SJean-Philippe Brucker
8796078a454SJean-Philippe Brucker /* Register callbacks for cfg accesses */
8806078a454SJean-Philippe Brucker pdev->hdr.cfg_ops = (struct pci_config_operations) {
8816078a454SJean-Philippe Brucker .read = vfio_pci_cfg_read,
8826078a454SJean-Philippe Brucker .write = vfio_pci_cfg_write,
8836078a454SJean-Philippe Brucker };
8846078a454SJean-Philippe Brucker
8856078a454SJean-Philippe Brucker pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;
8866078a454SJean-Philippe Brucker
8876078a454SJean-Philippe Brucker return 0;
8886078a454SJean-Philippe Brucker }
8896078a454SJean-Philippe Brucker
vfio_pci_get_region_info(struct vfio_device * vdev,u32 index,struct vfio_region_info * info)890ed01a603SAlexandru Elisei static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index,
891ed01a603SAlexandru Elisei struct vfio_region_info *info)
892ed01a603SAlexandru Elisei {
893ed01a603SAlexandru Elisei int ret;
894ed01a603SAlexandru Elisei
895ed01a603SAlexandru Elisei *info = (struct vfio_region_info) {
896ed01a603SAlexandru Elisei .argsz = sizeof(*info),
897ed01a603SAlexandru Elisei .index = index,
898ed01a603SAlexandru Elisei };
899ed01a603SAlexandru Elisei
900ed01a603SAlexandru Elisei ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
901ed01a603SAlexandru Elisei if (ret) {
902ed01a603SAlexandru Elisei ret = -errno;
903ed01a603SAlexandru Elisei vfio_dev_err(vdev, "cannot get info for BAR %u", index);
904ed01a603SAlexandru Elisei return ret;
905ed01a603SAlexandru Elisei }
906ed01a603SAlexandru Elisei
907ed01a603SAlexandru Elisei if (info->size && !is_power_of_two(info->size)) {
908ed01a603SAlexandru Elisei vfio_dev_err(vdev, "region is not power of two: 0x%llx",
909ed01a603SAlexandru Elisei info->size);
910ed01a603SAlexandru Elisei return -EINVAL;
911ed01a603SAlexandru Elisei }
912ed01a603SAlexandru Elisei
913ed01a603SAlexandru Elisei return 0;
914ed01a603SAlexandru Elisei }
915ed01a603SAlexandru Elisei
vfio_pci_create_msix_table(struct kvm * kvm,struct vfio_device * vdev)916ed01a603SAlexandru Elisei static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
917c9888d95SJean-Philippe Brucker {
918c9888d95SJean-Philippe Brucker int ret;
919c9888d95SJean-Philippe Brucker size_t i;
920ed01a603SAlexandru Elisei size_t map_size;
921c9888d95SJean-Philippe Brucker size_t nr_entries;
922c9888d95SJean-Philippe Brucker struct vfio_pci_msi_entry *entries;
923ed01a603SAlexandru Elisei struct vfio_pci_device *pdev = &vdev->pci;
924c9888d95SJean-Philippe Brucker struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
925c9888d95SJean-Philippe Brucker struct vfio_pci_msix_table *table = &pdev->msix_table;
926c9888d95SJean-Philippe Brucker struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
927ed01a603SAlexandru Elisei struct vfio_region_info info;
928c9888d95SJean-Philippe Brucker
929c9888d95SJean-Philippe Brucker table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
930c9888d95SJean-Philippe Brucker pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;
931c9888d95SJean-Philippe Brucker
932c9888d95SJean-Philippe Brucker nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
933f93acc04SAlexandru Elisei
934f93acc04SAlexandru Elisei /* MSIX table and PBA must support QWORD accesses. */
935f93acc04SAlexandru Elisei table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, 8);
936f93acc04SAlexandru Elisei pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), 8);
937c9888d95SJean-Philippe Brucker
938c9888d95SJean-Philippe Brucker entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
939c9888d95SJean-Philippe Brucker if (!entries)
940c9888d95SJean-Philippe Brucker return -ENOMEM;
941c9888d95SJean-Philippe Brucker
942c9888d95SJean-Philippe Brucker for (i = 0; i < nr_entries; i++)
943c9888d95SJean-Philippe Brucker entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
944c9888d95SJean-Philippe Brucker
945ed01a603SAlexandru Elisei ret = vfio_pci_get_region_info(vdev, table->bar, &info);
946ed01a603SAlexandru Elisei if (ret)
947ed01a603SAlexandru Elisei goto out_free;
948ed01a603SAlexandru Elisei if (!info.size) {
949ed01a603SAlexandru Elisei ret = -EINVAL;
goto out_free;
}
950ed01a603SAlexandru Elisei
95139181fc6SAlexandru Elisei map_size = ALIGN(info.size, MAX_PAGE_SIZE);
952ed01a603SAlexandru Elisei table->guest_phys_addr = pci_get_mmio_block(map_size);
953c9888d95SJean-Philippe Brucker if (!table->guest_phys_addr) {
954ed01a603SAlexandru Elisei pr_err("cannot allocate MMIO space");
955c9888d95SJean-Philippe Brucker ret = -ENOMEM;
956c9888d95SJean-Philippe Brucker goto out_free;
957c9888d95SJean-Philippe Brucker }
958c9888d95SJean-Philippe Brucker
959c9888d95SJean-Philippe Brucker /*
960c9888d95SJean-Philippe Brucker * We could map the physical PBA directly into the guest, but it's
961c9888d95SJean-Philippe Brucker * likely smaller than a page, and we can only hand full pages to the
962c9888d95SJean-Philippe Brucker * guest. Even though the PCI spec disallows sharing a page used for
963c9888d95SJean-Philippe Brucker * MSI-X with any other resource, it allows the MSI-X table and the PBA to
964c9888d95SJean-Philippe Brucker * share one. For the sake of isolation, create a
965c9888d95SJean-Philippe Brucker * virtual PBA.
966c9888d95SJean-Philippe Brucker */
967f93acc04SAlexandru Elisei if (table->bar == pba->bar) {
968f93acc04SAlexandru Elisei u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET;
969f93acc04SAlexandru Elisei
970f93acc04SAlexandru Elisei /* Sanity checks. */
971f93acc04SAlexandru Elisei if (table->size > pba_bar_offset)
972f93acc04SAlexandru Elisei die("MSIX table overlaps with PBA");
973f93acc04SAlexandru Elisei if (pba_bar_offset + pba->size > info.size)
974f93acc04SAlexandru Elisei die("PBA exceeds the size of the region");
975f93acc04SAlexandru Elisei pba->guest_phys_addr = table->guest_phys_addr + pba_bar_offset;
976f93acc04SAlexandru Elisei } else {
977f93acc04SAlexandru Elisei ret = vfio_pci_get_region_info(vdev, pba->bar, &info);
978f93acc04SAlexandru Elisei if (ret)
979f93acc04SAlexandru Elisei goto out_free;
980f93acc04SAlexandru Elisei if (!info.size) {
981f93acc04SAlexandru Elisei ret = -EINVAL;
goto out_free;
}
982f93acc04SAlexandru Elisei
98339181fc6SAlexandru Elisei map_size = ALIGN(info.size, MAX_PAGE_SIZE);
984f93acc04SAlexandru Elisei pba->guest_phys_addr = pci_get_mmio_block(map_size);
985f93acc04SAlexandru Elisei if (!pba->guest_phys_addr) {
986f93acc04SAlexandru Elisei pr_err("cannot allocate MMIO space");
987f93acc04SAlexandru Elisei ret = -ENOMEM;
988f93acc04SAlexandru Elisei goto out_free;
989f93acc04SAlexandru Elisei }
990f93acc04SAlexandru Elisei }
991c9888d95SJean-Philippe Brucker
992c9888d95SJean-Philippe Brucker pdev->msix.entries = entries;
993c9888d95SJean-Philippe Brucker pdev->msix.nr_entries = nr_entries;
994c9888d95SJean-Philippe Brucker
995c9888d95SJean-Philippe Brucker return 0;
996c9888d95SJean-Philippe Brucker
997c9888d95SJean-Philippe Brucker out_free:
998c9888d95SJean-Philippe Brucker free(entries);
999c9888d95SJean-Philippe Brucker
1000c9888d95SJean-Philippe Brucker return ret;
1001c9888d95SJean-Philippe Brucker }
1002c9888d95SJean-Philippe Brucker
10038dd28afeSJean-Philippe Brucker static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
10048dd28afeSJean-Philippe Brucker {
10058dd28afeSJean-Philippe Brucker struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);
10068dd28afeSJean-Philippe Brucker
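/*
 * The Multiple Message Capable field (PCI_MSI_FLAGS_QMASK, bits 3:1 of the
 * Message Control register) encodes log2 of the number of vectors the
 * device requests.
 */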
10078dd28afeSJean-Philippe Brucker pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
10088dd28afeSJean-Philippe Brucker pdev->msi.entries = calloc(pdev->msi.nr_entries,
10098dd28afeSJean-Philippe Brucker sizeof(struct vfio_pci_msi_entry));
10108dd28afeSJean-Philippe Brucker if (!pdev->msi.entries)
10118dd28afeSJean-Philippe Brucker return -ENOMEM;
10128dd28afeSJean-Philippe Brucker
10138dd28afeSJean-Philippe Brucker return 0;
10148dd28afeSJean-Philippe Brucker }
10158dd28afeSJean-Philippe Brucker
10166078a454SJean-Philippe Brucker static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
10176078a454SJean-Philippe Brucker size_t nr)
10186078a454SJean-Philippe Brucker {
10196078a454SJean-Philippe Brucker int ret;
102082caa882SJean-Philippe Brucker u32 bar;
10216078a454SJean-Philippe Brucker size_t map_size;
1022c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci;
10233665392aSAlexandru Elisei struct vfio_region *region;
10246078a454SJean-Philippe Brucker
10256078a454SJean-Philippe Brucker if (nr >= vdev->info.num_regions)
10266078a454SJean-Philippe Brucker return 0;
10276078a454SJean-Philippe Brucker
10283665392aSAlexandru Elisei region = &vdev->regions[nr];
102982caa882SJean-Philippe Brucker bar = pdev->hdr.bar[nr];
103082caa882SJean-Philippe Brucker
103182caa882SJean-Philippe Brucker region->vdev = vdev;
103282caa882SJean-Philippe Brucker region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);
10336078a454SJean-Philippe Brucker
1034ed01a603SAlexandru Elisei ret = vfio_pci_get_region_info(vdev, nr, &region->info);
1035ed01a603SAlexandru Elisei if (ret)
10366078a454SJean-Philippe Brucker return ret;
10376078a454SJean-Philippe Brucker
10386078a454SJean-Philippe Brucker /* Ignore invalid or unimplemented regions */
10396078a454SJean-Philippe Brucker if (!region->info.size)
10406078a454SJean-Philippe Brucker return 0;
10416078a454SJean-Philippe Brucker
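/*
 * The MSI-X table and PBA already received guest physical addresses in
 * vfio_pci_create_msix_table(); reuse them instead of allocating fresh
 * MMIO space for those BARs.
 */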
1042c9888d95SJean-Philippe Brucker if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1043c9888d95SJean-Philippe Brucker /* Trap and emulate MSI-X table */
1044c9888d95SJean-Philippe Brucker if (nr == pdev->msix_table.bar) {
1045c9888d95SJean-Philippe Brucker region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
1046c9888d95SJean-Philippe Brucker return 0;
1047c9888d95SJean-Philippe Brucker } else if (nr == pdev->msix_pba.bar) {
1048c9888d95SJean-Philippe Brucker region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
1049c9888d95SJean-Philippe Brucker return 0;
1050c9888d95SJean-Philippe Brucker }
1051c9888d95SJean-Philippe Brucker }
1052c9888d95SJean-Philippe Brucker
1053a05e576fSAlexandru Elisei if (region->is_ioport) {
1054a05e576fSAlexandru Elisei region->port_base = pci_get_io_port_block(region->info.size);
1055a05e576fSAlexandru Elisei } else {
10566078a454SJean-Philippe Brucker /* Grab some MMIO space in the guest */
10576078a454SJean-Philippe Brucker map_size = ALIGN(region->info.size, PAGE_SIZE);
1058854aa2efSJulien Thierry region->guest_phys_addr = pci_get_mmio_block(map_size);
105982caa882SJean-Philippe Brucker }
10606078a454SJean-Philippe Brucker
10616078a454SJean-Philippe Brucker return 0;
10626078a454SJean-Philippe Brucker }
10636078a454SJean-Philippe Brucker
10646078a454SJean-Philippe Brucker static int vfio_pci_configure_dev_regions(struct kvm *kvm,
10656078a454SJean-Philippe Brucker struct vfio_device *vdev)
10666078a454SJean-Philippe Brucker {
10676078a454SJean-Philippe Brucker int ret;
10686078a454SJean-Philippe Brucker u32 bar;
10696078a454SJean-Philippe Brucker size_t i;
10706078a454SJean-Philippe Brucker bool is_64bit = false;
10716078a454SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci;
10726078a454SJean-Philippe Brucker
10736078a454SJean-Philippe Brucker ret = vfio_pci_parse_cfg_space(vdev);
10746078a454SJean-Philippe Brucker if (ret)
10756078a454SJean-Philippe Brucker return ret;
10766078a454SJean-Philippe Brucker
1077c9888d95SJean-Philippe Brucker if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1078ed01a603SAlexandru Elisei ret = vfio_pci_create_msix_table(kvm, vdev);
1079c9888d95SJean-Philippe Brucker if (ret)
1080c9888d95SJean-Philippe Brucker return ret;
1081c9888d95SJean-Philippe Brucker }
1082c9888d95SJean-Philippe Brucker
10838dd28afeSJean-Philippe Brucker if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
10848dd28afeSJean-Philippe Brucker ret = vfio_pci_create_msi_cap(kvm, pdev);
10858dd28afeSJean-Philippe Brucker if (ret)
10868dd28afeSJean-Philippe Brucker return ret;
10878dd28afeSJean-Philippe Brucker }
10888dd28afeSJean-Philippe Brucker
10896078a454SJean-Philippe Brucker for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
10906078a454SJean-Philippe Brucker /* Ignore top half of 64-bit BAR */
109184998f21SAlexandru Elisei if (is_64bit) {
109284998f21SAlexandru Elisei is_64bit = false;
10936078a454SJean-Philippe Brucker continue;
109484998f21SAlexandru Elisei }
10956078a454SJean-Philippe Brucker
10966078a454SJean-Philippe Brucker ret = vfio_pci_configure_bar(kvm, vdev, i);
10976078a454SJean-Philippe Brucker if (ret)
10986078a454SJean-Philippe Brucker return ret;
10996078a454SJean-Philippe Brucker
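/*
 * A 64-bit memory BAR spans two consecutive BAR registers; note it here
 * so the upper half is skipped on the next iteration.
 */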
11006078a454SJean-Philippe Brucker bar = pdev->hdr.bar[i];
11016078a454SJean-Philippe Brucker is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
11026078a454SJean-Philippe Brucker PCI_BASE_ADDRESS_SPACE_MEMORY &&
11036078a454SJean-Philippe Brucker bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
11046078a454SJean-Philippe Brucker }
11056078a454SJean-Philippe Brucker
11066078a454SJean-Philippe Brucker /* We've configured the BARs, fake up a Configuration Space */
11075a8e4f25SAlexandru Elisei ret = vfio_pci_fixup_cfg_space(vdev);
11085a8e4f25SAlexandru Elisei if (ret)
11095a8e4f25SAlexandru Elisei return ret;
11105a8e4f25SAlexandru Elisei
11115a8e4f25SAlexandru Elisei return pci__register_bar_regions(kvm, &pdev->hdr, vfio_pci_bar_activate,
11125a8e4f25SAlexandru Elisei vfio_pci_bar_deactivate, vdev);
11136078a454SJean-Philippe Brucker }
11146078a454SJean-Philippe Brucker
1115c9888d95SJean-Philippe Brucker /*
1116c9888d95SJean-Philippe Brucker * Attempt to update the FD limit if opening an eventfd for each IRQ vector
1117c9888d95SJean-Philippe Brucker * would hit it, which is likely to happen when a device uses 2048 MSIs.
1118c9888d95SJean-Philippe Brucker */
1119c9888d95SJean-Philippe Brucker static int vfio_pci_reserve_irq_fds(size_t num)
1120c9888d95SJean-Philippe Brucker {
1121c9888d95SJean-Philippe Brucker /*
1122c9888d95SJean-Philippe Brucker * I counted around 27 fds under normal load. Let's add 100 for good
1123c9888d95SJean-Philippe Brucker * measure.
1124c9888d95SJean-Philippe Brucker */
1125c9888d95SJean-Philippe Brucker static size_t needed = 128;
1126c9888d95SJean-Philippe Brucker struct rlimit fd_limit, new_limit;
1127c9888d95SJean-Philippe Brucker
1128c9888d95SJean-Philippe Brucker needed += num;
1129c9888d95SJean-Philippe Brucker
1130c9888d95SJean-Philippe Brucker if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
1131c9888d95SJean-Philippe Brucker perror("getrlimit(RLIMIT_NOFILE)");
1132c9888d95SJean-Philippe Brucker return 0;
1133c9888d95SJean-Philippe Brucker }
1134c9888d95SJean-Philippe Brucker
1135c9888d95SJean-Philippe Brucker if (fd_limit.rlim_cur >= needed)
1136c9888d95SJean-Philippe Brucker return 0;
1137c9888d95SJean-Philippe Brucker
1138c9888d95SJean-Philippe Brucker new_limit.rlim_cur = needed;
1139c9888d95SJean-Philippe Brucker
1140c9888d95SJean-Philippe Brucker if (fd_limit.rlim_max < needed)
1141c9888d95SJean-Philippe Brucker /* Try to bump hard limit (root only) */
1142c9888d95SJean-Philippe Brucker new_limit.rlim_max = needed;
1143c9888d95SJean-Philippe Brucker else
1144c9888d95SJean-Philippe Brucker new_limit.rlim_max = fd_limit.rlim_max;
1145c9888d95SJean-Philippe Brucker
1146c9888d95SJean-Philippe Brucker if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
1147c9888d95SJean-Philippe Brucker perror("setrlimit(RLIMIT_NOFILE)");
1148c9888d95SJean-Philippe Brucker pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
1149c9888d95SJean-Philippe Brucker (size_t)(needed - fd_limit.rlim_cur));
1150c9888d95SJean-Philippe Brucker }
1151c9888d95SJean-Philippe Brucker
1152c9888d95SJean-Philippe Brucker return 0;
1153c9888d95SJean-Philippe Brucker }
1154c9888d95SJean-Philippe Brucker
1155c9888d95SJean-Philippe Brucker static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
1156c9888d95SJean-Philippe Brucker struct vfio_pci_msi_common *msis)
1157c9888d95SJean-Philippe Brucker {
1158c9888d95SJean-Philippe Brucker int ret;
1159c9888d95SJean-Philippe Brucker size_t i;
1160c9888d95SJean-Philippe Brucker int *eventfds;
1161c9888d95SJean-Philippe Brucker size_t irq_set_size;
1162c9888d95SJean-Philippe Brucker struct vfio_pci_msi_entry *entry;
1163c9888d95SJean-Philippe Brucker size_t nr_entries = msis->nr_entries;
1164c9888d95SJean-Philippe Brucker
1165c9888d95SJean-Philippe Brucker ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
116609533d3cSAndre Przywara if (ret || msis->info.count == 0) {
1167c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "no MSI reported by VFIO");
1168c9888d95SJean-Philippe Brucker return -ENODEV;
1169c9888d95SJean-Philippe Brucker }
1170c9888d95SJean-Philippe Brucker
1171c9888d95SJean-Philippe Brucker if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
1172c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "interrupt not EVENTFD capable");
1173c9888d95SJean-Philippe Brucker return -EINVAL;
1174c9888d95SJean-Philippe Brucker }
1175c9888d95SJean-Philippe Brucker
1176c9888d95SJean-Philippe Brucker if (msis->info.count != nr_entries) {
1177c9888d95SJean-Philippe Brucker vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
1178c9888d95SJean-Philippe Brucker return -EINVAL;
1179c9888d95SJean-Philippe Brucker }
1180c9888d95SJean-Philippe Brucker
1181c9888d95SJean-Philippe Brucker mutex_init(&msis->mutex);
1182c9888d95SJean-Philippe Brucker
1183c9888d95SJean-Philippe Brucker vfio_pci_reserve_irq_fds(nr_entries);
1184c9888d95SJean-Philippe Brucker
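/*
 * The SET_IRQS argument is the fixed vfio_irq_set header followed by one
 * eventfd (int) per vector.
 */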
1185c9888d95SJean-Philippe Brucker irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
1186c9888d95SJean-Philippe Brucker msis->irq_set = malloc(irq_set_size);
1187c9888d95SJean-Philippe Brucker if (!msis->irq_set)
1188c9888d95SJean-Philippe Brucker return -ENOMEM;
1189c9888d95SJean-Philippe Brucker
1190c9888d95SJean-Philippe Brucker *msis->irq_set = (struct vfio_irq_set) {
1191c9888d95SJean-Philippe Brucker .argsz = irq_set_size,
1192c9888d95SJean-Philippe Brucker .flags = VFIO_IRQ_SET_DATA_EVENTFD |
1193c9888d95SJean-Philippe Brucker VFIO_IRQ_SET_ACTION_TRIGGER,
1194c9888d95SJean-Philippe Brucker .index = msis->info.index,
1195c9888d95SJean-Philippe Brucker .start = 0,
1196c9888d95SJean-Philippe Brucker .count = nr_entries,
1197c9888d95SJean-Philippe Brucker };
1198c9888d95SJean-Philippe Brucker
1199c9888d95SJean-Philippe Brucker eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
1200c9888d95SJean-Philippe Brucker
1201c9888d95SJean-Philippe Brucker for (i = 0; i < nr_entries; i++) {
1202c9888d95SJean-Philippe Brucker entry = &msis->entries[i];
1203c9888d95SJean-Philippe Brucker entry->gsi = -1;
1204c9888d95SJean-Philippe Brucker entry->eventfd = -1;
1205*0b5e55fcSJean-Philippe Brucker msi_set_masked(entry->guest_state, false);
1206*0b5e55fcSJean-Philippe Brucker msi_set_masked(entry->host_state, true);
1207c9888d95SJean-Philippe Brucker eventfds[i] = -1;
1208c9888d95SJean-Philippe Brucker }
1209c9888d95SJean-Philippe Brucker
1210c9888d95SJean-Philippe Brucker return 0;
1211c9888d95SJean-Philippe Brucker }
1212c9888d95SJean-Philippe Brucker
1213c9888d95SJean-Philippe Brucker static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
1214c9888d95SJean-Philippe Brucker {
1215c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci;
1216c9888d95SJean-Philippe Brucker int gsi = pdev->intx_gsi;
1217c9888d95SJean-Philippe Brucker struct vfio_irq_set irq_set = {
1218c9888d95SJean-Philippe Brucker .argsz = sizeof(irq_set),
1219c9888d95SJean-Philippe Brucker .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
1220c9888d95SJean-Philippe Brucker .index = VFIO_PCI_INTX_IRQ_INDEX,
1221c9888d95SJean-Philippe Brucker };
1222c9888d95SJean-Philippe Brucker
12237302327aSLeo Yan if (pdev->intx_fd == -1)
12247302327aSLeo Yan return;
12257302327aSLeo Yan
1226c9888d95SJean-Philippe Brucker pr_debug("user requested MSI, disabling INTx %d", gsi);
1227c9888d95SJean-Philippe Brucker
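/*
 * DATA_NONE with count == 0 and ACTION_TRIGGER tells VFIO to tear down the
 * INTx trigger completely.
 */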
1228c9888d95SJean-Philippe Brucker ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
1229c9888d95SJean-Philippe Brucker irq__del_irqfd(kvm, gsi, pdev->intx_fd);
1230c9888d95SJean-Philippe Brucker
1231c9888d95SJean-Philippe Brucker close(pdev->intx_fd);
1232a1ff6f87SLeo Yan close(pdev->unmask_fd);
12337302327aSLeo Yan pdev->intx_fd = -1;
1234c9888d95SJean-Philippe Brucker }
1235c9888d95SJean-Philippe Brucker
12366078a454SJean-Philippe Brucker static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
12376078a454SJean-Philippe Brucker {
12386078a454SJean-Philippe Brucker int ret;
12396078a454SJean-Philippe Brucker int trigger_fd, unmask_fd;
1240a3704b91SAndre Przywara union vfio_irq_eventfd trigger;
1241a3704b91SAndre Przywara union vfio_irq_eventfd unmask;
12426078a454SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci;
124312bd7a16SLeo Yan int gsi = pdev->intx_gsi;
12446078a454SJean-Philippe Brucker
12457302327aSLeo Yan if (pdev->intx_fd != -1)
12467302327aSLeo Yan return 0;
12477302327aSLeo Yan
12486078a454SJean-Philippe Brucker /*
12496078a454SJean-Philippe Brucker * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
12506078a454SJean-Philippe Brucker * signals an interrupt from host to guest, and unmask_fd signals the
12516078a454SJean-Philippe Brucker * deassertion of the line from guest to host.
12526078a454SJean-Philippe Brucker */
12536078a454SJean-Philippe Brucker trigger_fd = eventfd(0, 0);
12546078a454SJean-Philippe Brucker if (trigger_fd < 0) {
12556078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to create trigger eventfd");
12566078a454SJean-Philippe Brucker return trigger_fd;
12576078a454SJean-Philippe Brucker }
12586078a454SJean-Philippe Brucker
12596078a454SJean-Philippe Brucker unmask_fd = eventfd(0, 0);
12606078a454SJean-Philippe Brucker if (unmask_fd < 0) {
12616078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to create unmask eventfd");
12626078a454SJean-Philippe Brucker close(trigger_fd);
12636078a454SJean-Philippe Brucker return unmask_fd;
12646078a454SJean-Philippe Brucker }
12656078a454SJean-Philippe Brucker
12666078a454SJean-Philippe Brucker ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
12676078a454SJean-Philippe Brucker if (ret)
12686078a454SJean-Philippe Brucker goto err_close;
12696078a454SJean-Philippe Brucker
12706078a454SJean-Philippe Brucker trigger.irq = (struct vfio_irq_set) {
12716078a454SJean-Philippe Brucker .argsz = sizeof(trigger),
12726078a454SJean-Philippe Brucker .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
12736078a454SJean-Philippe Brucker .index = VFIO_PCI_INTX_IRQ_INDEX,
12746078a454SJean-Philippe Brucker .start = 0,
12756078a454SJean-Philippe Brucker .count = 1,
12766078a454SJean-Philippe Brucker };
1277a3704b91SAndre Przywara set_vfio_irq_eventd_payload(&trigger, trigger_fd);
12786078a454SJean-Philippe Brucker
12796078a454SJean-Philippe Brucker ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
12806078a454SJean-Philippe Brucker if (ret < 0) {
12816078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to setup VFIO IRQ");
12826078a454SJean-Philippe Brucker goto err_delete_line;
12836078a454SJean-Philippe Brucker }
12846078a454SJean-Philippe Brucker
12856078a454SJean-Philippe Brucker unmask.irq = (struct vfio_irq_set) {
12866078a454SJean-Philippe Brucker .argsz = sizeof(unmask),
12876078a454SJean-Philippe Brucker .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
12886078a454SJean-Philippe Brucker .index = VFIO_PCI_INTX_IRQ_INDEX,
12896078a454SJean-Philippe Brucker .start = 0,
12906078a454SJean-Philippe Brucker .count = 1,
12916078a454SJean-Philippe Brucker };
1292a3704b91SAndre Przywara set_vfio_irq_eventd_payload(&unmask, unmask_fd);
12936078a454SJean-Philippe Brucker
12946078a454SJean-Philippe Brucker ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
12956078a454SJean-Philippe Brucker if (ret < 0) {
12966078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to setup unmask IRQ");
12976078a454SJean-Philippe Brucker goto err_remove_event;
12986078a454SJean-Philippe Brucker }
12996078a454SJean-Philippe Brucker
1300c9888d95SJean-Philippe Brucker pdev->intx_fd = trigger_fd;
1301a1ff6f87SLeo Yan pdev->unmask_fd = unmask_fd;
1302c9888d95SJean-Philippe Brucker
13036078a454SJean-Philippe Brucker return 0;
13046078a454SJean-Philippe Brucker
13056078a454SJean-Philippe Brucker err_remove_event:
13066078a454SJean-Philippe Brucker /* Remove trigger event */
13076078a454SJean-Philippe Brucker trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
13086078a454SJean-Philippe Brucker trigger.irq.count = 0;
13096078a454SJean-Philippe Brucker ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
13106078a454SJean-Philippe Brucker
13116078a454SJean-Philippe Brucker err_delete_line:
13126078a454SJean-Philippe Brucker irq__del_irqfd(kvm, gsi, trigger_fd);
13136078a454SJean-Philippe Brucker
13146078a454SJean-Philippe Brucker err_close:
13156078a454SJean-Philippe Brucker close(trigger_fd);
13166078a454SJean-Philippe Brucker close(unmask_fd);
13176078a454SJean-Philippe Brucker return ret;
13186078a454SJean-Philippe Brucker }
13196078a454SJean-Philippe Brucker
132012bd7a16SLeo Yan static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
132112bd7a16SLeo Yan {
132212bd7a16SLeo Yan int ret;
132312bd7a16SLeo Yan struct vfio_pci_device *pdev = &vdev->pci;
132412bd7a16SLeo Yan struct vfio_irq_info irq_info = {
132512bd7a16SLeo Yan .argsz = sizeof(irq_info),
132612bd7a16SLeo Yan .index = VFIO_PCI_INTX_IRQ_INDEX,
132712bd7a16SLeo Yan };
132812bd7a16SLeo Yan
132912bd7a16SLeo Yan vfio_pci_reserve_irq_fds(2);
133012bd7a16SLeo Yan
133112bd7a16SLeo Yan ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
133212bd7a16SLeo Yan if (ret || irq_info.count == 0) {
133312bd7a16SLeo Yan vfio_dev_err(vdev, "no INTx reported by VFIO");
133412bd7a16SLeo Yan return -ENODEV;
133512bd7a16SLeo Yan }
133612bd7a16SLeo Yan
133712bd7a16SLeo Yan if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
133812bd7a16SLeo Yan vfio_dev_err(vdev, "interrupt not eventfd capable");
133912bd7a16SLeo Yan return -EINVAL;
134012bd7a16SLeo Yan }
134112bd7a16SLeo Yan
134212bd7a16SLeo Yan if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
134312bd7a16SLeo Yan vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
134412bd7a16SLeo Yan return -EINVAL;
134512bd7a16SLeo Yan }
134612bd7a16SLeo Yan
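/* irq_line, assigned by pci__assign_irq(), is the GSI plus KVM_IRQ_OFFSET. */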
134712bd7a16SLeo Yan /* Guest is going to overwrite our irq_line... */
134812bd7a16SLeo Yan pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
134912bd7a16SLeo Yan
13507302327aSLeo Yan pdev->intx_fd = -1;
13517302327aSLeo Yan
135212bd7a16SLeo Yan return 0;
135312bd7a16SLeo Yan }
135412bd7a16SLeo Yan
13556078a454SJean-Philippe Brucker static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
13566078a454SJean-Philippe Brucker {
1357c9888d95SJean-Philippe Brucker int ret = 0;
13586078a454SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci;
13596078a454SJean-Philippe Brucker
1360c9888d95SJean-Philippe Brucker if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1361c9888d95SJean-Philippe Brucker pdev->msix.info = (struct vfio_irq_info) {
1362c9888d95SJean-Philippe Brucker .argsz = sizeof(pdev->msix.info),
1363c9888d95SJean-Philippe Brucker .index = VFIO_PCI_MSIX_IRQ_INDEX,
13646078a454SJean-Philippe Brucker };
1365c9888d95SJean-Philippe Brucker ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
1366c9888d95SJean-Philippe Brucker if (ret)
1367c9888d95SJean-Philippe Brucker return ret;
13686078a454SJean-Philippe Brucker }
13696078a454SJean-Philippe Brucker
13708dd28afeSJean-Philippe Brucker if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
13718dd28afeSJean-Philippe Brucker pdev->msi.info = (struct vfio_irq_info) {
13728dd28afeSJean-Philippe Brucker .argsz = sizeof(pdev->msi.info),
13738dd28afeSJean-Philippe Brucker .index = VFIO_PCI_MSI_IRQ_INDEX,
13748dd28afeSJean-Philippe Brucker };
13758dd28afeSJean-Philippe Brucker ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
13768dd28afeSJean-Philippe Brucker if (ret)
13778dd28afeSJean-Philippe Brucker return ret;
13788dd28afeSJean-Philippe Brucker }
13798dd28afeSJean-Philippe Brucker
138012bd7a16SLeo Yan if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
1381c0c45eedSAndre Przywara pci__assign_irq(&vdev->pci.hdr);
1382c0c45eedSAndre Przywara
138312bd7a16SLeo Yan ret = vfio_pci_init_intx(kvm, vdev);
138412bd7a16SLeo Yan if (ret)
138512bd7a16SLeo Yan return ret;
138612bd7a16SLeo Yan
1387c9888d95SJean-Philippe Brucker ret = vfio_pci_enable_intx(kvm, vdev);
138812bd7a16SLeo Yan }
1389c9888d95SJean-Philippe Brucker
1390c9888d95SJean-Philippe Brucker return ret;
13916078a454SJean-Philippe Brucker }
13926078a454SJean-Philippe Brucker
13936078a454SJean-Philippe Brucker int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
13946078a454SJean-Philippe Brucker {
13956078a454SJean-Philippe Brucker int ret;
13966078a454SJean-Philippe Brucker
13976078a454SJean-Philippe Brucker ret = vfio_pci_configure_dev_regions(kvm, vdev);
13986078a454SJean-Philippe Brucker if (ret) {
13996078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to configure regions");
14006078a454SJean-Philippe Brucker return ret;
14016078a454SJean-Philippe Brucker }
14026078a454SJean-Philippe Brucker
14036078a454SJean-Philippe Brucker vdev->dev_hdr = (struct device_header) {
14046078a454SJean-Philippe Brucker .bus_type = DEVICE_BUS_PCI,
14056078a454SJean-Philippe Brucker .data = &vdev->pci.hdr,
14066078a454SJean-Philippe Brucker };
14076078a454SJean-Philippe Brucker
14086078a454SJean-Philippe Brucker ret = device__register(&vdev->dev_hdr);
14096078a454SJean-Philippe Brucker if (ret) {
14106078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to register VFIO device");
14116078a454SJean-Philippe Brucker return ret;
14126078a454SJean-Philippe Brucker }
14136078a454SJean-Philippe Brucker
14146078a454SJean-Philippe Brucker ret = vfio_pci_configure_dev_irqs(kvm, vdev);
14156078a454SJean-Philippe Brucker if (ret) {
14166078a454SJean-Philippe Brucker vfio_dev_err(vdev, "failed to configure IRQs");
14176078a454SJean-Philippe Brucker return ret;
14186078a454SJean-Philippe Brucker }
14196078a454SJean-Philippe Brucker
14206078a454SJean-Philippe Brucker return 0;
14216078a454SJean-Philippe Brucker }
14226078a454SJean-Philippe Brucker
14236078a454SJean-Philippe Brucker void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
14246078a454SJean-Philippe Brucker {
14256078a454SJean-Philippe Brucker size_t i;
1426c9888d95SJean-Philippe Brucker struct vfio_pci_device *pdev = &vdev->pci;
14276078a454SJean-Philippe Brucker
14286078a454SJean-Philippe Brucker for (i = 0; i < vdev->info.num_regions; i++)
14296078a454SJean-Philippe Brucker vfio_unmap_region(kvm, &vdev->regions[i]);
14306078a454SJean-Philippe Brucker
14316078a454SJean-Philippe Brucker device__unregister(&vdev->dev_hdr);
1432c9888d95SJean-Philippe Brucker
1433c9888d95SJean-Philippe Brucker free(pdev->msix.irq_set);
1434c9888d95SJean-Philippe Brucker free(pdev->msix.entries);
14358dd28afeSJean-Philippe Brucker free(pdev->msi.irq_set);
14368dd28afeSJean-Philippe Brucker free(pdev->msi.entries);
14376078a454SJean-Philippe Brucker }