xref: /kvmtool/vfio/pci.c (revision 39181fc6429f4e9e71473284940e35857b42772a)
1*39181fc6SAlexandru Elisei #include "linux/sizes.h"
2*39181fc6SAlexandru Elisei 
36078a454SJean-Philippe Brucker #include "kvm/irq.h"
46078a454SJean-Philippe Brucker #include "kvm/kvm.h"
56078a454SJean-Philippe Brucker #include "kvm/kvm-cpu.h"
66078a454SJean-Philippe Brucker #include "kvm/vfio.h"
76078a454SJean-Philippe Brucker 
8e1d0285cSAlexandru Elisei #include <assert.h>
9e1d0285cSAlexandru Elisei 
106078a454SJean-Philippe Brucker #include <sys/ioctl.h>
116078a454SJean-Philippe Brucker #include <sys/eventfd.h>
12c9888d95SJean-Philippe Brucker #include <sys/resource.h>
13c9888d95SJean-Philippe Brucker #include <sys/time.h>
146078a454SJean-Philippe Brucker 
1525c1dc6cSAlexandru Elisei /* Some distros don't have the define. */
1625c1dc6cSAlexandru Elisei #ifndef PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1
1725c1dc6cSAlexandru Elisei #define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1	12
1825c1dc6cSAlexandru Elisei #endif
1925c1dc6cSAlexandru Elisei 
206078a454SJean-Philippe Brucker /* Wrapper around UAPI vfio_irq_set */
21a3704b91SAndre Przywara union vfio_irq_eventfd {
226078a454SJean-Philippe Brucker 	struct vfio_irq_set	irq;
23a3704b91SAndre Przywara 	u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)];
246078a454SJean-Philippe Brucker };
256078a454SJean-Philippe Brucker 
26a3704b91SAndre Przywara static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd)
27a3704b91SAndre Przywara {
28a3704b91SAndre Przywara 	memcpy(&evfd->irq.data, &fd, sizeof(fd));
29a3704b91SAndre Przywara }
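
/*
 * Note: struct vfio_irq_set ends in a flexible data[] array. With
 * VFIO_IRQ_SET_DATA_EVENTFD and count == 1 the payload is a single
 * 32-bit eventfd, which is why the union above reserves
 * sizeof(struct vfio_irq_set) + sizeof(int) bytes and the helper
 * memcpy()s the fd right after the header.
 */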
30a3704b91SAndre Przywara 
31c9888d95SJean-Philippe Brucker #define msi_is_enabled(state)		((state) & VFIO_PCI_MSI_STATE_ENABLED)
32c9888d95SJean-Philippe Brucker #define msi_is_masked(state)		((state) & VFIO_PCI_MSI_STATE_MASKED)
33c9888d95SJean-Philippe Brucker #define msi_is_empty(state)		((state) & VFIO_PCI_MSI_STATE_EMPTY)
34c9888d95SJean-Philippe Brucker 
35c9888d95SJean-Philippe Brucker #define msi_update_state(state, val, bit)				\
36c9888d95SJean-Philippe Brucker 	(state) = (val) ? (state) | bit : (state) & ~bit;
37c9888d95SJean-Philippe Brucker #define msi_set_enabled(state, val)					\
38c9888d95SJean-Philippe Brucker 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
39c9888d95SJean-Philippe Brucker #define msi_set_masked(state, val)					\
40c9888d95SJean-Philippe Brucker 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
41c9888d95SJean-Philippe Brucker #define msi_set_empty(state, val)					\
42c9888d95SJean-Philippe Brucker 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)
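
/*
 * Throughout this file, "virt_state" tracks what the guest has programmed
 * into the emulated capability, while "phys_state" tracks what has actually
 * been applied to the device via VFIO_DEVICE_SET_IRQS. The two are
 * reconciled lazily in vfio_pci_enable_msis() and
 * vfio_pci_update_msi_entry().
 */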
43c9888d95SJean-Philippe Brucker 
44c9888d95SJean-Philippe Brucker static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
457302327aSLeo Yan static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);
46c9888d95SJean-Philippe Brucker 
478dd28afeSJean-Philippe Brucker static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
488dd28afeSJean-Philippe Brucker 				bool msix)
49c9888d95SJean-Philippe Brucker {
50c9888d95SJean-Philippe Brucker 	size_t i;
51c9888d95SJean-Philippe Brucker 	int ret = 0;
52c9888d95SJean-Philippe Brucker 	int *eventfds;
53c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
548dd28afeSJean-Philippe Brucker 	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
55a3704b91SAndre Przywara 	union vfio_irq_eventfd single = {
56c9888d95SJean-Philippe Brucker 		.irq = {
57c9888d95SJean-Philippe Brucker 			.argsz	= sizeof(single),
58c9888d95SJean-Philippe Brucker 			.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
59c9888d95SJean-Philippe Brucker 				  VFIO_IRQ_SET_ACTION_TRIGGER,
60c9888d95SJean-Philippe Brucker 			.index	= msis->info.index,
61c9888d95SJean-Philippe Brucker 			.count	= 1,
62c9888d95SJean-Philippe Brucker 		},
63c9888d95SJean-Philippe Brucker 	};
64c9888d95SJean-Philippe Brucker 
65c9888d95SJean-Philippe Brucker 	if (!msi_is_enabled(msis->virt_state))
66c9888d95SJean-Philippe Brucker 		return 0;
67c9888d95SJean-Philippe Brucker 
687302327aSLeo Yan 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
69c9888d95SJean-Philippe Brucker 		/*
70c9888d95SJean-Philippe Brucker 		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
71c9888d95SJean-Philippe Brucker 		 * time. Since INTx has to be enabled from the start (we don't
727302327aSLeo Yan 		 * have a reliable way to know when the guest starts using it),
73c9888d95SJean-Philippe Brucker 		 * disable it now.
74c9888d95SJean-Philippe Brucker 		 */
75c9888d95SJean-Philippe Brucker 		vfio_pci_disable_intx(kvm, vdev);
76c9888d95SJean-Philippe Brucker 
77c9888d95SJean-Philippe Brucker 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
78c9888d95SJean-Philippe Brucker 
79c9888d95SJean-Philippe Brucker 	/*
80c9888d95SJean-Philippe Brucker 	 * Initial registration of the full range. This enables the physical
81c9888d95SJean-Philippe Brucker 	 * MSI/MSI-X capability, which might have desired side effects. For
82c9888d95SJean-Philippe Brucker 	 * instance when assigning virtio legacy devices, enabling the MSI
83c9888d95SJean-Philippe Brucker 	 * capability modifies the config space layout!
84c9888d95SJean-Philippe Brucker 	 *
85c9888d95SJean-Philippe Brucker 	 * As an optimization, only update MSIs when the guest unmasks the
86c9888d95SJean-Philippe Brucker 	 * capability. This greatly reduces the initialization time for a Linux
87c9888d95SJean-Philippe Brucker 	 * guest with 2048+ MSIs. A Linux guest starts by enabling the MSI-X cap
88c9888d95SJean-Philippe Brucker 	 * masked, then fills individual vectors, then unmasks the whole
89c9888d95SJean-Philippe Brucker 	 * function. So we only do one VFIO ioctl when enabling for the first
90c9888d95SJean-Philippe Brucker 	 * time, and then one when unmasking.
91c9888d95SJean-Philippe Brucker 	 *
92c9888d95SJean-Philippe Brucker 	 * phys_state is empty when it is enabled but no vector has been
93c9888d95SJean-Philippe Brucker 	 * registered via SET_IRQS yet.
94c9888d95SJean-Philippe Brucker 	 */
95c9888d95SJean-Philippe Brucker 	if (!msi_is_enabled(msis->phys_state) ||
96c9888d95SJean-Philippe Brucker 	    (!msi_is_masked(msis->virt_state) &&
97c9888d95SJean-Philippe Brucker 	     msi_is_empty(msis->phys_state))) {
98c9888d95SJean-Philippe Brucker 		bool empty = true;
99c9888d95SJean-Philippe Brucker 
100c9888d95SJean-Philippe Brucker 		for (i = 0; i < msis->nr_entries; i++) {
101c9888d95SJean-Philippe Brucker 			eventfds[i] = msis->entries[i].gsi >= 0 ?
102c9888d95SJean-Philippe Brucker 				      msis->entries[i].eventfd : -1;
103c9888d95SJean-Philippe Brucker 
104c9888d95SJean-Philippe Brucker 			if (eventfds[i] >= 0)
105c9888d95SJean-Philippe Brucker 				empty = false;
106c9888d95SJean-Philippe Brucker 		}
107c9888d95SJean-Philippe Brucker 
108c9888d95SJean-Philippe Brucker 		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
109c9888d95SJean-Philippe Brucker 		if (ret < 0) {
110c9888d95SJean-Philippe Brucker 			perror("VFIO_DEVICE_SET_IRQS(multi)");
111c9888d95SJean-Philippe Brucker 			return ret;
112c9888d95SJean-Philippe Brucker 		}
113c9888d95SJean-Philippe Brucker 
114c9888d95SJean-Philippe Brucker 		msi_set_enabled(msis->phys_state, true);
115c9888d95SJean-Philippe Brucker 		msi_set_empty(msis->phys_state, empty);
116c9888d95SJean-Philippe Brucker 
117c9888d95SJean-Philippe Brucker 		return 0;
118c9888d95SJean-Philippe Brucker 	}
119c9888d95SJean-Philippe Brucker 
120c9888d95SJean-Philippe Brucker 	if (msi_is_masked(msis->virt_state)) {
121c9888d95SJean-Philippe Brucker 		/* TODO: if phys_state is neither empty nor masked, mask all vectors */
122c9888d95SJean-Philippe Brucker 		return 0;
123c9888d95SJean-Philippe Brucker 	}
124c9888d95SJean-Philippe Brucker 
125c9888d95SJean-Philippe Brucker 	/* Update individual vectors to avoid breaking those in use */
126c9888d95SJean-Philippe Brucker 	for (i = 0; i < msis->nr_entries; i++) {
127c9888d95SJean-Philippe Brucker 		struct vfio_pci_msi_entry *entry = &msis->entries[i];
128c9888d95SJean-Philippe Brucker 		int fd = entry->gsi >= 0 ? entry->eventfd : -1;
129c9888d95SJean-Philippe Brucker 
130c9888d95SJean-Philippe Brucker 		if (fd == eventfds[i])
131c9888d95SJean-Philippe Brucker 			continue;
132c9888d95SJean-Philippe Brucker 
133c9888d95SJean-Philippe Brucker 		single.irq.start = i;
134a3704b91SAndre Przywara 		set_vfio_irq_eventd_payload(&single, fd);
135c9888d95SJean-Philippe Brucker 
136c9888d95SJean-Philippe Brucker 		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
137c9888d95SJean-Philippe Brucker 		if (ret < 0) {
138c9888d95SJean-Philippe Brucker 			perror("VFIO_DEVICE_SET_IRQS(single)");
139c9888d95SJean-Philippe Brucker 			break;
140c9888d95SJean-Philippe Brucker 		}
141c9888d95SJean-Philippe Brucker 
142c9888d95SJean-Philippe Brucker 		eventfds[i] = fd;
143c9888d95SJean-Philippe Brucker 
144c9888d95SJean-Philippe Brucker 		if (msi_is_empty(msis->phys_state) && fd >= 0)
145c9888d95SJean-Philippe Brucker 			msi_set_empty(msis->phys_state, false);
146c9888d95SJean-Philippe Brucker 	}
147c9888d95SJean-Philippe Brucker 
148c9888d95SJean-Philippe Brucker 	return ret;
149c9888d95SJean-Philippe Brucker }
150c9888d95SJean-Philippe Brucker 
1518dd28afeSJean-Philippe Brucker static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
1528dd28afeSJean-Philippe Brucker 				 bool msix)
153c9888d95SJean-Philippe Brucker {
154c9888d95SJean-Philippe Brucker 	int ret;
155c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
1568dd28afeSJean-Philippe Brucker 	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
157c9888d95SJean-Philippe Brucker 	struct vfio_irq_set irq_set = {
158c9888d95SJean-Philippe Brucker 		.argsz	= sizeof(irq_set),
159c9888d95SJean-Philippe Brucker 		.flags 	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
160c9888d95SJean-Philippe Brucker 		.index 	= msis->info.index,
161c9888d95SJean-Philippe Brucker 		.start 	= 0,
162c9888d95SJean-Philippe Brucker 		.count	= 0,
163c9888d95SJean-Philippe Brucker 	};
164c9888d95SJean-Philippe Brucker 
165c9888d95SJean-Philippe Brucker 	if (!msi_is_enabled(msis->phys_state))
166c9888d95SJean-Philippe Brucker 		return 0;
167c9888d95SJean-Philippe Brucker 
168c9888d95SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
169c9888d95SJean-Philippe Brucker 	if (ret < 0) {
170c9888d95SJean-Philippe Brucker 		perror("VFIO_DEVICE_SET_IRQS(NONE)");
171c9888d95SJean-Philippe Brucker 		return ret;
172c9888d95SJean-Philippe Brucker 	}
173c9888d95SJean-Philippe Brucker 
174c9888d95SJean-Philippe Brucker 	msi_set_enabled(msis->phys_state, false);
175c9888d95SJean-Philippe Brucker 	msi_set_empty(msis->phys_state, true);
176c9888d95SJean-Philippe Brucker 
1777302327aSLeo Yan 	/*
1787302327aSLeo Yan 	 * When MSI or MSI-X gets disabled, it may be because the guest
1797302327aSLeo Yan 	 * PCI driver has detected an MSI interrupt failure and wants to
1807302327aSLeo Yan 	 * roll back to INTx mode. In that case, re-enable INTx if the
1817302327aSLeo Yan 	 * device supports it.
1827302327aSLeo Yan 	 */
1837302327aSLeo Yan 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
1847302327aSLeo Yan 		ret = vfio_pci_enable_intx(kvm, vdev);
1857302327aSLeo Yan 
1867302327aSLeo Yan 	return ret >= 0 ? 0 : ret;
187c9888d95SJean-Philippe Brucker }
188c9888d95SJean-Philippe Brucker 
189c9888d95SJean-Philippe Brucker static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
190c9888d95SJean-Philippe Brucker 				     struct vfio_pci_msi_entry *entry)
191c9888d95SJean-Philippe Brucker {
192c9888d95SJean-Philippe Brucker 	int ret;
193c9888d95SJean-Philippe Brucker 
194c9888d95SJean-Philippe Brucker 	if (entry->eventfd < 0) {
195c9888d95SJean-Philippe Brucker 		entry->eventfd = eventfd(0, 0);
196c9888d95SJean-Philippe Brucker 		if (entry->eventfd < 0) {
197c9888d95SJean-Philippe Brucker 			ret = -errno;
198c9888d95SJean-Philippe Brucker 			vfio_dev_err(vdev, "cannot create eventfd");
199c9888d95SJean-Philippe Brucker 			return ret;
200c9888d95SJean-Philippe Brucker 		}
201c9888d95SJean-Philippe Brucker 	}
202c9888d95SJean-Philippe Brucker 
203c9888d95SJean-Philippe Brucker 	/* Allocate IRQ if necessary */
204c9888d95SJean-Philippe Brucker 	if (entry->gsi < 0) {
205c9888d95SJean-Philippe Brucker 		int ret = irq__add_msix_route(kvm, &entry->config.msg,
206c9888d95SJean-Philippe Brucker 					      vdev->dev_hdr.dev_num << 3);
207c9888d95SJean-Philippe Brucker 		if (ret < 0) {
208c9888d95SJean-Philippe Brucker 			vfio_dev_err(vdev, "cannot create MSI-X route");
209c9888d95SJean-Philippe Brucker 			return ret;
210c9888d95SJean-Philippe Brucker 		}
211c9888d95SJean-Philippe Brucker 		entry->gsi = ret;
212c9888d95SJean-Philippe Brucker 	} else {
213c9888d95SJean-Philippe Brucker 		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
214c9888d95SJean-Philippe Brucker 	}
215c9888d95SJean-Philippe Brucker 
216c9888d95SJean-Philippe Brucker 	/*
217c9888d95SJean-Philippe Brucker 	 * MSI masking is unimplemented in VFIO, so we have to handle it by
218c9888d95SJean-Philippe Brucker 	 * disabling/enabling the IRQ route instead. We do it on the KVM side
219c9888d95SJean-Philippe Brucker 	 * rather than VFIO, because:
220c9888d95SJean-Philippe Brucker 	 * - it is 8x faster.
221c9888d95SJean-Philippe Brucker 	 * - it decouples the masking logic from the capability state.
222c9888d95SJean-Philippe Brucker 	 * - in the masked state, after removing the irqfd route, we could
223c9888d95SJean-Philippe Brucker 	 *   easily plug the eventfd into a local handler, in order to serve
224c9888d95SJean-Philippe Brucker 	 *   Pending Bit reads to the guest.
225c9888d95SJean-Philippe Brucker 	 *
226c9888d95SJean-Philippe Brucker 	 * So entry->phys_state is masked when there is no active irqfd route.
227c9888d95SJean-Philippe Brucker 	 */
228c9888d95SJean-Philippe Brucker 	if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
229c9888d95SJean-Philippe Brucker 		return 0;
230c9888d95SJean-Philippe Brucker 
231c9888d95SJean-Philippe Brucker 	if (msi_is_masked(entry->phys_state)) {
232c9888d95SJean-Philippe Brucker 		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
233c9888d95SJean-Philippe Brucker 		if (ret < 0) {
234c9888d95SJean-Philippe Brucker 			vfio_dev_err(vdev, "cannot setup irqfd");
235c9888d95SJean-Philippe Brucker 			return ret;
236c9888d95SJean-Philippe Brucker 		}
237c9888d95SJean-Philippe Brucker 	} else {
238c9888d95SJean-Philippe Brucker 		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
239c9888d95SJean-Philippe Brucker 	}
240c9888d95SJean-Philippe Brucker 
241c9888d95SJean-Philippe Brucker 	msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));
242c9888d95SJean-Philippe Brucker 
243c9888d95SJean-Philippe Brucker 	return 0;
244c9888d95SJean-Philippe Brucker }
245c9888d95SJean-Philippe Brucker 
246c9888d95SJean-Philippe Brucker static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
247c9888d95SJean-Philippe Brucker 				     u32 len, u8 is_write, void *ptr)
248c9888d95SJean-Philippe Brucker {
249c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = ptr;
250c9888d95SJean-Philippe Brucker 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
251c9888d95SJean-Philippe Brucker 	u64 offset = addr - pba->guest_phys_addr;
252c9888d95SJean-Philippe Brucker 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
253c9888d95SJean-Philippe Brucker 
254b20d6e30SAlexandru Elisei 	if (offset >= pba->size) {
255b20d6e30SAlexandru Elisei 		vfio_dev_err(vdev, "access outside of the MSIX PBA");
256b20d6e30SAlexandru Elisei 		return;
257b20d6e30SAlexandru Elisei 	}
258b20d6e30SAlexandru Elisei 
259c9888d95SJean-Philippe Brucker 	if (is_write)
260c9888d95SJean-Philippe Brucker 		return;
261c9888d95SJean-Philippe Brucker 
262c9888d95SJean-Philippe Brucker 	/*
263c9888d95SJean-Philippe Brucker 	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA
264c9888d95SJean-Philippe Brucker 	 * is completely useless here. Note that Linux doesn't use PBA.
265c9888d95SJean-Philippe Brucker 	 */
2665f44d5d6SAlexandru Elisei 	if (pread(vdev->fd, data, len, pba->fd_offset + offset) != (ssize_t)len)
267c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
268c9888d95SJean-Philippe Brucker }
269c9888d95SJean-Philippe Brucker 
270c9888d95SJean-Philippe Brucker static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
271c9888d95SJean-Philippe Brucker 				       u32 len, u8 is_write, void *ptr)
272c9888d95SJean-Philippe Brucker {
273c9888d95SJean-Philippe Brucker 	struct kvm *kvm = vcpu->kvm;
274c9888d95SJean-Philippe Brucker 	struct vfio_pci_msi_entry *entry;
275c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = ptr;
276c9888d95SJean-Philippe Brucker 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
277c9888d95SJean-Philippe Brucker 
278c9888d95SJean-Philippe Brucker 	u64 offset = addr - pdev->msix_table.guest_phys_addr;
279b20d6e30SAlexandru Elisei 	if (offset >= pdev->msix_table.size) {
280b20d6e30SAlexandru Elisei 		vfio_dev_err(vdev, "access outside of the MSI-X table");
281b20d6e30SAlexandru Elisei 		return;
282b20d6e30SAlexandru Elisei 	}
283c9888d95SJean-Philippe Brucker 
284c9888d95SJean-Philippe Brucker 	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
285c9888d95SJean-Philippe Brucker 	off_t field = offset % PCI_MSIX_ENTRY_SIZE;
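	/*
	 * Each MSI-X table entry is 16 bytes: Message Address (low, high),
	 * Message Data and Vector Control. "vector" selects the entry and
	 * "field" is the offset within it.
	 */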
286c9888d95SJean-Philippe Brucker 
287c9888d95SJean-Philippe Brucker 	/*
288c9888d95SJean-Philippe Brucker 	 * The PCI spec says that software must use aligned 4- or 8-byte
289c9888d95SJean-Philippe Brucker 	 * accesses for the MSI-X table.
290c9888d95SJean-Philippe Brucker 	 */
291c9888d95SJean-Philippe Brucker 	if ((len != 4 && len != 8) || addr & (len - 1)) {
292c9888d95SJean-Philippe Brucker 		vfio_dev_warn(vdev, "invalid MSI-X table access");
293c9888d95SJean-Philippe Brucker 		return;
294c9888d95SJean-Philippe Brucker 	}
295c9888d95SJean-Philippe Brucker 
296c9888d95SJean-Philippe Brucker 	entry = &pdev->msix.entries[vector];
297c9888d95SJean-Philippe Brucker 
298c9888d95SJean-Philippe Brucker 	mutex_lock(&pdev->msix.mutex);
299c9888d95SJean-Philippe Brucker 
300c9888d95SJean-Philippe Brucker 	if (!is_write) {
301c9888d95SJean-Philippe Brucker 		memcpy(data, (void *)&entry->config + field, len);
302c9888d95SJean-Philippe Brucker 		goto out_unlock;
303c9888d95SJean-Philippe Brucker 	}
304c9888d95SJean-Philippe Brucker 
305c9888d95SJean-Philippe Brucker 	memcpy((void *)&entry->config + field, data, len);
306c9888d95SJean-Philippe Brucker 
307c9888d95SJean-Philippe Brucker 	/*
308c9888d95SJean-Philippe Brucker 	 * Check if access touched the vector control register, which is at the
309c9888d95SJean-Philippe Brucker 	 * end of the MSI-X entry.
310c9888d95SJean-Philippe Brucker 	 */
311c9888d95SJean-Philippe Brucker 	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
312c9888d95SJean-Philippe Brucker 		goto out_unlock;
313c9888d95SJean-Philippe Brucker 
314c9888d95SJean-Philippe Brucker 	msi_set_masked(entry->virt_state, entry->config.ctrl &
315c9888d95SJean-Philippe Brucker 		       PCI_MSIX_ENTRY_CTRL_MASKBIT);
316c9888d95SJean-Philippe Brucker 
317c9888d95SJean-Philippe Brucker 	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
318c9888d95SJean-Philippe Brucker 		/* Not much we can do here. */
319c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);
320c9888d95SJean-Philippe Brucker 
321c9888d95SJean-Philippe Brucker 	/* Update the physical capability if necessary */
3228dd28afeSJean-Philippe Brucker 	if (vfio_pci_enable_msis(kvm, vdev, true))
323c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot enable MSIX");
324c9888d95SJean-Philippe Brucker 
325c9888d95SJean-Philippe Brucker out_unlock:
326c9888d95SJean-Philippe Brucker 	mutex_unlock(&pdev->msix.mutex);
327c9888d95SJean-Philippe Brucker }
328c9888d95SJean-Philippe Brucker 
329c9888d95SJean-Philippe Brucker static void vfio_pci_msix_cap_write(struct kvm *kvm,
330e69b7663SAlexandru Elisei 				    struct vfio_device *vdev, u16 off,
331c9888d95SJean-Philippe Brucker 				    void *data, int sz)
332c9888d95SJean-Philippe Brucker {
333c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
334c9888d95SJean-Philippe Brucker 	off_t enable_pos = PCI_MSIX_FLAGS + 1;
335c9888d95SJean-Philippe Brucker 	bool enable;
336c9888d95SJean-Philippe Brucker 	u16 flags;
337c9888d95SJean-Philippe Brucker 
338c9888d95SJean-Philippe Brucker 	off -= pdev->msix.pos;
339c9888d95SJean-Philippe Brucker 
340c9888d95SJean-Philippe Brucker 	/* Check if access intersects with the MSI-X Enable bit */
341c9888d95SJean-Philippe Brucker 	if (off > enable_pos || off + sz <= enable_pos)
342c9888d95SJean-Philippe Brucker 		return;
343c9888d95SJean-Philippe Brucker 
344c9888d95SJean-Philippe Brucker 	/* Read byte that contains the Enable bit */
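	/* The Enable bit sits in the upper byte of Message Control, hence the shift. */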
345c9888d95SJean-Philippe Brucker 	flags = *(u8 *)(data + enable_pos - off) << 8;
346c9888d95SJean-Philippe Brucker 
347c9888d95SJean-Philippe Brucker 	mutex_lock(&pdev->msix.mutex);
348c9888d95SJean-Philippe Brucker 
349c9888d95SJean-Philippe Brucker 	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
350c9888d95SJean-Philippe Brucker 	enable = flags & PCI_MSIX_FLAGS_ENABLE;
351c9888d95SJean-Philippe Brucker 	msi_set_enabled(pdev->msix.virt_state, enable);
352c9888d95SJean-Philippe Brucker 
3538dd28afeSJean-Philippe Brucker 	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
354c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot enable MSIX");
3558dd28afeSJean-Philippe Brucker 	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
356c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot disable MSIX");
357c9888d95SJean-Philippe Brucker 
358c9888d95SJean-Philippe Brucker 	mutex_unlock(&pdev->msix.mutex);
359c9888d95SJean-Philippe Brucker }
360c9888d95SJean-Philippe Brucker 
3618dd28afeSJean-Philippe Brucker static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
362e69b7663SAlexandru Elisei 				     u16 off, u8 *data, u32 sz)
3638dd28afeSJean-Philippe Brucker {
3648dd28afeSJean-Philippe Brucker 	size_t i;
3658dd28afeSJean-Philippe Brucker 	u32 mask = 0;
3668dd28afeSJean-Philippe Brucker 	size_t mask_pos, start, limit;
3678dd28afeSJean-Philippe Brucker 	struct vfio_pci_msi_entry *entry;
3688dd28afeSJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
3698dd28afeSJean-Philippe Brucker 	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
3708dd28afeSJean-Philippe Brucker 
3718dd28afeSJean-Philippe Brucker 	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
3728dd28afeSJean-Philippe Brucker 		return 0;
3738dd28afeSJean-Philippe Brucker 
3748dd28afeSJean-Philippe Brucker 	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
3758dd28afeSJean-Philippe Brucker 		mask_pos = PCI_MSI_MASK_64;
3768dd28afeSJean-Philippe Brucker 	else
3778dd28afeSJean-Philippe Brucker 		mask_pos = PCI_MSI_MASK_32;
3788dd28afeSJean-Philippe Brucker 
3798dd28afeSJean-Philippe Brucker 	if (off >= mask_pos + 4 || off + sz <= mask_pos)
3808dd28afeSJean-Philippe Brucker 		return 0;
3818dd28afeSJean-Philippe Brucker 
3828dd28afeSJean-Philippe Brucker 	/* Set mask to current state */
3838dd28afeSJean-Philippe Brucker 	for (i = 0; i < pdev->msi.nr_entries; i++) {
3848dd28afeSJean-Philippe Brucker 		entry = &pdev->msi.entries[i];
3858dd28afeSJean-Philippe Brucker 		mask |= !!msi_is_masked(entry->virt_state) << i;
3868dd28afeSJean-Philippe Brucker 	}
3878dd28afeSJean-Philippe Brucker 
3888dd28afeSJean-Philippe Brucker 	/* Update mask following the intersection of access and register */
3898dd28afeSJean-Philippe Brucker 	start = max_t(size_t, off, mask_pos);
3908dd28afeSJean-Philippe Brucker 	limit = min_t(size_t, off + sz, mask_pos + 4);
3918dd28afeSJean-Philippe Brucker 
3928dd28afeSJean-Philippe Brucker 	memcpy((void *)&mask + start - mask_pos, data + start - off,
3938dd28afeSJean-Philippe Brucker 	       limit - start);
3948dd28afeSJean-Philippe Brucker 
3958dd28afeSJean-Philippe Brucker 	/* Update states if necessary */
3968dd28afeSJean-Philippe Brucker 	for (i = 0; i < pdev->msi.nr_entries; i++) {
3978dd28afeSJean-Philippe Brucker 		bool masked = mask & (1 << i);
3988dd28afeSJean-Philippe Brucker 
3998dd28afeSJean-Philippe Brucker 		entry = &pdev->msi.entries[i];
4008dd28afeSJean-Philippe Brucker 		if (masked != msi_is_masked(entry->virt_state)) {
4018dd28afeSJean-Philippe Brucker 			msi_set_masked(entry->virt_state, masked);
4028dd28afeSJean-Philippe Brucker 			vfio_pci_update_msi_entry(kvm, vdev, entry);
4038dd28afeSJean-Philippe Brucker 		}
4048dd28afeSJean-Philippe Brucker 	}
4058dd28afeSJean-Philippe Brucker 
4068dd28afeSJean-Philippe Brucker 	return 1;
4078dd28afeSJean-Philippe Brucker }
4088dd28afeSJean-Philippe Brucker 
4098dd28afeSJean-Philippe Brucker static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
410e69b7663SAlexandru Elisei 				   u16 off, u8 *data, u32 sz)
4118dd28afeSJean-Philippe Brucker {
4128dd28afeSJean-Philippe Brucker 	u8 ctrl;
4138dd28afeSJean-Philippe Brucker 	struct msi_msg msg;
4148dd28afeSJean-Philippe Brucker 	size_t i, nr_vectors;
4158dd28afeSJean-Philippe Brucker 	struct vfio_pci_msi_entry *entry;
4168dd28afeSJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
4178dd28afeSJean-Philippe Brucker 	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
4188dd28afeSJean-Philippe Brucker 
4198dd28afeSJean-Philippe Brucker 	off -= pdev->msi.pos;
4208dd28afeSJean-Philippe Brucker 
4218dd28afeSJean-Philippe Brucker 	mutex_lock(&pdev->msi.mutex);
4228dd28afeSJean-Philippe Brucker 
4238dd28afeSJean-Philippe Brucker 	/* Check if the guest is trying to update mask bits */
4248dd28afeSJean-Philippe Brucker 	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
4258dd28afeSJean-Philippe Brucker 		goto out_unlock;
4268dd28afeSJean-Philippe Brucker 
4278dd28afeSJean-Philippe Brucker 	/* Only modify routes when guest pokes the enable bit */
4288dd28afeSJean-Philippe Brucker 	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
4298dd28afeSJean-Philippe Brucker 		goto out_unlock;
4308dd28afeSJean-Philippe Brucker 
4318dd28afeSJean-Philippe Brucker 	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);
4328dd28afeSJean-Philippe Brucker 
4338dd28afeSJean-Philippe Brucker 	msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE);
4348dd28afeSJean-Philippe Brucker 
4358dd28afeSJean-Philippe Brucker 	if (!msi_is_enabled(pdev->msi.virt_state)) {
4368dd28afeSJean-Philippe Brucker 		vfio_pci_disable_msis(kvm, vdev, false);
4378dd28afeSJean-Philippe Brucker 		goto out_unlock;
4388dd28afeSJean-Philippe Brucker 	}
4398dd28afeSJean-Philippe Brucker 
4408dd28afeSJean-Philippe Brucker 	/* Create routes for the requested vectors */
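	/*
	 * The Multiple Message Enable field (PCI_MSI_FLAGS_QSIZE) holds the
	 * log2 of the requested vector count, e.g. a field value of 3 means
	 * 8 vectors.
	 */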
4418dd28afeSJean-Philippe Brucker 	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);
4428dd28afeSJean-Philippe Brucker 
4438dd28afeSJean-Philippe Brucker 	msg.address_lo = msi_cap_64->address_lo;
4448dd28afeSJean-Philippe Brucker 	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
4458dd28afeSJean-Philippe Brucker 		msg.address_hi = msi_cap_64->address_hi;
4468dd28afeSJean-Philippe Brucker 		msg.data = msi_cap_64->data;
4478dd28afeSJean-Philippe Brucker 	} else {
4488dd28afeSJean-Philippe Brucker 		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
4498dd28afeSJean-Philippe Brucker 		msg.address_hi = 0;
4508dd28afeSJean-Philippe Brucker 		msg.data = msi_cap_32->data;
4518dd28afeSJean-Philippe Brucker 	}
4528dd28afeSJean-Philippe Brucker 
4538dd28afeSJean-Philippe Brucker 	for (i = 0; i < nr_vectors; i++) {
4548dd28afeSJean-Philippe Brucker 		entry = &pdev->msi.entries[i];
455e554aefdSLorenzo Pieralisi 
456e554aefdSLorenzo Pieralisi 		/*
457e554aefdSLorenzo Pieralisi 		 * Set the MSI data value as required by the PCI local
458e554aefdSLorenzo Pieralisi 		 * bus specifications, MSI capability, "Message Data".
459e554aefdSLorenzo Pieralisi 		 */
460e554aefdSLorenzo Pieralisi 		msg.data &= ~(nr_vectors - 1);
461e554aefdSLorenzo Pieralisi 		msg.data |= i;
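		/* E.g. with nr_vectors == 4 and a base data of 0x40, vectors 0-3 use 0x40-0x43. */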
462e554aefdSLorenzo Pieralisi 
4638dd28afeSJean-Philippe Brucker 		entry->config.msg = msg;
4648dd28afeSJean-Philippe Brucker 		vfio_pci_update_msi_entry(kvm, vdev, entry);
4658dd28afeSJean-Philippe Brucker 	}
4668dd28afeSJean-Philippe Brucker 
4678dd28afeSJean-Philippe Brucker 	/* Update the physical capability if necessary */
4688dd28afeSJean-Philippe Brucker 	if (vfio_pci_enable_msis(kvm, vdev, false))
4698dd28afeSJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot enable MSI");
4708dd28afeSJean-Philippe Brucker 
4718dd28afeSJean-Philippe Brucker out_unlock:
4728dd28afeSJean-Philippe Brucker 	mutex_unlock(&pdev->msi.mutex);
4738dd28afeSJean-Philippe Brucker }
4748dd28afeSJean-Philippe Brucker 
4755a8e4f25SAlexandru Elisei static int vfio_pci_bar_activate(struct kvm *kvm,
4765a8e4f25SAlexandru Elisei 				 struct pci_device_header *pci_hdr,
4775a8e4f25SAlexandru Elisei 				 int bar_num, void *data)
4785a8e4f25SAlexandru Elisei {
4795a8e4f25SAlexandru Elisei 	struct vfio_device *vdev = data;
4805a8e4f25SAlexandru Elisei 	struct vfio_pci_device *pdev = &vdev->pci;
4815a8e4f25SAlexandru Elisei 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
4825a8e4f25SAlexandru Elisei 	struct vfio_pci_msix_table *table = &pdev->msix_table;
4835a8e4f25SAlexandru Elisei 	struct vfio_region *region;
484465edc9dSAlexandru Elisei 	u32 bar_addr;
4855a8e4f25SAlexandru Elisei 	bool has_msix;
4865a8e4f25SAlexandru Elisei 	int ret;
4875a8e4f25SAlexandru Elisei 
4885a8e4f25SAlexandru Elisei 	assert((u32)bar_num < vdev->info.num_regions);
4895a8e4f25SAlexandru Elisei 
4905a8e4f25SAlexandru Elisei 	region = &vdev->regions[bar_num];
4915a8e4f25SAlexandru Elisei 	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;
4925a8e4f25SAlexandru Elisei 
493465edc9dSAlexandru Elisei 	bar_addr = pci__bar_address(pci_hdr, bar_num);
494465edc9dSAlexandru Elisei 	if (pci__bar_is_io(pci_hdr, bar_num))
495465edc9dSAlexandru Elisei 		region->port_base = bar_addr;
496465edc9dSAlexandru Elisei 	else
497465edc9dSAlexandru Elisei 		region->guest_phys_addr = bar_addr;
498465edc9dSAlexandru Elisei 
4995a8e4f25SAlexandru Elisei 	if (has_msix && (u32)bar_num == table->bar) {
500465edc9dSAlexandru Elisei 		table->guest_phys_addr = region->guest_phys_addr;
5015a8e4f25SAlexandru Elisei 		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
5025a8e4f25SAlexandru Elisei 					 table->size, false,
5035a8e4f25SAlexandru Elisei 					 vfio_pci_msix_table_access, pdev);
5045a8e4f25SAlexandru Elisei 		/*
5055a8e4f25SAlexandru Elisei 		 * The MSIX table and the PBA structure can share the same BAR,
5065a8e4f25SAlexandru Elisei 		 * but for convenience we register different regions for mmio
5075a8e4f25SAlexandru Elisei 		 * emulation. We want to update both if they share the same
5085a8e4f25SAlexandru Elisei 		 * BAR.
5095a8e4f25SAlexandru Elisei 		 */
5105a8e4f25SAlexandru Elisei 		if (ret < 0 || table->bar != pba->bar)
5115a8e4f25SAlexandru Elisei 			goto out;
5125a8e4f25SAlexandru Elisei 	}
5135a8e4f25SAlexandru Elisei 
5145a8e4f25SAlexandru Elisei 	if (has_msix && (u32)bar_num == pba->bar) {
515465edc9dSAlexandru Elisei 		if (pba->bar == table->bar)
516f93acc04SAlexandru Elisei 			pba->guest_phys_addr = table->guest_phys_addr + pba->bar_offset;
517465edc9dSAlexandru Elisei 		else
518465edc9dSAlexandru Elisei 			pba->guest_phys_addr = region->guest_phys_addr;
5195a8e4f25SAlexandru Elisei 		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
5205a8e4f25SAlexandru Elisei 					 pba->size, false,
5215a8e4f25SAlexandru Elisei 					 vfio_pci_msix_pba_access, pdev);
5225a8e4f25SAlexandru Elisei 		goto out;
5235a8e4f25SAlexandru Elisei 	}
5245a8e4f25SAlexandru Elisei 
5255a8e4f25SAlexandru Elisei 	ret = vfio_map_region(kvm, vdev, region);
5265a8e4f25SAlexandru Elisei out:
5275a8e4f25SAlexandru Elisei 	return ret;
5285a8e4f25SAlexandru Elisei }
5295a8e4f25SAlexandru Elisei 
5305a8e4f25SAlexandru Elisei static int vfio_pci_bar_deactivate(struct kvm *kvm,
5315a8e4f25SAlexandru Elisei 				   struct pci_device_header *pci_hdr,
5325a8e4f25SAlexandru Elisei 				   int bar_num, void *data)
5335a8e4f25SAlexandru Elisei {
5345a8e4f25SAlexandru Elisei 	struct vfio_device *vdev = data;
5355a8e4f25SAlexandru Elisei 	struct vfio_pci_device *pdev = &vdev->pci;
5365a8e4f25SAlexandru Elisei 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
5375a8e4f25SAlexandru Elisei 	struct vfio_pci_msix_table *table = &pdev->msix_table;
5385a8e4f25SAlexandru Elisei 	struct vfio_region *region;
5395a8e4f25SAlexandru Elisei 	bool has_msix, success;
5405a8e4f25SAlexandru Elisei 	int ret;
5415a8e4f25SAlexandru Elisei 
5425a8e4f25SAlexandru Elisei 	assert((u32)bar_num < vdev->info.num_regions);
5435a8e4f25SAlexandru Elisei 
5445a8e4f25SAlexandru Elisei 	region = &vdev->regions[bar_num];
5455a8e4f25SAlexandru Elisei 	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;
5465a8e4f25SAlexandru Elisei 
5475a8e4f25SAlexandru Elisei 	if (has_msix && (u32)bar_num == table->bar) {
5485a8e4f25SAlexandru Elisei 		success = kvm__deregister_mmio(kvm, table->guest_phys_addr);
5495a8e4f25SAlexandru Elisei 		/* kvm__deregister_mmio fails when the region is not found. */
5505a8e4f25SAlexandru Elisei 		ret = (success ? 0 : -ENOENT);
5515a8e4f25SAlexandru Elisei 		/* See vfio_pci_bar_activate(). */
5525a8e4f25SAlexandru Elisei 		if (ret < 0 || table->bar != pba->bar)
5535a8e4f25SAlexandru Elisei 			goto out;
5545a8e4f25SAlexandru Elisei 	}
5555a8e4f25SAlexandru Elisei 
5565a8e4f25SAlexandru Elisei 	if (has_msix && (u32)bar_num == pba->bar) {
5575a8e4f25SAlexandru Elisei 		success = kvm__deregister_mmio(kvm, pba->guest_phys_addr);
5585a8e4f25SAlexandru Elisei 		ret = (success ? 0 : -ENOENT);
5595a8e4f25SAlexandru Elisei 		goto out;
5605a8e4f25SAlexandru Elisei 	}
5615a8e4f25SAlexandru Elisei 
5625a8e4f25SAlexandru Elisei 	vfio_unmap_region(kvm, region);
5635a8e4f25SAlexandru Elisei 	ret = 0;
5645a8e4f25SAlexandru Elisei 
5655a8e4f25SAlexandru Elisei out:
5665a8e4f25SAlexandru Elisei 	return ret;
5675a8e4f25SAlexandru Elisei }
5685a8e4f25SAlexandru Elisei 
5696078a454SJean-Philippe Brucker static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
570e69b7663SAlexandru Elisei 			      u16 offset, void *data, int sz)
5716078a454SJean-Philippe Brucker {
5726078a454SJean-Philippe Brucker 	struct vfio_region_info *info;
5736078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev;
5746078a454SJean-Philippe Brucker 	struct vfio_device *vdev;
5756078a454SJean-Philippe Brucker 	char base[sz];
5766078a454SJean-Philippe Brucker 
5776078a454SJean-Philippe Brucker 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
5786078a454SJean-Philippe Brucker 	vdev = container_of(pdev, struct vfio_device, pci);
5796078a454SJean-Philippe Brucker 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
5806078a454SJean-Philippe Brucker 
5816078a454SJean-Philippe Brucker 	/* Dummy read in case of side-effects */
5826078a454SJean-Philippe Brucker 	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
5836078a454SJean-Philippe Brucker 		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
5846078a454SJean-Philippe Brucker 			      sz, offset);
5856078a454SJean-Philippe Brucker }
5866078a454SJean-Philippe Brucker 
5876078a454SJean-Philippe Brucker static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
588e69b7663SAlexandru Elisei 			       u16 offset, void *data, int sz)
5896078a454SJean-Philippe Brucker {
5906078a454SJean-Philippe Brucker 	struct vfio_region_info *info;
5916078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev;
5926078a454SJean-Philippe Brucker 	struct vfio_device *vdev;
593e1d0285cSAlexandru Elisei 	u32 tmp;
594e1d0285cSAlexandru Elisei 
595e1d0285cSAlexandru Elisei 	/* Make sure a larger size will not overrun tmp on the stack. */
596e1d0285cSAlexandru Elisei 	assert(sz <= 4);
5976078a454SJean-Philippe Brucker 
5985b7fef16SAlexandru Elisei 	if (offset == PCI_ROM_ADDRESS)
5995b7fef16SAlexandru Elisei 		return;
6005b7fef16SAlexandru Elisei 
6016078a454SJean-Philippe Brucker 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
6026078a454SJean-Philippe Brucker 	vdev = container_of(pdev, struct vfio_device, pci);
6036078a454SJean-Philippe Brucker 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
6046078a454SJean-Philippe Brucker 
6056078a454SJean-Philippe Brucker 	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
6066078a454SJean-Philippe Brucker 		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
6076078a454SJean-Philippe Brucker 			      sz, offset);
6086078a454SJean-Philippe Brucker 
609c9888d95SJean-Philippe Brucker 	/* Handle MSI write now, since it might update the hardware capability */
610c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
611c9888d95SJean-Philippe Brucker 		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);
612c9888d95SJean-Philippe Brucker 
6138dd28afeSJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
6148dd28afeSJean-Philippe Brucker 		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);
6158dd28afeSJean-Philippe Brucker 
616e1d0285cSAlexandru Elisei 	if (pread(vdev->fd, &tmp, sz, info->offset + offset) != sz)
6176078a454SJean-Philippe Brucker 		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
6186078a454SJean-Philippe Brucker 			      sz, offset);
6196078a454SJean-Philippe Brucker }
6206078a454SJean-Philippe Brucker 
6218dd28afeSJean-Philippe Brucker static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
6228dd28afeSJean-Philippe Brucker {
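	/*
	 * The base layout (ID, next, control, 32-bit address, data) is 10
	 * bytes; 64-bit addressing adds the upper address dword, and
	 * per-vector masking adds padding plus the mask and pending dwords.
	 */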
6238dd28afeSJean-Philippe Brucker 	size_t size = 10;
6248dd28afeSJean-Philippe Brucker 
6258dd28afeSJean-Philippe Brucker 	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
6268dd28afeSJean-Philippe Brucker 		size += 4;
6278dd28afeSJean-Philippe Brucker 	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
6288dd28afeSJean-Philippe Brucker 		size += 10;
6298dd28afeSJean-Philippe Brucker 
6308dd28afeSJean-Philippe Brucker 	return size;
6318dd28afeSJean-Philippe Brucker }
6328dd28afeSJean-Philippe Brucker 
633c9888d95SJean-Philippe Brucker static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
634c9888d95SJean-Philippe Brucker {
635c9888d95SJean-Philippe Brucker 	switch (cap_hdr->type) {
636c9888d95SJean-Philippe Brucker 	case PCI_CAP_ID_MSIX:
637c9888d95SJean-Philippe Brucker 		return PCI_CAP_MSIX_SIZEOF;
6388dd28afeSJean-Philippe Brucker 	case PCI_CAP_ID_MSI:
6398dd28afeSJean-Philippe Brucker 		return vfio_pci_msi_cap_size((void *)cap_hdr);
64025c1dc6cSAlexandru Elisei 	case PCI_CAP_ID_EXP:
64125c1dc6cSAlexandru Elisei 		/*
64225c1dc6cSAlexandru Elisei 		 * We don't emulate any of the link, slot and root complex
64325c1dc6cSAlexandru Elisei 		 * properties, so ignore them.
64425c1dc6cSAlexandru Elisei 		 */
64525c1dc6cSAlexandru Elisei 		return PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1;
646c9888d95SJean-Philippe Brucker 	default:
647c9888d95SJean-Philippe Brucker 		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
648c9888d95SJean-Philippe Brucker 		return 0;
649c9888d95SJean-Philippe Brucker 	}
650c9888d95SJean-Philippe Brucker }
651c9888d95SJean-Philippe Brucker 
652c9888d95SJean-Philippe Brucker static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
653c9888d95SJean-Philippe Brucker 			    struct pci_cap_hdr *cap, off_t pos)
654c9888d95SJean-Philippe Brucker {
655c9888d95SJean-Philippe Brucker 	struct pci_cap_hdr *last;
656c9888d95SJean-Philippe Brucker 	struct pci_device_header *hdr = &vdev->pci.hdr;
657c9888d95SJean-Philippe Brucker 
658c9888d95SJean-Philippe Brucker 	cap->next = 0;
659c9888d95SJean-Philippe Brucker 
660c9888d95SJean-Philippe Brucker 	if (!hdr->capabilities) {
661c9888d95SJean-Philippe Brucker 		hdr->capabilities = pos;
662c9888d95SJean-Philippe Brucker 		hdr->status |= PCI_STATUS_CAP_LIST;
663c9888d95SJean-Philippe Brucker 	} else {
664c9888d95SJean-Philippe Brucker 		last = PCI_CAP(virt_hdr, hdr->capabilities);
665c9888d95SJean-Philippe Brucker 
666c9888d95SJean-Philippe Brucker 		while (last->next)
667c9888d95SJean-Philippe Brucker 			last = PCI_CAP(virt_hdr, last->next);
668c9888d95SJean-Philippe Brucker 
669c9888d95SJean-Philippe Brucker 		last->next = pos;
670c9888d95SJean-Philippe Brucker 	}
671c9888d95SJean-Philippe Brucker 
672c9888d95SJean-Philippe Brucker 	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));
673c9888d95SJean-Philippe Brucker 
674c9888d95SJean-Philippe Brucker 	return 0;
675c9888d95SJean-Philippe Brucker }
676c9888d95SJean-Philippe Brucker 
6776078a454SJean-Philippe Brucker static int vfio_pci_parse_caps(struct vfio_device *vdev)
6786078a454SJean-Philippe Brucker {
679c9888d95SJean-Philippe Brucker 	int ret;
680c9888d95SJean-Philippe Brucker 	size_t size;
681e69b7663SAlexandru Elisei 	u16 pos, next;
682c9888d95SJean-Philippe Brucker 	struct pci_cap_hdr *cap;
683e69b7663SAlexandru Elisei 	u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY];
6846078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
6856078a454SJean-Philippe Brucker 
6866078a454SJean-Philippe Brucker 	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
6876078a454SJean-Philippe Brucker 		return 0;
6886078a454SJean-Philippe Brucker 
689e69b7663SAlexandru Elisei 	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY);
690c9888d95SJean-Philippe Brucker 
691c9888d95SJean-Philippe Brucker 	pos = pdev->hdr.capabilities & ~3;
692c9888d95SJean-Philippe Brucker 
6936078a454SJean-Philippe Brucker 	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
6946078a454SJean-Philippe Brucker 	pdev->hdr.capabilities = 0;
6956078a454SJean-Philippe Brucker 
696c9888d95SJean-Philippe Brucker 	for (; pos; pos = next) {
697c9888d95SJean-Philippe Brucker 		cap = PCI_CAP(&pdev->hdr, pos);
698c9888d95SJean-Philippe Brucker 		next = cap->next;
699c9888d95SJean-Philippe Brucker 
700c9888d95SJean-Philippe Brucker 		switch (cap->type) {
701c9888d95SJean-Philippe Brucker 		case PCI_CAP_ID_MSIX:
702c9888d95SJean-Philippe Brucker 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
703c9888d95SJean-Philippe Brucker 			if (ret)
704c9888d95SJean-Philippe Brucker 				return ret;
705c9888d95SJean-Philippe Brucker 
706c9888d95SJean-Philippe Brucker 			pdev->msix.pos = pos;
707c9888d95SJean-Philippe Brucker 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
708c9888d95SJean-Philippe Brucker 			break;
7098dd28afeSJean-Philippe Brucker 		case PCI_CAP_ID_MSI:
7108dd28afeSJean-Philippe Brucker 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
7118dd28afeSJean-Philippe Brucker 			if (ret)
7128dd28afeSJean-Philippe Brucker 				return ret;
7138dd28afeSJean-Philippe Brucker 
7148dd28afeSJean-Philippe Brucker 			pdev->msi.pos = pos;
7158dd28afeSJean-Philippe Brucker 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
7168dd28afeSJean-Philippe Brucker 			break;
71725c1dc6cSAlexandru Elisei 		case PCI_CAP_ID_EXP:
71825c1dc6cSAlexandru Elisei 			if (!arch_has_pci_exp())
71925c1dc6cSAlexandru Elisei 				continue;
72025c1dc6cSAlexandru Elisei 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
72125c1dc6cSAlexandru Elisei 			if (ret)
72225c1dc6cSAlexandru Elisei 				return ret;
72325c1dc6cSAlexandru Elisei 			break;
724c9888d95SJean-Philippe Brucker 		}
725c9888d95SJean-Philippe Brucker 	}
726c9888d95SJean-Philippe Brucker 
727c9888d95SJean-Philippe Brucker 	/* Wipe remaining capabilities */
728c9888d95SJean-Philippe Brucker 	pos = PCI_STD_HEADER_SIZEOF;
729e69b7663SAlexandru Elisei 	size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF;
730c9888d95SJean-Philippe Brucker 	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
7316078a454SJean-Philippe Brucker 
7326078a454SJean-Philippe Brucker 	return 0;
7336078a454SJean-Philippe Brucker }
7346078a454SJean-Philippe Brucker 
7356078a454SJean-Philippe Brucker static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
7366078a454SJean-Philippe Brucker {
737e69b7663SAlexandru Elisei 	ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
7386078a454SJean-Philippe Brucker 	struct vfio_region_info *info;
7396078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
7406078a454SJean-Philippe Brucker 
7416078a454SJean-Philippe Brucker 	if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
7426078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "Config Space not found");
7436078a454SJean-Philippe Brucker 		return -ENODEV;
7446078a454SJean-Philippe Brucker 	}
7456078a454SJean-Philippe Brucker 
7466078a454SJean-Philippe Brucker 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
7476078a454SJean-Philippe Brucker 	*info = (struct vfio_region_info) {
7486078a454SJean-Philippe Brucker 			.argsz = sizeof(*info),
7496078a454SJean-Philippe Brucker 			.index = VFIO_PCI_CONFIG_REGION_INDEX,
7506078a454SJean-Philippe Brucker 	};
7516078a454SJean-Philippe Brucker 
7526078a454SJean-Philippe Brucker 	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
7536078a454SJean-Philippe Brucker 	if (!info->size) {
7546078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "Config Space has size zero?!");
7556078a454SJean-Philippe Brucker 		return -EINVAL;
7566078a454SJean-Philippe Brucker 	}
7576078a454SJean-Philippe Brucker 
758c9888d95SJean-Philippe Brucker 	/* Read standard headers and capabilities */
7596078a454SJean-Philippe Brucker 	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
7606078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
7616078a454SJean-Philippe Brucker 		return -EIO;
7626078a454SJean-Philippe Brucker 	}
7636078a454SJean-Philippe Brucker 
7646078a454SJean-Philippe Brucker 	/* Strip bit 7, which indicates a multifunction device */
7656078a454SJean-Philippe Brucker 	pdev->hdr.header_type &= 0x7f;
7666078a454SJean-Philippe Brucker 
7676078a454SJean-Philippe Brucker 	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
7686078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "unsupported header type %u",
7696078a454SJean-Philippe Brucker 			     pdev->hdr.header_type);
7706078a454SJean-Philippe Brucker 		return -EOPNOTSUPP;
7716078a454SJean-Philippe Brucker 	}
7726078a454SJean-Philippe Brucker 
773c9888d95SJean-Philippe Brucker 	if (pdev->hdr.irq_pin)
774c9888d95SJean-Philippe Brucker 		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;
775c9888d95SJean-Philippe Brucker 
7766078a454SJean-Philippe Brucker 	vfio_pci_parse_caps(vdev);
7776078a454SJean-Philippe Brucker 
7786078a454SJean-Philippe Brucker 	return 0;
7796078a454SJean-Philippe Brucker }
7806078a454SJean-Philippe Brucker 
7816078a454SJean-Philippe Brucker static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
7826078a454SJean-Philippe Brucker {
7836078a454SJean-Philippe Brucker 	int i;
7843665392aSAlexandru Elisei 	u64 base;
7856078a454SJean-Philippe Brucker 	ssize_t hdr_sz;
786c9888d95SJean-Philippe Brucker 	struct msix_cap *msix;
7876078a454SJean-Philippe Brucker 	struct vfio_region_info *info;
7886078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
7893665392aSAlexandru Elisei 	struct vfio_region *region;
7906078a454SJean-Philippe Brucker 
7916078a454SJean-Philippe Brucker 	/* Initialise the BARs */
7926078a454SJean-Philippe Brucker 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
7933665392aSAlexandru Elisei 		if ((u32)i == vdev->info.num_regions)
7943665392aSAlexandru Elisei 			break;
79582caa882SJean-Philippe Brucker 
7963665392aSAlexandru Elisei 		region = &vdev->regions[i];
79782caa882SJean-Philippe Brucker 		/* Construct a fake reg to match what we've mapped. */
79882caa882SJean-Philippe Brucker 		if (region->is_ioport) {
79982caa882SJean-Philippe Brucker 			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
80082caa882SJean-Philippe Brucker 				PCI_BASE_ADDRESS_SPACE_IO;
80182caa882SJean-Philippe Brucker 		} else {
80282caa882SJean-Philippe Brucker 			base = (region->guest_phys_addr &
80382caa882SJean-Philippe Brucker 				PCI_BASE_ADDRESS_MEM_MASK) |
80482caa882SJean-Philippe Brucker 				PCI_BASE_ADDRESS_SPACE_MEMORY;
80582caa882SJean-Philippe Brucker 		}
80682caa882SJean-Philippe Brucker 
80782caa882SJean-Philippe Brucker 		pdev->hdr.bar[i] = base;
8086078a454SJean-Philippe Brucker 
8096078a454SJean-Philippe Brucker 		if (!base)
8106078a454SJean-Philippe Brucker 			continue;
8116078a454SJean-Philippe Brucker 
8126078a454SJean-Philippe Brucker 		pdev->hdr.bar_size[i] = region->info.size;
8136078a454SJean-Philippe Brucker 	}
8146078a454SJean-Philippe Brucker 
8156078a454SJean-Philippe Brucker 	/* I really can't be bothered to support cardbus. */
8166078a454SJean-Philippe Brucker 	pdev->hdr.card_bus = 0;
8176078a454SJean-Philippe Brucker 
8186078a454SJean-Philippe Brucker 	/*
8196078a454SJean-Philippe Brucker 	 * Nuke the expansion ROM for now. If we want to do this properly,
8206078a454SJean-Philippe Brucker 	 * we need to save its size somewhere and map it into the guest.
8216078a454SJean-Philippe Brucker 	 */
8226078a454SJean-Philippe Brucker 	pdev->hdr.exp_rom_bar = 0;
8236078a454SJean-Philippe Brucker 
824c9888d95SJean-Philippe Brucker 	/* Plumb in our fake MSI-X capability, if we have it. */
825c9888d95SJean-Philippe Brucker 	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
826c9888d95SJean-Philippe Brucker 	if (msix) {
827c9888d95SJean-Philippe Brucker 		/* Add a shortcut to the PBA region for the MMIO handler */
828c9888d95SJean-Philippe Brucker 		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
829f93acc04SAlexandru Elisei 		u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET;
830f93acc04SAlexandru Elisei 
8315f44d5d6SAlexandru Elisei 		pdev->msix_pba.fd_offset = vdev->regions[pba_index].info.offset +
832f93acc04SAlexandru Elisei 					   pba_bar_offset;
833c9888d95SJean-Philippe Brucker 
834c9888d95SJean-Philippe Brucker 		/* Tidy up the capability */
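		/*
		 * Keep only the BIR bits: the virtual table is reported at
		 * offset zero of its BAR, which is where vfio_pci_bar_activate()
		 * maps it.
		 */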
835c9888d95SJean-Philippe Brucker 		msix->table_offset &= PCI_MSIX_TABLE_BIR;
836f93acc04SAlexandru Elisei 		if (pdev->msix_table.bar == pdev->msix_pba.bar) {
837f93acc04SAlexandru Elisei 			/* Keep the same offset as the MSIX cap. */
838f93acc04SAlexandru Elisei 			pdev->msix_pba.bar_offset = pba_bar_offset;
839f93acc04SAlexandru Elisei 		} else {
840f93acc04SAlexandru Elisei 			/* PBA is at the start of the BAR. */
841c9888d95SJean-Philippe Brucker 			msix->pba_offset &= PCI_MSIX_PBA_BIR;
842f93acc04SAlexandru Elisei 			pdev->msix_pba.bar_offset = 0;
843f93acc04SAlexandru Elisei 		}
844c9888d95SJean-Philippe Brucker 	}
845c9888d95SJean-Philippe Brucker 
8466078a454SJean-Philippe Brucker 	/* Install our fake Configuration Space */
8476078a454SJean-Philippe Brucker 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
848e69b7663SAlexandru Elisei 	/*
849e69b7663SAlexandru Elisei 	 * We don't touch the extended configuration space, so let's be cautious
850e69b7663SAlexandru Elisei 	 * and not overwrite it all with zeros, or bad things might happen.
851e69b7663SAlexandru Elisei 	 */
852e69b7663SAlexandru Elisei 	hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
8536078a454SJean-Philippe Brucker 	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
8546078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
8556078a454SJean-Philippe Brucker 			     hdr_sz);
8566078a454SJean-Philippe Brucker 		return -EIO;
8576078a454SJean-Philippe Brucker 	}
8586078a454SJean-Philippe Brucker 
8596078a454SJean-Philippe Brucker 	/* Register callbacks for cfg accesses */
8606078a454SJean-Philippe Brucker 	pdev->hdr.cfg_ops = (struct pci_config_operations) {
8616078a454SJean-Philippe Brucker 		.read	= vfio_pci_cfg_read,
8626078a454SJean-Philippe Brucker 		.write	= vfio_pci_cfg_write,
8636078a454SJean-Philippe Brucker 	};
8646078a454SJean-Philippe Brucker 
8656078a454SJean-Philippe Brucker 	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;
8666078a454SJean-Philippe Brucker 
8676078a454SJean-Philippe Brucker 	return 0;
8686078a454SJean-Philippe Brucker }
8696078a454SJean-Philippe Brucker 
870ed01a603SAlexandru Elisei static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index,
871ed01a603SAlexandru Elisei 				    struct vfio_region_info *info)
872ed01a603SAlexandru Elisei {
873ed01a603SAlexandru Elisei 	int ret;
874ed01a603SAlexandru Elisei 
875ed01a603SAlexandru Elisei 	*info = (struct vfio_region_info) {
876ed01a603SAlexandru Elisei 		.argsz = sizeof(*info),
877ed01a603SAlexandru Elisei 		.index = index,
878ed01a603SAlexandru Elisei 	};
879ed01a603SAlexandru Elisei 
880ed01a603SAlexandru Elisei 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
881ed01a603SAlexandru Elisei 	if (ret) {
882ed01a603SAlexandru Elisei 		ret = -errno;
883ed01a603SAlexandru Elisei 		vfio_dev_err(vdev, "cannot get info for BAR %u", index);
884ed01a603SAlexandru Elisei 		return ret;
885ed01a603SAlexandru Elisei 	}
886ed01a603SAlexandru Elisei 
887ed01a603SAlexandru Elisei 	if (info->size && !is_power_of_two(info->size)) {
888ed01a603SAlexandru Elisei 		vfio_dev_err(vdev, "region is not power of two: 0x%llx",
889ed01a603SAlexandru Elisei 				info->size);
890ed01a603SAlexandru Elisei 		return -EINVAL;
891ed01a603SAlexandru Elisei 	}
892ed01a603SAlexandru Elisei 
893ed01a603SAlexandru Elisei 	return 0;
894ed01a603SAlexandru Elisei }
895ed01a603SAlexandru Elisei 
896ed01a603SAlexandru Elisei static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
897c9888d95SJean-Philippe Brucker {
898c9888d95SJean-Philippe Brucker 	int ret;
899c9888d95SJean-Philippe Brucker 	size_t i;
900ed01a603SAlexandru Elisei 	size_t map_size;
901c9888d95SJean-Philippe Brucker 	size_t nr_entries;
902c9888d95SJean-Philippe Brucker 	struct vfio_pci_msi_entry *entries;
903ed01a603SAlexandru Elisei 	struct vfio_pci_device *pdev = &vdev->pci;
904c9888d95SJean-Philippe Brucker 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
905c9888d95SJean-Philippe Brucker 	struct vfio_pci_msix_table *table = &pdev->msix_table;
906c9888d95SJean-Philippe Brucker 	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
907ed01a603SAlexandru Elisei 	struct vfio_region_info info;
908c9888d95SJean-Philippe Brucker 
909c9888d95SJean-Philippe Brucker 	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
910c9888d95SJean-Philippe Brucker 	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;
911c9888d95SJean-Philippe Brucker 
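	/* The Table Size field of Message Control encodes the entry count minus one. */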
912c9888d95SJean-Philippe Brucker 	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
913f93acc04SAlexandru Elisei 
914f93acc04SAlexandru Elisei 	/* MSIX table and PBA must support QWORD accesses. */
915f93acc04SAlexandru Elisei 	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, 8);
916f93acc04SAlexandru Elisei 	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), 8);
917c9888d95SJean-Philippe Brucker 
918c9888d95SJean-Philippe Brucker 	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
919c9888d95SJean-Philippe Brucker 	if (!entries)
920c9888d95SJean-Philippe Brucker 		return -ENOMEM;
921c9888d95SJean-Philippe Brucker 
922c9888d95SJean-Philippe Brucker 	for (i = 0; i < nr_entries; i++)
923c9888d95SJean-Philippe Brucker 		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
924c9888d95SJean-Philippe Brucker 
925ed01a603SAlexandru Elisei 	ret = vfio_pci_get_region_info(vdev, table->bar, &info);
926ed01a603SAlexandru Elisei 	if (ret)
927ed01a603SAlexandru Elisei 		return ret;
928ed01a603SAlexandru Elisei 	if (!info.size)
929ed01a603SAlexandru Elisei 		return -EINVAL;
930ed01a603SAlexandru Elisei 
931*39181fc6SAlexandru Elisei 	map_size = ALIGN(info.size, MAX_PAGE_SIZE);
932ed01a603SAlexandru Elisei 	table->guest_phys_addr = pci_get_mmio_block(map_size);
933c9888d95SJean-Philippe Brucker 	if (!table->guest_phys_addr) {
934ed01a603SAlexandru Elisei 		pr_err("cannot allocate MMIO space");
935c9888d95SJean-Philippe Brucker 		ret = -ENOMEM;
936c9888d95SJean-Philippe Brucker 		goto out_free;
937c9888d95SJean-Philippe Brucker 	}
938c9888d95SJean-Philippe Brucker 
939c9888d95SJean-Philippe Brucker 	/*
940c9888d95SJean-Philippe Brucker 	 * We could map the physical PBA directly into the guest, but it's
941c9888d95SJean-Philippe Brucker 	 * likely smaller than a page, and we can only hand full pages to the
942c9888d95SJean-Philippe Brucker 	 * guest. Even though the PCI spec disallows sharing a page used for
943c9888d95SJean-Philippe Brucker 	 * MSI-X with any other resource, it allows sharing the same page
944c9888d95SJean-Philippe Brucker 	 * between the MSI-X table and the PBA. For the sake of isolation, create a
945c9888d95SJean-Philippe Brucker 	 * virtual PBA.
946c9888d95SJean-Philippe Brucker 	 */
947f93acc04SAlexandru Elisei 	if (table->bar == pba->bar) {
948f93acc04SAlexandru Elisei 		u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET;
949f93acc04SAlexandru Elisei 
950f93acc04SAlexandru Elisei 		/* Sanity checks. */
951f93acc04SAlexandru Elisei 		if (table->size > pba_bar_offset)
952f93acc04SAlexandru Elisei 			die("MSIX table overlaps with PBA");
953f93acc04SAlexandru Elisei 		if (pba_bar_offset + pba->size > info.size)
954f93acc04SAlexandru Elisei 			die("PBA exceeds the size of the region");
955f93acc04SAlexandru Elisei 		pba->guest_phys_addr = table->guest_phys_addr + pba_bar_offset;
956f93acc04SAlexandru Elisei 	} else {
957f93acc04SAlexandru Elisei 		ret = vfio_pci_get_region_info(vdev, pba->bar, &info);
958f93acc04SAlexandru Elisei 		if (!ret && !info.size)
959f93acc04SAlexandru Elisei 			ret = -EINVAL;
960f93acc04SAlexandru Elisei 		if (ret)
961f93acc04SAlexandru Elisei 			goto out_free;
962f93acc04SAlexandru Elisei 
963*39181fc6SAlexandru Elisei 		map_size = ALIGN(info.size, MAX_PAGE_SIZE);
964f93acc04SAlexandru Elisei 		pba->guest_phys_addr = pci_get_mmio_block(map_size);
965f93acc04SAlexandru Elisei 		if (!pba->guest_phys_addr) {
966f93acc04SAlexandru Elisei 			pr_err("cannot allocate MMIO space");
967f93acc04SAlexandru Elisei 			ret = -ENOMEM;
968f93acc04SAlexandru Elisei 			goto out_free;
969f93acc04SAlexandru Elisei 		}
970f93acc04SAlexandru Elisei 	}
971c9888d95SJean-Philippe Brucker 
972c9888d95SJean-Philippe Brucker 	pdev->msix.entries = entries;
973c9888d95SJean-Philippe Brucker 	pdev->msix.nr_entries = nr_entries;
974c9888d95SJean-Philippe Brucker 
975c9888d95SJean-Philippe Brucker 	return 0;
976c9888d95SJean-Philippe Brucker 
977c9888d95SJean-Philippe Brucker out_free:
978c9888d95SJean-Philippe Brucker 	free(entries);
979c9888d95SJean-Philippe Brucker 
980c9888d95SJean-Philippe Brucker 	return ret;
981c9888d95SJean-Philippe Brucker }
982c9888d95SJean-Philippe Brucker 
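/*
 * Allocate the shadow MSI entry array, sized from the Multiple Message
 * Capable field of the MSI capability.
 */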
9838dd28afeSJean-Philippe Brucker static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
9848dd28afeSJean-Philippe Brucker {
9858dd28afeSJean-Philippe Brucker 	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);
9868dd28afeSJean-Philippe Brucker 
9878dd28afeSJean-Philippe Brucker 	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
9888dd28afeSJean-Philippe Brucker 	pdev->msi.entries = calloc(pdev->msi.nr_entries,
9898dd28afeSJean-Philippe Brucker 				   sizeof(struct vfio_pci_msi_entry));
9908dd28afeSJean-Philippe Brucker 	if (!pdev->msi.entries)
9918dd28afeSJean-Philippe Brucker 		return -ENOMEM;
9928dd28afeSJean-Philippe Brucker 
9938dd28afeSJean-Philippe Brucker 	return 0;
9948dd28afeSJean-Philippe Brucker }
9958dd28afeSJean-Philippe Brucker 
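/*
 * Reserve guest I/O port or MMIO space for one BAR, based on its VFIO region
 * info. BARs containing the MSI-X table or PBA reuse the addresses chosen by
 * vfio_pci_create_msix_table().
 */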
9966078a454SJean-Philippe Brucker static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
9976078a454SJean-Philippe Brucker 				  size_t nr)
9986078a454SJean-Philippe Brucker {
9996078a454SJean-Philippe Brucker 	int ret;
100082caa882SJean-Philippe Brucker 	u32 bar;
10016078a454SJean-Philippe Brucker 	size_t map_size;
1002c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
10033665392aSAlexandru Elisei 	struct vfio_region *region;
10046078a454SJean-Philippe Brucker 
10056078a454SJean-Philippe Brucker 	if (nr >= vdev->info.num_regions)
10066078a454SJean-Philippe Brucker 		return 0;
10076078a454SJean-Philippe Brucker 
10083665392aSAlexandru Elisei 	region = &vdev->regions[nr];
100982caa882SJean-Philippe Brucker 	bar = pdev->hdr.bar[nr];
101082caa882SJean-Philippe Brucker 
101182caa882SJean-Philippe Brucker 	region->vdev = vdev;
101282caa882SJean-Philippe Brucker 	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);
10136078a454SJean-Philippe Brucker 
1014ed01a603SAlexandru Elisei 	ret = vfio_pci_get_region_info(vdev, nr, &region->info);
1015ed01a603SAlexandru Elisei 	if (ret)
10166078a454SJean-Philippe Brucker 		return ret;
10176078a454SJean-Philippe Brucker 
10186078a454SJean-Philippe Brucker 	/* Ignore invalid or unimplemented regions */
10196078a454SJean-Philippe Brucker 	if (!region->info.size)
10206078a454SJean-Philippe Brucker 		return 0;
10216078a454SJean-Philippe Brucker 
1022c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1023c9888d95SJean-Philippe Brucker 		/* Trap and emulate MSI-X table */
1024c9888d95SJean-Philippe Brucker 		if (nr == pdev->msix_table.bar) {
1025c9888d95SJean-Philippe Brucker 			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
1026c9888d95SJean-Philippe Brucker 			return 0;
1027c9888d95SJean-Philippe Brucker 		} else if (nr == pdev->msix_pba.bar) {
1028c9888d95SJean-Philippe Brucker 			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
1029c9888d95SJean-Philippe Brucker 			return 0;
1030c9888d95SJean-Philippe Brucker 		}
1031c9888d95SJean-Philippe Brucker 	}
1032c9888d95SJean-Philippe Brucker 
1033a05e576fSAlexandru Elisei 	if (region->is_ioport) {
1034a05e576fSAlexandru Elisei 		region->port_base = pci_get_io_port_block(region->info.size);
1035a05e576fSAlexandru Elisei 	} else {
10366078a454SJean-Philippe Brucker 		/* Grab some MMIO space in the guest */
10376078a454SJean-Philippe Brucker 		map_size = ALIGN(region->info.size, PAGE_SIZE);
1038854aa2efSJulien Thierry 		region->guest_phys_addr = pci_get_mmio_block(map_size);
103982caa882SJean-Philippe Brucker 	}
10406078a454SJean-Philippe Brucker 
10416078a454SJean-Philippe Brucker 	return 0;
10426078a454SJean-Philippe Brucker }
10436078a454SJean-Philippe Brucker 
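/*
 * Parse the configuration space, set up MSI and MSI-X bookkeeping, assign
 * guest addresses to the BARs (skipping the upper half of 64-bit BARs), then
 * register the fixed-up configuration space and BAR callbacks.
 */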
10446078a454SJean-Philippe Brucker static int vfio_pci_configure_dev_regions(struct kvm *kvm,
10456078a454SJean-Philippe Brucker 					  struct vfio_device *vdev)
10466078a454SJean-Philippe Brucker {
10476078a454SJean-Philippe Brucker 	int ret;
10486078a454SJean-Philippe Brucker 	u32 bar;
10496078a454SJean-Philippe Brucker 	size_t i;
10506078a454SJean-Philippe Brucker 	bool is_64bit = false;
10516078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
10526078a454SJean-Philippe Brucker 
10536078a454SJean-Philippe Brucker 	ret = vfio_pci_parse_cfg_space(vdev);
10546078a454SJean-Philippe Brucker 	if (ret)
10556078a454SJean-Philippe Brucker 		return ret;
10566078a454SJean-Philippe Brucker 
1057c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1058ed01a603SAlexandru Elisei 		ret = vfio_pci_create_msix_table(kvm, vdev);
1059c9888d95SJean-Philippe Brucker 		if (ret)
1060c9888d95SJean-Philippe Brucker 			return ret;
1061c9888d95SJean-Philippe Brucker 	}
1062c9888d95SJean-Philippe Brucker 
10638dd28afeSJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
10648dd28afeSJean-Philippe Brucker 		ret = vfio_pci_create_msi_cap(kvm, pdev);
10658dd28afeSJean-Philippe Brucker 		if (ret)
10668dd28afeSJean-Philippe Brucker 			return ret;
10678dd28afeSJean-Philippe Brucker 	}
10688dd28afeSJean-Philippe Brucker 
10696078a454SJean-Philippe Brucker 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
10706078a454SJean-Philippe Brucker 		/* Ignore top half of 64-bit BAR */
107184998f21SAlexandru Elisei 		if (is_64bit) {
107284998f21SAlexandru Elisei 			is_64bit = false;
10736078a454SJean-Philippe Brucker 			continue;
107484998f21SAlexandru Elisei 		}
10756078a454SJean-Philippe Brucker 
10766078a454SJean-Philippe Brucker 		ret = vfio_pci_configure_bar(kvm, vdev, i);
10776078a454SJean-Philippe Brucker 		if (ret)
10786078a454SJean-Philippe Brucker 			return ret;
10796078a454SJean-Philippe Brucker 
10806078a454SJean-Philippe Brucker 		bar = pdev->hdr.bar[i];
10816078a454SJean-Philippe Brucker 		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
10826078a454SJean-Philippe Brucker 			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
10836078a454SJean-Philippe Brucker 			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
10846078a454SJean-Philippe Brucker 	}
10856078a454SJean-Philippe Brucker 
10866078a454SJean-Philippe Brucker 	/* We've configured the BARs, fake up a Configuration Space */
10875a8e4f25SAlexandru Elisei 	ret = vfio_pci_fixup_cfg_space(vdev);
10885a8e4f25SAlexandru Elisei 	if (ret)
10895a8e4f25SAlexandru Elisei 		return ret;
10905a8e4f25SAlexandru Elisei 
10915a8e4f25SAlexandru Elisei 	return pci__register_bar_regions(kvm, &pdev->hdr, vfio_pci_bar_activate,
10925a8e4f25SAlexandru Elisei 					 vfio_pci_bar_deactivate, vdev);
10936078a454SJean-Philippe Brucker }
10946078a454SJean-Philippe Brucker 
1095c9888d95SJean-Philippe Brucker /*
1096c9888d95SJean-Philippe Brucker  * Attempt to update the FD limit if opening an eventfd for each IRQ vector
1097c9888d95SJean-Philippe Brucker  * would hit it, which is likely to happen when a device uses 2048 MSIs.
1098c9888d95SJean-Philippe Brucker  */
1099c9888d95SJean-Philippe Brucker static int vfio_pci_reserve_irq_fds(size_t num)
1100c9888d95SJean-Philippe Brucker {
1101c9888d95SJean-Philippe Brucker 	/*
1102c9888d95SJean-Philippe Brucker 	 * I counted around 27 fds under normal load. Let's add 100 for good
1103c9888d95SJean-Philippe Brucker 	 * measure.
1104c9888d95SJean-Philippe Brucker 	 */
1105c9888d95SJean-Philippe Brucker 	static size_t needed = 128;
1106c9888d95SJean-Philippe Brucker 	struct rlimit fd_limit, new_limit;
1107c9888d95SJean-Philippe Brucker 
1108c9888d95SJean-Philippe Brucker 	needed += num;
1109c9888d95SJean-Philippe Brucker 
1110c9888d95SJean-Philippe Brucker 	if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
1111c9888d95SJean-Philippe Brucker 		perror("getrlimit(RLIMIT_NOFILE)");
1112c9888d95SJean-Philippe Brucker 		return 0;
1113c9888d95SJean-Philippe Brucker 	}
1114c9888d95SJean-Philippe Brucker 
1115c9888d95SJean-Philippe Brucker 	if (fd_limit.rlim_cur >= needed)
1116c9888d95SJean-Philippe Brucker 		return 0;
1117c9888d95SJean-Philippe Brucker 
1118c9888d95SJean-Philippe Brucker 	new_limit.rlim_cur = needed;
1119c9888d95SJean-Philippe Brucker 
1120c9888d95SJean-Philippe Brucker 	if (fd_limit.rlim_max < needed)
1121c9888d95SJean-Philippe Brucker 		/* Try to bump hard limit (root only) */
1122c9888d95SJean-Philippe Brucker 		new_limit.rlim_max = needed;
1123c9888d95SJean-Philippe Brucker 	else
1124c9888d95SJean-Philippe Brucker 		new_limit.rlim_max = fd_limit.rlim_max;
1125c9888d95SJean-Philippe Brucker 
1126c9888d95SJean-Philippe Brucker 	if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
1127c9888d95SJean-Philippe Brucker 		perror("setrlimit(RLIMIT_NOFILE)");
1128c9888d95SJean-Philippe Brucker 		pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
1129c9888d95SJean-Philippe Brucker 			   (size_t)(needed - fd_limit.rlim_cur));
1130c9888d95SJean-Philippe Brucker 	}
1131c9888d95SJean-Philippe Brucker 
1132c9888d95SJean-Philippe Brucker 	return 0;
1133c9888d95SJean-Philippe Brucker }
1134c9888d95SJean-Philippe Brucker 
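/*
 * Common MSI/MSI-X setup: validate the IRQ info reported by VFIO against the
 * capability, then pre-allocate the vfio_irq_set used to hand one eventfd per
 * vector to VFIO. All vectors start out masked, with no eventfd attached.
 */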
1135c9888d95SJean-Philippe Brucker static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
1136c9888d95SJean-Philippe Brucker 			     struct vfio_pci_msi_common *msis)
1137c9888d95SJean-Philippe Brucker {
1138c9888d95SJean-Philippe Brucker 	int ret;
1139c9888d95SJean-Philippe Brucker 	size_t i;
1140c9888d95SJean-Philippe Brucker 	int *eventfds;
1141c9888d95SJean-Philippe Brucker 	size_t irq_set_size;
1142c9888d95SJean-Philippe Brucker 	struct vfio_pci_msi_entry *entry;
1143c9888d95SJean-Philippe Brucker 	size_t nr_entries = msis->nr_entries;
1144c9888d95SJean-Philippe Brucker 
1145c9888d95SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
114609533d3cSAndre Przywara 	if (ret || msis->info.count == 0) {
1147c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "no MSI reported by VFIO");
1148c9888d95SJean-Philippe Brucker 		return -ENODEV;
1149c9888d95SJean-Philippe Brucker 	}
1150c9888d95SJean-Philippe Brucker 
1151c9888d95SJean-Philippe Brucker 	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
1152c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
1153c9888d95SJean-Philippe Brucker 		return -EINVAL;
1154c9888d95SJean-Philippe Brucker 	}
1155c9888d95SJean-Philippe Brucker 
1156c9888d95SJean-Philippe Brucker 	if (msis->info.count != nr_entries) {
1157c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
1158c9888d95SJean-Philippe Brucker 		return -EINVAL;
1159c9888d95SJean-Philippe Brucker 	}
1160c9888d95SJean-Philippe Brucker 
1161c9888d95SJean-Philippe Brucker 	mutex_init(&msis->mutex);
1162c9888d95SJean-Philippe Brucker 
1163c9888d95SJean-Philippe Brucker 	vfio_pci_reserve_irq_fds(nr_entries);
1164c9888d95SJean-Philippe Brucker 
1165c9888d95SJean-Philippe Brucker 	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
1166c9888d95SJean-Philippe Brucker 	msis->irq_set = malloc(irq_set_size);
1167c9888d95SJean-Philippe Brucker 	if (!msis->irq_set)
1168c9888d95SJean-Philippe Brucker 		return -ENOMEM;
1169c9888d95SJean-Philippe Brucker 
1170c9888d95SJean-Philippe Brucker 	*msis->irq_set = (struct vfio_irq_set) {
1171c9888d95SJean-Philippe Brucker 		.argsz	= irq_set_size,
1172c9888d95SJean-Philippe Brucker 		.flags 	= VFIO_IRQ_SET_DATA_EVENTFD |
1173c9888d95SJean-Philippe Brucker 			  VFIO_IRQ_SET_ACTION_TRIGGER,
1174c9888d95SJean-Philippe Brucker 		.index 	= msis->info.index,
1175c9888d95SJean-Philippe Brucker 		.start 	= 0,
1176c9888d95SJean-Philippe Brucker 		.count 	= nr_entries,
1177c9888d95SJean-Philippe Brucker 	};
1178c9888d95SJean-Philippe Brucker 
1179c9888d95SJean-Philippe Brucker 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
1180c9888d95SJean-Philippe Brucker 
1181c9888d95SJean-Philippe Brucker 	for (i = 0; i < nr_entries; i++) {
1182c9888d95SJean-Philippe Brucker 		entry = &msis->entries[i];
1183c9888d95SJean-Philippe Brucker 		entry->gsi = -1;
1184c9888d95SJean-Philippe Brucker 		entry->eventfd = -1;
1185c9888d95SJean-Philippe Brucker 		msi_set_masked(entry->virt_state, true);
1186c9888d95SJean-Philippe Brucker 		msi_set_masked(entry->phys_state, true);
1187c9888d95SJean-Philippe Brucker 		eventfds[i] = -1;
1188c9888d95SJean-Philippe Brucker 	}
1189c9888d95SJean-Philippe Brucker 
1190c9888d95SJean-Philippe Brucker 	return 0;
1191c9888d95SJean-Philippe Brucker }
1192c9888d95SJean-Philippe Brucker 
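/*
 * Detach the INTx trigger from VFIO and from the irqfd route, then close both
 * eventfds. Does nothing if INTx is already disabled.
 */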
1193c9888d95SJean-Philippe Brucker static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
1194c9888d95SJean-Philippe Brucker {
1195c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
1196c9888d95SJean-Philippe Brucker 	int gsi = pdev->intx_gsi;
1197c9888d95SJean-Philippe Brucker 	struct vfio_irq_set irq_set = {
1198c9888d95SJean-Philippe Brucker 		.argsz	= sizeof(irq_set),
1199c9888d95SJean-Philippe Brucker 		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
1200c9888d95SJean-Philippe Brucker 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1201c9888d95SJean-Philippe Brucker 	};
1202c9888d95SJean-Philippe Brucker 
12037302327aSLeo Yan 	if (pdev->intx_fd == -1)
12047302327aSLeo Yan 		return;
12057302327aSLeo Yan 
1206c9888d95SJean-Philippe Brucker 	pr_debug("user requested MSI, disabling INTx %d", gsi);
1207c9888d95SJean-Philippe Brucker 
1208c9888d95SJean-Philippe Brucker 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
1209c9888d95SJean-Philippe Brucker 	irq__del_irqfd(kvm, gsi, pdev->intx_fd);
1210c9888d95SJean-Philippe Brucker 
1211c9888d95SJean-Philippe Brucker 	close(pdev->intx_fd);
1212a1ff6f87SLeo Yan 	close(pdev->unmask_fd);
12137302327aSLeo Yan 	pdev->intx_fd = -1;
1214c9888d95SJean-Philippe Brucker }
1215c9888d95SJean-Philippe Brucker 
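/* Wire up INTx signalling; a no-op if the route is already in place. */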
12166078a454SJean-Philippe Brucker static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
12176078a454SJean-Philippe Brucker {
12186078a454SJean-Philippe Brucker 	int ret;
12196078a454SJean-Philippe Brucker 	int trigger_fd, unmask_fd;
1220a3704b91SAndre Przywara 	union vfio_irq_eventfd	trigger;
1221a3704b91SAndre Przywara 	union vfio_irq_eventfd	unmask;
12226078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
122312bd7a16SLeo Yan 	int gsi = pdev->intx_gsi;
12246078a454SJean-Philippe Brucker 
12257302327aSLeo Yan 	if (pdev->intx_fd != -1)
12267302327aSLeo Yan 		return 0;
12277302327aSLeo Yan 
12286078a454SJean-Philippe Brucker 	/*
12296078a454SJean-Philippe Brucker 	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
12306078a454SJean-Philippe Brucker 	 * signals an interrupt from host to guest, and unmask_fd signals the
12316078a454SJean-Philippe Brucker 	 * deassertion of the line from guest to host.
12326078a454SJean-Philippe Brucker 	 */
12336078a454SJean-Philippe Brucker 	trigger_fd = eventfd(0, 0);
12346078a454SJean-Philippe Brucker 	if (trigger_fd < 0) {
12356078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to create trigger eventfd");
12366078a454SJean-Philippe Brucker 		return trigger_fd;
12376078a454SJean-Philippe Brucker 	}
12386078a454SJean-Philippe Brucker 
12396078a454SJean-Philippe Brucker 	unmask_fd = eventfd(0, 0);
12406078a454SJean-Philippe Brucker 	if (unmask_fd < 0) {
12416078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to create unmask eventfd");
12426078a454SJean-Philippe Brucker 		close(trigger_fd);
12436078a454SJean-Philippe Brucker 		return unmask_fd;
12446078a454SJean-Philippe Brucker 	}
12456078a454SJean-Philippe Brucker 
12466078a454SJean-Philippe Brucker 	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
12476078a454SJean-Philippe Brucker 	if (ret)
12486078a454SJean-Philippe Brucker 		goto err_close;
12496078a454SJean-Philippe Brucker 
12506078a454SJean-Philippe Brucker 	trigger.irq = (struct vfio_irq_set) {
12516078a454SJean-Philippe Brucker 		.argsz	= sizeof(trigger),
12526078a454SJean-Philippe Brucker 		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
12536078a454SJean-Philippe Brucker 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
12546078a454SJean-Philippe Brucker 		.start	= 0,
12556078a454SJean-Philippe Brucker 		.count	= 1,
12566078a454SJean-Philippe Brucker 	};
1257a3704b91SAndre Przywara 	set_vfio_irq_eventd_payload(&trigger, trigger_fd);
12586078a454SJean-Philippe Brucker 
12596078a454SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
12606078a454SJean-Philippe Brucker 	if (ret < 0) {
12616078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
12626078a454SJean-Philippe Brucker 		goto err_delete_line;
12636078a454SJean-Philippe Brucker 	}
12646078a454SJean-Philippe Brucker 
12656078a454SJean-Philippe Brucker 	unmask.irq = (struct vfio_irq_set) {
12666078a454SJean-Philippe Brucker 		.argsz	= sizeof(unmask),
12676078a454SJean-Philippe Brucker 		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
12686078a454SJean-Philippe Brucker 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
12696078a454SJean-Philippe Brucker 		.start	= 0,
12706078a454SJean-Philippe Brucker 		.count	= 1,
12716078a454SJean-Philippe Brucker 	};
1272a3704b91SAndre Przywara 	set_vfio_irq_eventd_payload(&unmask, unmask_fd);
12736078a454SJean-Philippe Brucker 
12746078a454SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
12756078a454SJean-Philippe Brucker 	if (ret < 0) {
12766078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to setup unmask IRQ");
12776078a454SJean-Philippe Brucker 		goto err_remove_event;
12786078a454SJean-Philippe Brucker 	}
12796078a454SJean-Philippe Brucker 
1280c9888d95SJean-Philippe Brucker 	pdev->intx_fd = trigger_fd;
1281a1ff6f87SLeo Yan 	pdev->unmask_fd = unmask_fd;
1282c9888d95SJean-Philippe Brucker 
12836078a454SJean-Philippe Brucker 	return 0;
12846078a454SJean-Philippe Brucker 
12856078a454SJean-Philippe Brucker err_remove_event:
12866078a454SJean-Philippe Brucker 	/* Remove trigger event */
12876078a454SJean-Philippe Brucker 	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
12886078a454SJean-Philippe Brucker 	trigger.irq.count = 0;
12896078a454SJean-Philippe Brucker 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
12906078a454SJean-Philippe Brucker 
12916078a454SJean-Philippe Brucker err_delete_line:
12926078a454SJean-Philippe Brucker 	irq__del_irqfd(kvm, gsi, trigger_fd);
12936078a454SJean-Philippe Brucker 
12946078a454SJean-Philippe Brucker err_close:
12956078a454SJean-Philippe Brucker 	close(trigger_fd);
12966078a454SJean-Philippe Brucker 	close(unmask_fd);
12976078a454SJean-Philippe Brucker 	return ret;
12986078a454SJean-Philippe Brucker }
12996078a454SJean-Philippe Brucker 
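/*
 * Check that VFIO exposes an eventfd-capable, automasked INTx interrupt and
 * record the GSI derived from the assigned irq_line before the guest can
 * change it.
 */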
130012bd7a16SLeo Yan static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
130112bd7a16SLeo Yan {
130212bd7a16SLeo Yan 	int ret;
130312bd7a16SLeo Yan 	struct vfio_pci_device *pdev = &vdev->pci;
130412bd7a16SLeo Yan 	struct vfio_irq_info irq_info = {
130512bd7a16SLeo Yan 		.argsz = sizeof(irq_info),
130612bd7a16SLeo Yan 		.index = VFIO_PCI_INTX_IRQ_INDEX,
130712bd7a16SLeo Yan 	};
130812bd7a16SLeo Yan 
130912bd7a16SLeo Yan 	vfio_pci_reserve_irq_fds(2);
131012bd7a16SLeo Yan 
131112bd7a16SLeo Yan 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
131212bd7a16SLeo Yan 	if (ret || irq_info.count == 0) {
131312bd7a16SLeo Yan 		vfio_dev_err(vdev, "no INTx reported by VFIO");
131412bd7a16SLeo Yan 		return -ENODEV;
131512bd7a16SLeo Yan 	}
131612bd7a16SLeo Yan 
131712bd7a16SLeo Yan 	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
131812bd7a16SLeo Yan 		vfio_dev_err(vdev, "interrupt not eventfd capable");
131912bd7a16SLeo Yan 		return -EINVAL;
132012bd7a16SLeo Yan 	}
132112bd7a16SLeo Yan 
132212bd7a16SLeo Yan 	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
132312bd7a16SLeo Yan 		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
132412bd7a16SLeo Yan 		return -EINVAL;
132512bd7a16SLeo Yan 	}
132612bd7a16SLeo Yan 
132712bd7a16SLeo Yan 	/* Guest is going to overwrite our irq_line... */
132812bd7a16SLeo Yan 	pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
132912bd7a16SLeo Yan 
13307302327aSLeo Yan 	pdev->intx_fd = -1;
13317302327aSLeo Yan 
133212bd7a16SLeo Yan 	return 0;
133312bd7a16SLeo Yan }
133412bd7a16SLeo Yan 
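/*
 * Set up every interrupt mode the device advertises: MSI-X and MSI
 * bookkeeping is prepared here, while legacy INTx is both initialised and
 * enabled right away.
 */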
13356078a454SJean-Philippe Brucker static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
13366078a454SJean-Philippe Brucker {
1337c9888d95SJean-Philippe Brucker 	int ret = 0;
13386078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
13396078a454SJean-Philippe Brucker 
1340c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1341c9888d95SJean-Philippe Brucker 		pdev->msix.info = (struct vfio_irq_info) {
1342c9888d95SJean-Philippe Brucker 			.argsz = sizeof(pdev->msix.info),
1343c9888d95SJean-Philippe Brucker 			.index = VFIO_PCI_MSIX_IRQ_INDEX,
13446078a454SJean-Philippe Brucker 		};
1345c9888d95SJean-Philippe Brucker 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
1346c9888d95SJean-Philippe Brucker 		if (ret)
1347c9888d95SJean-Philippe Brucker 			return ret;
13486078a454SJean-Philippe Brucker 	}
13496078a454SJean-Philippe Brucker 
13508dd28afeSJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
13518dd28afeSJean-Philippe Brucker 		pdev->msi.info = (struct vfio_irq_info) {
13528dd28afeSJean-Philippe Brucker 			.argsz = sizeof(pdev->msi.info),
13538dd28afeSJean-Philippe Brucker 			.index = VFIO_PCI_MSI_IRQ_INDEX,
13548dd28afeSJean-Philippe Brucker 		};
13558dd28afeSJean-Philippe Brucker 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
13568dd28afeSJean-Philippe Brucker 		if (ret)
13578dd28afeSJean-Philippe Brucker 			return ret;
13588dd28afeSJean-Philippe Brucker 	}
13598dd28afeSJean-Philippe Brucker 
136012bd7a16SLeo Yan 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
1361c0c45eedSAndre Przywara 		pci__assign_irq(&vdev->pci.hdr);
1362c0c45eedSAndre Przywara 
136312bd7a16SLeo Yan 		ret = vfio_pci_init_intx(kvm, vdev);
136412bd7a16SLeo Yan 		if (ret)
136512bd7a16SLeo Yan 			return ret;
136612bd7a16SLeo Yan 
1367c9888d95SJean-Philippe Brucker 		ret = vfio_pci_enable_intx(kvm, vdev);
136812bd7a16SLeo Yan 	}
1369c9888d95SJean-Philippe Brucker 
1370c9888d95SJean-Philippe Brucker 	return ret;
13716078a454SJean-Philippe Brucker }
13726078a454SJean-Philippe Brucker 
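/*
 * Entry point for a passed-through PCI device: configure its regions,
 * register it on the emulated PCI bus and set up its interrupts.
 */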
13736078a454SJean-Philippe Brucker int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
13746078a454SJean-Philippe Brucker {
13756078a454SJean-Philippe Brucker 	int ret;
13766078a454SJean-Philippe Brucker 
13776078a454SJean-Philippe Brucker 	ret = vfio_pci_configure_dev_regions(kvm, vdev);
13786078a454SJean-Philippe Brucker 	if (ret) {
13796078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to configure regions");
13806078a454SJean-Philippe Brucker 		return ret;
13816078a454SJean-Philippe Brucker 	}
13826078a454SJean-Philippe Brucker 
13836078a454SJean-Philippe Brucker 	vdev->dev_hdr = (struct device_header) {
13846078a454SJean-Philippe Brucker 		.bus_type	= DEVICE_BUS_PCI,
13856078a454SJean-Philippe Brucker 		.data		= &vdev->pci.hdr,
13866078a454SJean-Philippe Brucker 	};
13876078a454SJean-Philippe Brucker 
13886078a454SJean-Philippe Brucker 	ret = device__register(&vdev->dev_hdr);
13896078a454SJean-Philippe Brucker 	if (ret) {
13906078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to register VFIO device");
13916078a454SJean-Philippe Brucker 		return ret;
13926078a454SJean-Philippe Brucker 	}
13936078a454SJean-Philippe Brucker 
13946078a454SJean-Philippe Brucker 	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
13956078a454SJean-Philippe Brucker 	if (ret) {
13966078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to configure IRQs");
13976078a454SJean-Philippe Brucker 		return ret;
13986078a454SJean-Philippe Brucker 	}
13996078a454SJean-Philippe Brucker 
14006078a454SJean-Philippe Brucker 	return 0;
14016078a454SJean-Philippe Brucker }
14026078a454SJean-Philippe Brucker 
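/*
 * Undo vfio_pci_setup_device(): unmap the regions, unregister the device and
 * free the MSI and MSI-X bookkeeping.
 */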
14036078a454SJean-Philippe Brucker void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
14046078a454SJean-Philippe Brucker {
14056078a454SJean-Philippe Brucker 	size_t i;
1406c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
14076078a454SJean-Philippe Brucker 
14086078a454SJean-Philippe Brucker 	for (i = 0; i < vdev->info.num_regions; i++)
14096078a454SJean-Philippe Brucker 		vfio_unmap_region(kvm, &vdev->regions[i]);
14106078a454SJean-Philippe Brucker 
14116078a454SJean-Philippe Brucker 	device__unregister(&vdev->dev_hdr);
1412c9888d95SJean-Philippe Brucker 
1413c9888d95SJean-Philippe Brucker 	free(pdev->msix.irq_set);
1414c9888d95SJean-Philippe Brucker 	free(pdev->msix.entries);
14158dd28afeSJean-Philippe Brucker 	free(pdev->msi.irq_set);
14168dd28afeSJean-Philippe Brucker 	free(pdev->msi.entries);
14176078a454SJean-Philippe Brucker }
1418