xref: /kvmtool/vfio/pci.c (revision e1d0285c89ae5c7d0d3b1fdede92d2cf9d12bb01)
16078a454SJean-Philippe Brucker #include "kvm/irq.h"
26078a454SJean-Philippe Brucker #include "kvm/kvm.h"
36078a454SJean-Philippe Brucker #include "kvm/kvm-cpu.h"
46078a454SJean-Philippe Brucker #include "kvm/vfio.h"
56078a454SJean-Philippe Brucker 
6*e1d0285cSAlexandru Elisei #include <assert.h>
7*e1d0285cSAlexandru Elisei 
86078a454SJean-Philippe Brucker #include <sys/ioctl.h>
96078a454SJean-Philippe Brucker #include <sys/eventfd.h>
10c9888d95SJean-Philippe Brucker #include <sys/resource.h>
11c9888d95SJean-Philippe Brucker #include <sys/time.h>
126078a454SJean-Philippe Brucker 
136078a454SJean-Philippe Brucker /* Wrapper around UAPI vfio_irq_set */
14a3704b91SAndre Przywara union vfio_irq_eventfd {
156078a454SJean-Philippe Brucker 	struct vfio_irq_set	irq;
16a3704b91SAndre Przywara 	u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)];
176078a454SJean-Philippe Brucker };
186078a454SJean-Philippe Brucker 
19a3704b91SAndre Przywara static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd)
20a3704b91SAndre Przywara {
21a3704b91SAndre Przywara 	memcpy(&evfd->irq.data, &fd, sizeof(fd));
22a3704b91SAndre Przywara }
23a3704b91SAndre Przywara 
/*
 * Accessors for the MSI state bitfields (virt_state / phys_state).
 *
 * The msi_is_*() tests evaluate to the raw masked bit rather than to 0/1, so
 * normalise with !! before comparing the result against a bool.
 */
#define msi_is_enabled(state)		((state) & VFIO_PCI_MSI_STATE_ENABLED)
#define msi_is_masked(state)		((state) & VFIO_PCI_MSI_STATE_MASKED)
#define msi_is_empty(state)		((state) & VFIO_PCI_MSI_STATE_EMPTY)

/*
 * Set @bit in @state when @val is non-zero, clear it otherwise.
 *
 * This expands to a single parenthesised expression without a trailing
 * semicolon, so it behaves like an ordinary statement once the caller adds
 * the ';' (including in an unbraced if/else). @bit is parenthesised so that
 * an argument such as (1 << n) is complemented as a whole. @state is
 * evaluated more than once and must therefore be free of side effects.
 */
#define msi_update_state(state, val, bit)				\
	((state) = (val) ? ((state) | (bit)) : ((state) & ~(bit)))
#define msi_set_enabled(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
#define msi_set_masked(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
#define msi_set_empty(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)
36c9888d95SJean-Philippe Brucker 
/*
 * Forward declarations: the MSI enable/disable paths below switch INTx off
 * and back on, but the INTx helpers are defined further down in this file.
 */
static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);
static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
39c9888d95SJean-Philippe Brucker 
408dd28afeSJean-Philippe Brucker static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
418dd28afeSJean-Philippe Brucker 				bool msix)
42c9888d95SJean-Philippe Brucker {
43c9888d95SJean-Philippe Brucker 	size_t i;
44c9888d95SJean-Philippe Brucker 	int ret = 0;
45c9888d95SJean-Philippe Brucker 	int *eventfds;
46c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
478dd28afeSJean-Philippe Brucker 	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
48a3704b91SAndre Przywara 	union vfio_irq_eventfd single = {
49c9888d95SJean-Philippe Brucker 		.irq = {
50c9888d95SJean-Philippe Brucker 			.argsz	= sizeof(single),
51c9888d95SJean-Philippe Brucker 			.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
52c9888d95SJean-Philippe Brucker 				  VFIO_IRQ_SET_ACTION_TRIGGER,
53c9888d95SJean-Philippe Brucker 			.index	= msis->info.index,
54c9888d95SJean-Philippe Brucker 			.count	= 1,
55c9888d95SJean-Philippe Brucker 		},
56c9888d95SJean-Philippe Brucker 	};
57c9888d95SJean-Philippe Brucker 
58c9888d95SJean-Philippe Brucker 	if (!msi_is_enabled(msis->virt_state))
59c9888d95SJean-Philippe Brucker 		return 0;
60c9888d95SJean-Philippe Brucker 
617302327aSLeo Yan 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
62c9888d95SJean-Philippe Brucker 		/*
63c9888d95SJean-Philippe Brucker 		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
64c9888d95SJean-Philippe Brucker 		 * time. Since INTx has to be enabled from the start (we don't
657302327aSLeo Yan 		 * have a reliable way to know when the guest starts using it),
66c9888d95SJean-Philippe Brucker 		 * disable it now.
67c9888d95SJean-Philippe Brucker 		 */
68c9888d95SJean-Philippe Brucker 		vfio_pci_disable_intx(kvm, vdev);
69c9888d95SJean-Philippe Brucker 
70c9888d95SJean-Philippe Brucker 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
71c9888d95SJean-Philippe Brucker 
72c9888d95SJean-Philippe Brucker 	/*
73c9888d95SJean-Philippe Brucker 	 * Initial registration of the full range. This enables the physical
74c9888d95SJean-Philippe Brucker 	 * MSI/MSI-X capability, which might have desired side effects. For
75c9888d95SJean-Philippe Brucker 	 * instance when assigning virtio legacy devices, enabling the MSI
76c9888d95SJean-Philippe Brucker 	 * capability modifies the config space layout!
77c9888d95SJean-Philippe Brucker 	 *
78c9888d95SJean-Philippe Brucker 	 * As an optimization, only update MSIs when guest unmasks the
79c9888d95SJean-Philippe Brucker 	 * capability. This greatly reduces the initialization time for Linux
80c9888d95SJean-Philippe Brucker 	 * guest with 2048+ MSIs. Linux guest starts by enabling the MSI-X cap
81c9888d95SJean-Philippe Brucker 	 * masked, then fills individual vectors, then unmasks the whole
82c9888d95SJean-Philippe Brucker 	 * function. So we only do one VFIO ioctl when enabling for the first
83c9888d95SJean-Philippe Brucker 	 * time, and then one when unmasking.
84c9888d95SJean-Philippe Brucker 	 *
85c9888d95SJean-Philippe Brucker 	 * phys_state is empty when it is enabled but no vector has been
86c9888d95SJean-Philippe Brucker 	 * registered via SET_IRQS yet.
87c9888d95SJean-Philippe Brucker 	 */
88c9888d95SJean-Philippe Brucker 	if (!msi_is_enabled(msis->phys_state) ||
89c9888d95SJean-Philippe Brucker 	    (!msi_is_masked(msis->virt_state) &&
90c9888d95SJean-Philippe Brucker 	     msi_is_empty(msis->phys_state))) {
91c9888d95SJean-Philippe Brucker 		bool empty = true;
92c9888d95SJean-Philippe Brucker 
93c9888d95SJean-Philippe Brucker 		for (i = 0; i < msis->nr_entries; i++) {
94c9888d95SJean-Philippe Brucker 			eventfds[i] = msis->entries[i].gsi >= 0 ?
95c9888d95SJean-Philippe Brucker 				      msis->entries[i].eventfd : -1;
96c9888d95SJean-Philippe Brucker 
97c9888d95SJean-Philippe Brucker 			if (eventfds[i] >= 0)
98c9888d95SJean-Philippe Brucker 				empty = false;
99c9888d95SJean-Philippe Brucker 		}
100c9888d95SJean-Philippe Brucker 
101c9888d95SJean-Philippe Brucker 		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
102c9888d95SJean-Philippe Brucker 		if (ret < 0) {
103c9888d95SJean-Philippe Brucker 			perror("VFIO_DEVICE_SET_IRQS(multi)");
104c9888d95SJean-Philippe Brucker 			return ret;
105c9888d95SJean-Philippe Brucker 		}
106c9888d95SJean-Philippe Brucker 
107c9888d95SJean-Philippe Brucker 		msi_set_enabled(msis->phys_state, true);
108c9888d95SJean-Philippe Brucker 		msi_set_empty(msis->phys_state, empty);
109c9888d95SJean-Philippe Brucker 
110c9888d95SJean-Philippe Brucker 		return 0;
111c9888d95SJean-Philippe Brucker 	}
112c9888d95SJean-Philippe Brucker 
113c9888d95SJean-Philippe Brucker 	if (msi_is_masked(msis->virt_state)) {
114c9888d95SJean-Philippe Brucker 		/* TODO: if phys_state is not empty nor masked, mask all vectors */
115c9888d95SJean-Philippe Brucker 		return 0;
116c9888d95SJean-Philippe Brucker 	}
117c9888d95SJean-Philippe Brucker 
118c9888d95SJean-Philippe Brucker 	/* Update individual vectors to avoid breaking those in use */
119c9888d95SJean-Philippe Brucker 	for (i = 0; i < msis->nr_entries; i++) {
120c9888d95SJean-Philippe Brucker 		struct vfio_pci_msi_entry *entry = &msis->entries[i];
121c9888d95SJean-Philippe Brucker 		int fd = entry->gsi >= 0 ? entry->eventfd : -1;
122c9888d95SJean-Philippe Brucker 
123c9888d95SJean-Philippe Brucker 		if (fd == eventfds[i])
124c9888d95SJean-Philippe Brucker 			continue;
125c9888d95SJean-Philippe Brucker 
126c9888d95SJean-Philippe Brucker 		single.irq.start = i;
127a3704b91SAndre Przywara 		set_vfio_irq_eventd_payload(&single, fd);
128c9888d95SJean-Philippe Brucker 
129c9888d95SJean-Philippe Brucker 		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
130c9888d95SJean-Philippe Brucker 		if (ret < 0) {
131c9888d95SJean-Philippe Brucker 			perror("VFIO_DEVICE_SET_IRQS(single)");
132c9888d95SJean-Philippe Brucker 			break;
133c9888d95SJean-Philippe Brucker 		}
134c9888d95SJean-Philippe Brucker 
135c9888d95SJean-Philippe Brucker 		eventfds[i] = fd;
136c9888d95SJean-Philippe Brucker 
137c9888d95SJean-Philippe Brucker 		if (msi_is_empty(msis->phys_state) && fd >= 0)
138c9888d95SJean-Philippe Brucker 			msi_set_empty(msis->phys_state, false);
139c9888d95SJean-Philippe Brucker 	}
140c9888d95SJean-Philippe Brucker 
141c9888d95SJean-Philippe Brucker 	return ret;
142c9888d95SJean-Philippe Brucker }
143c9888d95SJean-Philippe Brucker 
1448dd28afeSJean-Philippe Brucker static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
1458dd28afeSJean-Philippe Brucker 				 bool msix)
146c9888d95SJean-Philippe Brucker {
147c9888d95SJean-Philippe Brucker 	int ret;
148c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
1498dd28afeSJean-Philippe Brucker 	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
150c9888d95SJean-Philippe Brucker 	struct vfio_irq_set irq_set = {
151c9888d95SJean-Philippe Brucker 		.argsz	= sizeof(irq_set),
152c9888d95SJean-Philippe Brucker 		.flags 	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
153c9888d95SJean-Philippe Brucker 		.index 	= msis->info.index,
154c9888d95SJean-Philippe Brucker 		.start 	= 0,
155c9888d95SJean-Philippe Brucker 		.count	= 0,
156c9888d95SJean-Philippe Brucker 	};
157c9888d95SJean-Philippe Brucker 
158c9888d95SJean-Philippe Brucker 	if (!msi_is_enabled(msis->phys_state))
159c9888d95SJean-Philippe Brucker 		return 0;
160c9888d95SJean-Philippe Brucker 
161c9888d95SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
162c9888d95SJean-Philippe Brucker 	if (ret < 0) {
163c9888d95SJean-Philippe Brucker 		perror("VFIO_DEVICE_SET_IRQS(NONE)");
164c9888d95SJean-Philippe Brucker 		return ret;
165c9888d95SJean-Philippe Brucker 	}
166c9888d95SJean-Philippe Brucker 
167c9888d95SJean-Philippe Brucker 	msi_set_enabled(msis->phys_state, false);
168c9888d95SJean-Philippe Brucker 	msi_set_empty(msis->phys_state, true);
169c9888d95SJean-Philippe Brucker 
1707302327aSLeo Yan 	/*
1717302327aSLeo Yan 	 * When MSI or MSIX is disabled, this might be called when
1727302327aSLeo Yan 	 * PCI driver detects the MSI interrupt failure and wants to
1737302327aSLeo Yan 	 * rollback to INTx mode.  Thus enable INTx if the device
1747302327aSLeo Yan 	 * supports INTx mode in this case.
1757302327aSLeo Yan 	 */
1767302327aSLeo Yan 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
1777302327aSLeo Yan 		ret = vfio_pci_enable_intx(kvm, vdev);
1787302327aSLeo Yan 
1797302327aSLeo Yan 	return ret >= 0 ? 0 : ret;
180c9888d95SJean-Philippe Brucker }
181c9888d95SJean-Philippe Brucker 
182c9888d95SJean-Philippe Brucker static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
183c9888d95SJean-Philippe Brucker 				     struct vfio_pci_msi_entry *entry)
184c9888d95SJean-Philippe Brucker {
185c9888d95SJean-Philippe Brucker 	int ret;
186c9888d95SJean-Philippe Brucker 
187c9888d95SJean-Philippe Brucker 	if (entry->eventfd < 0) {
188c9888d95SJean-Philippe Brucker 		entry->eventfd = eventfd(0, 0);
189c9888d95SJean-Philippe Brucker 		if (entry->eventfd < 0) {
190c9888d95SJean-Philippe Brucker 			ret = -errno;
191c9888d95SJean-Philippe Brucker 			vfio_dev_err(vdev, "cannot create eventfd");
192c9888d95SJean-Philippe Brucker 			return ret;
193c9888d95SJean-Philippe Brucker 		}
194c9888d95SJean-Philippe Brucker 	}
195c9888d95SJean-Philippe Brucker 
196c9888d95SJean-Philippe Brucker 	/* Allocate IRQ if necessary */
197c9888d95SJean-Philippe Brucker 	if (entry->gsi < 0) {
198c9888d95SJean-Philippe Brucker 		int ret = irq__add_msix_route(kvm, &entry->config.msg,
199c9888d95SJean-Philippe Brucker 					      vdev->dev_hdr.dev_num << 3);
200c9888d95SJean-Philippe Brucker 		if (ret < 0) {
201c9888d95SJean-Philippe Brucker 			vfio_dev_err(vdev, "cannot create MSI-X route");
202c9888d95SJean-Philippe Brucker 			return ret;
203c9888d95SJean-Philippe Brucker 		}
204c9888d95SJean-Philippe Brucker 		entry->gsi = ret;
205c9888d95SJean-Philippe Brucker 	} else {
206c9888d95SJean-Philippe Brucker 		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
207c9888d95SJean-Philippe Brucker 	}
208c9888d95SJean-Philippe Brucker 
209c9888d95SJean-Philippe Brucker 	/*
210c9888d95SJean-Philippe Brucker 	 * MSI masking is unimplemented in VFIO, so we have to handle it by
211c9888d95SJean-Philippe Brucker 	 * disabling/enabling IRQ route instead. We do it on the KVM side rather
212c9888d95SJean-Philippe Brucker 	 * than VFIO, because:
213c9888d95SJean-Philippe Brucker 	 * - it is 8x faster
214c9888d95SJean-Philippe Brucker 	 * - it allows to decouple masking logic from capability state.
215c9888d95SJean-Philippe Brucker 	 * - in masked state, after removing irqfd route, we could easily plug
216c9888d95SJean-Philippe Brucker 	 *   the eventfd in a local handler, in order to serve Pending Bit reads
217c9888d95SJean-Philippe Brucker 	 *   to the guest.
218c9888d95SJean-Philippe Brucker 	 *
219c9888d95SJean-Philippe Brucker 	 * So entry->phys_state is masked when there is no active irqfd route.
220c9888d95SJean-Philippe Brucker 	 */
221c9888d95SJean-Philippe Brucker 	if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
222c9888d95SJean-Philippe Brucker 		return 0;
223c9888d95SJean-Philippe Brucker 
224c9888d95SJean-Philippe Brucker 	if (msi_is_masked(entry->phys_state)) {
225c9888d95SJean-Philippe Brucker 		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
226c9888d95SJean-Philippe Brucker 		if (ret < 0) {
227c9888d95SJean-Philippe Brucker 			vfio_dev_err(vdev, "cannot setup irqfd");
228c9888d95SJean-Philippe Brucker 			return ret;
229c9888d95SJean-Philippe Brucker 		}
230c9888d95SJean-Philippe Brucker 	} else {
231c9888d95SJean-Philippe Brucker 		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
232c9888d95SJean-Philippe Brucker 	}
233c9888d95SJean-Philippe Brucker 
234c9888d95SJean-Philippe Brucker 	msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));
235c9888d95SJean-Philippe Brucker 
236c9888d95SJean-Philippe Brucker 	return 0;
237c9888d95SJean-Philippe Brucker }
238c9888d95SJean-Philippe Brucker 
239c9888d95SJean-Philippe Brucker static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
240c9888d95SJean-Philippe Brucker 				     u32 len, u8 is_write, void *ptr)
241c9888d95SJean-Philippe Brucker {
242c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = ptr;
243c9888d95SJean-Philippe Brucker 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
244c9888d95SJean-Philippe Brucker 	u64 offset = addr - pba->guest_phys_addr;
245c9888d95SJean-Philippe Brucker 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
246c9888d95SJean-Philippe Brucker 
247c9888d95SJean-Philippe Brucker 	if (is_write)
248c9888d95SJean-Philippe Brucker 		return;
249c9888d95SJean-Philippe Brucker 
250c9888d95SJean-Philippe Brucker 	/*
251c9888d95SJean-Philippe Brucker 	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA
252c9888d95SJean-Philippe Brucker 	 * is completely useless here. Note that Linux doesn't use PBA.
253c9888d95SJean-Philippe Brucker 	 */
254c9888d95SJean-Philippe Brucker 	if (pread(vdev->fd, data, len, pba->offset + offset) != (ssize_t)len)
255c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
256c9888d95SJean-Philippe Brucker }
257c9888d95SJean-Philippe Brucker 
258c9888d95SJean-Philippe Brucker static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
259c9888d95SJean-Philippe Brucker 				       u32 len, u8 is_write, void *ptr)
260c9888d95SJean-Philippe Brucker {
261c9888d95SJean-Philippe Brucker 	struct kvm *kvm = vcpu->kvm;
262c9888d95SJean-Philippe Brucker 	struct vfio_pci_msi_entry *entry;
263c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = ptr;
264c9888d95SJean-Philippe Brucker 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
265c9888d95SJean-Philippe Brucker 
266c9888d95SJean-Philippe Brucker 	u64 offset = addr - pdev->msix_table.guest_phys_addr;
267c9888d95SJean-Philippe Brucker 
268c9888d95SJean-Philippe Brucker 	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
269c9888d95SJean-Philippe Brucker 	off_t field = offset % PCI_MSIX_ENTRY_SIZE;
270c9888d95SJean-Philippe Brucker 
271c9888d95SJean-Philippe Brucker 	/*
272c9888d95SJean-Philippe Brucker 	 * PCI spec says that software must use aligned 4 or 8 bytes accesses
273c9888d95SJean-Philippe Brucker 	 * for the MSI-X tables.
274c9888d95SJean-Philippe Brucker 	 */
275c9888d95SJean-Philippe Brucker 	if ((len != 4 && len != 8) || addr & (len - 1)) {
276c9888d95SJean-Philippe Brucker 		vfio_dev_warn(vdev, "invalid MSI-X table access");
277c9888d95SJean-Philippe Brucker 		return;
278c9888d95SJean-Philippe Brucker 	}
279c9888d95SJean-Philippe Brucker 
280c9888d95SJean-Philippe Brucker 	entry = &pdev->msix.entries[vector];
281c9888d95SJean-Philippe Brucker 
282c9888d95SJean-Philippe Brucker 	mutex_lock(&pdev->msix.mutex);
283c9888d95SJean-Philippe Brucker 
284c9888d95SJean-Philippe Brucker 	if (!is_write) {
285c9888d95SJean-Philippe Brucker 		memcpy(data, (void *)&entry->config + field, len);
286c9888d95SJean-Philippe Brucker 		goto out_unlock;
287c9888d95SJean-Philippe Brucker 	}
288c9888d95SJean-Philippe Brucker 
289c9888d95SJean-Philippe Brucker 	memcpy((void *)&entry->config + field, data, len);
290c9888d95SJean-Philippe Brucker 
291c9888d95SJean-Philippe Brucker 	/*
292c9888d95SJean-Philippe Brucker 	 * Check if access touched the vector control register, which is at the
293c9888d95SJean-Philippe Brucker 	 * end of the MSI-X entry.
294c9888d95SJean-Philippe Brucker 	 */
295c9888d95SJean-Philippe Brucker 	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
296c9888d95SJean-Philippe Brucker 		goto out_unlock;
297c9888d95SJean-Philippe Brucker 
298c9888d95SJean-Philippe Brucker 	msi_set_masked(entry->virt_state, entry->config.ctrl &
299c9888d95SJean-Philippe Brucker 		       PCI_MSIX_ENTRY_CTRL_MASKBIT);
300c9888d95SJean-Philippe Brucker 
301c9888d95SJean-Philippe Brucker 	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
302c9888d95SJean-Philippe Brucker 		/* Not much we can do here. */
303c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);
304c9888d95SJean-Philippe Brucker 
305c9888d95SJean-Philippe Brucker 	/* Update the physical capability if necessary */
3068dd28afeSJean-Philippe Brucker 	if (vfio_pci_enable_msis(kvm, vdev, true))
307c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot enable MSIX");
308c9888d95SJean-Philippe Brucker 
309c9888d95SJean-Philippe Brucker out_unlock:
310c9888d95SJean-Philippe Brucker 	mutex_unlock(&pdev->msix.mutex);
311c9888d95SJean-Philippe Brucker }
312c9888d95SJean-Philippe Brucker 
313c9888d95SJean-Philippe Brucker static void vfio_pci_msix_cap_write(struct kvm *kvm,
314c9888d95SJean-Philippe Brucker 				    struct vfio_device *vdev, u8 off,
315c9888d95SJean-Philippe Brucker 				    void *data, int sz)
316c9888d95SJean-Philippe Brucker {
317c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
318c9888d95SJean-Philippe Brucker 	off_t enable_pos = PCI_MSIX_FLAGS + 1;
319c9888d95SJean-Philippe Brucker 	bool enable;
320c9888d95SJean-Philippe Brucker 	u16 flags;
321c9888d95SJean-Philippe Brucker 
322c9888d95SJean-Philippe Brucker 	off -= pdev->msix.pos;
323c9888d95SJean-Philippe Brucker 
324c9888d95SJean-Philippe Brucker 	/* Check if access intersects with the MSI-X Enable bit */
325c9888d95SJean-Philippe Brucker 	if (off > enable_pos || off + sz <= enable_pos)
326c9888d95SJean-Philippe Brucker 		return;
327c9888d95SJean-Philippe Brucker 
328c9888d95SJean-Philippe Brucker 	/* Read byte that contains the Enable bit */
329c9888d95SJean-Philippe Brucker 	flags = *(u8 *)(data + enable_pos - off) << 8;
330c9888d95SJean-Philippe Brucker 
331c9888d95SJean-Philippe Brucker 	mutex_lock(&pdev->msix.mutex);
332c9888d95SJean-Philippe Brucker 
333c9888d95SJean-Philippe Brucker 	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
334c9888d95SJean-Philippe Brucker 	enable = flags & PCI_MSIX_FLAGS_ENABLE;
335c9888d95SJean-Philippe Brucker 	msi_set_enabled(pdev->msix.virt_state, enable);
336c9888d95SJean-Philippe Brucker 
3378dd28afeSJean-Philippe Brucker 	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
338c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot enable MSIX");
3398dd28afeSJean-Philippe Brucker 	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
340c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot disable MSIX");
341c9888d95SJean-Philippe Brucker 
342c9888d95SJean-Philippe Brucker 	mutex_unlock(&pdev->msix.mutex);
343c9888d95SJean-Philippe Brucker }
344c9888d95SJean-Philippe Brucker 
3458dd28afeSJean-Philippe Brucker static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
3468dd28afeSJean-Philippe Brucker 				     u8 off, u8 *data, u32 sz)
3478dd28afeSJean-Philippe Brucker {
3488dd28afeSJean-Philippe Brucker 	size_t i;
3498dd28afeSJean-Philippe Brucker 	u32 mask = 0;
3508dd28afeSJean-Philippe Brucker 	size_t mask_pos, start, limit;
3518dd28afeSJean-Philippe Brucker 	struct vfio_pci_msi_entry *entry;
3528dd28afeSJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
3538dd28afeSJean-Philippe Brucker 	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
3548dd28afeSJean-Philippe Brucker 
3558dd28afeSJean-Philippe Brucker 	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
3568dd28afeSJean-Philippe Brucker 		return 0;
3578dd28afeSJean-Philippe Brucker 
3588dd28afeSJean-Philippe Brucker 	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
3598dd28afeSJean-Philippe Brucker 		mask_pos = PCI_MSI_MASK_64;
3608dd28afeSJean-Philippe Brucker 	else
3618dd28afeSJean-Philippe Brucker 		mask_pos = PCI_MSI_MASK_32;
3628dd28afeSJean-Philippe Brucker 
3638dd28afeSJean-Philippe Brucker 	if (off >= mask_pos + 4 || off + sz <= mask_pos)
3648dd28afeSJean-Philippe Brucker 		return 0;
3658dd28afeSJean-Philippe Brucker 
3668dd28afeSJean-Philippe Brucker 	/* Set mask to current state */
3678dd28afeSJean-Philippe Brucker 	for (i = 0; i < pdev->msi.nr_entries; i++) {
3688dd28afeSJean-Philippe Brucker 		entry = &pdev->msi.entries[i];
3698dd28afeSJean-Philippe Brucker 		mask |= !!msi_is_masked(entry->virt_state) << i;
3708dd28afeSJean-Philippe Brucker 	}
3718dd28afeSJean-Philippe Brucker 
3728dd28afeSJean-Philippe Brucker 	/* Update mask following the intersection of access and register */
3738dd28afeSJean-Philippe Brucker 	start = max_t(size_t, off, mask_pos);
3748dd28afeSJean-Philippe Brucker 	limit = min_t(size_t, off + sz, mask_pos + 4);
3758dd28afeSJean-Philippe Brucker 
3768dd28afeSJean-Philippe Brucker 	memcpy((void *)&mask + start - mask_pos, data + start - off,
3778dd28afeSJean-Philippe Brucker 	       limit - start);
3788dd28afeSJean-Philippe Brucker 
3798dd28afeSJean-Philippe Brucker 	/* Update states if necessary */
3808dd28afeSJean-Philippe Brucker 	for (i = 0; i < pdev->msi.nr_entries; i++) {
3818dd28afeSJean-Philippe Brucker 		bool masked = mask & (1 << i);
3828dd28afeSJean-Philippe Brucker 
3838dd28afeSJean-Philippe Brucker 		entry = &pdev->msi.entries[i];
3848dd28afeSJean-Philippe Brucker 		if (masked != msi_is_masked(entry->virt_state)) {
3858dd28afeSJean-Philippe Brucker 			msi_set_masked(entry->virt_state, masked);
3868dd28afeSJean-Philippe Brucker 			vfio_pci_update_msi_entry(kvm, vdev, entry);
3878dd28afeSJean-Philippe Brucker 		}
3888dd28afeSJean-Philippe Brucker 	}
3898dd28afeSJean-Philippe Brucker 
3908dd28afeSJean-Philippe Brucker 	return 1;
3918dd28afeSJean-Philippe Brucker }
3928dd28afeSJean-Philippe Brucker 
3938dd28afeSJean-Philippe Brucker static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
3948dd28afeSJean-Philippe Brucker 				   u8 off, u8 *data, u32 sz)
3958dd28afeSJean-Philippe Brucker {
3968dd28afeSJean-Philippe Brucker 	u8 ctrl;
3978dd28afeSJean-Philippe Brucker 	struct msi_msg msg;
3988dd28afeSJean-Philippe Brucker 	size_t i, nr_vectors;
3998dd28afeSJean-Philippe Brucker 	struct vfio_pci_msi_entry *entry;
4008dd28afeSJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
4018dd28afeSJean-Philippe Brucker 	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
4028dd28afeSJean-Philippe Brucker 
4038dd28afeSJean-Philippe Brucker 	off -= pdev->msi.pos;
4048dd28afeSJean-Philippe Brucker 
4058dd28afeSJean-Philippe Brucker 	mutex_lock(&pdev->msi.mutex);
4068dd28afeSJean-Philippe Brucker 
4078dd28afeSJean-Philippe Brucker 	/* Check if the guest is trying to update mask bits */
4088dd28afeSJean-Philippe Brucker 	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
4098dd28afeSJean-Philippe Brucker 		goto out_unlock;
4108dd28afeSJean-Philippe Brucker 
4118dd28afeSJean-Philippe Brucker 	/* Only modify routes when guest pokes the enable bit */
4128dd28afeSJean-Philippe Brucker 	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
4138dd28afeSJean-Philippe Brucker 		goto out_unlock;
4148dd28afeSJean-Philippe Brucker 
4158dd28afeSJean-Philippe Brucker 	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);
4168dd28afeSJean-Philippe Brucker 
4178dd28afeSJean-Philippe Brucker 	msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE);
4188dd28afeSJean-Philippe Brucker 
4198dd28afeSJean-Philippe Brucker 	if (!msi_is_enabled(pdev->msi.virt_state)) {
4208dd28afeSJean-Philippe Brucker 		vfio_pci_disable_msis(kvm, vdev, false);
4218dd28afeSJean-Philippe Brucker 		goto out_unlock;
4228dd28afeSJean-Philippe Brucker 	}
4238dd28afeSJean-Philippe Brucker 
4248dd28afeSJean-Philippe Brucker 	/* Create routes for the requested vectors */
4258dd28afeSJean-Philippe Brucker 	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);
4268dd28afeSJean-Philippe Brucker 
4278dd28afeSJean-Philippe Brucker 	msg.address_lo = msi_cap_64->address_lo;
4288dd28afeSJean-Philippe Brucker 	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
4298dd28afeSJean-Philippe Brucker 		msg.address_hi = msi_cap_64->address_hi;
4308dd28afeSJean-Philippe Brucker 		msg.data = msi_cap_64->data;
4318dd28afeSJean-Philippe Brucker 	} else {
4328dd28afeSJean-Philippe Brucker 		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
4338dd28afeSJean-Philippe Brucker 		msg.address_hi = 0;
4348dd28afeSJean-Philippe Brucker 		msg.data = msi_cap_32->data;
4358dd28afeSJean-Philippe Brucker 	}
4368dd28afeSJean-Philippe Brucker 
4378dd28afeSJean-Philippe Brucker 	for (i = 0; i < nr_vectors; i++) {
4388dd28afeSJean-Philippe Brucker 		entry = &pdev->msi.entries[i];
439e554aefdSLorenzo Pieralisi 
440e554aefdSLorenzo Pieralisi 		/*
441e554aefdSLorenzo Pieralisi 		 * Set the MSI data value as required by the PCI local
442e554aefdSLorenzo Pieralisi 		 * bus specifications, MSI capability, "Message Data".
443e554aefdSLorenzo Pieralisi 		 */
444e554aefdSLorenzo Pieralisi 		msg.data &= ~(nr_vectors - 1);
445e554aefdSLorenzo Pieralisi 		msg.data |= i;
446e554aefdSLorenzo Pieralisi 
4478dd28afeSJean-Philippe Brucker 		entry->config.msg = msg;
4488dd28afeSJean-Philippe Brucker 		vfio_pci_update_msi_entry(kvm, vdev, entry);
4498dd28afeSJean-Philippe Brucker 	}
4508dd28afeSJean-Philippe Brucker 
4518dd28afeSJean-Philippe Brucker 	/* Update the physical capability if necessary */
4528dd28afeSJean-Philippe Brucker 	if (vfio_pci_enable_msis(kvm, vdev, false))
4538dd28afeSJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot enable MSI");
4548dd28afeSJean-Philippe Brucker 
4558dd28afeSJean-Philippe Brucker out_unlock:
4568dd28afeSJean-Philippe Brucker 	mutex_unlock(&pdev->msi.mutex);
4578dd28afeSJean-Philippe Brucker }
4588dd28afeSJean-Philippe Brucker 
4596078a454SJean-Philippe Brucker static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
4606078a454SJean-Philippe Brucker 			      u8 offset, void *data, int sz)
4616078a454SJean-Philippe Brucker {
4626078a454SJean-Philippe Brucker 	struct vfio_region_info *info;
4636078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev;
4646078a454SJean-Philippe Brucker 	struct vfio_device *vdev;
4656078a454SJean-Philippe Brucker 	char base[sz];
4666078a454SJean-Philippe Brucker 
4676078a454SJean-Philippe Brucker 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
4686078a454SJean-Philippe Brucker 	vdev = container_of(pdev, struct vfio_device, pci);
4696078a454SJean-Philippe Brucker 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
4706078a454SJean-Philippe Brucker 
4716078a454SJean-Philippe Brucker 	/* Dummy read in case of side-effects */
4726078a454SJean-Philippe Brucker 	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
4736078a454SJean-Philippe Brucker 		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
4746078a454SJean-Philippe Brucker 			      sz, offset);
4756078a454SJean-Philippe Brucker }
4766078a454SJean-Philippe Brucker 
4776078a454SJean-Philippe Brucker static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
4786078a454SJean-Philippe Brucker 			       u8 offset, void *data, int sz)
4796078a454SJean-Philippe Brucker {
4806078a454SJean-Philippe Brucker 	struct vfio_region_info *info;
4816078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev;
4826078a454SJean-Philippe Brucker 	struct vfio_device *vdev;
483*e1d0285cSAlexandru Elisei 	u32 tmp;
484*e1d0285cSAlexandru Elisei 
485*e1d0285cSAlexandru Elisei 	/* Make sure a larger size will not overrun tmp on the stack. */
486*e1d0285cSAlexandru Elisei 	assert(sz <= 4);
4876078a454SJean-Philippe Brucker 
4885b7fef16SAlexandru Elisei 	if (offset == PCI_ROM_ADDRESS)
4895b7fef16SAlexandru Elisei 		return;
4905b7fef16SAlexandru Elisei 
4916078a454SJean-Philippe Brucker 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
4926078a454SJean-Philippe Brucker 	vdev = container_of(pdev, struct vfio_device, pci);
4936078a454SJean-Philippe Brucker 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
4946078a454SJean-Philippe Brucker 
4956078a454SJean-Philippe Brucker 	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
4966078a454SJean-Philippe Brucker 		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
4976078a454SJean-Philippe Brucker 			      sz, offset);
4986078a454SJean-Philippe Brucker 
499c9888d95SJean-Philippe Brucker 	/* Handle MSI write now, since it might update the hardware capability */
500c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
501c9888d95SJean-Philippe Brucker 		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);
502c9888d95SJean-Philippe Brucker 
5038dd28afeSJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
5048dd28afeSJean-Philippe Brucker 		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);
5058dd28afeSJean-Philippe Brucker 
506*e1d0285cSAlexandru Elisei 	if (pread(vdev->fd, &tmp, sz, info->offset + offset) != sz)
5076078a454SJean-Philippe Brucker 		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
5086078a454SJean-Philippe Brucker 			      sz, offset);
5096078a454SJean-Philippe Brucker }
5106078a454SJean-Philippe Brucker 
5118dd28afeSJean-Philippe Brucker static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
5128dd28afeSJean-Philippe Brucker {
5138dd28afeSJean-Philippe Brucker 	size_t size = 10;
5148dd28afeSJean-Philippe Brucker 
5158dd28afeSJean-Philippe Brucker 	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
5168dd28afeSJean-Philippe Brucker 		size += 4;
5178dd28afeSJean-Philippe Brucker 	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
5188dd28afeSJean-Philippe Brucker 		size += 10;
5198dd28afeSJean-Philippe Brucker 
5208dd28afeSJean-Philippe Brucker 	return size;
5218dd28afeSJean-Philippe Brucker }
5228dd28afeSJean-Philippe Brucker 
523c9888d95SJean-Philippe Brucker static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
524c9888d95SJean-Philippe Brucker {
525c9888d95SJean-Philippe Brucker 	switch (cap_hdr->type) {
526c9888d95SJean-Philippe Brucker 	case PCI_CAP_ID_MSIX:
527c9888d95SJean-Philippe Brucker 		return PCI_CAP_MSIX_SIZEOF;
5288dd28afeSJean-Philippe Brucker 	case PCI_CAP_ID_MSI:
5298dd28afeSJean-Philippe Brucker 		return vfio_pci_msi_cap_size((void *)cap_hdr);
530c9888d95SJean-Philippe Brucker 	default:
531c9888d95SJean-Philippe Brucker 		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
532c9888d95SJean-Philippe Brucker 		return 0;
533c9888d95SJean-Philippe Brucker 	}
534c9888d95SJean-Philippe Brucker }
535c9888d95SJean-Philippe Brucker 
536c9888d95SJean-Philippe Brucker static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
537c9888d95SJean-Philippe Brucker 			    struct pci_cap_hdr *cap, off_t pos)
538c9888d95SJean-Philippe Brucker {
539c9888d95SJean-Philippe Brucker 	struct pci_cap_hdr *last;
540c9888d95SJean-Philippe Brucker 	struct pci_device_header *hdr = &vdev->pci.hdr;
541c9888d95SJean-Philippe Brucker 
542c9888d95SJean-Philippe Brucker 	cap->next = 0;
543c9888d95SJean-Philippe Brucker 
544c9888d95SJean-Philippe Brucker 	if (!hdr->capabilities) {
545c9888d95SJean-Philippe Brucker 		hdr->capabilities = pos;
546c9888d95SJean-Philippe Brucker 		hdr->status |= PCI_STATUS_CAP_LIST;
547c9888d95SJean-Philippe Brucker 	} else {
548c9888d95SJean-Philippe Brucker 		last = PCI_CAP(virt_hdr, hdr->capabilities);
549c9888d95SJean-Philippe Brucker 
550c9888d95SJean-Philippe Brucker 		while (last->next)
551c9888d95SJean-Philippe Brucker 			last = PCI_CAP(virt_hdr, last->next);
552c9888d95SJean-Philippe Brucker 
553c9888d95SJean-Philippe Brucker 		last->next = pos;
554c9888d95SJean-Philippe Brucker 	}
555c9888d95SJean-Philippe Brucker 
556c9888d95SJean-Philippe Brucker 	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));
557c9888d95SJean-Philippe Brucker 
558c9888d95SJean-Philippe Brucker 	return 0;
559c9888d95SJean-Philippe Brucker }
560c9888d95SJean-Philippe Brucker 
5616078a454SJean-Philippe Brucker static int vfio_pci_parse_caps(struct vfio_device *vdev)
5626078a454SJean-Philippe Brucker {
563c9888d95SJean-Philippe Brucker 	int ret;
564c9888d95SJean-Philippe Brucker 	size_t size;
565c9888d95SJean-Philippe Brucker 	u8 pos, next;
566c9888d95SJean-Philippe Brucker 	struct pci_cap_hdr *cap;
567c9888d95SJean-Philippe Brucker 	u8 virt_hdr[PCI_DEV_CFG_SIZE];
5686078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
5696078a454SJean-Philippe Brucker 
5706078a454SJean-Philippe Brucker 	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
5716078a454SJean-Philippe Brucker 		return 0;
5726078a454SJean-Philippe Brucker 
573c9888d95SJean-Philippe Brucker 	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);
574c9888d95SJean-Philippe Brucker 
575c9888d95SJean-Philippe Brucker 	pos = pdev->hdr.capabilities & ~3;
576c9888d95SJean-Philippe Brucker 
5776078a454SJean-Philippe Brucker 	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
5786078a454SJean-Philippe Brucker 	pdev->hdr.capabilities = 0;
5796078a454SJean-Philippe Brucker 
580c9888d95SJean-Philippe Brucker 	for (; pos; pos = next) {
581c9888d95SJean-Philippe Brucker 		cap = PCI_CAP(&pdev->hdr, pos);
582c9888d95SJean-Philippe Brucker 		next = cap->next;
583c9888d95SJean-Philippe Brucker 
584c9888d95SJean-Philippe Brucker 		switch (cap->type) {
585c9888d95SJean-Philippe Brucker 		case PCI_CAP_ID_MSIX:
586c9888d95SJean-Philippe Brucker 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
587c9888d95SJean-Philippe Brucker 			if (ret)
588c9888d95SJean-Philippe Brucker 				return ret;
589c9888d95SJean-Philippe Brucker 
590c9888d95SJean-Philippe Brucker 			pdev->msix.pos = pos;
591c9888d95SJean-Philippe Brucker 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
592c9888d95SJean-Philippe Brucker 			break;
5938dd28afeSJean-Philippe Brucker 		case PCI_CAP_ID_MSI:
5948dd28afeSJean-Philippe Brucker 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
5958dd28afeSJean-Philippe Brucker 			if (ret)
5968dd28afeSJean-Philippe Brucker 				return ret;
5978dd28afeSJean-Philippe Brucker 
5988dd28afeSJean-Philippe Brucker 			pdev->msi.pos = pos;
5998dd28afeSJean-Philippe Brucker 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
6008dd28afeSJean-Philippe Brucker 			break;
601c9888d95SJean-Philippe Brucker 		}
602c9888d95SJean-Philippe Brucker 	}
603c9888d95SJean-Philippe Brucker 
604c9888d95SJean-Philippe Brucker 	/* Wipe remaining capabilities */
605c9888d95SJean-Philippe Brucker 	pos = PCI_STD_HEADER_SIZEOF;
606c9888d95SJean-Philippe Brucker 	size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF;
607c9888d95SJean-Philippe Brucker 	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
6086078a454SJean-Philippe Brucker 
6096078a454SJean-Philippe Brucker 	return 0;
6106078a454SJean-Philippe Brucker }
6116078a454SJean-Philippe Brucker 
6126078a454SJean-Philippe Brucker static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
6136078a454SJean-Philippe Brucker {
614c9888d95SJean-Philippe Brucker 	ssize_t sz = PCI_DEV_CFG_SIZE;
6156078a454SJean-Philippe Brucker 	struct vfio_region_info *info;
6166078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
6176078a454SJean-Philippe Brucker 
6186078a454SJean-Philippe Brucker 	if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
6196078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "Config Space not found");
6206078a454SJean-Philippe Brucker 		return -ENODEV;
6216078a454SJean-Philippe Brucker 	}
6226078a454SJean-Philippe Brucker 
6236078a454SJean-Philippe Brucker 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
6246078a454SJean-Philippe Brucker 	*info = (struct vfio_region_info) {
6256078a454SJean-Philippe Brucker 			.argsz = sizeof(*info),
6266078a454SJean-Philippe Brucker 			.index = VFIO_PCI_CONFIG_REGION_INDEX,
6276078a454SJean-Philippe Brucker 	};
6286078a454SJean-Philippe Brucker 
6296078a454SJean-Philippe Brucker 	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
6306078a454SJean-Philippe Brucker 	if (!info->size) {
6316078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "Config Space has size zero?!");
6326078a454SJean-Philippe Brucker 		return -EINVAL;
6336078a454SJean-Philippe Brucker 	}
6346078a454SJean-Philippe Brucker 
635c9888d95SJean-Philippe Brucker 	/* Read standard headers and capabilities */
6366078a454SJean-Philippe Brucker 	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
6376078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
6386078a454SJean-Philippe Brucker 		return -EIO;
6396078a454SJean-Philippe Brucker 	}
6406078a454SJean-Philippe Brucker 
6416078a454SJean-Philippe Brucker 	/* Strip bit 7, that indicates multifunction */
6426078a454SJean-Philippe Brucker 	pdev->hdr.header_type &= 0x7f;
6436078a454SJean-Philippe Brucker 
6446078a454SJean-Philippe Brucker 	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
6456078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "unsupported header type %u",
6466078a454SJean-Philippe Brucker 			     pdev->hdr.header_type);
6476078a454SJean-Philippe Brucker 		return -EOPNOTSUPP;
6486078a454SJean-Philippe Brucker 	}
6496078a454SJean-Philippe Brucker 
650c9888d95SJean-Philippe Brucker 	if (pdev->hdr.irq_pin)
651c9888d95SJean-Philippe Brucker 		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;
652c9888d95SJean-Philippe Brucker 
6536078a454SJean-Philippe Brucker 	vfio_pci_parse_caps(vdev);
6546078a454SJean-Philippe Brucker 
6556078a454SJean-Philippe Brucker 	return 0;
6566078a454SJean-Philippe Brucker }
6576078a454SJean-Philippe Brucker 
6586078a454SJean-Philippe Brucker static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
6596078a454SJean-Philippe Brucker {
6606078a454SJean-Philippe Brucker 	int i;
6613665392aSAlexandru Elisei 	u64 base;
6626078a454SJean-Philippe Brucker 	ssize_t hdr_sz;
663c9888d95SJean-Philippe Brucker 	struct msix_cap *msix;
6646078a454SJean-Philippe Brucker 	struct vfio_region_info *info;
6656078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
6663665392aSAlexandru Elisei 	struct vfio_region *region;
6676078a454SJean-Philippe Brucker 
6686078a454SJean-Philippe Brucker 	/* Initialise the BARs */
6696078a454SJean-Philippe Brucker 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
6703665392aSAlexandru Elisei 		if ((u32)i == vdev->info.num_regions)
6713665392aSAlexandru Elisei 			break;
67282caa882SJean-Philippe Brucker 
6733665392aSAlexandru Elisei 		region = &vdev->regions[i];
67482caa882SJean-Philippe Brucker 		/* Construct a fake reg to match what we've mapped. */
67582caa882SJean-Philippe Brucker 		if (region->is_ioport) {
67682caa882SJean-Philippe Brucker 			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
67782caa882SJean-Philippe Brucker 				PCI_BASE_ADDRESS_SPACE_IO;
67882caa882SJean-Philippe Brucker 		} else {
67982caa882SJean-Philippe Brucker 			base = (region->guest_phys_addr &
68082caa882SJean-Philippe Brucker 				PCI_BASE_ADDRESS_MEM_MASK) |
68182caa882SJean-Philippe Brucker 				PCI_BASE_ADDRESS_SPACE_MEMORY;
68282caa882SJean-Philippe Brucker 		}
68382caa882SJean-Philippe Brucker 
68482caa882SJean-Philippe Brucker 		pdev->hdr.bar[i] = base;
6856078a454SJean-Philippe Brucker 
6866078a454SJean-Philippe Brucker 		if (!base)
6876078a454SJean-Philippe Brucker 			continue;
6886078a454SJean-Philippe Brucker 
6896078a454SJean-Philippe Brucker 		pdev->hdr.bar_size[i] = region->info.size;
6906078a454SJean-Philippe Brucker 	}
6916078a454SJean-Philippe Brucker 
6926078a454SJean-Philippe Brucker 	/* I really can't be bothered to support cardbus. */
6936078a454SJean-Philippe Brucker 	pdev->hdr.card_bus = 0;
6946078a454SJean-Philippe Brucker 
6956078a454SJean-Philippe Brucker 	/*
6966078a454SJean-Philippe Brucker 	 * Nuke the expansion ROM for now. If we want to do this properly,
6976078a454SJean-Philippe Brucker 	 * we need to save its size somewhere and map into the guest.
6986078a454SJean-Philippe Brucker 	 */
6996078a454SJean-Philippe Brucker 	pdev->hdr.exp_rom_bar = 0;
7006078a454SJean-Philippe Brucker 
701c9888d95SJean-Philippe Brucker 	/* Plumb in our fake MSI-X capability, if we have it. */
702c9888d95SJean-Philippe Brucker 	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
703c9888d95SJean-Philippe Brucker 	if (msix) {
704c9888d95SJean-Philippe Brucker 		/* Add a shortcut to the PBA region for the MMIO handler */
705c9888d95SJean-Philippe Brucker 		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
706c9888d95SJean-Philippe Brucker 		pdev->msix_pba.offset = vdev->regions[pba_index].info.offset +
707c9888d95SJean-Philippe Brucker 					(msix->pba_offset & PCI_MSIX_PBA_OFFSET);
708c9888d95SJean-Philippe Brucker 
709c9888d95SJean-Philippe Brucker 		/* Tidy up the capability */
710c9888d95SJean-Philippe Brucker 		msix->table_offset &= PCI_MSIX_TABLE_BIR;
711c9888d95SJean-Philippe Brucker 		msix->pba_offset &= PCI_MSIX_PBA_BIR;
712c9888d95SJean-Philippe Brucker 		if (pdev->msix_table.bar == pdev->msix_pba.bar)
713c9888d95SJean-Philippe Brucker 			msix->pba_offset |= pdev->msix_table.size &
714c9888d95SJean-Philippe Brucker 					    PCI_MSIX_PBA_OFFSET;
715c9888d95SJean-Philippe Brucker 	}
716c9888d95SJean-Philippe Brucker 
7176078a454SJean-Philippe Brucker 	/* Install our fake Configuration Space */
7186078a454SJean-Philippe Brucker 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
7196078a454SJean-Philippe Brucker 	hdr_sz = PCI_DEV_CFG_SIZE;
7206078a454SJean-Philippe Brucker 	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
7216078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
7226078a454SJean-Philippe Brucker 			     hdr_sz);
7236078a454SJean-Philippe Brucker 		return -EIO;
7246078a454SJean-Philippe Brucker 	}
7256078a454SJean-Philippe Brucker 
7266078a454SJean-Philippe Brucker 	/* Register callbacks for cfg accesses */
7276078a454SJean-Philippe Brucker 	pdev->hdr.cfg_ops = (struct pci_config_operations) {
7286078a454SJean-Philippe Brucker 		.read	= vfio_pci_cfg_read,
7296078a454SJean-Philippe Brucker 		.write	= vfio_pci_cfg_write,
7306078a454SJean-Philippe Brucker 	};
7316078a454SJean-Philippe Brucker 
7326078a454SJean-Philippe Brucker 	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;
7336078a454SJean-Philippe Brucker 
7346078a454SJean-Philippe Brucker 	return 0;
7356078a454SJean-Philippe Brucker }
7366078a454SJean-Philippe Brucker 
737ed01a603SAlexandru Elisei static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index,
738ed01a603SAlexandru Elisei 				    struct vfio_region_info *info)
739ed01a603SAlexandru Elisei {
740ed01a603SAlexandru Elisei 	int ret;
741ed01a603SAlexandru Elisei 
742ed01a603SAlexandru Elisei 	*info = (struct vfio_region_info) {
743ed01a603SAlexandru Elisei 		.argsz = sizeof(*info),
744ed01a603SAlexandru Elisei 		.index = index,
745ed01a603SAlexandru Elisei 	};
746ed01a603SAlexandru Elisei 
747ed01a603SAlexandru Elisei 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
748ed01a603SAlexandru Elisei 	if (ret) {
749ed01a603SAlexandru Elisei 		ret = -errno;
750ed01a603SAlexandru Elisei 		vfio_dev_err(vdev, "cannot get info for BAR %u", index);
751ed01a603SAlexandru Elisei 		return ret;
752ed01a603SAlexandru Elisei 	}
753ed01a603SAlexandru Elisei 
754ed01a603SAlexandru Elisei 	if (info->size && !is_power_of_two(info->size)) {
755ed01a603SAlexandru Elisei 		vfio_dev_err(vdev, "region is not power of two: 0x%llx",
756ed01a603SAlexandru Elisei 				info->size);
757ed01a603SAlexandru Elisei 		return -EINVAL;
758ed01a603SAlexandru Elisei 	}
759ed01a603SAlexandru Elisei 
760ed01a603SAlexandru Elisei 	return 0;
761ed01a603SAlexandru Elisei }
762ed01a603SAlexandru Elisei 
763ed01a603SAlexandru Elisei static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
764c9888d95SJean-Philippe Brucker {
765c9888d95SJean-Philippe Brucker 	int ret;
766c9888d95SJean-Philippe Brucker 	size_t i;
767ed01a603SAlexandru Elisei 	size_t map_size;
768c9888d95SJean-Philippe Brucker 	size_t nr_entries;
769c9888d95SJean-Philippe Brucker 	struct vfio_pci_msi_entry *entries;
770ed01a603SAlexandru Elisei 	struct vfio_pci_device *pdev = &vdev->pci;
771c9888d95SJean-Philippe Brucker 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
772c9888d95SJean-Philippe Brucker 	struct vfio_pci_msix_table *table = &pdev->msix_table;
773c9888d95SJean-Philippe Brucker 	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
774ed01a603SAlexandru Elisei 	struct vfio_region_info info;
775c9888d95SJean-Philippe Brucker 
776c9888d95SJean-Philippe Brucker 	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
777c9888d95SJean-Philippe Brucker 	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;
778c9888d95SJean-Philippe Brucker 
779c9888d95SJean-Philippe Brucker 	/*
780c9888d95SJean-Philippe Brucker 	 * KVM needs memory regions to be multiple of and aligned on PAGE_SIZE.
781c9888d95SJean-Philippe Brucker 	 */
782c9888d95SJean-Philippe Brucker 	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
783c9888d95SJean-Philippe Brucker 	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
784c9888d95SJean-Philippe Brucker 	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), PAGE_SIZE);
785c9888d95SJean-Philippe Brucker 
786c9888d95SJean-Philippe Brucker 	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
787c9888d95SJean-Philippe Brucker 	if (!entries)
788c9888d95SJean-Philippe Brucker 		return -ENOMEM;
789c9888d95SJean-Philippe Brucker 
790c9888d95SJean-Philippe Brucker 	for (i = 0; i < nr_entries; i++)
791c9888d95SJean-Philippe Brucker 		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
792c9888d95SJean-Philippe Brucker 
793ed01a603SAlexandru Elisei 	ret = vfio_pci_get_region_info(vdev, table->bar, &info);
794ed01a603SAlexandru Elisei 	if (ret)
795ed01a603SAlexandru Elisei 		return ret;
796ed01a603SAlexandru Elisei 	if (!info.size)
797ed01a603SAlexandru Elisei 		return -EINVAL;
798ed01a603SAlexandru Elisei 	map_size = info.size;
799ed01a603SAlexandru Elisei 
800ed01a603SAlexandru Elisei 	if (table->bar != pba->bar) {
801ed01a603SAlexandru Elisei 		ret = vfio_pci_get_region_info(vdev, pba->bar, &info);
802ed01a603SAlexandru Elisei 		if (ret)
803ed01a603SAlexandru Elisei 			return ret;
804ed01a603SAlexandru Elisei 		if (!info.size)
805ed01a603SAlexandru Elisei 			return -EINVAL;
806ed01a603SAlexandru Elisei 		map_size += info.size;
807ed01a603SAlexandru Elisei 	}
808ed01a603SAlexandru Elisei 
809c9888d95SJean-Philippe Brucker 	/*
810c9888d95SJean-Philippe Brucker 	 * To ease MSI-X cap configuration in case they share the same BAR,
811c9888d95SJean-Philippe Brucker 	 * collapse table and pending array. The size of the BAR regions must be
812c9888d95SJean-Philippe Brucker 	 * powers of two.
813c9888d95SJean-Philippe Brucker 	 */
814ed01a603SAlexandru Elisei 	map_size = ALIGN(map_size, PAGE_SIZE);
815ed01a603SAlexandru Elisei 	table->guest_phys_addr = pci_get_mmio_block(map_size);
816c9888d95SJean-Philippe Brucker 	if (!table->guest_phys_addr) {
817ed01a603SAlexandru Elisei 		pr_err("cannot allocate MMIO space");
818c9888d95SJean-Philippe Brucker 		ret = -ENOMEM;
819c9888d95SJean-Philippe Brucker 		goto out_free;
820c9888d95SJean-Philippe Brucker 	}
821c9888d95SJean-Philippe Brucker 	pba->guest_phys_addr = table->guest_phys_addr + table->size;
822c9888d95SJean-Philippe Brucker 
823c9888d95SJean-Philippe Brucker 	ret = kvm__register_mmio(kvm, table->guest_phys_addr, table->size,
824c9888d95SJean-Philippe Brucker 				 false, vfio_pci_msix_table_access, pdev);
825c9888d95SJean-Philippe Brucker 	if (ret < 0)
826c9888d95SJean-Philippe Brucker 		goto out_free;
827c9888d95SJean-Philippe Brucker 
828c9888d95SJean-Philippe Brucker 	/*
829c9888d95SJean-Philippe Brucker 	 * We could map the physical PBA directly into the guest, but it's
830c9888d95SJean-Philippe Brucker 	 * likely smaller than a page, and we can only hand full pages to the
831c9888d95SJean-Philippe Brucker 	 * guest. Even though the PCI spec disallows sharing a page used for
832c9888d95SJean-Philippe Brucker 	 * MSI-X with any other resource, it allows to share the same page
833c9888d95SJean-Philippe Brucker 	 * between MSI-X table and PBA. For the sake of isolation, create a
834c9888d95SJean-Philippe Brucker 	 * virtual PBA.
835c9888d95SJean-Philippe Brucker 	 */
836c9888d95SJean-Philippe Brucker 	ret = kvm__register_mmio(kvm, pba->guest_phys_addr, pba->size, false,
837c9888d95SJean-Philippe Brucker 				 vfio_pci_msix_pba_access, pdev);
838c9888d95SJean-Philippe Brucker 	if (ret < 0)
839c9888d95SJean-Philippe Brucker 		goto out_free;
840c9888d95SJean-Philippe Brucker 
841c9888d95SJean-Philippe Brucker 	pdev->msix.entries = entries;
842c9888d95SJean-Philippe Brucker 	pdev->msix.nr_entries = nr_entries;
843c9888d95SJean-Philippe Brucker 
844c9888d95SJean-Philippe Brucker 	return 0;
845c9888d95SJean-Philippe Brucker 
846c9888d95SJean-Philippe Brucker out_free:
847c9888d95SJean-Philippe Brucker 	free(entries);
848c9888d95SJean-Philippe Brucker 
849c9888d95SJean-Philippe Brucker 	return ret;
850c9888d95SJean-Philippe Brucker }
851c9888d95SJean-Philippe Brucker 
8528dd28afeSJean-Philippe Brucker static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
8538dd28afeSJean-Philippe Brucker {
8548dd28afeSJean-Philippe Brucker 	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);
8558dd28afeSJean-Philippe Brucker 
8568dd28afeSJean-Philippe Brucker 	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1),
8578dd28afeSJean-Philippe Brucker 	pdev->msi.entries = calloc(pdev->msi.nr_entries,
8588dd28afeSJean-Philippe Brucker 				   sizeof(struct vfio_pci_msi_entry));
8598dd28afeSJean-Philippe Brucker 	if (!pdev->msi.entries)
8608dd28afeSJean-Philippe Brucker 		return -ENOMEM;
8618dd28afeSJean-Philippe Brucker 
8628dd28afeSJean-Philippe Brucker 	return 0;
8638dd28afeSJean-Philippe Brucker }
8648dd28afeSJean-Philippe Brucker 
8656078a454SJean-Philippe Brucker static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
8666078a454SJean-Philippe Brucker 				  size_t nr)
8676078a454SJean-Philippe Brucker {
8686078a454SJean-Philippe Brucker 	int ret;
86982caa882SJean-Philippe Brucker 	u32 bar;
8706078a454SJean-Philippe Brucker 	size_t map_size;
871c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
8723665392aSAlexandru Elisei 	struct vfio_region *region;
8736078a454SJean-Philippe Brucker 
8746078a454SJean-Philippe Brucker 	if (nr >= vdev->info.num_regions)
8756078a454SJean-Philippe Brucker 		return 0;
8766078a454SJean-Philippe Brucker 
8773665392aSAlexandru Elisei 	region = &vdev->regions[nr];
87882caa882SJean-Philippe Brucker 	bar = pdev->hdr.bar[nr];
87982caa882SJean-Philippe Brucker 
88082caa882SJean-Philippe Brucker 	region->vdev = vdev;
88182caa882SJean-Philippe Brucker 	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);
8826078a454SJean-Philippe Brucker 
883ed01a603SAlexandru Elisei 	ret = vfio_pci_get_region_info(vdev, nr, &region->info);
884ed01a603SAlexandru Elisei 	if (ret)
8856078a454SJean-Philippe Brucker 		return ret;
8866078a454SJean-Philippe Brucker 
8876078a454SJean-Philippe Brucker 	/* Ignore invalid or unimplemented regions */
8886078a454SJean-Philippe Brucker 	if (!region->info.size)
8896078a454SJean-Philippe Brucker 		return 0;
8906078a454SJean-Philippe Brucker 
891c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
892c9888d95SJean-Philippe Brucker 		/* Trap and emulate MSI-X table */
893c9888d95SJean-Philippe Brucker 		if (nr == pdev->msix_table.bar) {
894c9888d95SJean-Philippe Brucker 			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
895c9888d95SJean-Philippe Brucker 			return 0;
896c9888d95SJean-Philippe Brucker 		} else if (nr == pdev->msix_pba.bar) {
897c9888d95SJean-Philippe Brucker 			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
898c9888d95SJean-Philippe Brucker 			return 0;
899c9888d95SJean-Philippe Brucker 		}
900c9888d95SJean-Philippe Brucker 	}
901c9888d95SJean-Philippe Brucker 
902a05e576fSAlexandru Elisei 	if (region->is_ioport) {
903a05e576fSAlexandru Elisei 		region->port_base = pci_get_io_port_block(region->info.size);
904a05e576fSAlexandru Elisei 	} else {
9056078a454SJean-Philippe Brucker 		/* Grab some MMIO space in the guest */
9066078a454SJean-Philippe Brucker 		map_size = ALIGN(region->info.size, PAGE_SIZE);
907854aa2efSJulien Thierry 		region->guest_phys_addr = pci_get_mmio_block(map_size);
90882caa882SJean-Philippe Brucker 	}
9096078a454SJean-Philippe Brucker 
91082caa882SJean-Philippe Brucker 	/* Map the BARs into the guest or setup a trap region. */
9116078a454SJean-Philippe Brucker 	ret = vfio_map_region(kvm, vdev, region);
9126078a454SJean-Philippe Brucker 	if (ret)
9136078a454SJean-Philippe Brucker 		return ret;
9146078a454SJean-Philippe Brucker 
9156078a454SJean-Philippe Brucker 	return 0;
9166078a454SJean-Philippe Brucker }
9176078a454SJean-Philippe Brucker 
9186078a454SJean-Philippe Brucker static int vfio_pci_configure_dev_regions(struct kvm *kvm,
9196078a454SJean-Philippe Brucker 					  struct vfio_device *vdev)
9206078a454SJean-Philippe Brucker {
9216078a454SJean-Philippe Brucker 	int ret;
9226078a454SJean-Philippe Brucker 	u32 bar;
9236078a454SJean-Philippe Brucker 	size_t i;
9246078a454SJean-Philippe Brucker 	bool is_64bit = false;
9256078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
9266078a454SJean-Philippe Brucker 
9276078a454SJean-Philippe Brucker 	ret = vfio_pci_parse_cfg_space(vdev);
9286078a454SJean-Philippe Brucker 	if (ret)
9296078a454SJean-Philippe Brucker 		return ret;
9306078a454SJean-Philippe Brucker 
931c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
932ed01a603SAlexandru Elisei 		ret = vfio_pci_create_msix_table(kvm, vdev);
933c9888d95SJean-Philippe Brucker 		if (ret)
934c9888d95SJean-Philippe Brucker 			return ret;
935c9888d95SJean-Philippe Brucker 	}
936c9888d95SJean-Philippe Brucker 
9378dd28afeSJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
9388dd28afeSJean-Philippe Brucker 		ret = vfio_pci_create_msi_cap(kvm, pdev);
9398dd28afeSJean-Philippe Brucker 		if (ret)
9408dd28afeSJean-Philippe Brucker 			return ret;
9418dd28afeSJean-Philippe Brucker 	}
9428dd28afeSJean-Philippe Brucker 
9436078a454SJean-Philippe Brucker 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
9446078a454SJean-Philippe Brucker 		/* Ignore top half of 64-bit BAR */
94584998f21SAlexandru Elisei 		if (is_64bit) {
94684998f21SAlexandru Elisei 			is_64bit = false;
9476078a454SJean-Philippe Brucker 			continue;
94884998f21SAlexandru Elisei 		}
9496078a454SJean-Philippe Brucker 
9506078a454SJean-Philippe Brucker 		ret = vfio_pci_configure_bar(kvm, vdev, i);
9516078a454SJean-Philippe Brucker 		if (ret)
9526078a454SJean-Philippe Brucker 			return ret;
9536078a454SJean-Philippe Brucker 
9546078a454SJean-Philippe Brucker 		bar = pdev->hdr.bar[i];
9556078a454SJean-Philippe Brucker 		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
9566078a454SJean-Philippe Brucker 			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
9576078a454SJean-Philippe Brucker 			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
9586078a454SJean-Philippe Brucker 	}
9596078a454SJean-Philippe Brucker 
9606078a454SJean-Philippe Brucker 	/* We've configured the BARs, fake up a Configuration Space */
9616078a454SJean-Philippe Brucker 	return vfio_pci_fixup_cfg_space(vdev);
9626078a454SJean-Philippe Brucker }
9636078a454SJean-Philippe Brucker 
/*
 * Try to raise the file descriptor limit when opening one eventfd per IRQ
 * vector would exceed it, which is likely when a device uses 2048 MSIs.
 *
 * The requirement accumulates across calls: each call adds @num to a running
 * total. Failures are reported but never fatal, so this always returns 0.
 */
static int vfio_pci_reserve_irq_fds(size_t num)
{
	/*
	 * Around 27 fds were counted under normal load; 100 more are added
	 * as a safety margin.
	 */
	static size_t fds_needed = 128;
	struct rlimit current, wanted;

	fds_needed += num;

	if (getrlimit(RLIMIT_NOFILE, &current)) {
		perror("getrlimit(RLIMIT_NOFILE)");
		return 0;
	}

	/* The soft limit is already high enough */
	if (current.rlim_cur >= fds_needed)
		return 0;

	/* Keep the hard limit unless it is too low (raising it needs root) */
	wanted.rlim_max = current.rlim_max;
	if (wanted.rlim_max < fds_needed)
		wanted.rlim_max = fds_needed;
	wanted.rlim_cur = fds_needed;

	if (!setrlimit(RLIMIT_NOFILE, &wanted))
		return 0;

	perror("setrlimit(RLIMIT_NOFILE)");
	pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
		   (size_t)(fds_needed - current.rlim_cur));

	return 0;
}
1003c9888d95SJean-Philippe Brucker 
1004c9888d95SJean-Philippe Brucker static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
1005c9888d95SJean-Philippe Brucker 			     struct vfio_pci_msi_common *msis)
1006c9888d95SJean-Philippe Brucker {
1007c9888d95SJean-Philippe Brucker 	int ret;
1008c9888d95SJean-Philippe Brucker 	size_t i;
1009c9888d95SJean-Philippe Brucker 	int *eventfds;
1010c9888d95SJean-Philippe Brucker 	size_t irq_set_size;
1011c9888d95SJean-Philippe Brucker 	struct vfio_pci_msi_entry *entry;
1012c9888d95SJean-Philippe Brucker 	size_t nr_entries = msis->nr_entries;
1013c9888d95SJean-Philippe Brucker 
1014c9888d95SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
101509533d3cSAndre Przywara 	if (ret || msis->info.count == 0) {
1016c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "no MSI reported by VFIO");
1017c9888d95SJean-Philippe Brucker 		return -ENODEV;
1018c9888d95SJean-Philippe Brucker 	}
1019c9888d95SJean-Philippe Brucker 
1020c9888d95SJean-Philippe Brucker 	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
1021c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
1022c9888d95SJean-Philippe Brucker 		return -EINVAL;
1023c9888d95SJean-Philippe Brucker 	}
1024c9888d95SJean-Philippe Brucker 
1025c9888d95SJean-Philippe Brucker 	if (msis->info.count != nr_entries) {
1026c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
1027c9888d95SJean-Philippe Brucker 		return -EINVAL;
1028c9888d95SJean-Philippe Brucker 	}
1029c9888d95SJean-Philippe Brucker 
1030c9888d95SJean-Philippe Brucker 	mutex_init(&msis->mutex);
1031c9888d95SJean-Philippe Brucker 
1032c9888d95SJean-Philippe Brucker 	vfio_pci_reserve_irq_fds(nr_entries);
1033c9888d95SJean-Philippe Brucker 
1034c9888d95SJean-Philippe Brucker 	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
1035c9888d95SJean-Philippe Brucker 	msis->irq_set = malloc(irq_set_size);
1036c9888d95SJean-Philippe Brucker 	if (!msis->irq_set)
1037c9888d95SJean-Philippe Brucker 		return -ENOMEM;
1038c9888d95SJean-Philippe Brucker 
1039c9888d95SJean-Philippe Brucker 	*msis->irq_set = (struct vfio_irq_set) {
1040c9888d95SJean-Philippe Brucker 		.argsz	= irq_set_size,
1041c9888d95SJean-Philippe Brucker 		.flags 	= VFIO_IRQ_SET_DATA_EVENTFD |
1042c9888d95SJean-Philippe Brucker 			  VFIO_IRQ_SET_ACTION_TRIGGER,
1043c9888d95SJean-Philippe Brucker 		.index 	= msis->info.index,
1044c9888d95SJean-Philippe Brucker 		.start 	= 0,
1045c9888d95SJean-Philippe Brucker 		.count 	= nr_entries,
1046c9888d95SJean-Philippe Brucker 	};
1047c9888d95SJean-Philippe Brucker 
1048c9888d95SJean-Philippe Brucker 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
1049c9888d95SJean-Philippe Brucker 
1050c9888d95SJean-Philippe Brucker 	for (i = 0; i < nr_entries; i++) {
1051c9888d95SJean-Philippe Brucker 		entry = &msis->entries[i];
1052c9888d95SJean-Philippe Brucker 		entry->gsi = -1;
1053c9888d95SJean-Philippe Brucker 		entry->eventfd = -1;
1054c9888d95SJean-Philippe Brucker 		msi_set_masked(entry->virt_state, true);
1055c9888d95SJean-Philippe Brucker 		msi_set_masked(entry->phys_state, true);
1056c9888d95SJean-Philippe Brucker 		eventfds[i] = -1;
1057c9888d95SJean-Philippe Brucker 	}
1058c9888d95SJean-Philippe Brucker 
1059c9888d95SJean-Philippe Brucker 	return 0;
1060c9888d95SJean-Philippe Brucker }
1061c9888d95SJean-Philippe Brucker 
1062c9888d95SJean-Philippe Brucker static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
1063c9888d95SJean-Philippe Brucker {
1064c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
1065c9888d95SJean-Philippe Brucker 	int gsi = pdev->intx_gsi;
1066c9888d95SJean-Philippe Brucker 	struct vfio_irq_set irq_set = {
1067c9888d95SJean-Philippe Brucker 		.argsz	= sizeof(irq_set),
1068c9888d95SJean-Philippe Brucker 		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
1069c9888d95SJean-Philippe Brucker 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1070c9888d95SJean-Philippe Brucker 	};
1071c9888d95SJean-Philippe Brucker 
10727302327aSLeo Yan 	if (pdev->intx_fd == -1)
10737302327aSLeo Yan 		return;
10747302327aSLeo Yan 
1075c9888d95SJean-Philippe Brucker 	pr_debug("user requested MSI, disabling INTx %d", gsi);
1076c9888d95SJean-Philippe Brucker 
1077c9888d95SJean-Philippe Brucker 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
1078c9888d95SJean-Philippe Brucker 	irq__del_irqfd(kvm, gsi, pdev->intx_fd);
1079c9888d95SJean-Philippe Brucker 
1080c9888d95SJean-Philippe Brucker 	close(pdev->intx_fd);
1081a1ff6f87SLeo Yan 	close(pdev->unmask_fd);
10827302327aSLeo Yan 	pdev->intx_fd = -1;
1083c9888d95SJean-Philippe Brucker }
1084c9888d95SJean-Philippe Brucker 
10856078a454SJean-Philippe Brucker static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
10866078a454SJean-Philippe Brucker {
10876078a454SJean-Philippe Brucker 	int ret;
10886078a454SJean-Philippe Brucker 	int trigger_fd, unmask_fd;
1089a3704b91SAndre Przywara 	union vfio_irq_eventfd	trigger;
1090a3704b91SAndre Przywara 	union vfio_irq_eventfd	unmask;
10916078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
109212bd7a16SLeo Yan 	int gsi = pdev->intx_gsi;
10936078a454SJean-Philippe Brucker 
10947302327aSLeo Yan 	if (pdev->intx_fd != -1)
10957302327aSLeo Yan 		return 0;
10967302327aSLeo Yan 
10976078a454SJean-Philippe Brucker 	/*
10986078a454SJean-Philippe Brucker 	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
10996078a454SJean-Philippe Brucker 	 * signals an interrupt from host to guest, and unmask_fd signals the
11006078a454SJean-Philippe Brucker 	 * deassertion of the line from guest to host.
11016078a454SJean-Philippe Brucker 	 */
11026078a454SJean-Philippe Brucker 	trigger_fd = eventfd(0, 0);
11036078a454SJean-Philippe Brucker 	if (trigger_fd < 0) {
11046078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to create trigger eventfd");
11056078a454SJean-Philippe Brucker 		return trigger_fd;
11066078a454SJean-Philippe Brucker 	}
11076078a454SJean-Philippe Brucker 
11086078a454SJean-Philippe Brucker 	unmask_fd = eventfd(0, 0);
11096078a454SJean-Philippe Brucker 	if (unmask_fd < 0) {
11106078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to create unmask eventfd");
11116078a454SJean-Philippe Brucker 		close(trigger_fd);
11126078a454SJean-Philippe Brucker 		return unmask_fd;
11136078a454SJean-Philippe Brucker 	}
11146078a454SJean-Philippe Brucker 
11156078a454SJean-Philippe Brucker 	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
11166078a454SJean-Philippe Brucker 	if (ret)
11176078a454SJean-Philippe Brucker 		goto err_close;
11186078a454SJean-Philippe Brucker 
11196078a454SJean-Philippe Brucker 	trigger.irq = (struct vfio_irq_set) {
11206078a454SJean-Philippe Brucker 		.argsz	= sizeof(trigger),
11216078a454SJean-Philippe Brucker 		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
11226078a454SJean-Philippe Brucker 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
11236078a454SJean-Philippe Brucker 		.start	= 0,
11246078a454SJean-Philippe Brucker 		.count	= 1,
11256078a454SJean-Philippe Brucker 	};
1126a3704b91SAndre Przywara 	set_vfio_irq_eventd_payload(&trigger, trigger_fd);
11276078a454SJean-Philippe Brucker 
11286078a454SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
11296078a454SJean-Philippe Brucker 	if (ret < 0) {
11306078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
11316078a454SJean-Philippe Brucker 		goto err_delete_line;
11326078a454SJean-Philippe Brucker 	}
11336078a454SJean-Philippe Brucker 
11346078a454SJean-Philippe Brucker 	unmask.irq = (struct vfio_irq_set) {
11356078a454SJean-Philippe Brucker 		.argsz	= sizeof(unmask),
11366078a454SJean-Philippe Brucker 		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
11376078a454SJean-Philippe Brucker 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
11386078a454SJean-Philippe Brucker 		.start	= 0,
11396078a454SJean-Philippe Brucker 		.count	= 1,
11406078a454SJean-Philippe Brucker 	};
1141a3704b91SAndre Przywara 	set_vfio_irq_eventd_payload(&unmask, unmask_fd);
11426078a454SJean-Philippe Brucker 
11436078a454SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
11446078a454SJean-Philippe Brucker 	if (ret < 0) {
11456078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to setup unmask IRQ");
11466078a454SJean-Philippe Brucker 		goto err_remove_event;
11476078a454SJean-Philippe Brucker 	}
11486078a454SJean-Philippe Brucker 
1149c9888d95SJean-Philippe Brucker 	pdev->intx_fd = trigger_fd;
1150a1ff6f87SLeo Yan 	pdev->unmask_fd = unmask_fd;
1151c9888d95SJean-Philippe Brucker 
11526078a454SJean-Philippe Brucker 	return 0;
11536078a454SJean-Philippe Brucker 
11546078a454SJean-Philippe Brucker err_remove_event:
11556078a454SJean-Philippe Brucker 	/* Remove trigger event */
11566078a454SJean-Philippe Brucker 	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
11576078a454SJean-Philippe Brucker 	trigger.irq.count = 0;
11586078a454SJean-Philippe Brucker 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
11596078a454SJean-Philippe Brucker 
11606078a454SJean-Philippe Brucker err_delete_line:
11616078a454SJean-Philippe Brucker 	irq__del_irqfd(kvm, gsi, trigger_fd);
11626078a454SJean-Philippe Brucker 
11636078a454SJean-Philippe Brucker err_close:
11646078a454SJean-Philippe Brucker 	close(trigger_fd);
11656078a454SJean-Philippe Brucker 	close(unmask_fd);
11666078a454SJean-Philippe Brucker 	return ret;
11676078a454SJean-Philippe Brucker }
11686078a454SJean-Philippe Brucker 
116912bd7a16SLeo Yan static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
117012bd7a16SLeo Yan {
117112bd7a16SLeo Yan 	int ret;
117212bd7a16SLeo Yan 	struct vfio_pci_device *pdev = &vdev->pci;
117312bd7a16SLeo Yan 	struct vfio_irq_info irq_info = {
117412bd7a16SLeo Yan 		.argsz = sizeof(irq_info),
117512bd7a16SLeo Yan 		.index = VFIO_PCI_INTX_IRQ_INDEX,
117612bd7a16SLeo Yan 	};
117712bd7a16SLeo Yan 
117812bd7a16SLeo Yan 	vfio_pci_reserve_irq_fds(2);
117912bd7a16SLeo Yan 
118012bd7a16SLeo Yan 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
118112bd7a16SLeo Yan 	if (ret || irq_info.count == 0) {
118212bd7a16SLeo Yan 		vfio_dev_err(vdev, "no INTx reported by VFIO");
118312bd7a16SLeo Yan 		return -ENODEV;
118412bd7a16SLeo Yan 	}
118512bd7a16SLeo Yan 
118612bd7a16SLeo Yan 	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
118712bd7a16SLeo Yan 		vfio_dev_err(vdev, "interrupt not eventfd capable");
118812bd7a16SLeo Yan 		return -EINVAL;
118912bd7a16SLeo Yan 	}
119012bd7a16SLeo Yan 
119112bd7a16SLeo Yan 	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
119212bd7a16SLeo Yan 		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
119312bd7a16SLeo Yan 		return -EINVAL;
119412bd7a16SLeo Yan 	}
119512bd7a16SLeo Yan 
119612bd7a16SLeo Yan 	/* Guest is going to ovewrite our irq_line... */
119712bd7a16SLeo Yan 	pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
119812bd7a16SLeo Yan 
11997302327aSLeo Yan 	pdev->intx_fd = -1;
12007302327aSLeo Yan 
120112bd7a16SLeo Yan 	return 0;
120212bd7a16SLeo Yan }
120312bd7a16SLeo Yan 
12046078a454SJean-Philippe Brucker static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
12056078a454SJean-Philippe Brucker {
1206c9888d95SJean-Philippe Brucker 	int ret = 0;
12076078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
12086078a454SJean-Philippe Brucker 
1209c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1210c9888d95SJean-Philippe Brucker 		pdev->msix.info = (struct vfio_irq_info) {
1211c9888d95SJean-Philippe Brucker 			.argsz = sizeof(pdev->msix.info),
1212c9888d95SJean-Philippe Brucker 			.index = VFIO_PCI_MSIX_IRQ_INDEX,
12136078a454SJean-Philippe Brucker 		};
1214c9888d95SJean-Philippe Brucker 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
1215c9888d95SJean-Philippe Brucker 		if (ret)
1216c9888d95SJean-Philippe Brucker 			return ret;
12176078a454SJean-Philippe Brucker 	}
12186078a454SJean-Philippe Brucker 
12198dd28afeSJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
12208dd28afeSJean-Philippe Brucker 		pdev->msi.info = (struct vfio_irq_info) {
12218dd28afeSJean-Philippe Brucker 			.argsz = sizeof(pdev->msi.info),
12228dd28afeSJean-Philippe Brucker 			.index = VFIO_PCI_MSI_IRQ_INDEX,
12238dd28afeSJean-Philippe Brucker 		};
12248dd28afeSJean-Philippe Brucker 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
12258dd28afeSJean-Philippe Brucker 		if (ret)
12268dd28afeSJean-Philippe Brucker 			return ret;
12278dd28afeSJean-Philippe Brucker 	}
12288dd28afeSJean-Philippe Brucker 
122912bd7a16SLeo Yan 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
1230c0c45eedSAndre Przywara 		pci__assign_irq(&vdev->pci.hdr);
1231c0c45eedSAndre Przywara 
123212bd7a16SLeo Yan 		ret = vfio_pci_init_intx(kvm, vdev);
123312bd7a16SLeo Yan 		if (ret)
123412bd7a16SLeo Yan 			return ret;
123512bd7a16SLeo Yan 
1236c9888d95SJean-Philippe Brucker 		ret = vfio_pci_enable_intx(kvm, vdev);
123712bd7a16SLeo Yan 	}
1238c9888d95SJean-Philippe Brucker 
1239c9888d95SJean-Philippe Brucker 	return ret;
12406078a454SJean-Philippe Brucker }
12416078a454SJean-Philippe Brucker 
12426078a454SJean-Philippe Brucker int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
12436078a454SJean-Philippe Brucker {
12446078a454SJean-Philippe Brucker 	int ret;
12456078a454SJean-Philippe Brucker 
12466078a454SJean-Philippe Brucker 	ret = vfio_pci_configure_dev_regions(kvm, vdev);
12476078a454SJean-Philippe Brucker 	if (ret) {
12486078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to configure regions");
12496078a454SJean-Philippe Brucker 		return ret;
12506078a454SJean-Philippe Brucker 	}
12516078a454SJean-Philippe Brucker 
12526078a454SJean-Philippe Brucker 	vdev->dev_hdr = (struct device_header) {
12536078a454SJean-Philippe Brucker 		.bus_type	= DEVICE_BUS_PCI,
12546078a454SJean-Philippe Brucker 		.data		= &vdev->pci.hdr,
12556078a454SJean-Philippe Brucker 	};
12566078a454SJean-Philippe Brucker 
12576078a454SJean-Philippe Brucker 	ret = device__register(&vdev->dev_hdr);
12586078a454SJean-Philippe Brucker 	if (ret) {
12596078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to register VFIO device");
12606078a454SJean-Philippe Brucker 		return ret;
12616078a454SJean-Philippe Brucker 	}
12626078a454SJean-Philippe Brucker 
12636078a454SJean-Philippe Brucker 	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
12646078a454SJean-Philippe Brucker 	if (ret) {
12656078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to configure IRQs");
12666078a454SJean-Philippe Brucker 		return ret;
12676078a454SJean-Philippe Brucker 	}
12686078a454SJean-Philippe Brucker 
12696078a454SJean-Philippe Brucker 	return 0;
12706078a454SJean-Philippe Brucker }
12716078a454SJean-Philippe Brucker 
12726078a454SJean-Philippe Brucker void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
12736078a454SJean-Philippe Brucker {
12746078a454SJean-Philippe Brucker 	size_t i;
1275c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
12766078a454SJean-Philippe Brucker 
12776078a454SJean-Philippe Brucker 	for (i = 0; i < vdev->info.num_regions; i++)
12786078a454SJean-Philippe Brucker 		vfio_unmap_region(kvm, &vdev->regions[i]);
12796078a454SJean-Philippe Brucker 
12806078a454SJean-Philippe Brucker 	device__unregister(&vdev->dev_hdr);
1281c9888d95SJean-Philippe Brucker 
1282c9888d95SJean-Philippe Brucker 	free(pdev->msix.irq_set);
1283c9888d95SJean-Philippe Brucker 	free(pdev->msix.entries);
12848dd28afeSJean-Philippe Brucker 	free(pdev->msi.irq_set);
12858dd28afeSJean-Philippe Brucker 	free(pdev->msi.entries);
12866078a454SJean-Philippe Brucker }
1287