xref: /kvmtool/vfio/pci.c (revision 8dd28afedcf525f0b0b9be0c12760599ce162118)
16078a454SJean-Philippe Brucker #include "kvm/irq.h"
26078a454SJean-Philippe Brucker #include "kvm/kvm.h"
36078a454SJean-Philippe Brucker #include "kvm/kvm-cpu.h"
46078a454SJean-Philippe Brucker #include "kvm/vfio.h"
56078a454SJean-Philippe Brucker 
66078a454SJean-Philippe Brucker #include <sys/ioctl.h>
76078a454SJean-Philippe Brucker #include <sys/eventfd.h>
8c9888d95SJean-Philippe Brucker #include <sys/resource.h>
9c9888d95SJean-Philippe Brucker #include <sys/time.h>
106078a454SJean-Philippe Brucker 
116078a454SJean-Philippe Brucker /* Wrapper around UAPI vfio_irq_set */
126078a454SJean-Philippe Brucker struct vfio_irq_eventfd {
136078a454SJean-Philippe Brucker 	struct vfio_irq_set	irq;
146078a454SJean-Philippe Brucker 	int			fd;
156078a454SJean-Philippe Brucker };
166078a454SJean-Philippe Brucker 
/*
 * Accessors for the VFIO_PCI_MSI_STATE_* bits stored in the virt_state and
 * phys_state fields.
 *
 * The msi_is_*() macros yield the raw masked bit, i.e. zero or a non-zero
 * value that is not necessarily 1. Normalize with !! before comparing the
 * result against a bool.
 */
#define msi_is_enabled(state)		((state) & VFIO_PCI_MSI_STATE_ENABLED)
#define msi_is_masked(state)		((state) & VFIO_PCI_MSI_STATE_MASKED)
#define msi_is_empty(state)		((state) & VFIO_PCI_MSI_STATE_EMPTY)

/*
 * Set @bit in @state when @val is non-zero, clear it otherwise.
 *
 * The expansion is a single parenthesized expression without a trailing
 * semicolon, so it can be used safely as the body of an if/else, and @bit may
 * be any expression (for instance several flags OR-ed together). Note that
 * @state is evaluated more than once: do not pass an lvalue with side effects.
 */
#define msi_update_state(state, val, bit)				\
	((state) = (val) ? ((state) | (bit)) : ((state) & ~(bit)))
#define msi_set_enabled(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
#define msi_set_masked(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
#define msi_set_empty(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)
29c9888d95SJean-Philippe Brucker 
30c9888d95SJean-Philippe Brucker static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
31c9888d95SJean-Philippe Brucker 
32*8dd28afeSJean-Philippe Brucker static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
33*8dd28afeSJean-Philippe Brucker 				bool msix)
34c9888d95SJean-Philippe Brucker {
35c9888d95SJean-Philippe Brucker 	size_t i;
36c9888d95SJean-Philippe Brucker 	int ret = 0;
37c9888d95SJean-Philippe Brucker 	int *eventfds;
38c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
39*8dd28afeSJean-Philippe Brucker 	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
40c9888d95SJean-Philippe Brucker 	struct vfio_irq_eventfd single = {
41c9888d95SJean-Philippe Brucker 		.irq = {
42c9888d95SJean-Philippe Brucker 			.argsz	= sizeof(single),
43c9888d95SJean-Philippe Brucker 			.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
44c9888d95SJean-Philippe Brucker 				  VFIO_IRQ_SET_ACTION_TRIGGER,
45c9888d95SJean-Philippe Brucker 			.index	= msis->info.index,
46c9888d95SJean-Philippe Brucker 			.count	= 1,
47c9888d95SJean-Philippe Brucker 		},
48c9888d95SJean-Philippe Brucker 	};
49c9888d95SJean-Philippe Brucker 
50c9888d95SJean-Philippe Brucker 	if (!msi_is_enabled(msis->virt_state))
51c9888d95SJean-Philippe Brucker 		return 0;
52c9888d95SJean-Philippe Brucker 
53c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
54c9888d95SJean-Philippe Brucker 		/*
55c9888d95SJean-Philippe Brucker 		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
56c9888d95SJean-Philippe Brucker 		 * time. Since INTx has to be enabled from the start (we don't
57c9888d95SJean-Philippe Brucker 		 * have a reliable way to know when the user starts using it),
58c9888d95SJean-Philippe Brucker 		 * disable it now.
59c9888d95SJean-Philippe Brucker 		 */
60c9888d95SJean-Philippe Brucker 		vfio_pci_disable_intx(kvm, vdev);
61c9888d95SJean-Philippe Brucker 		/* Permanently disable INTx */
62c9888d95SJean-Philippe Brucker 		pdev->irq_modes &= ~VFIO_PCI_IRQ_MODE_INTX;
63c9888d95SJean-Philippe Brucker 	}
64c9888d95SJean-Philippe Brucker 
65c9888d95SJean-Philippe Brucker 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
66c9888d95SJean-Philippe Brucker 
67c9888d95SJean-Philippe Brucker 	/*
68c9888d95SJean-Philippe Brucker 	 * Initial registration of the full range. This enables the physical
69c9888d95SJean-Philippe Brucker 	 * MSI/MSI-X capability, which might have desired side effects. For
70c9888d95SJean-Philippe Brucker 	 * instance when assigning virtio legacy devices, enabling the MSI
71c9888d95SJean-Philippe Brucker 	 * capability modifies the config space layout!
72c9888d95SJean-Philippe Brucker 	 *
73c9888d95SJean-Philippe Brucker 	 * As an optimization, only update MSIs when guest unmasks the
74c9888d95SJean-Philippe Brucker 	 * capability. This greatly reduces the initialization time for Linux
75c9888d95SJean-Philippe Brucker 	 * guest with 2048+ MSIs. Linux guest starts by enabling the MSI-X cap
76c9888d95SJean-Philippe Brucker 	 * masked, then fills individual vectors, then unmasks the whole
77c9888d95SJean-Philippe Brucker 	 * function. So we only do one VFIO ioctl when enabling for the first
78c9888d95SJean-Philippe Brucker 	 * time, and then one when unmasking.
79c9888d95SJean-Philippe Brucker 	 *
80c9888d95SJean-Philippe Brucker 	 * phys_state is empty when it is enabled but no vector has been
81c9888d95SJean-Philippe Brucker 	 * registered via SET_IRQS yet.
82c9888d95SJean-Philippe Brucker 	 */
83c9888d95SJean-Philippe Brucker 	if (!msi_is_enabled(msis->phys_state) ||
84c9888d95SJean-Philippe Brucker 	    (!msi_is_masked(msis->virt_state) &&
85c9888d95SJean-Philippe Brucker 	     msi_is_empty(msis->phys_state))) {
86c9888d95SJean-Philippe Brucker 		bool empty = true;
87c9888d95SJean-Philippe Brucker 
88c9888d95SJean-Philippe Brucker 		for (i = 0; i < msis->nr_entries; i++) {
89c9888d95SJean-Philippe Brucker 			eventfds[i] = msis->entries[i].gsi >= 0 ?
90c9888d95SJean-Philippe Brucker 				      msis->entries[i].eventfd : -1;
91c9888d95SJean-Philippe Brucker 
92c9888d95SJean-Philippe Brucker 			if (eventfds[i] >= 0)
93c9888d95SJean-Philippe Brucker 				empty = false;
94c9888d95SJean-Philippe Brucker 		}
95c9888d95SJean-Philippe Brucker 
96c9888d95SJean-Philippe Brucker 		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
97c9888d95SJean-Philippe Brucker 		if (ret < 0) {
98c9888d95SJean-Philippe Brucker 			perror("VFIO_DEVICE_SET_IRQS(multi)");
99c9888d95SJean-Philippe Brucker 			return ret;
100c9888d95SJean-Philippe Brucker 		}
101c9888d95SJean-Philippe Brucker 
102c9888d95SJean-Philippe Brucker 		msi_set_enabled(msis->phys_state, true);
103c9888d95SJean-Philippe Brucker 		msi_set_empty(msis->phys_state, empty);
104c9888d95SJean-Philippe Brucker 
105c9888d95SJean-Philippe Brucker 		return 0;
106c9888d95SJean-Philippe Brucker 	}
107c9888d95SJean-Philippe Brucker 
108c9888d95SJean-Philippe Brucker 	if (msi_is_masked(msis->virt_state)) {
109c9888d95SJean-Philippe Brucker 		/* TODO: if phys_state is not empty nor masked, mask all vectors */
110c9888d95SJean-Philippe Brucker 		return 0;
111c9888d95SJean-Philippe Brucker 	}
112c9888d95SJean-Philippe Brucker 
113c9888d95SJean-Philippe Brucker 	/* Update individual vectors to avoid breaking those in use */
114c9888d95SJean-Philippe Brucker 	for (i = 0; i < msis->nr_entries; i++) {
115c9888d95SJean-Philippe Brucker 		struct vfio_pci_msi_entry *entry = &msis->entries[i];
116c9888d95SJean-Philippe Brucker 		int fd = entry->gsi >= 0 ? entry->eventfd : -1;
117c9888d95SJean-Philippe Brucker 
118c9888d95SJean-Philippe Brucker 		if (fd == eventfds[i])
119c9888d95SJean-Philippe Brucker 			continue;
120c9888d95SJean-Philippe Brucker 
121c9888d95SJean-Philippe Brucker 		single.irq.start = i;
122c9888d95SJean-Philippe Brucker 		single.fd = fd;
123c9888d95SJean-Philippe Brucker 
124c9888d95SJean-Philippe Brucker 		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
125c9888d95SJean-Philippe Brucker 		if (ret < 0) {
126c9888d95SJean-Philippe Brucker 			perror("VFIO_DEVICE_SET_IRQS(single)");
127c9888d95SJean-Philippe Brucker 			break;
128c9888d95SJean-Philippe Brucker 		}
129c9888d95SJean-Philippe Brucker 
130c9888d95SJean-Philippe Brucker 		eventfds[i] = fd;
131c9888d95SJean-Philippe Brucker 
132c9888d95SJean-Philippe Brucker 		if (msi_is_empty(msis->phys_state) && fd >= 0)
133c9888d95SJean-Philippe Brucker 			msi_set_empty(msis->phys_state, false);
134c9888d95SJean-Philippe Brucker 	}
135c9888d95SJean-Philippe Brucker 
136c9888d95SJean-Philippe Brucker 	return ret;
137c9888d95SJean-Philippe Brucker }
138c9888d95SJean-Philippe Brucker 
139*8dd28afeSJean-Philippe Brucker static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
140*8dd28afeSJean-Philippe Brucker 				 bool msix)
141c9888d95SJean-Philippe Brucker {
142c9888d95SJean-Philippe Brucker 	int ret;
143c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
144*8dd28afeSJean-Philippe Brucker 	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
145c9888d95SJean-Philippe Brucker 	struct vfio_irq_set irq_set = {
146c9888d95SJean-Philippe Brucker 		.argsz	= sizeof(irq_set),
147c9888d95SJean-Philippe Brucker 		.flags 	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
148c9888d95SJean-Philippe Brucker 		.index 	= msis->info.index,
149c9888d95SJean-Philippe Brucker 		.start 	= 0,
150c9888d95SJean-Philippe Brucker 		.count	= 0,
151c9888d95SJean-Philippe Brucker 	};
152c9888d95SJean-Philippe Brucker 
153c9888d95SJean-Philippe Brucker 	if (!msi_is_enabled(msis->phys_state))
154c9888d95SJean-Philippe Brucker 		return 0;
155c9888d95SJean-Philippe Brucker 
156c9888d95SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
157c9888d95SJean-Philippe Brucker 	if (ret < 0) {
158c9888d95SJean-Philippe Brucker 		perror("VFIO_DEVICE_SET_IRQS(NONE)");
159c9888d95SJean-Philippe Brucker 		return ret;
160c9888d95SJean-Philippe Brucker 	}
161c9888d95SJean-Philippe Brucker 
162c9888d95SJean-Philippe Brucker 	msi_set_enabled(msis->phys_state, false);
163c9888d95SJean-Philippe Brucker 	msi_set_empty(msis->phys_state, true);
164c9888d95SJean-Philippe Brucker 
165c9888d95SJean-Philippe Brucker 	return 0;
166c9888d95SJean-Philippe Brucker }
167c9888d95SJean-Philippe Brucker 
168c9888d95SJean-Philippe Brucker static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
169c9888d95SJean-Philippe Brucker 				     struct vfio_pci_msi_entry *entry)
170c9888d95SJean-Philippe Brucker {
171c9888d95SJean-Philippe Brucker 	int ret;
172c9888d95SJean-Philippe Brucker 
173c9888d95SJean-Philippe Brucker 	if (entry->eventfd < 0) {
174c9888d95SJean-Philippe Brucker 		entry->eventfd = eventfd(0, 0);
175c9888d95SJean-Philippe Brucker 		if (entry->eventfd < 0) {
176c9888d95SJean-Philippe Brucker 			ret = -errno;
177c9888d95SJean-Philippe Brucker 			vfio_dev_err(vdev, "cannot create eventfd");
178c9888d95SJean-Philippe Brucker 			return ret;
179c9888d95SJean-Philippe Brucker 		}
180c9888d95SJean-Philippe Brucker 	}
181c9888d95SJean-Philippe Brucker 
182c9888d95SJean-Philippe Brucker 	/* Allocate IRQ if necessary */
183c9888d95SJean-Philippe Brucker 	if (entry->gsi < 0) {
184c9888d95SJean-Philippe Brucker 		int ret = irq__add_msix_route(kvm, &entry->config.msg,
185c9888d95SJean-Philippe Brucker 					      vdev->dev_hdr.dev_num << 3);
186c9888d95SJean-Philippe Brucker 		if (ret < 0) {
187c9888d95SJean-Philippe Brucker 			vfio_dev_err(vdev, "cannot create MSI-X route");
188c9888d95SJean-Philippe Brucker 			return ret;
189c9888d95SJean-Philippe Brucker 		}
190c9888d95SJean-Philippe Brucker 		entry->gsi = ret;
191c9888d95SJean-Philippe Brucker 	} else {
192c9888d95SJean-Philippe Brucker 		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
193c9888d95SJean-Philippe Brucker 	}
194c9888d95SJean-Philippe Brucker 
195c9888d95SJean-Philippe Brucker 	/*
196c9888d95SJean-Philippe Brucker 	 * MSI masking is unimplemented in VFIO, so we have to handle it by
197c9888d95SJean-Philippe Brucker 	 * disabling/enabling IRQ route instead. We do it on the KVM side rather
198c9888d95SJean-Philippe Brucker 	 * than VFIO, because:
199c9888d95SJean-Philippe Brucker 	 * - it is 8x faster
200c9888d95SJean-Philippe Brucker 	 * - it allows to decouple masking logic from capability state.
201c9888d95SJean-Philippe Brucker 	 * - in masked state, after removing irqfd route, we could easily plug
202c9888d95SJean-Philippe Brucker 	 *   the eventfd in a local handler, in order to serve Pending Bit reads
203c9888d95SJean-Philippe Brucker 	 *   to the guest.
204c9888d95SJean-Philippe Brucker 	 *
205c9888d95SJean-Philippe Brucker 	 * So entry->phys_state is masked when there is no active irqfd route.
206c9888d95SJean-Philippe Brucker 	 */
207c9888d95SJean-Philippe Brucker 	if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
208c9888d95SJean-Philippe Brucker 		return 0;
209c9888d95SJean-Philippe Brucker 
210c9888d95SJean-Philippe Brucker 	if (msi_is_masked(entry->phys_state)) {
211c9888d95SJean-Philippe Brucker 		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
212c9888d95SJean-Philippe Brucker 		if (ret < 0) {
213c9888d95SJean-Philippe Brucker 			vfio_dev_err(vdev, "cannot setup irqfd");
214c9888d95SJean-Philippe Brucker 			return ret;
215c9888d95SJean-Philippe Brucker 		}
216c9888d95SJean-Philippe Brucker 	} else {
217c9888d95SJean-Philippe Brucker 		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
218c9888d95SJean-Philippe Brucker 	}
219c9888d95SJean-Philippe Brucker 
220c9888d95SJean-Philippe Brucker 	msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));
221c9888d95SJean-Philippe Brucker 
222c9888d95SJean-Philippe Brucker 	return 0;
223c9888d95SJean-Philippe Brucker }
224c9888d95SJean-Philippe Brucker 
225c9888d95SJean-Philippe Brucker static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
226c9888d95SJean-Philippe Brucker 				     u32 len, u8 is_write, void *ptr)
227c9888d95SJean-Philippe Brucker {
228c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = ptr;
229c9888d95SJean-Philippe Brucker 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
230c9888d95SJean-Philippe Brucker 	u64 offset = addr - pba->guest_phys_addr;
231c9888d95SJean-Philippe Brucker 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
232c9888d95SJean-Philippe Brucker 
233c9888d95SJean-Philippe Brucker 	if (is_write)
234c9888d95SJean-Philippe Brucker 		return;
235c9888d95SJean-Philippe Brucker 
236c9888d95SJean-Philippe Brucker 	/*
237c9888d95SJean-Philippe Brucker 	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA
238c9888d95SJean-Philippe Brucker 	 * is completely useless here. Note that Linux doesn't use PBA.
239c9888d95SJean-Philippe Brucker 	 */
240c9888d95SJean-Philippe Brucker 	if (pread(vdev->fd, data, len, pba->offset + offset) != (ssize_t)len)
241c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
242c9888d95SJean-Philippe Brucker }
243c9888d95SJean-Philippe Brucker 
244c9888d95SJean-Philippe Brucker static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
245c9888d95SJean-Philippe Brucker 				       u32 len, u8 is_write, void *ptr)
246c9888d95SJean-Philippe Brucker {
247c9888d95SJean-Philippe Brucker 	struct kvm *kvm = vcpu->kvm;
248c9888d95SJean-Philippe Brucker 	struct vfio_pci_msi_entry *entry;
249c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = ptr;
250c9888d95SJean-Philippe Brucker 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
251c9888d95SJean-Philippe Brucker 
252c9888d95SJean-Philippe Brucker 	u64 offset = addr - pdev->msix_table.guest_phys_addr;
253c9888d95SJean-Philippe Brucker 
254c9888d95SJean-Philippe Brucker 	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
255c9888d95SJean-Philippe Brucker 	off_t field = offset % PCI_MSIX_ENTRY_SIZE;
256c9888d95SJean-Philippe Brucker 
257c9888d95SJean-Philippe Brucker 	/*
258c9888d95SJean-Philippe Brucker 	 * PCI spec says that software must use aligned 4 or 8 bytes accesses
259c9888d95SJean-Philippe Brucker 	 * for the MSI-X tables.
260c9888d95SJean-Philippe Brucker 	 */
261c9888d95SJean-Philippe Brucker 	if ((len != 4 && len != 8) || addr & (len - 1)) {
262c9888d95SJean-Philippe Brucker 		vfio_dev_warn(vdev, "invalid MSI-X table access");
263c9888d95SJean-Philippe Brucker 		return;
264c9888d95SJean-Philippe Brucker 	}
265c9888d95SJean-Philippe Brucker 
266c9888d95SJean-Philippe Brucker 	entry = &pdev->msix.entries[vector];
267c9888d95SJean-Philippe Brucker 
268c9888d95SJean-Philippe Brucker 	mutex_lock(&pdev->msix.mutex);
269c9888d95SJean-Philippe Brucker 
270c9888d95SJean-Philippe Brucker 	if (!is_write) {
271c9888d95SJean-Philippe Brucker 		memcpy(data, (void *)&entry->config + field, len);
272c9888d95SJean-Philippe Brucker 		goto out_unlock;
273c9888d95SJean-Philippe Brucker 	}
274c9888d95SJean-Philippe Brucker 
275c9888d95SJean-Philippe Brucker 	memcpy((void *)&entry->config + field, data, len);
276c9888d95SJean-Philippe Brucker 
277c9888d95SJean-Philippe Brucker 	/*
278c9888d95SJean-Philippe Brucker 	 * Check if access touched the vector control register, which is at the
279c9888d95SJean-Philippe Brucker 	 * end of the MSI-X entry.
280c9888d95SJean-Philippe Brucker 	 */
281c9888d95SJean-Philippe Brucker 	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
282c9888d95SJean-Philippe Brucker 		goto out_unlock;
283c9888d95SJean-Philippe Brucker 
284c9888d95SJean-Philippe Brucker 	msi_set_masked(entry->virt_state, entry->config.ctrl &
285c9888d95SJean-Philippe Brucker 		       PCI_MSIX_ENTRY_CTRL_MASKBIT);
286c9888d95SJean-Philippe Brucker 
287c9888d95SJean-Philippe Brucker 	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
288c9888d95SJean-Philippe Brucker 		/* Not much we can do here. */
289c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);
290c9888d95SJean-Philippe Brucker 
291c9888d95SJean-Philippe Brucker 	/* Update the physical capability if necessary */
292*8dd28afeSJean-Philippe Brucker 	if (vfio_pci_enable_msis(kvm, vdev, true))
293c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot enable MSIX");
294c9888d95SJean-Philippe Brucker 
295c9888d95SJean-Philippe Brucker out_unlock:
296c9888d95SJean-Philippe Brucker 	mutex_unlock(&pdev->msix.mutex);
297c9888d95SJean-Philippe Brucker }
298c9888d95SJean-Philippe Brucker 
299c9888d95SJean-Philippe Brucker static void vfio_pci_msix_cap_write(struct kvm *kvm,
300c9888d95SJean-Philippe Brucker 				    struct vfio_device *vdev, u8 off,
301c9888d95SJean-Philippe Brucker 				    void *data, int sz)
302c9888d95SJean-Philippe Brucker {
303c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
304c9888d95SJean-Philippe Brucker 	off_t enable_pos = PCI_MSIX_FLAGS + 1;
305c9888d95SJean-Philippe Brucker 	bool enable;
306c9888d95SJean-Philippe Brucker 	u16 flags;
307c9888d95SJean-Philippe Brucker 
308c9888d95SJean-Philippe Brucker 	off -= pdev->msix.pos;
309c9888d95SJean-Philippe Brucker 
310c9888d95SJean-Philippe Brucker 	/* Check if access intersects with the MSI-X Enable bit */
311c9888d95SJean-Philippe Brucker 	if (off > enable_pos || off + sz <= enable_pos)
312c9888d95SJean-Philippe Brucker 		return;
313c9888d95SJean-Philippe Brucker 
314c9888d95SJean-Philippe Brucker 	/* Read byte that contains the Enable bit */
315c9888d95SJean-Philippe Brucker 	flags = *(u8 *)(data + enable_pos - off) << 8;
316c9888d95SJean-Philippe Brucker 
317c9888d95SJean-Philippe Brucker 	mutex_lock(&pdev->msix.mutex);
318c9888d95SJean-Philippe Brucker 
319c9888d95SJean-Philippe Brucker 	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
320c9888d95SJean-Philippe Brucker 	enable = flags & PCI_MSIX_FLAGS_ENABLE;
321c9888d95SJean-Philippe Brucker 	msi_set_enabled(pdev->msix.virt_state, enable);
322c9888d95SJean-Philippe Brucker 
323*8dd28afeSJean-Philippe Brucker 	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
324c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot enable MSIX");
325*8dd28afeSJean-Philippe Brucker 	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
326c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot disable MSIX");
327c9888d95SJean-Philippe Brucker 
328c9888d95SJean-Philippe Brucker 	mutex_unlock(&pdev->msix.mutex);
329c9888d95SJean-Philippe Brucker }
330c9888d95SJean-Philippe Brucker 
331*8dd28afeSJean-Philippe Brucker static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
332*8dd28afeSJean-Philippe Brucker 				     u8 off, u8 *data, u32 sz)
333*8dd28afeSJean-Philippe Brucker {
334*8dd28afeSJean-Philippe Brucker 	size_t i;
335*8dd28afeSJean-Philippe Brucker 	u32 mask = 0;
336*8dd28afeSJean-Philippe Brucker 	size_t mask_pos, start, limit;
337*8dd28afeSJean-Philippe Brucker 	struct vfio_pci_msi_entry *entry;
338*8dd28afeSJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
339*8dd28afeSJean-Philippe Brucker 	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
340*8dd28afeSJean-Philippe Brucker 
341*8dd28afeSJean-Philippe Brucker 	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
342*8dd28afeSJean-Philippe Brucker 		return 0;
343*8dd28afeSJean-Philippe Brucker 
344*8dd28afeSJean-Philippe Brucker 	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
345*8dd28afeSJean-Philippe Brucker 		mask_pos = PCI_MSI_MASK_64;
346*8dd28afeSJean-Philippe Brucker 	else
347*8dd28afeSJean-Philippe Brucker 		mask_pos = PCI_MSI_MASK_32;
348*8dd28afeSJean-Philippe Brucker 
349*8dd28afeSJean-Philippe Brucker 	if (off >= mask_pos + 4 || off + sz <= mask_pos)
350*8dd28afeSJean-Philippe Brucker 		return 0;
351*8dd28afeSJean-Philippe Brucker 
352*8dd28afeSJean-Philippe Brucker 	/* Set mask to current state */
353*8dd28afeSJean-Philippe Brucker 	for (i = 0; i < pdev->msi.nr_entries; i++) {
354*8dd28afeSJean-Philippe Brucker 		entry = &pdev->msi.entries[i];
355*8dd28afeSJean-Philippe Brucker 		mask |= !!msi_is_masked(entry->virt_state) << i;
356*8dd28afeSJean-Philippe Brucker 	}
357*8dd28afeSJean-Philippe Brucker 
358*8dd28afeSJean-Philippe Brucker 	/* Update mask following the intersection of access and register */
359*8dd28afeSJean-Philippe Brucker 	start = max_t(size_t, off, mask_pos);
360*8dd28afeSJean-Philippe Brucker 	limit = min_t(size_t, off + sz, mask_pos + 4);
361*8dd28afeSJean-Philippe Brucker 
362*8dd28afeSJean-Philippe Brucker 	memcpy((void *)&mask + start - mask_pos, data + start - off,
363*8dd28afeSJean-Philippe Brucker 	       limit - start);
364*8dd28afeSJean-Philippe Brucker 
365*8dd28afeSJean-Philippe Brucker 	/* Update states if necessary */
366*8dd28afeSJean-Philippe Brucker 	for (i = 0; i < pdev->msi.nr_entries; i++) {
367*8dd28afeSJean-Philippe Brucker 		bool masked = mask & (1 << i);
368*8dd28afeSJean-Philippe Brucker 
369*8dd28afeSJean-Philippe Brucker 		entry = &pdev->msi.entries[i];
370*8dd28afeSJean-Philippe Brucker 		if (masked != msi_is_masked(entry->virt_state)) {
371*8dd28afeSJean-Philippe Brucker 			msi_set_masked(entry->virt_state, masked);
372*8dd28afeSJean-Philippe Brucker 			vfio_pci_update_msi_entry(kvm, vdev, entry);
373*8dd28afeSJean-Philippe Brucker 		}
374*8dd28afeSJean-Philippe Brucker 	}
375*8dd28afeSJean-Philippe Brucker 
376*8dd28afeSJean-Philippe Brucker 	return 1;
377*8dd28afeSJean-Philippe Brucker }
378*8dd28afeSJean-Philippe Brucker 
379*8dd28afeSJean-Philippe Brucker static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
380*8dd28afeSJean-Philippe Brucker 				   u8 off, u8 *data, u32 sz)
381*8dd28afeSJean-Philippe Brucker {
382*8dd28afeSJean-Philippe Brucker 	u8 ctrl;
383*8dd28afeSJean-Philippe Brucker 	struct msi_msg msg;
384*8dd28afeSJean-Philippe Brucker 	size_t i, nr_vectors;
385*8dd28afeSJean-Philippe Brucker 	struct vfio_pci_msi_entry *entry;
386*8dd28afeSJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
387*8dd28afeSJean-Philippe Brucker 	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
388*8dd28afeSJean-Philippe Brucker 
389*8dd28afeSJean-Philippe Brucker 	off -= pdev->msi.pos;
390*8dd28afeSJean-Philippe Brucker 
391*8dd28afeSJean-Philippe Brucker 	mutex_lock(&pdev->msi.mutex);
392*8dd28afeSJean-Philippe Brucker 
393*8dd28afeSJean-Philippe Brucker 	/* Check if the guest is trying to update mask bits */
394*8dd28afeSJean-Philippe Brucker 	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
395*8dd28afeSJean-Philippe Brucker 		goto out_unlock;
396*8dd28afeSJean-Philippe Brucker 
397*8dd28afeSJean-Philippe Brucker 	/* Only modify routes when guest pokes the enable bit */
398*8dd28afeSJean-Philippe Brucker 	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
399*8dd28afeSJean-Philippe Brucker 		goto out_unlock;
400*8dd28afeSJean-Philippe Brucker 
401*8dd28afeSJean-Philippe Brucker 	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);
402*8dd28afeSJean-Philippe Brucker 
403*8dd28afeSJean-Philippe Brucker 	msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE);
404*8dd28afeSJean-Philippe Brucker 
405*8dd28afeSJean-Philippe Brucker 	if (!msi_is_enabled(pdev->msi.virt_state)) {
406*8dd28afeSJean-Philippe Brucker 		vfio_pci_disable_msis(kvm, vdev, false);
407*8dd28afeSJean-Philippe Brucker 		goto out_unlock;
408*8dd28afeSJean-Philippe Brucker 	}
409*8dd28afeSJean-Philippe Brucker 
410*8dd28afeSJean-Philippe Brucker 	/* Create routes for the requested vectors */
411*8dd28afeSJean-Philippe Brucker 	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);
412*8dd28afeSJean-Philippe Brucker 
413*8dd28afeSJean-Philippe Brucker 	msg.address_lo = msi_cap_64->address_lo;
414*8dd28afeSJean-Philippe Brucker 	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
415*8dd28afeSJean-Philippe Brucker 		msg.address_hi = msi_cap_64->address_hi;
416*8dd28afeSJean-Philippe Brucker 		msg.data = msi_cap_64->data;
417*8dd28afeSJean-Philippe Brucker 	} else {
418*8dd28afeSJean-Philippe Brucker 		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
419*8dd28afeSJean-Philippe Brucker 		msg.address_hi = 0;
420*8dd28afeSJean-Philippe Brucker 		msg.data = msi_cap_32->data;
421*8dd28afeSJean-Philippe Brucker 	}
422*8dd28afeSJean-Philippe Brucker 
423*8dd28afeSJean-Philippe Brucker 	for (i = 0; i < nr_vectors; i++) {
424*8dd28afeSJean-Philippe Brucker 		entry = &pdev->msi.entries[i];
425*8dd28afeSJean-Philippe Brucker 		entry->config.msg = msg;
426*8dd28afeSJean-Philippe Brucker 		vfio_pci_update_msi_entry(kvm, vdev, entry);
427*8dd28afeSJean-Philippe Brucker 	}
428*8dd28afeSJean-Philippe Brucker 
429*8dd28afeSJean-Philippe Brucker 	/* Update the physical capability if necessary */
430*8dd28afeSJean-Philippe Brucker 	if (vfio_pci_enable_msis(kvm, vdev, false))
431*8dd28afeSJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot enable MSI");
432*8dd28afeSJean-Philippe Brucker 
433*8dd28afeSJean-Philippe Brucker out_unlock:
434*8dd28afeSJean-Philippe Brucker 	mutex_unlock(&pdev->msi.mutex);
435*8dd28afeSJean-Philippe Brucker }
436*8dd28afeSJean-Philippe Brucker 
4376078a454SJean-Philippe Brucker static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
4386078a454SJean-Philippe Brucker 			      u8 offset, void *data, int sz)
4396078a454SJean-Philippe Brucker {
4406078a454SJean-Philippe Brucker 	struct vfio_region_info *info;
4416078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev;
4426078a454SJean-Philippe Brucker 	struct vfio_device *vdev;
4436078a454SJean-Philippe Brucker 	char base[sz];
4446078a454SJean-Philippe Brucker 
4456078a454SJean-Philippe Brucker 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
4466078a454SJean-Philippe Brucker 	vdev = container_of(pdev, struct vfio_device, pci);
4476078a454SJean-Philippe Brucker 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
4486078a454SJean-Philippe Brucker 
4496078a454SJean-Philippe Brucker 	/* Dummy read in case of side-effects */
4506078a454SJean-Philippe Brucker 	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
4516078a454SJean-Philippe Brucker 		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
4526078a454SJean-Philippe Brucker 			      sz, offset);
4536078a454SJean-Philippe Brucker }
4546078a454SJean-Philippe Brucker 
4556078a454SJean-Philippe Brucker static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
4566078a454SJean-Philippe Brucker 			       u8 offset, void *data, int sz)
4576078a454SJean-Philippe Brucker {
4586078a454SJean-Philippe Brucker 	struct vfio_region_info *info;
4596078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev;
4606078a454SJean-Philippe Brucker 	struct vfio_device *vdev;
4616078a454SJean-Philippe Brucker 	void *base = pci_hdr;
4626078a454SJean-Philippe Brucker 
4636078a454SJean-Philippe Brucker 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
4646078a454SJean-Philippe Brucker 	vdev = container_of(pdev, struct vfio_device, pci);
4656078a454SJean-Philippe Brucker 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
4666078a454SJean-Philippe Brucker 
4676078a454SJean-Philippe Brucker 	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
4686078a454SJean-Philippe Brucker 		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
4696078a454SJean-Philippe Brucker 			      sz, offset);
4706078a454SJean-Philippe Brucker 
471c9888d95SJean-Philippe Brucker 	/* Handle MSI write now, since it might update the hardware capability */
472c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
473c9888d95SJean-Philippe Brucker 		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);
474c9888d95SJean-Philippe Brucker 
475*8dd28afeSJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
476*8dd28afeSJean-Philippe Brucker 		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);
477*8dd28afeSJean-Philippe Brucker 
4786078a454SJean-Philippe Brucker 	if (pread(vdev->fd, base + offset, sz, info->offset + offset) != sz)
4796078a454SJean-Philippe Brucker 		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
4806078a454SJean-Philippe Brucker 			      sz, offset);
4816078a454SJean-Philippe Brucker }
4826078a454SJean-Philippe Brucker 
483*8dd28afeSJean-Philippe Brucker static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
484*8dd28afeSJean-Philippe Brucker {
485*8dd28afeSJean-Philippe Brucker 	size_t size = 10;
486*8dd28afeSJean-Philippe Brucker 
487*8dd28afeSJean-Philippe Brucker 	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
488*8dd28afeSJean-Philippe Brucker 		size += 4;
489*8dd28afeSJean-Philippe Brucker 	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
490*8dd28afeSJean-Philippe Brucker 		size += 10;
491*8dd28afeSJean-Philippe Brucker 
492*8dd28afeSJean-Philippe Brucker 	return size;
493*8dd28afeSJean-Philippe Brucker }
494*8dd28afeSJean-Philippe Brucker 
495c9888d95SJean-Philippe Brucker static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
496c9888d95SJean-Philippe Brucker {
497c9888d95SJean-Philippe Brucker 	switch (cap_hdr->type) {
498c9888d95SJean-Philippe Brucker 	case PCI_CAP_ID_MSIX:
499c9888d95SJean-Philippe Brucker 		return PCI_CAP_MSIX_SIZEOF;
500*8dd28afeSJean-Philippe Brucker 	case PCI_CAP_ID_MSI:
501*8dd28afeSJean-Philippe Brucker 		return vfio_pci_msi_cap_size((void *)cap_hdr);
502c9888d95SJean-Philippe Brucker 	default:
503c9888d95SJean-Philippe Brucker 		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
504c9888d95SJean-Philippe Brucker 		return 0;
505c9888d95SJean-Philippe Brucker 	}
506c9888d95SJean-Philippe Brucker }
507c9888d95SJean-Philippe Brucker 
508c9888d95SJean-Philippe Brucker static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
509c9888d95SJean-Philippe Brucker 			    struct pci_cap_hdr *cap, off_t pos)
510c9888d95SJean-Philippe Brucker {
511c9888d95SJean-Philippe Brucker 	struct pci_cap_hdr *last;
512c9888d95SJean-Philippe Brucker 	struct pci_device_header *hdr = &vdev->pci.hdr;
513c9888d95SJean-Philippe Brucker 
514c9888d95SJean-Philippe Brucker 	cap->next = 0;
515c9888d95SJean-Philippe Brucker 
516c9888d95SJean-Philippe Brucker 	if (!hdr->capabilities) {
517c9888d95SJean-Philippe Brucker 		hdr->capabilities = pos;
518c9888d95SJean-Philippe Brucker 		hdr->status |= PCI_STATUS_CAP_LIST;
519c9888d95SJean-Philippe Brucker 	} else {
520c9888d95SJean-Philippe Brucker 		last = PCI_CAP(virt_hdr, hdr->capabilities);
521c9888d95SJean-Philippe Brucker 
522c9888d95SJean-Philippe Brucker 		while (last->next)
523c9888d95SJean-Philippe Brucker 			last = PCI_CAP(virt_hdr, last->next);
524c9888d95SJean-Philippe Brucker 
525c9888d95SJean-Philippe Brucker 		last->next = pos;
526c9888d95SJean-Philippe Brucker 	}
527c9888d95SJean-Philippe Brucker 
528c9888d95SJean-Philippe Brucker 	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));
529c9888d95SJean-Philippe Brucker 
530c9888d95SJean-Philippe Brucker 	return 0;
531c9888d95SJean-Philippe Brucker }
532c9888d95SJean-Philippe Brucker 
5336078a454SJean-Philippe Brucker static int vfio_pci_parse_caps(struct vfio_device *vdev)
5346078a454SJean-Philippe Brucker {
535c9888d95SJean-Philippe Brucker 	int ret;
536c9888d95SJean-Philippe Brucker 	size_t size;
537c9888d95SJean-Philippe Brucker 	u8 pos, next;
538c9888d95SJean-Philippe Brucker 	struct pci_cap_hdr *cap;
539c9888d95SJean-Philippe Brucker 	u8 virt_hdr[PCI_DEV_CFG_SIZE];
5406078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
5416078a454SJean-Philippe Brucker 
5426078a454SJean-Philippe Brucker 	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
5436078a454SJean-Philippe Brucker 		return 0;
5446078a454SJean-Philippe Brucker 
545c9888d95SJean-Philippe Brucker 	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);
546c9888d95SJean-Philippe Brucker 
547c9888d95SJean-Philippe Brucker 	pos = pdev->hdr.capabilities & ~3;
548c9888d95SJean-Philippe Brucker 
5496078a454SJean-Philippe Brucker 	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
5506078a454SJean-Philippe Brucker 	pdev->hdr.capabilities = 0;
5516078a454SJean-Philippe Brucker 
552c9888d95SJean-Philippe Brucker 	for (; pos; pos = next) {
553c9888d95SJean-Philippe Brucker 		if (pos >= PCI_DEV_CFG_SIZE) {
554c9888d95SJean-Philippe Brucker 			vfio_dev_warn(vdev, "ignoring cap outside of config space");
555c9888d95SJean-Philippe Brucker 			return -EINVAL;
556c9888d95SJean-Philippe Brucker 		}
557c9888d95SJean-Philippe Brucker 
558c9888d95SJean-Philippe Brucker 		cap = PCI_CAP(&pdev->hdr, pos);
559c9888d95SJean-Philippe Brucker 		next = cap->next;
560c9888d95SJean-Philippe Brucker 
561c9888d95SJean-Philippe Brucker 		switch (cap->type) {
562c9888d95SJean-Philippe Brucker 		case PCI_CAP_ID_MSIX:
563c9888d95SJean-Philippe Brucker 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
564c9888d95SJean-Philippe Brucker 			if (ret)
565c9888d95SJean-Philippe Brucker 				return ret;
566c9888d95SJean-Philippe Brucker 
567c9888d95SJean-Philippe Brucker 			pdev->msix.pos = pos;
568c9888d95SJean-Philippe Brucker 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
569c9888d95SJean-Philippe Brucker 			break;
570*8dd28afeSJean-Philippe Brucker 		case PCI_CAP_ID_MSI:
571*8dd28afeSJean-Philippe Brucker 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
572*8dd28afeSJean-Philippe Brucker 			if (ret)
573*8dd28afeSJean-Philippe Brucker 				return ret;
574*8dd28afeSJean-Philippe Brucker 
575*8dd28afeSJean-Philippe Brucker 			pdev->msi.pos = pos;
576*8dd28afeSJean-Philippe Brucker 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
577*8dd28afeSJean-Philippe Brucker 			break;
578c9888d95SJean-Philippe Brucker 		}
579c9888d95SJean-Philippe Brucker 	}
580c9888d95SJean-Philippe Brucker 
581c9888d95SJean-Philippe Brucker 	/* Wipe remaining capabilities */
582c9888d95SJean-Philippe Brucker 	pos = PCI_STD_HEADER_SIZEOF;
583c9888d95SJean-Philippe Brucker 	size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF;
584c9888d95SJean-Philippe Brucker 	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
5856078a454SJean-Philippe Brucker 
5866078a454SJean-Philippe Brucker 	return 0;
5876078a454SJean-Philippe Brucker }
5886078a454SJean-Philippe Brucker 
5896078a454SJean-Philippe Brucker static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
5906078a454SJean-Philippe Brucker {
591c9888d95SJean-Philippe Brucker 	ssize_t sz = PCI_DEV_CFG_SIZE;
5926078a454SJean-Philippe Brucker 	struct vfio_region_info *info;
5936078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
5946078a454SJean-Philippe Brucker 
5956078a454SJean-Philippe Brucker 	if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
5966078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "Config Space not found");
5976078a454SJean-Philippe Brucker 		return -ENODEV;
5986078a454SJean-Philippe Brucker 	}
5996078a454SJean-Philippe Brucker 
6006078a454SJean-Philippe Brucker 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
6016078a454SJean-Philippe Brucker 	*info = (struct vfio_region_info) {
6026078a454SJean-Philippe Brucker 			.argsz = sizeof(*info),
6036078a454SJean-Philippe Brucker 			.index = VFIO_PCI_CONFIG_REGION_INDEX,
6046078a454SJean-Philippe Brucker 	};
6056078a454SJean-Philippe Brucker 
6066078a454SJean-Philippe Brucker 	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
6076078a454SJean-Philippe Brucker 	if (!info->size) {
6086078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "Config Space has size zero?!");
6096078a454SJean-Philippe Brucker 		return -EINVAL;
6106078a454SJean-Philippe Brucker 	}
6116078a454SJean-Philippe Brucker 
612c9888d95SJean-Philippe Brucker 	/* Read standard headers and capabilities */
6136078a454SJean-Philippe Brucker 	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
6146078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
6156078a454SJean-Philippe Brucker 		return -EIO;
6166078a454SJean-Philippe Brucker 	}
6176078a454SJean-Philippe Brucker 
6186078a454SJean-Philippe Brucker 	/* Strip bit 7, that indicates multifunction */
6196078a454SJean-Philippe Brucker 	pdev->hdr.header_type &= 0x7f;
6206078a454SJean-Philippe Brucker 
6216078a454SJean-Philippe Brucker 	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
6226078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "unsupported header type %u",
6236078a454SJean-Philippe Brucker 			     pdev->hdr.header_type);
6246078a454SJean-Philippe Brucker 		return -EOPNOTSUPP;
6256078a454SJean-Philippe Brucker 	}
6266078a454SJean-Philippe Brucker 
627c9888d95SJean-Philippe Brucker 	if (pdev->hdr.irq_pin)
628c9888d95SJean-Philippe Brucker 		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;
629c9888d95SJean-Philippe Brucker 
6306078a454SJean-Philippe Brucker 	vfio_pci_parse_caps(vdev);
6316078a454SJean-Philippe Brucker 
6326078a454SJean-Philippe Brucker 	return 0;
6336078a454SJean-Philippe Brucker }
6346078a454SJean-Philippe Brucker 
6356078a454SJean-Philippe Brucker static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
6366078a454SJean-Philippe Brucker {
6376078a454SJean-Philippe Brucker 	int i;
6386078a454SJean-Philippe Brucker 	ssize_t hdr_sz;
639c9888d95SJean-Philippe Brucker 	struct msix_cap *msix;
6406078a454SJean-Philippe Brucker 	struct vfio_region_info *info;
6416078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
6426078a454SJean-Philippe Brucker 
6436078a454SJean-Philippe Brucker 	/* Enable exclusively MMIO and bus mastering */
6446078a454SJean-Philippe Brucker 	pdev->hdr.command &= ~PCI_COMMAND_IO;
6456078a454SJean-Philippe Brucker 	pdev->hdr.command |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER;
6466078a454SJean-Philippe Brucker 
6476078a454SJean-Philippe Brucker 	/* Initialise the BARs */
6486078a454SJean-Philippe Brucker 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
6496078a454SJean-Philippe Brucker 		struct vfio_region *region = &vdev->regions[i];
6506078a454SJean-Philippe Brucker 		u64 base = region->guest_phys_addr;
6516078a454SJean-Philippe Brucker 
6526078a454SJean-Philippe Brucker 		if (!base)
6536078a454SJean-Philippe Brucker 			continue;
6546078a454SJean-Philippe Brucker 
6556078a454SJean-Philippe Brucker 		pdev->hdr.bar_size[i] = region->info.size;
6566078a454SJean-Philippe Brucker 
6576078a454SJean-Philippe Brucker 		/* Construct a fake reg to match what we've mapped. */
6586078a454SJean-Philippe Brucker 		pdev->hdr.bar[i] = (base & PCI_BASE_ADDRESS_MEM_MASK) |
6596078a454SJean-Philippe Brucker 					PCI_BASE_ADDRESS_SPACE_MEMORY |
6606078a454SJean-Philippe Brucker 					PCI_BASE_ADDRESS_MEM_TYPE_32;
6616078a454SJean-Philippe Brucker 	}
6626078a454SJean-Philippe Brucker 
6636078a454SJean-Philippe Brucker 	/* I really can't be bothered to support cardbus. */
6646078a454SJean-Philippe Brucker 	pdev->hdr.card_bus = 0;
6656078a454SJean-Philippe Brucker 
6666078a454SJean-Philippe Brucker 	/*
6676078a454SJean-Philippe Brucker 	 * Nuke the expansion ROM for now. If we want to do this properly,
6686078a454SJean-Philippe Brucker 	 * we need to save its size somewhere and map into the guest.
6696078a454SJean-Philippe Brucker 	 */
6706078a454SJean-Philippe Brucker 	pdev->hdr.exp_rom_bar = 0;
6716078a454SJean-Philippe Brucker 
672c9888d95SJean-Philippe Brucker 	/* Plumb in our fake MSI-X capability, if we have it. */
673c9888d95SJean-Philippe Brucker 	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
674c9888d95SJean-Philippe Brucker 	if (msix) {
675c9888d95SJean-Philippe Brucker 		/* Add a shortcut to the PBA region for the MMIO handler */
676c9888d95SJean-Philippe Brucker 		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
677c9888d95SJean-Philippe Brucker 		pdev->msix_pba.offset = vdev->regions[pba_index].info.offset +
678c9888d95SJean-Philippe Brucker 					(msix->pba_offset & PCI_MSIX_PBA_OFFSET);
679c9888d95SJean-Philippe Brucker 
680c9888d95SJean-Philippe Brucker 		/* Tidy up the capability */
681c9888d95SJean-Philippe Brucker 		msix->table_offset &= PCI_MSIX_TABLE_BIR;
682c9888d95SJean-Philippe Brucker 		msix->pba_offset &= PCI_MSIX_PBA_BIR;
683c9888d95SJean-Philippe Brucker 		if (pdev->msix_table.bar == pdev->msix_pba.bar)
684c9888d95SJean-Philippe Brucker 			msix->pba_offset |= pdev->msix_table.size &
685c9888d95SJean-Philippe Brucker 					    PCI_MSIX_PBA_OFFSET;
686c9888d95SJean-Philippe Brucker 	}
687c9888d95SJean-Philippe Brucker 
6886078a454SJean-Philippe Brucker 	/* Install our fake Configuration Space */
6896078a454SJean-Philippe Brucker 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
6906078a454SJean-Philippe Brucker 	hdr_sz = PCI_DEV_CFG_SIZE;
6916078a454SJean-Philippe Brucker 	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
6926078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
6936078a454SJean-Philippe Brucker 			     hdr_sz);
6946078a454SJean-Philippe Brucker 		return -EIO;
6956078a454SJean-Philippe Brucker 	}
6966078a454SJean-Philippe Brucker 
6976078a454SJean-Philippe Brucker 	/* Register callbacks for cfg accesses */
6986078a454SJean-Philippe Brucker 	pdev->hdr.cfg_ops = (struct pci_config_operations) {
6996078a454SJean-Philippe Brucker 		.read	= vfio_pci_cfg_read,
7006078a454SJean-Philippe Brucker 		.write	= vfio_pci_cfg_write,
7016078a454SJean-Philippe Brucker 	};
7026078a454SJean-Philippe Brucker 
7036078a454SJean-Philippe Brucker 	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;
7046078a454SJean-Philippe Brucker 
7056078a454SJean-Philippe Brucker 	return 0;
7066078a454SJean-Philippe Brucker }
7076078a454SJean-Philippe Brucker 
708c9888d95SJean-Philippe Brucker static int vfio_pci_create_msix_table(struct kvm *kvm,
709c9888d95SJean-Philippe Brucker 				      struct vfio_pci_device *pdev)
710c9888d95SJean-Philippe Brucker {
711c9888d95SJean-Philippe Brucker 	int ret;
712c9888d95SJean-Philippe Brucker 	size_t i;
713c9888d95SJean-Philippe Brucker 	size_t mmio_size;
714c9888d95SJean-Philippe Brucker 	size_t nr_entries;
715c9888d95SJean-Philippe Brucker 	struct vfio_pci_msi_entry *entries;
716c9888d95SJean-Philippe Brucker 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
717c9888d95SJean-Philippe Brucker 	struct vfio_pci_msix_table *table = &pdev->msix_table;
718c9888d95SJean-Philippe Brucker 	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
719c9888d95SJean-Philippe Brucker 
720c9888d95SJean-Philippe Brucker 	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
721c9888d95SJean-Philippe Brucker 	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;
722c9888d95SJean-Philippe Brucker 
723c9888d95SJean-Philippe Brucker 	/*
724c9888d95SJean-Philippe Brucker 	 * KVM needs memory regions to be multiple of and aligned on PAGE_SIZE.
725c9888d95SJean-Philippe Brucker 	 */
726c9888d95SJean-Philippe Brucker 	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
727c9888d95SJean-Philippe Brucker 	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
728c9888d95SJean-Philippe Brucker 	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), PAGE_SIZE);
729c9888d95SJean-Philippe Brucker 
730c9888d95SJean-Philippe Brucker 	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
731c9888d95SJean-Philippe Brucker 	if (!entries)
732c9888d95SJean-Philippe Brucker 		return -ENOMEM;
733c9888d95SJean-Philippe Brucker 
734c9888d95SJean-Philippe Brucker 	for (i = 0; i < nr_entries; i++)
735c9888d95SJean-Philippe Brucker 		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
736c9888d95SJean-Philippe Brucker 
737c9888d95SJean-Philippe Brucker 	/*
738c9888d95SJean-Philippe Brucker 	 * To ease MSI-X cap configuration in case they share the same BAR,
739c9888d95SJean-Philippe Brucker 	 * collapse table and pending array. The size of the BAR regions must be
740c9888d95SJean-Philippe Brucker 	 * powers of two.
741c9888d95SJean-Philippe Brucker 	 */
742c9888d95SJean-Philippe Brucker 	mmio_size = roundup_pow_of_two(table->size + pba->size);
743c9888d95SJean-Philippe Brucker 	table->guest_phys_addr = pci_get_io_space_block(mmio_size);
744c9888d95SJean-Philippe Brucker 	if (!table->guest_phys_addr) {
745c9888d95SJean-Philippe Brucker 		pr_err("cannot allocate IO space");
746c9888d95SJean-Philippe Brucker 		ret = -ENOMEM;
747c9888d95SJean-Philippe Brucker 		goto out_free;
748c9888d95SJean-Philippe Brucker 	}
749c9888d95SJean-Philippe Brucker 	pba->guest_phys_addr = table->guest_phys_addr + table->size;
750c9888d95SJean-Philippe Brucker 
751c9888d95SJean-Philippe Brucker 	ret = kvm__register_mmio(kvm, table->guest_phys_addr, table->size,
752c9888d95SJean-Philippe Brucker 				 false, vfio_pci_msix_table_access, pdev);
753c9888d95SJean-Philippe Brucker 	if (ret < 0)
754c9888d95SJean-Philippe Brucker 		goto out_free;
755c9888d95SJean-Philippe Brucker 
756c9888d95SJean-Philippe Brucker 	/*
757c9888d95SJean-Philippe Brucker 	 * We could map the physical PBA directly into the guest, but it's
758c9888d95SJean-Philippe Brucker 	 * likely smaller than a page, and we can only hand full pages to the
759c9888d95SJean-Philippe Brucker 	 * guest. Even though the PCI spec disallows sharing a page used for
760c9888d95SJean-Philippe Brucker 	 * MSI-X with any other resource, it allows to share the same page
761c9888d95SJean-Philippe Brucker 	 * between MSI-X table and PBA. For the sake of isolation, create a
762c9888d95SJean-Philippe Brucker 	 * virtual PBA.
763c9888d95SJean-Philippe Brucker 	 */
764c9888d95SJean-Philippe Brucker 	ret = kvm__register_mmio(kvm, pba->guest_phys_addr, pba->size, false,
765c9888d95SJean-Philippe Brucker 				 vfio_pci_msix_pba_access, pdev);
766c9888d95SJean-Philippe Brucker 	if (ret < 0)
767c9888d95SJean-Philippe Brucker 		goto out_free;
768c9888d95SJean-Philippe Brucker 
769c9888d95SJean-Philippe Brucker 	pdev->msix.entries = entries;
770c9888d95SJean-Philippe Brucker 	pdev->msix.nr_entries = nr_entries;
771c9888d95SJean-Philippe Brucker 
772c9888d95SJean-Philippe Brucker 	return 0;
773c9888d95SJean-Philippe Brucker 
774c9888d95SJean-Philippe Brucker out_free:
775c9888d95SJean-Philippe Brucker 	free(entries);
776c9888d95SJean-Philippe Brucker 
777c9888d95SJean-Philippe Brucker 	return ret;
778c9888d95SJean-Philippe Brucker }
779c9888d95SJean-Philippe Brucker 
780*8dd28afeSJean-Philippe Brucker static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
781*8dd28afeSJean-Philippe Brucker {
782*8dd28afeSJean-Philippe Brucker 	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);
783*8dd28afeSJean-Philippe Brucker 
784*8dd28afeSJean-Philippe Brucker 	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1),
785*8dd28afeSJean-Philippe Brucker 	pdev->msi.entries = calloc(pdev->msi.nr_entries,
786*8dd28afeSJean-Philippe Brucker 				   sizeof(struct vfio_pci_msi_entry));
787*8dd28afeSJean-Philippe Brucker 	if (!pdev->msi.entries)
788*8dd28afeSJean-Philippe Brucker 		return -ENOMEM;
789*8dd28afeSJean-Philippe Brucker 
790*8dd28afeSJean-Philippe Brucker 	return 0;
791*8dd28afeSJean-Philippe Brucker }
792*8dd28afeSJean-Philippe Brucker 
7936078a454SJean-Philippe Brucker static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
7946078a454SJean-Philippe Brucker 				  size_t nr)
7956078a454SJean-Philippe Brucker {
7966078a454SJean-Philippe Brucker 	int ret;
7976078a454SJean-Philippe Brucker 	size_t map_size;
798c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
7996078a454SJean-Philippe Brucker 	struct vfio_region *region = &vdev->regions[nr];
8006078a454SJean-Philippe Brucker 
8016078a454SJean-Philippe Brucker 	if (nr >= vdev->info.num_regions)
8026078a454SJean-Philippe Brucker 		return 0;
8036078a454SJean-Philippe Brucker 
8046078a454SJean-Philippe Brucker 	region->info = (struct vfio_region_info) {
8056078a454SJean-Philippe Brucker 		.argsz = sizeof(region->info),
8066078a454SJean-Philippe Brucker 		.index = nr,
8076078a454SJean-Philippe Brucker 	};
8086078a454SJean-Philippe Brucker 
8096078a454SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &region->info);
8106078a454SJean-Philippe Brucker 	if (ret) {
8116078a454SJean-Philippe Brucker 		ret = -errno;
8126078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot get info for BAR %zu", nr);
8136078a454SJean-Philippe Brucker 		return ret;
8146078a454SJean-Philippe Brucker 	}
8156078a454SJean-Philippe Brucker 
8166078a454SJean-Philippe Brucker 	/* Ignore invalid or unimplemented regions */
8176078a454SJean-Philippe Brucker 	if (!region->info.size)
8186078a454SJean-Philippe Brucker 		return 0;
8196078a454SJean-Philippe Brucker 
820c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
821c9888d95SJean-Philippe Brucker 		/* Trap and emulate MSI-X table */
822c9888d95SJean-Philippe Brucker 		if (nr == pdev->msix_table.bar) {
823c9888d95SJean-Philippe Brucker 			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
824c9888d95SJean-Philippe Brucker 			return 0;
825c9888d95SJean-Philippe Brucker 		} else if (nr == pdev->msix_pba.bar) {
826c9888d95SJean-Philippe Brucker 			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
827c9888d95SJean-Philippe Brucker 			return 0;
828c9888d95SJean-Philippe Brucker 		}
829c9888d95SJean-Philippe Brucker 	}
830c9888d95SJean-Philippe Brucker 
8316078a454SJean-Philippe Brucker 	/* Grab some MMIO space in the guest */
8326078a454SJean-Philippe Brucker 	map_size = ALIGN(region->info.size, PAGE_SIZE);
8336078a454SJean-Philippe Brucker 	region->guest_phys_addr = pci_get_io_space_block(map_size);
8346078a454SJean-Philippe Brucker 
8356078a454SJean-Philippe Brucker 	/*
8366078a454SJean-Philippe Brucker 	 * Map the BARs into the guest. We'll later need to update
8376078a454SJean-Philippe Brucker 	 * configuration space to reflect our allocation.
8386078a454SJean-Philippe Brucker 	 */
8396078a454SJean-Philippe Brucker 	ret = vfio_map_region(kvm, vdev, region);
8406078a454SJean-Philippe Brucker 	if (ret)
8416078a454SJean-Philippe Brucker 		return ret;
8426078a454SJean-Philippe Brucker 
8436078a454SJean-Philippe Brucker 	return 0;
8446078a454SJean-Philippe Brucker }
8456078a454SJean-Philippe Brucker 
8466078a454SJean-Philippe Brucker static int vfio_pci_configure_dev_regions(struct kvm *kvm,
8476078a454SJean-Philippe Brucker 					  struct vfio_device *vdev)
8486078a454SJean-Philippe Brucker {
8496078a454SJean-Philippe Brucker 	int ret;
8506078a454SJean-Philippe Brucker 	u32 bar;
8516078a454SJean-Philippe Brucker 	size_t i;
8526078a454SJean-Philippe Brucker 	bool is_64bit = false;
8536078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
8546078a454SJean-Philippe Brucker 
8556078a454SJean-Philippe Brucker 	ret = vfio_pci_parse_cfg_space(vdev);
8566078a454SJean-Philippe Brucker 	if (ret)
8576078a454SJean-Philippe Brucker 		return ret;
8586078a454SJean-Philippe Brucker 
859c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
860c9888d95SJean-Philippe Brucker 		ret = vfio_pci_create_msix_table(kvm, pdev);
861c9888d95SJean-Philippe Brucker 		if (ret)
862c9888d95SJean-Philippe Brucker 			return ret;
863c9888d95SJean-Philippe Brucker 	}
864c9888d95SJean-Philippe Brucker 
865*8dd28afeSJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
866*8dd28afeSJean-Philippe Brucker 		ret = vfio_pci_create_msi_cap(kvm, pdev);
867*8dd28afeSJean-Philippe Brucker 		if (ret)
868*8dd28afeSJean-Philippe Brucker 			return ret;
869*8dd28afeSJean-Philippe Brucker 	}
870*8dd28afeSJean-Philippe Brucker 
8716078a454SJean-Philippe Brucker 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
8726078a454SJean-Philippe Brucker 		/* Ignore top half of 64-bit BAR */
8736078a454SJean-Philippe Brucker 		if (i % 2 && is_64bit)
8746078a454SJean-Philippe Brucker 			continue;
8756078a454SJean-Philippe Brucker 
8766078a454SJean-Philippe Brucker 		ret = vfio_pci_configure_bar(kvm, vdev, i);
8776078a454SJean-Philippe Brucker 		if (ret)
8786078a454SJean-Philippe Brucker 			return ret;
8796078a454SJean-Philippe Brucker 
8806078a454SJean-Philippe Brucker 		bar = pdev->hdr.bar[i];
8816078a454SJean-Philippe Brucker 		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
8826078a454SJean-Philippe Brucker 			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
8836078a454SJean-Philippe Brucker 			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
8846078a454SJean-Philippe Brucker 	}
8856078a454SJean-Philippe Brucker 
8866078a454SJean-Philippe Brucker 	/* We've configured the BARs, fake up a Configuration Space */
8876078a454SJean-Philippe Brucker 	return vfio_pci_fixup_cfg_space(vdev);
8886078a454SJean-Philippe Brucker }
8896078a454SJean-Philippe Brucker 
/*
 * Try to raise the file descriptor limit when opening one eventfd per IRQ
 * vector would exceed it, which is likely for a device using 2048 MSIs.
 *
 * @num is added to a running estimate kept across calls. This is best effort:
 * failures are reported but the function always returns 0.
 */
static int vfio_pci_reserve_irq_fds(size_t num)
{
	/*
	 * Around 27 fds were observed under normal load; 100 more are added
	 * as a safety margin.
	 */
	static size_t fds_needed = 128;
	struct rlimit current, wanted;

	fds_needed += num;

	if (getrlimit(RLIMIT_NOFILE, &current) != 0) {
		perror("getrlimit(RLIMIT_NOFILE)");
		return 0;
	}

	/* Nothing to do while the soft limit already covers the estimate. */
	if (fds_needed <= current.rlim_cur)
		return 0;

	wanted.rlim_cur = fds_needed;
	/* Raising the hard limit as well is only possible for root. */
	wanted.rlim_max = current.rlim_max < fds_needed ? fds_needed
							: current.rlim_max;

	if (setrlimit(RLIMIT_NOFILE, &wanted) == 0)
		return 0;

	perror("setrlimit(RLIMIT_NOFILE)");
	pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
		   (size_t)(fds_needed - current.rlim_cur));

	return 0;
}
929c9888d95SJean-Philippe Brucker 
930c9888d95SJean-Philippe Brucker static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
931c9888d95SJean-Philippe Brucker 			     struct vfio_pci_msi_common *msis)
932c9888d95SJean-Philippe Brucker {
933c9888d95SJean-Philippe Brucker 	int ret;
934c9888d95SJean-Philippe Brucker 	size_t i;
935c9888d95SJean-Philippe Brucker 	int *eventfds;
936c9888d95SJean-Philippe Brucker 	size_t irq_set_size;
937c9888d95SJean-Philippe Brucker 	struct vfio_pci_msi_entry *entry;
938c9888d95SJean-Philippe Brucker 	size_t nr_entries = msis->nr_entries;
939c9888d95SJean-Philippe Brucker 
940c9888d95SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
941c9888d95SJean-Philippe Brucker 	if (ret || &msis->info.count == 0) {
942c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "no MSI reported by VFIO");
943c9888d95SJean-Philippe Brucker 		return -ENODEV;
944c9888d95SJean-Philippe Brucker 	}
945c9888d95SJean-Philippe Brucker 
946c9888d95SJean-Philippe Brucker 	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
947c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
948c9888d95SJean-Philippe Brucker 		return -EINVAL;
949c9888d95SJean-Philippe Brucker 	}
950c9888d95SJean-Philippe Brucker 
951c9888d95SJean-Philippe Brucker 	if (msis->info.count != nr_entries) {
952c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
953c9888d95SJean-Philippe Brucker 		return -EINVAL;
954c9888d95SJean-Philippe Brucker 	}
955c9888d95SJean-Philippe Brucker 
956c9888d95SJean-Philippe Brucker 	mutex_init(&msis->mutex);
957c9888d95SJean-Philippe Brucker 
958c9888d95SJean-Philippe Brucker 	vfio_pci_reserve_irq_fds(nr_entries);
959c9888d95SJean-Philippe Brucker 
960c9888d95SJean-Philippe Brucker 	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
961c9888d95SJean-Philippe Brucker 	msis->irq_set = malloc(irq_set_size);
962c9888d95SJean-Philippe Brucker 	if (!msis->irq_set)
963c9888d95SJean-Philippe Brucker 		return -ENOMEM;
964c9888d95SJean-Philippe Brucker 
965c9888d95SJean-Philippe Brucker 	*msis->irq_set = (struct vfio_irq_set) {
966c9888d95SJean-Philippe Brucker 		.argsz	= irq_set_size,
967c9888d95SJean-Philippe Brucker 		.flags 	= VFIO_IRQ_SET_DATA_EVENTFD |
968c9888d95SJean-Philippe Brucker 			  VFIO_IRQ_SET_ACTION_TRIGGER,
969c9888d95SJean-Philippe Brucker 		.index 	= msis->info.index,
970c9888d95SJean-Philippe Brucker 		.start 	= 0,
971c9888d95SJean-Philippe Brucker 		.count 	= nr_entries,
972c9888d95SJean-Philippe Brucker 	};
973c9888d95SJean-Philippe Brucker 
974c9888d95SJean-Philippe Brucker 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
975c9888d95SJean-Philippe Brucker 
976c9888d95SJean-Philippe Brucker 	for (i = 0; i < nr_entries; i++) {
977c9888d95SJean-Philippe Brucker 		entry = &msis->entries[i];
978c9888d95SJean-Philippe Brucker 		entry->gsi = -1;
979c9888d95SJean-Philippe Brucker 		entry->eventfd = -1;
980c9888d95SJean-Philippe Brucker 		msi_set_masked(entry->virt_state, true);
981c9888d95SJean-Philippe Brucker 		msi_set_masked(entry->phys_state, true);
982c9888d95SJean-Philippe Brucker 		eventfds[i] = -1;
983c9888d95SJean-Philippe Brucker 	}
984c9888d95SJean-Philippe Brucker 
985c9888d95SJean-Philippe Brucker 	return 0;
986c9888d95SJean-Philippe Brucker }
987c9888d95SJean-Philippe Brucker 
988c9888d95SJean-Philippe Brucker static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
989c9888d95SJean-Philippe Brucker {
990c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
991c9888d95SJean-Philippe Brucker 	int gsi = pdev->intx_gsi;
992c9888d95SJean-Philippe Brucker 	struct vfio_irq_set irq_set = {
993c9888d95SJean-Philippe Brucker 		.argsz	= sizeof(irq_set),
994c9888d95SJean-Philippe Brucker 		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
995c9888d95SJean-Philippe Brucker 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
996c9888d95SJean-Philippe Brucker 	};
997c9888d95SJean-Philippe Brucker 
998c9888d95SJean-Philippe Brucker 	pr_debug("user requested MSI, disabling INTx %d", gsi);
999c9888d95SJean-Philippe Brucker 
1000c9888d95SJean-Philippe Brucker 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
1001c9888d95SJean-Philippe Brucker 	irq__del_irqfd(kvm, gsi, pdev->intx_fd);
1002c9888d95SJean-Philippe Brucker 
1003c9888d95SJean-Philippe Brucker 	close(pdev->intx_fd);
1004c9888d95SJean-Philippe Brucker }
1005c9888d95SJean-Philippe Brucker 
10066078a454SJean-Philippe Brucker static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
10076078a454SJean-Philippe Brucker {
10086078a454SJean-Philippe Brucker 	int ret;
10096078a454SJean-Philippe Brucker 	int trigger_fd, unmask_fd;
10106078a454SJean-Philippe Brucker 	struct vfio_irq_eventfd	trigger;
10116078a454SJean-Philippe Brucker 	struct vfio_irq_eventfd	unmask;
10126078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
10136078a454SJean-Philippe Brucker 	int gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
10146078a454SJean-Philippe Brucker 
10156078a454SJean-Philippe Brucker 	struct vfio_irq_info irq_info = {
10166078a454SJean-Philippe Brucker 		.argsz = sizeof(irq_info),
10176078a454SJean-Philippe Brucker 		.index = VFIO_PCI_INTX_IRQ_INDEX,
10186078a454SJean-Philippe Brucker 	};
10196078a454SJean-Philippe Brucker 
1020c9888d95SJean-Philippe Brucker 	vfio_pci_reserve_irq_fds(2);
1021c9888d95SJean-Philippe Brucker 
10226078a454SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
10236078a454SJean-Philippe Brucker 	if (ret || irq_info.count == 0) {
10246078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "no INTx reported by VFIO");
10256078a454SJean-Philippe Brucker 		return -ENODEV;
10266078a454SJean-Philippe Brucker 	}
10276078a454SJean-Philippe Brucker 
10286078a454SJean-Philippe Brucker 	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
10296078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "interrupt not eventfd capable");
10306078a454SJean-Philippe Brucker 		return -EINVAL;
10316078a454SJean-Philippe Brucker 	}
10326078a454SJean-Philippe Brucker 
10336078a454SJean-Philippe Brucker 	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
10346078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
10356078a454SJean-Philippe Brucker 		return -EINVAL;
10366078a454SJean-Philippe Brucker 	}
10376078a454SJean-Philippe Brucker 
10386078a454SJean-Philippe Brucker 	/*
10396078a454SJean-Philippe Brucker 	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
10406078a454SJean-Philippe Brucker 	 * signals an interrupt from host to guest, and unmask_fd signals the
10416078a454SJean-Philippe Brucker 	 * deassertion of the line from guest to host.
10426078a454SJean-Philippe Brucker 	 */
10436078a454SJean-Philippe Brucker 	trigger_fd = eventfd(0, 0);
10446078a454SJean-Philippe Brucker 	if (trigger_fd < 0) {
10456078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to create trigger eventfd");
10466078a454SJean-Philippe Brucker 		return trigger_fd;
10476078a454SJean-Philippe Brucker 	}
10486078a454SJean-Philippe Brucker 
10496078a454SJean-Philippe Brucker 	unmask_fd = eventfd(0, 0);
10506078a454SJean-Philippe Brucker 	if (unmask_fd < 0) {
10516078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to create unmask eventfd");
10526078a454SJean-Philippe Brucker 		close(trigger_fd);
10536078a454SJean-Philippe Brucker 		return unmask_fd;
10546078a454SJean-Philippe Brucker 	}
10556078a454SJean-Philippe Brucker 
10566078a454SJean-Philippe Brucker 	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
10576078a454SJean-Philippe Brucker 	if (ret)
10586078a454SJean-Philippe Brucker 		goto err_close;
10596078a454SJean-Philippe Brucker 
10606078a454SJean-Philippe Brucker 	trigger.irq = (struct vfio_irq_set) {
10616078a454SJean-Philippe Brucker 		.argsz	= sizeof(trigger),
10626078a454SJean-Philippe Brucker 		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
10636078a454SJean-Philippe Brucker 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
10646078a454SJean-Philippe Brucker 		.start	= 0,
10656078a454SJean-Philippe Brucker 		.count	= 1,
10666078a454SJean-Philippe Brucker 	};
10676078a454SJean-Philippe Brucker 	trigger.fd = trigger_fd;
10686078a454SJean-Philippe Brucker 
10696078a454SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
10706078a454SJean-Philippe Brucker 	if (ret < 0) {
10716078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
10726078a454SJean-Philippe Brucker 		goto err_delete_line;
10736078a454SJean-Philippe Brucker 	}
10746078a454SJean-Philippe Brucker 
10756078a454SJean-Philippe Brucker 	unmask.irq = (struct vfio_irq_set) {
10766078a454SJean-Philippe Brucker 		.argsz	= sizeof(unmask),
10776078a454SJean-Philippe Brucker 		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
10786078a454SJean-Philippe Brucker 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
10796078a454SJean-Philippe Brucker 		.start	= 0,
10806078a454SJean-Philippe Brucker 		.count	= 1,
10816078a454SJean-Philippe Brucker 	};
10826078a454SJean-Philippe Brucker 	unmask.fd = unmask_fd;
10836078a454SJean-Philippe Brucker 
10846078a454SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
10856078a454SJean-Philippe Brucker 	if (ret < 0) {
10866078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to setup unmask IRQ");
10876078a454SJean-Philippe Brucker 		goto err_remove_event;
10886078a454SJean-Philippe Brucker 	}
10896078a454SJean-Philippe Brucker 
1090c9888d95SJean-Philippe Brucker 	pdev->intx_fd = trigger_fd;
1091c9888d95SJean-Philippe Brucker 	/* Guest is going to overwrite our irq_line... */
1092c9888d95SJean-Philippe Brucker 	pdev->intx_gsi = gsi;
1093c9888d95SJean-Philippe Brucker 
10946078a454SJean-Philippe Brucker 	return 0;
10956078a454SJean-Philippe Brucker 
10966078a454SJean-Philippe Brucker err_remove_event:
10976078a454SJean-Philippe Brucker 	/* Remove trigger event */
10986078a454SJean-Philippe Brucker 	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
10996078a454SJean-Philippe Brucker 	trigger.irq.count = 0;
11006078a454SJean-Philippe Brucker 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
11016078a454SJean-Philippe Brucker 
11026078a454SJean-Philippe Brucker err_delete_line:
11036078a454SJean-Philippe Brucker 	irq__del_irqfd(kvm, gsi, trigger_fd);
11046078a454SJean-Philippe Brucker 
11056078a454SJean-Philippe Brucker err_close:
11066078a454SJean-Philippe Brucker 	close(trigger_fd);
11076078a454SJean-Philippe Brucker 	close(unmask_fd);
11086078a454SJean-Philippe Brucker 	return ret;
11096078a454SJean-Philippe Brucker }
11106078a454SJean-Philippe Brucker 
11116078a454SJean-Philippe Brucker static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
11126078a454SJean-Philippe Brucker {
1113c9888d95SJean-Philippe Brucker 	int ret = 0;
11146078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
11156078a454SJean-Philippe Brucker 
1116c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1117c9888d95SJean-Philippe Brucker 		pdev->msix.info = (struct vfio_irq_info) {
1118c9888d95SJean-Philippe Brucker 			.argsz = sizeof(pdev->msix.info),
1119c9888d95SJean-Philippe Brucker 			.index = VFIO_PCI_MSIX_IRQ_INDEX,
11206078a454SJean-Philippe Brucker 		};
1121c9888d95SJean-Philippe Brucker 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
1122c9888d95SJean-Philippe Brucker 		if (ret)
1123c9888d95SJean-Philippe Brucker 			return ret;
11246078a454SJean-Philippe Brucker 	}
11256078a454SJean-Philippe Brucker 
1126*8dd28afeSJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
1127*8dd28afeSJean-Philippe Brucker 		pdev->msi.info = (struct vfio_irq_info) {
1128*8dd28afeSJean-Philippe Brucker 			.argsz = sizeof(pdev->msi.info),
1129*8dd28afeSJean-Philippe Brucker 			.index = VFIO_PCI_MSI_IRQ_INDEX,
1130*8dd28afeSJean-Philippe Brucker 		};
1131*8dd28afeSJean-Philippe Brucker 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
1132*8dd28afeSJean-Philippe Brucker 		if (ret)
1133*8dd28afeSJean-Philippe Brucker 			return ret;
1134*8dd28afeSJean-Philippe Brucker 	}
1135*8dd28afeSJean-Philippe Brucker 
1136c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
1137c9888d95SJean-Philippe Brucker 		ret = vfio_pci_enable_intx(kvm, vdev);
1138c9888d95SJean-Philippe Brucker 
1139c9888d95SJean-Philippe Brucker 	return ret;
11406078a454SJean-Philippe Brucker }
11416078a454SJean-Philippe Brucker 
11426078a454SJean-Philippe Brucker int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
11436078a454SJean-Philippe Brucker {
11446078a454SJean-Philippe Brucker 	int ret;
11456078a454SJean-Philippe Brucker 
11466078a454SJean-Philippe Brucker 	ret = vfio_pci_configure_dev_regions(kvm, vdev);
11476078a454SJean-Philippe Brucker 	if (ret) {
11486078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to configure regions");
11496078a454SJean-Philippe Brucker 		return ret;
11506078a454SJean-Philippe Brucker 	}
11516078a454SJean-Philippe Brucker 
11526078a454SJean-Philippe Brucker 	vdev->dev_hdr = (struct device_header) {
11536078a454SJean-Philippe Brucker 		.bus_type	= DEVICE_BUS_PCI,
11546078a454SJean-Philippe Brucker 		.data		= &vdev->pci.hdr,
11556078a454SJean-Philippe Brucker 	};
11566078a454SJean-Philippe Brucker 
11576078a454SJean-Philippe Brucker 	ret = device__register(&vdev->dev_hdr);
11586078a454SJean-Philippe Brucker 	if (ret) {
11596078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to register VFIO device");
11606078a454SJean-Philippe Brucker 		return ret;
11616078a454SJean-Philippe Brucker 	}
11626078a454SJean-Philippe Brucker 
11636078a454SJean-Philippe Brucker 	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
11646078a454SJean-Philippe Brucker 	if (ret) {
11656078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to configure IRQs");
11666078a454SJean-Philippe Brucker 		return ret;
11676078a454SJean-Philippe Brucker 	}
11686078a454SJean-Philippe Brucker 
11696078a454SJean-Philippe Brucker 	return 0;
11706078a454SJean-Philippe Brucker }
11716078a454SJean-Philippe Brucker 
11726078a454SJean-Philippe Brucker void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
11736078a454SJean-Philippe Brucker {
11746078a454SJean-Philippe Brucker 	size_t i;
1175c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
11766078a454SJean-Philippe Brucker 
11776078a454SJean-Philippe Brucker 	for (i = 0; i < vdev->info.num_regions; i++)
11786078a454SJean-Philippe Brucker 		vfio_unmap_region(kvm, &vdev->regions[i]);
11796078a454SJean-Philippe Brucker 
11806078a454SJean-Philippe Brucker 	device__unregister(&vdev->dev_hdr);
1181c9888d95SJean-Philippe Brucker 
1182c9888d95SJean-Philippe Brucker 	free(pdev->msix.irq_set);
1183c9888d95SJean-Philippe Brucker 	free(pdev->msix.entries);
1184*8dd28afeSJean-Philippe Brucker 	free(pdev->msi.irq_set);
1185*8dd28afeSJean-Philippe Brucker 	free(pdev->msi.entries);
11866078a454SJean-Philippe Brucker }
1187