xref: /kvmtool/vfio/pci.c (revision c9888d9571ca6fa0093318c06e71220df5c3d8ec)
16078a454SJean-Philippe Brucker #include "kvm/irq.h"
26078a454SJean-Philippe Brucker #include "kvm/kvm.h"
36078a454SJean-Philippe Brucker #include "kvm/kvm-cpu.h"
46078a454SJean-Philippe Brucker #include "kvm/vfio.h"
56078a454SJean-Philippe Brucker 
66078a454SJean-Philippe Brucker #include <sys/ioctl.h>
76078a454SJean-Philippe Brucker #include <sys/eventfd.h>
8*c9888d95SJean-Philippe Brucker #include <sys/resource.h>
9*c9888d95SJean-Philippe Brucker #include <sys/time.h>
106078a454SJean-Philippe Brucker 
/*
 * Wrapper around UAPI vfio_irq_set: a VFIO_DEVICE_SET_IRQS payload is the
 * vfio_irq_set header followed by variable-length data[]. Placing the
 * eventfd immediately after the header lets this struct carry a single
 * file descriptor as that payload (argsz is set to sizeof the wrapper,
 * see vfio_pci_enable_msis()).
 */
struct vfio_irq_eventfd {
	struct vfio_irq_set	irq;	/* header: argsz/flags/index/start/count */
	int			fd;	/* single eventfd occupying data[] */
};
166078a454SJean-Philippe Brucker 
#define msi_is_enabled(state)		((state) & VFIO_PCI_MSI_STATE_ENABLED)
#define msi_is_masked(state)		((state) & VFIO_PCI_MSI_STATE_MASKED)
#define msi_is_empty(state)		((state) & VFIO_PCI_MSI_STATE_EMPTY)

/*
 * Set or clear @bit in @state according to the boolean @val.
 *
 * The expansion is a single parenthesized expression with no trailing
 * semicolon, so the msi_set_*() helpers behave like ordinary statements
 * at their call sites — including inside unbraced if/else arms, where
 * the original trailing ';' would have broken compilation. @bit is
 * parenthesized for safety with compound arguments.
 */
#define msi_update_state(state, val, bit)				\
	((state) = (val) ? (state) | (bit) : (state) & ~(bit))
#define msi_set_enabled(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
#define msi_set_masked(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
#define msi_set_empty(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)
29*c9888d95SJean-Philippe Brucker 
30*c9888d95SJean-Philippe Brucker static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
31*c9888d95SJean-Philippe Brucker 
/*
 * Bring the physical MSI/MSI-X capability in line with the guest-visible
 * (virtual) state. Callers hold pdev->msix.mutex (see
 * vfio_pci_msix_table_access() and vfio_pci_msix_cap_write()).
 *
 * Returns 0 on success, or the negative VFIO_DEVICE_SET_IRQS ioctl error.
 */
static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev)
{
	size_t i;
	int ret = 0;
	int *eventfds;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = &pdev->msix;
	/* Pre-built SET_IRQS argument for updating one vector (count = 1) */
	struct vfio_irq_eventfd single = {
		.irq = {
			.argsz	= sizeof(single),
			.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
				  VFIO_IRQ_SET_ACTION_TRIGGER,
			.index	= msis->info.index,
			.count	= 1,
		},
	};

	/* Nothing to program while the guest keeps the capability disabled. */
	if (!msi_is_enabled(msis->virt_state))
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
		/*
		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
		 * time. Since INTx has to be enabled from the start (we don't
		 * have a reliable way to know when the user starts using it),
		 * disable it now.
		 */
		vfio_pci_disable_intx(kvm, vdev);
		/* Permanently disable INTx */
		pdev->irq_modes &= ~VFIO_PCI_IRQ_MODE_INTX;
	}

	/* The eventfd array lives right after the vfio_irq_set header. */
	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

	/*
	 * Initial registration of the full range. This enables the physical
	 * MSI/MSI-X capability, which might have desired side effects. For
	 * instance when assigning virtio legacy devices, enabling the MSI
	 * capability modifies the config space layout!
	 *
	 * As an optimization, only update MSIs when guest unmasks the
	 * capability. This greatly reduces the initialization time for Linux
	 * guest with 2048+ MSIs. Linux guest starts by enabling the MSI-X cap
	 * masked, then fills individual vectors, then unmasks the whole
	 * function. So we only do one VFIO ioctl when enabling for the first
	 * time, and then one when unmasking.
	 *
	 * phys_state is empty when it is enabled but no vector has been
	 * registered via SET_IRQS yet.
	 */
	if (!msi_is_enabled(msis->phys_state) ||
	    (!msi_is_masked(msis->virt_state) &&
	     msi_is_empty(msis->phys_state))) {
		bool empty = true;

		for (i = 0; i < msis->nr_entries; i++) {
			/* Only vectors with a GSI have a usable eventfd. */
			eventfds[i] = msis->entries[i].gsi >= 0 ?
				      msis->entries[i].eventfd : -1;

			if (eventfds[i] >= 0)
				empty = false;
		}

		/* One bulk ioctl registering the whole range at once. */
		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(multi)");
			return ret;
		}

		msi_set_enabled(msis->phys_state, true);
		msi_set_empty(msis->phys_state, empty);

		return 0;
	}

	if (msi_is_masked(msis->virt_state)) {
		/* TODO: if phys_state is not empty nor masked, mask all vectors */
		return 0;
	}

	/* Update individual vectors to avoid breaking those in use */
	for (i = 0; i < msis->nr_entries; i++) {
		struct vfio_pci_msi_entry *entry = &msis->entries[i];
		int fd = entry->gsi >= 0 ? entry->eventfd : -1;

		/* Skip vectors whose registered eventfd is already current. */
		if (fd == eventfds[i])
			continue;

		single.irq.start = i;
		single.fd = fd;

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(single)");
			break;
		}

		/* Remember what the kernel now has registered for vector i. */
		eventfds[i] = fd;

		if (msi_is_empty(msis->phys_state) && fd >= 0)
			msi_set_empty(msis->phys_state, false);
	}

	return ret;
}
137*c9888d95SJean-Philippe Brucker 
138*c9888d95SJean-Philippe Brucker static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev)
139*c9888d95SJean-Philippe Brucker {
140*c9888d95SJean-Philippe Brucker 	int ret;
141*c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
142*c9888d95SJean-Philippe Brucker 	struct vfio_pci_msi_common *msis = &pdev->msix;
143*c9888d95SJean-Philippe Brucker 	struct vfio_irq_set irq_set = {
144*c9888d95SJean-Philippe Brucker 		.argsz	= sizeof(irq_set),
145*c9888d95SJean-Philippe Brucker 		.flags 	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
146*c9888d95SJean-Philippe Brucker 		.index 	= msis->info.index,
147*c9888d95SJean-Philippe Brucker 		.start 	= 0,
148*c9888d95SJean-Philippe Brucker 		.count	= 0,
149*c9888d95SJean-Philippe Brucker 	};
150*c9888d95SJean-Philippe Brucker 
151*c9888d95SJean-Philippe Brucker 	if (!msi_is_enabled(msis->phys_state))
152*c9888d95SJean-Philippe Brucker 		return 0;
153*c9888d95SJean-Philippe Brucker 
154*c9888d95SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
155*c9888d95SJean-Philippe Brucker 	if (ret < 0) {
156*c9888d95SJean-Philippe Brucker 		perror("VFIO_DEVICE_SET_IRQS(NONE)");
157*c9888d95SJean-Philippe Brucker 		return ret;
158*c9888d95SJean-Philippe Brucker 	}
159*c9888d95SJean-Philippe Brucker 
160*c9888d95SJean-Philippe Brucker 	msi_set_enabled(msis->phys_state, false);
161*c9888d95SJean-Philippe Brucker 	msi_set_empty(msis->phys_state, true);
162*c9888d95SJean-Philippe Brucker 
163*c9888d95SJean-Philippe Brucker 	return 0;
164*c9888d95SJean-Philippe Brucker }
165*c9888d95SJean-Philippe Brucker 
166*c9888d95SJean-Philippe Brucker static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
167*c9888d95SJean-Philippe Brucker 				     struct vfio_pci_msi_entry *entry)
168*c9888d95SJean-Philippe Brucker {
169*c9888d95SJean-Philippe Brucker 	int ret;
170*c9888d95SJean-Philippe Brucker 
171*c9888d95SJean-Philippe Brucker 	if (entry->eventfd < 0) {
172*c9888d95SJean-Philippe Brucker 		entry->eventfd = eventfd(0, 0);
173*c9888d95SJean-Philippe Brucker 		if (entry->eventfd < 0) {
174*c9888d95SJean-Philippe Brucker 			ret = -errno;
175*c9888d95SJean-Philippe Brucker 			vfio_dev_err(vdev, "cannot create eventfd");
176*c9888d95SJean-Philippe Brucker 			return ret;
177*c9888d95SJean-Philippe Brucker 		}
178*c9888d95SJean-Philippe Brucker 	}
179*c9888d95SJean-Philippe Brucker 
180*c9888d95SJean-Philippe Brucker 	/* Allocate IRQ if necessary */
181*c9888d95SJean-Philippe Brucker 	if (entry->gsi < 0) {
182*c9888d95SJean-Philippe Brucker 		int ret = irq__add_msix_route(kvm, &entry->config.msg,
183*c9888d95SJean-Philippe Brucker 					      vdev->dev_hdr.dev_num << 3);
184*c9888d95SJean-Philippe Brucker 		if (ret < 0) {
185*c9888d95SJean-Philippe Brucker 			vfio_dev_err(vdev, "cannot create MSI-X route");
186*c9888d95SJean-Philippe Brucker 			return ret;
187*c9888d95SJean-Philippe Brucker 		}
188*c9888d95SJean-Philippe Brucker 		entry->gsi = ret;
189*c9888d95SJean-Philippe Brucker 	} else {
190*c9888d95SJean-Philippe Brucker 		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
191*c9888d95SJean-Philippe Brucker 	}
192*c9888d95SJean-Philippe Brucker 
193*c9888d95SJean-Philippe Brucker 	/*
194*c9888d95SJean-Philippe Brucker 	 * MSI masking is unimplemented in VFIO, so we have to handle it by
195*c9888d95SJean-Philippe Brucker 	 * disabling/enabling IRQ route instead. We do it on the KVM side rather
196*c9888d95SJean-Philippe Brucker 	 * than VFIO, because:
197*c9888d95SJean-Philippe Brucker 	 * - it is 8x faster
198*c9888d95SJean-Philippe Brucker 	 * - it allows to decouple masking logic from capability state.
199*c9888d95SJean-Philippe Brucker 	 * - in masked state, after removing irqfd route, we could easily plug
200*c9888d95SJean-Philippe Brucker 	 *   the eventfd in a local handler, in order to serve Pending Bit reads
201*c9888d95SJean-Philippe Brucker 	 *   to the guest.
202*c9888d95SJean-Philippe Brucker 	 *
203*c9888d95SJean-Philippe Brucker 	 * So entry->phys_state is masked when there is no active irqfd route.
204*c9888d95SJean-Philippe Brucker 	 */
205*c9888d95SJean-Philippe Brucker 	if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
206*c9888d95SJean-Philippe Brucker 		return 0;
207*c9888d95SJean-Philippe Brucker 
208*c9888d95SJean-Philippe Brucker 	if (msi_is_masked(entry->phys_state)) {
209*c9888d95SJean-Philippe Brucker 		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
210*c9888d95SJean-Philippe Brucker 		if (ret < 0) {
211*c9888d95SJean-Philippe Brucker 			vfio_dev_err(vdev, "cannot setup irqfd");
212*c9888d95SJean-Philippe Brucker 			return ret;
213*c9888d95SJean-Philippe Brucker 		}
214*c9888d95SJean-Philippe Brucker 	} else {
215*c9888d95SJean-Philippe Brucker 		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
216*c9888d95SJean-Philippe Brucker 	}
217*c9888d95SJean-Philippe Brucker 
218*c9888d95SJean-Philippe Brucker 	msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));
219*c9888d95SJean-Philippe Brucker 
220*c9888d95SJean-Philippe Brucker 	return 0;
221*c9888d95SJean-Philippe Brucker }
222*c9888d95SJean-Philippe Brucker 
223*c9888d95SJean-Philippe Brucker static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
224*c9888d95SJean-Philippe Brucker 				     u32 len, u8 is_write, void *ptr)
225*c9888d95SJean-Philippe Brucker {
226*c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = ptr;
227*c9888d95SJean-Philippe Brucker 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
228*c9888d95SJean-Philippe Brucker 	u64 offset = addr - pba->guest_phys_addr;
229*c9888d95SJean-Philippe Brucker 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
230*c9888d95SJean-Philippe Brucker 
231*c9888d95SJean-Philippe Brucker 	if (is_write)
232*c9888d95SJean-Philippe Brucker 		return;
233*c9888d95SJean-Philippe Brucker 
234*c9888d95SJean-Philippe Brucker 	/*
235*c9888d95SJean-Philippe Brucker 	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA
236*c9888d95SJean-Philippe Brucker 	 * is completely useless here. Note that Linux doesn't use PBA.
237*c9888d95SJean-Philippe Brucker 	 */
238*c9888d95SJean-Philippe Brucker 	if (pread(vdev->fd, data, len, pba->offset + offset) != (ssize_t)len)
239*c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
240*c9888d95SJean-Philippe Brucker }
241*c9888d95SJean-Philippe Brucker 
/*
 * MMIO handler for the emulated MSI-X table. Reads return the cached
 * entry; writes update it and, when the Vector Control word is touched,
 * propagate mask/route changes to KVM and the physical capability.
 */
static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				       u32 len, u8 is_write, void *ptr)
{
	struct kvm *kvm = vcpu->kvm;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = ptr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	u64 offset = addr - pdev->msix_table.guest_phys_addr;

	/* Each 16-byte table entry: addr lo/hi, data, vector control. */
	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
	off_t field = offset % PCI_MSIX_ENTRY_SIZE;

	/*
	 * PCI spec says that software must use aligned 4 or 8 bytes accesses
	 * for the MSI-X tables.
	 */
	if ((len != 4 && len != 8) || addr & (len - 1)) {
		vfio_dev_warn(vdev, "invalid MSI-X table access");
		return;
	}

	/*
	 * NOTE(review): vector is not range-checked against the number of
	 * table entries — presumably the registered MMIO window matches the
	 * table size exactly, so out-of-range accesses can't reach us.
	 * Confirm against the region registration code.
	 */
	entry = &pdev->msix.entries[vector];

	mutex_lock(&pdev->msix.mutex);

	if (!is_write) {
		memcpy(data, (void *)&entry->config + field, len);
		goto out_unlock;
	}

	memcpy((void *)&entry->config + field, data, len);

	/*
	 * Check if access touched the vector control register, which is at the
	 * end of the MSI-X entry.
	 */
	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
		goto out_unlock;

	/* Mirror the guest's per-vector mask bit into the virtual state. */
	msi_set_masked(entry->virt_state, entry->config.ctrl &
		       PCI_MSIX_ENTRY_CTRL_MASKBIT);

	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
		/* Not much we can do here. */
		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev))
		vfio_dev_err(vdev, "cannot enable MSIX");

out_unlock:
	mutex_unlock(&pdev->msix.mutex);
}
296*c9888d95SJean-Philippe Brucker 
/*
 * Intercept config-space writes overlapping the MSI-X Message Control
 * word and apply Enable / Function-Mask changes to the device.
 */
static void vfio_pci_msix_cap_write(struct kvm *kvm,
				    struct vfio_device *vdev, u8 off,
				    void *data, int sz)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	/* Offset within the capability of the byte holding Enable/MASKALL */
	off_t enable_pos = PCI_MSIX_FLAGS + 1;
	bool enable;
	u16 flags;

	/* Make 'off' relative to the start of the MSI-X capability. */
	off -= pdev->msix.pos;

	/* Check if access intersects with the MSI-X Enable bit */
	if (off > enable_pos || off + sz <= enable_pos)
		return;

	/*
	 * Read byte that contains the Enable bit; shift it into the high
	 * byte so the 16-bit PCI_MSIX_FLAGS_* masks line up.
	 */
	flags = *(u8 *)(data + enable_pos - off) << 8;

	mutex_lock(&pdev->msix.mutex);

	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
	enable = flags & PCI_MSIX_FLAGS_ENABLE;
	msi_set_enabled(pdev->msix.virt_state, enable);

	/* Push the new virtual state down to the physical capability. */
	if (enable && vfio_pci_enable_msis(kvm, vdev))
		vfio_dev_err(vdev, "cannot enable MSIX");
	else if (!enable && vfio_pci_disable_msis(kvm, vdev))
		vfio_dev_err(vdev, "cannot disable MSIX");

	mutex_unlock(&pdev->msix.mutex);
}
328*c9888d95SJean-Philippe Brucker 
3296078a454SJean-Philippe Brucker static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
3306078a454SJean-Philippe Brucker 			      u8 offset, void *data, int sz)
3316078a454SJean-Philippe Brucker {
3326078a454SJean-Philippe Brucker 	struct vfio_region_info *info;
3336078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev;
3346078a454SJean-Philippe Brucker 	struct vfio_device *vdev;
3356078a454SJean-Philippe Brucker 	char base[sz];
3366078a454SJean-Philippe Brucker 
3376078a454SJean-Philippe Brucker 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
3386078a454SJean-Philippe Brucker 	vdev = container_of(pdev, struct vfio_device, pci);
3396078a454SJean-Philippe Brucker 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
3406078a454SJean-Philippe Brucker 
3416078a454SJean-Philippe Brucker 	/* Dummy read in case of side-effects */
3426078a454SJean-Philippe Brucker 	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
3436078a454SJean-Philippe Brucker 		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
3446078a454SJean-Philippe Brucker 			      sz, offset);
3456078a454SJean-Philippe Brucker }
3466078a454SJean-Philippe Brucker 
/*
 * Guest write to PCI config space: forward the write to the physical
 * device, let the MSI-X capability handler react, then read the result
 * back into the cached header so later reads observe what the device
 * actually accepted.
 */
static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
			       u8 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	void *base = pci_hdr;	/* cached config header, updated below */

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
			      sz, offset);

	/* Handle MSI write now, since it might update the hardware capability */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);

	/* Refresh the cached header from the device. */
	if (pread(vdev->fd, base + offset, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}
3716078a454SJean-Philippe Brucker 
372*c9888d95SJean-Philippe Brucker static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
373*c9888d95SJean-Philippe Brucker {
374*c9888d95SJean-Philippe Brucker 	switch (cap_hdr->type) {
375*c9888d95SJean-Philippe Brucker 	case PCI_CAP_ID_MSIX:
376*c9888d95SJean-Philippe Brucker 		return PCI_CAP_MSIX_SIZEOF;
377*c9888d95SJean-Philippe Brucker 	default:
378*c9888d95SJean-Philippe Brucker 		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
379*c9888d95SJean-Philippe Brucker 		return 0;
380*c9888d95SJean-Philippe Brucker 	}
381*c9888d95SJean-Philippe Brucker }
382*c9888d95SJean-Philippe Brucker 
383*c9888d95SJean-Philippe Brucker static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
384*c9888d95SJean-Philippe Brucker 			    struct pci_cap_hdr *cap, off_t pos)
385*c9888d95SJean-Philippe Brucker {
386*c9888d95SJean-Philippe Brucker 	struct pci_cap_hdr *last;
387*c9888d95SJean-Philippe Brucker 	struct pci_device_header *hdr = &vdev->pci.hdr;
388*c9888d95SJean-Philippe Brucker 
389*c9888d95SJean-Philippe Brucker 	cap->next = 0;
390*c9888d95SJean-Philippe Brucker 
391*c9888d95SJean-Philippe Brucker 	if (!hdr->capabilities) {
392*c9888d95SJean-Philippe Brucker 		hdr->capabilities = pos;
393*c9888d95SJean-Philippe Brucker 		hdr->status |= PCI_STATUS_CAP_LIST;
394*c9888d95SJean-Philippe Brucker 	} else {
395*c9888d95SJean-Philippe Brucker 		last = PCI_CAP(virt_hdr, hdr->capabilities);
396*c9888d95SJean-Philippe Brucker 
397*c9888d95SJean-Philippe Brucker 		while (last->next)
398*c9888d95SJean-Philippe Brucker 			last = PCI_CAP(virt_hdr, last->next);
399*c9888d95SJean-Philippe Brucker 
400*c9888d95SJean-Philippe Brucker 		last->next = pos;
401*c9888d95SJean-Philippe Brucker 	}
402*c9888d95SJean-Philippe Brucker 
403*c9888d95SJean-Philippe Brucker 	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));
404*c9888d95SJean-Philippe Brucker 
405*c9888d95SJean-Philippe Brucker 	return 0;
406*c9888d95SJean-Philippe Brucker }
407*c9888d95SJean-Philippe Brucker 
4086078a454SJean-Philippe Brucker static int vfio_pci_parse_caps(struct vfio_device *vdev)
4096078a454SJean-Philippe Brucker {
410*c9888d95SJean-Philippe Brucker 	int ret;
411*c9888d95SJean-Philippe Brucker 	size_t size;
412*c9888d95SJean-Philippe Brucker 	u8 pos, next;
413*c9888d95SJean-Philippe Brucker 	struct pci_cap_hdr *cap;
414*c9888d95SJean-Philippe Brucker 	u8 virt_hdr[PCI_DEV_CFG_SIZE];
4156078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
4166078a454SJean-Philippe Brucker 
4176078a454SJean-Philippe Brucker 	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
4186078a454SJean-Philippe Brucker 		return 0;
4196078a454SJean-Philippe Brucker 
420*c9888d95SJean-Philippe Brucker 	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);
421*c9888d95SJean-Philippe Brucker 
422*c9888d95SJean-Philippe Brucker 	pos = pdev->hdr.capabilities & ~3;
423*c9888d95SJean-Philippe Brucker 
4246078a454SJean-Philippe Brucker 	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
4256078a454SJean-Philippe Brucker 	pdev->hdr.capabilities = 0;
4266078a454SJean-Philippe Brucker 
427*c9888d95SJean-Philippe Brucker 	for (; pos; pos = next) {
428*c9888d95SJean-Philippe Brucker 		if (pos >= PCI_DEV_CFG_SIZE) {
429*c9888d95SJean-Philippe Brucker 			vfio_dev_warn(vdev, "ignoring cap outside of config space");
430*c9888d95SJean-Philippe Brucker 			return -EINVAL;
431*c9888d95SJean-Philippe Brucker 		}
432*c9888d95SJean-Philippe Brucker 
433*c9888d95SJean-Philippe Brucker 		cap = PCI_CAP(&pdev->hdr, pos);
434*c9888d95SJean-Philippe Brucker 		next = cap->next;
435*c9888d95SJean-Philippe Brucker 
436*c9888d95SJean-Philippe Brucker 		switch (cap->type) {
437*c9888d95SJean-Philippe Brucker 		case PCI_CAP_ID_MSIX:
438*c9888d95SJean-Philippe Brucker 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
439*c9888d95SJean-Philippe Brucker 			if (ret)
440*c9888d95SJean-Philippe Brucker 				return ret;
441*c9888d95SJean-Philippe Brucker 
442*c9888d95SJean-Philippe Brucker 			pdev->msix.pos = pos;
443*c9888d95SJean-Philippe Brucker 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
444*c9888d95SJean-Philippe Brucker 			break;
445*c9888d95SJean-Philippe Brucker 		}
446*c9888d95SJean-Philippe Brucker 	}
447*c9888d95SJean-Philippe Brucker 
448*c9888d95SJean-Philippe Brucker 	/* Wipe remaining capabilities */
449*c9888d95SJean-Philippe Brucker 	pos = PCI_STD_HEADER_SIZEOF;
450*c9888d95SJean-Philippe Brucker 	size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF;
451*c9888d95SJean-Philippe Brucker 	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
4526078a454SJean-Philippe Brucker 
4536078a454SJean-Philippe Brucker 	return 0;
4546078a454SJean-Philippe Brucker }
4556078a454SJean-Philippe Brucker 
4566078a454SJean-Philippe Brucker static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
4576078a454SJean-Philippe Brucker {
458*c9888d95SJean-Philippe Brucker 	ssize_t sz = PCI_DEV_CFG_SIZE;
4596078a454SJean-Philippe Brucker 	struct vfio_region_info *info;
4606078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
4616078a454SJean-Philippe Brucker 
4626078a454SJean-Philippe Brucker 	if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
4636078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "Config Space not found");
4646078a454SJean-Philippe Brucker 		return -ENODEV;
4656078a454SJean-Philippe Brucker 	}
4666078a454SJean-Philippe Brucker 
4676078a454SJean-Philippe Brucker 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
4686078a454SJean-Philippe Brucker 	*info = (struct vfio_region_info) {
4696078a454SJean-Philippe Brucker 			.argsz = sizeof(*info),
4706078a454SJean-Philippe Brucker 			.index = VFIO_PCI_CONFIG_REGION_INDEX,
4716078a454SJean-Philippe Brucker 	};
4726078a454SJean-Philippe Brucker 
4736078a454SJean-Philippe Brucker 	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
4746078a454SJean-Philippe Brucker 	if (!info->size) {
4756078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "Config Space has size zero?!");
4766078a454SJean-Philippe Brucker 		return -EINVAL;
4776078a454SJean-Philippe Brucker 	}
4786078a454SJean-Philippe Brucker 
479*c9888d95SJean-Philippe Brucker 	/* Read standard headers and capabilities */
4806078a454SJean-Philippe Brucker 	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
4816078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
4826078a454SJean-Philippe Brucker 		return -EIO;
4836078a454SJean-Philippe Brucker 	}
4846078a454SJean-Philippe Brucker 
4856078a454SJean-Philippe Brucker 	/* Strip bit 7, that indicates multifunction */
4866078a454SJean-Philippe Brucker 	pdev->hdr.header_type &= 0x7f;
4876078a454SJean-Philippe Brucker 
4886078a454SJean-Philippe Brucker 	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
4896078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "unsupported header type %u",
4906078a454SJean-Philippe Brucker 			     pdev->hdr.header_type);
4916078a454SJean-Philippe Brucker 		return -EOPNOTSUPP;
4926078a454SJean-Philippe Brucker 	}
4936078a454SJean-Philippe Brucker 
494*c9888d95SJean-Philippe Brucker 	if (pdev->hdr.irq_pin)
495*c9888d95SJean-Philippe Brucker 		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;
496*c9888d95SJean-Philippe Brucker 
4976078a454SJean-Philippe Brucker 	vfio_pci_parse_caps(vdev);
4986078a454SJean-Philippe Brucker 
4996078a454SJean-Philippe Brucker 	return 0;
5006078a454SJean-Philippe Brucker }
5016078a454SJean-Philippe Brucker 
/*
 * Rewrite the device's Configuration Space header so it matches what we
 * emulate for the guest: BARs point at the guest-physical addresses we
 * allocated, unsupported resources (cardbus, expansion ROM) are hidden,
 * and the fixed-up header is written back through the VFIO config region.
 *
 * Returns 0 on success, -EIO if the header could not be written back.
 */
static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
{
	int i;
	ssize_t hdr_sz;
	struct msix_cap *msix;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;

	/* Enable exclusively MMIO and bus mastering */
	pdev->hdr.command &= ~PCI_COMMAND_IO;
	pdev->hdr.command |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER;

	/* Initialise the BARs */
	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		struct vfio_region *region = &vdev->regions[i];
		u64 base = region->guest_phys_addr;

		/* Skip BARs that were never given a guest address. */
		if (!base)
			continue;

		pdev->hdr.bar_size[i] = region->info.size;

		/* Construct a fake reg to match what we've mapped. */
		pdev->hdr.bar[i] = (base & PCI_BASE_ADDRESS_MEM_MASK) |
					PCI_BASE_ADDRESS_SPACE_MEMORY |
					PCI_BASE_ADDRESS_MEM_TYPE_32;
	}

	/* I really can't be bothered to support cardbus. */
	pdev->hdr.card_bus = 0;

	/*
	 * Nuke the expansion ROM for now. If we want to do this properly,
	 * we need to save its size somewhere and map into the guest.
	 */
	pdev->hdr.exp_rom_bar = 0;

	/* Plumb in our fake MSI-X capability, if we have it. */
	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
	if (msix) {
		/* Add a shortcut to the PBA region for the MMIO handler */
		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
		pdev->msix_pba.offset = vdev->regions[pba_index].info.offset +
					(msix->pba_offset & PCI_MSIX_PBA_OFFSET);

		/*
		 * Tidy up the capability: keep only the BIR bits, and when
		 * table and PBA share a BAR, place the PBA right after the
		 * (page-aligned) table, matching create_msix_table's layout.
		 */
		msix->table_offset &= PCI_MSIX_TABLE_BIR;
		msix->pba_offset &= PCI_MSIX_PBA_BIR;
		if (pdev->msix_table.bar == pdev->msix_pba.bar)
			msix->pba_offset |= pdev->msix_table.size &
					    PCI_MSIX_PBA_OFFSET;
	}

	/* Install our fake Configuration Space */
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	hdr_sz = PCI_DEV_CFG_SIZE;
	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
			     hdr_sz);
		return -EIO;
	}

	/* Register callbacks for cfg accesses */
	pdev->hdr.cfg_ops = (struct pci_config_operations) {
		.read	= vfio_pci_cfg_read,
		.write	= vfio_pci_cfg_write,
	};

	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;

	return 0;
}
5746078a454SJean-Philippe Brucker 
575*c9888d95SJean-Philippe Brucker static int vfio_pci_create_msix_table(struct kvm *kvm,
576*c9888d95SJean-Philippe Brucker 				      struct vfio_pci_device *pdev)
577*c9888d95SJean-Philippe Brucker {
578*c9888d95SJean-Philippe Brucker 	int ret;
579*c9888d95SJean-Philippe Brucker 	size_t i;
580*c9888d95SJean-Philippe Brucker 	size_t mmio_size;
581*c9888d95SJean-Philippe Brucker 	size_t nr_entries;
582*c9888d95SJean-Philippe Brucker 	struct vfio_pci_msi_entry *entries;
583*c9888d95SJean-Philippe Brucker 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
584*c9888d95SJean-Philippe Brucker 	struct vfio_pci_msix_table *table = &pdev->msix_table;
585*c9888d95SJean-Philippe Brucker 	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
586*c9888d95SJean-Philippe Brucker 
587*c9888d95SJean-Philippe Brucker 	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
588*c9888d95SJean-Philippe Brucker 	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;
589*c9888d95SJean-Philippe Brucker 
590*c9888d95SJean-Philippe Brucker 	/*
591*c9888d95SJean-Philippe Brucker 	 * KVM needs memory regions to be multiple of and aligned on PAGE_SIZE.
592*c9888d95SJean-Philippe Brucker 	 */
593*c9888d95SJean-Philippe Brucker 	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
594*c9888d95SJean-Philippe Brucker 	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
595*c9888d95SJean-Philippe Brucker 	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), PAGE_SIZE);
596*c9888d95SJean-Philippe Brucker 
597*c9888d95SJean-Philippe Brucker 	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
598*c9888d95SJean-Philippe Brucker 	if (!entries)
599*c9888d95SJean-Philippe Brucker 		return -ENOMEM;
600*c9888d95SJean-Philippe Brucker 
601*c9888d95SJean-Philippe Brucker 	for (i = 0; i < nr_entries; i++)
602*c9888d95SJean-Philippe Brucker 		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
603*c9888d95SJean-Philippe Brucker 
604*c9888d95SJean-Philippe Brucker 	/*
605*c9888d95SJean-Philippe Brucker 	 * To ease MSI-X cap configuration in case they share the same BAR,
606*c9888d95SJean-Philippe Brucker 	 * collapse table and pending array. The size of the BAR regions must be
607*c9888d95SJean-Philippe Brucker 	 * powers of two.
608*c9888d95SJean-Philippe Brucker 	 */
609*c9888d95SJean-Philippe Brucker 	mmio_size = roundup_pow_of_two(table->size + pba->size);
610*c9888d95SJean-Philippe Brucker 	table->guest_phys_addr = pci_get_io_space_block(mmio_size);
611*c9888d95SJean-Philippe Brucker 	if (!table->guest_phys_addr) {
612*c9888d95SJean-Philippe Brucker 		pr_err("cannot allocate IO space");
613*c9888d95SJean-Philippe Brucker 		ret = -ENOMEM;
614*c9888d95SJean-Philippe Brucker 		goto out_free;
615*c9888d95SJean-Philippe Brucker 	}
616*c9888d95SJean-Philippe Brucker 	pba->guest_phys_addr = table->guest_phys_addr + table->size;
617*c9888d95SJean-Philippe Brucker 
618*c9888d95SJean-Philippe Brucker 	ret = kvm__register_mmio(kvm, table->guest_phys_addr, table->size,
619*c9888d95SJean-Philippe Brucker 				 false, vfio_pci_msix_table_access, pdev);
620*c9888d95SJean-Philippe Brucker 	if (ret < 0)
621*c9888d95SJean-Philippe Brucker 		goto out_free;
622*c9888d95SJean-Philippe Brucker 
623*c9888d95SJean-Philippe Brucker 	/*
624*c9888d95SJean-Philippe Brucker 	 * We could map the physical PBA directly into the guest, but it's
625*c9888d95SJean-Philippe Brucker 	 * likely smaller than a page, and we can only hand full pages to the
626*c9888d95SJean-Philippe Brucker 	 * guest. Even though the PCI spec disallows sharing a page used for
627*c9888d95SJean-Philippe Brucker 	 * MSI-X with any other resource, it allows to share the same page
628*c9888d95SJean-Philippe Brucker 	 * between MSI-X table and PBA. For the sake of isolation, create a
629*c9888d95SJean-Philippe Brucker 	 * virtual PBA.
630*c9888d95SJean-Philippe Brucker 	 */
631*c9888d95SJean-Philippe Brucker 	ret = kvm__register_mmio(kvm, pba->guest_phys_addr, pba->size, false,
632*c9888d95SJean-Philippe Brucker 				 vfio_pci_msix_pba_access, pdev);
633*c9888d95SJean-Philippe Brucker 	if (ret < 0)
634*c9888d95SJean-Philippe Brucker 		goto out_free;
635*c9888d95SJean-Philippe Brucker 
636*c9888d95SJean-Philippe Brucker 	pdev->msix.entries = entries;
637*c9888d95SJean-Philippe Brucker 	pdev->msix.nr_entries = nr_entries;
638*c9888d95SJean-Philippe Brucker 
639*c9888d95SJean-Philippe Brucker 	return 0;
640*c9888d95SJean-Philippe Brucker 
641*c9888d95SJean-Philippe Brucker out_free:
642*c9888d95SJean-Philippe Brucker 	free(entries);
643*c9888d95SJean-Philippe Brucker 
644*c9888d95SJean-Philippe Brucker 	return ret;
645*c9888d95SJean-Philippe Brucker }
646*c9888d95SJean-Philippe Brucker 
6476078a454SJean-Philippe Brucker static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
6486078a454SJean-Philippe Brucker 				  size_t nr)
6496078a454SJean-Philippe Brucker {
6506078a454SJean-Philippe Brucker 	int ret;
6516078a454SJean-Philippe Brucker 	size_t map_size;
652*c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
6536078a454SJean-Philippe Brucker 	struct vfio_region *region = &vdev->regions[nr];
6546078a454SJean-Philippe Brucker 
6556078a454SJean-Philippe Brucker 	if (nr >= vdev->info.num_regions)
6566078a454SJean-Philippe Brucker 		return 0;
6576078a454SJean-Philippe Brucker 
6586078a454SJean-Philippe Brucker 	region->info = (struct vfio_region_info) {
6596078a454SJean-Philippe Brucker 		.argsz = sizeof(region->info),
6606078a454SJean-Philippe Brucker 		.index = nr,
6616078a454SJean-Philippe Brucker 	};
6626078a454SJean-Philippe Brucker 
6636078a454SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &region->info);
6646078a454SJean-Philippe Brucker 	if (ret) {
6656078a454SJean-Philippe Brucker 		ret = -errno;
6666078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "cannot get info for BAR %zu", nr);
6676078a454SJean-Philippe Brucker 		return ret;
6686078a454SJean-Philippe Brucker 	}
6696078a454SJean-Philippe Brucker 
6706078a454SJean-Philippe Brucker 	/* Ignore invalid or unimplemented regions */
6716078a454SJean-Philippe Brucker 	if (!region->info.size)
6726078a454SJean-Philippe Brucker 		return 0;
6736078a454SJean-Philippe Brucker 
674*c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
675*c9888d95SJean-Philippe Brucker 		/* Trap and emulate MSI-X table */
676*c9888d95SJean-Philippe Brucker 		if (nr == pdev->msix_table.bar) {
677*c9888d95SJean-Philippe Brucker 			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
678*c9888d95SJean-Philippe Brucker 			return 0;
679*c9888d95SJean-Philippe Brucker 		} else if (nr == pdev->msix_pba.bar) {
680*c9888d95SJean-Philippe Brucker 			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
681*c9888d95SJean-Philippe Brucker 			return 0;
682*c9888d95SJean-Philippe Brucker 		}
683*c9888d95SJean-Philippe Brucker 	}
684*c9888d95SJean-Philippe Brucker 
6856078a454SJean-Philippe Brucker 	/* Grab some MMIO space in the guest */
6866078a454SJean-Philippe Brucker 	map_size = ALIGN(region->info.size, PAGE_SIZE);
6876078a454SJean-Philippe Brucker 	region->guest_phys_addr = pci_get_io_space_block(map_size);
6886078a454SJean-Philippe Brucker 
6896078a454SJean-Philippe Brucker 	/*
6906078a454SJean-Philippe Brucker 	 * Map the BARs into the guest. We'll later need to update
6916078a454SJean-Philippe Brucker 	 * configuration space to reflect our allocation.
6926078a454SJean-Philippe Brucker 	 */
6936078a454SJean-Philippe Brucker 	ret = vfio_map_region(kvm, vdev, region);
6946078a454SJean-Philippe Brucker 	if (ret)
6956078a454SJean-Philippe Brucker 		return ret;
6966078a454SJean-Philippe Brucker 
6976078a454SJean-Philippe Brucker 	return 0;
6986078a454SJean-Philippe Brucker }
6996078a454SJean-Philippe Brucker 
7006078a454SJean-Philippe Brucker static int vfio_pci_configure_dev_regions(struct kvm *kvm,
7016078a454SJean-Philippe Brucker 					  struct vfio_device *vdev)
7026078a454SJean-Philippe Brucker {
7036078a454SJean-Philippe Brucker 	int ret;
7046078a454SJean-Philippe Brucker 	u32 bar;
7056078a454SJean-Philippe Brucker 	size_t i;
7066078a454SJean-Philippe Brucker 	bool is_64bit = false;
7076078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
7086078a454SJean-Philippe Brucker 
7096078a454SJean-Philippe Brucker 	ret = vfio_pci_parse_cfg_space(vdev);
7106078a454SJean-Philippe Brucker 	if (ret)
7116078a454SJean-Philippe Brucker 		return ret;
7126078a454SJean-Philippe Brucker 
713*c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
714*c9888d95SJean-Philippe Brucker 		ret = vfio_pci_create_msix_table(kvm, pdev);
715*c9888d95SJean-Philippe Brucker 		if (ret)
716*c9888d95SJean-Philippe Brucker 			return ret;
717*c9888d95SJean-Philippe Brucker 	}
718*c9888d95SJean-Philippe Brucker 
7196078a454SJean-Philippe Brucker 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
7206078a454SJean-Philippe Brucker 		/* Ignore top half of 64-bit BAR */
7216078a454SJean-Philippe Brucker 		if (i % 2 && is_64bit)
7226078a454SJean-Philippe Brucker 			continue;
7236078a454SJean-Philippe Brucker 
7246078a454SJean-Philippe Brucker 		ret = vfio_pci_configure_bar(kvm, vdev, i);
7256078a454SJean-Philippe Brucker 		if (ret)
7266078a454SJean-Philippe Brucker 			return ret;
7276078a454SJean-Philippe Brucker 
7286078a454SJean-Philippe Brucker 		bar = pdev->hdr.bar[i];
7296078a454SJean-Philippe Brucker 		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
7306078a454SJean-Philippe Brucker 			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
7316078a454SJean-Philippe Brucker 			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
7326078a454SJean-Philippe Brucker 	}
7336078a454SJean-Philippe Brucker 
7346078a454SJean-Philippe Brucker 	/* We've configured the BARs, fake up a Configuration Space */
7356078a454SJean-Philippe Brucker 	return vfio_pci_fixup_cfg_space(vdev);
7366078a454SJean-Philippe Brucker }
7376078a454SJean-Philippe Brucker 
/*
 * Attempt to update the FD limit, if opening an eventfd for each IRQ vector
 * would hit the limit. Which is likely to happen when a device uses 2048 MSIs.
 *
 * Best-effort: always returns 0. On failure we merely warn, since the guest
 * may still work with a subset of the vectors.
 */
static int vfio_pci_reserve_irq_fds(size_t num)
{
	/*
	 * I counted around 27 fds under normal load. Let's add 100 for good
	 * measure.
	 */
	static size_t needed = 128;	/* cumulative across all devices */
	struct rlimit cur, want;

	needed += num;

	if (getrlimit(RLIMIT_NOFILE, &cur)) {
		perror("getrlimit(RLIMIT_NOFILE)");
		return 0;
	}

	/* Soft limit already large enough, nothing to do. */
	if (cur.rlim_cur >= needed)
		return 0;

	want.rlim_cur = needed;
	/* Raising the hard limit only succeeds for privileged processes. */
	want.rlim_max = (cur.rlim_max < needed) ? needed : cur.rlim_max;

	if (setrlimit(RLIMIT_NOFILE, &want)) {
		perror("setrlimit(RLIMIT_NOFILE)");
		pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
			   (size_t)(needed - cur.rlim_cur));
	}

	return 0;
}
777*c9888d95SJean-Philippe Brucker 
778*c9888d95SJean-Philippe Brucker static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
779*c9888d95SJean-Philippe Brucker 			     struct vfio_pci_msi_common *msis)
780*c9888d95SJean-Philippe Brucker {
781*c9888d95SJean-Philippe Brucker 	int ret;
782*c9888d95SJean-Philippe Brucker 	size_t i;
783*c9888d95SJean-Philippe Brucker 	int *eventfds;
784*c9888d95SJean-Philippe Brucker 	size_t irq_set_size;
785*c9888d95SJean-Philippe Brucker 	struct vfio_pci_msi_entry *entry;
786*c9888d95SJean-Philippe Brucker 	size_t nr_entries = msis->nr_entries;
787*c9888d95SJean-Philippe Brucker 
788*c9888d95SJean-Philippe Brucker 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
789*c9888d95SJean-Philippe Brucker 	if (ret || &msis->info.count == 0) {
790*c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "no MSI reported by VFIO");
791*c9888d95SJean-Philippe Brucker 		return -ENODEV;
792*c9888d95SJean-Philippe Brucker 	}
793*c9888d95SJean-Philippe Brucker 
794*c9888d95SJean-Philippe Brucker 	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
795*c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
796*c9888d95SJean-Philippe Brucker 		return -EINVAL;
797*c9888d95SJean-Philippe Brucker 	}
798*c9888d95SJean-Philippe Brucker 
799*c9888d95SJean-Philippe Brucker 	if (msis->info.count != nr_entries) {
800*c9888d95SJean-Philippe Brucker 		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
801*c9888d95SJean-Philippe Brucker 		return -EINVAL;
802*c9888d95SJean-Philippe Brucker 	}
803*c9888d95SJean-Philippe Brucker 
804*c9888d95SJean-Philippe Brucker 	mutex_init(&msis->mutex);
805*c9888d95SJean-Philippe Brucker 
806*c9888d95SJean-Philippe Brucker 	vfio_pci_reserve_irq_fds(nr_entries);
807*c9888d95SJean-Philippe Brucker 
808*c9888d95SJean-Philippe Brucker 	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
809*c9888d95SJean-Philippe Brucker 	msis->irq_set = malloc(irq_set_size);
810*c9888d95SJean-Philippe Brucker 	if (!msis->irq_set)
811*c9888d95SJean-Philippe Brucker 		return -ENOMEM;
812*c9888d95SJean-Philippe Brucker 
813*c9888d95SJean-Philippe Brucker 	*msis->irq_set = (struct vfio_irq_set) {
814*c9888d95SJean-Philippe Brucker 		.argsz	= irq_set_size,
815*c9888d95SJean-Philippe Brucker 		.flags 	= VFIO_IRQ_SET_DATA_EVENTFD |
816*c9888d95SJean-Philippe Brucker 			  VFIO_IRQ_SET_ACTION_TRIGGER,
817*c9888d95SJean-Philippe Brucker 		.index 	= msis->info.index,
818*c9888d95SJean-Philippe Brucker 		.start 	= 0,
819*c9888d95SJean-Philippe Brucker 		.count 	= nr_entries,
820*c9888d95SJean-Philippe Brucker 	};
821*c9888d95SJean-Philippe Brucker 
822*c9888d95SJean-Philippe Brucker 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
823*c9888d95SJean-Philippe Brucker 
824*c9888d95SJean-Philippe Brucker 	for (i = 0; i < nr_entries; i++) {
825*c9888d95SJean-Philippe Brucker 		entry = &msis->entries[i];
826*c9888d95SJean-Philippe Brucker 		entry->gsi = -1;
827*c9888d95SJean-Philippe Brucker 		entry->eventfd = -1;
828*c9888d95SJean-Philippe Brucker 		msi_set_masked(entry->virt_state, true);
829*c9888d95SJean-Philippe Brucker 		msi_set_masked(entry->phys_state, true);
830*c9888d95SJean-Philippe Brucker 		eventfds[i] = -1;
831*c9888d95SJean-Philippe Brucker 	}
832*c9888d95SJean-Philippe Brucker 
833*c9888d95SJean-Philippe Brucker 	return 0;
834*c9888d95SJean-Philippe Brucker }
835*c9888d95SJean-Philippe Brucker 
836*c9888d95SJean-Philippe Brucker static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
837*c9888d95SJean-Philippe Brucker {
838*c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
839*c9888d95SJean-Philippe Brucker 	int gsi = pdev->intx_gsi;
840*c9888d95SJean-Philippe Brucker 	struct vfio_irq_set irq_set = {
841*c9888d95SJean-Philippe Brucker 		.argsz	= sizeof(irq_set),
842*c9888d95SJean-Philippe Brucker 		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
843*c9888d95SJean-Philippe Brucker 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
844*c9888d95SJean-Philippe Brucker 	};
845*c9888d95SJean-Philippe Brucker 
846*c9888d95SJean-Philippe Brucker 	pr_debug("user requested MSI, disabling INTx %d", gsi);
847*c9888d95SJean-Philippe Brucker 
848*c9888d95SJean-Philippe Brucker 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
849*c9888d95SJean-Philippe Brucker 	irq__del_irqfd(kvm, gsi, pdev->intx_fd);
850*c9888d95SJean-Philippe Brucker 
851*c9888d95SJean-Philippe Brucker 	close(pdev->intx_fd);
852*c9888d95SJean-Philippe Brucker }
853*c9888d95SJean-Philippe Brucker 
/*
 * Wire the device's level-triggered INTx line into the guest using a pair of
 * eventfds and a resampling KVM irqfd, then hand both fds to VFIO. On
 * success, the trigger fd and GSI are stashed in pdev so that
 * vfio_pci_disable_intx() can undo this later.
 *
 * The setup order matters: the irqfd must exist before VFIO starts firing
 * the trigger eventfd, and each error label unwinds exactly the steps taken
 * so far.
 *
 * Returns 0 on success, negative errno on failure.
 */
static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	int trigger_fd, unmask_fd;
	struct vfio_irq_eventfd	trigger;
	struct vfio_irq_eventfd	unmask;
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;

	struct vfio_irq_info irq_info = {
		.argsz = sizeof(irq_info),
		.index = VFIO_PCI_INTX_IRQ_INDEX,
	};

	/* Two eventfds per device; best-effort bump of the FD limit. */
	vfio_pci_reserve_irq_fds(2);

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
	if (ret || irq_info.count == 0) {
		vfio_dev_err(vdev, "no INTx reported by VFIO");
		return -ENODEV;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not eventfd capable");
		return -EINVAL;
	}

	/* We rely on VFIO masking the line until the guest unmasks it. */
	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
		return -EINVAL;
	}

	/*
	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
	 * signals an interrupt from host to guest, and unmask_fd signals the
	 * deassertion of the line from guest to host.
	 */
	trigger_fd = eventfd(0, 0);
	if (trigger_fd < 0) {
		vfio_dev_err(vdev, "failed to create trigger eventfd");
		return trigger_fd;
	}

	unmask_fd = eventfd(0, 0);
	if (unmask_fd < 0) {
		vfio_dev_err(vdev, "failed to create unmask eventfd");
		close(trigger_fd);
		return unmask_fd;
	}

	/* Resampling irqfd: KVM injects on trigger, signals unmask on EOI. */
	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
	if (ret)
		goto err_close;

	trigger.irq = (struct vfio_irq_set) {
		.argsz	= sizeof(trigger),
		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= VFIO_PCI_INTX_IRQ_INDEX,
		.start	= 0,
		.count	= 1,
	};
	trigger.fd = trigger_fd;

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
		goto err_delete_line;
	}

	unmask.irq = (struct vfio_irq_set) {
		.argsz	= sizeof(unmask),
		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
		.index	= VFIO_PCI_INTX_IRQ_INDEX,
		.start	= 0,
		.count	= 1,
	};
	unmask.fd = unmask_fd;

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup unmask IRQ");
		goto err_remove_event;
	}

	pdev->intx_fd = trigger_fd;
	/* Guest is going to overwrite our irq_line... */
	pdev->intx_gsi = gsi;

	return 0;

err_remove_event:
	/* Remove trigger event */
	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	trigger.irq.count = 0;
	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);

err_delete_line:
	irq__del_irqfd(kvm, gsi, trigger_fd);

err_close:
	close(trigger_fd);
	close(unmask_fd);
	return ret;
}
9586078a454SJean-Philippe Brucker 
9596078a454SJean-Philippe Brucker static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
9606078a454SJean-Philippe Brucker {
961*c9888d95SJean-Philippe Brucker 	int ret = 0;
9626078a454SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
9636078a454SJean-Philippe Brucker 
964*c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
965*c9888d95SJean-Philippe Brucker 		pdev->msix.info = (struct vfio_irq_info) {
966*c9888d95SJean-Philippe Brucker 			.argsz = sizeof(pdev->msix.info),
967*c9888d95SJean-Philippe Brucker 			.index = VFIO_PCI_MSIX_IRQ_INDEX,
9686078a454SJean-Philippe Brucker 		};
969*c9888d95SJean-Philippe Brucker 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
970*c9888d95SJean-Philippe Brucker 		if (ret)
971*c9888d95SJean-Philippe Brucker 			return ret;
9726078a454SJean-Philippe Brucker 	}
9736078a454SJean-Philippe Brucker 
974*c9888d95SJean-Philippe Brucker 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
975*c9888d95SJean-Philippe Brucker 		ret = vfio_pci_enable_intx(kvm, vdev);
976*c9888d95SJean-Philippe Brucker 
977*c9888d95SJean-Philippe Brucker 	return ret;
9786078a454SJean-Philippe Brucker }
9796078a454SJean-Philippe Brucker 
9806078a454SJean-Philippe Brucker int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
9816078a454SJean-Philippe Brucker {
9826078a454SJean-Philippe Brucker 	int ret;
9836078a454SJean-Philippe Brucker 
9846078a454SJean-Philippe Brucker 	ret = vfio_pci_configure_dev_regions(kvm, vdev);
9856078a454SJean-Philippe Brucker 	if (ret) {
9866078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to configure regions");
9876078a454SJean-Philippe Brucker 		return ret;
9886078a454SJean-Philippe Brucker 	}
9896078a454SJean-Philippe Brucker 
9906078a454SJean-Philippe Brucker 	vdev->dev_hdr = (struct device_header) {
9916078a454SJean-Philippe Brucker 		.bus_type	= DEVICE_BUS_PCI,
9926078a454SJean-Philippe Brucker 		.data		= &vdev->pci.hdr,
9936078a454SJean-Philippe Brucker 	};
9946078a454SJean-Philippe Brucker 
9956078a454SJean-Philippe Brucker 	ret = device__register(&vdev->dev_hdr);
9966078a454SJean-Philippe Brucker 	if (ret) {
9976078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to register VFIO device");
9986078a454SJean-Philippe Brucker 		return ret;
9996078a454SJean-Philippe Brucker 	}
10006078a454SJean-Philippe Brucker 
10016078a454SJean-Philippe Brucker 	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
10026078a454SJean-Philippe Brucker 	if (ret) {
10036078a454SJean-Philippe Brucker 		vfio_dev_err(vdev, "failed to configure IRQs");
10046078a454SJean-Philippe Brucker 		return ret;
10056078a454SJean-Philippe Brucker 	}
10066078a454SJean-Philippe Brucker 
10076078a454SJean-Philippe Brucker 	return 0;
10086078a454SJean-Philippe Brucker }
10096078a454SJean-Philippe Brucker 
10106078a454SJean-Philippe Brucker void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
10116078a454SJean-Philippe Brucker {
10126078a454SJean-Philippe Brucker 	size_t i;
1013*c9888d95SJean-Philippe Brucker 	struct vfio_pci_device *pdev = &vdev->pci;
10146078a454SJean-Philippe Brucker 
10156078a454SJean-Philippe Brucker 	for (i = 0; i < vdev->info.num_regions; i++)
10166078a454SJean-Philippe Brucker 		vfio_unmap_region(kvm, &vdev->regions[i]);
10176078a454SJean-Philippe Brucker 
10186078a454SJean-Philippe Brucker 	device__unregister(&vdev->dev_hdr);
1019*c9888d95SJean-Philippe Brucker 
1020*c9888d95SJean-Philippe Brucker 	free(pdev->msix.irq_set);
1021*c9888d95SJean-Philippe Brucker 	free(pdev->msix.entries);
10226078a454SJean-Philippe Brucker }
1023