xref: /kvmtool/vfio/pci.c (revision 0b5e55fc032d1c6394b8ec7fe02d842813c903df)
1 #include "linux/sizes.h"
2 
3 #include "kvm/irq.h"
4 #include "kvm/kvm.h"
5 #include "kvm/kvm-cpu.h"
6 #include "kvm/vfio.h"
7 
8 #include <assert.h>
9 
10 #include <sys/ioctl.h>
11 #include <sys/eventfd.h>
12 #include <sys/resource.h>
13 #include <sys/time.h>
14 
15 /* Some distros don't have the define. */
16 #ifndef PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1
17 #define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1	12
18 #endif
19 
20 /* Wrapper around UAPI vfio_irq_set */
21 union vfio_irq_eventfd {
22 	struct vfio_irq_set	irq;
23 	u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)];
24 };
25 
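/*
 * VFIO_IRQ_SET_DATA_EVENTFD expects the eventfd descriptor to follow the
 * struct vfio_irq_set header in the flexible data[] array, hence the extra
 * sizeof(int) reserved in the buffer above.
 */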
26 static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd)
27 {
28 	memcpy(&evfd->irq.data, &fd, sizeof(fd));
29 }
30 
31 /*
32  * To support MSI and MSI-X with common code, track the host and guest states of
33  * the MSI/MSI-X capability, and of individual vectors.
34  *
35  * Both MSI and MSI-X capabilities are enabled and disabled through registers.
36  * Vectors cannot be individually disabled.
37  */
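/*
 * guest_state tracks what the guest wrote to the emulated capability, while
 * host_state tracks what has actually been applied to the device through
 * VFIO_DEVICE_SET_IRQS and KVM irqfd routes.
 */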
38 #define msi_is_enabled(state)		((state) & VFIO_PCI_MSI_STATE_ENABLED)
39 
40 /*
41  * MSI-X: the control register allows masking all vectors, and the table
42  * allows masking each vector individually.
43  *
44  * MSI: if the capability supports Per-Vector Masking then the Mask Bit
45  * register allows masking each vector individually. Otherwise there is no
46  * masking for MSI.
47  */
48 #define msi_is_masked(state)		((state) & VFIO_PCI_MSI_STATE_MASKED)
49 
50 /*
51  * A capability is empty when no vector has been registered with SET_IRQS
52  * yet. It's an optimization specific to kvmtool to avoid issuing lots of
53  * SET_IRQS ioctls when the guest configures the MSI-X table while the
54  * capability is masked.
55  */
56 #define msi_is_empty(state)		((state) & VFIO_PCI_MSI_STATE_EMPTY)
57 
58 #define msi_update_state(state, val, bit)				\
59 	(state) = (val) ? (state) | (bit) : (state) & ~(bit);
60 #define msi_set_enabled(state, val)					\
61 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
62 #define msi_set_masked(state, val)					\
63 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
64 #define msi_set_empty(state, val)					\
65 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)
66 
67 static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
68 static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);
69 
70 static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
71 				bool msix)
72 {
73 	size_t i;
74 	int ret = 0;
75 	int *eventfds;
76 	struct vfio_pci_device *pdev = &vdev->pci;
77 	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
78 	union vfio_irq_eventfd single = {
79 		.irq = {
80 			.argsz	= sizeof(single),
81 			.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
82 				  VFIO_IRQ_SET_ACTION_TRIGGER,
83 			.index	= msis->info.index,
84 			.count	= 1,
85 		},
86 	};
87 
88 	if (!msi_is_enabled(msis->guest_state))
89 		return 0;
90 
91 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
92 		/*
93 		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
94 		 * time. Since INTx has to be enabled from the start (we don't
95 		 * have a reliable way to know when the guest starts using it),
96 		 * disable it now.
97 		 */
98 		vfio_pci_disable_intx(kvm, vdev);
99 
100 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
101 
102 	/*
103 	 * Initial registration of the full range. This enables the physical
104 	 * MSI/MSI-X capability, which might have side effects. For instance
105 	 * when assigning virtio legacy devices, enabling the MSI capability
106 	 * modifies the config space layout!
107 	 *
108 	 * As an optimization, only update MSIs when the guest unmasks the
109 	 * capability. This greatly reduces the initialization time for a Linux
110 	 * guest with 2048+ MSIs. A Linux guest starts by enabling the MSI-X cap
111 	 * masked, then fills individual vectors, then unmasks the whole
112 	 * function. So we only do one VFIO ioctl when enabling for the first
113 	 * time, and then one when unmasking.
114 	 */
115 	if (!msi_is_enabled(msis->host_state) ||
116 	    (!msi_is_masked(msis->guest_state) &&
117 	     msi_is_empty(msis->host_state))) {
118 		bool empty = true;
119 
120 		for (i = 0; i < msis->nr_entries; i++) {
121 			eventfds[i] = msis->entries[i].gsi >= 0 ?
122 				      msis->entries[i].eventfd : -1;
123 
124 			if (eventfds[i] >= 0)
125 				empty = false;
126 		}
127 
128 		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
129 		if (ret < 0) {
130 			perror("VFIO_DEVICE_SET_IRQS(multi)");
131 			return ret;
132 		}
133 
134 		msi_set_enabled(msis->host_state, true);
135 		msi_set_empty(msis->host_state, empty);
136 
137 		return 0;
138 	}
139 
140 	if (msi_is_masked(msis->guest_state)) {
141 		/* TODO: if host_state is neither empty nor masked, mask all vectors */
142 		return 0;
143 	}
144 
145 	/* Update individual vectors to avoid breaking those in use */
146 	for (i = 0; i < msis->nr_entries; i++) {
147 		struct vfio_pci_msi_entry *entry = &msis->entries[i];
148 		int fd = entry->gsi >= 0 ? entry->eventfd : -1;
149 
150 		if (fd == eventfds[i])
151 			continue;
152 
153 		single.irq.start = i;
154 		set_vfio_irq_eventd_payload(&single, fd);
155 
156 		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
157 		if (ret < 0) {
158 			perror("VFIO_DEVICE_SET_IRQS(single)");
159 			break;
160 		}
161 
162 		eventfds[i] = fd;
163 
164 		if (msi_is_empty(msis->host_state) && fd >= 0)
165 			msi_set_empty(msis->host_state, false);
166 	}
167 
168 	return ret;
169 }
170 
171 static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
172 				 bool msix)
173 {
174 	int ret;
175 	struct vfio_pci_device *pdev = &vdev->pci;
176 	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
177 	struct vfio_irq_set irq_set = {
178 		.argsz	= sizeof(irq_set),
179 		.flags 	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
180 		.index 	= msis->info.index,
181 		.start 	= 0,
182 		.count	= 0,
183 	};
184 
185 	if (!msi_is_enabled(msis->host_state))
186 		return 0;
187 
188 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
189 	if (ret < 0) {
190 		perror("VFIO_DEVICE_SET_IRQS(NONE)");
191 		return ret;
192 	}
193 
194 	msi_set_enabled(msis->host_state, false);
195 	msi_set_empty(msis->host_state, true);
196 
197 	/*
198 	 * When MSI or MSI-X is being disabled, it might be because the guest
199 	 * PCI driver detected an MSI failure and wants to roll back to INTx
200 	 * mode. Thus enable INTx if the device supports INTx mode in this
201 	 * case.
202 	 */
203 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
204 		ret = vfio_pci_enable_intx(kvm, vdev);
205 
206 	return ret >= 0 ? 0 : ret;
207 }
208 
209 static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
210 				     struct vfio_pci_msi_entry *entry)
211 {
212 	int ret;
213 
214 	if (entry->eventfd < 0) {
215 		entry->eventfd = eventfd(0, 0);
216 		if (entry->eventfd < 0) {
217 			ret = -errno;
218 			vfio_dev_err(vdev, "cannot create eventfd");
219 			return ret;
220 		}
221 	}
222 
223 	/* Allocate IRQ if necessary */
224 	if (entry->gsi < 0) {
225 		int ret = irq__add_msix_route(kvm, &entry->config.msg,
226 					      vdev->dev_hdr.dev_num << 3);
227 		if (ret < 0) {
228 			vfio_dev_err(vdev, "cannot create MSI-X route");
229 			return ret;
230 		}
231 		entry->gsi = ret;
232 	} else {
233 		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
234 	}
235 
236 	/*
237 	 * MSI masking is unimplemented in VFIO, so we have to handle it by
238 	 * disabling/enabling the IRQ route instead. We do it on the KVM side
239 	 * rather than VFIO, because:
240 	 * - it is 8x faster
241 	 * - it decouples the masking logic from the capability state.
242 	 * - in masked state, after removing the irqfd route, we could easily
243 	 *   plug the eventfd into a local handler, in order to serve Pending
244 	 *   Bit reads to the guest.
245 	 *
246 	 * So entry->host_state is masked when there is no active irqfd route.
247 	 */
248 	if (msi_is_masked(entry->guest_state) == msi_is_masked(entry->host_state))
249 		return 0;
250 
251 	if (msi_is_masked(entry->host_state)) {
252 		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
253 		if (ret < 0) {
254 			vfio_dev_err(vdev, "cannot setup irqfd");
255 			return ret;
256 		}
257 	} else {
258 		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
259 	}
260 
261 	msi_set_masked(entry->host_state, msi_is_masked(entry->guest_state));
262 
263 	return 0;
264 }
265 
266 static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
267 				     u32 len, u8 is_write, void *ptr)
268 {
269 	struct vfio_pci_device *pdev = ptr;
270 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
271 	u64 offset = addr - pba->guest_phys_addr;
272 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
273 
274 	if (offset >= pba->size) {
275 		vfio_dev_err(vdev, "access outside of the MSIX PBA");
276 		return;
277 	}
278 
279 	if (is_write)
280 		return;
281 
282 	/*
283 	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA
284 	 * is completely useless here. Note that Linux doesn't use PBA.
285 	 */
286 	if (pread(vdev->fd, data, len, pba->fd_offset + offset) != (ssize_t)len)
287 		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
288 }
289 
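/*
 * The MSI-X table is trapped rather than mapped: kvmtool keeps its own copy
 * of each vector in entry->config, and guest writes are translated into KVM
 * MSI routes and VFIO eventfd registrations.
 */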
290 static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
291 				       u32 len, u8 is_write, void *ptr)
292 {
293 	struct kvm *kvm = vcpu->kvm;
294 	struct vfio_pci_msi_entry *entry;
295 	struct vfio_pci_device *pdev = ptr;
296 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
297 
298 	u64 offset = addr - pdev->msix_table.guest_phys_addr;
299 	if (offset >= pdev->msix_table.size) {
300 		vfio_dev_err(vdev, "access outside of the MSI-X table");
301 		return;
302 	}
303 
304 	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
305 	off_t field = offset % PCI_MSIX_ENTRY_SIZE;
306 
307 	/*
308 	 * The PCI spec says that software must use aligned 4- or 8-byte
309 	 * accesses for the MSI-X table.
310 	 */
311 	if ((len != 4 && len != 8) || addr & (len - 1)) {
312 		vfio_dev_warn(vdev, "invalid MSI-X table access");
313 		return;
314 	}
315 
316 	entry = &pdev->msix.entries[vector];
317 
318 	mutex_lock(&pdev->msix.mutex);
319 
320 	if (!is_write) {
321 		memcpy(data, (void *)&entry->config + field, len);
322 		goto out_unlock;
323 	}
324 
325 	memcpy((void *)&entry->config + field, data, len);
326 
327 	/*
328 	 * Check if access touched the vector control register, which is at the
329 	 * end of the MSI-X entry.
330 	 */
331 	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
332 		goto out_unlock;
333 
334 	msi_set_masked(entry->guest_state, entry->config.ctrl &
335 		       PCI_MSIX_ENTRY_CTRL_MASKBIT);
336 
337 	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
338 		/* Not much we can do here. */
339 		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);
340 
341 	/* Update the physical capability if necessary */
342 	if (vfio_pci_enable_msis(kvm, vdev, true))
343 		vfio_dev_err(vdev, "cannot enable MSIX");
344 
345 out_unlock:
346 	mutex_unlock(&pdev->msix.mutex);
347 }
348 
349 static void vfio_pci_msix_cap_write(struct kvm *kvm,
350 				    struct vfio_device *vdev, u16 off,
351 				    void *data, int sz)
352 {
353 	struct vfio_pci_device *pdev = &vdev->pci;
354 	off_t enable_pos = PCI_MSIX_FLAGS + 1;
355 	bool enable;
356 	u16 flags;
357 
358 	off -= pdev->msix.pos;
359 
360 	/* Check if access intersects with the MSI-X Enable bit */
361 	if (off > enable_pos || off + sz <= enable_pos)
362 		return;
363 
364 	/* Read the byte containing the Enable bit into the high half of flags */
365 	flags = *(u8 *)(data + enable_pos - off) << 8;
366 
367 	mutex_lock(&pdev->msix.mutex);
368 
369 	msi_set_masked(pdev->msix.guest_state, flags & PCI_MSIX_FLAGS_MASKALL);
370 	enable = flags & PCI_MSIX_FLAGS_ENABLE;
371 	msi_set_enabled(pdev->msix.guest_state, enable);
372 
373 	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
374 		vfio_dev_err(vdev, "cannot enable MSIX");
375 	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
376 		vfio_dev_err(vdev, "cannot disable MSIX");
377 
378 	mutex_unlock(&pdev->msix.mutex);
379 }
380 
381 static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
382 				     u16 off, u8 *data, u32 sz)
383 {
384 	size_t i;
385 	u32 mask = 0;
386 	size_t mask_pos, start, limit;
387 	struct vfio_pci_msi_entry *entry;
388 	struct vfio_pci_device *pdev = &vdev->pci;
389 	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
390 
391 	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
392 		return 0;
393 
394 	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
395 		mask_pos = PCI_MSI_MASK_64;
396 	else
397 		mask_pos = PCI_MSI_MASK_32;
398 
399 	if (off >= mask_pos + 4 || off + sz <= mask_pos)
400 		return 0;
401 
402 	/* Set mask to current state */
403 	for (i = 0; i < pdev->msi.nr_entries; i++) {
404 		entry = &pdev->msi.entries[i];
405 		mask |= !!msi_is_masked(entry->guest_state) << i;
406 	}
407 
408 	/* Update mask following the intersection of access and register */
409 	start = max_t(size_t, off, mask_pos);
410 	limit = min_t(size_t, off + sz, mask_pos + 4);
411 
412 	memcpy((void *)&mask + start - mask_pos, data + start - off,
413 	       limit - start);
414 
415 	/* Update states if necessary */
416 	for (i = 0; i < pdev->msi.nr_entries; i++) {
417 		bool masked = mask & (1 << i);
418 
419 		entry = &pdev->msi.entries[i];
420 		if (masked != msi_is_masked(entry->guest_state)) {
421 			msi_set_masked(entry->guest_state, masked);
422 			vfio_pci_update_msi_entry(kvm, vdev, entry);
423 		}
424 	}
425 
426 	return 1;
427 }
428 
429 static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
430 				   u16 off, u8 *data, u32 sz)
431 {
432 	u8 ctrl;
433 	struct msi_msg msg;
434 	size_t i, nr_vectors;
435 	struct vfio_pci_msi_entry *entry;
436 	struct vfio_pci_device *pdev = &vdev->pci;
437 	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
438 
439 	off -= pdev->msi.pos;
440 
441 	mutex_lock(&pdev->msi.mutex);
442 
443 	/* Check if the guest is trying to update mask bits */
444 	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
445 		goto out_unlock;
446 
447 	/* Only modify routes when guest pokes the enable bit */
448 	/* Only modify routes when the guest pokes the Enable bit */
449 		goto out_unlock;
450 
451 	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);
452 
453 	msi_set_enabled(pdev->msi.guest_state, ctrl & PCI_MSI_FLAGS_ENABLE);
454 
455 	if (!msi_is_enabled(pdev->msi.guest_state)) {
456 		vfio_pci_disable_msis(kvm, vdev, false);
457 		goto out_unlock;
458 	}
459 
460 	/* Create routes for the requested vectors */
461 	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);
462 
463 	msg.address_lo = msi_cap_64->address_lo;
464 	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
465 		msg.address_hi = msi_cap_64->address_hi;
466 		msg.data = msi_cap_64->data;
467 	} else {
468 		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
469 		msg.address_hi = 0;
470 		msg.data = msi_cap_32->data;
471 	}
472 
473 	for (i = 0; i < nr_vectors; i++) {
474 		entry = &pdev->msi.entries[i];
475 
476 		/*
477 		 * Set the MSI data value as required by the PCI local
478 		 * bus specifications, MSI capability, "Message Data".
479 		 */
480 		msg.data &= ~(nr_vectors - 1);
481 		msg.data |= i;
482 
483 		entry->config.msg = msg;
484 		vfio_pci_update_msi_entry(kvm, vdev, entry);
485 	}
486 
487 	/* Update the physical capability if necessary */
488 	if (vfio_pci_enable_msis(kvm, vdev, false))
489 		vfio_dev_err(vdev, "cannot enable MSI");
490 
491 out_unlock:
492 	mutex_unlock(&pdev->msi.mutex);
493 }
494 
495 static int vfio_pci_bar_activate(struct kvm *kvm,
496 				 struct pci_device_header *pci_hdr,
497 				 int bar_num, void *data)
498 {
499 	struct vfio_device *vdev = data;
500 	struct vfio_pci_device *pdev = &vdev->pci;
501 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
502 	struct vfio_pci_msix_table *table = &pdev->msix_table;
503 	struct vfio_region *region;
504 	u32 bar_addr;
505 	bool has_msix;
506 	int ret;
507 
508 	assert((u32)bar_num < vdev->info.num_regions);
509 
510 	region = &vdev->regions[bar_num];
511 	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;
512 
513 	bar_addr = pci__bar_address(pci_hdr, bar_num);
514 	if (pci__bar_is_io(pci_hdr, bar_num))
515 		region->port_base = bar_addr;
516 	else
517 		region->guest_phys_addr = bar_addr;
518 
519 	if (has_msix && (u32)bar_num == table->bar) {
520 		table->guest_phys_addr = region->guest_phys_addr;
521 		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
522 					 table->size, false,
523 					 vfio_pci_msix_table_access, pdev);
524 		/*
525 		 * The MSIX table and the PBA structure can share the same BAR,
526 		 * but for convenience we register different regions for mmio
527 		 * emulation. We want to update both if they share the same
528 		 * BAR.
529 		 */
530 		if (ret < 0 || table->bar != pba->bar)
531 			goto out;
532 	}
533 
534 	if (has_msix && (u32)bar_num == pba->bar) {
535 		if (pba->bar == table->bar)
536 			pba->guest_phys_addr = table->guest_phys_addr + pba->bar_offset;
537 		else
538 			pba->guest_phys_addr = region->guest_phys_addr;
539 		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
540 					 pba->size, false,
541 					 vfio_pci_msix_pba_access, pdev);
542 		goto out;
543 	}
544 
545 	ret = vfio_map_region(kvm, vdev, region);
546 out:
547 	return ret;
548 }
549 
550 static int vfio_pci_bar_deactivate(struct kvm *kvm,
551 				   struct pci_device_header *pci_hdr,
552 				   int bar_num, void *data)
553 {
554 	struct vfio_device *vdev = data;
555 	struct vfio_pci_device *pdev = &vdev->pci;
556 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
557 	struct vfio_pci_msix_table *table = &pdev->msix_table;
558 	struct vfio_region *region;
559 	bool has_msix, success;
560 	int ret;
561 
562 	assert((u32)bar_num < vdev->info.num_regions);
563 
564 	region = &vdev->regions[bar_num];
565 	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;
566 
567 	if (has_msix && (u32)bar_num == table->bar) {
568 		success = kvm__deregister_mmio(kvm, table->guest_phys_addr);
569 		/* kvm__deregister_mmio fails when the region is not found. */
570 		ret = (success ? 0 : -ENOENT);
571 		/* See vfio_pci_bar_activate(). */
572 		if (ret < 0 || table->bar != pba->bar)
573 			goto out;
574 	}
575 
576 	if (has_msix && (u32)bar_num == pba->bar) {
577 		success = kvm__deregister_mmio(kvm, pba->guest_phys_addr);
578 		ret = (success ? 0 : -ENOENT);
579 		goto out;
580 	}
581 
582 	vfio_unmap_region(kvm, region);
583 	ret = 0;
584 
585 out:
586 	return ret;
587 }
588 
589 static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
590 			      u16 offset, void *data, int sz)
591 {
592 	struct vfio_region_info *info;
593 	struct vfio_pci_device *pdev;
594 	struct vfio_device *vdev;
595 	char base[sz];
596 
597 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
598 	vdev = container_of(pdev, struct vfio_device, pci);
599 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
600 
601 	/* Dummy read in case of side-effects */
602 	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
603 		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
604 			      sz, offset);
605 }
606 
607 static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
608 			       u16 offset, void *data, int sz)
609 {
610 	struct vfio_region_info *info;
611 	struct vfio_pci_device *pdev;
612 	struct vfio_device *vdev;
613 	u32 tmp;
614 
615 	/* Make sure a larger size will not overrun tmp on the stack. */
616 	assert(sz <= 4);
617 
618 	if (offset == PCI_ROM_ADDRESS)
619 		return;
620 
621 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
622 	vdev = container_of(pdev, struct vfio_device, pci);
623 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
624 
625 	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
626 		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
627 			      sz, offset);
628 
629 	/* Handle MSI write now, since it might update the hardware capability */
630 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
631 		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);
632 
633 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
634 		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);
635 
636 	if (pread(vdev->fd, &tmp, sz, info->offset + offset) != sz)
637 		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
638 			      sz, offset);
639 }
640 
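/*
 * Size of the MSI capability: the 10-byte base covers the capability ID, next
 * pointer, control register, 32-bit address and 16-bit data. 64-bit addressing
 * adds the upper address dword; per-vector masking adds the Mask and Pending
 * registers (plus two reserved bytes before them).
 */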
641 static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
642 {
643 	size_t size = 10;
644 
645 	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
646 		size += 4;
647 	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
648 		size += 10;
649 
650 	return size;
651 }
652 
653 static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
654 {
655 	switch (cap_hdr->type) {
656 	case PCI_CAP_ID_MSIX:
657 		return PCI_CAP_MSIX_SIZEOF;
658 	case PCI_CAP_ID_MSI:
659 		return vfio_pci_msi_cap_size((void *)cap_hdr);
660 	case PCI_CAP_ID_EXP:
661 		/*
662 		 * We don't emulate any of the link, slot and root complex
663 		 * properties, so ignore them.
664 		 */
665 		return PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1;
666 	default:
667 		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
668 		return 0;
669 	}
670 }
671 
672 static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
673 			    struct pci_cap_hdr *cap, off_t pos)
674 {
675 	struct pci_cap_hdr *last;
676 	struct pci_device_header *hdr = &vdev->pci.hdr;
677 
678 	cap->next = 0;
679 
680 	if (!hdr->capabilities) {
681 		hdr->capabilities = pos;
682 		hdr->status |= PCI_STATUS_CAP_LIST;
683 	} else {
684 		last = PCI_CAP(virt_hdr, hdr->capabilities);
685 
686 		while (last->next)
687 			last = PCI_CAP(virt_hdr, last->next);
688 
689 		last->next = pos;
690 	}
691 
692 	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));
693 
694 	return 0;
695 }
696 
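/*
 * Build the virtual capability list. Only the MSI, MSI-X and (where the
 * architecture supports it) PCI Express capabilities are copied into the
 * emulated config space; everything else is hidden from the guest.
 */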
697 static int vfio_pci_parse_caps(struct vfio_device *vdev)
698 {
699 	int ret;
700 	size_t size;
701 	u16 pos, next;
702 	struct pci_cap_hdr *cap;
703 	u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY];
704 	struct vfio_pci_device *pdev = &vdev->pci;
705 
706 	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
707 		return 0;
708 
709 	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY);
710 
711 	pos = pdev->hdr.capabilities & ~3;
712 
713 	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
714 	pdev->hdr.capabilities = 0;
715 
716 	for (; pos; pos = next) {
717 		cap = PCI_CAP(&pdev->hdr, pos);
718 		next = cap->next;
719 
720 		switch (cap->type) {
721 		case PCI_CAP_ID_MSIX:
722 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
723 			if (ret)
724 				return ret;
725 
726 			pdev->msix.pos = pos;
727 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
728 			break;
729 		case PCI_CAP_ID_MSI:
730 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
731 			if (ret)
732 				return ret;
733 
734 			pdev->msi.pos = pos;
735 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
736 			break;
737 		case PCI_CAP_ID_EXP:
738 			if (!arch_has_pci_exp())
739 				continue;
740 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
741 			if (ret)
742 				return ret;
743 			break;
744 		}
745 	}
746 
747 	/* Wipe remaining capabilities */
748 	pos = PCI_STD_HEADER_SIZEOF;
749 	size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF;
750 	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
751 
752 	return 0;
753 }
754 
755 static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
756 {
757 	ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
758 	struct vfio_region_info *info;
759 	struct vfio_pci_device *pdev = &vdev->pci;
760 
761 	if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
762 		vfio_dev_err(vdev, "Config Space not found");
763 		return -ENODEV;
764 	}
765 
766 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
767 	*info = (struct vfio_region_info) {
768 			.argsz = sizeof(*info),
769 			.index = VFIO_PCI_CONFIG_REGION_INDEX,
770 	};
771 
772 	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
773 	if (!info->size) {
774 		vfio_dev_err(vdev, "Config Space has size zero?!");
775 		return -EINVAL;
776 	}
777 
778 	/* Read standard headers and capabilities */
779 	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
780 		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
781 		return -EIO;
782 	}
783 
784 	/* Strip bit 7, which indicates a multifunction device */
785 	pdev->hdr.header_type &= 0x7f;
786 
787 	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
788 		vfio_dev_err(vdev, "unsupported header type %u",
789 			     pdev->hdr.header_type);
790 		return -EOPNOTSUPP;
791 	}
792 
793 	if (pdev->hdr.irq_pin)
794 		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;
795 
796 	vfio_pci_parse_caps(vdev);
797 
798 	return 0;
799 }
800 
801 static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
802 {
803 	int i;
804 	u64 base;
805 	ssize_t hdr_sz;
806 	struct msix_cap *msix;
807 	struct vfio_region_info *info;
808 	struct vfio_pci_device *pdev = &vdev->pci;
809 	struct vfio_region *region;
810 
811 	/* Initialise the BARs */
812 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
813 		if ((u32)i == vdev->info.num_regions)
814 			break;
815 
816 		region = &vdev->regions[i];
817 		/* Construct a fake reg to match what we've mapped. */
818 		if (region->is_ioport) {
819 			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
820 				PCI_BASE_ADDRESS_SPACE_IO;
821 		} else {
822 			base = (region->guest_phys_addr &
823 				PCI_BASE_ADDRESS_MEM_MASK) |
824 				PCI_BASE_ADDRESS_SPACE_MEMORY;
825 		}
826 
827 		pdev->hdr.bar[i] = base;
828 
829 		if (!base)
830 			continue;
831 
832 		pdev->hdr.bar_size[i] = region->info.size;
833 	}
834 
835 	/* I really can't be bothered to support cardbus. */
836 	pdev->hdr.card_bus = 0;
837 
838 	/*
839 	 * Nuke the expansion ROM for now. If we want to do this properly,
840 	 * we need to save its size somewhere and map it into the guest.
841 	 */
842 	pdev->hdr.exp_rom_bar = 0;
843 
844 	/* Plumb in our fake MSI-X capability, if we have it. */
845 	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
846 	if (msix) {
847 		/* Add a shortcut to the PBA region for the MMIO handler */
848 		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
849 		u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET;
850 
851 		pdev->msix_pba.fd_offset = vdev->regions[pba_index].info.offset +
852 					   pba_bar_offset;
853 
854 		/* Tidy up the capability */
855 		msix->table_offset &= PCI_MSIX_TABLE_BIR;
856 		if (pdev->msix_table.bar == pdev->msix_pba.bar) {
857 			/* Keep the same offset as the MSIX cap. */
858 			pdev->msix_pba.bar_offset = pba_bar_offset;
859 		} else {
860 			/* PBA is at the start of the BAR. */
861 			msix->pba_offset &= PCI_MSIX_PBA_BIR;
862 			pdev->msix_pba.bar_offset = 0;
863 		}
864 	}
865 
866 	/* Install our fake Configuration Space */
867 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
868 	/*
869 	 * We don't touch the extended configuration space, let's be cautious
870 	 * and not overwrite it all with zeros, or bad things might happen.
871 	 */
872 	hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
873 	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
874 		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
875 			     hdr_sz);
876 		return -EIO;
877 	}
878 
879 	/* Register callbacks for cfg accesses */
880 	pdev->hdr.cfg_ops = (struct pci_config_operations) {
881 		.read	= vfio_pci_cfg_read,
882 		.write	= vfio_pci_cfg_write,
883 	};
884 
885 	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;
886 
887 	return 0;
888 }
889 
890 static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index,
891 				    struct vfio_region_info *info)
892 {
893 	int ret;
894 
895 	*info = (struct vfio_region_info) {
896 		.argsz = sizeof(*info),
897 		.index = index,
898 	};
899 
900 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
901 	if (ret) {
902 		ret = -errno;
903 		vfio_dev_err(vdev, "cannot get info for BAR %u", index);
904 		return ret;
905 	}
906 
907 	if (info->size && !is_power_of_two(info->size)) {
908 		vfio_dev_err(vdev, "region is not power of two: 0x%llx",
909 				info->size);
910 		return -EINVAL;
911 	}
912 
913 	return 0;
914 }
915 
916 static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
917 {
918 	int ret;
919 	size_t i;
920 	size_t map_size;
921 	size_t nr_entries;
922 	struct vfio_pci_msi_entry *entries;
923 	struct vfio_pci_device *pdev = &vdev->pci;
924 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
925 	struct vfio_pci_msix_table *table = &pdev->msix_table;
926 	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
927 	struct vfio_region_info info;
928 
929 	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
930 	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;
931 
932 	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
933 
934 	/* MSIX table and PBA must support QWORD accesses. */
935 	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, 8);
936 	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), 8);
937 
938 	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
939 	if (!entries)
940 		return -ENOMEM;
941 
942 	for (i = 0; i < nr_entries; i++)
943 		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
944 
945 	ret = vfio_pci_get_region_info(vdev, table->bar, &info);
946 	if (ret)
947 		return ret;
948 	if (!info.size)
949 		return -EINVAL;
950 
951 	map_size = ALIGN(info.size, MAX_PAGE_SIZE);
952 	table->guest_phys_addr = pci_get_mmio_block(map_size);
953 	if (!table->guest_phys_addr) {
954 		pr_err("cannot allocate MMIO space");
955 		ret = -ENOMEM;
956 		goto out_free;
957 	}
958 
959 	/*
960 	 * We could map the physical PBA directly into the guest, but it's
961 	 * likely smaller than a page, and we can only hand full pages to the
962 	 * guest. Even though the PCI spec disallows sharing a page used for
963 	 * MSI-X with any other resource, it allows sharing the same page
964 	 * between the MSI-X table and the PBA. For the sake of isolation,
965 	 * create a virtual PBA.
966 	 */
967 	if (table->bar == pba->bar) {
968 		u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET;
969 
970 		/* Sanity checks. */
971 		if (table->size > pba_bar_offset)
972 			die("MSIX table overlaps with PBA");
973 		if (pba_bar_offset + pba->size > info.size)
974 			die("PBA exceeds the size of the region");
975 		pba->guest_phys_addr = table->guest_phys_addr + pba_bar_offset;
976 	} else {
977 		ret = vfio_pci_get_region_info(vdev, pba->bar, &info);
978 		if (ret)
979 			return ret;
980 		if (!info.size)
981 			return -EINVAL;
982 
983 		map_size = ALIGN(info.size, MAX_PAGE_SIZE);
984 		pba->guest_phys_addr = pci_get_mmio_block(map_size);
985 		if (!pba->guest_phys_addr) {
986 			pr_err("cannot allocate MMIO space");
987 			ret = -ENOMEM;
988 			goto out_free;
989 		}
990 	}
991 
992 	pdev->msix.entries = entries;
993 	pdev->msix.nr_entries = nr_entries;
994 
995 	return 0;
996 
997 out_free:
998 	free(entries);
999 
1000 	return ret;
1001 }
1002 
1003 static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
1004 {
1005 	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);
1006 
1007 	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1),
1008 	pdev->msi.entries = calloc(pdev->msi.nr_entries,
1009 				   sizeof(struct vfio_pci_msi_entry));
1010 	if (!pdev->msi.entries)
1011 		return -ENOMEM;
1012 
1013 	return 0;
1014 }
1015 
1016 static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
1017 				  size_t nr)
1018 {
1019 	int ret;
1020 	u32 bar;
1021 	size_t map_size;
1022 	struct vfio_pci_device *pdev = &vdev->pci;
1023 	struct vfio_region *region;
1024 
1025 	if (nr >= vdev->info.num_regions)
1026 		return 0;
1027 
1028 	region = &vdev->regions[nr];
1029 	bar = pdev->hdr.bar[nr];
1030 
1031 	region->vdev = vdev;
1032 	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);
1033 
1034 	ret = vfio_pci_get_region_info(vdev, nr, &region->info);
1035 	if (ret)
1036 		return ret;
1037 
1038 	/* Ignore invalid or unimplemented regions */
1039 	if (!region->info.size)
1040 		return 0;
1041 
1042 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1043 		/* Trap and emulate MSI-X table */
1044 		if (nr == pdev->msix_table.bar) {
1045 			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
1046 			return 0;
1047 		} else if (nr == pdev->msix_pba.bar) {
1048 			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
1049 			return 0;
1050 		}
1051 	}
1052 
1053 	if (region->is_ioport) {
1054 		region->port_base = pci_get_io_port_block(region->info.size);
1055 	} else {
1056 		/* Grab some MMIO space in the guest */
1057 		map_size = ALIGN(region->info.size, PAGE_SIZE);
1058 		region->guest_phys_addr = pci_get_mmio_block(map_size);
1059 	}
1060 
1061 	return 0;
1062 }
1063 
1064 static int vfio_pci_configure_dev_regions(struct kvm *kvm,
1065 					  struct vfio_device *vdev)
1066 {
1067 	int ret;
1068 	u32 bar;
1069 	size_t i;
1070 	bool is_64bit = false;
1071 	struct vfio_pci_device *pdev = &vdev->pci;
1072 
1073 	ret = vfio_pci_parse_cfg_space(vdev);
1074 	if (ret)
1075 		return ret;
1076 
1077 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1078 		ret = vfio_pci_create_msix_table(kvm, vdev);
1079 		if (ret)
1080 			return ret;
1081 	}
1082 
1083 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
1084 		ret = vfio_pci_create_msi_cap(kvm, pdev);
1085 		if (ret)
1086 			return ret;
1087 	}
1088 
1089 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
1090 		/* Ignore top half of 64-bit BAR */
1091 		if (is_64bit) {
1092 			is_64bit = false;
1093 			continue;
1094 		}
1095 
1096 		ret = vfio_pci_configure_bar(kvm, vdev, i);
1097 		if (ret)
1098 			return ret;
1099 
1100 		bar = pdev->hdr.bar[i];
1101 		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
1102 			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
1103 			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
1104 	}
1105 
1106 	/* We've configured the BARs, fake up a Configuration Space */
1107 	ret = vfio_pci_fixup_cfg_space(vdev);
1108 	if (ret)
1109 		return ret;
1110 
1111 	return pci__register_bar_regions(kvm, &pdev->hdr, vfio_pci_bar_activate,
1112 					 vfio_pci_bar_deactivate, vdev);
1113 }
1114 
1115 /*
1116  * Attempt to update the FD limit if opening an eventfd for each IRQ vector
1117  * would hit the limit, which is likely to happen when a device uses 2048 MSIs.
1118  */
1119 static int vfio_pci_reserve_irq_fds(size_t num)
1120 {
1121 	/*
1122 	 * I counted around 27 fds under normal load. Let's add 100 for good
1123 	 * measure.
1124 	 */
1125 	static size_t needed = 128;
1126 	struct rlimit fd_limit, new_limit;
1127 
1128 	needed += num;
1129 
1130 	if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
1131 		perror("getrlimit(RLIMIT_NOFILE)");
1132 		return 0;
1133 	}
1134 
1135 	if (fd_limit.rlim_cur >= needed)
1136 		return 0;
1137 
1138 	new_limit.rlim_cur = needed;
1139 
1140 	if (fd_limit.rlim_max < needed)
1141 		/* Try to bump hard limit (root only) */
1142 		new_limit.rlim_max = needed;
1143 	else
1144 		new_limit.rlim_max = fd_limit.rlim_max;
1145 
1146 	if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
1147 		perror("setrlimit(RLIMIT_NOFILE)");
1148 		pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
1149 			   (size_t)(needed - fd_limit.rlim_cur));
1150 	}
1151 
1152 	return 0;
1153 }
1154 
1155 static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
1156 			     struct vfio_pci_msi_common *msis)
1157 {
1158 	int ret;
1159 	size_t i;
1160 	int *eventfds;
1161 	size_t irq_set_size;
1162 	struct vfio_pci_msi_entry *entry;
1163 	size_t nr_entries = msis->nr_entries;
1164 
1165 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
1166 	if (ret || msis->info.count == 0) {
1167 		vfio_dev_err(vdev, "no MSI reported by VFIO");
1168 		return -ENODEV;
1169 	}
1170 
1171 	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
1172 		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
1173 		return -EINVAL;
1174 	}
1175 
1176 	if (msis->info.count != nr_entries) {
1177 		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
1178 		return -EINVAL;
1179 	}
1180 
1181 	mutex_init(&msis->mutex);
1182 
1183 	vfio_pci_reserve_irq_fds(nr_entries);
1184 
1185 	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
1186 	msis->irq_set = malloc(irq_set_size);
1187 	if (!msis->irq_set)
1188 		return -ENOMEM;
1189 
1190 	*msis->irq_set = (struct vfio_irq_set) {
1191 		.argsz	= irq_set_size,
1192 		.flags 	= VFIO_IRQ_SET_DATA_EVENTFD |
1193 			  VFIO_IRQ_SET_ACTION_TRIGGER,
1194 		.index 	= msis->info.index,
1195 		.start 	= 0,
1196 		.count 	= nr_entries,
1197 	};
1198 
1199 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
1200 
1201 	for (i = 0; i < nr_entries; i++) {
1202 		entry = &msis->entries[i];
1203 		entry->gsi = -1;
1204 		entry->eventfd = -1;
1205 		msi_set_masked(entry->guest_state, false);
1206 		msi_set_masked(entry->host_state, true);
1207 		eventfds[i] = -1;
1208 	}
1209 
1210 	return 0;
1211 }
1212 
1213 static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
1214 {
1215 	struct vfio_pci_device *pdev = &vdev->pci;
1216 	int gsi = pdev->intx_gsi;
1217 	struct vfio_irq_set irq_set = {
1218 		.argsz	= sizeof(irq_set),
1219 		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
1220 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1221 	};
1222 
1223 	if (pdev->intx_fd == -1)
1224 		return;
1225 
1226 	pr_debug("user requested MSI, disabling INTx %d", gsi);
1227 
1228 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
1229 	irq__del_irqfd(kvm, gsi, pdev->intx_fd);
1230 
1231 	close(pdev->intx_fd);
1232 	close(pdev->unmask_fd);
1233 	pdev->intx_fd = -1;
1234 }
1235 
1236 static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
1237 {
1238 	int ret;
1239 	int trigger_fd, unmask_fd;
1240 	union vfio_irq_eventfd	trigger;
1241 	union vfio_irq_eventfd	unmask;
1242 	struct vfio_pci_device *pdev = &vdev->pci;
1243 	int gsi = pdev->intx_gsi;
1244 
1245 	if (pdev->intx_fd != -1)
1246 		return 0;
1247 
1248 	/*
1249 	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
1250 	 * signals an interrupt from host to guest, and unmask_fd signals the
1251 	 * deassertion of the line from guest to host.
1252 	 */
1253 	trigger_fd = eventfd(0, 0);
1254 	if (trigger_fd < 0) {
1255 		vfio_dev_err(vdev, "failed to create trigger eventfd");
1256 		return trigger_fd;
1257 	}
1258 
1259 	unmask_fd = eventfd(0, 0);
1260 	if (unmask_fd < 0) {
1261 		vfio_dev_err(vdev, "failed to create unmask eventfd");
1262 		close(trigger_fd);
1263 		return unmask_fd;
1264 	}
1265 
1266 	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
1267 	if (ret)
1268 		goto err_close;
1269 
1270 	trigger.irq = (struct vfio_irq_set) {
1271 		.argsz	= sizeof(trigger),
1272 		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
1273 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1274 		.start	= 0,
1275 		.count	= 1,
1276 	};
1277 	set_vfio_irq_eventd_payload(&trigger, trigger_fd);
1278 
1279 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
1280 	if (ret < 0) {
1281 		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
1282 		goto err_delete_line;
1283 	}
1284 
1285 	unmask.irq = (struct vfio_irq_set) {
1286 		.argsz	= sizeof(unmask),
1287 		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
1288 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1289 		.start	= 0,
1290 		.count	= 1,
1291 	};
1292 	set_vfio_irq_eventd_payload(&unmask, unmask_fd);
1293 
1294 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
1295 	if (ret < 0) {
1296 		vfio_dev_err(vdev, "failed to setup unmask IRQ");
1297 		goto err_remove_event;
1298 	}
1299 
1300 	pdev->intx_fd = trigger_fd;
1301 	pdev->unmask_fd = unmask_fd;
1302 
1303 	return 0;
1304 
1305 err_remove_event:
1306 	/* Remove trigger event */
1307 	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
1308 	trigger.irq.count = 0;
1309 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
1310 
1311 err_delete_line:
1312 	irq__del_irqfd(kvm, gsi, trigger_fd);
1313 
1314 err_close:
1315 	close(trigger_fd);
1316 	close(unmask_fd);
1317 	return ret;
1318 }
1319 
1320 static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
1321 {
1322 	int ret;
1323 	struct vfio_pci_device *pdev = &vdev->pci;
1324 	struct vfio_irq_info irq_info = {
1325 		.argsz = sizeof(irq_info),
1326 		.index = VFIO_PCI_INTX_IRQ_INDEX,
1327 	};
1328 
1329 	vfio_pci_reserve_irq_fds(2);
1330 
1331 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
1332 	if (ret || irq_info.count == 0) {
1333 		vfio_dev_err(vdev, "no INTx reported by VFIO");
1334 		return -ENODEV;
1335 	}
1336 
1337 	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
1338 		vfio_dev_err(vdev, "interrupt not eventfd capable");
1339 		return -EINVAL;
1340 	}
1341 
1342 	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
1343 		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
1344 		return -EINVAL;
1345 	}
1346 
1347 	/* The guest is going to overwrite our irq_line... */
1348 	pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
1349 
1350 	pdev->intx_fd = -1;
1351 
1352 	return 0;
1353 }
1354 
1355 static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
1356 {
1357 	int ret = 0;
1358 	struct vfio_pci_device *pdev = &vdev->pci;
1359 
1360 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1361 		pdev->msix.info = (struct vfio_irq_info) {
1362 			.argsz = sizeof(pdev->msix.info),
1363 			.index = VFIO_PCI_MSIX_IRQ_INDEX,
1364 		};
1365 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
1366 		if (ret)
1367 			return ret;
1368 	}
1369 
1370 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
1371 		pdev->msi.info = (struct vfio_irq_info) {
1372 			.argsz = sizeof(pdev->msi.info),
1373 			.index = VFIO_PCI_MSI_IRQ_INDEX,
1374 		};
1375 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
1376 		if (ret)
1377 			return ret;
1378 	}
1379 
1380 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
1381 		pci__assign_irq(&vdev->pci.hdr);
1382 
1383 		ret = vfio_pci_init_intx(kvm, vdev);
1384 		if (ret)
1385 			return ret;
1386 
1387 		ret = vfio_pci_enable_intx(kvm, vdev);
1388 	}
1389 
1390 	return ret;
1391 }
1392 
1393 int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
1394 {
1395 	int ret;
1396 
1397 	ret = vfio_pci_configure_dev_regions(kvm, vdev);
1398 	if (ret) {
1399 		vfio_dev_err(vdev, "failed to configure regions");
1400 		return ret;
1401 	}
1402 
1403 	vdev->dev_hdr = (struct device_header) {
1404 		.bus_type	= DEVICE_BUS_PCI,
1405 		.data		= &vdev->pci.hdr,
1406 	};
1407 
1408 	ret = device__register(&vdev->dev_hdr);
1409 	if (ret) {
1410 		vfio_dev_err(vdev, "failed to register VFIO device");
1411 		return ret;
1412 	}
1413 
1414 	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
1415 	if (ret) {
1416 		vfio_dev_err(vdev, "failed to configure IRQs");
1417 		return ret;
1418 	}
1419 
1420 	return 0;
1421 }
1422 
1423 void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
1424 {
1425 	size_t i;
1426 	struct vfio_pci_device *pdev = &vdev->pci;
1427 
1428 	for (i = 0; i < vdev->info.num_regions; i++)
1429 		vfio_unmap_region(kvm, &vdev->regions[i]);
1430 
1431 	device__unregister(&vdev->dev_hdr);
1432 
1433 	free(pdev->msix.irq_set);
1434 	free(pdev->msix.entries);
1435 	free(pdev->msi.irq_set);
1436 	free(pdev->msi.entries);
1437 }
1438