xref: /kvmtool/vfio/pci.c (revision 25c1dc6c4942ff0949c08780fcad6b324fec6bf7)
1 #include "kvm/irq.h"
2 #include "kvm/kvm.h"
3 #include "kvm/kvm-cpu.h"
4 #include "kvm/vfio.h"
5 
6 #include <assert.h>
7 
8 #include <sys/ioctl.h>
9 #include <sys/eventfd.h>
10 #include <sys/resource.h>
11 #include <sys/time.h>
12 
15 /* Some distros don't have the define. */
16 #ifndef PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1
17 #define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1	12
18 #endif
19 
20 /* Wrapper around UAPI vfio_irq_set */
21 union vfio_irq_eventfd {
22 	struct vfio_irq_set	irq;
23 	u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)];
24 };
25 
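/*
 * Copy the eventfd into the variable-length data[] payload that follows the
 * vfio_irq_set header; the union above reserves room for exactly one fd.
 */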
26 static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd)
27 {
28 	memcpy(&evfd->irq.data, &fd, sizeof(fd));
29 }
30 
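/*
 * Each MSI/MSI-X capability and vector has a virtual state (what the guest
 * programmed) and a physical state (what has actually been applied to the
 * device through VFIO/KVM). The helpers below operate on either one.
 */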
31 #define msi_is_enabled(state)		((state) & VFIO_PCI_MSI_STATE_ENABLED)
32 #define msi_is_masked(state)		((state) & VFIO_PCI_MSI_STATE_MASKED)
33 #define msi_is_empty(state)		((state) & VFIO_PCI_MSI_STATE_EMPTY)
34 
35 #define msi_update_state(state, val, bit)				\
36 	(state) = (val) ? ((state) | (bit)) : ((state) & ~(bit))
37 #define msi_set_enabled(state, val)					\
38 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
39 #define msi_set_masked(state, val)					\
40 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
41 #define msi_set_empty(state, val)					\
42 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)
43 
44 static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
45 static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);
46 
47 static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
48 				bool msix)
49 {
50 	size_t i;
51 	int ret = 0;
52 	int *eventfds;
53 	struct vfio_pci_device *pdev = &vdev->pci;
54 	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
55 	union vfio_irq_eventfd single = {
56 		.irq = {
57 			.argsz	= sizeof(single),
58 			.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
59 				  VFIO_IRQ_SET_ACTION_TRIGGER,
60 			.index	= msis->info.index,
61 			.count	= 1,
62 		},
63 	};
64 
65 	if (!msi_is_enabled(msis->virt_state))
66 		return 0;
67 
68 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
69 		/*
70 		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
71 		 * time. Since INTx has to be enabled from the start (we don't
72 		 * have a reliable way to know when the guest starts using it),
73 		 * disable it now.
74 		 */
75 		vfio_pci_disable_intx(kvm, vdev);
76 
77 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
78 
79 	/*
80 	 * Initial registration of the full range. This enables the physical
81 	 * MSI/MSI-X capability, which might have desired side effects. For
82 	 * instance when assigning virtio legacy devices, enabling the MSI
83 	 * capability modifies the config space layout!
84 	 *
85 	 * As an optimization, only update the MSIs when the guest unmasks the
86 	 * capability. This greatly reduces the initialization time for a Linux
87 	 * guest with 2048+ MSIs: Linux starts by enabling the MSI-X capability
88 	 * masked, then fills in the individual vectors, then unmasks the whole
89 	 * function. So we only do one VFIO ioctl when enabling for the first
90 	 * time, and one more when unmasking.
91 	 *
92 	 * phys_state is empty when it is enabled but no vector has been
93 	 * registered via SET_IRQS yet.
94 	 */
95 	if (!msi_is_enabled(msis->phys_state) ||
96 	    (!msi_is_masked(msis->virt_state) &&
97 	     msi_is_empty(msis->phys_state))) {
98 		bool empty = true;
99 
100 		for (i = 0; i < msis->nr_entries; i++) {
101 			eventfds[i] = msis->entries[i].gsi >= 0 ?
102 				      msis->entries[i].eventfd : -1;
103 
104 			if (eventfds[i] >= 0)
105 				empty = false;
106 		}
107 
108 		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
109 		if (ret < 0) {
110 			perror("VFIO_DEVICE_SET_IRQS(multi)");
111 			return ret;
112 		}
113 
114 		msi_set_enabled(msis->phys_state, true);
115 		msi_set_empty(msis->phys_state, empty);
116 
117 		return 0;
118 	}
119 
120 	if (msi_is_masked(msis->virt_state)) {
121 		/* TODO: if phys_state is neither empty nor masked, mask all vectors */
122 		return 0;
123 	}
124 
125 	/* Update individual vectors to avoid breaking those in use */
126 	for (i = 0; i < msis->nr_entries; i++) {
127 		struct vfio_pci_msi_entry *entry = &msis->entries[i];
128 		int fd = entry->gsi >= 0 ? entry->eventfd : -1;
129 
130 		if (fd == eventfds[i])
131 			continue;
132 
133 		single.irq.start = i;
134 		set_vfio_irq_eventd_payload(&single, fd);
135 
136 		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
137 		if (ret < 0) {
138 			perror("VFIO_DEVICE_SET_IRQS(single)");
139 			break;
140 		}
141 
142 		eventfds[i] = fd;
143 
144 		if (msi_is_empty(msis->phys_state) && fd >= 0)
145 			msi_set_empty(msis->phys_state, false);
146 	}
147 
148 	return ret;
149 }
150 
151 static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
152 				 bool msix)
153 {
154 	int ret;
155 	struct vfio_pci_device *pdev = &vdev->pci;
156 	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
157 	struct vfio_irq_set irq_set = {
158 		.argsz	= sizeof(irq_set),
159 		.flags 	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
160 		.index 	= msis->info.index,
161 		.start 	= 0,
162 		.count	= 0,
163 	};
164 
165 	if (!msi_is_enabled(msis->phys_state))
166 		return 0;
167 
168 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
169 	if (ret < 0) {
170 		perror("VFIO_DEVICE_SET_IRQS(NONE)");
171 		return ret;
172 	}
173 
174 	msi_set_enabled(msis->phys_state, false);
175 	msi_set_empty(msis->phys_state, true);
176 
177 	/*
178 	 * Disabling MSI/MSI-X may happen because the guest's PCI driver
179 	 * detected an MSI delivery failure and wants to fall back to INTx
180 	 * mode. Re-enable INTx here if the device supports it, so that the
181 	 * fallback actually works.
182 	 */
183 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
184 		ret = vfio_pci_enable_intx(kvm, vdev);
185 
186 	return ret >= 0 ? 0 : ret;
187 }
188 
189 static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
190 				     struct vfio_pci_msi_entry *entry)
191 {
192 	int ret;
193 
194 	if (entry->eventfd < 0) {
195 		entry->eventfd = eventfd(0, 0);
196 		if (entry->eventfd < 0) {
197 			ret = -errno;
198 			vfio_dev_err(vdev, "cannot create eventfd");
199 			return ret;
200 		}
201 	}
202 
203 	/* Allocate IRQ if necessary */
204 	if (entry->gsi < 0) {
205 		int ret = irq__add_msix_route(kvm, &entry->config.msg,
206 					      vdev->dev_hdr.dev_num << 3);
207 		if (ret < 0) {
208 			vfio_dev_err(vdev, "cannot create MSI-X route");
209 			return ret;
210 		}
211 		entry->gsi = ret;
212 	} else {
213 		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
214 	}
215 
216 	/*
217 	 * MSI masking is unimplemented in VFIO, so we have to handle it by
218 	 * disabling/enabling the IRQ route instead. We do it on the KVM side
219 	 * rather than in VFIO because:
220 	 * - it is 8x faster
221 	 * - it lets us decouple the masking logic from the capability state.
222 	 * - in the masked state, after removing the irqfd route, we could
223 	 *   easily plug the eventfd into a local handler in order to serve
224 	 *   Pending Bit reads to the guest.
225 	 *
226 	 * So entry->phys_state is masked when there is no active irqfd route.
227 	 */
228 	if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
229 		return 0;
230 
231 	if (msi_is_masked(entry->phys_state)) {
232 		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
233 		if (ret < 0) {
234 			vfio_dev_err(vdev, "cannot setup irqfd");
235 			return ret;
236 		}
237 	} else {
238 		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
239 	}
240 
241 	msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));
242 
243 	return 0;
244 }
245 
246 static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
247 				     u32 len, u8 is_write, void *ptr)
248 {
249 	struct vfio_pci_device *pdev = ptr;
250 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
251 	u64 offset = addr - pba->guest_phys_addr;
252 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
253 
254 	if (is_write)
255 		return;
256 
257 	/*
258 	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA
259 	 * is completely useless here. Note that Linux doesn't use PBA.
260 	 */
261 	if (pread(vdev->fd, data, len, pba->offset + offset) != (ssize_t)len)
262 		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
263 }
264 
265 static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
266 				       u32 len, u8 is_write, void *ptr)
267 {
268 	struct kvm *kvm = vcpu->kvm;
269 	struct vfio_pci_msi_entry *entry;
270 	struct vfio_pci_device *pdev = ptr;
271 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
272 
273 	u64 offset = addr - pdev->msix_table.guest_phys_addr;
274 
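	/*
	 * Each MSI-X table entry is 16 bytes: Message Address (8 bytes),
	 * Message Data (4) and Vector Control (4). 'vector' selects the entry
	 * and 'field' is the offset within it.
	 */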
275 	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
276 	off_t field = offset % PCI_MSIX_ENTRY_SIZE;
277 
278 	/*
279 	 * The PCI spec says that software must use aligned 4- or 8-byte
280 	 * accesses for the MSI-X tables.
281 	 */
282 	if ((len != 4 && len != 8) || addr & (len - 1)) {
283 		vfio_dev_warn(vdev, "invalid MSI-X table access");
284 		return;
285 	}
286 
287 	entry = &pdev->msix.entries[vector];
288 
289 	mutex_lock(&pdev->msix.mutex);
290 
291 	if (!is_write) {
292 		memcpy(data, (void *)&entry->config + field, len);
293 		goto out_unlock;
294 	}
295 
296 	memcpy((void *)&entry->config + field, data, len);
297 
298 	/*
299 	 * Check if access touched the vector control register, which is at the
300 	 * end of the MSI-X entry.
301 	 */
302 	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
303 		goto out_unlock;
304 
305 	msi_set_masked(entry->virt_state, entry->config.ctrl &
306 		       PCI_MSIX_ENTRY_CTRL_MASKBIT);
307 
308 	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
309 		/* Not much we can do here. */
310 		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);
311 
312 	/* Update the physical capability if necessary */
313 	if (vfio_pci_enable_msis(kvm, vdev, true))
314 		vfio_dev_err(vdev, "cannot enable MSIX");
315 
316 out_unlock:
317 	mutex_unlock(&pdev->msix.mutex);
318 }
319 
320 static void vfio_pci_msix_cap_write(struct kvm *kvm,
321 				    struct vfio_device *vdev, u16 off,
322 				    void *data, int sz)
323 {
324 	struct vfio_pci_device *pdev = &vdev->pci;
325 	off_t enable_pos = PCI_MSIX_FLAGS + 1;
326 	bool enable;
327 	u16 flags;
328 
329 	off -= pdev->msix.pos;
330 
331 	/* Check if access intersects with the MSI-X Enable bit */
332 	if (off > enable_pos || off + sz <= enable_pos)
333 		return;
334 
335 	/* Read byte that contains the Enable bit */
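	/*
	 * That byte is the upper half of the Message Control word; shift it up
	 * so the PCI_MSIX_FLAGS_* masks can be applied directly.
	 */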
336 	flags = *(u8 *)(data + enable_pos - off) << 8;
337 
338 	mutex_lock(&pdev->msix.mutex);
339 
340 	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
341 	enable = flags & PCI_MSIX_FLAGS_ENABLE;
342 	msi_set_enabled(pdev->msix.virt_state, enable);
343 
344 	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
345 		vfio_dev_err(vdev, "cannot enable MSIX");
346 	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
347 		vfio_dev_err(vdev, "cannot disable MSIX");
348 
349 	mutex_unlock(&pdev->msix.mutex);
350 }
351 
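/*
 * Handle a config space write that touches the per-vector Mask Bits register
 * of the MSI capability. Returns 1 if the access was handled here, 0 if the
 * caller should keep processing it.
 */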
352 static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
353 				     u16 off, u8 *data, u32 sz)
354 {
355 	size_t i;
356 	u32 mask = 0;
357 	size_t mask_pos, start, limit;
358 	struct vfio_pci_msi_entry *entry;
359 	struct vfio_pci_device *pdev = &vdev->pci;
360 	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
361 
362 	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
363 		return 0;
364 
365 	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
366 		mask_pos = PCI_MSI_MASK_64;
367 	else
368 		mask_pos = PCI_MSI_MASK_32;
369 
370 	if (off >= mask_pos + 4 || off + sz <= mask_pos)
371 		return 0;
372 
373 	/* Set mask to current state */
374 	for (i = 0; i < pdev->msi.nr_entries; i++) {
375 		entry = &pdev->msi.entries[i];
376 		mask |= !!msi_is_masked(entry->virt_state) << i;
377 	}
378 
379 	/* Update mask following the intersection of access and register */
380 	start = max_t(size_t, off, mask_pos);
381 	limit = min_t(size_t, off + sz, mask_pos + 4);
382 
383 	memcpy((void *)&mask + start - mask_pos, data + start - off,
384 	       limit - start);
385 
386 	/* Update states if necessary */
387 	for (i = 0; i < pdev->msi.nr_entries; i++) {
388 		bool masked = mask & (1 << i);
389 
390 		entry = &pdev->msi.entries[i];
391 		if (masked != msi_is_masked(entry->virt_state)) {
392 			msi_set_masked(entry->virt_state, masked);
393 			vfio_pci_update_msi_entry(kvm, vdev, entry);
394 		}
395 	}
396 
397 	return 1;
398 }
399 
400 static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
401 				   u16 off, u8 *data, u32 sz)
402 {
403 	u8 ctrl;
404 	struct msi_msg msg;
405 	size_t i, nr_vectors;
406 	struct vfio_pci_msi_entry *entry;
407 	struct vfio_pci_device *pdev = &vdev->pci;
408 	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
409 
410 	off -= pdev->msi.pos;
411 
412 	mutex_lock(&pdev->msi.mutex);
413 
414 	/* Check if the guest is trying to update mask bits */
415 	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
416 		goto out_unlock;
417 
418 	/* Only modify routes when guest pokes the enable bit */
419 	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
420 		goto out_unlock;
421 
422 	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);
423 
424 	msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE);
425 
426 	if (!msi_is_enabled(pdev->msi.virt_state)) {
427 		vfio_pci_disable_msis(kvm, vdev, false);
428 		goto out_unlock;
429 	}
430 
431 	/* Create routes for the requested vectors */
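	/* The QSIZE field (Multiple Message Enable) encodes log2 of the vector count. */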
432 	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);
433 
434 	msg.address_lo = msi_cap_64->address_lo;
435 	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
436 		msg.address_hi = msi_cap_64->address_hi;
437 		msg.data = msi_cap_64->data;
438 	} else {
439 		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
440 		msg.address_hi = 0;
441 		msg.data = msi_cap_32->data;
442 	}
443 
444 	for (i = 0; i < nr_vectors; i++) {
445 		entry = &pdev->msi.entries[i];
446 
447 		/*
448 		 * Set the MSI data value as required by the PCI local
449 		 * bus specifications, MSI capability, "Message Data".
450 		 */
451 		msg.data &= ~(nr_vectors - 1);
452 		msg.data |= i;
453 
454 		entry->config.msg = msg;
455 		vfio_pci_update_msi_entry(kvm, vdev, entry);
456 	}
457 
458 	/* Update the physical capability if necessary */
459 	if (vfio_pci_enable_msis(kvm, vdev, false))
460 		vfio_dev_err(vdev, "cannot enable MSI");
461 
462 out_unlock:
463 	mutex_unlock(&pdev->msi.mutex);
464 }
465 
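/*
 * BAR activation callback: install trap-and-emulate MMIO handlers for the
 * MSI-X table and PBA, and map any other region directly into the guest.
 */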
466 static int vfio_pci_bar_activate(struct kvm *kvm,
467 				 struct pci_device_header *pci_hdr,
468 				 int bar_num, void *data)
469 {
470 	struct vfio_device *vdev = data;
471 	struct vfio_pci_device *pdev = &vdev->pci;
472 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
473 	struct vfio_pci_msix_table *table = &pdev->msix_table;
474 	struct vfio_region *region;
475 	u32 bar_addr;
476 	bool has_msix;
477 	int ret;
478 
479 	assert((u32)bar_num < vdev->info.num_regions);
480 
481 	region = &vdev->regions[bar_num];
482 	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;
483 
484 	bar_addr = pci__bar_address(pci_hdr, bar_num);
485 	if (pci__bar_is_io(pci_hdr, bar_num))
486 		region->port_base = bar_addr;
487 	else
488 		region->guest_phys_addr = bar_addr;
489 
490 	if (has_msix && (u32)bar_num == table->bar) {
491 		table->guest_phys_addr = region->guest_phys_addr;
492 		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
493 					 table->size, false,
494 					 vfio_pci_msix_table_access, pdev);
495 		/*
496 		 * The MSIX table and the PBA structure can share the same BAR,
497 		 * but for convenience we register different regions for mmio
498 		 * emulation. We want to update both if they share the same
499 		 * BAR.
500 		 */
501 		if (ret < 0 || table->bar != pba->bar)
502 			goto out;
503 	}
504 
505 	if (has_msix && (u32)bar_num == pba->bar) {
506 		if (pba->bar == table->bar)
507 			pba->guest_phys_addr = table->guest_phys_addr + table->size;
508 		else
509 			pba->guest_phys_addr = region->guest_phys_addr;
510 		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
511 					 pba->size, false,
512 					 vfio_pci_msix_pba_access, pdev);
513 		goto out;
514 	}
515 
516 	ret = vfio_map_region(kvm, vdev, region);
517 out:
518 	return ret;
519 }
520 
521 static int vfio_pci_bar_deactivate(struct kvm *kvm,
522 				   struct pci_device_header *pci_hdr,
523 				   int bar_num, void *data)
524 {
525 	struct vfio_device *vdev = data;
526 	struct vfio_pci_device *pdev = &vdev->pci;
527 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
528 	struct vfio_pci_msix_table *table = &pdev->msix_table;
529 	struct vfio_region *region;
530 	bool has_msix, success;
531 	int ret;
532 
533 	assert((u32)bar_num < vdev->info.num_regions);
534 
535 	region = &vdev->regions[bar_num];
536 	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;
537 
538 	if (has_msix && (u32)bar_num == table->bar) {
539 		success = kvm__deregister_mmio(kvm, table->guest_phys_addr);
540 		/* kvm__deregister_mmio fails when the region is not found. */
541 		ret = (success ? 0 : -ENOENT);
542 		/* See vfio_pci_bar_activate(). */
543 		if (ret < 0 || table->bar != pba->bar)
544 			goto out;
545 	}
546 
547 	if (has_msix && (u32)bar_num == pba->bar) {
548 		success = kvm__deregister_mmio(kvm, pba->guest_phys_addr);
549 		ret = (success ? 0 : -ENOENT);
550 		goto out;
551 	}
552 
553 	vfio_unmap_region(kvm, region);
554 	ret = 0;
555 
556 out:
557 	return ret;
558 }
559 
560 static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
561 			      u16 offset, void *data, int sz)
562 {
563 	struct vfio_region_info *info;
564 	struct vfio_pci_device *pdev;
565 	struct vfio_device *vdev;
566 	char base[sz];
567 
568 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
569 	vdev = container_of(pdev, struct vfio_device, pci);
570 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
571 
572 	/* Dummy read in case of side-effects */
573 	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
574 		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
575 			      sz, offset);
576 }
577 
578 static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
579 			       u16 offset, void *data, int sz)
580 {
581 	struct vfio_region_info *info;
582 	struct vfio_pci_device *pdev;
583 	struct vfio_device *vdev;
584 	u32 tmp;
585 
586 	/* Make sure a larger size will not overrun tmp on the stack. */
587 	assert(sz <= 4);
588 
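	/*
	 * The expansion ROM BAR is not emulated (vfio_pci_fixup_cfg_space()
	 * clears it), so writes to it are ignored.
	 */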
589 	if (offset == PCI_ROM_ADDRESS)
590 		return;
591 
592 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
593 	vdev = container_of(pdev, struct vfio_device, pci);
594 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
595 
596 	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
597 		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
598 			      sz, offset);
599 
600 	/* Handle MSI write now, since it might update the hardware capability */
601 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
602 		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);
603 
604 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
605 		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);
606 
607 	if (pread(vdev->fd, &tmp, sz, info->offset + offset) != sz)
608 		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
609 			      sz, offset);
610 }
611 
612 static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
613 {
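	/*
	 * A 32-bit MSI capability without per-vector masking is 10 bytes long.
	 * A 64-bit address adds 4 bytes; per-vector masking adds another 10
	 * for the Mask and Pending registers (including alignment padding).
	 */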
614 	size_t size = 10;
615 
616 	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
617 		size += 4;
618 	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
619 		size += 10;
620 
621 	return size;
622 }
623 
624 static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
625 {
626 	switch (cap_hdr->type) {
627 	case PCI_CAP_ID_MSIX:
628 		return PCI_CAP_MSIX_SIZEOF;
629 	case PCI_CAP_ID_MSI:
630 		return vfio_pci_msi_cap_size((void *)cap_hdr);
631 	case PCI_CAP_ID_EXP:
632 		/*
633 		 * We don't emulate any of the link, slot and root complex
634 		 * properties, so ignore them.
635 		 */
636 		return PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1;
637 	default:
638 		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
639 		return 0;
640 	}
641 }
642 
643 static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
644 			    struct pci_cap_hdr *cap, off_t pos)
645 {
646 	struct pci_cap_hdr *last;
647 	struct pci_device_header *hdr = &vdev->pci.hdr;
648 
649 	cap->next = 0;
650 
651 	if (!hdr->capabilities) {
652 		hdr->capabilities = pos;
653 		hdr->status |= PCI_STATUS_CAP_LIST;
654 	} else {
655 		last = PCI_CAP(virt_hdr, hdr->capabilities);
656 
657 		while (last->next)
658 			last = PCI_CAP(virt_hdr, last->next);
659 
660 		last->next = pos;
661 	}
662 
663 	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));
664 
665 	return 0;
666 }
667 
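/*
 * Rebuild the capability list in a scratch copy of the config space, keeping
 * only the capabilities we emulate (MSI, MSI-X and, where the architecture
 * supports it, PCI Express), then copy the result back over the real header
 * so the guest never sees the others.
 */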
668 static int vfio_pci_parse_caps(struct vfio_device *vdev)
669 {
670 	int ret;
671 	size_t size;
672 	u16 pos, next;
673 	struct pci_cap_hdr *cap;
674 	u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY];
675 	struct vfio_pci_device *pdev = &vdev->pci;
676 
677 	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
678 		return 0;
679 
680 	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY);
681 
682 	pos = pdev->hdr.capabilities & ~3;
683 
684 	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
685 	pdev->hdr.capabilities = 0;
686 
687 	for (; pos; pos = next) {
688 		cap = PCI_CAP(&pdev->hdr, pos);
689 		next = cap->next;
690 
691 		switch (cap->type) {
692 		case PCI_CAP_ID_MSIX:
693 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
694 			if (ret)
695 				return ret;
696 
697 			pdev->msix.pos = pos;
698 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
699 			break;
700 		case PCI_CAP_ID_MSI:
701 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
702 			if (ret)
703 				return ret;
704 
705 			pdev->msi.pos = pos;
706 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
707 			break;
708 		case PCI_CAP_ID_EXP:
709 			if (!arch_has_pci_exp())
710 				continue;
711 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
712 			if (ret)
713 				return ret;
714 			break;
715 		}
716 	}
717 
718 	/* Wipe remaining capabilities */
719 	pos = PCI_STD_HEADER_SIZEOF;
720 	size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF;
721 	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
722 
723 	return 0;
724 }
725 
726 static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
727 {
728 	ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
729 	struct vfio_region_info *info;
730 	struct vfio_pci_device *pdev = &vdev->pci;
731 
732 	if (vdev->info.num_regions <= VFIO_PCI_CONFIG_REGION_INDEX) {
733 		vfio_dev_err(vdev, "Config Space not found");
734 		return -ENODEV;
735 	}
736 
737 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
738 	*info = (struct vfio_region_info) {
739 			.argsz = sizeof(*info),
740 			.index = VFIO_PCI_CONFIG_REGION_INDEX,
741 	};
742 
743 	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
744 	if (!info->size) {
745 		vfio_dev_err(vdev, "Config Space has size zero?!");
746 		return -EINVAL;
747 	}
748 
749 	/* Read standard headers and capabilities */
750 	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
751 		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
752 		return -EIO;
753 	}
754 
755 	/* Strip bit 7, which indicates a multifunction device */
756 	pdev->hdr.header_type &= 0x7f;
757 
758 	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
759 		vfio_dev_err(vdev, "unsupported header type %u",
760 			     pdev->hdr.header_type);
761 		return -EOPNOTSUPP;
762 	}
763 
764 	if (pdev->hdr.irq_pin)
765 		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;
766 
767 	vfio_pci_parse_caps(vdev);
768 
769 	return 0;
770 }
771 
772 static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
773 {
774 	int i;
775 	u64 base;
776 	ssize_t hdr_sz;
777 	struct msix_cap *msix;
778 	struct vfio_region_info *info;
779 	struct vfio_pci_device *pdev = &vdev->pci;
780 	struct vfio_region *region;
781 
782 	/* Initialise the BARs */
783 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
784 		if ((u32)i == vdev->info.num_regions)
785 			break;
786 
787 		region = &vdev->regions[i];
788 		/* Construct a fake reg to match what we've mapped. */
789 		if (region->is_ioport) {
790 			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
791 				PCI_BASE_ADDRESS_SPACE_IO;
792 		} else {
793 			base = (region->guest_phys_addr &
794 				PCI_BASE_ADDRESS_MEM_MASK) |
795 				PCI_BASE_ADDRESS_SPACE_MEMORY;
796 		}
797 
798 		pdev->hdr.bar[i] = base;
799 
800 		if (!base)
801 			continue;
802 
803 		pdev->hdr.bar_size[i] = region->info.size;
804 	}
805 
806 	/* I really can't be bothered to support cardbus. */
807 	pdev->hdr.card_bus = 0;
808 
809 	/*
810 	 * Nuke the expansion ROM for now. If we want to do this properly,
811 	 * we need to save its size somewhere and map it into the guest.
812 	 */
813 	pdev->hdr.exp_rom_bar = 0;
814 
815 	/* Plumb in our fake MSI-X capability, if we have it. */
816 	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
817 	if (msix) {
818 		/* Add a shortcut to the PBA region for the MMIO handler */
819 		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
820 		pdev->msix_pba.offset = vdev->regions[pba_index].info.offset +
821 					(msix->pba_offset & PCI_MSIX_PBA_OFFSET);
822 
823 		/* Tidy up the capability */
824 		msix->table_offset &= PCI_MSIX_TABLE_BIR;
825 		msix->pba_offset &= PCI_MSIX_PBA_BIR;
826 		if (pdev->msix_table.bar == pdev->msix_pba.bar)
827 			msix->pba_offset |= pdev->msix_table.size &
828 					    PCI_MSIX_PBA_OFFSET;
829 	}
830 
831 	/* Install our fake Configuration Space */
832 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
833 	/*
834 	 * We don't touch the extended configuration space; let's be cautious
835 	 * and not overwrite it all with zeros, or bad things might happen.
836 	 */
837 	hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
838 	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
839 		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
840 			     hdr_sz);
841 		return -EIO;
842 	}
843 
844 	/* Register callbacks for cfg accesses */
845 	pdev->hdr.cfg_ops = (struct pci_config_operations) {
846 		.read	= vfio_pci_cfg_read,
847 		.write	= vfio_pci_cfg_write,
848 	};
849 
850 	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;
851 
852 	return 0;
853 }
854 
855 static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index,
856 				    struct vfio_region_info *info)
857 {
858 	int ret;
859 
860 	*info = (struct vfio_region_info) {
861 		.argsz = sizeof(*info),
862 		.index = index,
863 	};
864 
865 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
866 	if (ret) {
867 		ret = -errno;
868 		vfio_dev_err(vdev, "cannot get info for BAR %u", index);
869 		return ret;
870 	}
871 
872 	if (info->size && !is_power_of_two(info->size)) {
873 		vfio_dev_err(vdev, "region is not power of two: 0x%llx",
874 				info->size);
875 		return -EINVAL;
876 	}
877 
878 	return 0;
879 }
880 
881 static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
882 {
883 	int ret;
884 	size_t i;
885 	size_t map_size;
886 	size_t nr_entries;
887 	struct vfio_pci_msi_entry *entries;
888 	struct vfio_pci_device *pdev = &vdev->pci;
889 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
890 	struct vfio_pci_msix_table *table = &pdev->msix_table;
891 	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
892 	struct vfio_region_info info;
893 
894 	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
895 	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;
896 
897 	/*
898 	 * KVM needs memory regions to be a multiple of, and aligned on, PAGE_SIZE.
899 	 */
900 	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
901 	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
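	/* The PBA needs one pending bit per vector; page alignment makes it a single page here. */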
902 	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), PAGE_SIZE);
903 
904 	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
905 	if (!entries)
906 		return -ENOMEM;
907 
908 	for (i = 0; i < nr_entries; i++)
909 		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
910 
911 	ret = vfio_pci_get_region_info(vdev, table->bar, &info);
912 	if (ret)
913 		return ret;
914 	if (!info.size)
915 		return -EINVAL;
916 	map_size = info.size;
917 
918 	if (table->bar != pba->bar) {
919 		ret = vfio_pci_get_region_info(vdev, pba->bar, &info);
920 		if (ret)
921 			return ret;
922 		if (!info.size)
923 			return -EINVAL;
924 		map_size += info.size;
925 	}
926 
927 	/*
928 	 * To ease MSI-X cap configuration when the table and the pending array
929 	 * share the same BAR, collapse them into a single MMIO allocation. The
930 	 * size of the BAR regions must be a power of two.
931 	 */
932 	map_size = ALIGN(map_size, PAGE_SIZE);
933 	table->guest_phys_addr = pci_get_mmio_block(map_size);
934 	if (!table->guest_phys_addr) {
935 		pr_err("cannot allocate MMIO space");
936 		ret = -ENOMEM;
937 		goto out_free;
938 	}
939 
940 	/*
941 	 * We could map the physical PBA directly into the guest, but it's
942 	 * likely smaller than a page, and we can only hand full pages to the
943 	 * guest. Even though the PCI spec disallows sharing a page used for
944 	 * MSI-X with any other resource, it does allow the MSI-X table and the
945 	 * PBA to share the same page. For the sake of isolation, create a
946 	 * virtual PBA.
947 	 */
948 	pba->guest_phys_addr = table->guest_phys_addr + table->size;
949 
950 	pdev->msix.entries = entries;
951 	pdev->msix.nr_entries = nr_entries;
952 
953 	return 0;
954 
955 out_free:
956 	free(entries);
957 
958 	return ret;
959 }
960 
961 static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
962 {
963 	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);
964 
965 	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
966 	pdev->msi.entries = calloc(pdev->msi.nr_entries,
967 				   sizeof(struct vfio_pci_msi_entry));
968 	if (!pdev->msi.entries)
969 		return -ENOMEM;
970 
971 	return 0;
972 }
973 
974 static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
975 				  size_t nr)
976 {
977 	int ret;
978 	u32 bar;
979 	size_t map_size;
980 	struct vfio_pci_device *pdev = &vdev->pci;
981 	struct vfio_region *region;
982 
983 	if (nr >= vdev->info.num_regions)
984 		return 0;
985 
986 	region = &vdev->regions[nr];
987 	bar = pdev->hdr.bar[nr];
988 
989 	region->vdev = vdev;
990 	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);
991 
992 	ret = vfio_pci_get_region_info(vdev, nr, &region->info);
993 	if (ret)
994 		return ret;
995 
996 	/* Ignore invalid or unimplemented regions */
997 	if (!region->info.size)
998 		return 0;
999 
1000 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1001 		/* Trap and emulate MSI-X table */
1002 		if (nr == pdev->msix_table.bar) {
1003 			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
1004 			return 0;
1005 		} else if (nr == pdev->msix_pba.bar) {
1006 			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
1007 			return 0;
1008 		}
1009 	}
1010 
1011 	if (region->is_ioport) {
1012 		region->port_base = pci_get_io_port_block(region->info.size);
1013 	} else {
1014 		/* Grab some MMIO space in the guest */
1015 		map_size = ALIGN(region->info.size, PAGE_SIZE);
1016 		region->guest_phys_addr = pci_get_mmio_block(map_size);
1017 	}
1018 
1019 	return 0;
1020 }
1021 
1022 static int vfio_pci_configure_dev_regions(struct kvm *kvm,
1023 					  struct vfio_device *vdev)
1024 {
1025 	int ret;
1026 	u32 bar;
1027 	size_t i;
1028 	bool is_64bit = false;
1029 	struct vfio_pci_device *pdev = &vdev->pci;
1030 
1031 	ret = vfio_pci_parse_cfg_space(vdev);
1032 	if (ret)
1033 		return ret;
1034 
1035 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1036 		ret = vfio_pci_create_msix_table(kvm, vdev);
1037 		if (ret)
1038 			return ret;
1039 	}
1040 
1041 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
1042 		ret = vfio_pci_create_msi_cap(kvm, pdev);
1043 		if (ret)
1044 			return ret;
1045 	}
1046 
1047 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
1048 		/* Ignore top half of 64-bit BAR */
1049 		if (is_64bit) {
1050 			is_64bit = false;
1051 			continue;
1052 		}
1053 
1054 		ret = vfio_pci_configure_bar(kvm, vdev, i);
1055 		if (ret)
1056 			return ret;
1057 
1058 		bar = pdev->hdr.bar[i];
1059 		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
1060 			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
1061 			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
1062 	}
1063 
1064 	/* We've configured the BARs, fake up a Configuration Space */
1065 	ret = vfio_pci_fixup_cfg_space(vdev);
1066 	if (ret)
1067 		return ret;
1068 
1069 	return pci__register_bar_regions(kvm, &pdev->hdr, vfio_pci_bar_activate,
1070 					 vfio_pci_bar_deactivate, vdev);
1071 }
1072 
1073 /*
1074  * Attempt to raise the FD limit if opening an eventfd for each IRQ vector
1075  * would hit it, which is likely to happen when a device uses 2048 MSIs.
1076  */
1077 static int vfio_pci_reserve_irq_fds(size_t num)
1078 {
1079 	/*
1080 	 * I counted around 27 fds under normal load. Let's add 100 for good
1081 	 * measure.
1082 	 */
1083 	static size_t needed = 128;
1084 	struct rlimit fd_limit, new_limit;
1085 
1086 	needed += num;
1087 
1088 	if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
1089 		perror("getrlimit(RLIMIT_NOFILE)");
1090 		return 0;
1091 	}
1092 
1093 	if (fd_limit.rlim_cur >= needed)
1094 		return 0;
1095 
1096 	new_limit.rlim_cur = needed;
1097 
1098 	if (fd_limit.rlim_max < needed)
1099 		/* Try to bump hard limit (root only) */
1100 		new_limit.rlim_max = needed;
1101 	else
1102 		new_limit.rlim_max = fd_limit.rlim_max;
1103 
1104 	if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
1105 		perror("setrlimit(RLIMIT_NOFILE)");
1106 		pr_warning("not enough FDs for full MSI-X support (estimated shortfall: %zu)",
1107 			   (size_t)(needed - fd_limit.rlim_cur));
1108 	}
1109 
1110 	return 0;
1111 }
1112 
1113 static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
1114 			     struct vfio_pci_msi_common *msis)
1115 {
1116 	int ret;
1117 	size_t i;
1118 	int *eventfds;
1119 	size_t irq_set_size;
1120 	struct vfio_pci_msi_entry *entry;
1121 	size_t nr_entries = msis->nr_entries;
1122 
1123 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
1124 	if (ret || msis->info.count == 0) {
1125 		vfio_dev_err(vdev, "no MSI reported by VFIO");
1126 		return -ENODEV;
1127 	}
1128 
1129 	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
1130 		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
1131 		return -EINVAL;
1132 	}
1133 
1134 	if (msis->info.count != nr_entries) {
1135 		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
1136 		return -EINVAL;
1137 	}
1138 
1139 	mutex_init(&msis->mutex);
1140 
1141 	vfio_pci_reserve_irq_fds(nr_entries);
1142 
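	/*
	 * Pre-allocate a vfio_irq_set covering every vector, with one eventfd
	 * slot per vector; vfio_pci_enable_msis() reuses this buffer for the
	 * initial full-range VFIO_DEVICE_SET_IRQS call.
	 */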
1143 	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
1144 	msis->irq_set = malloc(irq_set_size);
1145 	if (!msis->irq_set)
1146 		return -ENOMEM;
1147 
1148 	*msis->irq_set = (struct vfio_irq_set) {
1149 		.argsz	= irq_set_size,
1150 		.flags 	= VFIO_IRQ_SET_DATA_EVENTFD |
1151 			  VFIO_IRQ_SET_ACTION_TRIGGER,
1152 		.index 	= msis->info.index,
1153 		.start 	= 0,
1154 		.count 	= nr_entries,
1155 	};
1156 
1157 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
1158 
1159 	for (i = 0; i < nr_entries; i++) {
1160 		entry = &msis->entries[i];
1161 		entry->gsi = -1;
1162 		entry->eventfd = -1;
1163 		msi_set_masked(entry->virt_state, true);
1164 		msi_set_masked(entry->phys_state, true);
1165 		eventfds[i] = -1;
1166 	}
1167 
1168 	return 0;
1169 }
1170 
1171 static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
1172 {
1173 	struct vfio_pci_device *pdev = &vdev->pci;
1174 	int gsi = pdev->intx_gsi;
1175 	struct vfio_irq_set irq_set = {
1176 		.argsz	= sizeof(irq_set),
1177 		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
1178 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1179 	};
1180 
1181 	if (pdev->intx_fd == -1)
1182 		return;
1183 
1184 	pr_debug("user requested MSI, disabling INTx %d", gsi);
1185 
1186 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
1187 	irq__del_irqfd(kvm, gsi, pdev->intx_fd);
1188 
1189 	close(pdev->intx_fd);
1190 	close(pdev->unmask_fd);
1191 	pdev->intx_fd = -1;
1192 }
1193 
1194 static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
1195 {
1196 	int ret;
1197 	int trigger_fd, unmask_fd;
1198 	union vfio_irq_eventfd	trigger;
1199 	union vfio_irq_eventfd	unmask;
1200 	struct vfio_pci_device *pdev = &vdev->pci;
1201 	int gsi = pdev->intx_gsi;
1202 
1203 	if (pdev->intx_fd != -1)
1204 		return 0;
1205 
1206 	/*
1207 	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
1208 	 * signals an interrupt from host to guest, and unmask_fd signals the
1209 	 * deassertion of the line from guest to host.
1210 	 */
1211 	trigger_fd = eventfd(0, 0);
1212 	if (trigger_fd < 0) {
1213 		vfio_dev_err(vdev, "failed to create trigger eventfd");
1214 		return trigger_fd;
1215 	}
1216 
1217 	unmask_fd = eventfd(0, 0);
1218 	if (unmask_fd < 0) {
1219 		vfio_dev_err(vdev, "failed to create unmask eventfd");
1220 		close(trigger_fd);
1221 		return unmask_fd;
1222 	}
1223 
1224 	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
1225 	if (ret)
1226 		goto err_close;
1227 
1228 	trigger.irq = (struct vfio_irq_set) {
1229 		.argsz	= sizeof(trigger),
1230 		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
1231 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1232 		.start	= 0,
1233 		.count	= 1,
1234 	};
1235 	set_vfio_irq_eventd_payload(&trigger, trigger_fd);
1236 
1237 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
1238 	if (ret < 0) {
1239 		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
1240 		goto err_delete_line;
1241 	}
1242 
1243 	unmask.irq = (struct vfio_irq_set) {
1244 		.argsz	= sizeof(unmask),
1245 		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
1246 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1247 		.start	= 0,
1248 		.count	= 1,
1249 	};
1250 	set_vfio_irq_eventd_payload(&unmask, unmask_fd);
1251 
1252 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
1253 	if (ret < 0) {
1254 		vfio_dev_err(vdev, "failed to setup unmask IRQ");
1255 		goto err_remove_event;
1256 	}
1257 
1258 	pdev->intx_fd = trigger_fd;
1259 	pdev->unmask_fd = unmask_fd;
1260 
1261 	return 0;
1262 
1263 err_remove_event:
1264 	/* Remove trigger event */
1265 	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
1266 	trigger.irq.count = 0;
1267 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
1268 
1269 err_delete_line:
1270 	irq__del_irqfd(kvm, gsi, trigger_fd);
1271 
1272 err_close:
1273 	close(trigger_fd);
1274 	close(unmask_fd);
1275 	return ret;
1276 }
1277 
1278 static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
1279 {
1280 	int ret;
1281 	struct vfio_pci_device *pdev = &vdev->pci;
1282 	struct vfio_irq_info irq_info = {
1283 		.argsz = sizeof(irq_info),
1284 		.index = VFIO_PCI_INTX_IRQ_INDEX,
1285 	};
1286 
1287 	vfio_pci_reserve_irq_fds(2);
1288 
1289 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
1290 	if (ret || irq_info.count == 0) {
1291 		vfio_dev_err(vdev, "no INTx reported by VFIO");
1292 		return -ENODEV;
1293 	}
1294 
1295 	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
1296 		vfio_dev_err(vdev, "interrupt not eventfd capable");
1297 		return -EINVAL;
1298 	}
1299 
1300 	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
1301 		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
1302 		return -EINVAL;
1303 	}
1304 
1305 	/* The guest is going to overwrite our irq_line... */
1306 	pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
1307 
1308 	pdev->intx_fd = -1;
1309 
1310 	return 0;
1311 }
1312 
1313 static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
1314 {
1315 	int ret = 0;
1316 	struct vfio_pci_device *pdev = &vdev->pci;
1317 
1318 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1319 		pdev->msix.info = (struct vfio_irq_info) {
1320 			.argsz = sizeof(pdev->msix.info),
1321 			.index = VFIO_PCI_MSIX_IRQ_INDEX,
1322 		};
1323 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
1324 		if (ret)
1325 			return ret;
1326 	}
1327 
1328 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
1329 		pdev->msi.info = (struct vfio_irq_info) {
1330 			.argsz = sizeof(pdev->msi.info),
1331 			.index = VFIO_PCI_MSI_IRQ_INDEX,
1332 		};
1333 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
1334 		if (ret)
1335 			return ret;
1336 	}
1337 
1338 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
1339 		pci__assign_irq(&vdev->pci.hdr);
1340 
1341 		ret = vfio_pci_init_intx(kvm, vdev);
1342 		if (ret)
1343 			return ret;
1344 
1345 		ret = vfio_pci_enable_intx(kvm, vdev);
1346 	}
1347 
1348 	return ret;
1349 }
1350 
1351 int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
1352 {
1353 	int ret;
1354 
1355 	ret = vfio_pci_configure_dev_regions(kvm, vdev);
1356 	if (ret) {
1357 		vfio_dev_err(vdev, "failed to configure regions");
1358 		return ret;
1359 	}
1360 
1361 	vdev->dev_hdr = (struct device_header) {
1362 		.bus_type	= DEVICE_BUS_PCI,
1363 		.data		= &vdev->pci.hdr,
1364 	};
1365 
1366 	ret = device__register(&vdev->dev_hdr);
1367 	if (ret) {
1368 		vfio_dev_err(vdev, "failed to register VFIO device");
1369 		return ret;
1370 	}
1371 
1372 	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
1373 	if (ret) {
1374 		vfio_dev_err(vdev, "failed to configure IRQs");
1375 		return ret;
1376 	}
1377 
1378 	return 0;
1379 }
1380 
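/*
 * Undo vfio_pci_setup_device(): unmap every region, unregister the device
 * and free the MSI/MSI-X bookkeeping.
 */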
1381 void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
1382 {
1383 	size_t i;
1384 	struct vfio_pci_device *pdev = &vdev->pci;
1385 
1386 	for (i = 0; i < vdev->info.num_regions; i++)
1387 		vfio_unmap_region(kvm, &vdev->regions[i]);
1388 
1389 	device__unregister(&vdev->dev_hdr);
1390 
1391 	free(pdev->msix.irq_set);
1392 	free(pdev->msix.entries);
1393 	free(pdev->msi.irq_set);
1394 	free(pdev->msi.entries);
1395 }
1396