xref: /kvmtool/vfio/pci.c (revision e69b7663b06e8af9cc2dae16e6ec906a64c3c63d)
1 #include "kvm/irq.h"
2 #include "kvm/kvm.h"
3 #include "kvm/kvm-cpu.h"
4 #include "kvm/vfio.h"
5 
6 #include <assert.h>
7 
8 #include <sys/ioctl.h>
9 #include <sys/eventfd.h>
10 #include <sys/resource.h>
11 #include <sys/time.h>
12 
15 /* Wrapper around UAPI vfio_irq_set */
16 union vfio_irq_eventfd {
17 	struct vfio_irq_set	irq;
18 	u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)];
19 };
20 
21 static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd)
22 {
23 	memcpy(&evfd->irq.data, &fd, sizeof(fd));
24 }
25 
26 #define msi_is_enabled(state)		((state) & VFIO_PCI_MSI_STATE_ENABLED)
27 #define msi_is_masked(state)		((state) & VFIO_PCI_MSI_STATE_MASKED)
28 #define msi_is_empty(state)		((state) & VFIO_PCI_MSI_STATE_EMPTY)
29 
30 #define msi_update_state(state, val, bit)				\
31 	(state) = (val) ? (state) | (bit) : (state) & ~(bit)
32 #define msi_set_enabled(state, val)					\
33 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
34 #define msi_set_masked(state, val)					\
35 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
36 #define msi_set_empty(state, val)					\
37 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)
38 
39 static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
40 static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);
41 
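/*
 * Sync the physical MSI/MSI-X capability with the guest-visible state. INTx
 * is disabled first, since it cannot coexist with MSI. The full eventfd range
 * is registered in a single VFIO_DEVICE_SET_IRQS call when the capability is
 * first enabled or unmasked; afterwards only the vectors that changed are
 * updated, one ioctl at a time.
 */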
42 static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
43 				bool msix)
44 {
45 	size_t i;
46 	int ret = 0;
47 	int *eventfds;
48 	struct vfio_pci_device *pdev = &vdev->pci;
49 	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
50 	union vfio_irq_eventfd single = {
51 		.irq = {
52 			.argsz	= sizeof(single),
53 			.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
54 				  VFIO_IRQ_SET_ACTION_TRIGGER,
55 			.index	= msis->info.index,
56 			.count	= 1,
57 		},
58 	};
59 
60 	if (!msi_is_enabled(msis->virt_state))
61 		return 0;
62 
63 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
64 		/*
65 		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
66 		 * time. Since INTx has to be enabled from the start (we don't
67 		 * have a reliable way to know when the guest starts using it),
68 		 * disable it now.
69 		 */
70 		vfio_pci_disable_intx(kvm, vdev);
71 
72 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
73 
74 	/*
75 	 * Initial registration of the full range. This enables the physical
76 	 * MSI/MSI-X capability, which might have desired side effects. For
77 	 * instance when assigning virtio legacy devices, enabling the MSI
78 	 * capability modifies the config space layout!
79 	 *
80 	 * As an optimization, only update MSIs when the guest unmasks the
81 	 * capability. This greatly reduces the initialization time for a Linux
82 	 * guest with 2048+ MSIs. A Linux guest starts by enabling the MSI-X cap
83 	 * masked, then fills individual vectors, then unmasks the whole
84 	 * function. So we only do one VFIO ioctl when enabling for the first
85 	 * time, and then one when unmasking.
86 	 *
87 	 * phys_state is empty when it is enabled but no vector has been
88 	 * registered via SET_IRQS yet.
89 	 */
90 	if (!msi_is_enabled(msis->phys_state) ||
91 	    (!msi_is_masked(msis->virt_state) &&
92 	     msi_is_empty(msis->phys_state))) {
93 		bool empty = true;
94 
95 		for (i = 0; i < msis->nr_entries; i++) {
96 			eventfds[i] = msis->entries[i].gsi >= 0 ?
97 				      msis->entries[i].eventfd : -1;
98 
99 			if (eventfds[i] >= 0)
100 				empty = false;
101 		}
102 
103 		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
104 		if (ret < 0) {
105 			perror("VFIO_DEVICE_SET_IRQS(multi)");
106 			return ret;
107 		}
108 
109 		msi_set_enabled(msis->phys_state, true);
110 		msi_set_empty(msis->phys_state, empty);
111 
112 		return 0;
113 	}
114 
115 	if (msi_is_masked(msis->virt_state)) {
116 		/* TODO: if phys_state is neither empty nor masked, mask all vectors */
117 		return 0;
118 	}
119 
120 	/* Update individual vectors to avoid breaking those in use */
121 	for (i = 0; i < msis->nr_entries; i++) {
122 		struct vfio_pci_msi_entry *entry = &msis->entries[i];
123 		int fd = entry->gsi >= 0 ? entry->eventfd : -1;
124 
125 		if (fd == eventfds[i])
126 			continue;
127 
128 		single.irq.start = i;
129 		set_vfio_irq_eventd_payload(&single, fd);
130 
131 		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
132 		if (ret < 0) {
133 			perror("VFIO_DEVICE_SET_IRQS(single)");
134 			break;
135 		}
136 
137 		eventfds[i] = fd;
138 
139 		if (msi_is_empty(msis->phys_state) && fd >= 0)
140 			msi_set_empty(msis->phys_state, false);
141 	}
142 
143 	return ret;
144 }
145 
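/*
 * Tear down all MSI/MSI-X triggers for the device and, if the device also
 * supports INTx, re-enable it so the guest can fall back to legacy
 * interrupts.
 */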
146 static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
147 				 bool msix)
148 {
149 	int ret;
150 	struct vfio_pci_device *pdev = &vdev->pci;
151 	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
152 	struct vfio_irq_set irq_set = {
153 		.argsz	= sizeof(irq_set),
154 		.flags 	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
155 		.index 	= msis->info.index,
156 		.start 	= 0,
157 		.count	= 0,
158 	};
159 
160 	if (!msi_is_enabled(msis->phys_state))
161 		return 0;
162 
163 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
164 	if (ret < 0) {
165 		perror("VFIO_DEVICE_SET_IRQS(NONE)");
166 		return ret;
167 	}
168 
169 	msi_set_enabled(msis->phys_state, false);
170 	msi_set_empty(msis->phys_state, true);
171 
172 	/*
173 	 * When MSI or MSI-X gets disabled, it may be because the guest PCI
174 	 * driver has detected an MSI interrupt failure and wants to roll
175 	 * back to INTx mode. Thus re-enable INTx here if the device
176 	 * supports it.
177 	 */
178 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
179 		ret = vfio_pci_enable_intx(kvm, vdev);
180 
181 	return ret >= 0 ? 0 : ret;
182 }
183 
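/*
 * Prepare a single MSI/MSI-X vector: allocate its eventfd and KVM GSI route
 * if needed, refresh the routing entry with the latest message, and apply
 * the virtual mask state by adding or removing the irqfd.
 */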
184 static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
185 				     struct vfio_pci_msi_entry *entry)
186 {
187 	int ret;
188 
189 	if (entry->eventfd < 0) {
190 		entry->eventfd = eventfd(0, 0);
191 		if (entry->eventfd < 0) {
192 			ret = -errno;
193 			vfio_dev_err(vdev, "cannot create eventfd");
194 			return ret;
195 		}
196 	}
197 
198 	/* Allocate IRQ if necessary */
199 	if (entry->gsi < 0) {
200 		int ret = irq__add_msix_route(kvm, &entry->config.msg,
201 					      vdev->dev_hdr.dev_num << 3);
202 		if (ret < 0) {
203 			vfio_dev_err(vdev, "cannot create MSI-X route");
204 			return ret;
205 		}
206 		entry->gsi = ret;
207 	} else {
208 		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
209 	}
210 
211 	/*
212 	 * MSI masking is unimplemented in VFIO, so we have to handle it by
213 	 * disabling/enabling the IRQ route instead. We do it on the KVM side
214 	 * rather than the VFIO side, because:
215 	 * - it is 8x faster,
216 	 * - it decouples the masking logic from the capability state,
217 	 * - in the masked state, after removing the irqfd route, we could
218 	 *   easily plug the eventfd into a local handler, in order to serve
219 	 *   Pending Bit reads to the guest.
220 	 *
221 	 * So entry->phys_state is masked when there is no active irqfd route.
222 	 */
223 	if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
224 		return 0;
225 
226 	if (msi_is_masked(entry->phys_state)) {
227 		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
228 		if (ret < 0) {
229 			vfio_dev_err(vdev, "cannot setup irqfd");
230 			return ret;
231 		}
232 	} else {
233 		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
234 	}
235 
236 	msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));
237 
238 	return 0;
239 }
240 
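/*
 * MMIO handler for the virtual MSI-X Pending Bit Array: writes are ignored,
 * reads are forwarded to the physical PBA.
 */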
241 static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
242 				     u32 len, u8 is_write, void *ptr)
243 {
244 	struct vfio_pci_device *pdev = ptr;
245 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
246 	u64 offset = addr - pba->guest_phys_addr;
247 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
248 
249 	if (is_write)
250 		return;
251 
252 	/*
253 	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA
254 	 * is completely useless here. Note that Linux doesn't use PBA.
255 	 */
256 	if (pread(vdev->fd, data, len, pba->offset + offset) != (ssize_t)len)
257 		vfio_dev_err(vdev, "cannot access MSIX PBA");
258 }
259 
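/*
 * MMIO handler for the emulated MSI-X table. Reads return the shadow entry;
 * writes update it and, when the Vector Control word is touched, propagate
 * the new mask state and (re)enable the physical capability if needed.
 */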
260 static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
261 				       u32 len, u8 is_write, void *ptr)
262 {
263 	struct kvm *kvm = vcpu->kvm;
264 	struct vfio_pci_msi_entry *entry;
265 	struct vfio_pci_device *pdev = ptr;
266 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
267 
268 	u64 offset = addr - pdev->msix_table.guest_phys_addr;
269 
270 	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
271 	off_t field = offset % PCI_MSIX_ENTRY_SIZE;
272 
273 	/*
274 	 * The PCI spec says that software must use aligned 4- or 8-byte
275 	 * accesses for the MSI-X table.
276 	 */
277 	if ((len != 4 && len != 8) || addr & (len - 1)) {
278 		vfio_dev_warn(vdev, "invalid MSI-X table access");
279 		return;
280 	}
281 
282 	entry = &pdev->msix.entries[vector];
283 
284 	mutex_lock(&pdev->msix.mutex);
285 
286 	if (!is_write) {
287 		memcpy(data, (void *)&entry->config + field, len);
288 		goto out_unlock;
289 	}
290 
291 	memcpy((void *)&entry->config + field, data, len);
292 
293 	/*
294 	 * Check if access touched the vector control register, which is at the
295 	 * end of the MSI-X entry.
296 	 */
297 	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
298 		goto out_unlock;
299 
300 	msi_set_masked(entry->virt_state, entry->config.ctrl &
301 		       PCI_MSIX_ENTRY_CTRL_MASKBIT);
302 
303 	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
304 		/* Not much we can do here. */
305 		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);
306 
307 	/* Update the physical capability if necessary */
308 	if (vfio_pci_enable_msis(kvm, vdev, true))
309 		vfio_dev_err(vdev, "cannot enable MSIX");
310 
311 out_unlock:
312 	mutex_unlock(&pdev->msix.mutex);
313 }
314 
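/*
 * Called on config space writes that may touch the MSI-X Message Control
 * word: track the Function Mask and Enable bits, and enable or disable the
 * physical capability accordingly.
 */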
315 static void vfio_pci_msix_cap_write(struct kvm *kvm,
316 				    struct vfio_device *vdev, u16 off,
317 				    void *data, int sz)
318 {
319 	struct vfio_pci_device *pdev = &vdev->pci;
320 	off_t enable_pos = PCI_MSIX_FLAGS + 1;
321 	bool enable;
322 	u16 flags;
323 
324 	off -= pdev->msix.pos;
325 
326 	/* Check if access intersects with the MSI-X Enable bit */
327 	if (off > enable_pos || off + sz <= enable_pos)
328 		return;
329 
330 	/* Read byte that contains the Enable bit */
331 	flags = *(u8 *)(data + enable_pos - off) << 8;
332 
333 	mutex_lock(&pdev->msix.mutex);
334 
335 	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
336 	enable = flags & PCI_MSIX_FLAGS_ENABLE;
337 	msi_set_enabled(pdev->msix.virt_state, enable);
338 
339 	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
340 		vfio_dev_err(vdev, "cannot enable MSIX");
341 	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
342 		vfio_dev_err(vdev, "cannot disable MSIX");
343 
344 	mutex_unlock(&pdev->msix.mutex);
345 }
346 
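/*
 * Handle config space writes that touch the MSI Mask Bits register (only
 * present when the device supports per-vector masking). Returns 1 when the
 * access updated the mask, 0 otherwise.
 */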
347 static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
348 				     u16 off, u8 *data, u32 sz)
349 {
350 	size_t i;
351 	u32 mask = 0;
352 	size_t mask_pos, start, limit;
353 	struct vfio_pci_msi_entry *entry;
354 	struct vfio_pci_device *pdev = &vdev->pci;
355 	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
356 
357 	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
358 		return 0;
359 
360 	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
361 		mask_pos = PCI_MSI_MASK_64;
362 	else
363 		mask_pos = PCI_MSI_MASK_32;
364 
365 	if (off >= mask_pos + 4 || off + sz <= mask_pos)
366 		return 0;
367 
368 	/* Set mask to current state */
369 	for (i = 0; i < pdev->msi.nr_entries; i++) {
370 		entry = &pdev->msi.entries[i];
371 		mask |= !!msi_is_masked(entry->virt_state) << i;
372 	}
373 
374 	/* Update mask following the intersection of access and register */
375 	start = max_t(size_t, off, mask_pos);
376 	limit = min_t(size_t, off + sz, mask_pos + 4);
377 
378 	memcpy((void *)&mask + start - mask_pos, data + start - off,
379 	       limit - start);
380 
381 	/* Update states if necessary */
382 	for (i = 0; i < pdev->msi.nr_entries; i++) {
383 		bool masked = mask & (1 << i);
384 
385 		entry = &pdev->msi.entries[i];
386 		if (masked != msi_is_masked(entry->virt_state)) {
387 			msi_set_masked(entry->virt_state, masked);
388 			vfio_pci_update_msi_entry(kvm, vdev, entry);
389 		}
390 	}
391 
392 	return 1;
393 }
394 
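/*
 * Called on config space writes to the MSI capability: handle mask updates,
 * track the Enable bit, rebuild the MSI message and KVM routes for the
 * requested number of vectors, and sync the physical capability.
 */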
395 static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
396 				   u16 off, u8 *data, u32 sz)
397 {
398 	u8 ctrl;
399 	struct msi_msg msg;
400 	size_t i, nr_vectors;
401 	struct vfio_pci_msi_entry *entry;
402 	struct vfio_pci_device *pdev = &vdev->pci;
403 	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
404 
405 	off -= pdev->msi.pos;
406 
407 	mutex_lock(&pdev->msi.mutex);
408 
409 	/* Check if the guest is trying to update mask bits */
410 	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
411 		goto out_unlock;
412 
413 	/* Only modify routes when guest pokes the enable bit */
414 	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
415 		goto out_unlock;
416 
417 	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);
418 
419 	msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE);
420 
421 	if (!msi_is_enabled(pdev->msi.virt_state)) {
422 		vfio_pci_disable_msis(kvm, vdev, false);
423 		goto out_unlock;
424 	}
425 
426 	/* Create routes for the requested vectors */
427 	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);
428 
429 	msg.address_lo = msi_cap_64->address_lo;
430 	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
431 		msg.address_hi = msi_cap_64->address_hi;
432 		msg.data = msi_cap_64->data;
433 	} else {
434 		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
435 		msg.address_hi = 0;
436 		msg.data = msi_cap_32->data;
437 	}
438 
439 	for (i = 0; i < nr_vectors; i++) {
440 		entry = &pdev->msi.entries[i];
441 
442 		/*
443 		 * Set the MSI data value as required by the PCI local
444 		 * bus specifications, MSI capability, "Message Data".
445 		 */
446 		msg.data &= ~(nr_vectors - 1);
447 		msg.data |= i;
448 
449 		entry->config.msg = msg;
450 		vfio_pci_update_msi_entry(kvm, vdev, entry);
451 	}
452 
453 	/* Update the physical capability if necessary */
454 	if (vfio_pci_enable_msis(kvm, vdev, false))
455 		vfio_dev_err(vdev, "cannot enable MSI");
456 
457 out_unlock:
458 	mutex_unlock(&pdev->msi.mutex);
459 }
460 
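/*
 * Called when the guest activates a BAR: record its new address, register
 * trap-and-emulate MMIO for the MSI-X table and PBA when this BAR contains
 * them, and map any other region with vfio_map_region().
 */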
461 static int vfio_pci_bar_activate(struct kvm *kvm,
462 				 struct pci_device_header *pci_hdr,
463 				 int bar_num, void *data)
464 {
465 	struct vfio_device *vdev = data;
466 	struct vfio_pci_device *pdev = &vdev->pci;
467 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
468 	struct vfio_pci_msix_table *table = &pdev->msix_table;
469 	struct vfio_region *region;
470 	u32 bar_addr;
471 	bool has_msix;
472 	int ret;
473 
474 	assert((u32)bar_num < vdev->info.num_regions);
475 
476 	region = &vdev->regions[bar_num];
477 	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;
478 
479 	bar_addr = pci__bar_address(pci_hdr, bar_num);
480 	if (pci__bar_is_io(pci_hdr, bar_num))
481 		region->port_base = bar_addr;
482 	else
483 		region->guest_phys_addr = bar_addr;
484 
485 	if (has_msix && (u32)bar_num == table->bar) {
486 		table->guest_phys_addr = region->guest_phys_addr;
487 		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
488 					 table->size, false,
489 					 vfio_pci_msix_table_access, pdev);
490 		/*
491 		 * The MSIX table and the PBA structure can share the same BAR,
492 		 * but for convenience we register different regions for mmio
493 		 * emulation. We want to update both if they share the same
494 		 * BAR.
495 		 */
496 		if (ret < 0 || table->bar != pba->bar)
497 			goto out;
498 	}
499 
500 	if (has_msix && (u32)bar_num == pba->bar) {
501 		if (pba->bar == table->bar)
502 			pba->guest_phys_addr = table->guest_phys_addr + table->size;
503 		else
504 			pba->guest_phys_addr = region->guest_phys_addr;
505 		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
506 					 pba->size, false,
507 					 vfio_pci_msix_pba_access, pdev);
508 		goto out;
509 	}
510 
511 	ret = vfio_map_region(kvm, vdev, region);
512 out:
513 	return ret;
514 }
515 
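/*
 * Counterpart of vfio_pci_bar_activate(): remove the MSI-X table and PBA
 * MMIO emulation for this BAR, or unmap the region from the guest.
 */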
516 static int vfio_pci_bar_deactivate(struct kvm *kvm,
517 				   struct pci_device_header *pci_hdr,
518 				   int bar_num, void *data)
519 {
520 	struct vfio_device *vdev = data;
521 	struct vfio_pci_device *pdev = &vdev->pci;
522 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
523 	struct vfio_pci_msix_table *table = &pdev->msix_table;
524 	struct vfio_region *region;
525 	bool has_msix, success;
526 	int ret;
527 
528 	assert((u32)bar_num < vdev->info.num_regions);
529 
530 	region = &vdev->regions[bar_num];
531 	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;
532 
533 	if (has_msix && (u32)bar_num == table->bar) {
534 		success = kvm__deregister_mmio(kvm, table->guest_phys_addr);
535 		/* kvm__deregister_mmio fails when the region is not found. */
536 		ret = (success ? 0 : -ENOENT);
537 		/* See vfio_pci_bar_activate(). */
538 		if (ret < 0 || table->bar != pba->bar)
539 			goto out;
540 	}
541 
542 	if (has_msix && (u32)bar_num == pba->bar) {
543 		success = kvm__deregister_mmio(kvm, pba->guest_phys_addr);
544 		ret = (success ? 0 : -ENOENT);
545 		goto out;
546 	}
547 
548 	vfio_unmap_region(kvm, region);
549 	ret = 0;
550 
551 out:
552 	return ret;
553 }
554 
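/*
 * The guest-visible value comes from the emulated header maintained by the
 * PCI core; this hook only performs a read on the real device so that any
 * read side effects still happen.
 */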
555 static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
556 			      u16 offset, void *data, int sz)
557 {
558 	struct vfio_region_info *info;
559 	struct vfio_pci_device *pdev;
560 	struct vfio_device *vdev;
561 	char base[sz];
562 
563 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
564 	vdev = container_of(pdev, struct vfio_device, pci);
565 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
566 
567 	/* Dummy read in case of side-effects */
568 	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
569 		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
570 			      sz, offset);
571 }
572 
573 static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
574 			       u16 offset, void *data, int sz)
575 {
576 	struct vfio_region_info *info;
577 	struct vfio_pci_device *pdev;
578 	struct vfio_device *vdev;
579 	u32 tmp;
580 
581 	/* Make sure a larger size will not overrun tmp on the stack. */
582 	assert(sz <= 4);
583 
584 	if (offset == PCI_ROM_ADDRESS)
585 		return;
586 
587 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
588 	vdev = container_of(pdev, struct vfio_device, pci);
589 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
590 
591 	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
592 		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
593 			      sz, offset);
594 
595 	/* Handle MSI write now, since it might update the hardware capability */
596 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
597 		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);
598 
599 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
600 		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);
601 
602 	if (pread(vdev->fd, &tmp, sz, info->offset + offset) != sz)
603 		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
604 			      sz, offset);
605 }
606 
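/*
 * Size of the MSI capability in config space: 10 bytes for the base layout
 * (ID, next pointer, control, 32-bit address, data), plus 4 bytes when the
 * device uses 64-bit addresses and another 10 when it implements per-vector
 * masking.
 */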
607 static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
608 {
609 	size_t size = 10;
610 
611 	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
612 		size += 4;
613 	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
614 		size += 10;
615 
616 	return size;
617 }
618 
619 static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
620 {
621 	switch (cap_hdr->type) {
622 	case PCI_CAP_ID_MSIX:
623 		return PCI_CAP_MSIX_SIZEOF;
624 	case PCI_CAP_ID_MSI:
625 		return vfio_pci_msi_cap_size((void *)cap_hdr);
626 	default:
627 		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
628 		return 0;
629 	}
630 }
631 
632 static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
633 			    struct pci_cap_hdr *cap, off_t pos)
634 {
635 	struct pci_cap_hdr *last;
636 	struct pci_device_header *hdr = &vdev->pci.hdr;
637 
638 	cap->next = 0;
639 
640 	if (!hdr->capabilities) {
641 		hdr->capabilities = pos;
642 		hdr->status |= PCI_STATUS_CAP_LIST;
643 	} else {
644 		last = PCI_CAP(virt_hdr, hdr->capabilities);
645 
646 		while (last->next)
647 			last = PCI_CAP(virt_hdr, last->next);
648 
649 		last->next = pos;
650 	}
651 
652 	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));
653 
654 	return 0;
655 }
656 
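/*
 * Walk the physical capability list and build the virtual one: only MSI and
 * MSI-X are kept in the guest-visible config space, everything else is
 * hidden.
 */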
657 static int vfio_pci_parse_caps(struct vfio_device *vdev)
658 {
659 	int ret;
660 	size_t size;
661 	u16 pos, next;
662 	struct pci_cap_hdr *cap;
663 	u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY];
664 	struct vfio_pci_device *pdev = &vdev->pci;
665 
666 	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
667 		return 0;
668 
669 	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY);
670 
671 	pos = pdev->hdr.capabilities & ~3;
672 
673 	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
674 	pdev->hdr.capabilities = 0;
675 
676 	for (; pos; pos = next) {
677 		cap = PCI_CAP(&pdev->hdr, pos);
678 		next = cap->next;
679 
680 		switch (cap->type) {
681 		case PCI_CAP_ID_MSIX:
682 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
683 			if (ret)
684 				return ret;
685 
686 			pdev->msix.pos = pos;
687 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
688 			break;
689 		case PCI_CAP_ID_MSI:
690 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
691 			if (ret)
692 				return ret;
693 
694 			pdev->msi.pos = pos;
695 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
696 			break;
697 		}
698 	}
699 
700 	/* Wipe remaining capabilities */
701 	pos = PCI_STD_HEADER_SIZEOF;
702 	size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF;
703 	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
704 
705 	return 0;
706 }
707 
708 static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
709 {
710 	ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
711 	struct vfio_region_info *info;
712 	struct vfio_pci_device *pdev = &vdev->pci;
713 
714 	if (vdev->info.num_regions <= VFIO_PCI_CONFIG_REGION_INDEX) {
715 		vfio_dev_err(vdev, "Config Space not found");
716 		return -ENODEV;
717 	}
718 
719 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
720 	*info = (struct vfio_region_info) {
721 			.argsz = sizeof(*info),
722 			.index = VFIO_PCI_CONFIG_REGION_INDEX,
723 	};
724 
725 	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
726 	if (!info->size) {
727 		vfio_dev_err(vdev, "Config Space has size zero?!");
728 		return -EINVAL;
729 	}
730 
731 	/* Read standard headers and capabilities */
732 	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
733 		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
734 		return -EIO;
735 	}
736 
737 	/* Strip bit 7, which indicates a multifunction device */
738 	pdev->hdr.header_type &= 0x7f;
739 
740 	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
741 		vfio_dev_err(vdev, "unsupported header type %u",
742 			     pdev->hdr.header_type);
743 		return -EOPNOTSUPP;
744 	}
745 
746 	if (pdev->hdr.irq_pin)
747 		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;
748 
749 	vfio_pci_parse_caps(vdev);
750 
751 	return 0;
752 }
753 
754 static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
755 {
756 	int i;
757 	u64 base;
758 	ssize_t hdr_sz;
759 	struct msix_cap *msix;
760 	struct vfio_region_info *info;
761 	struct vfio_pci_device *pdev = &vdev->pci;
762 	struct vfio_region *region;
763 
764 	/* Initialise the BARs */
765 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
766 		if ((u32)i == vdev->info.num_regions)
767 			break;
768 
769 		region = &vdev->regions[i];
770 		/* Construct a fake reg to match what we've mapped. */
771 		if (region->is_ioport) {
772 			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
773 				PCI_BASE_ADDRESS_SPACE_IO;
774 		} else {
775 			base = (region->guest_phys_addr &
776 				PCI_BASE_ADDRESS_MEM_MASK) |
777 				PCI_BASE_ADDRESS_SPACE_MEMORY;
778 		}
779 
780 		pdev->hdr.bar[i] = base;
781 
782 		if (!base)
783 			continue;
784 
785 		pdev->hdr.bar_size[i] = region->info.size;
786 	}
787 
788 	/* I really can't be bothered to support cardbus. */
789 	pdev->hdr.card_bus = 0;
790 
791 	/*
792 	 * Nuke the expansion ROM for now. If we want to do this properly,
793 	 * we need to save its size somewhere and map it into the guest.
794 	 */
795 	pdev->hdr.exp_rom_bar = 0;
796 
797 	/* Plumb in our fake MSI-X capability, if we have it. */
798 	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
799 	if (msix) {
800 		/* Add a shortcut to the PBA region for the MMIO handler */
801 		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
802 		pdev->msix_pba.offset = vdev->regions[pba_index].info.offset +
803 					(msix->pba_offset & PCI_MSIX_PBA_OFFSET);
804 
805 		/* Tidy up the capability */
806 		msix->table_offset &= PCI_MSIX_TABLE_BIR;
807 		msix->pba_offset &= PCI_MSIX_PBA_BIR;
808 		if (pdev->msix_table.bar == pdev->msix_pba.bar)
809 			msix->pba_offset |= pdev->msix_table.size &
810 					    PCI_MSIX_PBA_OFFSET;
811 	}
812 
813 	/* Install our fake Configuration Space */
814 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
815 	/*
816 	 * We don't touch the extended configuration space; let's be cautious
817 	 * and not overwrite it all with zeros, or bad things might happen.
818 	 */
819 	hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
820 	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
821 		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
822 			     hdr_sz);
823 		return -EIO;
824 	}
825 
826 	/* Register callbacks for cfg accesses */
827 	pdev->hdr.cfg_ops = (struct pci_config_operations) {
828 		.read	= vfio_pci_cfg_read,
829 		.write	= vfio_pci_cfg_write,
830 	};
831 
832 	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;
833 
834 	return 0;
835 }
836 
837 static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index,
838 				    struct vfio_region_info *info)
839 {
840 	int ret;
841 
842 	*info = (struct vfio_region_info) {
843 		.argsz = sizeof(*info),
844 		.index = index,
845 	};
846 
847 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
848 	if (ret) {
849 		ret = -errno;
850 		vfio_dev_err(vdev, "cannot get info for BAR %u", index);
851 		return ret;
852 	}
853 
854 	if (info->size && !is_power_of_two(info->size)) {
855 		vfio_dev_err(vdev, "region is not power of two: 0x%llx",
856 				info->size);
857 		return -EINVAL;
858 	}
859 
860 	return 0;
861 }
862 
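/*
 * Size the emulated MSI-X table and PBA, allocate the shadow entries that
 * back the trap-and-emulate handlers, and reserve a guest MMIO window large
 * enough to hold both structures.
 */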
863 static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
864 {
865 	int ret;
866 	size_t i;
867 	size_t map_size;
868 	size_t nr_entries;
869 	struct vfio_pci_msi_entry *entries;
870 	struct vfio_pci_device *pdev = &vdev->pci;
871 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
872 	struct vfio_pci_msix_table *table = &pdev->msix_table;
873 	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
874 	struct vfio_region_info info;
875 
876 	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
877 	pba->bar = msix->pba_offset & PCI_MSIX_PBA_BIR;
878 
879 	/*
880 	 * KVM needs memory regions to be a multiple of, and aligned on, PAGE_SIZE.
881 	 */
882 	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
883 	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
884 	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), PAGE_SIZE);
885 
886 	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
887 	if (!entries)
888 		return -ENOMEM;
889 
890 	for (i = 0; i < nr_entries; i++)
891 		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
892 
893 	ret = vfio_pci_get_region_info(vdev, table->bar, &info);
894 	if (ret || !info.size) {
895 		ret = ret ? ret : -EINVAL;
896 		goto out_free;
897 	}
898 	map_size = info.size;
899 
900 	if (table->bar != pba->bar) {
901 		ret = vfio_pci_get_region_info(vdev, pba->bar, &info);
902 		if (ret || !info.size) {
903 			ret = ret ? ret : -EINVAL;
904 			goto out_free;
905 		}
906 		map_size += info.size;
907 	}
908 
909 	/*
910 	 * To ease MSI-X cap configuration in case they share the same BAR,
911 	 * collapse table and pending array. The size of the BAR regions must be
912 	 * powers of two.
913 	 */
914 	map_size = ALIGN(map_size, PAGE_SIZE);
915 	table->guest_phys_addr = pci_get_mmio_block(map_size);
916 	if (!table->guest_phys_addr) {
917 		pr_err("cannot allocate MMIO space");
918 		ret = -ENOMEM;
919 		goto out_free;
920 	}
921 
922 	/*
923 	 * We could map the physical PBA directly into the guest, but it's
924 	 * likely smaller than a page, and we can only hand full pages to the
925 	 * guest. Even though the PCI spec disallows sharing a page used for
926 	 * MSI-X with any other resource, it does allow sharing the same page
927 	 * between the MSI-X table and the PBA. For the sake of isolation,
928 	 * create a virtual PBA.
929 	 */
930 	pba->guest_phys_addr = table->guest_phys_addr + table->size;
931 
932 	pdev->msix.entries = entries;
933 	pdev->msix.nr_entries = nr_entries;
934 
935 	return 0;
936 
937 out_free:
938 	free(entries);
939 
940 	return ret;
941 }
942 
943 static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
944 {
945 	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);
946 
947 	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
948 	pdev->msi.entries = calloc(pdev->msi.nr_entries,
949 				   sizeof(struct vfio_pci_msi_entry));
950 	if (!pdev->msi.entries)
951 		return -ENOMEM;
952 
953 	return 0;
954 }
955 
956 static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
957 				  size_t nr)
958 {
959 	int ret;
960 	u32 bar;
961 	size_t map_size;
962 	struct vfio_pci_device *pdev = &vdev->pci;
963 	struct vfio_region *region;
964 
965 	if (nr >= vdev->info.num_regions)
966 		return 0;
967 
968 	region = &vdev->regions[nr];
969 	bar = pdev->hdr.bar[nr];
970 
971 	region->vdev = vdev;
972 	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);
973 
974 	ret = vfio_pci_get_region_info(vdev, nr, &region->info);
975 	if (ret)
976 		return ret;
977 
978 	/* Ignore invalid or unimplemented regions */
979 	if (!region->info.size)
980 		return 0;
981 
982 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
983 		/* Trap and emulate MSI-X table */
984 		if (nr == pdev->msix_table.bar) {
985 			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
986 			return 0;
987 		} else if (nr == pdev->msix_pba.bar) {
988 			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
989 			return 0;
990 		}
991 	}
992 
993 	if (region->is_ioport) {
994 		region->port_base = pci_get_io_port_block(region->info.size);
995 	} else {
996 		/* Grab some MMIO space in the guest */
997 		map_size = ALIGN(region->info.size, PAGE_SIZE);
998 		region->guest_phys_addr = pci_get_mmio_block(map_size);
999 	}
1000 
1001 	return 0;
1002 }
1003 
1004 static int vfio_pci_configure_dev_regions(struct kvm *kvm,
1005 					  struct vfio_device *vdev)
1006 {
1007 	int ret;
1008 	u32 bar;
1009 	size_t i;
1010 	bool is_64bit = false;
1011 	struct vfio_pci_device *pdev = &vdev->pci;
1012 
1013 	ret = vfio_pci_parse_cfg_space(vdev);
1014 	if (ret)
1015 		return ret;
1016 
1017 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1018 		ret = vfio_pci_create_msix_table(kvm, vdev);
1019 		if (ret)
1020 			return ret;
1021 	}
1022 
1023 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
1024 		ret = vfio_pci_create_msi_cap(kvm, pdev);
1025 		if (ret)
1026 			return ret;
1027 	}
1028 
1029 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
1030 		/* Ignore top half of 64-bit BAR */
1031 		if (is_64bit) {
1032 			is_64bit = false;
1033 			continue;
1034 		}
1035 
1036 		ret = vfio_pci_configure_bar(kvm, vdev, i);
1037 		if (ret)
1038 			return ret;
1039 
1040 		bar = pdev->hdr.bar[i];
1041 		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
1042 			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
1043 			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
1044 	}
1045 
1046 	/* We've configured the BARs, fake up a Configuration Space */
1047 	ret = vfio_pci_fixup_cfg_space(vdev);
1048 	if (ret)
1049 		return ret;
1050 
1051 	return pci__register_bar_regions(kvm, &pdev->hdr, vfio_pci_bar_activate,
1052 					 vfio_pci_bar_deactivate, vdev);
1053 }
1054 
1055 /*
1056  * Attempt to update the FD limit if opening an eventfd for each IRQ vector
1057  * would hit it, which is likely to happen when a device uses 2048 MSIs.
1058  */
1059 static int vfio_pci_reserve_irq_fds(size_t num)
1060 {
1061 	/*
1062 	 * I counted around 27 fds under normal load. Let's add 100 for good
1063 	 * measure.
1064 	 */
1065 	static size_t needed = 128;
1066 	struct rlimit fd_limit, new_limit;
1067 
1068 	needed += num;
1069 
1070 	if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
1071 		perror("getrlimit(RLIMIT_NOFILE)");
1072 		return 0;
1073 	}
1074 
1075 	if (fd_limit.rlim_cur >= needed)
1076 		return 0;
1077 
1078 	new_limit.rlim_cur = needed;
1079 
1080 	if (fd_limit.rlim_max < needed)
1081 		/* Try to bump hard limit (root only) */
1082 		new_limit.rlim_max = needed;
1083 	else
1084 		new_limit.rlim_max = fd_limit.rlim_max;
1085 
1086 	if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
1087 		perror("setrlimit(RLIMIT_NOFILE)");
1088 		pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
1089 			   (size_t)(needed - fd_limit.rlim_cur));
1090 	}
1091 
1092 	return 0;
1093 }
1094 
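/*
 * Query MSI/MSI-X IRQ information from VFIO, sanity-check it against the
 * capability, and allocate the vfio_irq_set buffer and per-vector state used
 * by VFIO_DEVICE_SET_IRQS later on.
 */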
1095 static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
1096 			     struct vfio_pci_msi_common *msis)
1097 {
1098 	int ret;
1099 	size_t i;
1100 	int *eventfds;
1101 	size_t irq_set_size;
1102 	struct vfio_pci_msi_entry *entry;
1103 	size_t nr_entries = msis->nr_entries;
1104 
1105 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
1106 	if (ret || msis->info.count == 0) {
1107 		vfio_dev_err(vdev, "no MSI reported by VFIO");
1108 		return -ENODEV;
1109 	}
1110 
1111 	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
1112 		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
1113 		return -EINVAL;
1114 	}
1115 
1116 	if (msis->info.count != nr_entries) {
1117 		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
1118 		return -EINVAL;
1119 	}
1120 
1121 	mutex_init(&msis->mutex);
1122 
1123 	vfio_pci_reserve_irq_fds(nr_entries);
1124 
1125 	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
1126 	msis->irq_set = malloc(irq_set_size);
1127 	if (!msis->irq_set)
1128 		return -ENOMEM;
1129 
1130 	*msis->irq_set = (struct vfio_irq_set) {
1131 		.argsz	= irq_set_size,
1132 		.flags 	= VFIO_IRQ_SET_DATA_EVENTFD |
1133 			  VFIO_IRQ_SET_ACTION_TRIGGER,
1134 		.index 	= msis->info.index,
1135 		.start 	= 0,
1136 		.count 	= nr_entries,
1137 	};
1138 
1139 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
1140 
1141 	for (i = 0; i < nr_entries; i++) {
1142 		entry = &msis->entries[i];
1143 		entry->gsi = -1;
1144 		entry->eventfd = -1;
1145 		msi_set_masked(entry->virt_state, true);
1146 		msi_set_masked(entry->phys_state, true);
1147 		eventfds[i] = -1;
1148 	}
1149 
1150 	return 0;
1151 }
1152 
1153 static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
1154 {
1155 	struct vfio_pci_device *pdev = &vdev->pci;
1156 	int gsi = pdev->intx_gsi;
1157 	struct vfio_irq_set irq_set = {
1158 		.argsz	= sizeof(irq_set),
1159 		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
1160 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1161 	};
1162 
1163 	if (pdev->intx_fd == -1)
1164 		return;
1165 
1166 	pr_debug("user requested MSI, disabling INTx %d", gsi);
1167 
1168 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
1169 	irq__del_irqfd(kvm, gsi, pdev->intx_fd);
1170 
1171 	close(pdev->intx_fd);
1172 	close(pdev->unmask_fd);
1173 	pdev->intx_fd = -1;
1174 }
1175 
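/*
 * Set up INTx: since the line is level-triggered, create both a trigger
 * eventfd and an unmask eventfd, wire them to a resampling KVM irqfd, and
 * register them with VFIO.
 */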
1176 static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
1177 {
1178 	int ret;
1179 	int trigger_fd, unmask_fd;
1180 	union vfio_irq_eventfd	trigger;
1181 	union vfio_irq_eventfd	unmask;
1182 	struct vfio_pci_device *pdev = &vdev->pci;
1183 	int gsi = pdev->intx_gsi;
1184 
1185 	if (pdev->intx_fd != -1)
1186 		return 0;
1187 
1188 	/*
1189 	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
1190 	 * signals an interrupt from host to guest, and unmask_fd signals the
1191 	 * deassertion of the line from guest to host.
1192 	 */
1193 	trigger_fd = eventfd(0, 0);
1194 	if (trigger_fd < 0) {
1195 		vfio_dev_err(vdev, "failed to create trigger eventfd");
1196 		return trigger_fd;
1197 	}
1198 
1199 	unmask_fd = eventfd(0, 0);
1200 	if (unmask_fd < 0) {
1201 		vfio_dev_err(vdev, "failed to create unmask eventfd");
1202 		close(trigger_fd);
1203 		return unmask_fd;
1204 	}
1205 
1206 	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
1207 	if (ret)
1208 		goto err_close;
1209 
1210 	trigger.irq = (struct vfio_irq_set) {
1211 		.argsz	= sizeof(trigger),
1212 		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
1213 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1214 		.start	= 0,
1215 		.count	= 1,
1216 	};
1217 	set_vfio_irq_eventd_payload(&trigger, trigger_fd);
1218 
1219 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
1220 	if (ret < 0) {
1221 		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
1222 		goto err_delete_line;
1223 	}
1224 
1225 	unmask.irq = (struct vfio_irq_set) {
1226 		.argsz	= sizeof(unmask),
1227 		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
1228 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1229 		.start	= 0,
1230 		.count	= 1,
1231 	};
1232 	set_vfio_irq_eventd_payload(&unmask, unmask_fd);
1233 
1234 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
1235 	if (ret < 0) {
1236 		vfio_dev_err(vdev, "failed to setup unmask IRQ");
1237 		goto err_remove_event;
1238 	}
1239 
1240 	pdev->intx_fd = trigger_fd;
1241 	pdev->unmask_fd = unmask_fd;
1242 
1243 	return 0;
1244 
1245 err_remove_event:
1246 	/* Remove trigger event */
1247 	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
1248 	trigger.irq.count = 0;
1249 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
1250 
1251 err_delete_line:
1252 	irq__del_irqfd(kvm, gsi, trigger_fd);
1253 
1254 err_close:
1255 	close(trigger_fd);
1256 	close(unmask_fd);
1257 	return ret;
1258 }
1259 
1260 static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
1261 {
1262 	int ret;
1263 	struct vfio_pci_device *pdev = &vdev->pci;
1264 	struct vfio_irq_info irq_info = {
1265 		.argsz = sizeof(irq_info),
1266 		.index = VFIO_PCI_INTX_IRQ_INDEX,
1267 	};
1268 
1269 	vfio_pci_reserve_irq_fds(2);
1270 
1271 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
1272 	if (ret || irq_info.count == 0) {
1273 		vfio_dev_err(vdev, "no INTx reported by VFIO");
1274 		return -ENODEV;
1275 	}
1276 
1277 	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
1278 		vfio_dev_err(vdev, "interrupt not eventfd capable");
1279 		return -EINVAL;
1280 	}
1281 
1282 	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
1283 		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
1284 		return -EINVAL;
1285 	}
1286 
1287 	/* Guest is going to overwrite our irq_line... */
1288 	pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
1289 
1290 	pdev->intx_fd = -1;
1291 
1292 	return 0;
1293 }
1294 
1295 static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
1296 {
1297 	int ret = 0;
1298 	struct vfio_pci_device *pdev = &vdev->pci;
1299 
1300 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1301 		pdev->msix.info = (struct vfio_irq_info) {
1302 			.argsz = sizeof(pdev->msix.info),
1303 			.index = VFIO_PCI_MSIX_IRQ_INDEX,
1304 		};
1305 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
1306 		if (ret)
1307 			return ret;
1308 	}
1309 
1310 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
1311 		pdev->msi.info = (struct vfio_irq_info) {
1312 			.argsz = sizeof(pdev->msi.info),
1313 			.index = VFIO_PCI_MSI_IRQ_INDEX,
1314 		};
1315 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
1316 		if (ret)
1317 			return ret;
1318 	}
1319 
1320 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
1321 		pci__assign_irq(&vdev->pci.hdr);
1322 
1323 		ret = vfio_pci_init_intx(kvm, vdev);
1324 		if (ret)
1325 			return ret;
1326 
1327 		ret = vfio_pci_enable_intx(kvm, vdev);
1328 	}
1329 
1330 	return ret;
1331 }
1332 
1333 int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
1334 {
1335 	int ret;
1336 
1337 	ret = vfio_pci_configure_dev_regions(kvm, vdev);
1338 	if (ret) {
1339 		vfio_dev_err(vdev, "failed to configure regions");
1340 		return ret;
1341 	}
1342 
1343 	vdev->dev_hdr = (struct device_header) {
1344 		.bus_type	= DEVICE_BUS_PCI,
1345 		.data		= &vdev->pci.hdr,
1346 	};
1347 
1348 	ret = device__register(&vdev->dev_hdr);
1349 	if (ret) {
1350 		vfio_dev_err(vdev, "failed to register VFIO device");
1351 		return ret;
1352 	}
1353 
1354 	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
1355 	if (ret) {
1356 		vfio_dev_err(vdev, "failed to configure IRQs");
1357 		return ret;
1358 	}
1359 
1360 	return 0;
1361 }
1362 
1363 void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
1364 {
1365 	size_t i;
1366 	struct vfio_pci_device *pdev = &vdev->pci;
1367 
1368 	for (i = 0; i < vdev->info.num_regions; i++)
1369 		vfio_unmap_region(kvm, &vdev->regions[i]);
1370 
1371 	device__unregister(&vdev->dev_hdr);
1372 
1373 	free(pdev->msix.irq_set);
1374 	free(pdev->msix.entries);
1375 	free(pdev->msi.irq_set);
1376 	free(pdev->msi.entries);
1377 }
1378