xref: /kvmtool/vfio/pci.c (revision e1d0285c89ae5c7d0d3b1fdede92d2cf9d12bb01)
1 #include "kvm/irq.h"
2 #include "kvm/kvm.h"
3 #include "kvm/kvm-cpu.h"
4 #include "kvm/vfio.h"
5 
6 #include <assert.h>
7 
8 #include <sys/ioctl.h>
9 #include <sys/eventfd.h>
10 #include <sys/resource.h>
11 #include <sys/time.h>
12 
13 /* Wrapper around UAPI vfio_irq_set */
14 union vfio_irq_eventfd {
15 	struct vfio_irq_set	irq;
16 	u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)];
17 };
18 
19 static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd)
20 {
21 	memcpy(&evfd->irq.data, &fd, sizeof(fd));
22 }
23 
24 #define msi_is_enabled(state)		((state) & VFIO_PCI_MSI_STATE_ENABLED)
25 #define msi_is_masked(state)		((state) & VFIO_PCI_MSI_STATE_MASKED)
26 #define msi_is_empty(state)		((state) & VFIO_PCI_MSI_STATE_EMPTY)
27 
28 #define msi_update_state(state, val, bit)				\
29 	(state) = (val) ? (state) | bit : (state) & ~bit;
30 #define msi_set_enabled(state, val)					\
31 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
32 #define msi_set_masked(state, val)					\
33 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
34 #define msi_set_empty(state, val)					\
35 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)
36 
37 static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
38 static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);
39 
40 static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
41 				bool msix)
42 {
43 	size_t i;
44 	int ret = 0;
45 	int *eventfds;
46 	struct vfio_pci_device *pdev = &vdev->pci;
47 	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
48 	union vfio_irq_eventfd single = {
49 		.irq = {
50 			.argsz	= sizeof(single),
51 			.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
52 				  VFIO_IRQ_SET_ACTION_TRIGGER,
53 			.index	= msis->info.index,
54 			.count	= 1,
55 		},
56 	};
57 
58 	if (!msi_is_enabled(msis->virt_state))
59 		return 0;
60 
61 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
62 		/*
63 		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
64 		 * time. Since INTx has to be enabled from the start (we don't
65 		 * have a reliable way to know when the guest starts using it),
66 		 * disable it now.
67 		 */
68 		vfio_pci_disable_intx(kvm, vdev);
69 
70 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
71 
72 	/*
73 	 * Initial registration of the full range. This enables the physical
74 	 * MSI/MSI-X capability, which might have desired side effects. For
75 	 * instance when assigning virtio legacy devices, enabling the MSI
76 	 * capability modifies the config space layout!
77 	 *
78 	 * As an optimization, only update MSIs when guest unmasks the
79 	 * capability. This greatly reduces the initialization time for Linux
80 	 * guest with 2048+ MSIs. Linux guest starts by enabling the MSI-X cap
81 	 * masked, then fills individual vectors, then unmasks the whole
82 	 * function. So we only do one VFIO ioctl when enabling for the first
83 	 * time, and then one when unmasking.
84 	 *
85 	 * phys_state is empty when it is enabled but no vector has been
86 	 * registered via SET_IRQS yet.
87 	 */
88 	if (!msi_is_enabled(msis->phys_state) ||
89 	    (!msi_is_masked(msis->virt_state) &&
90 	     msi_is_empty(msis->phys_state))) {
91 		bool empty = true;
92 
93 		for (i = 0; i < msis->nr_entries; i++) {
94 			eventfds[i] = msis->entries[i].gsi >= 0 ?
95 				      msis->entries[i].eventfd : -1;
96 
97 			if (eventfds[i] >= 0)
98 				empty = false;
99 		}
100 
101 		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
102 		if (ret < 0) {
103 			perror("VFIO_DEVICE_SET_IRQS(multi)");
104 			return ret;
105 		}
106 
107 		msi_set_enabled(msis->phys_state, true);
108 		msi_set_empty(msis->phys_state, empty);
109 
110 		return 0;
111 	}
112 
113 	if (msi_is_masked(msis->virt_state)) {
114 		/* TODO: if phys_state is not empty nor masked, mask all vectors */
115 		return 0;
116 	}
117 
118 	/* Update individual vectors to avoid breaking those in use */
119 	for (i = 0; i < msis->nr_entries; i++) {
120 		struct vfio_pci_msi_entry *entry = &msis->entries[i];
121 		int fd = entry->gsi >= 0 ? entry->eventfd : -1;
122 
123 		if (fd == eventfds[i])
124 			continue;
125 
126 		single.irq.start = i;
127 		set_vfio_irq_eventd_payload(&single, fd);
128 
129 		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
130 		if (ret < 0) {
131 			perror("VFIO_DEVICE_SET_IRQS(single)");
132 			break;
133 		}
134 
135 		eventfds[i] = fd;
136 
137 		if (msi_is_empty(msis->phys_state) && fd >= 0)
138 			msi_set_empty(msis->phys_state, false);
139 	}
140 
141 	return ret;
142 }
143 
144 static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
145 				 bool msix)
146 {
147 	int ret;
148 	struct vfio_pci_device *pdev = &vdev->pci;
149 	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
150 	struct vfio_irq_set irq_set = {
151 		.argsz	= sizeof(irq_set),
152 		.flags 	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
153 		.index 	= msis->info.index,
154 		.start 	= 0,
155 		.count	= 0,
156 	};
157 
158 	if (!msi_is_enabled(msis->phys_state))
159 		return 0;
160 
161 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
162 	if (ret < 0) {
163 		perror("VFIO_DEVICE_SET_IRQS(NONE)");
164 		return ret;
165 	}
166 
167 	msi_set_enabled(msis->phys_state, false);
168 	msi_set_empty(msis->phys_state, true);
169 
170 	/*
171 	 * When MSI or MSIX is disabled, this might be called when
172 	 * PCI driver detects the MSI interrupt failure and wants to
173 	 * rollback to INTx mode.  Thus enable INTx if the device
174 	 * supports INTx mode in this case.
175 	 */
176 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
177 		ret = vfio_pci_enable_intx(kvm, vdev);
178 
179 	return ret >= 0 ? 0 : ret;
180 }
181 
182 static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
183 				     struct vfio_pci_msi_entry *entry)
184 {
185 	int ret;
186 
187 	if (entry->eventfd < 0) {
188 		entry->eventfd = eventfd(0, 0);
189 		if (entry->eventfd < 0) {
190 			ret = -errno;
191 			vfio_dev_err(vdev, "cannot create eventfd");
192 			return ret;
193 		}
194 	}
195 
196 	/* Allocate IRQ if necessary */
197 	if (entry->gsi < 0) {
198 		int ret = irq__add_msix_route(kvm, &entry->config.msg,
199 					      vdev->dev_hdr.dev_num << 3);
200 		if (ret < 0) {
201 			vfio_dev_err(vdev, "cannot create MSI-X route");
202 			return ret;
203 		}
204 		entry->gsi = ret;
205 	} else {
206 		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
207 	}
208 
209 	/*
210 	 * MSI masking is unimplemented in VFIO, so we have to handle it by
211 	 * disabling/enabling IRQ route instead. We do it on the KVM side rather
212 	 * than VFIO, because:
213 	 * - it is 8x faster
214 	 * - it allows to decouple masking logic from capability state.
215 	 * - in masked state, after removing irqfd route, we could easily plug
216 	 *   the eventfd in a local handler, in order to serve Pending Bit reads
217 	 *   to the guest.
218 	 *
219 	 * So entry->phys_state is masked when there is no active irqfd route.
220 	 */
221 	if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
222 		return 0;
223 
224 	if (msi_is_masked(entry->phys_state)) {
225 		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
226 		if (ret < 0) {
227 			vfio_dev_err(vdev, "cannot setup irqfd");
228 			return ret;
229 		}
230 	} else {
231 		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
232 	}
233 
234 	msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));
235 
236 	return 0;
237 }
238 
239 static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
240 				     u32 len, u8 is_write, void *ptr)
241 {
242 	struct vfio_pci_device *pdev = ptr;
243 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
244 	u64 offset = addr - pba->guest_phys_addr;
245 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
246 
247 	if (is_write)
248 		return;
249 
250 	/*
251 	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA
252 	 * is completely useless here. Note that Linux doesn't use PBA.
253 	 */
254 	if (pread(vdev->fd, data, len, pba->offset + offset) != (ssize_t)len)
255 		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
256 }
257 
258 static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
259 				       u32 len, u8 is_write, void *ptr)
260 {
261 	struct kvm *kvm = vcpu->kvm;
262 	struct vfio_pci_msi_entry *entry;
263 	struct vfio_pci_device *pdev = ptr;
264 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
265 
266 	u64 offset = addr - pdev->msix_table.guest_phys_addr;
267 
268 	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
269 	off_t field = offset % PCI_MSIX_ENTRY_SIZE;
270 
271 	/*
272 	 * PCI spec says that software must use aligned 4 or 8 bytes accesses
273 	 * for the MSI-X tables.
274 	 */
275 	if ((len != 4 && len != 8) || addr & (len - 1)) {
276 		vfio_dev_warn(vdev, "invalid MSI-X table access");
277 		return;
278 	}
279 
280 	entry = &pdev->msix.entries[vector];
281 
282 	mutex_lock(&pdev->msix.mutex);
283 
284 	if (!is_write) {
285 		memcpy(data, (void *)&entry->config + field, len);
286 		goto out_unlock;
287 	}
288 
289 	memcpy((void *)&entry->config + field, data, len);
290 
291 	/*
292 	 * Check if access touched the vector control register, which is at the
293 	 * end of the MSI-X entry.
294 	 */
295 	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
296 		goto out_unlock;
297 
298 	msi_set_masked(entry->virt_state, entry->config.ctrl &
299 		       PCI_MSIX_ENTRY_CTRL_MASKBIT);
300 
301 	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
302 		/* Not much we can do here. */
303 		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);
304 
305 	/* Update the physical capability if necessary */
306 	if (vfio_pci_enable_msis(kvm, vdev, true))
307 		vfio_dev_err(vdev, "cannot enable MSIX");
308 
309 out_unlock:
310 	mutex_unlock(&pdev->msix.mutex);
311 }
312 
313 static void vfio_pci_msix_cap_write(struct kvm *kvm,
314 				    struct vfio_device *vdev, u8 off,
315 				    void *data, int sz)
316 {
317 	struct vfio_pci_device *pdev = &vdev->pci;
318 	off_t enable_pos = PCI_MSIX_FLAGS + 1;
319 	bool enable;
320 	u16 flags;
321 
322 	off -= pdev->msix.pos;
323 
324 	/* Check if access intersects with the MSI-X Enable bit */
325 	if (off > enable_pos || off + sz <= enable_pos)
326 		return;
327 
328 	/* Read byte that contains the Enable bit */
329 	flags = *(u8 *)(data + enable_pos - off) << 8;
330 
331 	mutex_lock(&pdev->msix.mutex);
332 
333 	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
334 	enable = flags & PCI_MSIX_FLAGS_ENABLE;
335 	msi_set_enabled(pdev->msix.virt_state, enable);
336 
337 	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
338 		vfio_dev_err(vdev, "cannot enable MSIX");
339 	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
340 		vfio_dev_err(vdev, "cannot disable MSIX");
341 
342 	mutex_unlock(&pdev->msix.mutex);
343 }
344 
345 static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
346 				     u8 off, u8 *data, u32 sz)
347 {
348 	size_t i;
349 	u32 mask = 0;
350 	size_t mask_pos, start, limit;
351 	struct vfio_pci_msi_entry *entry;
352 	struct vfio_pci_device *pdev = &vdev->pci;
353 	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
354 
355 	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
356 		return 0;
357 
358 	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
359 		mask_pos = PCI_MSI_MASK_64;
360 	else
361 		mask_pos = PCI_MSI_MASK_32;
362 
363 	if (off >= mask_pos + 4 || off + sz <= mask_pos)
364 		return 0;
365 
366 	/* Set mask to current state */
367 	for (i = 0; i < pdev->msi.nr_entries; i++) {
368 		entry = &pdev->msi.entries[i];
369 		mask |= !!msi_is_masked(entry->virt_state) << i;
370 	}
371 
372 	/* Update mask following the intersection of access and register */
373 	start = max_t(size_t, off, mask_pos);
374 	limit = min_t(size_t, off + sz, mask_pos + 4);
375 
376 	memcpy((void *)&mask + start - mask_pos, data + start - off,
377 	       limit - start);
378 
379 	/* Update states if necessary */
380 	for (i = 0; i < pdev->msi.nr_entries; i++) {
381 		bool masked = mask & (1 << i);
382 
383 		entry = &pdev->msi.entries[i];
384 		if (masked != msi_is_masked(entry->virt_state)) {
385 			msi_set_masked(entry->virt_state, masked);
386 			vfio_pci_update_msi_entry(kvm, vdev, entry);
387 		}
388 	}
389 
390 	return 1;
391 }
392 
393 static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
394 				   u8 off, u8 *data, u32 sz)
395 {
396 	u8 ctrl;
397 	struct msi_msg msg;
398 	size_t i, nr_vectors;
399 	struct vfio_pci_msi_entry *entry;
400 	struct vfio_pci_device *pdev = &vdev->pci;
401 	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
402 
403 	off -= pdev->msi.pos;
404 
405 	mutex_lock(&pdev->msi.mutex);
406 
407 	/* Check if the guest is trying to update mask bits */
408 	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
409 		goto out_unlock;
410 
411 	/* Only modify routes when guest pokes the enable bit */
412 	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
413 		goto out_unlock;
414 
415 	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);
416 
417 	msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE);
418 
419 	if (!msi_is_enabled(pdev->msi.virt_state)) {
420 		vfio_pci_disable_msis(kvm, vdev, false);
421 		goto out_unlock;
422 	}
423 
424 	/* Create routes for the requested vectors */
425 	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);
426 
427 	msg.address_lo = msi_cap_64->address_lo;
428 	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
429 		msg.address_hi = msi_cap_64->address_hi;
430 		msg.data = msi_cap_64->data;
431 	} else {
432 		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
433 		msg.address_hi = 0;
434 		msg.data = msi_cap_32->data;
435 	}
436 
437 	for (i = 0; i < nr_vectors; i++) {
438 		entry = &pdev->msi.entries[i];
439 
440 		/*
441 		 * Set the MSI data value as required by the PCI local
442 		 * bus specifications, MSI capability, "Message Data".
443 		 */
444 		msg.data &= ~(nr_vectors - 1);
445 		msg.data |= i;
446 
447 		entry->config.msg = msg;
448 		vfio_pci_update_msi_entry(kvm, vdev, entry);
449 	}
450 
451 	/* Update the physical capability if necessary */
452 	if (vfio_pci_enable_msis(kvm, vdev, false))
453 		vfio_dev_err(vdev, "cannot enable MSI");
454 
455 out_unlock:
456 	mutex_unlock(&pdev->msi.mutex);
457 }
458 
459 static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
460 			      u8 offset, void *data, int sz)
461 {
462 	struct vfio_region_info *info;
463 	struct vfio_pci_device *pdev;
464 	struct vfio_device *vdev;
465 	char base[sz];
466 
467 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
468 	vdev = container_of(pdev, struct vfio_device, pci);
469 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
470 
471 	/* Dummy read in case of side-effects */
472 	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
473 		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
474 			      sz, offset);
475 }
476 
477 static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
478 			       u8 offset, void *data, int sz)
479 {
480 	struct vfio_region_info *info;
481 	struct vfio_pci_device *pdev;
482 	struct vfio_device *vdev;
483 	u32 tmp;
484 
485 	/* Make sure a larger size will not overrun tmp on the stack. */
486 	assert(sz <= 4);
487 
488 	if (offset == PCI_ROM_ADDRESS)
489 		return;
490 
491 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
492 	vdev = container_of(pdev, struct vfio_device, pci);
493 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
494 
495 	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
496 		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
497 			      sz, offset);
498 
499 	/* Handle MSI write now, since it might update the hardware capability */
500 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
501 		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);
502 
503 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
504 		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);
505 
506 	if (pread(vdev->fd, &tmp, sz, info->offset + offset) != sz)
507 		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
508 			      sz, offset);
509 }
510 
511 static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
512 {
513 	size_t size = 10;
514 
515 	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
516 		size += 4;
517 	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
518 		size += 10;
519 
520 	return size;
521 }
522 
523 static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
524 {
525 	switch (cap_hdr->type) {
526 	case PCI_CAP_ID_MSIX:
527 		return PCI_CAP_MSIX_SIZEOF;
528 	case PCI_CAP_ID_MSI:
529 		return vfio_pci_msi_cap_size((void *)cap_hdr);
530 	default:
531 		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
532 		return 0;
533 	}
534 }
535 
536 static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
537 			    struct pci_cap_hdr *cap, off_t pos)
538 {
539 	struct pci_cap_hdr *last;
540 	struct pci_device_header *hdr = &vdev->pci.hdr;
541 
542 	cap->next = 0;
543 
544 	if (!hdr->capabilities) {
545 		hdr->capabilities = pos;
546 		hdr->status |= PCI_STATUS_CAP_LIST;
547 	} else {
548 		last = PCI_CAP(virt_hdr, hdr->capabilities);
549 
550 		while (last->next)
551 			last = PCI_CAP(virt_hdr, last->next);
552 
553 		last->next = pos;
554 	}
555 
556 	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));
557 
558 	return 0;
559 }
560 
561 static int vfio_pci_parse_caps(struct vfio_device *vdev)
562 {
563 	int ret;
564 	size_t size;
565 	u8 pos, next;
566 	struct pci_cap_hdr *cap;
567 	u8 virt_hdr[PCI_DEV_CFG_SIZE];
568 	struct vfio_pci_device *pdev = &vdev->pci;
569 
570 	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
571 		return 0;
572 
573 	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);
574 
575 	pos = pdev->hdr.capabilities & ~3;
576 
577 	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
578 	pdev->hdr.capabilities = 0;
579 
580 	for (; pos; pos = next) {
581 		cap = PCI_CAP(&pdev->hdr, pos);
582 		next = cap->next;
583 
584 		switch (cap->type) {
585 		case PCI_CAP_ID_MSIX:
586 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
587 			if (ret)
588 				return ret;
589 
590 			pdev->msix.pos = pos;
591 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
592 			break;
593 		case PCI_CAP_ID_MSI:
594 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
595 			if (ret)
596 				return ret;
597 
598 			pdev->msi.pos = pos;
599 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
600 			break;
601 		}
602 	}
603 
604 	/* Wipe remaining capabilities */
605 	pos = PCI_STD_HEADER_SIZEOF;
606 	size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF;
607 	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
608 
609 	return 0;
610 }
611 
612 static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
613 {
614 	ssize_t sz = PCI_DEV_CFG_SIZE;
615 	struct vfio_region_info *info;
616 	struct vfio_pci_device *pdev = &vdev->pci;
617 
618 	if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
619 		vfio_dev_err(vdev, "Config Space not found");
620 		return -ENODEV;
621 	}
622 
623 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
624 	*info = (struct vfio_region_info) {
625 			.argsz = sizeof(*info),
626 			.index = VFIO_PCI_CONFIG_REGION_INDEX,
627 	};
628 
629 	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
630 	if (!info->size) {
631 		vfio_dev_err(vdev, "Config Space has size zero?!");
632 		return -EINVAL;
633 	}
634 
635 	/* Read standard headers and capabilities */
636 	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
637 		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
638 		return -EIO;
639 	}
640 
641 	/* Strip bit 7, that indicates multifunction */
642 	pdev->hdr.header_type &= 0x7f;
643 
644 	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
645 		vfio_dev_err(vdev, "unsupported header type %u",
646 			     pdev->hdr.header_type);
647 		return -EOPNOTSUPP;
648 	}
649 
650 	if (pdev->hdr.irq_pin)
651 		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;
652 
653 	vfio_pci_parse_caps(vdev);
654 
655 	return 0;
656 }
657 
658 static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
659 {
660 	int i;
661 	u64 base;
662 	ssize_t hdr_sz;
663 	struct msix_cap *msix;
664 	struct vfio_region_info *info;
665 	struct vfio_pci_device *pdev = &vdev->pci;
666 	struct vfio_region *region;
667 
668 	/* Initialise the BARs */
669 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
670 		if ((u32)i == vdev->info.num_regions)
671 			break;
672 
673 		region = &vdev->regions[i];
674 		/* Construct a fake reg to match what we've mapped. */
675 		if (region->is_ioport) {
676 			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
677 				PCI_BASE_ADDRESS_SPACE_IO;
678 		} else {
679 			base = (region->guest_phys_addr &
680 				PCI_BASE_ADDRESS_MEM_MASK) |
681 				PCI_BASE_ADDRESS_SPACE_MEMORY;
682 		}
683 
684 		pdev->hdr.bar[i] = base;
685 
686 		if (!base)
687 			continue;
688 
689 		pdev->hdr.bar_size[i] = region->info.size;
690 	}
691 
692 	/* I really can't be bothered to support cardbus. */
693 	pdev->hdr.card_bus = 0;
694 
695 	/*
696 	 * Nuke the expansion ROM for now. If we want to do this properly,
697 	 * we need to save its size somewhere and map into the guest.
698 	 */
699 	pdev->hdr.exp_rom_bar = 0;
700 
701 	/* Plumb in our fake MSI-X capability, if we have it. */
702 	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
703 	if (msix) {
704 		/* Add a shortcut to the PBA region for the MMIO handler */
705 		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
706 		pdev->msix_pba.offset = vdev->regions[pba_index].info.offset +
707 					(msix->pba_offset & PCI_MSIX_PBA_OFFSET);
708 
709 		/* Tidy up the capability */
710 		msix->table_offset &= PCI_MSIX_TABLE_BIR;
711 		msix->pba_offset &= PCI_MSIX_PBA_BIR;
712 		if (pdev->msix_table.bar == pdev->msix_pba.bar)
713 			msix->pba_offset |= pdev->msix_table.size &
714 					    PCI_MSIX_PBA_OFFSET;
715 	}
716 
717 	/* Install our fake Configuration Space */
718 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
719 	hdr_sz = PCI_DEV_CFG_SIZE;
720 	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
721 		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
722 			     hdr_sz);
723 		return -EIO;
724 	}
725 
726 	/* Register callbacks for cfg accesses */
727 	pdev->hdr.cfg_ops = (struct pci_config_operations) {
728 		.read	= vfio_pci_cfg_read,
729 		.write	= vfio_pci_cfg_write,
730 	};
731 
732 	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;
733 
734 	return 0;
735 }
736 
737 static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index,
738 				    struct vfio_region_info *info)
739 {
740 	int ret;
741 
742 	*info = (struct vfio_region_info) {
743 		.argsz = sizeof(*info),
744 		.index = index,
745 	};
746 
747 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
748 	if (ret) {
749 		ret = -errno;
750 		vfio_dev_err(vdev, "cannot get info for BAR %u", index);
751 		return ret;
752 	}
753 
754 	if (info->size && !is_power_of_two(info->size)) {
755 		vfio_dev_err(vdev, "region is not power of two: 0x%llx",
756 				info->size);
757 		return -EINVAL;
758 	}
759 
760 	return 0;
761 }
762 
763 static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
764 {
765 	int ret;
766 	size_t i;
767 	size_t map_size;
768 	size_t nr_entries;
769 	struct vfio_pci_msi_entry *entries;
770 	struct vfio_pci_device *pdev = &vdev->pci;
771 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
772 	struct vfio_pci_msix_table *table = &pdev->msix_table;
773 	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
774 	struct vfio_region_info info;
775 
776 	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
777 	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;
778 
779 	/*
780 	 * KVM needs memory regions to be multiple of and aligned on PAGE_SIZE.
781 	 */
782 	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
783 	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
784 	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), PAGE_SIZE);
785 
786 	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
787 	if (!entries)
788 		return -ENOMEM;
789 
790 	for (i = 0; i < nr_entries; i++)
791 		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
792 
793 	ret = vfio_pci_get_region_info(vdev, table->bar, &info);
794 	if (ret)
795 		return ret;
796 	if (!info.size)
797 		return -EINVAL;
798 	map_size = info.size;
799 
800 	if (table->bar != pba->bar) {
801 		ret = vfio_pci_get_region_info(vdev, pba->bar, &info);
802 		if (ret)
803 			return ret;
804 		if (!info.size)
805 			return -EINVAL;
806 		map_size += info.size;
807 	}
808 
809 	/*
810 	 * To ease MSI-X cap configuration in case they share the same BAR,
811 	 * collapse table and pending array. The size of the BAR regions must be
812 	 * powers of two.
813 	 */
814 	map_size = ALIGN(map_size, PAGE_SIZE);
815 	table->guest_phys_addr = pci_get_mmio_block(map_size);
816 	if (!table->guest_phys_addr) {
817 		pr_err("cannot allocate MMIO space");
818 		ret = -ENOMEM;
819 		goto out_free;
820 	}
821 	pba->guest_phys_addr = table->guest_phys_addr + table->size;
822 
823 	ret = kvm__register_mmio(kvm, table->guest_phys_addr, table->size,
824 				 false, vfio_pci_msix_table_access, pdev);
825 	if (ret < 0)
826 		goto out_free;
827 
828 	/*
829 	 * We could map the physical PBA directly into the guest, but it's
830 	 * likely smaller than a page, and we can only hand full pages to the
831 	 * guest. Even though the PCI spec disallows sharing a page used for
832 	 * MSI-X with any other resource, it allows to share the same page
833 	 * between MSI-X table and PBA. For the sake of isolation, create a
834 	 * virtual PBA.
835 	 */
836 	ret = kvm__register_mmio(kvm, pba->guest_phys_addr, pba->size, false,
837 				 vfio_pci_msix_pba_access, pdev);
838 	if (ret < 0)
839 		goto out_free;
840 
841 	pdev->msix.entries = entries;
842 	pdev->msix.nr_entries = nr_entries;
843 
844 	return 0;
845 
846 out_free:
847 	free(entries);
848 
849 	return ret;
850 }
851 
852 static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
853 {
854 	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);
855 
856 	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1),
857 	pdev->msi.entries = calloc(pdev->msi.nr_entries,
858 				   sizeof(struct vfio_pci_msi_entry));
859 	if (!pdev->msi.entries)
860 		return -ENOMEM;
861 
862 	return 0;
863 }
864 
865 static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
866 				  size_t nr)
867 {
868 	int ret;
869 	u32 bar;
870 	size_t map_size;
871 	struct vfio_pci_device *pdev = &vdev->pci;
872 	struct vfio_region *region;
873 
874 	if (nr >= vdev->info.num_regions)
875 		return 0;
876 
877 	region = &vdev->regions[nr];
878 	bar = pdev->hdr.bar[nr];
879 
880 	region->vdev = vdev;
881 	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);
882 
883 	ret = vfio_pci_get_region_info(vdev, nr, &region->info);
884 	if (ret)
885 		return ret;
886 
887 	/* Ignore invalid or unimplemented regions */
888 	if (!region->info.size)
889 		return 0;
890 
891 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
892 		/* Trap and emulate MSI-X table */
893 		if (nr == pdev->msix_table.bar) {
894 			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
895 			return 0;
896 		} else if (nr == pdev->msix_pba.bar) {
897 			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
898 			return 0;
899 		}
900 	}
901 
902 	if (region->is_ioport) {
903 		region->port_base = pci_get_io_port_block(region->info.size);
904 	} else {
905 		/* Grab some MMIO space in the guest */
906 		map_size = ALIGN(region->info.size, PAGE_SIZE);
907 		region->guest_phys_addr = pci_get_mmio_block(map_size);
908 	}
909 
910 	/* Map the BARs into the guest or setup a trap region. */
911 	ret = vfio_map_region(kvm, vdev, region);
912 	if (ret)
913 		return ret;
914 
915 	return 0;
916 }
917 
918 static int vfio_pci_configure_dev_regions(struct kvm *kvm,
919 					  struct vfio_device *vdev)
920 {
921 	int ret;
922 	u32 bar;
923 	size_t i;
924 	bool is_64bit = false;
925 	struct vfio_pci_device *pdev = &vdev->pci;
926 
927 	ret = vfio_pci_parse_cfg_space(vdev);
928 	if (ret)
929 		return ret;
930 
931 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
932 		ret = vfio_pci_create_msix_table(kvm, vdev);
933 		if (ret)
934 			return ret;
935 	}
936 
937 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
938 		ret = vfio_pci_create_msi_cap(kvm, pdev);
939 		if (ret)
940 			return ret;
941 	}
942 
943 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
944 		/* Ignore top half of 64-bit BAR */
945 		if (is_64bit) {
946 			is_64bit = false;
947 			continue;
948 		}
949 
950 		ret = vfio_pci_configure_bar(kvm, vdev, i);
951 		if (ret)
952 			return ret;
953 
954 		bar = pdev->hdr.bar[i];
955 		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
956 			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
957 			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
958 	}
959 
960 	/* We've configured the BARs, fake up a Configuration Space */
961 	return vfio_pci_fixup_cfg_space(vdev);
962 }
963 
964 /*
965  * Attempt to update the FD limit, if opening an eventfd for each IRQ vector
966  * would hit the limit. Which is likely to happen when a device uses 2048 MSIs.
967  */
968 static int vfio_pci_reserve_irq_fds(size_t num)
969 {
970 	/*
971 	 * I counted around 27 fds under normal load. Let's add 100 for good
972 	 * measure.
973 	 */
974 	static size_t needed = 128;
975 	struct rlimit fd_limit, new_limit;
976 
977 	needed += num;
978 
979 	if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
980 		perror("getrlimit(RLIMIT_NOFILE)");
981 		return 0;
982 	}
983 
984 	if (fd_limit.rlim_cur >= needed)
985 		return 0;
986 
987 	new_limit.rlim_cur = needed;
988 
989 	if (fd_limit.rlim_max < needed)
990 		/* Try to bump hard limit (root only) */
991 		new_limit.rlim_max = needed;
992 	else
993 		new_limit.rlim_max = fd_limit.rlim_max;
994 
995 	if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
996 		perror("setrlimit(RLIMIT_NOFILE)");
997 		pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
998 			   (size_t)(needed - fd_limit.rlim_cur));
999 	}
1000 
1001 	return 0;
1002 }
1003 
1004 static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
1005 			     struct vfio_pci_msi_common *msis)
1006 {
1007 	int ret;
1008 	size_t i;
1009 	int *eventfds;
1010 	size_t irq_set_size;
1011 	struct vfio_pci_msi_entry *entry;
1012 	size_t nr_entries = msis->nr_entries;
1013 
1014 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
1015 	if (ret || msis->info.count == 0) {
1016 		vfio_dev_err(vdev, "no MSI reported by VFIO");
1017 		return -ENODEV;
1018 	}
1019 
1020 	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
1021 		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
1022 		return -EINVAL;
1023 	}
1024 
1025 	if (msis->info.count != nr_entries) {
1026 		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
1027 		return -EINVAL;
1028 	}
1029 
1030 	mutex_init(&msis->mutex);
1031 
1032 	vfio_pci_reserve_irq_fds(nr_entries);
1033 
1034 	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
1035 	msis->irq_set = malloc(irq_set_size);
1036 	if (!msis->irq_set)
1037 		return -ENOMEM;
1038 
1039 	*msis->irq_set = (struct vfio_irq_set) {
1040 		.argsz	= irq_set_size,
1041 		.flags 	= VFIO_IRQ_SET_DATA_EVENTFD |
1042 			  VFIO_IRQ_SET_ACTION_TRIGGER,
1043 		.index 	= msis->info.index,
1044 		.start 	= 0,
1045 		.count 	= nr_entries,
1046 	};
1047 
1048 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
1049 
1050 	for (i = 0; i < nr_entries; i++) {
1051 		entry = &msis->entries[i];
1052 		entry->gsi = -1;
1053 		entry->eventfd = -1;
1054 		msi_set_masked(entry->virt_state, true);
1055 		msi_set_masked(entry->phys_state, true);
1056 		eventfds[i] = -1;
1057 	}
1058 
1059 	return 0;
1060 }
1061 
1062 static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
1063 {
1064 	struct vfio_pci_device *pdev = &vdev->pci;
1065 	int gsi = pdev->intx_gsi;
1066 	struct vfio_irq_set irq_set = {
1067 		.argsz	= sizeof(irq_set),
1068 		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
1069 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1070 	};
1071 
1072 	if (pdev->intx_fd == -1)
1073 		return;
1074 
1075 	pr_debug("user requested MSI, disabling INTx %d", gsi);
1076 
1077 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
1078 	irq__del_irqfd(kvm, gsi, pdev->intx_fd);
1079 
1080 	close(pdev->intx_fd);
1081 	close(pdev->unmask_fd);
1082 	pdev->intx_fd = -1;
1083 }
1084 
1085 static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
1086 {
1087 	int ret;
1088 	int trigger_fd, unmask_fd;
1089 	union vfio_irq_eventfd	trigger;
1090 	union vfio_irq_eventfd	unmask;
1091 	struct vfio_pci_device *pdev = &vdev->pci;
1092 	int gsi = pdev->intx_gsi;
1093 
1094 	if (pdev->intx_fd != -1)
1095 		return 0;
1096 
1097 	/*
1098 	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
1099 	 * signals an interrupt from host to guest, and unmask_fd signals the
1100 	 * deassertion of the line from guest to host.
1101 	 */
1102 	trigger_fd = eventfd(0, 0);
1103 	if (trigger_fd < 0) {
1104 		vfio_dev_err(vdev, "failed to create trigger eventfd");
1105 		return trigger_fd;
1106 	}
1107 
1108 	unmask_fd = eventfd(0, 0);
1109 	if (unmask_fd < 0) {
1110 		vfio_dev_err(vdev, "failed to create unmask eventfd");
1111 		close(trigger_fd);
1112 		return unmask_fd;
1113 	}
1114 
1115 	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
1116 	if (ret)
1117 		goto err_close;
1118 
1119 	trigger.irq = (struct vfio_irq_set) {
1120 		.argsz	= sizeof(trigger),
1121 		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
1122 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1123 		.start	= 0,
1124 		.count	= 1,
1125 	};
1126 	set_vfio_irq_eventd_payload(&trigger, trigger_fd);
1127 
1128 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
1129 	if (ret < 0) {
1130 		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
1131 		goto err_delete_line;
1132 	}
1133 
1134 	unmask.irq = (struct vfio_irq_set) {
1135 		.argsz	= sizeof(unmask),
1136 		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
1137 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1138 		.start	= 0,
1139 		.count	= 1,
1140 	};
1141 	set_vfio_irq_eventd_payload(&unmask, unmask_fd);
1142 
1143 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
1144 	if (ret < 0) {
1145 		vfio_dev_err(vdev, "failed to setup unmask IRQ");
1146 		goto err_remove_event;
1147 	}
1148 
1149 	pdev->intx_fd = trigger_fd;
1150 	pdev->unmask_fd = unmask_fd;
1151 
1152 	return 0;
1153 
1154 err_remove_event:
1155 	/* Remove trigger event */
1156 	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
1157 	trigger.irq.count = 0;
1158 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
1159 
1160 err_delete_line:
1161 	irq__del_irqfd(kvm, gsi, trigger_fd);
1162 
1163 err_close:
1164 	close(trigger_fd);
1165 	close(unmask_fd);
1166 	return ret;
1167 }
1168 
1169 static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
1170 {
1171 	int ret;
1172 	struct vfio_pci_device *pdev = &vdev->pci;
1173 	struct vfio_irq_info irq_info = {
1174 		.argsz = sizeof(irq_info),
1175 		.index = VFIO_PCI_INTX_IRQ_INDEX,
1176 	};
1177 
1178 	vfio_pci_reserve_irq_fds(2);
1179 
1180 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
1181 	if (ret || irq_info.count == 0) {
1182 		vfio_dev_err(vdev, "no INTx reported by VFIO");
1183 		return -ENODEV;
1184 	}
1185 
1186 	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
1187 		vfio_dev_err(vdev, "interrupt not eventfd capable");
1188 		return -EINVAL;
1189 	}
1190 
1191 	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
1192 		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
1193 		return -EINVAL;
1194 	}
1195 
1196 	/* Guest is going to ovewrite our irq_line... */
1197 	pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
1198 
1199 	pdev->intx_fd = -1;
1200 
1201 	return 0;
1202 }
1203 
1204 static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
1205 {
1206 	int ret = 0;
1207 	struct vfio_pci_device *pdev = &vdev->pci;
1208 
1209 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1210 		pdev->msix.info = (struct vfio_irq_info) {
1211 			.argsz = sizeof(pdev->msix.info),
1212 			.index = VFIO_PCI_MSIX_IRQ_INDEX,
1213 		};
1214 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
1215 		if (ret)
1216 			return ret;
1217 	}
1218 
1219 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
1220 		pdev->msi.info = (struct vfio_irq_info) {
1221 			.argsz = sizeof(pdev->msi.info),
1222 			.index = VFIO_PCI_MSI_IRQ_INDEX,
1223 		};
1224 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
1225 		if (ret)
1226 			return ret;
1227 	}
1228 
1229 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
1230 		pci__assign_irq(&vdev->pci.hdr);
1231 
1232 		ret = vfio_pci_init_intx(kvm, vdev);
1233 		if (ret)
1234 			return ret;
1235 
1236 		ret = vfio_pci_enable_intx(kvm, vdev);
1237 	}
1238 
1239 	return ret;
1240 }
1241 
1242 int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
1243 {
1244 	int ret;
1245 
1246 	ret = vfio_pci_configure_dev_regions(kvm, vdev);
1247 	if (ret) {
1248 		vfio_dev_err(vdev, "failed to configure regions");
1249 		return ret;
1250 	}
1251 
1252 	vdev->dev_hdr = (struct device_header) {
1253 		.bus_type	= DEVICE_BUS_PCI,
1254 		.data		= &vdev->pci.hdr,
1255 	};
1256 
1257 	ret = device__register(&vdev->dev_hdr);
1258 	if (ret) {
1259 		vfio_dev_err(vdev, "failed to register VFIO device");
1260 		return ret;
1261 	}
1262 
1263 	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
1264 	if (ret) {
1265 		vfio_dev_err(vdev, "failed to configure IRQs");
1266 		return ret;
1267 	}
1268 
1269 	return 0;
1270 }
1271 
1272 void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
1273 {
1274 	size_t i;
1275 	struct vfio_pci_device *pdev = &vdev->pci;
1276 
1277 	for (i = 0; i < vdev->info.num_regions; i++)
1278 		vfio_unmap_region(kvm, &vdev->regions[i]);
1279 
1280 	device__unregister(&vdev->dev_hdr);
1281 
1282 	free(pdev->msix.irq_set);
1283 	free(pdev->msix.entries);
1284 	free(pdev->msi.irq_set);
1285 	free(pdev->msi.entries);
1286 }
1287