/* kvmtool vfio/pci.c (revision c0c45eed4f3fb799764979dec5cfb399071d6916) */
#include "kvm/irq.h"
#include "kvm/kvm.h"
#include "kvm/kvm-cpu.h"
#include "kvm/vfio.h"

#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <sys/resource.h>
#include <sys/time.h>

/* Wrapper around UAPI vfio_irq_set */
union vfio_irq_eventfd {
	struct vfio_irq_set	irq;
	u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)];
};

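/*
 * The eventfd payload lives in vfio_irq_set's variable-length data[] array,
 * which is declared as raw bytes, so the fd is copied in with memcpy()
 * rather than through a typed pointer.
 */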
static void set_vfio_irq_eventfd_payload(union vfio_irq_eventfd *evfd, int fd)
{
	memcpy(&evfd->irq.data, &fd, sizeof(fd));
}

#define msi_is_enabled(state)		((state) & VFIO_PCI_MSI_STATE_ENABLED)
#define msi_is_masked(state)		((state) & VFIO_PCI_MSI_STATE_MASKED)
#define msi_is_empty(state)		((state) & VFIO_PCI_MSI_STATE_EMPTY)

#define msi_update_state(state, val, bit)				\
	(state) = (val) ? (state) | (bit) : (state) & ~(bit)
#define msi_set_enabled(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
#define msi_set_masked(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
#define msi_set_empty(state, val)					\
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)
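
/*
 * Each MSI/MSI-X capability tracks two state words: virt_state, what the
 * guest has programmed, and phys_state, what has actually been applied to
 * the physical device through VFIO. The enabled/masked/empty bits are
 * reconciled lazily, when the guest pokes the capability or a vector.
 */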

static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);

static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
				bool msix)
{
	size_t i;
	int ret = 0;
	int *eventfds;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	union vfio_irq_eventfd single = {
		.irq = {
			.argsz	= sizeof(single),
			.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
				  VFIO_IRQ_SET_ACTION_TRIGGER,
			.index	= msis->info.index,
			.count	= 1,
		},
	};

	if (!msi_is_enabled(msis->virt_state))
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		/*
		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
		 * time. Since INTx has to be enabled from the start (we don't
		 * have a reliable way to know when the guest starts using it),
		 * disable it now.
		 */
		vfio_pci_disable_intx(kvm, vdev);

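	/* The eventfd array lives in irq_set's variable-length payload. */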
	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

	/*
	 * Initial registration of the full range. This enables the physical
	 * MSI/MSI-X capability, which might have desired side effects. For
	 * instance when assigning virtio legacy devices, enabling the MSI
	 * capability modifies the config space layout!
	 *
	 * As an optimization, only update MSIs when the guest unmasks the
	 * capability. This greatly reduces the initialization time for a
	 * Linux guest with 2048+ MSIs. A Linux guest starts by enabling the
	 * MSI-X cap masked, then fills individual vectors, then unmasks the
	 * whole function. So we only do one VFIO ioctl when enabling for the
	 * first time, and then one when unmasking.
	 *
	 * phys_state is empty when the capability is enabled but no vector
	 * has been registered via SET_IRQS yet.
	 */
	if (!msi_is_enabled(msis->phys_state) ||
	    (!msi_is_masked(msis->virt_state) &&
	     msi_is_empty(msis->phys_state))) {
		bool empty = true;

		for (i = 0; i < msis->nr_entries; i++) {
			eventfds[i] = msis->entries[i].gsi >= 0 ?
				      msis->entries[i].eventfd : -1;

			if (eventfds[i] >= 0)
				empty = false;
		}

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(multi)");
			return ret;
		}

		msi_set_enabled(msis->phys_state, true);
		msi_set_empty(msis->phys_state, empty);

		return 0;
	}

	if (msi_is_masked(msis->virt_state)) {
		/* TODO: if phys_state is neither empty nor masked, mask all vectors */
		return 0;
	}

	/* Update individual vectors to avoid breaking those in use */
	for (i = 0; i < msis->nr_entries; i++) {
		struct vfio_pci_msi_entry *entry = &msis->entries[i];
		int fd = entry->gsi >= 0 ? entry->eventfd : -1;

		if (fd == eventfds[i])
			continue;

		single.irq.start = i;
		set_vfio_irq_eventfd_payload(&single, fd);

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(single)");
			break;
		}

		eventfds[i] = fd;

		if (msi_is_empty(msis->phys_state) && fd >= 0)
			msi_set_empty(msis->phys_state, false);
	}

	return ret;
}
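
/*
 * For reference, the SET_IRQS argument built above has this layout (a sketch
 * for a hypothetical 4-vector device, not taken from the UAPI headers):
 *
 *	struct vfio_irq_set hdr;	// argsz, flags, index, start = 0, count = 4
 *	int eventfds[4];		// one fd per vector, -1 leaves a vector untriggered
 */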

static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
				 bool msix)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	struct vfio_irq_set irq_set = {
		.argsz	= sizeof(irq_set),
		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= msis->info.index,
		.start	= 0,
		.count	= 0,
	};

	if (!msi_is_enabled(msis->phys_state))
		return 0;

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	if (ret < 0) {
		perror("VFIO_DEVICE_SET_IRQS(NONE)");
		return ret;
	}

	msi_set_enabled(msis->phys_state, false);
	msi_set_empty(msis->phys_state, true);

	/*
	 * This may be called because the guest PCI driver detected an MSI
	 * delivery failure and wants to roll back to INTx mode. Re-enable
	 * INTx here if the device supports it.
	 */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		ret = vfio_pci_enable_intx(kvm, vdev);

	return ret >= 0 ? 0 : ret;
}

static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
				     struct vfio_pci_msi_entry *entry)
{
	int ret;

	if (entry->eventfd < 0) {
		entry->eventfd = eventfd(0, 0);
		if (entry->eventfd < 0) {
			ret = -errno;
			vfio_dev_err(vdev, "cannot create eventfd");
			return ret;
		}
	}

	/* Allocate IRQ if necessary */
	if (entry->gsi < 0) {
		ret = irq__add_msix_route(kvm, &entry->config.msg,
					  vdev->dev_hdr.dev_num << 3);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot create MSI-X route");
			return ret;
		}
		entry->gsi = ret;
	} else {
		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
	}

	/*
	 * MSI masking is unimplemented in VFIO, so we have to handle it by
	 * disabling/enabling the IRQ route instead. We do it on the KVM side
	 * rather than the VFIO side, because:
	 * - it is 8x faster;
	 * - it decouples the masking logic from the capability state;
	 * - in the masked state, after removing the irqfd route, we could
	 *   easily plug the eventfd into a local handler, in order to serve
	 *   Pending Bit reads to the guest.
	 *
	 * So entry->phys_state is masked when there is no active irqfd route.
	 */
	if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
		return 0;

	if (msi_is_masked(entry->phys_state)) {
		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot setup irqfd");
			return ret;
		}
	} else {
		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
	}

	msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));

	return 0;
}

static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				     u32 len, u8 is_write, void *ptr)
{
	struct vfio_pci_device *pdev = ptr;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	u64 offset = addr - pba->guest_phys_addr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	if (is_write)
		return;

	/*
	 * TODO: emulate the PBA. The hardware MSI-X vectors are never masked,
	 * so reading the physical PBA is useless here. Note that Linux
	 * doesn't use the PBA.
	 */
	if (pread(vdev->fd, data, len, pba->offset + offset) != (ssize_t)len)
		vfio_dev_err(vdev, "cannot access MSIX PBA");
}

static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				       u32 len, u8 is_write, void *ptr)
{
	struct kvm *kvm = vcpu->kvm;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = ptr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	u64 offset = addr - pdev->msix_table.guest_phys_addr;

	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
	off_t field = offset % PCI_MSIX_ENTRY_SIZE;
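
	/*
	 * Each MSI-X table entry is PCI_MSIX_ENTRY_SIZE (16) bytes: message
	 * address low and high, message data, then vector control. 'vector'
	 * selects the entry, 'field' the register within it.
	 */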

	/*
	 * The PCI spec says that software must use aligned 4- or 8-byte
	 * accesses for the MSI-X tables.
	 */
	if ((len != 4 && len != 8) || addr & (len - 1)) {
		vfio_dev_warn(vdev, "invalid MSI-X table access");
		return;
	}

	entry = &pdev->msix.entries[vector];

	mutex_lock(&pdev->msix.mutex);

	if (!is_write) {
		memcpy(data, (void *)&entry->config + field, len);
		goto out_unlock;
	}

	memcpy((void *)&entry->config + field, data, len);

	/*
	 * Check if access touched the vector control register, which is at the
	 * end of the MSI-X entry.
	 */
	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
		goto out_unlock;

	msi_set_masked(entry->virt_state, entry->config.ctrl &
		       PCI_MSIX_ENTRY_CTRL_MASKBIT);

	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
		/* Not much we can do here. */
		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");

out_unlock:
	mutex_unlock(&pdev->msix.mutex);
}

static void vfio_pci_msix_cap_write(struct kvm *kvm,
				    struct vfio_device *vdev, u8 off,
				    void *data, int sz)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	off_t enable_pos = PCI_MSIX_FLAGS + 1;
	bool enable;
	u16 flags;

	off -= pdev->msix.pos;

	/* Check if access intersects with the MSI-X Enable bit */
	if (off > enable_pos || off + sz <= enable_pos)
		return;

	/* Read byte that contains the Enable bit */
	flags = *(u8 *)(data + enable_pos - off) << 8;
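
	/*
	 * The Enable and Function Mask bits live in the upper byte of the
	 * Message Control word, so shifting the byte up by 8 lets us test
	 * PCI_MSIX_FLAGS_ENABLE and PCI_MSIX_FLAGS_MASKALL directly.
	 */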

	mutex_lock(&pdev->msix.mutex);

	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
	enable = flags & PCI_MSIX_FLAGS_ENABLE;
	msi_set_enabled(pdev->msix.virt_state, enable);

	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");
	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot disable MSIX");

	mutex_unlock(&pdev->msix.mutex);
}

static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
				     u8 off, u8 *data, u32 sz)
{
	size_t i;
	u32 mask = 0;
	size_t mask_pos, start, limit;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
		return 0;

	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
		mask_pos = PCI_MSI_MASK_64;
	else
		mask_pos = PCI_MSI_MASK_32;

	if (off >= mask_pos + 4 || off + sz <= mask_pos)
		return 0;

	/* Set mask to current state */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		entry = &pdev->msi.entries[i];
		mask |= !!msi_is_masked(entry->virt_state) << i;
	}

	/* Update the mask over the intersection of the access and the register */
	start = max_t(size_t, off, mask_pos);
	limit = min_t(size_t, off + sz, mask_pos + 4);

	memcpy((void *)&mask + start - mask_pos, data + start - off,
	       limit - start);

	/* Update states if necessary */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		bool masked = mask & (1 << i);

		entry = &pdev->msi.entries[i];
		if (masked != msi_is_masked(entry->virt_state)) {
			msi_set_masked(entry->virt_state, masked);
			vfio_pci_update_msi_entry(kvm, vdev, entry);
		}
	}

	return 1;
}

static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
				   u8 off, u8 *data, u32 sz)
{
	u8 ctrl;
	struct msi_msg msg;
	size_t i, nr_vectors;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	off -= pdev->msi.pos;

	mutex_lock(&pdev->msi.mutex);

	/* Check if the guest is trying to update mask bits */
	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
		goto out_unlock;

	/* Only modify routes when the guest pokes the enable bit */
	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
		goto out_unlock;

	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);

	msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE);

	if (!msi_is_enabled(pdev->msi.virt_state)) {
		vfio_pci_disable_msis(kvm, vdev, false);
		goto out_unlock;
	}

	/* Create routes for the requested vectors */
	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);
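
	/*
	 * nr_vectors above decodes PCI_MSI_FLAGS_QSIZE, the Multiple Message
	 * Enable field (bits 6:4 of Message Control), whose value is the
	 * log2 of the number of vectors the guest just enabled.
	 */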

	msg.address_lo = msi_cap_64->address_lo;
	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
		msg.address_hi = msi_cap_64->address_hi;
		msg.data = msi_cap_64->data;
	} else {
		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
		msg.address_hi = 0;
		msg.data = msi_cap_32->data;
	}

	for (i = 0; i < nr_vectors; i++) {
		entry = &pdev->msi.entries[i];

		/*
		 * Set the MSI data value as required by the PCI local
		 * bus specification, MSI capability, "Message Data".
		 */
		msg.data &= ~(nr_vectors - 1);
		msg.data |= i;

		entry->config.msg = msg;
		vfio_pci_update_msi_entry(kvm, vdev, entry);
	}

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, false))
		vfio_dev_err(vdev, "cannot enable MSI");

out_unlock:
	mutex_unlock(&pdev->msi.mutex);
}

static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
			      u8 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	char base[sz];

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	/* Dummy read in case of side-effects */
	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}
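
/*
 * Note that the read handler never fills @data: the generic PCI emulation is
 * expected to have already served the value from the shadow header, so the
 * pread() above only exists to trigger read side effects on the device.
 */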

static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
			       u8 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	void *base = pci_hdr;

	if (offset == PCI_ROM_ADDRESS)
		return;

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "failed to write %d bytes to Configuration Space at 0x%x",
			      sz, offset);

	/* Handle MSI write now, since it might update the hardware capability */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);

	if (pread(vdev->fd, base + offset, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
{
	size_t size = 10;
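
	/*
	 * Base layout: ID, next pointer, Message Control, 32-bit Message
	 * Address and Message Data = 10 bytes. A 64-bit address adds 4 bytes;
	 * per-vector masking adds the 4-byte Mask and Pending registers plus
	 * 2 bytes of padding after Message Data.
	 */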

	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
		size += 4;
	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
		size += 10;

	return size;
}

static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
{
	switch (cap_hdr->type) {
	case PCI_CAP_ID_MSIX:
		return PCI_CAP_MSIX_SIZEOF;
	case PCI_CAP_ID_MSI:
		return vfio_pci_msi_cap_size((void *)cap_hdr);
	default:
		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
		return 0;
	}
}

static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
			    struct pci_cap_hdr *cap, off_t pos)
{
	struct pci_cap_hdr *last;
	struct pci_device_header *hdr = &vdev->pci.hdr;

	cap->next = 0;

	if (!hdr->capabilities) {
		hdr->capabilities = pos;
		hdr->status |= PCI_STATUS_CAP_LIST;
	} else {
		last = PCI_CAP(virt_hdr, hdr->capabilities);

		while (last->next)
			last = PCI_CAP(virt_hdr, last->next);

		last->next = pos;
	}

	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));

	return 0;
}

static int vfio_pci_parse_caps(struct vfio_device *vdev)
{
	int ret;
	size_t size;
	u8 pos, next;
	struct pci_cap_hdr *cap;
	u8 virt_hdr[PCI_DEV_CFG_SIZE];
	struct vfio_pci_device *pdev = &vdev->pci;

	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
		return 0;

	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);

	pos = pdev->hdr.capabilities & ~3;

	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
	pdev->hdr.capabilities = 0;

	for (; pos; pos = next) {
		cap = PCI_CAP(&pdev->hdr, pos);
		next = cap->next;

		switch (cap->type) {
		case PCI_CAP_ID_MSIX:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msix.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
			break;
		case PCI_CAP_ID_MSI:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msi.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
			break;
		}
	}

	/* Wipe remaining capabilities */
	pos = PCI_STD_HEADER_SIZEOF;
	size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF;
	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);

	return 0;
}

static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
{
	ssize_t sz = PCI_DEV_CFG_SIZE;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (vdev->info.num_regions <= VFIO_PCI_CONFIG_REGION_INDEX) {
		vfio_dev_err(vdev, "Config Space not found");
		return -ENODEV;
	}

	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	*info = (struct vfio_region_info) {
			.argsz = sizeof(*info),
			.index = VFIO_PCI_CONFIG_REGION_INDEX,
	};

	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
	if (!info->size) {
		vfio_dev_err(vdev, "Config Space has size zero?!");
		return -EINVAL;
	}

	/* Read standard header and capabilities */
	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
		return -EIO;
	}

	/* Strip bit 7, which indicates a multifunction device */
	pdev->hdr.header_type &= 0x7f;

	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
		vfio_dev_err(vdev, "unsupported header type %u",
			     pdev->hdr.header_type);
		return -EOPNOTSUPP;
	}

	if (pdev->hdr.irq_pin)
		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;

	vfio_pci_parse_caps(vdev);

	return 0;
}

static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
{
	int i;
	u64 base;
	ssize_t hdr_sz;
	struct msix_cap *msix;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region;

	/* Initialise the BARs */
	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		if ((u32)i == vdev->info.num_regions)
			break;

		region = &vdev->regions[i];
		/* Construct a fake reg to match what we've mapped. */
		if (region->is_ioport) {
			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
				PCI_BASE_ADDRESS_SPACE_IO;
		} else {
			base = (region->guest_phys_addr &
				PCI_BASE_ADDRESS_MEM_MASK) |
				PCI_BASE_ADDRESS_SPACE_MEMORY;
		}

		pdev->hdr.bar[i] = base;

		if (!base)
			continue;

		pdev->hdr.bar_size[i] = region->info.size;
	}

	/* I really can't be bothered to support cardbus. */
	pdev->hdr.card_bus = 0;

	/*
	 * Nuke the expansion ROM for now. If we want to do this properly,
	 * we need to save its size somewhere and map it into the guest.
	 */
	pdev->hdr.exp_rom_bar = 0;

	/* Plumb in our fake MSI-X capability, if we have it. */
	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
	if (msix) {
		/* Add a shortcut to the PBA region for the MMIO handler */
		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
		pdev->msix_pba.offset = vdev->regions[pba_index].info.offset +
					(msix->pba_offset & PCI_MSIX_PBA_OFFSET);

		/* Tidy up the capability */
		msix->table_offset &= PCI_MSIX_TABLE_BIR;
		msix->pba_offset &= PCI_MSIX_PBA_BIR;
		if (pdev->msix_table.bar == pdev->msix_pba.bar)
			msix->pba_offset |= pdev->msix_table.size &
					    PCI_MSIX_PBA_OFFSET;
	}

	/* Install our fake Configuration Space */
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	hdr_sz = PCI_DEV_CFG_SIZE;
	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
			     hdr_sz);
		return -EIO;
	}

	/* Register callbacks for cfg accesses */
	pdev->hdr.cfg_ops = (struct pci_config_operations) {
		.read	= vfio_pci_cfg_read,
		.write	= vfio_pci_cfg_write,
	};

	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;

	return 0;
}

static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index,
				    struct vfio_region_info *info)
{
	int ret;

	*info = (struct vfio_region_info) {
		.argsz = sizeof(*info),
		.index = index,
	};

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
	if (ret) {
		ret = -errno;
		vfio_dev_err(vdev, "cannot get info for BAR %u", index);
		return ret;
	}

	if (info->size && !is_power_of_two(info->size)) {
		vfio_dev_err(vdev, "region size is not a power of two: 0x%llx",
				info->size);
		return -EINVAL;
	}

	return 0;
}

static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	size_t i;
	size_t map_size;
	size_t nr_entries;
	struct vfio_pci_msi_entry *entries;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
	struct vfio_region_info info;

	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
	pba->bar = msix->pba_offset & PCI_MSIX_PBA_BIR;

	/*
	 * KVM needs memory regions to be multiples of PAGE_SIZE, and
	 * PAGE_SIZE aligned.
	 */
	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), PAGE_SIZE);
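
	/*
	 * The PBA stores one pending bit per vector, packed in 64-bit words.
	 * Note that DIV_ROUND_UP(nr_entries, 64) counts those words rather
	 * than bytes; with at most 2048 vectors the PAGE_SIZE alignment
	 * absorbs the difference either way.
	 */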

	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
	if (!entries)
		return -ENOMEM;

	for (i = 0; i < nr_entries; i++)
		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;

	ret = vfio_pci_get_region_info(vdev, table->bar, &info);
	if (ret)
		goto out_free;
	if (!info.size) {
		ret = -EINVAL;
		goto out_free;
	}
	map_size = info.size;

	if (table->bar != pba->bar) {
		ret = vfio_pci_get_region_info(vdev, pba->bar, &info);
		if (ret)
			goto out_free;
		if (!info.size) {
			ret = -EINVAL;
			goto out_free;
		}
		map_size += info.size;
	}

	/*
	 * To ease MSI-X cap configuration when the table and the pending
	 * array share the same BAR, collapse them into a single mapping. The
	 * sizes of the BAR regions must be powers of two.
	 */
	map_size = ALIGN(map_size, PAGE_SIZE);
	table->guest_phys_addr = pci_get_mmio_block(map_size);
	if (!table->guest_phys_addr) {
		pr_err("cannot allocate MMIO space");
		ret = -ENOMEM;
		goto out_free;
	}
	pba->guest_phys_addr = table->guest_phys_addr + table->size;

	ret = kvm__register_mmio(kvm, table->guest_phys_addr, table->size,
				 false, vfio_pci_msix_table_access, pdev);
	if (ret < 0)
		goto out_free;

	/*
	 * We could map the physical PBA directly into the guest, but it's
	 * likely smaller than a page, and we can only hand full pages to the
	 * guest. Even though the PCI spec disallows sharing a page used for
	 * MSI-X with any other resource, it does allow the MSI-X table and
	 * the PBA to share the same page. For the sake of isolation, create
	 * a virtual PBA.
	 */
	ret = kvm__register_mmio(kvm, pba->guest_phys_addr, pba->size, false,
				 vfio_pci_msix_pba_access, pdev);
	if (ret < 0)
		goto out_free;

	pdev->msix.entries = entries;
	pdev->msix.nr_entries = nr_entries;

	return 0;

out_free:
	free(entries);

	return ret;
}

static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
{
	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
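	/*
	 * PCI_MSI_FLAGS_QMASK is the Multiple Message Capable field (bits 3:1
	 * of Message Control); its value is the log2 of the number of vectors
	 * the device supports.
	 */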
	pdev->msi.entries = calloc(pdev->msi.nr_entries,
				   sizeof(struct vfio_pci_msi_entry));
	if (!pdev->msi.entries)
		return -ENOMEM;

	return 0;
}

static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
				  size_t nr)
{
	int ret;
	u32 bar;
	size_t map_size;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region;

	if (nr >= vdev->info.num_regions)
		return 0;

	region = &vdev->regions[nr];
	bar = pdev->hdr.bar[nr];

	region->vdev = vdev;
	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);

	ret = vfio_pci_get_region_info(vdev, nr, &region->info);
	if (ret)
		return ret;

	/* Ignore invalid or unimplemented regions */
	if (!region->info.size)
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		/* Trap and emulate MSI-X table */
		if (nr == pdev->msix_table.bar) {
			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
			return 0;
		} else if (nr == pdev->msix_pba.bar) {
			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
			return 0;
		}
	}

	if (!region->is_ioport) {
		/* Grab some MMIO space in the guest */
		map_size = ALIGN(region->info.size, PAGE_SIZE);
		region->guest_phys_addr = pci_get_mmio_block(map_size);
	}

	/* Map the BAR into the guest or set up a trap region. */
	ret = vfio_map_region(kvm, vdev, region);
	if (ret)
		return ret;

	return 0;
}

static int vfio_pci_configure_dev_regions(struct kvm *kvm,
					  struct vfio_device *vdev)
{
	int ret;
	u32 bar;
	size_t i;
	bool is_64bit = false;
	struct vfio_pci_device *pdev = &vdev->pci;

	ret = vfio_pci_parse_cfg_space(vdev);
	if (ret)
		return ret;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		ret = vfio_pci_create_msix_table(kvm, vdev);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		ret = vfio_pci_create_msi_cap(kvm, pdev);
		if (ret)
			return ret;
	}

	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		/* Ignore top half of 64-bit BAR */
		if (is_64bit) {
			is_64bit = false;
			continue;
		}

		ret = vfio_pci_configure_bar(kvm, vdev, i);
		if (ret)
			return ret;

		bar = pdev->hdr.bar[i];
		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
	}

	/* We've configured the BARs, fake up a Configuration Space */
	return vfio_pci_fixup_cfg_space(vdev);
}

/*
 * Attempt to update the FD limit if opening an eventfd for each IRQ vector
 * would hit it, which is likely to happen when a device uses 2048 MSIs.
 */
static int vfio_pci_reserve_irq_fds(size_t num)
{
	/*
	 * I counted around 27 fds under normal load. Let's add 100 for good
	 * measure.
	 */
	static size_t needed = 128;
	struct rlimit fd_limit, new_limit;

	needed += num;

	if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
		perror("getrlimit(RLIMIT_NOFILE)");
		return 0;
	}

	if (fd_limit.rlim_cur >= needed)
		return 0;

	new_limit.rlim_cur = needed;

	if (fd_limit.rlim_max < needed)
		/* Try to bump hard limit (root only) */
		new_limit.rlim_max = needed;
	else
		new_limit.rlim_max = fd_limit.rlim_max;

	if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
		perror("setrlimit(RLIMIT_NOFILE)");
		pr_warning("not enough FDs for full MSI-X support (short by %zu)",
			   (size_t)(needed - fd_limit.rlim_cur));
	}

	return 0;
}

static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
			     struct vfio_pci_msi_common *msis)
{
	int ret;
	size_t i;
	int *eventfds;
	size_t irq_set_size;
	struct vfio_pci_msi_entry *entry;
	size_t nr_entries = msis->nr_entries;

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
	if (ret || msis->info.count == 0) {
		vfio_dev_err(vdev, "no MSI reported by VFIO");
		return -ENODEV;
	}

	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
		return -EINVAL;
	}

	if (msis->info.count != nr_entries) {
		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
		return -EINVAL;
	}

	mutex_init(&msis->mutex);

	vfio_pci_reserve_irq_fds(nr_entries);

	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
	msis->irq_set = malloc(irq_set_size);
	if (!msis->irq_set)
		return -ENOMEM;

	*msis->irq_set = (struct vfio_irq_set) {
		.argsz	= irq_set_size,
		.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
			  VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= msis->info.index,
		.start	= 0,
		.count	= nr_entries,
	};

	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

	for (i = 0; i < nr_entries; i++) {
		entry = &msis->entries[i];
		entry->gsi = -1;
		entry->eventfd = -1;
		msi_set_masked(entry->virt_state, true);
		msi_set_masked(entry->phys_state, true);
		eventfds[i] = -1;
	}

	return 0;
}

static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->intx_gsi;
	struct vfio_irq_set irq_set = {
		.argsz	= sizeof(irq_set),
		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= VFIO_PCI_INTX_IRQ_INDEX,
	};

	if (pdev->intx_fd == -1)
		return;

	pr_debug("user requested MSI, disabling INTx %d", gsi);

	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	irq__del_irqfd(kvm, gsi, pdev->intx_fd);

	close(pdev->intx_fd);
	close(pdev->unmask_fd);
	pdev->intx_fd = -1;
}

static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	int trigger_fd, unmask_fd;
	union vfio_irq_eventfd	trigger;
	union vfio_irq_eventfd	unmask;
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->intx_gsi;

	if (pdev->intx_fd != -1)
		return 0;

	/*
	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
	 * signals an interrupt from host to guest, and unmask_fd signals the
	 * deassertion of the line from guest to host.
	 */
	trigger_fd = eventfd(0, 0);
	if (trigger_fd < 0) {
		vfio_dev_err(vdev, "failed to create trigger eventfd");
		return trigger_fd;
	}

	unmask_fd = eventfd(0, 0);
	if (unmask_fd < 0) {
		vfio_dev_err(vdev, "failed to create unmask eventfd");
		close(trigger_fd);
		return unmask_fd;
	}

	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
	if (ret)
		goto err_close;

	trigger.irq = (struct vfio_irq_set) {
		.argsz	= sizeof(trigger),
		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index	= VFIO_PCI_INTX_IRQ_INDEX,
		.start	= 0,
		.count	= 1,
	};
	set_vfio_irq_eventfd_payload(&trigger, trigger_fd);

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
		goto err_delete_line;
	}

	unmask.irq = (struct vfio_irq_set) {
		.argsz	= sizeof(unmask),
		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
		.index	= VFIO_PCI_INTX_IRQ_INDEX,
		.start	= 0,
		.count	= 1,
	};
	set_vfio_irq_eventfd_payload(&unmask, unmask_fd);

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup unmask IRQ");
		goto err_remove_event;
	}

	pdev->intx_fd = trigger_fd;
	pdev->unmask_fd = unmask_fd;

	return 0;

err_remove_event:
	/* Remove trigger event */
	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	trigger.irq.count = 0;
	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);

err_delete_line:
	irq__del_irqfd(kvm, gsi, trigger_fd);

err_close:
	close(trigger_fd);
	close(unmask_fd);
	return ret;
}

static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_irq_info irq_info = {
		.argsz = sizeof(irq_info),
		.index = VFIO_PCI_INTX_IRQ_INDEX,
	};

	vfio_pci_reserve_irq_fds(2);

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
	if (ret || irq_info.count == 0) {
		vfio_dev_err(vdev, "no INTx reported by VFIO");
		return -ENODEV;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not eventfd capable");
		return -EINVAL;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
		return -EINVAL;
	}

	/* The guest is going to overwrite our irq_line, so stash the GSI now */
	pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;

	pdev->intx_fd = -1;

	return 0;
}

static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret = 0;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		pdev->msix.info = (struct vfio_irq_info) {
			.argsz = sizeof(pdev->msix.info),
			.index = VFIO_PCI_MSIX_IRQ_INDEX,
		};
		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		pdev->msi.info = (struct vfio_irq_info) {
			.argsz = sizeof(pdev->msi.info),
			.index = VFIO_PCI_MSI_IRQ_INDEX,
		};
		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
		pci__assign_irq(&vdev->pci.hdr);

		ret = vfio_pci_init_intx(kvm, vdev);
		if (ret)
			return ret;

		ret = vfio_pci_enable_intx(kvm, vdev);
	}

	return ret;
}

int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;

	ret = vfio_pci_configure_dev_regions(kvm, vdev);
	if (ret) {
		vfio_dev_err(vdev, "failed to configure regions");
		return ret;
	}

	vdev->dev_hdr = (struct device_header) {
		.bus_type	= DEVICE_BUS_PCI,
		.data		= &vdev->pci.hdr,
	};

	ret = device__register(&vdev->dev_hdr);
	if (ret) {
		vfio_dev_err(vdev, "failed to register VFIO device");
		return ret;
	}

	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
	if (ret) {
		vfio_dev_err(vdev, "failed to configure IRQs");
		return ret;
	}

	return 0;
}

void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
{
	size_t i;
	struct vfio_pci_device *pdev = &vdev->pci;

	for (i = 0; i < vdev->info.num_regions; i++)
		vfio_unmap_region(kvm, &vdev->regions[i]);

	device__unregister(&vdev->dev_hdr);

	free(pdev->msix.irq_set);
	free(pdev->msix.entries);
	free(pdev->msi.irq_set);
	free(pdev->msi.entries);
}