xref: /kvmtool/vfio/pci.c (revision b20d6e3029400b4b4b19c654192951f8bedd39cc)
1 #include "kvm/irq.h"
2 #include "kvm/kvm.h"
3 #include "kvm/kvm-cpu.h"
4 #include "kvm/vfio.h"
5 
6 #include <assert.h>
7 
8 #include <sys/ioctl.h>
9 #include <sys/eventfd.h>
10 #include <sys/resource.h>
11 #include <sys/time.h>
12 
13 /* Some distros don't have the define. */
14 #ifndef PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1
15 #define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1	12
16 #endif
17 
18 /* Wrapper around UAPI vfio_irq_set */
19 union vfio_irq_eventfd {
20 	struct vfio_irq_set	irq;
21 	u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)];
22 };
23 
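/* Store the eventfd in the variable-length payload that follows vfio_irq_set. */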
24 static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd)
25 {
26 	memcpy(&evfd->irq.data, &fd, sizeof(fd));
27 }
28 
29 #define msi_is_enabled(state)		((state) & VFIO_PCI_MSI_STATE_ENABLED)
30 #define msi_is_masked(state)		((state) & VFIO_PCI_MSI_STATE_MASKED)
31 #define msi_is_empty(state)		((state) & VFIO_PCI_MSI_STATE_EMPTY)
32 
33 #define msi_update_state(state, val, bit)				\
34 	(state) = (val) ? (state) | (bit) : (state) & ~(bit);
35 #define msi_set_enabled(state, val)					\
36 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
37 #define msi_set_masked(state, val)					\
38 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
39 #define msi_set_empty(state, val)					\
40 	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)
41 
42 static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
43 static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);
44 
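/*
 * Enable MSI or MSI-X on the physical device and synchronise the VFIO
 * eventfd routing with the guest's view of the capability. Called whenever
 * the guest touches the capability or the MSI-X table.
 */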
45 static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
46 				bool msix)
47 {
48 	size_t i;
49 	int ret = 0;
50 	int *eventfds;
51 	struct vfio_pci_device *pdev = &vdev->pci;
52 	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
53 	union vfio_irq_eventfd single = {
54 		.irq = {
55 			.argsz	= sizeof(single),
56 			.flags	= VFIO_IRQ_SET_DATA_EVENTFD |
57 				  VFIO_IRQ_SET_ACTION_TRIGGER,
58 			.index	= msis->info.index,
59 			.count	= 1,
60 		},
61 	};
62 
63 	if (!msi_is_enabled(msis->virt_state))
64 		return 0;
65 
66 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
67 		/*
68 		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
69 		 * time. Since INTx has to be enabled from the start (we don't
70 		 * have a reliable way to know when the guest starts using it),
71 		 * disable it now.
72 		 */
73 		vfio_pci_disable_intx(kvm, vdev);
74 
75 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
76 
77 	/*
78 	 * Initial registration of the full range. This enables the physical
79 	 * MSI/MSI-X capability, which might have desired side effects. For
80 	 * instance when assigning virtio legacy devices, enabling the MSI
81 	 * capability modifies the config space layout!
82 	 *
83 	 * As an optimization, only update MSIs when the guest unmasks the
84 	 * capability. This greatly reduces the initialization time for a
85 	 * Linux guest with 2048+ MSIs. Linux starts by enabling the MSI-X cap
86 	 * masked, then fills individual vectors, then unmasks the whole
87 	 * function. So we only do one VFIO ioctl when enabling for the first
88 	 * time, and then one when unmasking.
89 	 *
90 	 * phys_state is empty when it is enabled but no vector has been
91 	 * registered via SET_IRQS yet.
92 	 */
93 	if (!msi_is_enabled(msis->phys_state) ||
94 	    (!msi_is_masked(msis->virt_state) &&
95 	     msi_is_empty(msis->phys_state))) {
96 		bool empty = true;
97 
98 		for (i = 0; i < msis->nr_entries; i++) {
99 			eventfds[i] = msis->entries[i].gsi >= 0 ?
100 				      msis->entries[i].eventfd : -1;
101 
102 			if (eventfds[i] >= 0)
103 				empty = false;
104 		}
105 
106 		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
107 		if (ret < 0) {
108 			perror("VFIO_DEVICE_SET_IRQS(multi)");
109 			return ret;
110 		}
111 
112 		msi_set_enabled(msis->phys_state, true);
113 		msi_set_empty(msis->phys_state, empty);
114 
115 		return 0;
116 	}
117 
118 	if (msi_is_masked(msis->virt_state)) {
119 		/* TODO: if phys_state is neither empty nor masked, mask all vectors */
120 		return 0;
121 	}
122 
123 	/* Update individual vectors to avoid breaking those in use */
124 	for (i = 0; i < msis->nr_entries; i++) {
125 		struct vfio_pci_msi_entry *entry = &msis->entries[i];
126 		int fd = entry->gsi >= 0 ? entry->eventfd : -1;
127 
128 		if (fd == eventfds[i])
129 			continue;
130 
131 		single.irq.start = i;
132 		set_vfio_irq_eventd_payload(&single, fd);
133 
134 		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
135 		if (ret < 0) {
136 			perror("VFIO_DEVICE_SET_IRQS(single)");
137 			break;
138 		}
139 
140 		eventfds[i] = fd;
141 
142 		if (msi_is_empty(msis->phys_state) && fd >= 0)
143 			msi_set_empty(msis->phys_state, false);
144 	}
145 
146 	return ret;
147 }
148 
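/* Disable MSI or MSI-X on the physical device, re-enabling INTx if possible. */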
149 static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
150 				 bool msix)
151 {
152 	int ret;
153 	struct vfio_pci_device *pdev = &vdev->pci;
154 	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
155 	struct vfio_irq_set irq_set = {
156 		.argsz	= sizeof(irq_set),
157 		.flags 	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
158 		.index 	= msis->info.index,
159 		.start 	= 0,
160 		.count	= 0,
161 	};
162 
163 	if (!msi_is_enabled(msis->phys_state))
164 		return 0;
165 
166 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
167 	if (ret < 0) {
168 		perror("VFIO_DEVICE_SET_IRQS(NONE)");
169 		return ret;
170 	}
171 
172 	msi_set_enabled(msis->phys_state, false);
173 	msi_set_empty(msis->phys_state, true);
174 
175 	/*
176 	 * When MSI or MSI-X is disabled, this may be because the guest PCI
177 	 * driver detected an MSI failure and wants to fall back to INTx
178 	 * mode. In that case, re-enable INTx if the device supports it,
179 	 * since we disabled it when enabling MSI.
180 	 */
181 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
182 		ret = vfio_pci_enable_intx(kvm, vdev);
183 
184 	return ret >= 0 ? 0 : ret;
185 }
186 
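/*
 * Allocate or update the KVM routing entry and irqfd for a single MSI
 * vector, taking the guest's mask bit into account.
 */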
187 static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
188 				     struct vfio_pci_msi_entry *entry)
189 {
190 	int ret;
191 
192 	if (entry->eventfd < 0) {
193 		entry->eventfd = eventfd(0, 0);
194 		if (entry->eventfd < 0) {
195 			ret = -errno;
196 			vfio_dev_err(vdev, "cannot create eventfd");
197 			return ret;
198 		}
199 	}
200 
201 	/* Allocate IRQ if necessary */
202 	if (entry->gsi < 0) {
203 		int ret = irq__add_msix_route(kvm, &entry->config.msg,
204 					      vdev->dev_hdr.dev_num << 3);
205 		if (ret < 0) {
206 			vfio_dev_err(vdev, "cannot create MSI-X route");
207 			return ret;
208 		}
209 		entry->gsi = ret;
210 	} else {
211 		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
212 	}
213 
214 	/*
215 	 * MSI masking is unimplemented in VFIO, so we have to handle it by
216 	 * disabling/enabling the IRQ route instead. We do it on the KVM side
217 	 * rather than the VFIO side, because:
218 	 * - it is 8x faster
219 	 * - it decouples the masking logic from the capability state.
220 	 * - in the masked state, after removing the irqfd route, we could
221 	 *   easily plug the eventfd into a local handler in order to serve
222 	 *   Pending Bit reads to the guest.
223 	 *
224 	 * So entry->phys_state is masked when there is no active irqfd route.
225 	 */
226 	if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
227 		return 0;
228 
229 	if (msi_is_masked(entry->phys_state)) {
230 		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
231 		if (ret < 0) {
232 			vfio_dev_err(vdev, "cannot setup irqfd");
233 			return ret;
234 		}
235 	} else {
236 		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
237 	}
238 
239 	msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));
240 
241 	return 0;
242 }
243 
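/* MMIO handler for guest accesses to the virtual MSI-X Pending Bit Array (writes are ignored). */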
244 static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
245 				     u32 len, u8 is_write, void *ptr)
246 {
247 	struct vfio_pci_device *pdev = ptr;
248 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
249 	u64 offset = addr - pba->guest_phys_addr;
250 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
251 
252 	if (offset >= pba->size) {
253 		vfio_dev_err(vdev, "access outside of the MSIX PBA");
254 		return;
255 	}
256 
257 	if (is_write)
258 		return;
259 
260 	/*
261 	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA
262 	 * is completely useless here. Note that Linux doesn't use PBA.
263 	 */
264 	if (pread(vdev->fd, data, len, pba->fd_offset + offset) != (ssize_t)len)
265 		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
266 }
267 
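/*
 * MMIO handler for guest accesses to the MSI-X table: cache the vector's
 * address/data, track the per-vector mask bit and update the routing.
 */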
268 static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
269 				       u32 len, u8 is_write, void *ptr)
270 {
271 	struct kvm *kvm = vcpu->kvm;
272 	struct vfio_pci_msi_entry *entry;
273 	struct vfio_pci_device *pdev = ptr;
274 	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
275 
276 	u64 offset = addr - pdev->msix_table.guest_phys_addr;
277 	if (offset >= pdev->msix_table.size) {
278 		vfio_dev_err(vdev, "access outside of the MSI-X table");
279 		return;
280 	}
281 
282 	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
283 	off_t field = offset % PCI_MSIX_ENTRY_SIZE;
284 
285 	/*
286 	 * The PCI spec says that software must use aligned 4- or 8-byte
287 	 * accesses for the MSI-X table.
288 	 */
289 	if ((len != 4 && len != 8) || addr & (len - 1)) {
290 		vfio_dev_warn(vdev, "invalid MSI-X table access");
291 		return;
292 	}
293 
294 	entry = &pdev->msix.entries[vector];
295 
296 	mutex_lock(&pdev->msix.mutex);
297 
298 	if (!is_write) {
299 		memcpy(data, (void *)&entry->config + field, len);
300 		goto out_unlock;
301 	}
302 
303 	memcpy((void *)&entry->config + field, data, len);
304 
305 	/*
306 	 * Check if access touched the vector control register, which is at the
307 	 * end of the MSI-X entry.
308 	 */
309 	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
310 		goto out_unlock;
311 
312 	msi_set_masked(entry->virt_state, entry->config.ctrl &
313 		       PCI_MSIX_ENTRY_CTRL_MASKBIT);
314 
315 	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
316 		/* Not much we can do here. */
317 		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);
318 
319 	/* Update the physical capability if necessary */
320 	if (vfio_pci_enable_msis(kvm, vdev, true))
321 		vfio_dev_err(vdev, "cannot enable MSIX");
322 
323 out_unlock:
324 	mutex_unlock(&pdev->msix.mutex);
325 }
326 
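/*
 * Called on config space writes that touch the MSI-X capability: track the
 * Function Mask and Enable bits and propagate them to the device.
 */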
327 static void vfio_pci_msix_cap_write(struct kvm *kvm,
328 				    struct vfio_device *vdev, u16 off,
329 				    void *data, int sz)
330 {
331 	struct vfio_pci_device *pdev = &vdev->pci;
332 	off_t enable_pos = PCI_MSIX_FLAGS + 1;
333 	bool enable;
334 	u16 flags;
335 
336 	off -= pdev->msix.pos;
337 
338 	/* Check if access intersects with the MSI-X Enable bit */
339 	if (off > enable_pos || off + sz <= enable_pos)
340 		return;
341 
342 	/* Read byte that contains the Enable bit */
343 	flags = *(u8 *)(data + enable_pos - off) << 8;
344 
345 	mutex_lock(&pdev->msix.mutex);
346 
347 	msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
348 	enable = flags & PCI_MSIX_FLAGS_ENABLE;
349 	msi_set_enabled(pdev->msix.virt_state, enable);
350 
351 	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
352 		vfio_dev_err(vdev, "cannot enable MSIX");
353 	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
354 		vfio_dev_err(vdev, "cannot disable MSIX");
355 
356 	mutex_unlock(&pdev->msix.mutex);
357 }
358 
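/*
 * Handle writes to the MSI Mask Bits register, when the device implements
 * per-vector masking. Returns 1 if the access was handled here, 0 otherwise.
 */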
359 static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
360 				     u16 off, u8 *data, u32 sz)
361 {
362 	size_t i;
363 	u32 mask = 0;
364 	size_t mask_pos, start, limit;
365 	struct vfio_pci_msi_entry *entry;
366 	struct vfio_pci_device *pdev = &vdev->pci;
367 	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
368 
369 	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
370 		return 0;
371 
372 	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
373 		mask_pos = PCI_MSI_MASK_64;
374 	else
375 		mask_pos = PCI_MSI_MASK_32;
376 
377 	if (off >= mask_pos + 4 || off + sz <= mask_pos)
378 		return 0;
379 
380 	/* Set mask to current state */
381 	for (i = 0; i < pdev->msi.nr_entries; i++) {
382 		entry = &pdev->msi.entries[i];
383 		mask |= !!msi_is_masked(entry->virt_state) << i;
384 	}
385 
386 	/* Update mask following the intersection of access and register */
387 	start = max_t(size_t, off, mask_pos);
388 	limit = min_t(size_t, off + sz, mask_pos + 4);
389 
390 	memcpy((void *)&mask + start - mask_pos, data + start - off,
391 	       limit - start);
392 
393 	/* Update states if necessary */
394 	for (i = 0; i < pdev->msi.nr_entries; i++) {
395 		bool masked = mask & (1 << i);
396 
397 		entry = &pdev->msi.entries[i];
398 		if (masked != msi_is_masked(entry->virt_state)) {
399 			msi_set_masked(entry->virt_state, masked);
400 			vfio_pci_update_msi_entry(kvm, vdev, entry);
401 		}
402 	}
403 
404 	return 1;
405 }
406 
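/*
 * Called on config space writes that touch the MSI capability: handle mask
 * bit updates, the Enable bit, and (re)program the vector routes from the
 * capability's address and data fields.
 */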
407 static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
408 				   u16 off, u8 *data, u32 sz)
409 {
410 	u8 ctrl;
411 	struct msi_msg msg;
412 	size_t i, nr_vectors;
413 	struct vfio_pci_msi_entry *entry;
414 	struct vfio_pci_device *pdev = &vdev->pci;
415 	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);
416 
417 	off -= pdev->msi.pos;
418 
419 	mutex_lock(&pdev->msi.mutex);
420 
421 	/* Check if the guest is trying to update mask bits */
422 	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
423 		goto out_unlock;
424 
425 	/* Only modify routes when guest pokes the enable bit */
426 	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
427 		goto out_unlock;
428 
429 	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);
430 
431 	msi_set_enabled(pdev->msi.virt_state, ctrl & PCI_MSI_FLAGS_ENABLE);
432 
433 	if (!msi_is_enabled(pdev->msi.virt_state)) {
434 		vfio_pci_disable_msis(kvm, vdev, false);
435 		goto out_unlock;
436 	}
437 
438 	/* Create routes for the requested vectors */
439 	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);
440 
441 	msg.address_lo = msi_cap_64->address_lo;
442 	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
443 		msg.address_hi = msi_cap_64->address_hi;
444 		msg.data = msi_cap_64->data;
445 	} else {
446 		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
447 		msg.address_hi = 0;
448 		msg.data = msi_cap_32->data;
449 	}
450 
451 	for (i = 0; i < nr_vectors; i++) {
452 		entry = &pdev->msi.entries[i];
453 
454 		/*
455 		 * Set the MSI data value as required by the PCI local
456 		 * bus specifications, MSI capability, "Message Data".
457 		 */
458 		msg.data &= ~(nr_vectors - 1);
459 		msg.data |= i;
460 
461 		entry->config.msg = msg;
462 		vfio_pci_update_msi_entry(kvm, vdev, entry);
463 	}
464 
465 	/* Update the physical capability if necessary */
466 	if (vfio_pci_enable_msis(kvm, vdev, false))
467 		vfio_dev_err(vdev, "cannot enable MSI");
468 
469 out_unlock:
470 	mutex_unlock(&pdev->msi.mutex);
471 }
472 
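/*
 * Callback invoked when the guest maps a BAR: register MMIO emulation for
 * the MSI-X table and PBA, or map the region into the guest directly.
 */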
473 static int vfio_pci_bar_activate(struct kvm *kvm,
474 				 struct pci_device_header *pci_hdr,
475 				 int bar_num, void *data)
476 {
477 	struct vfio_device *vdev = data;
478 	struct vfio_pci_device *pdev = &vdev->pci;
479 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
480 	struct vfio_pci_msix_table *table = &pdev->msix_table;
481 	struct vfio_region *region;
482 	u32 bar_addr;
483 	bool has_msix;
484 	int ret;
485 
486 	assert((u32)bar_num < vdev->info.num_regions);
487 
488 	region = &vdev->regions[bar_num];
489 	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;
490 
491 	bar_addr = pci__bar_address(pci_hdr, bar_num);
492 	if (pci__bar_is_io(pci_hdr, bar_num))
493 		region->port_base = bar_addr;
494 	else
495 		region->guest_phys_addr = bar_addr;
496 
497 	if (has_msix && (u32)bar_num == table->bar) {
498 		table->guest_phys_addr = region->guest_phys_addr;
499 		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
500 					 table->size, false,
501 					 vfio_pci_msix_table_access, pdev);
502 		/*
503 		 * The MSIX table and the PBA structure can share the same BAR,
504 		 * but for convenience we register different regions for mmio
505 		 * emulation. We want to update both if they share the same
506 		 * BAR.
507 		 */
508 		if (ret < 0 || table->bar != pba->bar)
509 			goto out;
510 	}
511 
512 	if (has_msix && (u32)bar_num == pba->bar) {
513 		if (pba->bar == table->bar)
514 			pba->guest_phys_addr = table->guest_phys_addr + pba->bar_offset;
515 		else
516 			pba->guest_phys_addr = region->guest_phys_addr;
517 		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
518 					 pba->size, false,
519 					 vfio_pci_msix_pba_access, pdev);
520 		goto out;
521 	}
522 
523 	ret = vfio_map_region(kvm, vdev, region);
524 out:
525 	return ret;
526 }
527 
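/* Callback invoked when the guest unmaps a BAR: undo vfio_pci_bar_activate(). */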
528 static int vfio_pci_bar_deactivate(struct kvm *kvm,
529 				   struct pci_device_header *pci_hdr,
530 				   int bar_num, void *data)
531 {
532 	struct vfio_device *vdev = data;
533 	struct vfio_pci_device *pdev = &vdev->pci;
534 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
535 	struct vfio_pci_msix_table *table = &pdev->msix_table;
536 	struct vfio_region *region;
537 	bool has_msix, success;
538 	int ret;
539 
540 	assert((u32)bar_num < vdev->info.num_regions);
541 
542 	region = &vdev->regions[bar_num];
543 	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;
544 
545 	if (has_msix && (u32)bar_num == table->bar) {
546 		success = kvm__deregister_mmio(kvm, table->guest_phys_addr);
547 		/* kvm__deregister_mmio fails when the region is not found. */
548 		ret = (success ? 0 : -ENOENT);
549 		/* See vfio_pci_bar_activate(). */
550 		if (ret < 0 || table->bar != pba->bar)
551 			goto out;
552 	}
553 
554 	if (has_msix && (u32)bar_num == pba->bar) {
555 		success = kvm__deregister_mmio(kvm, pba->guest_phys_addr);
556 		ret = (success ? 0 : -ENOENT);
557 		goto out;
558 	}
559 
560 	vfio_unmap_region(kvm, region);
561 	ret = 0;
562 
563 out:
564 	return ret;
565 }
566 
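/*
 * The value returned to the guest comes from the emulated header; forward
 * the read to the device anyway in case the access has side effects.
 */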
567 static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
568 			      u16 offset, void *data, int sz)
569 {
570 	struct vfio_region_info *info;
571 	struct vfio_pci_device *pdev;
572 	struct vfio_device *vdev;
573 	char base[sz];
574 
575 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
576 	vdev = container_of(pdev, struct vfio_device, pci);
577 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
578 
579 	/* Dummy read in case of side-effects */
580 	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
581 		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
582 			      sz, offset);
583 }
584 
585 static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
586 			       u16 offset, void *data, int sz)
587 {
588 	struct vfio_region_info *info;
589 	struct vfio_pci_device *pdev;
590 	struct vfio_device *vdev;
591 	u32 tmp;
592 
593 	/* Make sure a larger size will not overrun tmp on the stack. */
594 	assert(sz <= 4);
595 
596 	if (offset == PCI_ROM_ADDRESS)
597 		return;
598 
599 	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
600 	vdev = container_of(pdev, struct vfio_device, pci);
601 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
602 
603 	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
604 		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
605 			      sz, offset);
606 
607 	/* Handle MSI write now, since it might update the hardware capability */
608 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
609 		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);
610 
611 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
612 		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);
613 
614 	if (pread(vdev->fd, &tmp, sz, info->offset + offset) != sz)
615 		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
616 			      sz, offset);
617 }
618 
619 static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
620 {
621 	size_t size = 10;
622 
623 	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
624 		size += 4;
625 	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
626 		size += 10;
627 
628 	return size;
629 }
630 
631 static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
632 {
633 	switch (cap_hdr->type) {
634 	case PCI_CAP_ID_MSIX:
635 		return PCI_CAP_MSIX_SIZEOF;
636 	case PCI_CAP_ID_MSI:
637 		return vfio_pci_msi_cap_size((void *)cap_hdr);
638 	case PCI_CAP_ID_EXP:
639 		/*
640 		 * We don't emulate any of the link, slot and root complex
641 		 * properties, so ignore them.
642 		 */
643 		return PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1;
644 	default:
645 		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
646 		return 0;
647 	}
648 }
649 
650 static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
651 			    struct pci_cap_hdr *cap, off_t pos)
652 {
653 	struct pci_cap_hdr *last;
654 	struct pci_device_header *hdr = &vdev->pci.hdr;
655 
656 	cap->next = 0;
657 
658 	if (!hdr->capabilities) {
659 		hdr->capabilities = pos;
660 		hdr->status |= PCI_STATUS_CAP_LIST;
661 	} else {
662 		last = PCI_CAP(virt_hdr, hdr->capabilities);
663 
664 		while (last->next)
665 			last = PCI_CAP(virt_hdr, last->next);
666 
667 		last->next = pos;
668 	}
669 
670 	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));
671 
672 	return 0;
673 }
674 
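/*
 * Rebuild the virtual capability list, keeping only the capabilities we
 * emulate: MSI, MSI-X and, where the architecture supports it, PCI Express.
 */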
675 static int vfio_pci_parse_caps(struct vfio_device *vdev)
676 {
677 	int ret;
678 	size_t size;
679 	u16 pos, next;
680 	struct pci_cap_hdr *cap;
681 	u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY];
682 	struct vfio_pci_device *pdev = &vdev->pci;
683 
684 	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
685 		return 0;
686 
687 	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY);
688 
689 	pos = pdev->hdr.capabilities & ~3;
690 
691 	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
692 	pdev->hdr.capabilities = 0;
693 
694 	for (; pos; pos = next) {
695 		cap = PCI_CAP(&pdev->hdr, pos);
696 		next = cap->next;
697 
698 		switch (cap->type) {
699 		case PCI_CAP_ID_MSIX:
700 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
701 			if (ret)
702 				return ret;
703 
704 			pdev->msix.pos = pos;
705 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
706 			break;
707 		case PCI_CAP_ID_MSI:
708 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
709 			if (ret)
710 				return ret;
711 
712 			pdev->msi.pos = pos;
713 			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
714 			break;
715 		case PCI_CAP_ID_EXP:
716 			if (!arch_has_pci_exp())
717 				continue;
718 			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
719 			if (ret)
720 				return ret;
721 			break;
722 		}
723 	}
724 
725 	/* Wipe remaining capabilities */
726 	pos = PCI_STD_HEADER_SIZEOF;
727 	size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF;
728 	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
729 
730 	return 0;
731 }
732 
733 static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
734 {
735 	ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
736 	struct vfio_region_info *info;
737 	struct vfio_pci_device *pdev = &vdev->pci;
738 
739 	if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
740 		vfio_dev_err(vdev, "Config Space not found");
741 		return -ENODEV;
742 	}
743 
744 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
745 	*info = (struct vfio_region_info) {
746 			.argsz = sizeof(*info),
747 			.index = VFIO_PCI_CONFIG_REGION_INDEX,
748 	};
749 
750 	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
751 	if (!info->size) {
752 		vfio_dev_err(vdev, "Config Space has size zero?!");
753 		return -EINVAL;
754 	}
755 
756 	/* Read standard headers and capabilities */
757 	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
758 		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
759 		return -EIO;
760 	}
761 
762 	/* Strip bit 7, which indicates a multifunction device */
763 	pdev->hdr.header_type &= 0x7f;
764 
765 	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
766 		vfio_dev_err(vdev, "unsupported header type %u",
767 			     pdev->hdr.header_type);
768 		return -EOPNOTSUPP;
769 	}
770 
771 	if (pdev->hdr.irq_pin)
772 		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;
773 
774 	vfio_pci_parse_caps(vdev);
775 
776 	return 0;
777 }
778 
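/*
 * Adjust the copied config space to match what the guest will see: fake
 * BARs, no expansion ROM or cardbus, and a tidied-up MSI-X capability.
 */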
779 static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
780 {
781 	int i;
782 	u64 base;
783 	ssize_t hdr_sz;
784 	struct msix_cap *msix;
785 	struct vfio_region_info *info;
786 	struct vfio_pci_device *pdev = &vdev->pci;
787 	struct vfio_region *region;
788 
789 	/* Initialise the BARs */
790 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
791 		if ((u32)i == vdev->info.num_regions)
792 			break;
793 
794 		region = &vdev->regions[i];
795 		/* Construct a fake reg to match what we've mapped. */
796 		if (region->is_ioport) {
797 			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
798 				PCI_BASE_ADDRESS_SPACE_IO;
799 		} else {
800 			base = (region->guest_phys_addr &
801 				PCI_BASE_ADDRESS_MEM_MASK) |
802 				PCI_BASE_ADDRESS_SPACE_MEMORY;
803 		}
804 
805 		pdev->hdr.bar[i] = base;
806 
807 		if (!base)
808 			continue;
809 
810 		pdev->hdr.bar_size[i] = region->info.size;
811 	}
812 
813 	/* I really can't be bothered to support cardbus. */
814 	pdev->hdr.card_bus = 0;
815 
816 	/*
817 	 * Nuke the expansion ROM for now. If we want to do this properly,
818 	 * we need to save its size somewhere and map it into the guest.
819 	 */
820 	pdev->hdr.exp_rom_bar = 0;
821 
822 	/* Plumb in our fake MSI-X capability, if we have it. */
823 	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
824 	if (msix) {
825 		/* Add a shortcut to the PBA region for the MMIO handler */
826 		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
827 		u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET;
828 
829 		pdev->msix_pba.fd_offset = vdev->regions[pba_index].info.offset +
830 					   pba_bar_offset;
831 
832 		/* Tidy up the capability */
833 		msix->table_offset &= PCI_MSIX_TABLE_BIR;
834 		if (pdev->msix_table.bar == pdev->msix_pba.bar) {
835 			/* Keep the same offset as the MSIX cap. */
836 			pdev->msix_pba.bar_offset = pba_bar_offset;
837 		} else {
838 			/* PBA is at the start of the BAR. */
839 			msix->pba_offset &= PCI_MSIX_PBA_BIR;
840 			pdev->msix_pba.bar_offset = 0;
841 		}
842 	}
843 
844 	/* Install our fake Configuration Space */
845 	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
846 	/*
847 	 * We don't touch the extended configuration space, let's be cautious
848 	 * and not overwrite it all with zeros, or bad things might happen.
849 	 */
850 	hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
851 	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
852 		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
853 			     hdr_sz);
854 		return -EIO;
855 	}
856 
857 	/* Register callbacks for cfg accesses */
858 	pdev->hdr.cfg_ops = (struct pci_config_operations) {
859 		.read	= vfio_pci_cfg_read,
860 		.write	= vfio_pci_cfg_write,
861 	};
862 
863 	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;
864 
865 	return 0;
866 }
867 
868 static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index,
869 				    struct vfio_region_info *info)
870 {
871 	int ret;
872 
873 	*info = (struct vfio_region_info) {
874 		.argsz = sizeof(*info),
875 		.index = index,
876 	};
877 
878 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
879 	if (ret) {
880 		ret = -errno;
881 		vfio_dev_err(vdev, "cannot get info for BAR %u", index);
882 		return ret;
883 	}
884 
885 	if (info->size && !is_power_of_two(info->size)) {
886 		vfio_dev_err(vdev, "region is not power of two: 0x%llx",
887 				info->size);
888 		return -EINVAL;
889 	}
890 
891 	return 0;
892 }
893 
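/*
 * Allocate the state backing the virtual MSI-X table and PBA, and reserve
 * guest MMIO space for both structures.
 */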
894 static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
895 {
896 	int ret;
897 	size_t i;
898 	size_t map_size;
899 	size_t nr_entries;
900 	struct vfio_pci_msi_entry *entries;
901 	struct vfio_pci_device *pdev = &vdev->pci;
902 	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
903 	struct vfio_pci_msix_table *table = &pdev->msix_table;
904 	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
905 	struct vfio_region_info info;
906 
907 	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
908 	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;
909 
910 	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
911 
912 	/* MSIX table and PBA must support QWORD accesses. */
913 	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, 8);
914 	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), 8);
915 
916 	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
917 	if (!entries)
918 		return -ENOMEM;
919 
920 	for (i = 0; i < nr_entries; i++)
921 		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
922 
923 	ret = vfio_pci_get_region_info(vdev, table->bar, &info);
924 	if (ret)
925 		return ret;
926 	if (!info.size)
927 		return -EINVAL;
928 
929 	map_size = ALIGN(info.size, PAGE_SIZE);
930 	table->guest_phys_addr = pci_get_mmio_block(map_size);
931 	if (!table->guest_phys_addr) {
932 		pr_err("cannot allocate MMIO space");
933 		ret = -ENOMEM;
934 		goto out_free;
935 	}
936 
937 	/*
938 	 * We could map the physical PBA directly into the guest, but it's
939 	 * likely smaller than a page, and we can only hand full pages to the
940 	 * guest. Even though the PCI spec disallows sharing a page used for
941 	 * MSI-X with any other resource, it does allow the MSI-X table and
942 	 * the PBA to share the same page. For the sake of isolation, create
943 	 * a virtual PBA.
944 	 */
945 	if (table->bar == pba->bar) {
946 		u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET;
947 
948 		/* Sanity checks. */
949 		if (table->size > pba_bar_offset)
950 			die("MSIX table overlaps with PBA");
951 		if (pba_bar_offset + pba->size > info.size)
952 			die("PBA exceeds the size of the region");
953 		pba->guest_phys_addr = table->guest_phys_addr + pba_bar_offset;
954 	} else {
955 		ret = vfio_pci_get_region_info(vdev, pba->bar, &info);
956 		if (ret)
957 			return ret;
958 		if (!info.size)
959 			return -EINVAL;
960 
961 		map_size = ALIGN(info.size, PAGE_SIZE);
962 		pba->guest_phys_addr = pci_get_mmio_block(map_size);
963 		if (!pba->guest_phys_addr) {
964 			pr_err("cannot allocate MMIO space");
965 			ret = -ENOMEM;
966 			goto out_free;
967 		}
968 	}
969 
970 	pdev->msix.entries = entries;
971 	pdev->msix.nr_entries = nr_entries;
972 
973 	return 0;
974 
975 out_free:
976 	free(entries);
977 
978 	return ret;
979 }
980 
981 static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
982 {
983 	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);
984 
985 	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
986 	pdev->msi.entries = calloc(pdev->msi.nr_entries,
987 				   sizeof(struct vfio_pci_msi_entry));
988 	if (!pdev->msi.entries)
989 		return -ENOMEM;
990 
991 	return 0;
992 }
993 
994 static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
995 				  size_t nr)
996 {
997 	int ret;
998 	u32 bar;
999 	size_t map_size;
1000 	struct vfio_pci_device *pdev = &vdev->pci;
1001 	struct vfio_region *region;
1002 
1003 	if (nr >= vdev->info.num_regions)
1004 		return 0;
1005 
1006 	region = &vdev->regions[nr];
1007 	bar = pdev->hdr.bar[nr];
1008 
1009 	region->vdev = vdev;
1010 	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);
1011 
1012 	ret = vfio_pci_get_region_info(vdev, nr, &region->info);
1013 	if (ret)
1014 		return ret;
1015 
1016 	/* Ignore invalid or unimplemented regions */
1017 	if (!region->info.size)
1018 		return 0;
1019 
1020 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1021 		/* Trap and emulate MSI-X table */
1022 		if (nr == pdev->msix_table.bar) {
1023 			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
1024 			return 0;
1025 		} else if (nr == pdev->msix_pba.bar) {
1026 			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
1027 			return 0;
1028 		}
1029 	}
1030 
1031 	if (region->is_ioport) {
1032 		region->port_base = pci_get_io_port_block(region->info.size);
1033 	} else {
1034 		/* Grab some MMIO space in the guest */
1035 		map_size = ALIGN(region->info.size, PAGE_SIZE);
1036 		region->guest_phys_addr = pci_get_mmio_block(map_size);
1037 	}
1038 
1039 	return 0;
1040 }
1041 
1042 static int vfio_pci_configure_dev_regions(struct kvm *kvm,
1043 					  struct vfio_device *vdev)
1044 {
1045 	int ret;
1046 	u32 bar;
1047 	size_t i;
1048 	bool is_64bit = false;
1049 	struct vfio_pci_device *pdev = &vdev->pci;
1050 
1051 	ret = vfio_pci_parse_cfg_space(vdev);
1052 	if (ret)
1053 		return ret;
1054 
1055 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1056 		ret = vfio_pci_create_msix_table(kvm, vdev);
1057 		if (ret)
1058 			return ret;
1059 	}
1060 
1061 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
1062 		ret = vfio_pci_create_msi_cap(kvm, pdev);
1063 		if (ret)
1064 			return ret;
1065 	}
1066 
1067 	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
1068 		/* Ignore top half of 64-bit BAR */
1069 		if (is_64bit) {
1070 			is_64bit = false;
1071 			continue;
1072 		}
1073 
1074 		ret = vfio_pci_configure_bar(kvm, vdev, i);
1075 		if (ret)
1076 			return ret;
1077 
1078 		bar = pdev->hdr.bar[i];
1079 		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
1080 			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
1081 			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
1082 	}
1083 
1084 	/* We've configured the BARs, fake up a Configuration Space */
1085 	ret = vfio_pci_fixup_cfg_space(vdev);
1086 	if (ret)
1087 		return ret;
1088 
1089 	return pci__register_bar_regions(kvm, &pdev->hdr, vfio_pci_bar_activate,
1090 					 vfio_pci_bar_deactivate, vdev);
1091 }
1092 
1093 /*
1094  * Attempt to update the FD limit, if opening an eventfd for each IRQ vector
1095  * would hit the limit. Which is likely to happen when a device uses 2048 MSIs.
1096  */
1097 static int vfio_pci_reserve_irq_fds(size_t num)
1098 {
1099 	/*
1100 	 * I counted around 27 fds under normal load. Let's add 100 for good
1101 	 * measure.
1102 	 */
1103 	static size_t needed = 128;
1104 	struct rlimit fd_limit, new_limit;
1105 
1106 	needed += num;
1107 
1108 	if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
1109 		perror("getrlimit(RLIMIT_NOFILE)");
1110 		return 0;
1111 	}
1112 
1113 	if (fd_limit.rlim_cur >= needed)
1114 		return 0;
1115 
1116 	new_limit.rlim_cur = needed;
1117 
1118 	if (fd_limit.rlim_max < needed)
1119 		/* Try to bump hard limit (root only) */
1120 		new_limit.rlim_max = needed;
1121 	else
1122 		new_limit.rlim_max = fd_limit.rlim_max;
1123 
1124 	if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
1125 		perror("setrlimit(RLIMIT_NOFILE)");
1126 		pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
1127 			   (size_t)(needed - fd_limit.rlim_cur));
1128 	}
1129 
1130 	return 0;
1131 }
1132 
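/*
 * Query MSI/MSI-X information from VFIO and allocate the irq_set buffer and
 * per-vector state used by vfio_pci_enable_msis().
 */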
1133 static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
1134 			     struct vfio_pci_msi_common *msis)
1135 {
1136 	int ret;
1137 	size_t i;
1138 	int *eventfds;
1139 	size_t irq_set_size;
1140 	struct vfio_pci_msi_entry *entry;
1141 	size_t nr_entries = msis->nr_entries;
1142 
1143 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
1144 	if (ret || msis->info.count == 0) {
1145 		vfio_dev_err(vdev, "no MSI reported by VFIO");
1146 		return -ENODEV;
1147 	}
1148 
1149 	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
1150 		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
1151 		return -EINVAL;
1152 	}
1153 
1154 	if (msis->info.count != nr_entries) {
1155 		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
1156 		return -EINVAL;
1157 	}
1158 
1159 	mutex_init(&msis->mutex);
1160 
1161 	vfio_pci_reserve_irq_fds(nr_entries);
1162 
1163 	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
1164 	msis->irq_set = malloc(irq_set_size);
1165 	if (!msis->irq_set)
1166 		return -ENOMEM;
1167 
1168 	*msis->irq_set = (struct vfio_irq_set) {
1169 		.argsz	= irq_set_size,
1170 		.flags 	= VFIO_IRQ_SET_DATA_EVENTFD |
1171 			  VFIO_IRQ_SET_ACTION_TRIGGER,
1172 		.index 	= msis->info.index,
1173 		.start 	= 0,
1174 		.count 	= nr_entries,
1175 	};
1176 
1177 	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
1178 
1179 	for (i = 0; i < nr_entries; i++) {
1180 		entry = &msis->entries[i];
1181 		entry->gsi = -1;
1182 		entry->eventfd = -1;
1183 		msi_set_masked(entry->virt_state, true);
1184 		msi_set_masked(entry->phys_state, true);
1185 		eventfds[i] = -1;
1186 	}
1187 
1188 	return 0;
1189 }
1190 
1191 static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
1192 {
1193 	struct vfio_pci_device *pdev = &vdev->pci;
1194 	int gsi = pdev->intx_gsi;
1195 	struct vfio_irq_set irq_set = {
1196 		.argsz	= sizeof(irq_set),
1197 		.flags	= VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
1198 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1199 	};
1200 
1201 	if (pdev->intx_fd == -1)
1202 		return;
1203 
1204 	pr_debug("user requested MSI, disabling INTx %d", gsi);
1205 
1206 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
1207 	irq__del_irqfd(kvm, gsi, pdev->intx_fd);
1208 
1209 	close(pdev->intx_fd);
1210 	close(pdev->unmask_fd);
1211 	pdev->intx_fd = -1;
1212 }
1213 
1214 static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
1215 {
1216 	int ret;
1217 	int trigger_fd, unmask_fd;
1218 	union vfio_irq_eventfd	trigger;
1219 	union vfio_irq_eventfd	unmask;
1220 	struct vfio_pci_device *pdev = &vdev->pci;
1221 	int gsi = pdev->intx_gsi;
1222 
1223 	if (pdev->intx_fd != -1)
1224 		return 0;
1225 
1226 	/*
1227 	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
1228 	 * signals an interrupt from host to guest, and unmask_fd signals the
1229 	 * deassertion of the line from guest to host.
1230 	 */
1231 	trigger_fd = eventfd(0, 0);
1232 	if (trigger_fd < 0) {
1233 		vfio_dev_err(vdev, "failed to create trigger eventfd");
1234 		return trigger_fd;
1235 	}
1236 
1237 	unmask_fd = eventfd(0, 0);
1238 	if (unmask_fd < 0) {
1239 		vfio_dev_err(vdev, "failed to create unmask eventfd");
1240 		close(trigger_fd);
1241 		return unmask_fd;
1242 	}
1243 
1244 	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
1245 	if (ret)
1246 		goto err_close;
1247 
1248 	trigger.irq = (struct vfio_irq_set) {
1249 		.argsz	= sizeof(trigger),
1250 		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
1251 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1252 		.start	= 0,
1253 		.count	= 1,
1254 	};
1255 	set_vfio_irq_eventd_payload(&trigger, trigger_fd);
1256 
1257 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
1258 	if (ret < 0) {
1259 		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
1260 		goto err_delete_line;
1261 	}
1262 
1263 	unmask.irq = (struct vfio_irq_set) {
1264 		.argsz	= sizeof(unmask),
1265 		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
1266 		.index	= VFIO_PCI_INTX_IRQ_INDEX,
1267 		.start	= 0,
1268 		.count	= 1,
1269 	};
1270 	set_vfio_irq_eventd_payload(&unmask, unmask_fd);
1271 
1272 	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
1273 	if (ret < 0) {
1274 		vfio_dev_err(vdev, "failed to setup unmask IRQ");
1275 		goto err_remove_event;
1276 	}
1277 
1278 	pdev->intx_fd = trigger_fd;
1279 	pdev->unmask_fd = unmask_fd;
1280 
1281 	return 0;
1282 
1283 err_remove_event:
1284 	/* Remove trigger event */
1285 	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
1286 	trigger.irq.count = 0;
1287 	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
1288 
1289 err_delete_line:
1290 	irq__del_irqfd(kvm, gsi, trigger_fd);
1291 
1292 err_close:
1293 	close(trigger_fd);
1294 	close(unmask_fd);
1295 	return ret;
1296 }
1297 
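/*
 * Check that VFIO can deliver INTx through an automasked eventfd and record
 * the GSI that was assigned to the device.
 */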
1298 static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
1299 {
1300 	int ret;
1301 	struct vfio_pci_device *pdev = &vdev->pci;
1302 	struct vfio_irq_info irq_info = {
1303 		.argsz = sizeof(irq_info),
1304 		.index = VFIO_PCI_INTX_IRQ_INDEX,
1305 	};
1306 
1307 	vfio_pci_reserve_irq_fds(2);
1308 
1309 	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
1310 	if (ret || irq_info.count == 0) {
1311 		vfio_dev_err(vdev, "no INTx reported by VFIO");
1312 		return -ENODEV;
1313 	}
1314 
1315 	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
1316 		vfio_dev_err(vdev, "interrupt not eventfd capable");
1317 		return -EINVAL;
1318 	}
1319 
1320 	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
1321 		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
1322 		return -EINVAL;
1323 	}
1324 
1325 	/* Guest is going to overwrite our irq_line... */
1326 	pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;
1327 
1328 	pdev->intx_fd = -1;
1329 
1330 	return 0;
1331 }
1332 
1333 static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
1334 {
1335 	int ret = 0;
1336 	struct vfio_pci_device *pdev = &vdev->pci;
1337 
1338 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
1339 		pdev->msix.info = (struct vfio_irq_info) {
1340 			.argsz = sizeof(pdev->msix.info),
1341 			.index = VFIO_PCI_MSIX_IRQ_INDEX,
1342 		};
1343 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
1344 		if (ret)
1345 			return ret;
1346 	}
1347 
1348 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
1349 		pdev->msi.info = (struct vfio_irq_info) {
1350 			.argsz = sizeof(pdev->msi.info),
1351 			.index = VFIO_PCI_MSI_IRQ_INDEX,
1352 		};
1353 		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
1354 		if (ret)
1355 			return ret;
1356 	}
1357 
1358 	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
1359 		pci__assign_irq(&vdev->pci.hdr);
1360 
1361 		ret = vfio_pci_init_intx(kvm, vdev);
1362 		if (ret)
1363 			return ret;
1364 
1365 		ret = vfio_pci_enable_intx(kvm, vdev);
1366 	}
1367 
1368 	return ret;
1369 }
1370 
1371 int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
1372 {
1373 	int ret;
1374 
1375 	ret = vfio_pci_configure_dev_regions(kvm, vdev);
1376 	if (ret) {
1377 		vfio_dev_err(vdev, "failed to configure regions");
1378 		return ret;
1379 	}
1380 
1381 	vdev->dev_hdr = (struct device_header) {
1382 		.bus_type	= DEVICE_BUS_PCI,
1383 		.data		= &vdev->pci.hdr,
1384 	};
1385 
1386 	ret = device__register(&vdev->dev_hdr);
1387 	if (ret) {
1388 		vfio_dev_err(vdev, "failed to register VFIO device");
1389 		return ret;
1390 	}
1391 
1392 	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
1393 	if (ret) {
1394 		vfio_dev_err(vdev, "failed to configure IRQs");
1395 		return ret;
1396 	}
1397 
1398 	return 0;
1399 }
1400 
1401 void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
1402 {
1403 	size_t i;
1404 	struct vfio_pci_device *pdev = &vdev->pci;
1405 
1406 	for (i = 0; i < vdev->info.num_regions; i++)
1407 		vfio_unmap_region(kvm, &vdev->regions[i]);
1408 
1409 	device__unregister(&vdev->dev_hdr);
1410 
1411 	free(pdev->msix.irq_set);
1412 	free(pdev->msix.entries);
1413 	free(pdev->msi.irq_set);
1414 	free(pdev->msi.entries);
1415 }
1416