#include "linux/sizes.h"

#include "kvm/irq.h"
#include "kvm/kvm.h"
#include "kvm/kvm-cpu.h"
#include "kvm/vfio.h"

#include <assert.h>

#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <sys/resource.h>
#include <sys/time.h>

/* Some distros don't have the define. */
#ifndef PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1
#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1 12
#endif

/* Wrapper around UAPI vfio_irq_set */
union vfio_irq_eventfd {
	struct vfio_irq_set irq;
	u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)];
};

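/*
 * Store the eventfd in the variable-length payload that follows the
 * vfio_irq_set header, as expected by VFIO_DEVICE_SET_IRQS with
 * VFIO_IRQ_SET_DATA_EVENTFD.
 */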
static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd)
{
	memcpy(&evfd->irq.data, &fd, sizeof(fd));
}

/*
 * To support MSI and MSI-X with common code, track the host and guest states of
 * the MSI/MSI-X capability, and of individual vectors.
 *
 * Both MSI and MSI-X capabilities are enabled and disabled through registers.
 * Vectors cannot be individually disabled.
 */
#define msi_is_enabled(state) ((state) & VFIO_PCI_MSI_STATE_ENABLED)

/*
 * MSI-X: the control register allows masking all vectors, and the table allows
 * masking each vector individually.
 *
 * MSI: if the capability supports Per-Vector Masking, then the Mask Bit
 * register allows masking each vector individually. Otherwise there is no
 * masking for MSI.
 */
#define msi_is_masked(state) ((state) & VFIO_PCI_MSI_STATE_MASKED)

/*
 * A capability is empty when no vector has been registered with SET_IRQS
 * yet. It's an optimization specific to kvmtool to avoid issuing lots of
 * SET_IRQS ioctls when the guest configures the MSI-X table while the
 * capability is masked.
 */
#define msi_is_empty(state) ((state) & VFIO_PCI_MSI_STATE_EMPTY)

#define msi_update_state(state, val, bit) \
	(state) = (val) ? (state) | bit : (state) & ~bit;
#define msi_set_enabled(state, val) \
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
#define msi_set_masked(state, val) \
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
#define msi_set_empty(state, val) \
	msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)

static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev);

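/*
 * Bring the host MSI/MSI-X capability in line with the guest state: INTx is
 * disabled first (the modes are mutually exclusive), then the vector eventfds
 * are registered with VFIO, either all at once on the initial enable/unmask or
 * one vector at a time afterwards.
 */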
static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev,
				bool msix)
{
	size_t i;
	int ret = 0;
	int *eventfds;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	union vfio_irq_eventfd single = {
		.irq = {
			.argsz = sizeof(single),
			.flags = VFIO_IRQ_SET_DATA_EVENTFD |
				 VFIO_IRQ_SET_ACTION_TRIGGER,
			.index = msis->info.index,
			.count = 1,
		},
	};

	if (!msi_is_enabled(msis->guest_state))
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		/*
		 * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same
		 * time. Since INTx has to be enabled from the start (we don't
		 * have a reliable way to know when the guest starts using it),
		 * disable it now.
		 */
		vfio_pci_disable_intx(kvm, vdev);

	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

	/*
	 * Initial registration of the full range. This enables the physical
	 * MSI/MSI-X capability, which might have side effects. For instance
	 * when assigning virtio legacy devices, enabling the MSI capability
	 * modifies the config space layout!
	 *
	 * As an optimization, only update MSIs when the guest unmasks the
	 * capability. This greatly reduces the initialization time for a Linux
	 * guest with 2048+ MSIs. A Linux guest starts by enabling the MSI-X cap
	 * masked, then fills individual vectors, then unmasks the whole
	 * function. So we only do one VFIO ioctl when enabling for the first
	 * time, and then one when unmasking.
	 */
	if (!msi_is_enabled(msis->host_state) ||
	    (!msi_is_masked(msis->guest_state) &&
	     msi_is_empty(msis->host_state))) {
		bool empty = true;

		for (i = 0; i < msis->nr_entries; i++) {
			eventfds[i] = msis->entries[i].gsi >= 0 ?
				      msis->entries[i].eventfd : -1;

			if (eventfds[i] >= 0)
				empty = false;
		}

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(multi)");
			return ret;
		}

		msi_set_enabled(msis->host_state, true);
		msi_set_empty(msis->host_state, empty);

		return 0;
	}

	if (msi_is_masked(msis->guest_state)) {
		/* TODO: if host_state is neither empty nor masked, mask all vectors */
		return 0;
	}

	/* Update individual vectors to avoid breaking those in use */
	for (i = 0; i < msis->nr_entries; i++) {
		struct vfio_pci_msi_entry *entry = &msis->entries[i];
		int fd = entry->gsi >= 0 ? entry->eventfd : -1;

		if (fd == eventfds[i])
			continue;

		single.irq.start = i;
		set_vfio_irq_eventd_payload(&single, fd);

		ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
		if (ret < 0) {
			perror("VFIO_DEVICE_SET_IRQS(single)");
			break;
		}

		eventfds[i] = fd;

		if (msi_is_empty(msis->host_state) && fd >= 0)
			msi_set_empty(msis->host_state, false);
	}

	return ret;
}

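/*
 * Tear down the host MSI/MSI-X capability. Called when the guest clears the
 * enable bit; INTx is re-enabled afterwards if the device supports it.
 */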
static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev,
				 bool msix)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi;
	struct vfio_irq_set irq_set = {
		.argsz = sizeof(irq_set),
		.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = msis->info.index,
		.start = 0,
		.count = 0,
	};

	if (!msi_is_enabled(msis->host_state))
		return 0;

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	if (ret < 0) {
		perror("VFIO_DEVICE_SET_IRQS(NONE)");
		return ret;
	}

	msi_set_enabled(msis->host_state, false);
	msi_set_empty(msis->host_state, true);

	/*
	 * MSI or MSI-X may be disabled because the PCI driver detected an MSI
	 * interrupt failure and wants to roll back to INTx mode. In that case,
	 * re-enable INTx if the device supports it.
	 */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
		ret = vfio_pci_enable_intx(kvm, vdev);

	return ret >= 0 ? 0 : ret;
}

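/*
 * Allocate the eventfd and the KVM MSI route for a vector if needed, refresh
 * the route when the guest rewrites the message, and emulate per-vector
 * masking by adding or removing the irqfd.
 */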
static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
				     struct vfio_pci_msi_entry *entry)
{
	int ret;

	if (entry->eventfd < 0) {
		entry->eventfd = eventfd(0, 0);
		if (entry->eventfd < 0) {
			ret = -errno;
			vfio_dev_err(vdev, "cannot create eventfd");
			return ret;
		}
	}

	/* Allocate IRQ if necessary */
	if (entry->gsi < 0) {
		int ret = irq__add_msix_route(kvm, &entry->config.msg,
					      vdev->dev_hdr.dev_num << 3);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot create MSI-X route");
			return ret;
		}
		entry->gsi = ret;
	} else {
		irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
	}

	/*
	 * MSI masking is unimplemented in VFIO, so we have to handle it by
	 * disabling/enabling the IRQ route instead. We do it on the KVM side
	 * rather than VFIO, because:
	 * - it is 8x faster
	 * - it allows decoupling the masking logic from the capability state.
	 * - in the masked state, after removing the irqfd route, we could
	 *   easily plug the eventfd into a local handler, in order to serve
	 *   Pending Bit reads to the guest.
	 *
	 * So entry->host_state is masked when there is no active irqfd route.
	 */
	if (msi_is_masked(entry->guest_state) == msi_is_masked(entry->host_state))
		return 0;

	if (msi_is_masked(entry->host_state)) {
		ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
		if (ret < 0) {
			vfio_dev_err(vdev, "cannot setup irqfd");
			return ret;
		}
	} else {
		irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
	}

	msi_set_masked(entry->host_state, msi_is_masked(entry->guest_state));

	return 0;
}

static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				     u32 len, u8 is_write, void *ptr)
{
	struct vfio_pci_device *pdev = ptr;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	u64 offset = addr - pba->guest_phys_addr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	if (offset >= pba->size) {
		vfio_dev_err(vdev, "access outside of the MSIX PBA");
		return;
	}

	if (is_write)
		return;

	/*
	 * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA
	 * is completely useless here. Note that Linux doesn't use PBA.
	 */
	if (pread(vdev->fd, data, len, pba->fd_offset + offset) != (ssize_t)len)
		vfio_dev_err(vdev, "cannot access MSIX PBA\n");
}

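/*
 * MMIO handler for the emulated MSI-X table: reads return the shadow entries,
 * writes update the shadow copy and, when a vector control word is touched,
 * propagate mask changes and eventfd registration to KVM and VFIO.
 */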
static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
				       u32 len, u8 is_write, void *ptr)
{
	struct kvm *kvm = vcpu->kvm;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = ptr;
	struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);

	u64 offset = addr - pdev->msix_table.guest_phys_addr;
	if (offset >= pdev->msix_table.size) {
		vfio_dev_err(vdev, "access outside of the MSI-X table");
		return;
	}

	size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
	off_t field = offset % PCI_MSIX_ENTRY_SIZE;

	/*
	 * The PCI spec says that software must use aligned 4- or 8-byte
	 * accesses for the MSI-X table.
	 */
	if ((len != 4 && len != 8) || addr & (len - 1)) {
		vfio_dev_warn(vdev, "invalid MSI-X table access");
		return;
	}

	entry = &pdev->msix.entries[vector];

	mutex_lock(&pdev->msix.mutex);

	if (!is_write) {
		memcpy(data, (void *)&entry->config + field, len);
		goto out_unlock;
	}

	memcpy((void *)&entry->config + field, data, len);

	/*
	 * Check if the access touched the vector control register, which is at
	 * the end of the MSI-X entry.
	 */
	if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
		goto out_unlock;

	msi_set_masked(entry->guest_state, entry->config.ctrl &
		       PCI_MSIX_ENTRY_CTRL_MASKBIT);

	if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
		/* Not much we can do here. */
		vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");

out_unlock:
	mutex_unlock(&pdev->msix.mutex);
}

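/*
 * Config space write handler for the MSI-X capability: track the guest's
 * Function Mask and MSI-X Enable bits and enable or disable the host
 * capability accordingly.
 */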
static void vfio_pci_msix_cap_write(struct kvm *kvm,
				    struct vfio_device *vdev, u16 off,
				    void *data, int sz)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	off_t enable_pos = PCI_MSIX_FLAGS + 1;
	bool enable;
	u16 flags;

	off -= pdev->msix.pos;

	/* Check if access intersects with the MSI-X Enable bit */
	if (off > enable_pos || off + sz <= enable_pos)
		return;

	/* Read byte that contains the Enable bit */
	flags = *(u8 *)(data + enable_pos - off) << 8;

	mutex_lock(&pdev->msix.mutex);

	msi_set_masked(pdev->msix.guest_state, flags & PCI_MSIX_FLAGS_MASKALL);
	enable = flags & PCI_MSIX_FLAGS_ENABLE;
	msi_set_enabled(pdev->msix.guest_state, enable);

	if (enable && vfio_pci_enable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot enable MSIX");
	else if (!enable && vfio_pci_disable_msis(kvm, vdev, true))
		vfio_dev_err(vdev, "cannot disable MSIX");

	mutex_unlock(&pdev->msix.mutex);
}

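/*
 * Handle writes that touch the MSI Mask Bits register, when the capability
 * supports Per-Vector Masking. Returns 1 when the write intersected the mask
 * register (so the caller can skip further processing), 0 otherwise.
 */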
static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev,
				     u16 off, u8 *data, u32 sz)
{
	size_t i;
	u32 mask = 0;
	size_t mask_pos, start, limit;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT))
		return 0;

	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT)
		mask_pos = PCI_MSI_MASK_64;
	else
		mask_pos = PCI_MSI_MASK_32;

	if (off >= mask_pos + 4 || off + sz <= mask_pos)
		return 0;

	/* Set mask to current state */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		entry = &pdev->msi.entries[i];
		mask |= !!msi_is_masked(entry->guest_state) << i;
	}

	/* Update mask following the intersection of access and register */
	start = max_t(size_t, off, mask_pos);
	limit = min_t(size_t, off + sz, mask_pos + 4);

	memcpy((void *)&mask + start - mask_pos, data + start - off,
	       limit - start);

	/* Update states if necessary */
	for (i = 0; i < pdev->msi.nr_entries; i++) {
		bool masked = mask & (1 << i);

		entry = &pdev->msi.entries[i];
		if (masked != msi_is_masked(entry->guest_state)) {
			msi_set_masked(entry->guest_state, masked);
			vfio_pci_update_msi_entry(kvm, vdev, entry);
		}
	}

	return 1;
}

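/*
 * Config space write handler for the MSI capability: handle mask bit updates,
 * then rebuild the MSI message and KVM routes when the guest touches the
 * Enable bit.
 */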
static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev,
				   u16 off, u8 *data, u32 sz)
{
	u8 ctrl;
	struct msi_msg msg;
	size_t i, nr_vectors;
	struct vfio_pci_msi_entry *entry;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	off -= pdev->msi.pos;

	mutex_lock(&pdev->msi.mutex);

	/* Check if the guest is trying to update mask bits */
	if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz))
		goto out_unlock;

	/* Only modify routes when the guest pokes the enable bit */
	if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS)
		goto out_unlock;

	ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off);

	msi_set_enabled(pdev->msi.guest_state, ctrl & PCI_MSI_FLAGS_ENABLE);

	if (!msi_is_enabled(pdev->msi.guest_state)) {
		vfio_pci_disable_msis(kvm, vdev, false);
		goto out_unlock;
	}

	/* Create routes for the requested vectors */
	nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4);

	msg.address_lo = msi_cap_64->address_lo;
	if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) {
		msg.address_hi = msi_cap_64->address_hi;
		msg.data = msi_cap_64->data;
	} else {
		struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64;
		msg.address_hi = 0;
		msg.data = msi_cap_32->data;
	}

	for (i = 0; i < nr_vectors; i++) {
		entry = &pdev->msi.entries[i];

		/*
		 * Set the MSI data value as required by the PCI local
		 * bus specifications, MSI capability, "Message Data".
		 */
		msg.data &= ~(nr_vectors - 1);
		msg.data |= i;

		entry->config.msg = msg;
		vfio_pci_update_msi_entry(kvm, vdev, entry);
	}

	/* Update the physical capability if necessary */
	if (vfio_pci_enable_msis(kvm, vdev, false))
		vfio_dev_err(vdev, "cannot enable MSI");

out_unlock:
	mutex_unlock(&pdev->msi.mutex);
}

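/*
 * Called when the guest maps a BAR: register MMIO emulation for the MSI-X
 * table and PBA, and map everything else directly into the guest.
 */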
static int vfio_pci_bar_activate(struct kvm *kvm,
				 struct pci_device_header *pci_hdr,
				 int bar_num, void *data)
{
	struct vfio_device *vdev = data;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct vfio_region *region;
	u32 bar_addr;
	bool has_msix;
	int ret;

	assert((u32)bar_num < vdev->info.num_regions);

	region = &vdev->regions[bar_num];
	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;

	bar_addr = pci__bar_address(pci_hdr, bar_num);
	if (pci__bar_is_io(pci_hdr, bar_num))
		region->port_base = bar_addr;
	else
		region->guest_phys_addr = bar_addr;

	if (has_msix && (u32)bar_num == table->bar) {
		table->guest_phys_addr = region->guest_phys_addr;
		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
					 table->size, false,
					 vfio_pci_msix_table_access, pdev);
		/*
		 * The MSIX table and the PBA structure can share the same BAR,
		 * but for convenience we register different regions for mmio
		 * emulation. We want to update both if they share the same
		 * BAR.
		 */
		if (ret < 0 || table->bar != pba->bar)
			goto out;
	}

	if (has_msix && (u32)bar_num == pba->bar) {
		if (pba->bar == table->bar)
			pba->guest_phys_addr = table->guest_phys_addr + pba->bar_offset;
		else
			pba->guest_phys_addr = region->guest_phys_addr;
		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
					 pba->size, false,
					 vfio_pci_msix_pba_access, pdev);
		goto out;
	}

	ret = vfio_map_region(kvm, vdev, region);
out:
	return ret;
}

static int vfio_pci_bar_deactivate(struct kvm *kvm,
				   struct pci_device_header *pci_hdr,
				   int bar_num, void *data)
{
	struct vfio_device *vdev = data;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct vfio_region *region;
	bool has_msix, success;
	int ret;

	assert((u32)bar_num < vdev->info.num_regions);

	region = &vdev->regions[bar_num];
	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;

	if (has_msix && (u32)bar_num == table->bar) {
		success = kvm__deregister_mmio(kvm, table->guest_phys_addr);
		/* kvm__deregister_mmio fails when the region is not found. */
		ret = (success ? 0 : -ENOENT);
		/* See vfio_pci_bar_activate(). */
		if (ret < 0 || table->bar != pba->bar)
			goto out;
	}

	if (has_msix && (u32)bar_num == pba->bar) {
		success = kvm__deregister_mmio(kvm, pba->guest_phys_addr);
		ret = (success ? 0 : -ENOENT);
		goto out;
	}

	vfio_unmap_region(kvm, region);
	ret = 0;

out:
	return ret;
}

static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
			      u16 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	char base[sz];

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	/* Dummy read in case of side-effects */
	if (pread(vdev->fd, base, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

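/*
 * Config space write handler: forward the write to the physical device, give
 * the MSI and MSI-X handlers a chance to react to capability changes, and
 * finally read the written bytes back from the device.
 */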
static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr,
			       u16 offset, void *data, int sz)
{
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev;
	struct vfio_device *vdev;
	u32 tmp;

	/* Make sure a larger size will not overrun tmp on the stack. */
	assert(sz <= 4);

	if (offset == PCI_ROM_ADDRESS)
		return;

	pdev = container_of(pci_hdr, struct vfio_pci_device, hdr);
	vdev = container_of(pdev, struct vfio_device, pci);
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;

	if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
			      sz, offset);

	/* Handle MSI write now, since it might update the hardware capability */
	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
		vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI)
		vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz);

	if (pread(vdev->fd, &tmp, sz, info->offset + offset) != sz)
		vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
			      sz, offset);
}

static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr)
{
	size_t size = 10;

	if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT)
		size += 4;
	if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT)
		size += 10;

	return size;
}

static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
{
	switch (cap_hdr->type) {
	case PCI_CAP_ID_MSIX:
		return PCI_CAP_MSIX_SIZEOF;
	case PCI_CAP_ID_MSI:
		return vfio_pci_msi_cap_size((void *)cap_hdr);
	case PCI_CAP_ID_EXP:
		/*
		 * We don't emulate any of the link, slot and root complex
		 * properties, so ignore them.
		 */
		return PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1;
	default:
		pr_err("unknown PCI capability 0x%x", cap_hdr->type);
		return 0;
	}
}

static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
			    struct pci_cap_hdr *cap, off_t pos)
{
	struct pci_cap_hdr *last;
	struct pci_device_header *hdr = &vdev->pci.hdr;

	cap->next = 0;

	if (!hdr->capabilities) {
		hdr->capabilities = pos;
		hdr->status |= PCI_STATUS_CAP_LIST;
	} else {
		last = PCI_CAP(virt_hdr, hdr->capabilities);

		while (last->next)
			last = PCI_CAP(virt_hdr, last->next);

		last->next = pos;
	}

	memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));

	return 0;
}

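/*
 * Walk the device's capability list and rebuild a virtual one containing only
 * the capabilities we emulate (MSI, MSI-X and, where supported, PCI Express),
 * keeping each capability at its original offset.
 */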
static int vfio_pci_parse_caps(struct vfio_device *vdev)
{
	int ret;
	size_t size;
	u16 pos, next;
	struct pci_cap_hdr *cap;
	u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY];
	struct vfio_pci_device *pdev = &vdev->pci;

	if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
		return 0;

	memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY);

	pos = pdev->hdr.capabilities & ~3;

	pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
	pdev->hdr.capabilities = 0;

	for (; pos; pos = next) {
		cap = PCI_CAP(&pdev->hdr, pos);
		next = cap->next;

		switch (cap->type) {
		case PCI_CAP_ID_MSIX:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msix.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
			break;
		case PCI_CAP_ID_MSI:
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;

			pdev->msi.pos = pos;
			pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI;
			break;
		case PCI_CAP_ID_EXP:
			if (!arch_has_pci_exp())
				continue;
			ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
			if (ret)
				return ret;
			break;
		}
	}

	/* Wipe remaining capabilities */
	pos = PCI_STD_HEADER_SIZEOF;
	size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF;
	memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);

	return 0;
}

static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
{
	ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
		vfio_dev_err(vdev, "Config Space not found");
		return -ENODEV;
	}

	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	*info = (struct vfio_region_info) {
		.argsz = sizeof(*info),
		.index = VFIO_PCI_CONFIG_REGION_INDEX,
	};

	ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
	if (!info->size) {
		vfio_dev_err(vdev, "Config Space has size zero?!");
		return -EINVAL;
	}

	/* Read standard headers and capabilities */
	if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
		vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
		return -EIO;
	}

	/* Strip bit 7, which indicates a multifunction device */
	pdev->hdr.header_type &= 0x7f;

	if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) {
		vfio_dev_err(vdev, "unsupported header type %u",
			     pdev->hdr.header_type);
		return -EOPNOTSUPP;
	}

	if (pdev->hdr.irq_pin)
		pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;

	vfio_pci_parse_caps(vdev);

	return 0;
}

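/*
 * Rewrite the shadow Configuration Space to reflect what the guest will see:
 * fake BAR values, no expansion ROM or CardBus, a tidied-up MSI-X capability,
 * and our own config access callbacks. The fixed-up header is also written
 * back to the device through VFIO.
 */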
static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
{
	int i;
	u64 base;
	ssize_t hdr_sz;
	struct msix_cap *msix;
	struct vfio_region_info *info;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region;

	/* Initialise the BARs */
	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		if ((u32)i == vdev->info.num_regions)
			break;

		region = &vdev->regions[i];
		/* Construct a fake reg to match what we've mapped. */
		if (region->is_ioport) {
			base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) |
				PCI_BASE_ADDRESS_SPACE_IO;
		} else {
			base = (region->guest_phys_addr &
				PCI_BASE_ADDRESS_MEM_MASK) |
				PCI_BASE_ADDRESS_SPACE_MEMORY;
		}

		pdev->hdr.bar[i] = base;

		if (!base)
			continue;

		pdev->hdr.bar_size[i] = region->info.size;
	}

	/* I really can't be bothered to support cardbus. */
	pdev->hdr.card_bus = 0;

	/*
	 * Nuke the expansion ROM for now. If we want to do this properly,
	 * we need to save its size somewhere and map into the guest.
	 */
	pdev->hdr.exp_rom_bar = 0;

	/* Plumb in our fake MSI-X capability, if we have it. */
	msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
	if (msix) {
		/* Add a shortcut to the PBA region for the MMIO handler */
		int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
		u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET;

		pdev->msix_pba.fd_offset = vdev->regions[pba_index].info.offset +
					   pba_bar_offset;

		/* Tidy up the capability */
		msix->table_offset &= PCI_MSIX_TABLE_BIR;
		if (pdev->msix_table.bar == pdev->msix_pba.bar) {
			/* Keep the same offset as the MSIX cap. */
			pdev->msix_pba.bar_offset = pba_bar_offset;
		} else {
			/* PBA is at the start of the BAR. */
			msix->pba_offset &= PCI_MSIX_PBA_BIR;
			pdev->msix_pba.bar_offset = 0;
		}
	}

	/* Install our fake Configuration Space */
	info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
	/*
	 * We don't touch the extended configuration space, let's be cautious
	 * and not overwrite it all with zeros, or bad things might happen.
	 */
	hdr_sz = PCI_DEV_CFG_SIZE_LEGACY;
	if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) {
		vfio_dev_err(vdev, "failed to write %zd bytes to Config Space",
			     hdr_sz);
		return -EIO;
	}

	/* Register callbacks for cfg accesses */
	pdev->hdr.cfg_ops = (struct pci_config_operations) {
		.read = vfio_pci_cfg_read,
		.write = vfio_pci_cfg_write,
	};

	pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH;

	return 0;
}

static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index,
				    struct vfio_region_info *info)
{
	int ret;

	*info = (struct vfio_region_info) {
		.argsz = sizeof(*info),
		.index = index,
	};

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
	if (ret) {
		ret = -errno;
		vfio_dev_err(vdev, "cannot get info for BAR %u", index);
		return ret;
	}

	if (info->size && !is_power_of_two(info->size)) {
		vfio_dev_err(vdev, "region is not power of two: 0x%llx",
			     info->size);
		return -EINVAL;
	}

	return 0;
}

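/*
 * Allocate the shadow MSI-X entries and reserve guest MMIO space for the
 * emulated MSI-X table and PBA, creating a virtual PBA when it shares a BAR
 * with the table.
 */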
static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	size_t i;
	size_t map_size;
	size_t nr_entries;
	struct vfio_pci_msi_entry *entries;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
	struct vfio_pci_msix_table *table = &pdev->msix_table;
	struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
	struct vfio_region_info info;

	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;

	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;

	/* MSIX table and PBA must support QWORD accesses. */
	table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, 8);
	pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), 8);

	entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
	if (!entries)
		return -ENOMEM;

	for (i = 0; i < nr_entries; i++)
		entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;

	ret = vfio_pci_get_region_info(vdev, table->bar, &info);
	if (ret)
		goto out_free;
	if (!info.size) {
		ret = -EINVAL;
		goto out_free;
	}

	map_size = ALIGN(info.size, MAX_PAGE_SIZE);
	table->guest_phys_addr = pci_get_mmio_block(map_size);
	if (!table->guest_phys_addr) {
		pr_err("cannot allocate MMIO space");
		ret = -ENOMEM;
		goto out_free;
	}

	/*
	 * We could map the physical PBA directly into the guest, but it's
	 * likely smaller than a page, and we can only hand full pages to the
	 * guest. Even though the PCI spec disallows sharing a page used for
	 * MSI-X with any other resource, it allows sharing the same page
	 * between the MSI-X table and the PBA. For the sake of isolation,
	 * create a virtual PBA.
	 */
	if (table->bar == pba->bar) {
		u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET;

		/* Sanity checks. */
		if (table->size > pba_bar_offset)
			die("MSIX table overlaps with PBA");
		if (pba_bar_offset + pba->size > info.size)
			die("PBA exceeds the size of the region");
		pba->guest_phys_addr = table->guest_phys_addr + pba_bar_offset;
	} else {
		ret = vfio_pci_get_region_info(vdev, pba->bar, &info);
		if (ret)
			goto out_free;
		if (!info.size) {
			ret = -EINVAL;
			goto out_free;
		}

		map_size = ALIGN(info.size, MAX_PAGE_SIZE);
		pba->guest_phys_addr = pci_get_mmio_block(map_size);
		if (!pba->guest_phys_addr) {
			pr_err("cannot allocate MMIO space");
			ret = -ENOMEM;
			goto out_free;
		}
	}

	pdev->msix.entries = entries;
	pdev->msix.nr_entries = nr_entries;

	return 0;

out_free:
	free(entries);

	return ret;
}

static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev)
{
	struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos);

	pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
	pdev->msi.entries = calloc(pdev->msi.nr_entries,
				   sizeof(struct vfio_pci_msi_entry));
	if (!pdev->msi.entries)
		return -ENOMEM;

	return 0;
}

static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
				  size_t nr)
{
	int ret;
	u32 bar;
	size_t map_size;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_region *region;

	if (nr >= vdev->info.num_regions)
		return 0;

	region = &vdev->regions[nr];
	bar = pdev->hdr.bar[nr];

	region->vdev = vdev;
	region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO);

	ret = vfio_pci_get_region_info(vdev, nr, &region->info);
	if (ret)
		return ret;

	/* Ignore invalid or unimplemented regions */
	if (!region->info.size)
		return 0;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		/* Trap and emulate MSI-X table */
		if (nr == pdev->msix_table.bar) {
			region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
			return 0;
		} else if (nr == pdev->msix_pba.bar) {
			region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
			return 0;
		}
	}

	if (region->is_ioport) {
		region->port_base = pci_get_io_port_block(region->info.size);
	} else {
		/* Grab some MMIO space in the guest */
		map_size = ALIGN(region->info.size, PAGE_SIZE);
		region->guest_phys_addr = pci_get_mmio_block(map_size);
	}

	return 0;
}

static int vfio_pci_configure_dev_regions(struct kvm *kvm,
					  struct vfio_device *vdev)
{
	int ret;
	u32 bar;
	size_t i;
	bool is_64bit = false;
	struct vfio_pci_device *pdev = &vdev->pci;

	ret = vfio_pci_parse_cfg_space(vdev);
	if (ret)
		return ret;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		ret = vfio_pci_create_msix_table(kvm, vdev);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		ret = vfio_pci_create_msi_cap(kvm, pdev);
		if (ret)
			return ret;
	}

	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
		/* Ignore top half of 64-bit BAR */
		if (is_64bit) {
			is_64bit = false;
			continue;
		}

		ret = vfio_pci_configure_bar(kvm, vdev, i);
		if (ret)
			return ret;

		bar = pdev->hdr.bar[i];
		is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) ==
			   PCI_BASE_ADDRESS_SPACE_MEMORY &&
			   bar & PCI_BASE_ADDRESS_MEM_TYPE_64;
	}

	/* We've configured the BARs, fake up a Configuration Space */
	ret = vfio_pci_fixup_cfg_space(vdev);
	if (ret)
		return ret;

	return pci__register_bar_regions(kvm, &pdev->hdr, vfio_pci_bar_activate,
					 vfio_pci_bar_deactivate, vdev);
}

/*
 * Attempt to update the FD limit if opening an eventfd for each IRQ vector
 * would hit the limit, which is likely to happen when a device uses 2048 MSIs.
 */
static int vfio_pci_reserve_irq_fds(size_t num)
{
	/*
	 * I counted around 27 fds under normal load. Let's add 100 for good
	 * measure.
	 */
	static size_t needed = 128;
	struct rlimit fd_limit, new_limit;

	needed += num;

	if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
		perror("getrlimit(RLIMIT_NOFILE)");
		return 0;
	}

	if (fd_limit.rlim_cur >= needed)
		return 0;

	new_limit.rlim_cur = needed;

	if (fd_limit.rlim_max < needed)
		/* Try to bump hard limit (root only) */
		new_limit.rlim_max = needed;
	else
		new_limit.rlim_max = fd_limit.rlim_max;

	if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
		perror("setrlimit(RLIMIT_NOFILE)");
		pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
			   (size_t)(needed - fd_limit.rlim_cur));
	}

	return 0;
}

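/*
 * Common MSI/MSI-X initialisation: query the IRQ info from VFIO, pre-allocate
 * the vfio_irq_set used for bulk SET_IRQS calls, and mark every vector as
 * unrouted and host-masked.
 */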
static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
			      struct vfio_pci_msi_common *msis)
{
	int ret;
	size_t i;
	int *eventfds;
	size_t irq_set_size;
	struct vfio_pci_msi_entry *entry;
	size_t nr_entries = msis->nr_entries;

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
	if (ret || msis->info.count == 0) {
		vfio_dev_err(vdev, "no MSI reported by VFIO");
		return -ENODEV;
	}

	if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not EVENTFD capable");
		return -EINVAL;
	}

	if (msis->info.count != nr_entries) {
		vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
		return -EINVAL;
	}

	mutex_init(&msis->mutex);

	vfio_pci_reserve_irq_fds(nr_entries);

	irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
	msis->irq_set = malloc(irq_set_size);
	if (!msis->irq_set)
		return -ENOMEM;

	*msis->irq_set = (struct vfio_irq_set) {
		.argsz = irq_set_size,
		.flags = VFIO_IRQ_SET_DATA_EVENTFD |
			 VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = msis->info.index,
		.start = 0,
		.count = nr_entries,
	};

	eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);

	for (i = 0; i < nr_entries; i++) {
		entry = &msis->entries[i];
		entry->gsi = -1;
		entry->eventfd = -1;
		msi_set_masked(entry->guest_state, false);
		msi_set_masked(entry->host_state, true);
		eventfds[i] = -1;
	}

	return 0;
}

static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->intx_gsi;
	struct vfio_irq_set irq_set = {
		.argsz = sizeof(irq_set),
		.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = VFIO_PCI_INTX_IRQ_INDEX,
	};

	if (pdev->intx_fd == -1)
		return;

	pr_debug("user requested MSI, disabling INTx %d", gsi);

	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
	irq__del_irqfd(kvm, gsi, pdev->intx_fd);

	close(pdev->intx_fd);
	close(pdev->unmask_fd);
	pdev->intx_fd = -1;
}

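/*
 * Wire up INTx: create the trigger and unmask eventfds, route them through
 * KVM's resampling irqfd, and hand them to VFIO with SET_IRQS.
 */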
static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	int trigger_fd, unmask_fd;
	union vfio_irq_eventfd trigger;
	union vfio_irq_eventfd unmask;
	struct vfio_pci_device *pdev = &vdev->pci;
	int gsi = pdev->intx_gsi;

	if (pdev->intx_fd != -1)
		return 0;

	/*
	 * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd
	 * signals an interrupt from host to guest, and unmask_fd signals the
	 * deassertion of the line from guest to host.
	 */
	trigger_fd = eventfd(0, 0);
	if (trigger_fd < 0) {
		vfio_dev_err(vdev, "failed to create trigger eventfd");
		return trigger_fd;
	}

	unmask_fd = eventfd(0, 0);
	if (unmask_fd < 0) {
		vfio_dev_err(vdev, "failed to create unmask eventfd");
		close(trigger_fd);
		return unmask_fd;
	}

	ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd);
	if (ret)
		goto err_close;

	trigger.irq = (struct vfio_irq_set) {
		.argsz = sizeof(trigger),
		.flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = VFIO_PCI_INTX_IRQ_INDEX,
		.start = 0,
		.count = 1,
	};
	set_vfio_irq_eventd_payload(&trigger, trigger_fd);

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup VFIO IRQ");
		goto err_delete_line;
	}

	unmask.irq = (struct vfio_irq_set) {
		.argsz = sizeof(unmask),
		.flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK,
		.index = VFIO_PCI_INTX_IRQ_INDEX,
		.start = 0,
		.count = 1,
	};
	set_vfio_irq_eventd_payload(&unmask, unmask_fd);

	ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask);
	if (ret < 0) {
		vfio_dev_err(vdev, "failed to setup unmask IRQ");
		goto err_remove_event;
	}

	pdev->intx_fd = trigger_fd;
	pdev->unmask_fd = unmask_fd;

	return 0;

err_remove_event:
	/* Remove trigger event */
	trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	trigger.irq.count = 0;
	ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger);

err_delete_line:
	irq__del_irqfd(kvm, gsi, trigger_fd);

err_close:
	close(trigger_fd);
	close(unmask_fd);
	return ret;
}

static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;
	struct vfio_pci_device *pdev = &vdev->pci;
	struct vfio_irq_info irq_info = {
		.argsz = sizeof(irq_info),
		.index = VFIO_PCI_INTX_IRQ_INDEX,
	};

	vfio_pci_reserve_irq_fds(2);

	ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
	if (ret || irq_info.count == 0) {
		vfio_dev_err(vdev, "no INTx reported by VFIO");
		return -ENODEV;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
		vfio_dev_err(vdev, "interrupt not eventfd capable");
		return -EINVAL;
	}

	if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
		vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED");
		return -EINVAL;
	}

	/* The guest is going to overwrite our irq_line... */
	pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET;

	pdev->intx_fd = -1;

	return 0;
}

static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret = 0;
	struct vfio_pci_device *pdev = &vdev->pci;

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
		pdev->msix.info = (struct vfio_irq_info) {
			.argsz = sizeof(pdev->msix.info),
			.index = VFIO_PCI_MSIX_IRQ_INDEX,
		};
		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) {
		pdev->msi.info = (struct vfio_irq_info) {
			.argsz = sizeof(pdev->msi.info),
			.index = VFIO_PCI_MSI_IRQ_INDEX,
		};
		ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi);
		if (ret)
			return ret;
	}

	if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
		pci__assign_irq(&vdev->pci.hdr);

		ret = vfio_pci_init_intx(kvm, vdev);
		if (ret)
			return ret;

		ret = vfio_pci_enable_intx(kvm, vdev);
	}

	return ret;
}

int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
{
	int ret;

	ret = vfio_pci_configure_dev_regions(kvm, vdev);
	if (ret) {
		vfio_dev_err(vdev, "failed to configure regions");
		return ret;
	}

	vdev->dev_hdr = (struct device_header) {
		.bus_type = DEVICE_BUS_PCI,
		.data = &vdev->pci.hdr,
	};

	ret = device__register(&vdev->dev_hdr);
	if (ret) {
		vfio_dev_err(vdev, "failed to register VFIO device");
		return ret;
	}

	ret = vfio_pci_configure_dev_irqs(kvm, vdev);
	if (ret) {
		vfio_dev_err(vdev, "failed to configure IRQs");
		return ret;
	}

	return 0;
}

void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
{
	size_t i;
	struct vfio_pci_device *pdev = &vdev->pci;

	for (i = 0; i < vdev->info.num_regions; i++)
		vfio_unmap_region(kvm, &vdev->regions[i]);

	device__unregister(&vdev->dev_hdr);

	free(pdev->msix.irq_set);
	free(pdev->msix.entries);
	free(pdev->msi.irq_set);
	free(pdev->msi.entries);
}