xref: /qemu/hw/vfio/pci-quirks.c (revision dee69a8ca6ebc325939dc0b6800becdb26cd9dbf)
1 /*
2  * device quirks for PCI devices
3  *
4  * Copyright Red Hat, Inc. 2012-2015
5  *
6  * Authors:
7  *  Alex Williamson <alex.williamson@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include CONFIG_DEVICES
15 #include "exec/memop.h"
16 #include "qemu/units.h"
17 #include "qemu/log.h"
18 #include "qemu/error-report.h"
19 #include "qemu/main-loop.h"
20 #include "qemu/module.h"
21 #include "qemu/range.h"
22 #include "qapi/error.h"
23 #include "qapi/visitor.h"
24 #include <sys/ioctl.h>
25 #include "hw/nvram/fw_cfg.h"
26 #include "hw/qdev-properties.h"
27 #include "pci.h"
28 #include "pci-quirks.h"
29 #include "trace.h"
30 
31 /*
32  * List of device ids/vendor ids for which to disable
33  * option rom loading. This avoids the guest hangs during rom
34  * execution as noticed with the BCM 57810 card for lack of a
35  * more better way to handle such issues.
36  * The  user can still override by specifying a romfile or
37  * rombar=1.
38  * Please see https://bugs.launchpad.net/qemu/+bug/1284874
39  * for an analysis of the 57810 card hang. When adding
40  * a new vendor id/device id combination below, please also add
41  * your card/environment details and information that could
42  * help in debugging to the bug tracking this issue
43  */
/*
 * Vendor/device ID pairs whose option ROM is not exposed to the guest by
 * default (see the comment above for the rationale and bug reference).
 */
static const struct {
    uint32_t vendor;
    uint32_t device;
} rom_denylist[] = {
    { 0x14e4, 0x168e }, /* Broadcom BCM 57810 */
};
50 
51 bool vfio_opt_rom_in_denylist(VFIOPCIDevice *vdev)
52 {
53     int i;
54 
55     for (i = 0 ; i < ARRAY_SIZE(rom_denylist); i++) {
56         if (vfio_pci_is(vdev, rom_denylist[i].vendor, rom_denylist[i].device)) {
57             trace_vfio_quirk_rom_in_denylist(vdev->vbasedev.name,
58                                              rom_denylist[i].vendor,
59                                              rom_denylist[i].device);
60             return true;
61         }
62     }
63     return false;
64 }
65 
66 /*
67  * Device specific region quirks (mostly backdoors to PCI config space)
68  */
69 
70 static uint64_t vfio_generic_window_quirk_address_read(void *opaque,
71                                                        hwaddr addr,
72                                                        unsigned size)
73 {
74     VFIOConfigWindowQuirk *window = opaque;
75     VFIOPCIDevice *vdev = window->vdev;
76 
77     return vfio_region_read(&vdev->bars[window->bar].region,
78                             addr + window->address_offset, size);
79 }
80 
81 static void vfio_generic_window_quirk_address_write(void *opaque, hwaddr addr,
82                                                     uint64_t data,
83                                                     unsigned size)
84 {
85     VFIOConfigWindowQuirk *window = opaque;
86     VFIOPCIDevice *vdev = window->vdev;
87     int i;
88 
89     window->window_enabled = false;
90 
91     vfio_region_write(&vdev->bars[window->bar].region,
92                       addr + window->address_offset, data, size);
93 
94     for (i = 0; i < window->nr_matches; i++) {
95         if ((data & ~window->matches[i].mask) == window->matches[i].match) {
96             window->window_enabled = true;
97             window->address_val = data & window->matches[i].mask;
98             trace_vfio_quirk_generic_window_address_write(vdev->vbasedev.name,
99                                     memory_region_name(window->addr_mem), data);
100             break;
101         }
102     }
103 }
104 
/* MemoryRegionOps trapping the address register of a config window. */
const MemoryRegionOps vfio_generic_window_address_quirk = {
    .read = vfio_generic_window_quirk_address_read,
    .write = vfio_generic_window_quirk_address_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
110 
111 static uint64_t vfio_generic_window_quirk_data_read(void *opaque,
112                                                     hwaddr addr, unsigned size)
113 {
114     VFIOConfigWindowQuirk *window = opaque;
115     VFIOPCIDevice *vdev = window->vdev;
116     uint64_t data;
117 
118     /* Always read data reg, discard if window enabled */
119     data = vfio_region_read(&vdev->bars[window->bar].region,
120                             addr + window->data_offset, size);
121 
122     if (window->window_enabled) {
123         data = vfio_pci_read_config(&vdev->pdev, window->address_val, size);
124         trace_vfio_quirk_generic_window_data_read(vdev->vbasedev.name,
125                                     memory_region_name(window->data_mem), data);
126     }
127 
128     return data;
129 }
130 
131 static void vfio_generic_window_quirk_data_write(void *opaque, hwaddr addr,
132                                                  uint64_t data, unsigned size)
133 {
134     VFIOConfigWindowQuirk *window = opaque;
135     VFIOPCIDevice *vdev = window->vdev;
136 
137     if (window->window_enabled) {
138         vfio_pci_write_config(&vdev->pdev, window->address_val, data, size);
139         trace_vfio_quirk_generic_window_data_write(vdev->vbasedev.name,
140                                     memory_region_name(window->data_mem), data);
141         return;
142     }
143 
144     vfio_region_write(&vdev->bars[window->bar].region,
145                       addr + window->data_offset, data, size);
146 }
147 
/* MemoryRegionOps trapping the data register of a config window. */
const MemoryRegionOps vfio_generic_window_data_quirk = {
    .read = vfio_generic_window_quirk_data_read,
    .write = vfio_generic_window_quirk_data_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
153 
154 static uint64_t vfio_generic_quirk_mirror_read(void *opaque,
155                                                hwaddr addr, unsigned size)
156 {
157     VFIOConfigMirrorQuirk *mirror = opaque;
158     VFIOPCIDevice *vdev = mirror->vdev;
159     uint64_t data;
160 
161     /* Read and discard in case the hardware cares */
162     (void)vfio_region_read(&vdev->bars[mirror->bar].region,
163                            addr + mirror->offset, size);
164 
165     data = vfio_pci_read_config(&vdev->pdev, addr, size);
166     trace_vfio_quirk_generic_mirror_read(vdev->vbasedev.name,
167                                          memory_region_name(mirror->mem),
168                                          addr, data);
169     return data;
170 }
171 
172 static void vfio_generic_quirk_mirror_write(void *opaque, hwaddr addr,
173                                             uint64_t data, unsigned size)
174 {
175     VFIOConfigMirrorQuirk *mirror = opaque;
176     VFIOPCIDevice *vdev = mirror->vdev;
177 
178     vfio_pci_write_config(&vdev->pdev, addr, data, size);
179     trace_vfio_quirk_generic_mirror_write(vdev->vbasedev.name,
180                                           memory_region_name(mirror->mem),
181                                           addr, data);
182 }
183 
/* MemoryRegionOps trapping a BAR-resident mirror of PCI config space. */
const MemoryRegionOps vfio_generic_mirror_quirk = {
    .read = vfio_generic_quirk_mirror_read,
    .write = vfio_generic_quirk_mirror_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
189 
/*
 * Is range1 fully contained within range2?
 *
 * @first1/@len1: start and byte length of range1
 * @first2/@len2: start and byte length of range2
 *
 * Written without computing "first + len" directly: the naive
 * "first1 + len1 <= first2 + len2" comparison yields a false positive
 * when either sum wraps around UINT64_MAX.  For all non-wrapping inputs
 * this form is equivalent (given first1 >= first2 and len1 <= len2,
 * first1 - first2 <= len2 - len1 <=> first1 + len1 <= first2 + len2).
 */
static bool vfio_range_contained(uint64_t first1, uint64_t len1,
                                 uint64_t first2, uint64_t len2)
{
    return first1 >= first2 && len1 <= len2 &&
           first1 - first2 <= len2 - len1;
}
195 
196 #define PCI_VENDOR_ID_ATI               0x1002
197 
198 /*
199  * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
200  * through VGA register 0x3c3.  On newer cards, the I/O port BAR is always
201  * BAR4 (older cards like the X550 used BAR1, but we don't care to support
202  * those).  Note that on bare metal, a read of 0x3c3 doesn't always return the
203  * I/O port BAR address.  Originally this was coded to return the virtual BAR
204  * address only if the physical register read returns the actual BAR address,
205  * but users have reported greater success if we return the virtual address
206  * unconditionally.
207  */
208 static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
209                                         hwaddr addr, unsigned size)
210 {
211     VFIOPCIDevice *vdev = opaque;
212     uint64_t data = vfio_pci_read_config(&vdev->pdev,
213                                          PCI_BASE_ADDRESS_4 + 1, size);
214 
215     trace_vfio_quirk_ati_3c3_read(vdev->vbasedev.name, data);
216 
217     return data;
218 }
219 
/*
 * VGA register 0x3c3 is effectively read-only for this quirk; log and
 * drop any guest write rather than forwarding it to hardware.
 */
static void vfio_ati_3c3_quirk_write(void *opaque, hwaddr addr,
                                        uint64_t data, unsigned size)
{
    qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid access\n", __func__);
}
225 
/* MemoryRegionOps for the ATI 0x3c3 VGA register trap. */
static const MemoryRegionOps vfio_ati_3c3_quirk = {
    .read = vfio_ati_3c3_quirk_read,
    .write = vfio_ati_3c3_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
231 
232 VFIOQuirk *vfio_quirk_alloc(int nr_mem)
233 {
234     VFIOQuirk *quirk = g_new0(VFIOQuirk, 1);
235     QLIST_INIT(&quirk->ioeventfds);
236     quirk->mem = g_new0(MemoryRegion, nr_mem);
237     quirk->nr_mem = nr_mem;
238 
239     return quirk;
240 }
241 
/*
 * Tear down one ioeventfd: unlink it from its quirk's list, detach the
 * eventfd from the MemoryRegion, undo the kernel-side VFIO acceleration
 * (or the userspace fd handler fallback), and free it.  The ordering —
 * region eventfd removal before disabling the backend, notifier cleanup
 * last — mirrors vfio_ioeventfd_init() in reverse.
 */
static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
{
    QLIST_REMOVE(ioeventfd, next);
    memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
                              true, ioeventfd->data, &ioeventfd->e);

    if (ioeventfd->vfio) {
        /* Kernel-side acceleration was in use: fd = -1 de-assigns it. */
        struct vfio_device_ioeventfd vfio_ioeventfd;

        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
        vfio_ioeventfd.flags = ioeventfd->size;
        vfio_ioeventfd.data = ioeventfd->data;
        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
                                ioeventfd->region_addr;
        vfio_ioeventfd.fd = -1;

        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd)) {
            error_report("Failed to remove vfio ioeventfd for %s+0x%"
                         HWADDR_PRIx"[%d]:0x%"PRIx64" (%m)",
                         memory_region_name(ioeventfd->mr), ioeventfd->addr,
                         ioeventfd->size, ioeventfd->data);
        }
    } else {
        /* Userspace fallback: drop the fd handler installed at init. */
        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
                            NULL, NULL, NULL);
    }

    event_notifier_cleanup(&ioeventfd->e);
    trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
                              (uint64_t)ioeventfd->addr, ioeventfd->size,
                              ioeventfd->data);
    g_free(ioeventfd);
}
275 
276 static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
277 {
278     VFIOIOEventFD *ioeventfd, *tmp;
279 
280     QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
281         if (ioeventfd->dynamic) {
282             vfio_ioeventfd_exit(vdev, ioeventfd);
283         }
284     }
285 }
286 
287 static void vfio_ioeventfd_handler(void *opaque)
288 {
289     VFIOIOEventFD *ioeventfd = opaque;
290 
291     if (event_notifier_test_and_clear(&ioeventfd->e)) {
292         vfio_region_write(ioeventfd->region, ioeventfd->region_addr,
293                           ioeventfd->data, ioeventfd->size);
294         trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr),
295                                      (uint64_t)ioeventfd->addr, ioeventfd->size,
296                                      ioeventfd->data);
297     }
298 }
299 
/*
 * Create an ioeventfd matching writes of @data/@size at @addr in @mr and
 * wire it to the device write at @region/@region_addr.  Kernel-side
 * VFIO acceleration is attempted first; on failure (or when disabled via
 * no_vfio_ioeventfd) a userspace fd handler performs the write instead.
 * Returns NULL if ioeventfds are disabled for the device or the event
 * notifier cannot be created; otherwise the caller owns the result and
 * tears it down with vfio_ioeventfd_exit().
 */
static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
                                          MemoryRegion *mr, hwaddr addr,
                                          unsigned size, uint64_t data,
                                          VFIORegion *region,
                                          hwaddr region_addr, bool dynamic)
{
    VFIOIOEventFD *ioeventfd;

    if (vdev->no_kvm_ioeventfd) {
        return NULL;
    }

    /* g_malloc0 leaves ioeventfd->vfio false until the ioctl succeeds. */
    ioeventfd = g_malloc0(sizeof(*ioeventfd));

    if (event_notifier_init(&ioeventfd->e, 0)) {
        g_free(ioeventfd);
        return NULL;
    }

    /*
     * MemoryRegion and relative offset, plus additional ioeventfd setup
     * parameters for configuring and later tearing down KVM ioeventfd.
     */
    ioeventfd->mr = mr;
    ioeventfd->addr = addr;
    ioeventfd->size = size;
    ioeventfd->data = data;
    ioeventfd->dynamic = dynamic;
    /*
     * VFIORegion and relative offset for implementing the userspace
     * handler.  data & size fields shared for both uses.
     */
    ioeventfd->region = region;
    ioeventfd->region_addr = region_addr;

    if (!vdev->no_vfio_ioeventfd) {
        /* Try to have the kernel perform the device write directly. */
        struct vfio_device_ioeventfd vfio_ioeventfd;

        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
        vfio_ioeventfd.flags = ioeventfd->size;
        vfio_ioeventfd.data = ioeventfd->data;
        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
                                ioeventfd->region_addr;
        vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e);

        ioeventfd->vfio = !ioctl(vdev->vbasedev.fd,
                                 VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);
    }

    if (!ioeventfd->vfio) {
        /* Fall back to handling the eventfd in QEMU. */
        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
                            vfio_ioeventfd_handler, NULL, ioeventfd);
    }

    memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
                              true, ioeventfd->data, &ioeventfd->e);
    trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
                              size, data, ioeventfd->vfio);

    return ioeventfd;
}
361 
362 static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
363 {
364     VFIOQuirk *quirk;
365 
366     /*
367      * As long as the BAR is >= 256 bytes it will be aligned such that the
368      * lower byte is always zero.  Filter out anything else, if it exists.
369      */
370     if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
371         !vdev->bars[4].ioport || vdev->bars[4].region.size < 256) {
372         return;
373     }
374 
375     quirk = vfio_quirk_alloc(1);
376 
377     memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev,
378                           "vfio-ati-3c3-quirk", 1);
379     memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
380                                 3 /* offset 3 bytes from 0x3c0 */, quirk->mem);
381 
382     QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
383                       quirk, next);
384 
385     trace_vfio_quirk_ati_3c3_probe(vdev->vbasedev.name);
386 }
387 
388 /*
389  * Newer ATI/AMD devices, including HD5450 and HD7850, have a mirror to PCI
390  * config space through MMIO BAR2 at offset 0x4000.  Nothing seems to access
391  * the MMIO space directly, but a window to this space is provided through
392  * I/O port BAR4.  Offset 0x0 is the address register and offset 0x4 is the
393  * data register.  When the address is programmed to a range of 0x4000-0x4fff
394  * PCI configuration space is available.  Experimentation seems to indicate
395  * that read-only may be provided by hardware.
396  */
397 static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr)
398 {
399     VFIOQuirk *quirk;
400     VFIOConfigWindowQuirk *window;
401 
402     /* This windows doesn't seem to be used except by legacy VGA code */
403     if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
404         !vdev->vga || nr != 4) {
405         return;
406     }
407 
408     quirk = vfio_quirk_alloc(2);
409     window = quirk->data = g_malloc0(sizeof(*window) +
410                                      sizeof(VFIOConfigWindowMatch));
411     window->vdev = vdev;
412     window->address_offset = 0;
413     window->data_offset = 4;
414     window->nr_matches = 1;
415     window->matches[0].match = 0x4000;
416     window->matches[0].mask = vdev->config_size - 1;
417     window->bar = nr;
418     window->addr_mem = &quirk->mem[0];
419     window->data_mem = &quirk->mem[1];
420 
421     memory_region_init_io(window->addr_mem, OBJECT(vdev),
422                           &vfio_generic_window_address_quirk, window,
423                           "vfio-ati-bar4-window-address-quirk", 4);
424     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
425                                         window->address_offset,
426                                         window->addr_mem, 1);
427 
428     memory_region_init_io(window->data_mem, OBJECT(vdev),
429                           &vfio_generic_window_data_quirk, window,
430                           "vfio-ati-bar4-window-data-quirk", 4);
431     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
432                                         window->data_offset,
433                                         window->data_mem, 1);
434 
435     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
436 
437     trace_vfio_quirk_ati_bar4_probe(vdev->vbasedev.name);
438 }
439 
440 /*
441  * Trap the BAR2 MMIO mirror to config space as well.
442  */
443 static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr)
444 {
445     VFIOQuirk *quirk;
446     VFIOConfigMirrorQuirk *mirror;
447 
448     /* Only enable on newer devices where BAR2 is 64bit */
449     if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
450         !vdev->vga || nr != 2 || !vdev->bars[2].mem64) {
451         return;
452     }
453 
454     quirk = vfio_quirk_alloc(1);
455     mirror = quirk->data = g_malloc0(sizeof(*mirror));
456     mirror->mem = quirk->mem;
457     mirror->vdev = vdev;
458     mirror->offset = 0x4000;
459     mirror->bar = nr;
460 
461     memory_region_init_io(mirror->mem, OBJECT(vdev),
462                           &vfio_generic_mirror_quirk, mirror,
463                           "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE);
464     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
465                                         mirror->offset, mirror->mem, 1);
466 
467     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
468 
469     trace_vfio_quirk_ati_bar2_probe(vdev->vbasedev.name);
470 }
471 
472 /*
473  * Older ATI/AMD cards like the X550 have a similar window to that above.
474  * I/O port BAR1 provides a window to a mirror of PCI config space located
475  * in BAR2 at offset 0xf00.  We don't care to support such older cards, but
476  * note it for future reference.
477  */
478 
479 /*
480  * Nvidia has several different methods to get to config space, the
481  * nouveu project has several of these documented here:
482  * https://github.com/pathscale/envytools/tree/master/hwdocs
483  *
484  * The first quirk is actually not documented in envytools and is found
485  * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]).  This is an
486  * NV46 chipset.  The backdoor uses the legacy VGA I/O ports to access
487  * the mirror of PCI config space found at BAR0 offset 0x1800.  The access
488  * sequence first writes 0x338 to I/O port 0x3d4.  The target offset is
489  * then written to 0x3d0.  Finally 0x538 is written for a read and 0x738
490  * is written for a write to 0x3d4.  The BAR0 offset is then accessible
491  * through 0x3d0.  This quirk doesn't seem to be necessary on newer cards
492  * that use the I/O port BAR5 window but it doesn't hurt to leave it.
493  */
/* State machine for the NVIDIA 0x3d4/0x3d0 config space backdoor. */
typedef enum {NONE = 0, SELECT, WINDOW, READ, WRITE} VFIONvidia3d0State;
/* Human-readable names for tracing; indexed by VFIONvidia3d0State. */
static const char *nv3d0_states[] = { "NONE", "SELECT",
                                      "WINDOW", "READ", "WRITE" };

/* Per-device tracking data for the 3d4/3d0 backdoor quirk. */
typedef struct VFIONvidia3d0Quirk {
    VFIOPCIDevice *vdev;       /* owning device */
    VFIONvidia3d0State state;  /* current backdoor state */
    uint32_t offset;           /* BAR0 offset latched in WINDOW state */
} VFIONvidia3d0Quirk;
503 
504 static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque,
505                                            hwaddr addr, unsigned size)
506 {
507     VFIONvidia3d0Quirk *quirk = opaque;
508     VFIOPCIDevice *vdev = quirk->vdev;
509 
510     quirk->state = NONE;
511 
512     return vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
513                          addr + 0x14, size);
514 }
515 
516 static void vfio_nvidia_3d4_quirk_write(void *opaque, hwaddr addr,
517                                         uint64_t data, unsigned size)
518 {
519     VFIONvidia3d0Quirk *quirk = opaque;
520     VFIOPCIDevice *vdev = quirk->vdev;
521     VFIONvidia3d0State old_state = quirk->state;
522 
523     quirk->state = NONE;
524 
525     switch (data) {
526     case 0x338:
527         if (old_state == NONE) {
528             quirk->state = SELECT;
529             trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
530                                               nv3d0_states[quirk->state]);
531         }
532         break;
533     case 0x538:
534         if (old_state == WINDOW) {
535             quirk->state = READ;
536             trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
537                                               nv3d0_states[quirk->state]);
538         }
539         break;
540     case 0x738:
541         if (old_state == WINDOW) {
542             quirk->state = WRITE;
543             trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
544                                               nv3d0_states[quirk->state]);
545         }
546         break;
547     }
548 
549     vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
550                    addr + 0x14, data, size);
551 }
552 
/* MemoryRegionOps for the NVIDIA 0x3d4 backdoor command register. */
static const MemoryRegionOps vfio_nvidia_3d4_quirk = {
    .read = vfio_nvidia_3d4_quirk_read,
    .write = vfio_nvidia_3d4_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
558 
559 static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
560                                            hwaddr addr, unsigned size)
561 {
562     VFIONvidia3d0Quirk *quirk = opaque;
563     VFIOPCIDevice *vdev = quirk->vdev;
564     VFIONvidia3d0State old_state = quirk->state;
565     uint64_t data = vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
566                                   addr + 0x10, size);
567 
568     quirk->state = NONE;
569 
570     if (old_state == READ &&
571         (quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
572         uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);
573 
574         data = vfio_pci_read_config(&vdev->pdev, offset, size);
575         trace_vfio_quirk_nvidia_3d0_read(vdev->vbasedev.name,
576                                          offset, size, data);
577     }
578 
579     return data;
580 }
581 
582 static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
583                                         uint64_t data, unsigned size)
584 {
585     VFIONvidia3d0Quirk *quirk = opaque;
586     VFIOPCIDevice *vdev = quirk->vdev;
587     VFIONvidia3d0State old_state = quirk->state;
588 
589     quirk->state = NONE;
590 
591     if (old_state == SELECT) {
592         quirk->offset = (uint32_t)data;
593         quirk->state = WINDOW;
594         trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
595                                           nv3d0_states[quirk->state]);
596     } else if (old_state == WRITE) {
597         if ((quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
598             uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);
599 
600             vfio_pci_write_config(&vdev->pdev, offset, data, size);
601             trace_vfio_quirk_nvidia_3d0_write(vdev->vbasedev.name,
602                                               offset, data, size);
603             return;
604         }
605     }
606 
607     vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
608                    addr + 0x10, data, size);
609 }
610 
/* MemoryRegionOps for the NVIDIA 0x3d0 backdoor offset/data register. */
static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
    .read = vfio_nvidia_3d0_quirk_read,
    .write = vfio_nvidia_3d0_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
616 
617 static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
618 {
619     VFIOQuirk *quirk;
620     VFIONvidia3d0Quirk *data;
621 
622     if (vdev->no_geforce_quirks ||
623         !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
624         !vdev->bars[1].region.size) {
625         return;
626     }
627 
628     quirk = vfio_quirk_alloc(2);
629     quirk->data = data = g_malloc0(sizeof(*data));
630     data->vdev = vdev;
631 
632     memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk,
633                           data, "vfio-nvidia-3d4-quirk", 2);
634     memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
635                                 0x14 /* 0x3c0 + 0x14 */, &quirk->mem[0]);
636 
637     memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_nvidia_3d0_quirk,
638                           data, "vfio-nvidia-3d0-quirk", 2);
639     memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
640                                 0x10 /* 0x3c0 + 0x10 */, &quirk->mem[1]);
641 
642     QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
643                       quirk, next);
644 
645     trace_vfio_quirk_nvidia_3d0_probe(vdev->vbasedev.name);
646 }
647 
648 /*
649  * The second quirk is documented in envytools.  The I/O port BAR5 is just
650  * a set of address/data ports to the MMIO BARs.  The BAR we care about is
651  * again BAR0.  This backdoor is apparently a bit newer than the one above
652  * so we need to not only trap 256 bytes @0x1800, but all of PCI config
653  * space, including extended space is available at the 4k @0x88000.
654  */
/* Tracking data for the NVIDIA BAR5 address/data window quirk. */
typedef struct VFIONvidiaBAR5Quirk {
    uint32_t master;          /* last value written at BAR5 offset 0x0 */
    uint32_t enable;          /* last value written at BAR5 offset 0x4 */
    MemoryRegion *addr_mem;   /* address register trap (quirk->mem[0]) */
    MemoryRegion *data_mem;   /* data register trap (quirk->mem[1]) */
    bool enabled;             /* whether the window traps are active */
    VFIOConfigWindowQuirk window; /* last for match data */
} VFIONvidiaBAR5Quirk;
663 
664 static void vfio_nvidia_bar5_enable(VFIONvidiaBAR5Quirk *bar5)
665 {
666     VFIOPCIDevice *vdev = bar5->window.vdev;
667 
668     if (((bar5->master & bar5->enable) & 0x1) == bar5->enabled) {
669         return;
670     }
671 
672     bar5->enabled = !bar5->enabled;
673     trace_vfio_quirk_nvidia_bar5_state(vdev->vbasedev.name,
674                                        bar5->enabled ?  "Enable" : "Disable");
675     memory_region_set_enabled(bar5->addr_mem, bar5->enabled);
676     memory_region_set_enabled(bar5->data_mem, bar5->enabled);
677 }
678 
679 static uint64_t vfio_nvidia_bar5_quirk_master_read(void *opaque,
680                                                    hwaddr addr, unsigned size)
681 {
682     VFIONvidiaBAR5Quirk *bar5 = opaque;
683     VFIOPCIDevice *vdev = bar5->window.vdev;
684 
685     return vfio_region_read(&vdev->bars[5].region, addr, size);
686 }
687 
688 static void vfio_nvidia_bar5_quirk_master_write(void *opaque, hwaddr addr,
689                                                 uint64_t data, unsigned size)
690 {
691     VFIONvidiaBAR5Quirk *bar5 = opaque;
692     VFIOPCIDevice *vdev = bar5->window.vdev;
693 
694     vfio_region_write(&vdev->bars[5].region, addr, data, size);
695 
696     bar5->master = data;
697     vfio_nvidia_bar5_enable(bar5);
698 }
699 
/* MemoryRegionOps for the BAR5 master register trap (offset 0x0). */
static const MemoryRegionOps vfio_nvidia_bar5_quirk_master = {
    .read = vfio_nvidia_bar5_quirk_master_read,
    .write = vfio_nvidia_bar5_quirk_master_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
705 
706 static uint64_t vfio_nvidia_bar5_quirk_enable_read(void *opaque,
707                                                    hwaddr addr, unsigned size)
708 {
709     VFIONvidiaBAR5Quirk *bar5 = opaque;
710     VFIOPCIDevice *vdev = bar5->window.vdev;
711 
712     return vfio_region_read(&vdev->bars[5].region, addr + 4, size);
713 }
714 
715 static void vfio_nvidia_bar5_quirk_enable_write(void *opaque, hwaddr addr,
716                                                 uint64_t data, unsigned size)
717 {
718     VFIONvidiaBAR5Quirk *bar5 = opaque;
719     VFIOPCIDevice *vdev = bar5->window.vdev;
720 
721     vfio_region_write(&vdev->bars[5].region, addr + 4, data, size);
722 
723     bar5->enable = data;
724     vfio_nvidia_bar5_enable(bar5);
725 }
726 
/* MemoryRegionOps for the BAR5 enable register trap (offset 0x4). */
static const MemoryRegionOps vfio_nvidia_bar5_quirk_enable = {
    .read = vfio_nvidia_bar5_quirk_enable_read,
    .write = vfio_nvidia_bar5_quirk_enable_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
732 
/*
 * Install the NVIDIA BAR5 window quirk: address register at 0x8, data
 * register at 0xc, matching both the 0x1800 (legacy config space) and
 * 0x88000 (extended config space) windows into BAR0, plus master/enable
 * register traps at 0x0/0x4 that gate the window traps.  The window
 * regions start disabled and are toggled by vfio_nvidia_bar5_enable().
 */
static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIONvidiaBAR5Quirk *bar5;
    VFIOConfigWindowQuirk *window;

    if (vdev->no_geforce_quirks ||
        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
        !vdev->vga || nr != 5 || !vdev->bars[5].ioport) {
        return;
    }

    /* Two window regions plus the master and enable register traps. */
    quirk = vfio_quirk_alloc(4);
    bar5 = quirk->data = g_malloc0(sizeof(*bar5) +
                                   (sizeof(VFIOConfigWindowMatch) * 2));
    window = &bar5->window;

    window->vdev = vdev;
    window->address_offset = 0x8;
    window->data_offset = 0xc;
    window->nr_matches = 2;
    window->matches[0].match = 0x1800;
    window->matches[0].mask = PCI_CONFIG_SPACE_SIZE - 1;
    window->matches[1].match = 0x88000;
    window->matches[1].mask = vdev->config_size - 1;
    window->bar = nr;
    window->addr_mem = bar5->addr_mem = &quirk->mem[0];
    window->data_mem = bar5->data_mem = &quirk->mem[1];

    memory_region_init_io(window->addr_mem, OBJECT(vdev),
                          &vfio_generic_window_address_quirk, window,
                          "vfio-nvidia-bar5-window-address-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        window->address_offset,
                                        window->addr_mem, 1);
    memory_region_set_enabled(window->addr_mem, false);

    memory_region_init_io(window->data_mem, OBJECT(vdev),
                          &vfio_generic_window_data_quirk, window,
                          "vfio-nvidia-bar5-window-data-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        window->data_offset,
                                        window->data_mem, 1);
    memory_region_set_enabled(window->data_mem, false);

    memory_region_init_io(&quirk->mem[2], OBJECT(vdev),
                          &vfio_nvidia_bar5_quirk_master, bar5,
                          "vfio-nvidia-bar5-master-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        0, &quirk->mem[2], 1);

    memory_region_init_io(&quirk->mem[3], OBJECT(vdev),
                          &vfio_nvidia_bar5_quirk_enable, bar5,
                          "vfio-nvidia-bar5-enable-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        4, &quirk->mem[3], 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name);
}
794 
/*
 * Tracking state for detecting repeated identical writes through the
 * NVIDIA BAR0 config mirror, used to decide when to install a dynamic
 * ioeventfd (see the comment in the mirror write handler below).
 */
typedef struct LastDataSet {
    VFIOQuirk *quirk;  /* quirk owning the dynamically added ioeventfds */
    hwaddr addr;       /* address of the most recent write */
    uint64_t data;     /* data of the most recent write */
    unsigned size;     /* size of the most recent write */
    int hits;          /* consecutive identical writes seen so far */
    int added;         /* dynamic ioeventfds installed so far */
} LastDataSet;

/* Cap on dynamically added ioeventfds per quirk. */
#define MAX_DYN_IOEVENTFD 10
/* Identical writes required before an ioeventfd is added. */
#define HITS_FOR_IOEVENTFD 10
806 
807 /*
808  * Finally, BAR0 itself.  We want to redirect any accesses to either
809  * 0x1800 or 0x88000 through the PCI config space access functions.
810  */
/*
 * Write handler for the NVIDIA BAR0 config space mirror.  Emulates the
 * write through the mirror, passes MSI-ACK writes through to hardware,
 * and opportunistically installs ioeventfds for hot repeated writes.
 */
static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr,
                                           uint64_t data, unsigned size)
{
    VFIOConfigMirrorQuirk *mirror = opaque;
    VFIOPCIDevice *vdev = mirror->vdev;
    PCIDevice *pdev = &vdev->pdev;
    /* Write-tracking state lives in the flexible tail of the mirror quirk */
    LastDataSet *last = (LastDataSet *)&mirror->data;

    /* Route the access through the emulated config space first */
    vfio_generic_quirk_mirror_write(opaque, addr, data, size);

    /*
     * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
     * MSI capability ID register.  Both the ID and next register are
     * read-only, so we allow writes covering either of those to real hw.
     */
    if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
        vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
        vfio_region_write(&vdev->bars[mirror->bar].region,
                          addr + mirror->offset, data, size);
        trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name);
    }

    /*
     * Automatically add an ioeventfd to handle any repeated write with the
     * same data and size above the standard PCI config space header.  This is
     * primarily expected to accelerate the MSI-ACK behavior, such as noted
     * above.  Current hardware/drivers should trigger an ioeventfd at config
     * offset 0x704 (region offset 0x88704), with data 0x0, size 4.
     *
     * The criteria of 10 successive hits is arbitrary but reliably adds the
     * MSI-ACK region.  Note that as some writes are bypassed via the ioeventfd,
     * the remaining ones have a greater chance of being seen successively.
     * To avoid the pathological case of burning up all of QEMU's open file
     * handles, arbitrarily limit this algorithm from adding no more than 10
     * ioeventfds, print an error if we would have added an 11th, and then
     * stop counting.
     */
    if (!vdev->no_kvm_ioeventfd &&
        addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) {
        if (addr != last->addr || data != last->data || size != last->size) {
            /* Different write; restart the streak from this one */
            last->addr = addr;
            last->data = data;
            last->size = size;
            last->hits = 1;
        } else if (++last->hits >= HITS_FOR_IOEVENTFD) {
            if (last->added < MAX_DYN_IOEVENTFD) {
                VFIOIOEventFD *ioeventfd;
                ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size,
                                        data, &vdev->bars[mirror->bar].region,
                                        mirror->offset + addr, true);
                if (ioeventfd) {
                    VFIOQuirk *quirk = last->quirk;

                    /* Track it on the quirk so reset/exit can tear it down */
                    QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next);
                    last->added++;
                }
            } else {
                /* Bump past MAX so the outer test disables this path */
                last->added++;
                warn_report("NVIDIA ioeventfd queue full for %s, unable to "
                            "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", "
                            "size %u", vdev->vbasedev.name, addr, data, size);
            }
        }
    }
}
876 
/* BAR0 mirror ops: generic mirrored reads, writes add MSI-ACK/ioeventfd logic */
static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
    .read = vfio_generic_quirk_mirror_read,
    .write = vfio_nvidia_quirk_mirror_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
882 
883 static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
884 {
885     VFIOConfigMirrorQuirk *mirror = quirk->data;
886     LastDataSet *last = (LastDataSet *)&mirror->data;
887 
888     last->addr = last->data = last->size = last->hits = last->added = 0;
889 
890     vfio_drop_dynamic_eventfds(vdev, quirk);
891 }
892 
893 static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
894 {
895     VFIOQuirk *quirk;
896     VFIOConfigMirrorQuirk *mirror;
897     LastDataSet *last;
898 
899     if (vdev->no_geforce_quirks ||
900         !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
901         !vfio_is_vga(vdev) || nr != 0) {
902         return;
903     }
904 
905     quirk = vfio_quirk_alloc(1);
906     quirk->reset = vfio_nvidia_bar0_quirk_reset;
907     mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
908     mirror->mem = quirk->mem;
909     mirror->vdev = vdev;
910     mirror->offset = 0x88000;
911     mirror->bar = nr;
912     last = (LastDataSet *)&mirror->data;
913     last->quirk = quirk;
914 
915     memory_region_init_io(mirror->mem, OBJECT(vdev),
916                           &vfio_nvidia_mirror_quirk, mirror,
917                           "vfio-nvidia-bar0-88000-mirror-quirk",
918                           vdev->config_size);
919     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
920                                         mirror->offset, mirror->mem, 1);
921 
922     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
923 
924     /* The 0x1800 offset mirror only seems to get used by legacy VGA */
925     if (vdev->vga) {
926         quirk = vfio_quirk_alloc(1);
927         quirk->reset = vfio_nvidia_bar0_quirk_reset;
928         mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
929         mirror->mem = quirk->mem;
930         mirror->vdev = vdev;
931         mirror->offset = 0x1800;
932         mirror->bar = nr;
933         last = (LastDataSet *)&mirror->data;
934         last->quirk = quirk;
935 
936         memory_region_init_io(mirror->mem, OBJECT(vdev),
937                               &vfio_nvidia_mirror_quirk, mirror,
938                               "vfio-nvidia-bar0-1800-mirror-quirk",
939                               PCI_CONFIG_SPACE_SIZE);
940         memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
941                                             mirror->offset, mirror->mem, 1);
942 
943         QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
944     }
945 
946     trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name);
947 }
948 
949 /*
950  * TODO - Some Nvidia devices provide config access to their companion HDA
951  * device and even to their parent bridge via these config space mirrors.
952  * Add quirks for those regions.
953  */
954 
955 #define PCI_VENDOR_ID_REALTEK 0x10ec
956 
957 /*
958  * RTL8168 devices have a backdoor that can access the MSI-X table.  At BAR2
959  * offset 0x70 there is a dword data register, offset 0x74 is a dword address
960  * register.  According to the Linux r8169 driver, the MSI-X table is addressed
961  * when the "type" portion of the address register is set to 0x1.  This appears
962  * to be bits 16:30.  Bit 31 is both a write indicator and some sort of
963  * "address latched" indicator.  Bits 12:15 are a mask field, which we can
964  * ignore because the MSI-X table should always be accessed as a dword (full
965  * mask).  Bits 0:11 is offset within the type.
966  *
967  * Example trace:
968  *
969  * Read from MSI-X table offset 0
970  * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
971  * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
972  * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
973  *
974  * Write 0xfee00000 to MSI-X table offset 0
975  * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
976  * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
977  * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
978  */
/* State for the RTL8168 MSI-X table backdoor emulation (see comment above) */
typedef struct VFIOrtl8168Quirk {
    VFIOPCIDevice *vdev;  /* back-pointer to the owning device */
    uint32_t addr;        /* last address register value latched by the guest */
    uint32_t data;        /* last data register value written by the guest */
    bool enabled;         /* true while the latched address targets MSI-X */
} VFIOrtl8168Quirk;
985 
986 static uint64_t vfio_rtl8168_quirk_address_read(void *opaque,
987                                                 hwaddr addr, unsigned size)
988 {
989     VFIOrtl8168Quirk *rtl = opaque;
990     VFIOPCIDevice *vdev = rtl->vdev;
991     uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size);
992 
993     if (rtl->enabled) {
994         data = rtl->addr ^ 0x80000000U; /* latch/complete */
995         trace_vfio_quirk_rtl8168_fake_latch(vdev->vbasedev.name, data);
996     }
997 
998     return data;
999 }
1000 
/*
 * Write of the RTL8168 window address register (BAR2 + 0x74).  Accesses
 * whose "type" field selects the MSI-X table (0x1) are intercepted and
 * redirected to the guest's emulated MSI-X table; everything else passes
 * through to hardware.
 */
static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr,
                                             uint64_t data, unsigned size)
{
    VFIOrtl8168Quirk *rtl = opaque;
    VFIOPCIDevice *vdev = rtl->vdev;

    rtl->enabled = false;

    if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */
        rtl->enabled = true;
        rtl->addr = (uint32_t)data;

        if (data & 0x80000000U) { /* Do write */
            if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
                hwaddr offset = data & 0xfff; /* offset within MSI-X table */
                uint64_t val = rtl->data;     /* value latched via 0x70 */

                trace_vfio_quirk_rtl8168_msix_write(vdev->vbasedev.name,
                                                    (uint16_t)offset, val);

                /* Write to the proper guest MSI-X table instead */
                memory_region_dispatch_write(&vdev->pdev.msix_table_mmio,
                                             offset, val,
                                             size_memop(size) | MO_LE,
                                             MEMTXATTRS_UNSPECIFIED);
            }
            return; /* Do not write guest MSI-X data to hardware */
        }
    }

    vfio_region_write(&vdev->bars[2].region, addr + 0x74, data, size);
}
1033 
/* Ops for the RTL8168 address register window; dword accesses only */
static const MemoryRegionOps vfio_rtl_address_quirk = {
    .read = vfio_rtl8168_quirk_address_read,
    .write = vfio_rtl8168_quirk_address_write,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
        .unaligned = false,
    },
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1044 
/*
 * Read of the RTL8168 window data register (BAR2 + 0x70).  While an MSI-X
 * access is latched, return data from the guest's emulated MSI-X table
 * rather than the hardware value.
 */
static uint64_t vfio_rtl8168_quirk_data_read(void *opaque,
                                             hwaddr addr, unsigned size)
{
    VFIOrtl8168Quirk *rtl = opaque;
    VFIOPCIDevice *vdev = rtl->vdev;
    /* Hardware read is always performed to preserve device side effects */
    uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x70, size);

    if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
        hwaddr offset = rtl->addr & 0xfff; /* offset within MSI-X table */
        memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, offset,
                                    &data, size_memop(size) | MO_LE,
                                    MEMTXATTRS_UNSPECIFIED);
        trace_vfio_quirk_rtl8168_msix_read(vdev->vbasedev.name, offset, data);
    }

    return data;
}
1062 
1063 static void vfio_rtl8168_quirk_data_write(void *opaque, hwaddr addr,
1064                                           uint64_t data, unsigned size)
1065 {
1066     VFIOrtl8168Quirk *rtl = opaque;
1067     VFIOPCIDevice *vdev = rtl->vdev;
1068 
1069     rtl->data = (uint32_t)data;
1070 
1071     vfio_region_write(&vdev->bars[2].region, addr + 0x70, data, size);
1072 }
1073 
/* Ops for the RTL8168 data register window; dword accesses only */
static const MemoryRegionOps vfio_rtl_data_quirk = {
    .read = vfio_rtl8168_quirk_data_read,
    .write = vfio_rtl8168_quirk_data_write,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
        .unaligned = false,
    },
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1084 
1085 static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr)
1086 {
1087     VFIOQuirk *quirk;
1088     VFIOrtl8168Quirk *rtl;
1089 
1090     if (!vfio_pci_is(vdev, PCI_VENDOR_ID_REALTEK, 0x8168) || nr != 2) {
1091         return;
1092     }
1093 
1094     quirk = vfio_quirk_alloc(2);
1095     quirk->data = rtl = g_malloc0(sizeof(*rtl));
1096     rtl->vdev = vdev;
1097 
1098     memory_region_init_io(&quirk->mem[0], OBJECT(vdev),
1099                           &vfio_rtl_address_quirk, rtl,
1100                           "vfio-rtl8168-window-address-quirk", 4);
1101     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1102                                         0x74, &quirk->mem[0], 1);
1103 
1104     memory_region_init_io(&quirk->mem[1], OBJECT(vdev),
1105                           &vfio_rtl_data_quirk, rtl,
1106                           "vfio-rtl8168-window-data-quirk", 4);
1107     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1108                                         0x70, &quirk->mem[1], 1);
1109 
1110     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1111 
1112     trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name);
1113 }
1114 
1115 #define IGD_ASLS 0xfc /* ASL Storage Register */
1116 
1117 /*
1118  * The OpRegion includes the Video BIOS Table, which seems important for
1119  * telling the driver what sort of outputs it has.  Without this, the device
1120  * may work in the guest, but we may not get output.  This also requires BIOS
1121  * support to reserve and populate a section of guest memory sufficient for
1122  * the table and to write the base address of that memory to the ASLS register
1123  * of the IGD device.
1124  */
/*
 * Copy the host IGD OpRegion into guest-visible fw_cfg and virtualize the
 * ASLS register so firmware can point the device at its reserved copy.
 * Returns true on success; on failure sets @errp and frees the buffer.
 */
bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
                                struct vfio_region_info *info, Error **errp)
{
    int ret;

    vdev->igd_opregion = g_malloc0(info->size);
    /* Read the whole OpRegion from the device-specific vfio region */
    ret = pread(vdev->vbasedev.fd, vdev->igd_opregion,
                info->size, info->offset);
    if (ret != info->size) {
        /* NB. short reads are treated the same as errors here */
        error_setg(errp, "failed to read IGD OpRegion");
        g_free(vdev->igd_opregion);
        vdev->igd_opregion = NULL;
        return false;
    }

    /*
     * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to
     * allocate 32bit reserved memory for, copy these contents into, and write
     * the reserved memory base address to the device ASLS register at 0xFC.
     * Alignment of this reserved region seems flexible, but using a 4k page
     * alignment seems to work well.  This interface assumes a single IGD
     * device, which may be at VM address 00:02.0 in legacy mode or another
     * address in UPT mode.
     *
     * NB, there may be future use cases discovered where the VM should have
     * direct interaction with the host OpRegion, in which case the write to
     * the ASLS register would trigger MemoryRegion setup to enable that.
     */
    fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion",
                    vdev->igd_opregion, info->size);

    trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name);

    /* Fully virtualize ASLS: start at 0, writable, never read from hardware */
    pci_set_long(vdev->pdev.config + IGD_ASLS, 0);
    pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0);
    pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0);

    return true;
}
1164 
1165 /*
1166  * Common quirk probe entry points.
1167  */
/* Probe all VGA-range quirks applicable to this device */
void vfio_vga_quirk_setup(VFIOPCIDevice *vdev)
{
    vfio_vga_probe_ati_3c3_quirk(vdev);
    vfio_vga_probe_nvidia_3d0_quirk(vdev);
}
1173 
1174 void vfio_vga_quirk_exit(VFIOPCIDevice *vdev)
1175 {
1176     VFIOQuirk *quirk;
1177     int i, j;
1178 
1179     for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1180         QLIST_FOREACH(quirk, &vdev->vga->region[i].quirks, next) {
1181             for (j = 0; j < quirk->nr_mem; j++) {
1182                 memory_region_del_subregion(&vdev->vga->region[i].mem,
1183                                             &quirk->mem[j]);
1184             }
1185         }
1186     }
1187 }
1188 
1189 void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev)
1190 {
1191     int i, j;
1192 
1193     for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1194         while (!QLIST_EMPTY(&vdev->vga->region[i].quirks)) {
1195             VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga->region[i].quirks);
1196             QLIST_REMOVE(quirk, next);
1197             for (j = 0; j < quirk->nr_mem; j++) {
1198                 object_unparent(OBJECT(&quirk->mem[j]));
1199             }
1200             g_free(quirk->mem);
1201             g_free(quirk->data);
1202             g_free(quirk);
1203         }
1204     }
1205 }
1206 
/*
 * Probe all BAR quirks for BAR @nr; each probe checks its own device
 * match and BAR number, so all are called unconditionally.
 */
void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
{
    vfio_probe_ati_bar4_quirk(vdev, nr);
    vfio_probe_ati_bar2_quirk(vdev, nr);
    vfio_probe_nvidia_bar5_quirk(vdev, nr);
    vfio_probe_nvidia_bar0_quirk(vdev, nr);
    vfio_probe_rtl8168_bar2_quirk(vdev, nr);
#ifdef CONFIG_VFIO_IGD
    vfio_probe_igd_bar0_quirk(vdev, nr);
    vfio_probe_igd_bar4_quirk(vdev, nr);
#endif
}
1219 
1220 void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
1221 {
1222     VFIOBAR *bar = &vdev->bars[nr];
1223     VFIOQuirk *quirk;
1224     int i;
1225 
1226     QLIST_FOREACH(quirk, &bar->quirks, next) {
1227         while (!QLIST_EMPTY(&quirk->ioeventfds)) {
1228             vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds));
1229         }
1230 
1231         for (i = 0; i < quirk->nr_mem; i++) {
1232             memory_region_del_subregion(bar->region.mem, &quirk->mem[i]);
1233         }
1234     }
1235 }
1236 
1237 void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr)
1238 {
1239     VFIOBAR *bar = &vdev->bars[nr];
1240     int i;
1241 
1242     while (!QLIST_EMPTY(&bar->quirks)) {
1243         VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
1244         QLIST_REMOVE(quirk, next);
1245         for (i = 0; i < quirk->nr_mem; i++) {
1246             object_unparent(OBJECT(&quirk->mem[i]));
1247         }
1248         g_free(quirk->mem);
1249         g_free(quirk->data);
1250         g_free(quirk);
1251     }
1252 }
1253 
1254 /*
1255  * Reset quirks
1256  */
1257 void vfio_quirk_reset(VFIOPCIDevice *vdev)
1258 {
1259     int i;
1260 
1261     for (i = 0; i < PCI_ROM_SLOT; i++) {
1262         VFIOQuirk *quirk;
1263         VFIOBAR *bar = &vdev->bars[i];
1264 
1265         QLIST_FOREACH(quirk, &bar->quirks, next) {
1266             if (quirk->reset) {
1267                 quirk->reset(vdev, quirk);
1268             }
1269         }
1270     }
1271 }
1272 
1273 /*
1274  * AMD Radeon PCI config reset, based on Linux:
1275  *   drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running()
1276  *   drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset
1277  *   drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc()
1278  *   drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock()
1279  * IDs: include/drm/drm_pciids.h
1280  * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0
1281  *
1282  * Bonaire and Hawaii GPUs do not respond to a bus reset.  This is a bug in the
1283  * hardware that should be fixed on future ASICs.  The symptom of this is that
1284  * once the accerlated driver loads, Windows guests will bsod on subsequent
1285  * attmpts to load the driver, such as after VM reset or shutdown/restart.  To
1286  * work around this, we do an AMD specific PCI config reset, followed by an SMC
1287  * reset.  The PCI config reset only works if SMC firmware is running, so we
1288  * have a dependency on the state of the device as to whether this reset will
1289  * be effective.  There are still cases where we won't be able to kick the
1290  * device into working, but this greatly improves the usability overall.  The
1291  * config reset magic is relatively common on AMD GPUs, but the setup and SMC
1292  * poking is largely ASIC specific.
1293  */
/*
 * Check whether the SMC firmware is running, mirroring the Linux radeon
 * driver's ci_is_smc_running(): the SMC clock must be enabled (bit 0 of
 * the clock register clear) and the SMC program counter must have advanced
 * past its initial range.
 */
static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev)
{
    uint32_t clk, pc_c;

    /*
     * Registers 200h and 204h are index and data registers for accessing
     * indirect configuration registers within the device.
     */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
    clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4);
    pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4);

    return (!(clk & 1) && (0x20100 <= pc_c));
}
1309 
1310 /*
1311  * The scope of a config reset is controlled by a mode bit in the misc register
1312  * and a fuse, exposed as a bit in another register.  The fuse is the default
1313  * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the formula
1314  * scope = !(misc ^ fuse), where the resulting scope is defined the same as
1315  * the fuse.  A truth table therefore tells us that if misc == fuse, we need
1316  * to flip the value of the bit in the misc register.
1317  */
/*
 * Ensure the upcoming config reset is scoped to the GFX block only, per
 * the misc/fuse truth table described above: flip the misc toggle bit
 * whenever it equals the fuse bit.
 */
static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev)
{
    uint32_t misc, fuse;
    bool a, b;

    /* Read the fuse bit (reset-scope default) via indirect register c00c0000h */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4);
    fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    b = fuse & 64;

    /* Read the misc toggle bit via indirect register c0000010h */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4);
    misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    a = misc & 2;

    if (a == b) {
        /* misc == fuse would give whole-GPU scope; flip the toggle */
        vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4);
        vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */
    }
}
1336 
/*
 * Device-specific reset for AMD Bonaire/Hawaii GPUs (see comment above):
 * AMD PCI config reset followed by an SMC reset.  Returns 0 on success,
 * -ENODEV if a kernel reset exists, -EINVAL if the SMC is not running.
 */
static int vfio_radeon_reset(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    int i, ret = 0;
    uint32_t data;

    /* Defer to a kernel implemented reset */
    if (vdev->vbasedev.reset_works) {
        trace_vfio_quirk_ati_bonaire_reset_skipped(vdev->vbasedev.name);
        return -ENODEV;
    }

    /* Enable only memory BAR access */
    vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2);

    /* Reset only works if SMC firmware is loaded and running */
    if (!vfio_radeon_smc_is_running(vdev)) {
        ret = -EINVAL;
        trace_vfio_quirk_ati_bonaire_reset_no_smc(vdev->vbasedev.name);
        goto out;
    }

    /* Make sure only the GFX function is reset */
    vfio_radeon_set_gfx_only_reset(vdev);

    /* AMD PCI config reset */
    vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4);
    usleep(100);

    /* Read back the memory size to make sure we're out of reset */
    for (i = 0; i < 100000; i++) {
        if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) {
            goto reset_smc;
        }
        usleep(1);
    }

    /* Timed out, but still attempt the SMC reset below */
    trace_vfio_quirk_ati_bonaire_reset_timeout(vdev->vbasedev.name);

reset_smc:
    /* Reset SMC: set bit 0 of indirect register 80000000h */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4);
    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    data |= 1;
    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);

    /* Disable SMC clock: set bit 0 of indirect register 80000004h */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    data |= 1;
    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);

    trace_vfio_quirk_ati_bonaire_reset_done(vdev->vbasedev.name);

out:
    /* Restore PCI command register */
    vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2);

    return ret;
}
1397 
1398 void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev)
1399 {
1400     switch (vdev->vendor_id) {
1401     case 0x1002:
1402         switch (vdev->device_id) {
1403         /* Bonaire */
1404         case 0x6649: /* Bonaire [FirePro W5100] */
1405         case 0x6650:
1406         case 0x6651:
1407         case 0x6658: /* Bonaire XTX [Radeon R7 260X] */
1408         case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */
1409         case 0x665d: /* Bonaire [Radeon R7 200 Series] */
1410         /* Hawaii */
1411         case 0x67A0: /* Hawaii XT GL [FirePro W9100] */
1412         case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */
1413         case 0x67A2:
1414         case 0x67A8:
1415         case 0x67A9:
1416         case 0x67AA:
1417         case 0x67B0: /* Hawaii XT [Radeon R9 290X] */
1418         case 0x67B1: /* Hawaii PRO [Radeon R9 290] */
1419         case 0x67B8:
1420         case 0x67B9:
1421         case 0x67BA:
1422         case 0x67BE:
1423             vdev->resetfn = vfio_radeon_reset;
1424             trace_vfio_quirk_ati_bonaire_reset(vdev->vbasedev.name);
1425             break;
1426         }
1427         break;
1428     }
1429 }
1430 
1431 /*
1432  * The NVIDIA GPUDirect P2P Vendor capability allows the user to specify
1433  * devices as a member of a clique.  Devices within the same clique ID
1434  * are capable of direct P2P.  It's the user's responsibility that this
1435  * is correct.  The spec says that this may reside at any unused config
1436  * offset, but reserves and recommends hypervisors place this at C8h.
1437  * The spec also states that the hypervisor should place this capability
1438  * at the end of the capability list, thus next is defined as 0h.
1439  *
1440  * +----------------+----------------+----------------+----------------+
1441  * | sig 7:0 ('P')  |  vndr len (8h) |    next (0h)   |   cap id (9h)  |
1442  * +----------------+----------------+----------------+----------------+
1443  * | rsvd 15:7(0h),id 6:3,ver 2:0(0h)|          sig 23:8 ('P2')        |
1444  * +---------------------------------+---------------------------------+
1445  *
1446  * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf
1447  *
 * Specification for Turing and later GPU architectures:
 * https://lists.gnu.org/archive/html/qemu-devel/2023-06/pdf142OR4O4c2.pdf
1450  */
/* Property getter: expose the stored clique ID as a uint8 */
static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v,
                                       const char *name, void *opaque,
                                       Error **errp)
{
    const Property *prop = opaque;
    uint8_t *ptr = object_field_prop_ptr(obj, prop);

    visit_type_uint8(v, name, ptr, errp);
}
1460 
1461 static void set_nv_gpudirect_clique_id(Object *obj, Visitor *v,
1462                                        const char *name, void *opaque,
1463                                        Error **errp)
1464 {
1465     const Property *prop = opaque;
1466     uint8_t value, *ptr = object_field_prop_ptr(obj, prop);
1467 
1468     if (!visit_type_uint8(v, name, &value, errp)) {
1469         return;
1470     }
1471 
1472     if (value & ~0xF) {
1473         error_setg(errp, "Property %s: valid range 0-15", name);
1474         return;
1475     }
1476 
1477     *ptr = value;
1478 }
1479 
/* qdev property type for the 4-bit GPUDirect clique ID */
const PropertyInfo qdev_prop_nv_gpudirect_clique = {
    .name = "uint4",
    .description = "NVIDIA GPUDirect Clique ID (0 - 15)",
    .get = get_nv_gpudirect_clique_id,
    .set = set_nv_gpudirect_clique_id,
};
1486 
1487 static bool is_valid_std_cap_offset(uint8_t pos)
1488 {
1489     return (pos >= PCI_STD_HEADER_SIZEOF &&
1490             pos <= (PCI_CFG_SPACE_SIZE - PCI_CAP_SIZEOF));
1491 }
1492 
/*
 * Add the NVIDIA GPUDirect P2P vendor capability described above to an
 * NVIDIA display-class device, choosing offset C8h or D4h based on which
 * one the physical capability chain leaves free.  A clique value of 0xFF
 * (the default) means the property was not set and nothing is added.
 * Returns true on success or no-op; sets @errp and returns false otherwise.
 */
static bool vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
{
    ERRP_GUARD();
    PCIDevice *pdev = &vdev->pdev;
    int ret, pos;
    bool c8_conflict = false, d4_conflict = false;
    uint8_t tmp;

    if (vdev->nv_gpudirect_clique == 0xFF) {
        return true;
    }

    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) {
        error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid device vendor");
        return false;
    }

    /* Base class is the upper byte of the 16-bit class/subclass field */
    if (pci_get_byte(pdev->config + PCI_CLASS_DEVICE + 1) !=
        PCI_BASE_CLASS_DISPLAY) {
        error_setg(errp, "NVIDIA GPUDirect Clique ID: unsupported PCI class");
        return false;
    }

    /*
     * Per the updated specification above, it's recommended to use offset
     * D4h for Turing and later GPU architectures due to a conflict of the
     * MSI-X capability at C8h.  We don't know how to determine the GPU
     * architecture, instead we walk the capability chain to mark conflicts
     * and choose one or error based on the result.
     *
     * NB. Cap list head in pdev->config is already cleared, read from device.
     */
    ret = pread(vdev->vbasedev.fd, &tmp, 1,
                vdev->config_offset + PCI_CAPABILITY_LIST);
    if (ret != 1 || !is_valid_std_cap_offset(tmp)) {
        error_setg(errp, "NVIDIA GPUDirect Clique ID: error getting cap list");
        return false;
    }

    /* Walk the chain, flagging any capability that already sits at C8h/D4h */
    do {
        if (tmp == 0xC8) {
            c8_conflict = true;
        } else if (tmp == 0xD4) {
            d4_conflict = true;
        }
        tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT];
    } while (is_valid_std_cap_offset(tmp));

    if (!c8_conflict) {
        pos = 0xC8;
    } else if (!d4_conflict) {
        pos = 0xD4;
    } else {
        error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid config space");
        return false;
    }

    ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: ");
        return false;
    }

    /* Fully emulate the capability; lay out the fields per the diagram above */
    memset(vdev->emulated_config_bits + pos, 0xFF, 8);
    pos += PCI_CAP_FLAGS;
    pci_set_byte(pdev->config + pos++, 8);       /* vendor cap length */
    pci_set_byte(pdev->config + pos++, 'P');     /* signature bits 7:0 */
    pci_set_byte(pdev->config + pos++, '2');     /* signature bits 15:8 */
    pci_set_byte(pdev->config + pos++, 'P');     /* signature bits 23:16 */
    pci_set_byte(pdev->config + pos++, vdev->nv_gpudirect_clique << 3);
    pci_set_byte(pdev->config + pos, 0);         /* reserved */

    return true;
}
1567 
1568 /*
1569  * The VMD endpoint provides a real PCIe domain to the guest and the guest
1570  * kernel performs enumeration of the VMD sub-device domain. Guest transactions
1571  * to VMD sub-devices go through MMU translation from guest addresses to
1572  * physical addresses. When MMIO goes to an endpoint after being translated to
1573  * physical addresses, the bridge rejects the transaction because the window
1574  * has been programmed with guest addresses.
1575  *
1576  * VMD can use the Host Physical Address in order to correctly program the
1577  * bridge windows in its PCIe domain. VMD device 28C0 has HPA shadow registers
1578  * located at offset 0x2000 in MEMBAR2 (BAR 4). This quirk provides the HPA
1579  * shadow registers in a vendor-specific capability register for devices
1580  * without native support. The position of 0xE8-0xFF is in the reserved range
1581  * of the VMD device capability space following the Power Management
1582  * Capability.
1583  */
1584 #define VMD_SHADOW_CAP_VER 1
1585 #define VMD_SHADOW_CAP_LEN 24
1586 static bool vfio_add_vmd_shadow_cap(VFIOPCIDevice *vdev, Error **errp)
1587 {
1588     ERRP_GUARD();
1589     uint8_t membar_phys[16];
1590     int ret, pos = 0xE8;
1591 
1592     if (!(vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x201D) ||
1593           vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x467F) ||
1594           vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x4C3D) ||
1595           vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x9A0B))) {
1596         return true;
1597     }
1598 
1599     ret = pread(vdev->vbasedev.fd, membar_phys, 16,
1600                 vdev->config_offset + PCI_BASE_ADDRESS_2);
1601     if (ret != 16) {
1602         error_report("VMD %s cannot read MEMBARs (%d)",
1603                      vdev->vbasedev.name, ret);
1604         return false;
1605     }
1606 
1607     ret = pci_add_capability(&vdev->pdev, PCI_CAP_ID_VNDR, pos,
1608                              VMD_SHADOW_CAP_LEN, errp);
1609     if (ret < 0) {
1610         error_prepend(errp, "Failed to add VMD MEMBAR Shadow cap: ");
1611         return false;
1612     }
1613 
1614     memset(vdev->emulated_config_bits + pos, 0xFF, VMD_SHADOW_CAP_LEN);
1615     pos += PCI_CAP_FLAGS;
1616     pci_set_byte(vdev->pdev.config + pos++, VMD_SHADOW_CAP_LEN);
1617     pci_set_byte(vdev->pdev.config + pos++, VMD_SHADOW_CAP_VER);
1618     pci_set_long(vdev->pdev.config + pos, 0x53484457); /* SHDW */
1619     memcpy(vdev->pdev.config + pos + 4, membar_phys, 16);
1620 
1621     return true;
1622 }
1623 
1624 bool vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp)
1625 {
1626     if (!vfio_add_nv_gpudirect_cap(vdev, errp)) {
1627         return false;
1628     }
1629 
1630     if (!vfio_add_vmd_shadow_cap(vdev, errp)) {
1631         return false;
1632     }
1633 
1634     return true;
1635 }
1636