xref: /qemu/hw/vfio/pci-quirks.c (revision bcf3c3d029e73d54455e1d7a51177c37d668378c)
1 /*
2  * device quirks for PCI devices
3  *
4  * Copyright Red Hat, Inc. 2012-2015
5  *
6  * Authors:
7  *  Alex Williamson <alex.williamson@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include "qemu/error-report.h"
15 #include "qemu/range.h"
16 #include "qapi/error.h"
17 #include "qapi/visitor.h"
18 #include "hw/nvram/fw_cfg.h"
19 #include "pci.h"
20 #include "trace.h"
21 
22 /* Use uint32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */
23 static bool vfio_pci_is(VFIOPCIDevice *vdev, uint32_t vendor, uint32_t device)
24 {
25     return (vendor == PCI_ANY_ID || vendor == vdev->vendor_id) &&
26            (device == PCI_ANY_ID || device == vdev->device_id);
27 }
28 
29 static bool vfio_is_vga(VFIOPCIDevice *vdev)
30 {
31     PCIDevice *pdev = &vdev->pdev;
32     uint16_t class = pci_get_word(pdev->config + PCI_CLASS_DEVICE);
33 
34     return class == PCI_CLASS_DISPLAY_VGA;
35 }
36 
37 /*
38  * List of vendor/device IDs for which to disable
39  * option ROM loading.  This avoids guest hangs during ROM
40  * execution, as observed with the BCM 57810 card, for lack of a
41  * better way to handle such issues.
42  * The user can still override by specifying a romfile or
43  * rombar=1.
44  * Please see https://bugs.launchpad.net/qemu/+bug/1284874
45  * for an analysis of the 57810 card hang.  When adding
46  * a new vendor/device ID combination below, please also add
47  * your card/environment details and any information that could
48  * help in debugging to the bug tracking this issue.
49  */
50 static const struct {
51     uint32_t vendor;
52     uint32_t device;
53 } romblacklist[] = {
54     { 0x14e4, 0x168e }, /* Broadcom BCM 57810 */
55 };
56 
57 bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev)
58 {
59     int i;
60 
61     for (i = 0 ; i < ARRAY_SIZE(romblacklist); i++) {
62         if (vfio_pci_is(vdev, romblacklist[i].vendor, romblacklist[i].device)) {
63             trace_vfio_quirk_rom_blacklisted(vdev->vbasedev.name,
64                                              romblacklist[i].vendor,
65                                              romblacklist[i].device);
66             return true;
67         }
68     }
69     return false;
70 }
71 
72 /*
73  * Device specific region quirks (mostly backdoors to PCI config space)
74  */
75 
76 /*
77  * The generic window quirks operate on an address and data register,
78  * vfio_generic_window_address_quirk handles the address register and
79  * vfio_generic_window_data_quirk handles the data register.  These ops
80  * pass reads and writes through to hardware until a value matching the
81  * stored address match/mask is written.  When this occurs, the data
82  * register accesses emulated PCI config space for the device rather than
83  * passing through accesses.  This enables devices where PCI config space
84  * is accessible behind a window register to maintain the virtualization
85  * provided through vfio.
86  */
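/*
 * Schematically, a hypothetical guest access sequence through such a window
 * (the match/mask values and register offsets here are placeholders; the
 * device-specific probe functions below supply the real ones):
 *
 *   write address reg <- value where (value & ~mask) == match
 *                        -> window enabled, address_val = value & mask
 *   read/write data reg  -> serviced from emulated PCI config space at
 *                           offset address_val
 *   write address reg <- any non-matching value
 *                        -> window disabled, data reg passes through to hw
 */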
87 typedef struct VFIOConfigWindowMatch {
88     uint32_t match;
89     uint32_t mask;
90 } VFIOConfigWindowMatch;
91 
92 typedef struct VFIOConfigWindowQuirk {
93     struct VFIOPCIDevice *vdev;
94 
95     uint32_t address_val;
96 
97     uint32_t address_offset;
98     uint32_t data_offset;
99 
100     bool window_enabled;
101     uint8_t bar;
102 
103     MemoryRegion *addr_mem;
104     MemoryRegion *data_mem;
105 
106     uint32_t nr_matches;
107     VFIOConfigWindowMatch matches[];
108 } VFIOConfigWindowQuirk;
109 
110 static uint64_t vfio_generic_window_quirk_address_read(void *opaque,
111                                                        hwaddr addr,
112                                                        unsigned size)
113 {
114     VFIOConfigWindowQuirk *window = opaque;
115     VFIOPCIDevice *vdev = window->vdev;
116 
117     return vfio_region_read(&vdev->bars[window->bar].region,
118                             addr + window->address_offset, size);
119 }
120 
121 static void vfio_generic_window_quirk_address_write(void *opaque, hwaddr addr,
122                                                     uint64_t data,
123                                                     unsigned size)
124 {
125     VFIOConfigWindowQuirk *window = opaque;
126     VFIOPCIDevice *vdev = window->vdev;
127     int i;
128 
129     window->window_enabled = false;
130 
131     vfio_region_write(&vdev->bars[window->bar].region,
132                       addr + window->address_offset, data, size);
133 
134     for (i = 0; i < window->nr_matches; i++) {
135         if ((data & ~window->matches[i].mask) == window->matches[i].match) {
136             window->window_enabled = true;
137             window->address_val = data & window->matches[i].mask;
138             trace_vfio_quirk_generic_window_address_write(vdev->vbasedev.name,
139                                     memory_region_name(window->addr_mem), data);
140             break;
141         }
142     }
143 }
144 
145 static const MemoryRegionOps vfio_generic_window_address_quirk = {
146     .read = vfio_generic_window_quirk_address_read,
147     .write = vfio_generic_window_quirk_address_write,
148     .endianness = DEVICE_LITTLE_ENDIAN,
149 };
150 
151 static uint64_t vfio_generic_window_quirk_data_read(void *opaque,
152                                                     hwaddr addr, unsigned size)
153 {
154     VFIOConfigWindowQuirk *window = opaque;
155     VFIOPCIDevice *vdev = window->vdev;
156     uint64_t data;
157 
158     /* Always read data reg, discard if window enabled */
159     data = vfio_region_read(&vdev->bars[window->bar].region,
160                             addr + window->data_offset, size);
161 
162     if (window->window_enabled) {
163         data = vfio_pci_read_config(&vdev->pdev, window->address_val, size);
164         trace_vfio_quirk_generic_window_data_read(vdev->vbasedev.name,
165                                     memory_region_name(window->data_mem), data);
166     }
167 
168     return data;
169 }
170 
171 static void vfio_generic_window_quirk_data_write(void *opaque, hwaddr addr,
172                                                  uint64_t data, unsigned size)
173 {
174     VFIOConfigWindowQuirk *window = opaque;
175     VFIOPCIDevice *vdev = window->vdev;
176 
177     if (window->window_enabled) {
178         vfio_pci_write_config(&vdev->pdev, window->address_val, data, size);
179         trace_vfio_quirk_generic_window_data_write(vdev->vbasedev.name,
180                                     memory_region_name(window->data_mem), data);
181         return;
182     }
183 
184     vfio_region_write(&vdev->bars[window->bar].region,
185                       addr + window->data_offset, data, size);
186 }
187 
188 static const MemoryRegionOps vfio_generic_window_data_quirk = {
189     .read = vfio_generic_window_quirk_data_read,
190     .write = vfio_generic_window_quirk_data_write,
191     .endianness = DEVICE_LITTLE_ENDIAN,
192 };
193 
194 /*
195  * The generic mirror quirk handles devices which expose PCI config space
196  * through a region within a BAR.  When enabled, reads and writes are
197  * redirected through to emulated PCI config space.  XXX if PCI config space
198  * used memory regions, this could just be an alias.
199  */
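/*
 * For example, with the ATI BAR2 mirror configured below (offset 0x4000), a
 * guest read of BAR2 + 0x4000 + 0x0 returns the emulated vendor/device ID
 * dword rather than whatever the hardware backs that offset with.
 */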
200 typedef struct VFIOConfigMirrorQuirk {
201     struct VFIOPCIDevice *vdev;
202     uint32_t offset;
203     uint8_t bar;
204     MemoryRegion *mem;
205 } VFIOConfigMirrorQuirk;
206 
207 static uint64_t vfio_generic_quirk_mirror_read(void *opaque,
208                                                hwaddr addr, unsigned size)
209 {
210     VFIOConfigMirrorQuirk *mirror = opaque;
211     VFIOPCIDevice *vdev = mirror->vdev;
212     uint64_t data;
213 
214     /* Read and discard in case the hardware cares */
215     (void)vfio_region_read(&vdev->bars[mirror->bar].region,
216                            addr + mirror->offset, size);
217 
218     data = vfio_pci_read_config(&vdev->pdev, addr, size);
219     trace_vfio_quirk_generic_mirror_read(vdev->vbasedev.name,
220                                          memory_region_name(mirror->mem),
221                                          addr, data);
222     return data;
223 }
224 
225 static void vfio_generic_quirk_mirror_write(void *opaque, hwaddr addr,
226                                             uint64_t data, unsigned size)
227 {
228     VFIOConfigMirrorQuirk *mirror = opaque;
229     VFIOPCIDevice *vdev = mirror->vdev;
230 
231     vfio_pci_write_config(&vdev->pdev, addr, data, size);
232     trace_vfio_quirk_generic_mirror_write(vdev->vbasedev.name,
233                                           memory_region_name(mirror->mem),
234                                           addr, data);
235 }
236 
237 static const MemoryRegionOps vfio_generic_mirror_quirk = {
238     .read = vfio_generic_quirk_mirror_read,
239     .write = vfio_generic_quirk_mirror_write,
240     .endianness = DEVICE_LITTLE_ENDIAN,
241 };
242 
243 /* Is range1 fully contained within range2?  */
244 static bool vfio_range_contained(uint64_t first1, uint64_t len1,
245                                  uint64_t first2, uint64_t len2) {
246     return (first1 >= first2 && first1 + len1 <= first2 + len2);
247 }
248 
249 #define PCI_VENDOR_ID_ATI               0x1002
250 
251 /*
252  * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
253  * through VGA register 0x3c3.  On newer cards, the I/O port BAR is always
254  * BAR4 (older cards like the X550 used BAR1, but we don't care to support
255  * those).  Note that on bare metal, a read of 0x3c3 doesn't always return the
256  * I/O port BAR address.  Originally this was coded to return the virtual BAR
257  * address only if the physical register read returns the actual BAR address,
258  * but users have reported greater success if we return the virtual address
259  * unconditionally.
260  */
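/*
 * For example, if the emulated BAR4 is programmed to I/O address 0xc000, a
 * guest read of port 0x3c3 returns 0xc0 (bits 15:8 of the virtual BAR).
 * The address and value are illustrative only.
 */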
261 static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
262                                         hwaddr addr, unsigned size)
263 {
264     VFIOPCIDevice *vdev = opaque;
265     uint64_t data = vfio_pci_read_config(&vdev->pdev,
266                                          PCI_BASE_ADDRESS_4 + 1, size);
267 
268     trace_vfio_quirk_ati_3c3_read(vdev->vbasedev.name, data);
269 
270     return data;
271 }
272 
273 static const MemoryRegionOps vfio_ati_3c3_quirk = {
274     .read = vfio_ati_3c3_quirk_read,
275     .endianness = DEVICE_LITTLE_ENDIAN,
276 };
277 
278 static VFIOQuirk *vfio_quirk_alloc(int nr_mem)
279 {
280     VFIOQuirk *quirk = g_new0(VFIOQuirk, 1);
281     quirk->mem = g_new0(MemoryRegion, nr_mem);
282     quirk->nr_mem = nr_mem;
283 
284     return quirk;
285 }
286 
287 static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
288 {
289     VFIOQuirk *quirk;
290 
291     /*
292      * As long as the BAR is >= 256 bytes it will be aligned such that the
293      * lower byte is always zero.  Filter out anything else, if it exists.
294      */
295     if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
296         !vdev->bars[4].ioport || vdev->bars[4].region.size < 256) {
297         return;
298     }
299 
300     quirk = vfio_quirk_alloc(1);
301 
302     memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev,
303                           "vfio-ati-3c3-quirk", 1);
304     memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
305                                 3 /* offset 3 bytes from 0x3c0 */, quirk->mem);
306 
307     QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
308                       quirk, next);
309 
310     trace_vfio_quirk_ati_3c3_probe(vdev->vbasedev.name);
311 }
312 
313 /*
314  * Newer ATI/AMD devices, including HD5450 and HD7850, have a mirror to PCI
315  * config space through MMIO BAR2 at offset 0x4000.  Nothing seems to access
316  * the MMIO space directly, but a window to this space is provided through
317  * I/O port BAR4.  Offset 0x0 is the address register and offset 0x4 is the
318  * data register.  When the address is programmed to the range 0x4000-0x4fff,
319  * PCI configuration space is available.  Experimentation seems to indicate
320  * that the hardware may only provide read-only access through this window.
321  */
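/*
 * A hypothetical guest access sequence through this window (device address
 * and returned value are illustrative only):
 *
 * vfio: vfio_bar_write(0000:01:00.0:BAR4+0x0, 0x4000, 4) // select config 0x0
 * vfio: vfio_bar_read(0000:01:00.0:BAR4+0x4, 4) = 0xnnnn1002 // vendor/device
 */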
322 static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr)
323 {
324     VFIOQuirk *quirk;
325     VFIOConfigWindowQuirk *window;
326 
327     /* This window doesn't seem to be used except by legacy VGA code */
328     if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
329         !vdev->vga || nr != 4) {
330         return;
331     }
332 
333     quirk = vfio_quirk_alloc(2);
334     window = quirk->data = g_malloc0(sizeof(*window) +
335                                      sizeof(VFIOConfigWindowMatch));
336     window->vdev = vdev;
337     window->address_offset = 0;
338     window->data_offset = 4;
339     window->nr_matches = 1;
340     window->matches[0].match = 0x4000;
341     window->matches[0].mask = vdev->config_size - 1;
342     window->bar = nr;
343     window->addr_mem = &quirk->mem[0];
344     window->data_mem = &quirk->mem[1];
345 
346     memory_region_init_io(window->addr_mem, OBJECT(vdev),
347                           &vfio_generic_window_address_quirk, window,
348                           "vfio-ati-bar4-window-address-quirk", 4);
349     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
350                                         window->address_offset,
351                                         window->addr_mem, 1);
352 
353     memory_region_init_io(window->data_mem, OBJECT(vdev),
354                           &vfio_generic_window_data_quirk, window,
355                           "vfio-ati-bar4-window-data-quirk", 4);
356     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
357                                         window->data_offset,
358                                         window->data_mem, 1);
359 
360     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
361 
362     trace_vfio_quirk_ati_bar4_probe(vdev->vbasedev.name);
363 }
364 
365 /*
366  * Trap the BAR2 MMIO mirror to config space as well.
367  */
368 static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr)
369 {
370     VFIOQuirk *quirk;
371     VFIOConfigMirrorQuirk *mirror;
372 
373     /* Only enable on newer devices where BAR2 is 64bit */
374     if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
375         !vdev->vga || nr != 2 || !vdev->bars[2].mem64) {
376         return;
377     }
378 
379     quirk = vfio_quirk_alloc(1);
380     mirror = quirk->data = g_malloc0(sizeof(*mirror));
381     mirror->mem = quirk->mem;
382     mirror->vdev = vdev;
383     mirror->offset = 0x4000;
384     mirror->bar = nr;
385 
386     memory_region_init_io(mirror->mem, OBJECT(vdev),
387                           &vfio_generic_mirror_quirk, mirror,
388                           "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE);
389     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
390                                         mirror->offset, mirror->mem, 1);
391 
392     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
393 
394     trace_vfio_quirk_ati_bar2_probe(vdev->vbasedev.name);
395 }
396 
397 /*
398  * Older ATI/AMD cards like the X550 have a similar window to that above.
399  * I/O port BAR1 provides a window to a mirror of PCI config space located
400  * in BAR2 at offset 0xf00.  We don't care to support such older cards, but
401  * note it for future reference.
402  */
403 
404 #define PCI_VENDOR_ID_NVIDIA                    0x10de
405 
406 /*
407  * Nvidia has several different methods to get to config space; the
408  * nouveau project has several of these documented here:
409  * https://github.com/pathscale/envytools/tree/master/hwdocs
410  *
411  * The first quirk is actually not documented in envytools and is found
412  * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]).  This is an
413  * NV46 chipset.  The backdoor uses the legacy VGA I/O ports to access
414  * the mirror of PCI config space found at BAR0 offset 0x1800.  The access
415  * sequence first writes 0x338 to I/O port 0x3d4.  The target offset is
416  * then written to 0x3d0.  Finally, 0x538 (for a read) or 0x738 (for a
417  * write) is written to 0x3d4.  The BAR0 offset is then accessible
418  * through 0x3d0.  This quirk doesn't seem to be necessary on newer cards
419  * that use the I/O port BAR5 window, but it doesn't hurt to leave it.
420  */
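/*
 * A hypothetical access sequence reading config space offset 0x4 through
 * this backdoor (values per the state machine implemented below):
 *
 *   write 0x338  to 0x3d4   // SELECT
 *   write 0x1804 to 0x3d0   // WINDOW: BAR0 offset 0x1800 + config offset 0x4
 *   write 0x538  to 0x3d4   // READ
 *   read  from 0x3d0        // returns emulated config space dword at 0x4
 */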
421 typedef enum {NONE = 0, SELECT, WINDOW, READ, WRITE} VFIONvidia3d0State;
422 static const char *nv3d0_states[] = { "NONE", "SELECT",
423                                       "WINDOW", "READ", "WRITE" };
424 
425 typedef struct VFIONvidia3d0Quirk {
426     VFIOPCIDevice *vdev;
427     VFIONvidia3d0State state;
428     uint32_t offset;
429 } VFIONvidia3d0Quirk;
430 
431 static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque,
432                                            hwaddr addr, unsigned size)
433 {
434     VFIONvidia3d0Quirk *quirk = opaque;
435     VFIOPCIDevice *vdev = quirk->vdev;
436 
437     quirk->state = NONE;
438 
439     return vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
440                          addr + 0x14, size);
441 }
442 
443 static void vfio_nvidia_3d4_quirk_write(void *opaque, hwaddr addr,
444                                         uint64_t data, unsigned size)
445 {
446     VFIONvidia3d0Quirk *quirk = opaque;
447     VFIOPCIDevice *vdev = quirk->vdev;
448     VFIONvidia3d0State old_state = quirk->state;
449 
450     quirk->state = NONE;
451 
452     switch (data) {
453     case 0x338:
454         if (old_state == NONE) {
455             quirk->state = SELECT;
456             trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
457                                               nv3d0_states[quirk->state]);
458         }
459         break;
460     case 0x538:
461         if (old_state == WINDOW) {
462             quirk->state = READ;
463             trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
464                                               nv3d0_states[quirk->state]);
465         }
466         break;
467     case 0x738:
468         if (old_state == WINDOW) {
469             quirk->state = WRITE;
470             trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
471                                               nv3d0_states[quirk->state]);
472         }
473         break;
474     }
475 
476     vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
477                    addr + 0x14, data, size);
478 }
479 
480 static const MemoryRegionOps vfio_nvidia_3d4_quirk = {
481     .read = vfio_nvidia_3d4_quirk_read,
482     .write = vfio_nvidia_3d4_quirk_write,
483     .endianness = DEVICE_LITTLE_ENDIAN,
484 };
485 
486 static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
487                                            hwaddr addr, unsigned size)
488 {
489     VFIONvidia3d0Quirk *quirk = opaque;
490     VFIOPCIDevice *vdev = quirk->vdev;
491     VFIONvidia3d0State old_state = quirk->state;
492     uint64_t data = vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
493                                   addr + 0x10, size);
494 
495     quirk->state = NONE;
496 
497     if (old_state == READ &&
498         (quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
499         uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);
500 
501         data = vfio_pci_read_config(&vdev->pdev, offset, size);
502         trace_vfio_quirk_nvidia_3d0_read(vdev->vbasedev.name,
503                                          offset, size, data);
504     }
505 
506     return data;
507 }
508 
509 static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
510                                         uint64_t data, unsigned size)
511 {
512     VFIONvidia3d0Quirk *quirk = opaque;
513     VFIOPCIDevice *vdev = quirk->vdev;
514     VFIONvidia3d0State old_state = quirk->state;
515 
516     quirk->state = NONE;
517 
518     if (old_state == SELECT) {
519         quirk->offset = (uint32_t)data;
520         quirk->state = WINDOW;
521         trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
522                                           nv3d0_states[quirk->state]);
523     } else if (old_state == WRITE) {
524         if ((quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
525             uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);
526 
527             vfio_pci_write_config(&vdev->pdev, offset, data, size);
528             trace_vfio_quirk_nvidia_3d0_write(vdev->vbasedev.name,
529                                               offset, data, size);
530             return;
531         }
532     }
533 
534     vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
535                    addr + 0x10, data, size);
536 }
537 
538 static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
539     .read = vfio_nvidia_3d0_quirk_read,
540     .write = vfio_nvidia_3d0_quirk_write,
541     .endianness = DEVICE_LITTLE_ENDIAN,
542 };
543 
544 static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
545 {
546     VFIOQuirk *quirk;
547     VFIONvidia3d0Quirk *data;
548 
549     if (vdev->no_geforce_quirks ||
550         !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
551         !vdev->bars[1].region.size) {
552         return;
553     }
554 
555     quirk = vfio_quirk_alloc(2);
556     quirk->data = data = g_malloc0(sizeof(*data));
557     data->vdev = vdev;
558 
559     memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk,
560                           data, "vfio-nvidia-3d4-quirk", 2);
561     memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
562                                 0x14 /* 0x3c0 + 0x14 */, &quirk->mem[0]);
563 
564     memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_nvidia_3d0_quirk,
565                           data, "vfio-nvidia-3d0-quirk", 2);
566     memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
567                                 0x10 /* 0x3c0 + 0x10 */, &quirk->mem[1]);
568 
569     QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
570                       quirk, next);
571 
572     trace_vfio_quirk_nvidia_3d0_probe(vdev->vbasedev.name);
573 }
574 
575 /*
576  * The second quirk is documented in envytools.  The I/O port BAR5 is just
577  * a set of address/data ports to the MMIO BARs.  The BAR we care about is
578  * again BAR0.  This backdoor is apparently a bit newer than the one above,
579  * so we need to trap not only the 256 bytes @0x1800, but all of PCI config
580  * space, including extended config space, via the 4k window @0x88000.
581  */
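/*
 * A hypothetical guest sequence through the BAR5 window (per the quirk
 * below, the traps are only armed once bit 0 of both registers at BAR5+0x0
 * and BAR5+0x4 has been set; offsets 0x8/0xc are the address/data pair):
 *
 * vfio: vfio_bar_write(0000:01:00.0:BAR5+0x8, 0x88100, 4) // ext. config 0x100
 * vfio: vfio_bar_read(0000:01:00.0:BAR5+0xc, 4)           // emulated config dword
 */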
582 typedef struct VFIONvidiaBAR5Quirk {
583     uint32_t master;
584     uint32_t enable;
585     MemoryRegion *addr_mem;
586     MemoryRegion *data_mem;
587     bool enabled;
588     VFIOConfigWindowQuirk window; /* last for match data */
589 } VFIONvidiaBAR5Quirk;
590 
591 static void vfio_nvidia_bar5_enable(VFIONvidiaBAR5Quirk *bar5)
592 {
593     VFIOPCIDevice *vdev = bar5->window.vdev;
594 
595     if (((bar5->master & bar5->enable) & 0x1) == bar5->enabled) {
596         return;
597     }
598 
599     bar5->enabled = !bar5->enabled;
600     trace_vfio_quirk_nvidia_bar5_state(vdev->vbasedev.name,
601                                        bar5->enabled ?  "Enable" : "Disable");
602     memory_region_set_enabled(bar5->addr_mem, bar5->enabled);
603     memory_region_set_enabled(bar5->data_mem, bar5->enabled);
604 }
605 
606 static uint64_t vfio_nvidia_bar5_quirk_master_read(void *opaque,
607                                                    hwaddr addr, unsigned size)
608 {
609     VFIONvidiaBAR5Quirk *bar5 = opaque;
610     VFIOPCIDevice *vdev = bar5->window.vdev;
611 
612     return vfio_region_read(&vdev->bars[5].region, addr, size);
613 }
614 
615 static void vfio_nvidia_bar5_quirk_master_write(void *opaque, hwaddr addr,
616                                                 uint64_t data, unsigned size)
617 {
618     VFIONvidiaBAR5Quirk *bar5 = opaque;
619     VFIOPCIDevice *vdev = bar5->window.vdev;
620 
621     vfio_region_write(&vdev->bars[5].region, addr, data, size);
622 
623     bar5->master = data;
624     vfio_nvidia_bar5_enable(bar5);
625 }
626 
627 static const MemoryRegionOps vfio_nvidia_bar5_quirk_master = {
628     .read = vfio_nvidia_bar5_quirk_master_read,
629     .write = vfio_nvidia_bar5_quirk_master_write,
630     .endianness = DEVICE_LITTLE_ENDIAN,
631 };
632 
633 static uint64_t vfio_nvidia_bar5_quirk_enable_read(void *opaque,
634                                                    hwaddr addr, unsigned size)
635 {
636     VFIONvidiaBAR5Quirk *bar5 = opaque;
637     VFIOPCIDevice *vdev = bar5->window.vdev;
638 
639     return vfio_region_read(&vdev->bars[5].region, addr + 4, size);
640 }
641 
642 static void vfio_nvidia_bar5_quirk_enable_write(void *opaque, hwaddr addr,
643                                                 uint64_t data, unsigned size)
644 {
645     VFIONvidiaBAR5Quirk *bar5 = opaque;
646     VFIOPCIDevice *vdev = bar5->window.vdev;
647 
648     vfio_region_write(&vdev->bars[5].region, addr + 4, data, size);
649 
650     bar5->enable = data;
651     vfio_nvidia_bar5_enable(bar5);
652 }
653 
654 static const MemoryRegionOps vfio_nvidia_bar5_quirk_enable = {
655     .read = vfio_nvidia_bar5_quirk_enable_read,
656     .write = vfio_nvidia_bar5_quirk_enable_write,
657     .endianness = DEVICE_LITTLE_ENDIAN,
658 };
659 
660 static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr)
661 {
662     VFIOQuirk *quirk;
663     VFIONvidiaBAR5Quirk *bar5;
664     VFIOConfigWindowQuirk *window;
665 
666     if (vdev->no_geforce_quirks ||
667         !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
668         !vdev->vga || nr != 5 || !vdev->bars[5].ioport) {
669         return;
670     }
671 
672     quirk = vfio_quirk_alloc(4);
673     bar5 = quirk->data = g_malloc0(sizeof(*bar5) +
674                                    (sizeof(VFIOConfigWindowMatch) * 2));
675     window = &bar5->window;
676 
677     window->vdev = vdev;
678     window->address_offset = 0x8;
679     window->data_offset = 0xc;
680     window->nr_matches = 2;
681     window->matches[0].match = 0x1800;
682     window->matches[0].mask = PCI_CONFIG_SPACE_SIZE - 1;
683     window->matches[1].match = 0x88000;
684     window->matches[1].mask = vdev->config_size - 1;
685     window->bar = nr;
686     window->addr_mem = bar5->addr_mem = &quirk->mem[0];
687     window->data_mem = bar5->data_mem = &quirk->mem[1];
688 
689     memory_region_init_io(window->addr_mem, OBJECT(vdev),
690                           &vfio_generic_window_address_quirk, window,
691                           "vfio-nvidia-bar5-window-address-quirk", 4);
692     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
693                                         window->address_offset,
694                                         window->addr_mem, 1);
695     memory_region_set_enabled(window->addr_mem, false);
696 
697     memory_region_init_io(window->data_mem, OBJECT(vdev),
698                           &vfio_generic_window_data_quirk, window,
699                           "vfio-nvidia-bar5-window-data-quirk", 4);
700     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
701                                         window->data_offset,
702                                         window->data_mem, 1);
703     memory_region_set_enabled(window->data_mem, false);
704 
705     memory_region_init_io(&quirk->mem[2], OBJECT(vdev),
706                           &vfio_nvidia_bar5_quirk_master, bar5,
707                           "vfio-nvidia-bar5-master-quirk", 4);
708     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
709                                         0, &quirk->mem[2], 1);
710 
711     memory_region_init_io(&quirk->mem[3], OBJECT(vdev),
712                           &vfio_nvidia_bar5_quirk_enable, bar5,
713                           "vfio-nvidia-bar5-enable-quirk", 4);
714     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
715                                         4, &quirk->mem[3], 1);
716 
717     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
718 
719     trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name);
720 }
721 
722 /*
723  * Finally, BAR0 itself.  We want to redirect any accesses to either
724  * 0x1800 or 0x88000 through the PCI config space access functions.
725  */
726 static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr,
727                                            uint64_t data, unsigned size)
728 {
729     VFIOConfigMirrorQuirk *mirror = opaque;
730     VFIOPCIDevice *vdev = mirror->vdev;
731     PCIDevice *pdev = &vdev->pdev;
732 
733     vfio_generic_quirk_mirror_write(opaque, addr, data, size);
734 
735     /*
736      * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
737      * MSI capability ID register.  Both the ID and next register are
738      * read-only, so we allow writes covering either of those to real hw.
739      */
740     if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
741         vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
742         vfio_region_write(&vdev->bars[mirror->bar].region,
743                           addr + mirror->offset, data, size);
744         trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name);
745     }
746 }
747 
748 static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
749     .read = vfio_generic_quirk_mirror_read,
750     .write = vfio_nvidia_quirk_mirror_write,
751     .endianness = DEVICE_LITTLE_ENDIAN,
752 };
753 
754 static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
755 {
756     VFIOQuirk *quirk;
757     VFIOConfigMirrorQuirk *mirror;
758 
759     if (vdev->no_geforce_quirks ||
760         !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
761         !vfio_is_vga(vdev) || nr != 0) {
762         return;
763     }
764 
765     quirk = vfio_quirk_alloc(1);
766     mirror = quirk->data = g_malloc0(sizeof(*mirror));
767     mirror->mem = quirk->mem;
768     mirror->vdev = vdev;
769     mirror->offset = 0x88000;
770     mirror->bar = nr;
771 
772     memory_region_init_io(mirror->mem, OBJECT(vdev),
773                           &vfio_nvidia_mirror_quirk, mirror,
774                           "vfio-nvidia-bar0-88000-mirror-quirk",
775                           vdev->config_size);
776     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
777                                         mirror->offset, mirror->mem, 1);
778 
779     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
780 
781     /* The 0x1800 offset mirror only seems to get used by legacy VGA */
782     if (vdev->vga) {
783         quirk = vfio_quirk_alloc(1);
784         mirror = quirk->data = g_malloc0(sizeof(*mirror));
785         mirror->mem = quirk->mem;
786         mirror->vdev = vdev;
787         mirror->offset = 0x1800;
788         mirror->bar = nr;
789 
790         memory_region_init_io(mirror->mem, OBJECT(vdev),
791                               &vfio_nvidia_mirror_quirk, mirror,
792                               "vfio-nvidia-bar0-1800-mirror-quirk",
793                               PCI_CONFIG_SPACE_SIZE);
794         memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
795                                             mirror->offset, mirror->mem, 1);
796 
797         QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
798     }
799 
800     trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name);
801 }
802 
803 /*
804  * TODO - Some Nvidia devices provide config access to their companion HDA
805  * device and even to their parent bridge via these config space mirrors.
806  * Add quirks for those regions.
807  */
808 
809 #define PCI_VENDOR_ID_REALTEK 0x10ec
810 
811 /*
812  * RTL8168 devices have a backdoor that can access the MSI-X table.  At BAR2
813  * offset 0x70 there is a dword data register and offset 0x74 is a dword address
814  * register.  According to the Linux r8169 driver, the MSI-X table is addressed
815  * when the "type" portion of the address register is set to 0x1.  This appears
816  * to be bits 16:30.  Bit 31 is both a write indicator and some sort of
817  * "address latched" indicator.  Bits 12:15 are a mask field, which we can
818  * ignore because the MSI-X table should always be accessed as a dword (full
819  * mask).  Bits 0:11 are the offset within the type.
820  *
821  * Example trace:
822  *
823  * Read from MSI-X table offset 0
824  * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
825  * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
826  * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
827  *
828  * Write 0xfee00000 to MSI-X table offset 0
829  * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
830  * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
831  * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
832  */
833 typedef struct VFIOrtl8168Quirk {
834     VFIOPCIDevice *vdev;
835     uint32_t addr;
836     uint32_t data;
837     bool enabled;
838 } VFIOrtl8168Quirk;
839 
840 static uint64_t vfio_rtl8168_quirk_address_read(void *opaque,
841                                                 hwaddr addr, unsigned size)
842 {
843     VFIOrtl8168Quirk *rtl = opaque;
844     VFIOPCIDevice *vdev = rtl->vdev;
845     uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size);
846 
847     if (rtl->enabled) {
848         data = rtl->addr ^ 0x80000000U; /* latch/complete */
849         trace_vfio_quirk_rtl8168_fake_latch(vdev->vbasedev.name, data);
850     }
851 
852     return data;
853 }
854 
855 static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr,
856                                              uint64_t data, unsigned size)
857 {
858     VFIOrtl8168Quirk *rtl = opaque;
859     VFIOPCIDevice *vdev = rtl->vdev;
860 
861     rtl->enabled = false;
862 
863     if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */
864         rtl->enabled = true;
865         rtl->addr = (uint32_t)data;
866 
867         if (data & 0x80000000U) { /* Do write */
868             if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
869                 hwaddr offset = data & 0xfff;
870                 uint64_t val = rtl->data;
871 
872                 trace_vfio_quirk_rtl8168_msix_write(vdev->vbasedev.name,
873                                                     (uint16_t)offset, val);
874 
875                 /* Write to the proper guest MSI-X table instead */
876                 memory_region_dispatch_write(&vdev->pdev.msix_table_mmio,
877                                              offset, val, size,
878                                              MEMTXATTRS_UNSPECIFIED);
879             }
880             return; /* Do not write guest MSI-X data to hardware */
881         }
882     }
883 
884     vfio_region_write(&vdev->bars[2].region, addr + 0x74, data, size);
885 }
886 
887 static const MemoryRegionOps vfio_rtl_address_quirk = {
888     .read = vfio_rtl8168_quirk_address_read,
889     .write = vfio_rtl8168_quirk_address_write,
890     .valid = {
891         .min_access_size = 4,
892         .max_access_size = 4,
893         .unaligned = false,
894     },
895     .endianness = DEVICE_LITTLE_ENDIAN,
896 };
897 
898 static uint64_t vfio_rtl8168_quirk_data_read(void *opaque,
899                                              hwaddr addr, unsigned size)
900 {
901     VFIOrtl8168Quirk *rtl = opaque;
902     VFIOPCIDevice *vdev = rtl->vdev;
903     uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x70, size);
904 
905     if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
906         hwaddr offset = rtl->addr & 0xfff;
907         memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, offset,
908                                     &data, size, MEMTXATTRS_UNSPECIFIED);
909         trace_vfio_quirk_rtl8168_msix_read(vdev->vbasedev.name, offset, data);
910     }
911 
912     return data;
913 }
914 
915 static void vfio_rtl8168_quirk_data_write(void *opaque, hwaddr addr,
916                                           uint64_t data, unsigned size)
917 {
918     VFIOrtl8168Quirk *rtl = opaque;
919     VFIOPCIDevice *vdev = rtl->vdev;
920 
921     rtl->data = (uint32_t)data;
922 
923     vfio_region_write(&vdev->bars[2].region, addr + 0x70, data, size);
924 }
925 
926 static const MemoryRegionOps vfio_rtl_data_quirk = {
927     .read = vfio_rtl8168_quirk_data_read,
928     .write = vfio_rtl8168_quirk_data_write,
929     .valid = {
930         .min_access_size = 4,
931         .max_access_size = 4,
932         .unaligned = false,
933     },
934     .endianness = DEVICE_LITTLE_ENDIAN,
935 };
936 
937 static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr)
938 {
939     VFIOQuirk *quirk;
940     VFIOrtl8168Quirk *rtl;
941 
942     if (!vfio_pci_is(vdev, PCI_VENDOR_ID_REALTEK, 0x8168) || nr != 2) {
943         return;
944     }
945 
946     quirk = vfio_quirk_alloc(2);
947     quirk->data = rtl = g_malloc0(sizeof(*rtl));
948     rtl->vdev = vdev;
949 
950     memory_region_init_io(&quirk->mem[0], OBJECT(vdev),
951                           &vfio_rtl_address_quirk, rtl,
952                           "vfio-rtl8168-window-address-quirk", 4);
953     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
954                                         0x74, &quirk->mem[0], 1);
955 
956     memory_region_init_io(&quirk->mem[1], OBJECT(vdev),
957                           &vfio_rtl_data_quirk, rtl,
958                           "vfio-rtl8168-window-data-quirk", 4);
959     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
960                                         0x70, &quirk->mem[1], 1);
961 
962     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
963 
964     trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name);
965 }
966 
967 /*
968  * Intel IGD support
969  *
970  * Obviously IGD is not a discrete device; this is evidenced not only by it
971  * being integrated into the CPU, but by the various chipset and BIOS
972  * dependencies that it brings along with it.  Intel is trying to move away
973  * from this, and Broadwell and newer devices can run in what Intel calls
974  * "Universal Pass-Through" mode, or UPT.  Theoretically in UPT mode, nothing
975  * more is required beyond assigning the IGD device to a VM.  There are
976  * however support limitations to this mode.  It only supports IGD as a
977  * secondary graphics device in the VM and it doesn't officially support any
978  * physical outputs.
979  *
980  * The code here attempts to enable what we'll call legacy mode assignment,
981  * where IGD retains most of the capabilities we expect it to have on bare
982  * metal.  To enable this mode, the IGD device must be assigned to the VM
983  * at PCI address 00:02.0, it must have a ROM, it very likely needs VGA
984  * support, we must have VM BIOS support for reserving and populating some
985  * of the required tables, and we need to tweak the chipset with revisions
986  * and IDs and an LPC/ISA bridge device.  The intention is to make all of
987  * this happen automatically by installing the device at the correct VM PCI
988  * bus address.  If any of the conditions are not met, we cross our fingers
989  * and hope the user knows better.
990  *
991  * NB - It is possible to enable physical outputs in UPT mode by supplying
992  * an OpRegion table.  We don't do this by default because the guest driver
993  * behaves differently if an OpRegion is provided and no monitor is attached
994  * vs no OpRegion and a monitor being attached or not.  Effectively, if a
995  * headless setup is desired, the OpRegion gets in the way of that.
996  */
997 
998 /*
999  * This presumes the device is already known to be an Intel VGA device, so we
1000  * take liberties in which device ID bits match which generation.  This should
1001  * not be taken as an indication that all the devices are supported, or even
1002  * supportable, some of them don't even support VT-d.
1003  * See linux:include/drm/i915_pciids.h for IDs.
1004  */
1005 static int igd_gen(VFIOPCIDevice *vdev)
1006 {
1007     if ((vdev->device_id & 0xfff) == 0xa84) {
1008         return 8; /* Broxton */
1009     }
1010 
1011     switch (vdev->device_id & 0xff00) {
1012     /* Old, untested, unavailable, unknown */
1013     case 0x0000:
1014     case 0x2500:
1015     case 0x2700:
1016     case 0x2900:
1017     case 0x2a00:
1018     case 0x2e00:
1019     case 0x3500:
1020     case 0xa000:
1021         return -1;
1022     /* SandyBridge, IvyBridge, ValleyView, Haswell */
1023     case 0x0100:
1024     case 0x0400:
1025     case 0x0a00:
1026     case 0x0c00:
1027     case 0x0d00:
1028     case 0x0f00:
1029         return 6;
1030     /* BroadWell, CherryView, SkyLake, KabyLake */
1031     case 0x1600:
1032     case 0x1900:
1033     case 0x2200:
1034     case 0x5900:
1035         return 8;
1036     }
1037 
1038     return 8; /* Assume newer is compatible */
1039 }
1040 
1041 typedef struct VFIOIGDQuirk {
1042     struct VFIOPCIDevice *vdev;
1043     uint32_t index;
1044     uint32_t bdsm;
1045 } VFIOIGDQuirk;
1046 
1047 #define IGD_GMCH 0x50 /* Graphics Control Register */
1048 #define IGD_BDSM 0x5c /* Base Data of Stolen Memory */
1049 #define IGD_ASLS 0xfc /* ASL Storage Register */
1050 
1051 /*
1052  * The OpRegion includes the Video BIOS Table, which seems important for
1053  * telling the driver what sort of outputs it has.  Without this, the device
1054  * may work in the guest, but we may not get output.  This also requires BIOS
1055  * support to reserve and populate a section of guest memory sufficient for
1056  * the table and to write the base address of that memory to the ASLS register
1057  * of the IGD device.
1058  */
1059 int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
1060                                struct vfio_region_info *info, Error **errp)
1061 {
1062     int ret;
1063 
1064     vdev->igd_opregion = g_malloc0(info->size);
1065     ret = pread(vdev->vbasedev.fd, vdev->igd_opregion,
1066                 info->size, info->offset);
1067     if (ret != info->size) {
1068         error_setg(errp, "failed to read IGD OpRegion");
1069         g_free(vdev->igd_opregion);
1070         vdev->igd_opregion = NULL;
1071         return -EINVAL;
1072     }
1073 
1074     /*
1075      * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to
1076      * allocate 32bit reserved memory for, copy these contents into, and write
1077      * the reserved memory base address to the device ASLS register at 0xFC.
1078      * Alignment of this reserved region seems flexible, but using a 4k page
1079      * alignment seems to work well.  This interface assumes a single IGD
1080      * device, which may be at VM address 00:02.0 in legacy mode or another
1081      * address in UPT mode.
1082      *
1083      * NB, there may be future use cases discovered where the VM should have
1084      * direct interaction with the host OpRegion, in which case the write to
1085      * the ASLS register would trigger MemoryRegion setup to enable that.
1086      */
1087     fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion",
1088                     vdev->igd_opregion, info->size);
1089 
1090     trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name);
1091 
1092     pci_set_long(vdev->pdev.config + IGD_ASLS, 0);
1093     pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0);
1094     pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0);
1095 
1096     return 0;
1097 }
1098 
1099 /*
1100  * The rather short list of registers that we copy from the host devices.
1101  * The LPC/ISA bridge values are definitely needed to support the vBIOS; the
1102  * host bridge values may or may not be needed depending on the guest OS.
1103  * Since we're only munging revision and subsystem values on the host bridge,
1104  * we don't require our own device.  The LPC/ISA bridge needs to be our very
1105  * own though.
1106  */
1107 typedef struct {
1108     uint8_t offset;
1109     uint8_t len;
1110 } IGDHostInfo;
1111 
1112 static const IGDHostInfo igd_host_bridge_infos[] = {
1113     {PCI_REVISION_ID,         2},
1114     {PCI_SUBSYSTEM_VENDOR_ID, 2},
1115     {PCI_SUBSYSTEM_ID,        2},
1116 };
1117 
1118 static const IGDHostInfo igd_lpc_bridge_infos[] = {
1119     {PCI_VENDOR_ID,           2},
1120     {PCI_DEVICE_ID,           2},
1121     {PCI_REVISION_ID,         2},
1122     {PCI_SUBSYSTEM_VENDOR_ID, 2},
1123     {PCI_SUBSYSTEM_ID,        2},
1124 };
1125 
1126 static int vfio_pci_igd_copy(VFIOPCIDevice *vdev, PCIDevice *pdev,
1127                              struct vfio_region_info *info,
1128                              const IGDHostInfo *list, int len)
1129 {
1130     int i, ret;
1131 
1132     for (i = 0; i < len; i++) {
1133         ret = pread(vdev->vbasedev.fd, pdev->config + list[i].offset,
1134                     list[i].len, info->offset + list[i].offset);
1135         if (ret != list[i].len) {
1136             error_report("IGD copy failed: %m");
1137             return -errno;
1138         }
1139     }
1140 
1141     return 0;
1142 }
1143 
1144 /*
1145  * Stuff a few values into the host bridge.
1146  */
1147 static int vfio_pci_igd_host_init(VFIOPCIDevice *vdev,
1148                                   struct vfio_region_info *info)
1149 {
1150     PCIBus *bus;
1151     PCIDevice *host_bridge;
1152     int ret;
1153 
1154     bus = pci_device_root_bus(&vdev->pdev);
1155     host_bridge = pci_find_device(bus, 0, PCI_DEVFN(0, 0));
1156 
1157     if (!host_bridge) {
1158         error_report("Can't find host bridge");
1159         return -ENODEV;
1160     }
1161 
1162     ret = vfio_pci_igd_copy(vdev, host_bridge, info, igd_host_bridge_infos,
1163                             ARRAY_SIZE(igd_host_bridge_infos));
1164     if (!ret) {
1165         trace_vfio_pci_igd_host_bridge_enabled(vdev->vbasedev.name);
1166     }
1167 
1168     return ret;
1169 }
1170 
1171 /*
1172  * IGD LPC/ISA bridge support code.  The vBIOS needs this, but we can't write
1173  * arbitrary values into just any bridge, so we must create our own.  We try
1174  * to handle the case where the user has created it for us, which they might
1175  * want to do to enable multifunction so that we don't occupy the whole PCI slot.
1176  */
1177 static void vfio_pci_igd_lpc_bridge_realize(PCIDevice *pdev, Error **errp)
1178 {
1179     if (pdev->devfn != PCI_DEVFN(0x1f, 0)) {
1180         error_setg(errp, "VFIO dummy ISA/LPC bridge must have address 1f.0");
1181     }
1182 }
1183 
1184 static void vfio_pci_igd_lpc_bridge_class_init(ObjectClass *klass, void *data)
1185 {
1186     DeviceClass *dc = DEVICE_CLASS(klass);
1187     PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
1188 
1189     set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
1190     dc->desc = "VFIO dummy ISA/LPC bridge for IGD assignment";
1191     dc->hotpluggable = false;
1192     k->realize = vfio_pci_igd_lpc_bridge_realize;
1193     k->class_id = PCI_CLASS_BRIDGE_ISA;
1194 }
1195 
1196 static TypeInfo vfio_pci_igd_lpc_bridge_info = {
1197     .name = "vfio-pci-igd-lpc-bridge",
1198     .parent = TYPE_PCI_DEVICE,
1199     .class_init = vfio_pci_igd_lpc_bridge_class_init,
1200     .interfaces = (InterfaceInfo[]) {
1201         { INTERFACE_CONVENTIONAL_PCI_DEVICE },
1202         { },
1203     },
1204 };
1205 
1206 static void vfio_pci_igd_register_types(void)
1207 {
1208     type_register_static(&vfio_pci_igd_lpc_bridge_info);
1209 }
1210 
1211 type_init(vfio_pci_igd_register_types)
1212 
1213 static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev,
1214                                  struct vfio_region_info *info)
1215 {
1216     PCIDevice *lpc_bridge;
1217     int ret;
1218 
1219     lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev),
1220                                  0, PCI_DEVFN(0x1f, 0));
1221     if (!lpc_bridge) {
1222         lpc_bridge = pci_create_simple(pci_device_root_bus(&vdev->pdev),
1223                                  PCI_DEVFN(0x1f, 0), "vfio-pci-igd-lpc-bridge");
1224     }
1225 
1226     ret = vfio_pci_igd_copy(vdev, lpc_bridge, info, igd_lpc_bridge_infos,
1227                             ARRAY_SIZE(igd_lpc_bridge_infos));
1228     if (!ret) {
1229         trace_vfio_pci_igd_lpc_bridge_enabled(vdev->vbasedev.name);
1230     }
1231 
1232     return ret;
1233 }
1234 
1235 /*
1236  * IGD Gen8 and newer support up to 8MB for the GTT and use a 64bit PTE
1237  * entry; older IGDs use 2MB and 32bit PTEs.  Each PTE maps a 4k page.  Therefore
1238  * we either have 2M/4k * 4 = 2k or 8M/4k * 8 = 16k as the maximum iobar index
1239  * for programming the GTT.
1240  *
1241  * See linux:include/drm/i915_drm.h for shift and mask values.
1242  */
1243 static int vfio_igd_gtt_max(VFIOPCIDevice *vdev)
1244 {
1245     uint32_t gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, sizeof(gmch));
1246     int ggms, gen = igd_gen(vdev);
1247 
1248     ggms = (gmch >> (gen < 8 ? 8 : 6)) & 0x3;
1250     if (gen > 6) {
1251         ggms = 1 << ggms;
1252     }
1253 
1254     ggms *= 1024 * 1024;
1255 
1256     return (ggms / (4 * 1024)) * (gen < 8 ? 4 : 8);
1257 }
1258 
1259 /*
1260  * The IGD ROM will make use of stolen memory (GGMS) for support of VESA modes.
1261  * Somehow the host stolen memory range is used for this, but how the ROM gets
1262  * it is a mystery; perhaps it's hardcoded into the ROM.  Thankfully though, it
1263  * reprograms the GTT through the IOBAR where we can trap it and transpose the
1264  * programming to the VM allocated buffer.  That buffer gets reserved by the VM
1265  * firmware via the fw_cfg entry added below.  Here we're just monitoring the
1266  * IOBAR address and data registers to detect a write sequence targeting the
1267  * GTTADR.  This code was developed from observed behavior and unfortunately
1268  * doesn't have a direct spec reference.
1269  */
1270 static uint64_t vfio_igd_quirk_data_read(void *opaque,
1271                                          hwaddr addr, unsigned size)
1272 {
1273     VFIOIGDQuirk *igd = opaque;
1274     VFIOPCIDevice *vdev = igd->vdev;
1275 
1276     igd->index = ~0;
1277 
1278     return vfio_region_read(&vdev->bars[4].region, addr + 4, size);
1279 }
1280 
1281 static void vfio_igd_quirk_data_write(void *opaque, hwaddr addr,
1282                                       uint64_t data, unsigned size)
1283 {
1284     VFIOIGDQuirk *igd = opaque;
1285     VFIOPCIDevice *vdev = igd->vdev;
1286     uint64_t val = data;
1287     int gen = igd_gen(vdev);
1288 
1289     /*
1290      * Programming the GGMS starts at index 0x1 and uses every 4th index (i.e.
1291      * 0x1, 0x5, 0x9, 0xd,...).  For pre-Gen8, each 4-byte write is a whole PTE
1292      * entry, with the 0th (enable) bit set.  For Gen8 and up, PTEs are 64bit, so
1293      * entries 0x5 & 0xd are the high dword, in our case zero.  Each PTE points
1294      * to a 4k page, which we translate to a page from the VM allocated region,
1295      * pointed to by the BDSM register.  If this is not set, we fail.
1296      *
1297      * We trap writes to the full configured GTT size, but we typically only
1298      * see the vBIOS writing up to (nearly) the 1MB barrier.  In fact it often
1299      * seems to miss the last entry for an even 1MB GTT.  Doing a gratuitous
1300      * write of that last entry does work, but is hopefully unnecessary since
1301      * we clear the previous GTT on initialization.
1302      */
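    /*
     * For example (all addresses hypothetical): if the host BDSM reports
     * stolen memory at 0x80000000 and the VM firmware programs its reserved
     * buffer at 0x40000000 into the virtual BDSM, a vBIOS PTE write of
     * 0x80001001 (page 0x80001000, enable bit set) is rewritten to
     * 0x40001001 before being passed to hardware.
     */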
1303     if ((igd->index % 4 == 1) && igd->index < vfio_igd_gtt_max(vdev)) {
1304         if (gen < 8 || (igd->index % 8 == 1)) {
1305             uint32_t base;
1306 
1307             base = pci_get_long(vdev->pdev.config + IGD_BDSM);
1308             if (!base) {
1309                 hw_error("vfio-igd: Guest attempted to program IGD GTT before "
1310                          "BIOS reserved stolen memory.  Unsupported BIOS?");
1311             }
1312 
1313             val = data - igd->bdsm + base;
1314         } else {
1315             val = 0; /* upper 32bits of pte, we only enable below 4G PTEs */
1316         }
1317 
1318         trace_vfio_pci_igd_bar4_write(vdev->vbasedev.name,
1319                                       igd->index, data, val);
1320     }
1321 
1322     vfio_region_write(&vdev->bars[4].region, addr + 4, val, size);
1323 
1324     igd->index = ~0;
1325 }
1326 
1327 static const MemoryRegionOps vfio_igd_data_quirk = {
1328     .read = vfio_igd_quirk_data_read,
1329     .write = vfio_igd_quirk_data_write,
1330     .endianness = DEVICE_LITTLE_ENDIAN,
1331 };
1332 
1333 static uint64_t vfio_igd_quirk_index_read(void *opaque,
1334                                           hwaddr addr, unsigned size)
1335 {
1336     VFIOIGDQuirk *igd = opaque;
1337     VFIOPCIDevice *vdev = igd->vdev;
1338 
1339     igd->index = ~0;
1340 
1341     return vfio_region_read(&vdev->bars[4].region, addr, size);
1342 }
1343 
1344 static void vfio_igd_quirk_index_write(void *opaque, hwaddr addr,
1345                                        uint64_t data, unsigned size)
1346 {
1347     VFIOIGDQuirk *igd = opaque;
1348     VFIOPCIDevice *vdev = igd->vdev;
1349 
1350     igd->index = data;
1351 
1352     vfio_region_write(&vdev->bars[4].region, addr, data, size);
1353 }
1354 
1355 static const MemoryRegionOps vfio_igd_index_quirk = {
1356     .read = vfio_igd_quirk_index_read,
1357     .write = vfio_igd_quirk_index_write,
1358     .endianness = DEVICE_LITTLE_ENDIAN,
1359 };
1360 
1361 static void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr)
1362 {
1363     struct vfio_region_info *rom = NULL, *opregion = NULL,
1364                             *host = NULL, *lpc = NULL;
1365     VFIOQuirk *quirk;
1366     VFIOIGDQuirk *igd;
1367     PCIDevice *lpc_bridge;
1368     int i, ret, ggms_mb, gms_mb = 0, gen;
1369     uint64_t *bdsm_size;
1370     uint32_t gmch;
1371     uint16_t cmd_orig, cmd;
1372     Error *err = NULL;
1373 
1374     /*
1375      * This must be an Intel VGA device at address 00:02.0 for us to even
1376      * consider enabling legacy mode.  The vBIOS has dependencies on the
1377      * PCI bus address.
1378      */
1379     if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) ||
1380         !vfio_is_vga(vdev) || nr != 4 ||
1381         &vdev->pdev != pci_find_device(pci_device_root_bus(&vdev->pdev),
1382                                        0, PCI_DEVFN(0x2, 0))) {
1383         return;
1384     }
1385 
1386     /*
1387      * We need to create an LPC/ISA bridge at PCI bus address 00:1f.0 that we
1388      * can stuff host values into, so if there's already one there and it's not
1389      * one we can hack on, legacy mode is no-go.  Sorry Q35.
1390      */
1391     lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev),
1392                                  0, PCI_DEVFN(0x1f, 0));
1393     if (lpc_bridge && !object_dynamic_cast(OBJECT(lpc_bridge),
1394                                            "vfio-pci-igd-lpc-bridge")) {
1395         error_report("IGD device %s cannot support legacy mode due to existing "
1396                      "devices at address 1f.0", vdev->vbasedev.name);
1397         return;
1398     }
1399 
1400     /*
1401  * IGD is not a standard; Intel likes to change the specs often.  We
1402  * only attempt to support back to SandyBridge and we hope that newer
1403      * devices maintain compatibility with generation 8.
1404      */
1405     gen = igd_gen(vdev);
1406     if (gen != 6 && gen != 8) {
1407         error_report("IGD device %s is unsupported in legacy mode, "
1408                      "try SandyBridge or newer", vdev->vbasedev.name);
1409         return;
1410     }
1411 
1412     /*
1413      * Most of what we're doing here is to enable the ROM to run, so if
1414      * there's no ROM, there's no point in setting up this quirk.
1415      * NB. We only seem to get BIOS ROMs, so a UEFI VM would need CSM support.
1416      */
1417     ret = vfio_get_region_info(&vdev->vbasedev,
1418                                VFIO_PCI_ROM_REGION_INDEX, &rom);
1419     if ((ret || !rom->size) && !vdev->pdev.romfile) {
1420         error_report("IGD device %s has no ROM, legacy mode disabled",
1421                      vdev->vbasedev.name);
1422         goto out;
1423     }
1424 
1425     /*
1426      * Ignore the hotplug corner case and mark the ROM as failed; we can't
1427      * create the devices we need for legacy mode in the hotplug scenario.
1428      */
1429     if (vdev->pdev.qdev.hotplugged) {
1430         error_report("IGD device %s hotplugged, ROM disabled, "
1431                      "legacy mode disabled", vdev->vbasedev.name);
1432         vdev->rom_read_failed = true;
1433         goto out;
1434     }
1435 
1436     /*
1437      * Check whether we have all the vfio device specific regions to
1438      * support legacy mode (added in Linux v4.6).  If not, bail.
1439      */
1440     ret = vfio_get_dev_region_info(&vdev->vbasedev,
1441                         VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
1442                         VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion);
1443     if (ret) {
1444         error_report("IGD device %s does not support OpRegion access, "
1445                      "legacy mode disabled", vdev->vbasedev.name);
1446         goto out;
1447     }
1448 
1449     ret = vfio_get_dev_region_info(&vdev->vbasedev,
1450                         VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
1451                         VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, &host);
1452     if (ret) {
1453         error_report("IGD device %s does not support host bridge access, "
1454                      "legacy mode disabled", vdev->vbasedev.name);
1455         goto out;
1456     }
1457 
1458     ret = vfio_get_dev_region_info(&vdev->vbasedev,
1459                         VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
1460                         VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, &lpc);
1461     if (ret) {
1462         error_report("IGD device %s does not support LPC bridge access, "
1463                      "legacy mode disabled", vdev->vbasedev.name);
1464         goto out;
1465     }
1466 
1467     gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, 4);
1468 
1469     /*
1470      * If IGD VGA Disable is clear (expected) and VGA is not already enabled,
1471      * try to enable it.  Probably shouldn't be using legacy mode without VGA,
1472      * but there's also no point in enabling VGA if it's disabled in hardware.
1473      */
1474     if (!(gmch & 0x2) && !vdev->vga && vfio_populate_vga(vdev, &err)) {
1475         error_reportf_err(err, ERR_PREFIX, vdev->vbasedev.name);
1476         error_report("IGD device %s failed to enable VGA access, "
1477                      "legacy mode disabled", vdev->vbasedev.name);
1478         goto out;
1479     }
1480 
1481     /* Create our LPC/ISA bridge */
1482     ret = vfio_pci_igd_lpc_init(vdev, lpc);
1483     if (ret) {
1484         error_report("IGD device %s failed to create LPC bridge, "
1485                      "legacy mode disabled", vdev->vbasedev.name);
1486         goto out;
1487     }
1488 
1489     /* Stuff some host values into the VM PCI host bridge */
1490     ret = vfio_pci_igd_host_init(vdev, host);
1491     if (ret) {
1492         error_report("IGD device %s failed to modify host bridge, "
1493                      "legacy mode disabled", vdev->vbasedev.name);
1494         goto out;
1495     }
1496 
1497     /* Set up OpRegion access */
1498     ret = vfio_pci_igd_opregion_init(vdev, opregion, &err);
1499     if (ret) {
1500         error_append_hint(&err, "IGD legacy mode disabled\n");
1501         error_reportf_err(err, ERR_PREFIX, vdev->vbasedev.name);
1502         goto out;
1503     }
1504 
1505     /* Set up our quirk to munge GTT addresses to the VM allocated buffer */
1506     quirk = vfio_quirk_alloc(2);
1507     igd = quirk->data = g_malloc0(sizeof(*igd));
1508     igd->vdev = vdev;
1509     igd->index = ~0;
1510     igd->bdsm = vfio_pci_read_config(&vdev->pdev, IGD_BDSM, 4);
1511     igd->bdsm &= ~((1 << 20) - 1); /* 1MB aligned */
1512 
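         /*
          * BAR4 offsets 0 and 4 form an index/data register pair; trap both
          * so that vfio_igd_data_quirk can remap GTT addresses to the
          * VM-allocated buffer, as the comment above describes.
          */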
1513     memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_igd_index_quirk,
1514                           igd, "vfio-igd-index-quirk", 4);
1515     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1516                                         0, &quirk->mem[0], 1);
1517 
1518     memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_igd_data_quirk,
1519                           igd, "vfio-igd-data-quirk", 4);
1520     memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1521                                         4, &quirk->mem[1], 1);
1522 
1523     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1524 
1525     /* Determine the size of stolen memory needed for GTT */
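         /*
          * The GGMS field is read from GMCH bits 9:8 on gen6 and bits 7:6 on
          * gen8; gen6 encodes the size in MB directly, gen8 encodes a
          * power-of-two exponent, hence the shift below.
          */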
1526     ggms_mb = (gmch >> (gen < 8 ? 8 : 6)) & 0x3;
1527     if (gen > 6) {
1528         ggms_mb = 1 << ggms_mb;
1529     }
1530 
1531     /*
1532      * Assume we have no GMS memory, but allow it to be overridden by device
1533      * option (experimental).  The spec doesn't actually allow zero GMS when
1534      * IVD (IGD VGA Disable) is clear, but the claim is that it's unused,
1535      * so let's not waste VM memory for it.
1536      */
1537     gmch &= ~((gen < 8 ? 0x1f : 0xff) << (gen < 8 ? 3 : 8));
1538 
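         /* The igd_gms option is in 32MB units; values above 0x10 are rejected. */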
1539     if (vdev->igd_gms) {
1540         if (vdev->igd_gms <= 0x10) {
1541             gms_mb = vdev->igd_gms * 32;
1542             gmch |= vdev->igd_gms << (gen < 8 ? 3 : 8);
1543         } else {
1544             error_report("Unsupported IGD GMS value 0x%x", vdev->igd_gms);
1545             vdev->igd_gms = 0;
1546         }
1547     }
1548 
1549     /*
1550      * Request reserved memory for stolen memory via fw_cfg.  VM firmware
1551      * must allocate a 1MB aligned reserved memory region below 4GB with
1552      * the requested size (in bytes) for use by the Intel PCI class VGA
1553      * device at VM address 00:02.0.  The base address of this reserved
1554      * memory region must be written to the device BDSM register at PCI
1555      * config offset 0x5C.
1556      */
1557     bdsm_size = g_malloc(sizeof(*bdsm_size));
1558     *bdsm_size = cpu_to_le64((ggms_mb + gms_mb) * 1024 * 1024);
1559     fw_cfg_add_file(fw_cfg_find(), "etc/igd-bdsm-size",
1560                     bdsm_size, sizeof(*bdsm_size));
1561 
1562     /* GMCH is read-only, emulated */
1563     pci_set_long(vdev->pdev.config + IGD_GMCH, gmch);
1564     pci_set_long(vdev->pdev.wmask + IGD_GMCH, 0);
1565     pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0);
1566 
1567     /* BDSM is read-write, emulated.  The BIOS needs to be able to write it */
1568     pci_set_long(vdev->pdev.config + IGD_BDSM, 0);
1569     pci_set_long(vdev->pdev.wmask + IGD_BDSM, ~0);
1570     pci_set_long(vdev->emulated_config_bits + IGD_BDSM, ~0);
1571 
1572     /*
1573      * This IOBAR gives us access to GTTADR, which allows us to write to
1574      * the GTT itself.  So let's go ahead and write zero to all the GTT
1575      * entries to avoid spurious DMA faults.  Be sure I/O access is enabled
1576      * before talking to the device.
1577      */
1578     if (pread(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig),
1579               vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) {
1580         error_report("IGD device %s - failed to read PCI command register",
1581                      vdev->vbasedev.name);
1582     }
1583 
1584     cmd = cmd_orig | PCI_COMMAND_IO;
1585 
1586     if (pwrite(vdev->vbasedev.fd, &cmd, sizeof(cmd),
1587                vdev->config_offset + PCI_COMMAND) != sizeof(cmd)) {
1588         error_report("IGD device %s - failed to write PCI command register",
1589                      vdev->vbasedev.name);
1590     }
1591 
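         /*
          * In the I/O BAR, offset 0 is the index register and offset 4 the
          * data register; each iteration below zeroes one 4-byte GTT entry.
          */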
1592     for (i = 1; i < vfio_igd_gtt_max(vdev); i += 4) {
1593         vfio_region_write(&vdev->bars[4].region, 0, i, 4);
1594         vfio_region_write(&vdev->bars[4].region, 4, 0, 4);
1595     }
1596 
1597     if (pwrite(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig),
1598                vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) {
1599         error_report("IGD device %s - failed to restore PCI command register",
1600                      vdev->vbasedev.name);
1601     }
1602 
1603     trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, ggms_mb + gms_mb);
1604 
1605 out:
1606     g_free(rom);
1607     g_free(opregion);
1608     g_free(host);
1609     g_free(lpc);
1610 }
1611 
1612 /*
1613  * Common quirk probe entry points.
1614  */
1615 void vfio_vga_quirk_setup(VFIOPCIDevice *vdev)
1616 {
1617     vfio_vga_probe_ati_3c3_quirk(vdev);
1618     vfio_vga_probe_nvidia_3d0_quirk(vdev);
1619 }
1620 
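     /*
      * The *_exit functions below only remove the quirk MemoryRegions from
      * their containers; the matching *_finalize functions unparent and free
      * them.
      */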
1621 void vfio_vga_quirk_exit(VFIOPCIDevice *vdev)
1622 {
1623     VFIOQuirk *quirk;
1624     int i, j;
1625 
1626     for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1627         QLIST_FOREACH(quirk, &vdev->vga->region[i].quirks, next) {
1628             for (j = 0; j < quirk->nr_mem; j++) {
1629                 memory_region_del_subregion(&vdev->vga->region[i].mem,
1630                                             &quirk->mem[j]);
1631             }
1632         }
1633     }
1634 }
1635 
1636 void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev)
1637 {
1638     int i, j;
1639 
1640     for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1641         while (!QLIST_EMPTY(&vdev->vga->region[i].quirks)) {
1642             VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga->region[i].quirks);
1643             QLIST_REMOVE(quirk, next);
1644             for (j = 0; j < quirk->nr_mem; j++) {
1645                 object_unparent(OBJECT(&quirk->mem[j]));
1646             }
1647             g_free(quirk->mem);
1648             g_free(quirk->data);
1649             g_free(quirk);
1650         }
1651     }
1652 }
1653 
1654 void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
1655 {
1656     vfio_probe_ati_bar4_quirk(vdev, nr);
1657     vfio_probe_ati_bar2_quirk(vdev, nr);
1658     vfio_probe_nvidia_bar5_quirk(vdev, nr);
1659     vfio_probe_nvidia_bar0_quirk(vdev, nr);
1660     vfio_probe_rtl8168_bar2_quirk(vdev, nr);
1661     vfio_probe_igd_bar4_quirk(vdev, nr);
1662 }
1663 
1664 void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
1665 {
1666     VFIOBAR *bar = &vdev->bars[nr];
1667     VFIOQuirk *quirk;
1668     int i;
1669 
1670     QLIST_FOREACH(quirk, &bar->quirks, next) {
1671         for (i = 0; i < quirk->nr_mem; i++) {
1672             memory_region_del_subregion(bar->region.mem, &quirk->mem[i]);
1673         }
1674     }
1675 }
1676 
1677 void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr)
1678 {
1679     VFIOBAR *bar = &vdev->bars[nr];
1680     int i;
1681 
1682     while (!QLIST_EMPTY(&bar->quirks)) {
1683         VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
1684         QLIST_REMOVE(quirk, next);
1685         for (i = 0; i < quirk->nr_mem; i++) {
1686             object_unparent(OBJECT(&quirk->mem[i]));
1687         }
1688         g_free(quirk->mem);
1689         g_free(quirk->data);
1690         g_free(quirk);
1691     }
1692 }
1693 
1694 /*
1695  * Reset quirks
1696  */
1697 
1698 /*
1699  * AMD Radeon PCI config reset, based on Linux:
1700  *   drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running()
1701  *   drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset
1702  *   drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc()
1703  *   drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock()
1704  * IDs: include/drm/drm_pciids.h
1705  * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0
1706  *
1707  * Bonaire and Hawaii GPUs do not respond to a bus reset.  This is a bug in the
1708  * hardware that should be fixed on future ASICs.  The symptom of this is that
1709  * once the accelerated driver loads, Windows guests will BSOD on subsequent
1710  * attempts to load the driver, such as after VM reset or shutdown/restart.  To
1711  * work around this, we do an AMD specific PCI config reset, followed by an SMC
1712  * reset.  The PCI config reset only works if SMC firmware is running, so we
1713  * have a dependency on the state of the device as to whether this reset will
1714  * be effective.  There are still cases where we won't be able to kick the
1715  * device into working, but this greatly improves the usability overall.  The
1716  * config reset magic is relatively common on AMD GPUs, but the setup and SMC
1717  * poking is largely ASIC specific.
1718  */
1719 static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev)
1720 {
1721     uint32_t clk, pc_c;
1722 
1723     /*
1724      * Registers 200h and 204h are index and data registers for accessing
1725      * indirect configuration registers within the device.
1726      */
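         /*
          * The indirect offsets below appear to match the radeon driver's
          * ci_is_smc_running(): 0x80000004 is the SMC clock control register
          * and 0x80000370 the SMC program counter.
          */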
1727     vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
1728     clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1729     vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4);
1730     pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1731 
1732     return (!(clk & 1) && (0x20100 <= pc_c));
1733 }
1734 
1735 /*
1736  * The scope of a config reset is controlled by a mode bit in the misc register
1737  * and a fuse, exposed as a bit in another register.  The fuse is the default
1738  * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the formula
1739  * scope = !(misc ^ fuse), where the resulting scope is encoded the same as
1740  * the fuse.  A truth table therefore tells us that if misc == fuse, we need
1741  * to flip the value of the bit in the misc register.
1742  */
1743 static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev)
1744 {
1745     uint32_t misc, fuse;
1746     bool a, b;
1747 
1748     vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4);
1749     fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1750     b = fuse & 64;
1751 
1752     vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4);
1753     misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1754     a = misc & 2;
1755 
1756     if (a == b) {
1757         vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4);
1758         vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */
1759     }
1760 }
1761 
1762 static int vfio_radeon_reset(VFIOPCIDevice *vdev)
1763 {
1764     PCIDevice *pdev = &vdev->pdev;
1765     int i, ret = 0;
1766     uint32_t data;
1767 
1768     /* Defer to a kernel implemented reset */
1769     if (vdev->vbasedev.reset_works) {
1770         trace_vfio_quirk_ati_bonaire_reset_skipped(vdev->vbasedev.name);
1771         return -ENODEV;
1772     }
1773 
1774     /* Enable only memory BAR access */
1775     vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2);
1776 
1777     /* Reset only works if SMC firmware is loaded and running */
1778     if (!vfio_radeon_smc_is_running(vdev)) {
1779         ret = -EINVAL;
1780         trace_vfio_quirk_ati_bonaire_reset_no_smc(vdev->vbasedev.name);
1781         goto out;
1782     }
1783 
1784     /* Make sure only the GFX function is reset */
1785     vfio_radeon_set_gfx_only_reset(vdev);
1786 
1787     /* AMD PCI config reset */
1788     vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4);
1789     usleep(100);
1790 
1791     /* Read back the memory size to make sure we're out of reset */
1792     for (i = 0; i < 100000; i++) {
1793         if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) {
1794             goto reset_smc;
1795         }
1796         usleep(1);
1797     }
1798 
1799     trace_vfio_quirk_ati_bonaire_reset_timeout(vdev->vbasedev.name);
1800 
1801 reset_smc:
1802     /* Reset SMC */
1803     vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4);
1804     data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1805     data |= 1;
1806     vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
1807 
1808     /* Disable SMC clock */
1809     vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
1810     data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1811     data |= 1;
1812     vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
1813 
1814     trace_vfio_quirk_ati_bonaire_reset_done(vdev->vbasedev.name);
1815 
1816 out:
1817     /* Restore PCI command register */
1818     vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2);
1819 
1820     return ret;
1821 }
1822 
1823 void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev)
1824 {
1825     switch (vdev->vendor_id) {
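         /* 0x1002 is the AMD/ATI vendor ID */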
1826     case 0x1002:
1827         switch (vdev->device_id) {
1828         /* Bonaire */
1829         case 0x6649: /* Bonaire [FirePro W5100] */
1830         case 0x6650:
1831         case 0x6651:
1832         case 0x6658: /* Bonaire XTX [Radeon R7 260X] */
1833         case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */
1834         case 0x665d: /* Bonaire [Radeon R7 200 Series] */
1835         /* Hawaii */
1836         case 0x67A0: /* Hawaii XT GL [FirePro W9100] */
1837         case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */
1838         case 0x67A2:
1839         case 0x67A8:
1840         case 0x67A9:
1841         case 0x67AA:
1842         case 0x67B0: /* Hawaii XT [Radeon R9 290X] */
1843         case 0x67B1: /* Hawaii PRO [Radeon R9 290] */
1844         case 0x67B8:
1845         case 0x67B9:
1846         case 0x67BA:
1847         case 0x67BE:
1848             vdev->resetfn = vfio_radeon_reset;
1849             trace_vfio_quirk_ati_bonaire_reset(vdev->vbasedev.name);
1850             break;
1851         }
1852         break;
1853     }
1854 }
1855 
1856 /*
1857  * The NVIDIA GPUDirect P2P Vendor capability allows the user to specify
1858  * devices as a member of a clique.  Devices within the same clique ID
1859  * are capable of direct P2P.  It's the user's responsibility to ensure
1860  * this is correct.  The spec says this may reside at any unused config
1861  * offset, but it reserves C8h and recommends hypervisors place it there.
1862  * The spec also states that the hypervisor should place this capability
1863  * at the end of the capability list, thus next is defined as 0h.
1864  *
1865  * +----------------+----------------+----------------+----------------+
1866  * | sig 7:0 ('P')  |  vndr len (8h) |    next (0h)   |   cap id (9h)  |
1867  * +----------------+----------------+----------------+----------------+
1868  * | rsvd 15:7(0h),id 6:3,ver 2:0(0h)|          sig 23:8 ('P2')        |
1869  * +---------------------------------+---------------------------------+
1870  *
1871  * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf
1872  */
1873 static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v,
1874                                        const char *name, void *opaque,
1875                                        Error **errp)
1876 {
1877     DeviceState *dev = DEVICE(obj);
1878     Property *prop = opaque;
1879     uint8_t *ptr = qdev_get_prop_ptr(dev, prop);
1880 
1881     visit_type_uint8(v, name, ptr, errp);
1882 }
1883 
1884 static void set_nv_gpudirect_clique_id(Object *obj, Visitor *v,
1885                                        const char *name, void *opaque,
1886                                        Error **errp)
1887 {
1888     DeviceState *dev = DEVICE(obj);
1889     Property *prop = opaque;
1890     uint8_t value, *ptr = qdev_get_prop_ptr(dev, prop);
1891     Error *local_err = NULL;
1892 
1893     if (dev->realized) {
1894         qdev_prop_set_after_realize(dev, name, errp);
1895         return;
1896     }
1897 
1898     visit_type_uint8(v, name, &value, &local_err);
1899     if (local_err) {
1900         error_propagate(errp, local_err);
1901         return;
1902     }
1903 
1904     if (value & ~0xF) {
1905         error_setg(errp, "Property %s: valid range 0-15", name);
1906         return;
1907     }
1908 
1909     *ptr = value;
1910 }
1911 
1912 const PropertyInfo qdev_prop_nv_gpudirect_clique = {
1913     .name = "uint4",
1914     .description = "NVIDIA GPUDirect Clique ID (0 - 15)",
1915     .get = get_nv_gpudirect_clique_id,
1916     .set = set_nv_gpudirect_clique_id,
1917 };
1918 
1919 static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
1920 {
1921     PCIDevice *pdev = &vdev->pdev;
1922     int ret, pos = 0xC8;
1923 
1924     if (vdev->nv_gpudirect_clique == 0xFF) {
1925         return 0;
1926     }
1927 
1928     if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) {
1929         error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid device vendor");
1930         return -EINVAL;
1931     }
1932 
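         /* The base class is the upper byte of the 16-bit class code */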
1933     if (pci_get_byte(pdev->config + PCI_CLASS_DEVICE + 1) !=
1934         PCI_BASE_CLASS_DISPLAY) {
1935         error_setg(errp, "NVIDIA GPUDirect Clique ID: unsupported PCI class");
1936         return -EINVAL;
1937     }
1938 
1939     ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp);
1940     if (ret < 0) {
1941         error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: ");
1942         return ret;
1943     }
1944 
1945     memset(vdev->emulated_config_bits + pos, 0xFF, 8);
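         /*
          * pci_add_capability() has already written the capability ID and next
          * pointer; skip past them and fill in the body per the layout above:
          * vendor length (8), "P2P" signature, clique ID in bits 6:3 with
          * version 0 in bits 2:0, and a trailing reserved zero byte.
          */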
1946     pos += PCI_CAP_FLAGS;
1947     pci_set_byte(pdev->config + pos++, 8);
1948     pci_set_byte(pdev->config + pos++, 'P');
1949     pci_set_byte(pdev->config + pos++, '2');
1950     pci_set_byte(pdev->config + pos++, 'P');
1951     pci_set_byte(pdev->config + pos++, vdev->nv_gpudirect_clique << 3);
1952     pci_set_byte(pdev->config + pos, 0);
1953 
1954     return 0;
1955 }
1956 
1957 int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp)
1958 {
1959     int ret;
1960 
1961     ret = vfio_add_nv_gpudirect_cap(vdev, errp);
1962     if (ret) {
1963         return ret;
1964     }
1965 
1966     return 0;
1967 }
1968