1 /* 2 * device quirks for PCI devices 3 * 4 * Copyright Red Hat, Inc. 2012-2015 5 * 6 * Authors: 7 * Alex Williamson <alex.williamson@redhat.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 */ 12 13 #include "qemu/osdep.h" 14 #include CONFIG_DEVICES 15 #include "exec/memop.h" 16 #include "qemu/units.h" 17 #include "qemu/log.h" 18 #include "qemu/error-report.h" 19 #include "qemu/main-loop.h" 20 #include "qemu/module.h" 21 #include "qemu/range.h" 22 #include "qapi/error.h" 23 #include "qapi/visitor.h" 24 #include <sys/ioctl.h> 25 #include "hw/nvram/fw_cfg.h" 26 #include "hw/qdev-properties.h" 27 #include "pci.h" 28 #include "pci-quirks.h" 29 #include "trace.h" 30 31 /* 32 * List of device ids/vendor ids for which to disable 33 * option rom loading. This avoids the guest hangs during rom 34 * execution as noticed with the BCM 57810 card for lack of a 35 * more better way to handle such issues. 36 * The user can still override by specifying a romfile or 37 * rombar=1. 38 * Please see https://bugs.launchpad.net/qemu/+bug/1284874 39 * for an analysis of the 57810 card hang. When adding 40 * a new vendor id/device id combination below, please also add 41 * your card/environment details and information that could 42 * help in debugging to the bug tracking this issue 43 */ 44 static const struct { 45 uint32_t vendor; 46 uint32_t device; 47 } rom_denylist[] = { 48 { 0x14e4, 0x168e }, /* Broadcom BCM 57810 */ 49 }; 50 51 bool vfio_opt_rom_in_denylist(VFIOPCIDevice *vdev) 52 { 53 int i; 54 55 for (i = 0 ; i < ARRAY_SIZE(rom_denylist); i++) { 56 if (vfio_pci_is(vdev, rom_denylist[i].vendor, rom_denylist[i].device)) { 57 trace_vfio_quirk_rom_in_denylist(vdev->vbasedev.name, 58 rom_denylist[i].vendor, 59 rom_denylist[i].device); 60 return true; 61 } 62 } 63 return false; 64 } 65 66 /* 67 * Device specific region quirks (mostly backdoors to PCI config space) 68 */ 69 70 static uint64_t vfio_generic_window_quirk_address_read(void *opaque, 71 hwaddr addr, 72 unsigned size) 73 { 74 VFIOConfigWindowQuirk *window = opaque; 75 VFIOPCIDevice *vdev = window->vdev; 76 77 return vfio_region_read(&vdev->bars[window->bar].region, 78 addr + window->address_offset, size); 79 } 80 81 static void vfio_generic_window_quirk_address_write(void *opaque, hwaddr addr, 82 uint64_t data, 83 unsigned size) 84 { 85 VFIOConfigWindowQuirk *window = opaque; 86 VFIOPCIDevice *vdev = window->vdev; 87 int i; 88 89 window->window_enabled = false; 90 91 vfio_region_write(&vdev->bars[window->bar].region, 92 addr + window->address_offset, data, size); 93 94 for (i = 0; i < window->nr_matches; i++) { 95 if ((data & ~window->matches[i].mask) == window->matches[i].match) { 96 window->window_enabled = true; 97 window->address_val = data & window->matches[i].mask; 98 trace_vfio_quirk_generic_window_address_write(vdev->vbasedev.name, 99 memory_region_name(window->addr_mem), data); 100 break; 101 } 102 } 103 } 104 105 const MemoryRegionOps vfio_generic_window_address_quirk = { 106 .read = vfio_generic_window_quirk_address_read, 107 .write = vfio_generic_window_quirk_address_write, 108 .endianness = DEVICE_LITTLE_ENDIAN, 109 }; 110 111 static uint64_t vfio_generic_window_quirk_data_read(void *opaque, 112 hwaddr addr, unsigned size) 113 { 114 VFIOConfigWindowQuirk *window = opaque; 115 VFIOPCIDevice *vdev = window->vdev; 116 uint64_t data; 117 118 /* Always read data reg, discard if window enabled */ 119 data = vfio_region_read(&vdev->bars[window->bar].region, 120 addr + window->data_offset, size); 121 122 if (window->window_enabled) { 123 data = vfio_pci_read_config(&vdev->pdev, window->address_val, size); 124 trace_vfio_quirk_generic_window_data_read(vdev->vbasedev.name, 125 memory_region_name(window->data_mem), data); 126 } 127 128 return data; 129 } 130 131 static void vfio_generic_window_quirk_data_write(void *opaque, hwaddr addr, 132 uint64_t data, unsigned size) 133 { 134 VFIOConfigWindowQuirk *window = opaque; 135 VFIOPCIDevice *vdev = window->vdev; 136 137 if (window->window_enabled) { 138 vfio_pci_write_config(&vdev->pdev, window->address_val, data, size); 139 trace_vfio_quirk_generic_window_data_write(vdev->vbasedev.name, 140 memory_region_name(window->data_mem), data); 141 return; 142 } 143 144 vfio_region_write(&vdev->bars[window->bar].region, 145 addr + window->data_offset, data, size); 146 } 147 148 const MemoryRegionOps vfio_generic_window_data_quirk = { 149 .read = vfio_generic_window_quirk_data_read, 150 .write = vfio_generic_window_quirk_data_write, 151 .endianness = DEVICE_LITTLE_ENDIAN, 152 }; 153 154 static uint64_t vfio_generic_quirk_mirror_read(void *opaque, 155 hwaddr addr, unsigned size) 156 { 157 VFIOConfigMirrorQuirk *mirror = opaque; 158 VFIOPCIDevice *vdev = mirror->vdev; 159 uint64_t data; 160 161 /* Read and discard in case the hardware cares */ 162 (void)vfio_region_read(&vdev->bars[mirror->bar].region, 163 addr + mirror->offset, size); 164 165 data = vfio_pci_read_config(&vdev->pdev, addr, size); 166 trace_vfio_quirk_generic_mirror_read(vdev->vbasedev.name, 167 memory_region_name(mirror->mem), 168 addr, data); 169 return data; 170 } 171 172 static void vfio_generic_quirk_mirror_write(void *opaque, hwaddr addr, 173 uint64_t data, unsigned size) 174 { 175 VFIOConfigMirrorQuirk *mirror = opaque; 176 VFIOPCIDevice *vdev = mirror->vdev; 177 178 vfio_pci_write_config(&vdev->pdev, addr, data, size); 179 trace_vfio_quirk_generic_mirror_write(vdev->vbasedev.name, 180 memory_region_name(mirror->mem), 181 addr, data); 182 } 183 184 const MemoryRegionOps vfio_generic_mirror_quirk = { 185 .read = vfio_generic_quirk_mirror_read, 186 .write = vfio_generic_quirk_mirror_write, 187 .endianness = DEVICE_LITTLE_ENDIAN, 188 }; 189 190 /* Is range1 fully contained within range2? */ 191 static bool vfio_range_contained(uint64_t first1, uint64_t len1, 192 uint64_t first2, uint64_t len2) { 193 return (first1 >= first2 && first1 + len1 <= first2 + len2); 194 } 195 196 #define PCI_VENDOR_ID_ATI 0x1002 197 198 /* 199 * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR 200 * through VGA register 0x3c3. On newer cards, the I/O port BAR is always 201 * BAR4 (older cards like the X550 used BAR1, but we don't care to support 202 * those). Note that on bare metal, a read of 0x3c3 doesn't always return the 203 * I/O port BAR address. Originally this was coded to return the virtual BAR 204 * address only if the physical register read returns the actual BAR address, 205 * but users have reported greater success if we return the virtual address 206 * unconditionally. 207 */ 208 static uint64_t vfio_ati_3c3_quirk_read(void *opaque, 209 hwaddr addr, unsigned size) 210 { 211 VFIOPCIDevice *vdev = opaque; 212 uint64_t data = vfio_pci_read_config(&vdev->pdev, 213 PCI_BASE_ADDRESS_4 + 1, size); 214 215 trace_vfio_quirk_ati_3c3_read(vdev->vbasedev.name, data); 216 217 return data; 218 } 219 220 static void vfio_ati_3c3_quirk_write(void *opaque, hwaddr addr, 221 uint64_t data, unsigned size) 222 { 223 qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid access\n", __func__); 224 } 225 226 static const MemoryRegionOps vfio_ati_3c3_quirk = { 227 .read = vfio_ati_3c3_quirk_read, 228 .write = vfio_ati_3c3_quirk_write, 229 .endianness = DEVICE_LITTLE_ENDIAN, 230 }; 231 232 VFIOQuirk *vfio_quirk_alloc(int nr_mem) 233 { 234 VFIOQuirk *quirk = g_new0(VFIOQuirk, 1); 235 QLIST_INIT(&quirk->ioeventfds); 236 quirk->mem = g_new0(MemoryRegion, nr_mem); 237 quirk->nr_mem = nr_mem; 238 239 return quirk; 240 } 241 242 static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd) 243 { 244 QLIST_REMOVE(ioeventfd, next); 245 memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size, 246 true, ioeventfd->data, &ioeventfd->e); 247 248 if (ioeventfd->vfio) { 249 struct vfio_device_ioeventfd vfio_ioeventfd; 250 251 vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd); 252 vfio_ioeventfd.flags = ioeventfd->size; 253 vfio_ioeventfd.data = ioeventfd->data; 254 vfio_ioeventfd.offset = ioeventfd->region->fd_offset + 255 ioeventfd->region_addr; 256 vfio_ioeventfd.fd = -1; 257 258 if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd)) { 259 error_report("Failed to remove vfio ioeventfd for %s+0x%" 260 HWADDR_PRIx"[%d]:0x%"PRIx64" (%m)", 261 memory_region_name(ioeventfd->mr), ioeventfd->addr, 262 ioeventfd->size, ioeventfd->data); 263 } 264 } else { 265 qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), 266 NULL, NULL, NULL); 267 } 268 269 event_notifier_cleanup(&ioeventfd->e); 270 trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr), 271 (uint64_t)ioeventfd->addr, ioeventfd->size, 272 ioeventfd->data); 273 g_free(ioeventfd); 274 } 275 276 static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk) 277 { 278 VFIOIOEventFD *ioeventfd, *tmp; 279 280 QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) { 281 if (ioeventfd->dynamic) { 282 vfio_ioeventfd_exit(vdev, ioeventfd); 283 } 284 } 285 } 286 287 static void vfio_ioeventfd_handler(void *opaque) 288 { 289 VFIOIOEventFD *ioeventfd = opaque; 290 291 if (event_notifier_test_and_clear(&ioeventfd->e)) { 292 vfio_region_write(ioeventfd->region, ioeventfd->region_addr, 293 ioeventfd->data, ioeventfd->size); 294 trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr), 295 (uint64_t)ioeventfd->addr, ioeventfd->size, 296 ioeventfd->data); 297 } 298 } 299 300 static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev, 301 MemoryRegion *mr, hwaddr addr, 302 unsigned size, uint64_t data, 303 VFIORegion *region, 304 hwaddr region_addr, bool dynamic) 305 { 306 VFIOIOEventFD *ioeventfd; 307 308 if (vdev->no_kvm_ioeventfd) { 309 return NULL; 310 } 311 312 ioeventfd = g_malloc0(sizeof(*ioeventfd)); 313 314 if (event_notifier_init(&ioeventfd->e, 0)) { 315 g_free(ioeventfd); 316 return NULL; 317 } 318 319 /* 320 * MemoryRegion and relative offset, plus additional ioeventfd setup 321 * parameters for configuring and later tearing down KVM ioeventfd. 322 */ 323 ioeventfd->mr = mr; 324 ioeventfd->addr = addr; 325 ioeventfd->size = size; 326 ioeventfd->data = data; 327 ioeventfd->dynamic = dynamic; 328 /* 329 * VFIORegion and relative offset for implementing the userspace 330 * handler. data & size fields shared for both uses. 331 */ 332 ioeventfd->region = region; 333 ioeventfd->region_addr = region_addr; 334 335 if (!vdev->no_vfio_ioeventfd) { 336 struct vfio_device_ioeventfd vfio_ioeventfd; 337 338 vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd); 339 vfio_ioeventfd.flags = ioeventfd->size; 340 vfio_ioeventfd.data = ioeventfd->data; 341 vfio_ioeventfd.offset = ioeventfd->region->fd_offset + 342 ioeventfd->region_addr; 343 vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e); 344 345 ioeventfd->vfio = !ioctl(vdev->vbasedev.fd, 346 VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd); 347 } 348 349 if (!ioeventfd->vfio) { 350 qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), 351 vfio_ioeventfd_handler, NULL, ioeventfd); 352 } 353 354 memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size, 355 true, ioeventfd->data, &ioeventfd->e); 356 trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr, 357 size, data, ioeventfd->vfio); 358 359 return ioeventfd; 360 } 361 362 static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev) 363 { 364 VFIOQuirk *quirk; 365 366 /* 367 * As long as the BAR is >= 256 bytes it will be aligned such that the 368 * lower byte is always zero. Filter out anything else, if it exists. 369 */ 370 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || 371 !vdev->bars[4].ioport || vdev->bars[4].region.size < 256) { 372 return; 373 } 374 375 quirk = vfio_quirk_alloc(1); 376 377 memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev, 378 "vfio-ati-3c3-quirk", 1); 379 memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, 380 3 /* offset 3 bytes from 0x3c0 */, quirk->mem); 381 382 QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks, 383 quirk, next); 384 385 trace_vfio_quirk_ati_3c3_probe(vdev->vbasedev.name); 386 } 387 388 /* 389 * Newer ATI/AMD devices, including HD5450 and HD7850, have a mirror to PCI 390 * config space through MMIO BAR2 at offset 0x4000. Nothing seems to access 391 * the MMIO space directly, but a window to this space is provided through 392 * I/O port BAR4. Offset 0x0 is the address register and offset 0x4 is the 393 * data register. When the address is programmed to a range of 0x4000-0x4fff 394 * PCI configuration space is available. Experimentation seems to indicate 395 * that read-only may be provided by hardware. 396 */ 397 static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr) 398 { 399 VFIOQuirk *quirk; 400 VFIOConfigWindowQuirk *window; 401 402 /* This windows doesn't seem to be used except by legacy VGA code */ 403 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || 404 !vdev->vga || nr != 4) { 405 return; 406 } 407 408 quirk = vfio_quirk_alloc(2); 409 window = quirk->data = g_malloc0(sizeof(*window) + 410 sizeof(VFIOConfigWindowMatch)); 411 window->vdev = vdev; 412 window->address_offset = 0; 413 window->data_offset = 4; 414 window->nr_matches = 1; 415 window->matches[0].match = 0x4000; 416 window->matches[0].mask = vdev->config_size - 1; 417 window->bar = nr; 418 window->addr_mem = &quirk->mem[0]; 419 window->data_mem = &quirk->mem[1]; 420 421 memory_region_init_io(window->addr_mem, OBJECT(vdev), 422 &vfio_generic_window_address_quirk, window, 423 "vfio-ati-bar4-window-address-quirk", 4); 424 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 425 window->address_offset, 426 window->addr_mem, 1); 427 428 memory_region_init_io(window->data_mem, OBJECT(vdev), 429 &vfio_generic_window_data_quirk, window, 430 "vfio-ati-bar4-window-data-quirk", 4); 431 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 432 window->data_offset, 433 window->data_mem, 1); 434 435 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 436 437 trace_vfio_quirk_ati_bar4_probe(vdev->vbasedev.name); 438 } 439 440 /* 441 * Trap the BAR2 MMIO mirror to config space as well. 442 */ 443 static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr) 444 { 445 VFIOQuirk *quirk; 446 VFIOConfigMirrorQuirk *mirror; 447 448 /* Only enable on newer devices where BAR2 is 64bit */ 449 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || 450 !vdev->vga || nr != 2 || !vdev->bars[2].mem64) { 451 return; 452 } 453 454 quirk = vfio_quirk_alloc(1); 455 mirror = quirk->data = g_malloc0(sizeof(*mirror)); 456 mirror->mem = quirk->mem; 457 mirror->vdev = vdev; 458 mirror->offset = 0x4000; 459 mirror->bar = nr; 460 461 memory_region_init_io(mirror->mem, OBJECT(vdev), 462 &vfio_generic_mirror_quirk, mirror, 463 "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE); 464 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 465 mirror->offset, mirror->mem, 1); 466 467 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 468 469 trace_vfio_quirk_ati_bar2_probe(vdev->vbasedev.name); 470 } 471 472 /* 473 * Older ATI/AMD cards like the X550 have a similar window to that above. 474 * I/O port BAR1 provides a window to a mirror of PCI config space located 475 * in BAR2 at offset 0xf00. We don't care to support such older cards, but 476 * note it for future reference. 477 */ 478 479 /* 480 * Nvidia has several different methods to get to config space, the 481 * nouveu project has several of these documented here: 482 * https://github.com/pathscale/envytools/tree/master/hwdocs 483 * 484 * The first quirk is actually not documented in envytools and is found 485 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]). This is an 486 * NV46 chipset. The backdoor uses the legacy VGA I/O ports to access 487 * the mirror of PCI config space found at BAR0 offset 0x1800. The access 488 * sequence first writes 0x338 to I/O port 0x3d4. The target offset is 489 * then written to 0x3d0. Finally 0x538 is written for a read and 0x738 490 * is written for a write to 0x3d4. The BAR0 offset is then accessible 491 * through 0x3d0. This quirk doesn't seem to be necessary on newer cards 492 * that use the I/O port BAR5 window but it doesn't hurt to leave it. 493 */ 494 typedef enum {NONE = 0, SELECT, WINDOW, READ, WRITE} VFIONvidia3d0State; 495 static const char *nv3d0_states[] = { "NONE", "SELECT", 496 "WINDOW", "READ", "WRITE" }; 497 498 typedef struct VFIONvidia3d0Quirk { 499 VFIOPCIDevice *vdev; 500 VFIONvidia3d0State state; 501 uint32_t offset; 502 } VFIONvidia3d0Quirk; 503 504 static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque, 505 hwaddr addr, unsigned size) 506 { 507 VFIONvidia3d0Quirk *quirk = opaque; 508 VFIOPCIDevice *vdev = quirk->vdev; 509 510 quirk->state = NONE; 511 512 return vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], 513 addr + 0x14, size); 514 } 515 516 static void vfio_nvidia_3d4_quirk_write(void *opaque, hwaddr addr, 517 uint64_t data, unsigned size) 518 { 519 VFIONvidia3d0Quirk *quirk = opaque; 520 VFIOPCIDevice *vdev = quirk->vdev; 521 VFIONvidia3d0State old_state = quirk->state; 522 523 quirk->state = NONE; 524 525 switch (data) { 526 case 0x338: 527 if (old_state == NONE) { 528 quirk->state = SELECT; 529 trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name, 530 nv3d0_states[quirk->state]); 531 } 532 break; 533 case 0x538: 534 if (old_state == WINDOW) { 535 quirk->state = READ; 536 trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name, 537 nv3d0_states[quirk->state]); 538 } 539 break; 540 case 0x738: 541 if (old_state == WINDOW) { 542 quirk->state = WRITE; 543 trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name, 544 nv3d0_states[quirk->state]); 545 } 546 break; 547 } 548 549 vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], 550 addr + 0x14, data, size); 551 } 552 553 static const MemoryRegionOps vfio_nvidia_3d4_quirk = { 554 .read = vfio_nvidia_3d4_quirk_read, 555 .write = vfio_nvidia_3d4_quirk_write, 556 .endianness = DEVICE_LITTLE_ENDIAN, 557 }; 558 559 static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque, 560 hwaddr addr, unsigned size) 561 { 562 VFIONvidia3d0Quirk *quirk = opaque; 563 VFIOPCIDevice *vdev = quirk->vdev; 564 VFIONvidia3d0State old_state = quirk->state; 565 uint64_t data = vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], 566 addr + 0x10, size); 567 568 quirk->state = NONE; 569 570 if (old_state == READ && 571 (quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) { 572 uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1); 573 574 data = vfio_pci_read_config(&vdev->pdev, offset, size); 575 trace_vfio_quirk_nvidia_3d0_read(vdev->vbasedev.name, 576 offset, size, data); 577 } 578 579 return data; 580 } 581 582 static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr, 583 uint64_t data, unsigned size) 584 { 585 VFIONvidia3d0Quirk *quirk = opaque; 586 VFIOPCIDevice *vdev = quirk->vdev; 587 VFIONvidia3d0State old_state = quirk->state; 588 589 quirk->state = NONE; 590 591 if (old_state == SELECT) { 592 quirk->offset = (uint32_t)data; 593 quirk->state = WINDOW; 594 trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name, 595 nv3d0_states[quirk->state]); 596 } else if (old_state == WRITE) { 597 if ((quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) { 598 uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1); 599 600 vfio_pci_write_config(&vdev->pdev, offset, data, size); 601 trace_vfio_quirk_nvidia_3d0_write(vdev->vbasedev.name, 602 offset, data, size); 603 return; 604 } 605 } 606 607 vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], 608 addr + 0x10, data, size); 609 } 610 611 static const MemoryRegionOps vfio_nvidia_3d0_quirk = { 612 .read = vfio_nvidia_3d0_quirk_read, 613 .write = vfio_nvidia_3d0_quirk_write, 614 .endianness = DEVICE_LITTLE_ENDIAN, 615 }; 616 617 static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev) 618 { 619 VFIOQuirk *quirk; 620 VFIONvidia3d0Quirk *data; 621 622 if (vdev->no_geforce_quirks || 623 !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) || 624 !vdev->bars[1].region.size) { 625 return; 626 } 627 628 quirk = vfio_quirk_alloc(2); 629 quirk->data = data = g_malloc0(sizeof(*data)); 630 data->vdev = vdev; 631 632 memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk, 633 data, "vfio-nvidia-3d4-quirk", 2); 634 memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, 635 0x14 /* 0x3c0 + 0x14 */, &quirk->mem[0]); 636 637 memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_nvidia_3d0_quirk, 638 data, "vfio-nvidia-3d0-quirk", 2); 639 memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, 640 0x10 /* 0x3c0 + 0x10 */, &quirk->mem[1]); 641 642 QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks, 643 quirk, next); 644 645 trace_vfio_quirk_nvidia_3d0_probe(vdev->vbasedev.name); 646 } 647 648 /* 649 * The second quirk is documented in envytools. The I/O port BAR5 is just 650 * a set of address/data ports to the MMIO BARs. The BAR we care about is 651 * again BAR0. This backdoor is apparently a bit newer than the one above 652 * so we need to not only trap 256 bytes @0x1800, but all of PCI config 653 * space, including extended space is available at the 4k @0x88000. 654 */ 655 typedef struct VFIONvidiaBAR5Quirk { 656 uint32_t master; 657 uint32_t enable; 658 MemoryRegion *addr_mem; 659 MemoryRegion *data_mem; 660 bool enabled; 661 VFIOConfigWindowQuirk window; /* last for match data */ 662 } VFIONvidiaBAR5Quirk; 663 664 static void vfio_nvidia_bar5_enable(VFIONvidiaBAR5Quirk *bar5) 665 { 666 VFIOPCIDevice *vdev = bar5->window.vdev; 667 668 if (((bar5->master & bar5->enable) & 0x1) == bar5->enabled) { 669 return; 670 } 671 672 bar5->enabled = !bar5->enabled; 673 trace_vfio_quirk_nvidia_bar5_state(vdev->vbasedev.name, 674 bar5->enabled ? "Enable" : "Disable"); 675 memory_region_set_enabled(bar5->addr_mem, bar5->enabled); 676 memory_region_set_enabled(bar5->data_mem, bar5->enabled); 677 } 678 679 static uint64_t vfio_nvidia_bar5_quirk_master_read(void *opaque, 680 hwaddr addr, unsigned size) 681 { 682 VFIONvidiaBAR5Quirk *bar5 = opaque; 683 VFIOPCIDevice *vdev = bar5->window.vdev; 684 685 return vfio_region_read(&vdev->bars[5].region, addr, size); 686 } 687 688 static void vfio_nvidia_bar5_quirk_master_write(void *opaque, hwaddr addr, 689 uint64_t data, unsigned size) 690 { 691 VFIONvidiaBAR5Quirk *bar5 = opaque; 692 VFIOPCIDevice *vdev = bar5->window.vdev; 693 694 vfio_region_write(&vdev->bars[5].region, addr, data, size); 695 696 bar5->master = data; 697 vfio_nvidia_bar5_enable(bar5); 698 } 699 700 static const MemoryRegionOps vfio_nvidia_bar5_quirk_master = { 701 .read = vfio_nvidia_bar5_quirk_master_read, 702 .write = vfio_nvidia_bar5_quirk_master_write, 703 .endianness = DEVICE_LITTLE_ENDIAN, 704 }; 705 706 static uint64_t vfio_nvidia_bar5_quirk_enable_read(void *opaque, 707 hwaddr addr, unsigned size) 708 { 709 VFIONvidiaBAR5Quirk *bar5 = opaque; 710 VFIOPCIDevice *vdev = bar5->window.vdev; 711 712 return vfio_region_read(&vdev->bars[5].region, addr + 4, size); 713 } 714 715 static void vfio_nvidia_bar5_quirk_enable_write(void *opaque, hwaddr addr, 716 uint64_t data, unsigned size) 717 { 718 VFIONvidiaBAR5Quirk *bar5 = opaque; 719 VFIOPCIDevice *vdev = bar5->window.vdev; 720 721 vfio_region_write(&vdev->bars[5].region, addr + 4, data, size); 722 723 bar5->enable = data; 724 vfio_nvidia_bar5_enable(bar5); 725 } 726 727 static const MemoryRegionOps vfio_nvidia_bar5_quirk_enable = { 728 .read = vfio_nvidia_bar5_quirk_enable_read, 729 .write = vfio_nvidia_bar5_quirk_enable_write, 730 .endianness = DEVICE_LITTLE_ENDIAN, 731 }; 732 733 static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr) 734 { 735 VFIOQuirk *quirk; 736 VFIONvidiaBAR5Quirk *bar5; 737 VFIOConfigWindowQuirk *window; 738 739 if (vdev->no_geforce_quirks || 740 !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) || 741 !vdev->vga || nr != 5 || !vdev->bars[5].ioport) { 742 return; 743 } 744 745 quirk = vfio_quirk_alloc(4); 746 bar5 = quirk->data = g_malloc0(sizeof(*bar5) + 747 (sizeof(VFIOConfigWindowMatch) * 2)); 748 window = &bar5->window; 749 750 window->vdev = vdev; 751 window->address_offset = 0x8; 752 window->data_offset = 0xc; 753 window->nr_matches = 2; 754 window->matches[0].match = 0x1800; 755 window->matches[0].mask = PCI_CONFIG_SPACE_SIZE - 1; 756 window->matches[1].match = 0x88000; 757 window->matches[1].mask = vdev->config_size - 1; 758 window->bar = nr; 759 window->addr_mem = bar5->addr_mem = &quirk->mem[0]; 760 window->data_mem = bar5->data_mem = &quirk->mem[1]; 761 762 memory_region_init_io(window->addr_mem, OBJECT(vdev), 763 &vfio_generic_window_address_quirk, window, 764 "vfio-nvidia-bar5-window-address-quirk", 4); 765 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 766 window->address_offset, 767 window->addr_mem, 1); 768 memory_region_set_enabled(window->addr_mem, false); 769 770 memory_region_init_io(window->data_mem, OBJECT(vdev), 771 &vfio_generic_window_data_quirk, window, 772 "vfio-nvidia-bar5-window-data-quirk", 4); 773 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 774 window->data_offset, 775 window->data_mem, 1); 776 memory_region_set_enabled(window->data_mem, false); 777 778 memory_region_init_io(&quirk->mem[2], OBJECT(vdev), 779 &vfio_nvidia_bar5_quirk_master, bar5, 780 "vfio-nvidia-bar5-master-quirk", 4); 781 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 782 0, &quirk->mem[2], 1); 783 784 memory_region_init_io(&quirk->mem[3], OBJECT(vdev), 785 &vfio_nvidia_bar5_quirk_enable, bar5, 786 "vfio-nvidia-bar5-enable-quirk", 4); 787 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 788 4, &quirk->mem[3], 1); 789 790 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 791 792 trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name); 793 } 794 795 typedef struct LastDataSet { 796 VFIOQuirk *quirk; 797 hwaddr addr; 798 uint64_t data; 799 unsigned size; 800 int hits; 801 int added; 802 } LastDataSet; 803 804 #define MAX_DYN_IOEVENTFD 10 805 #define HITS_FOR_IOEVENTFD 10 806 807 /* 808 * Finally, BAR0 itself. We want to redirect any accesses to either 809 * 0x1800 or 0x88000 through the PCI config space access functions. 810 */ 811 static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr, 812 uint64_t data, unsigned size) 813 { 814 VFIOConfigMirrorQuirk *mirror = opaque; 815 VFIOPCIDevice *vdev = mirror->vdev; 816 PCIDevice *pdev = &vdev->pdev; 817 LastDataSet *last = (LastDataSet *)&mirror->data; 818 819 vfio_generic_quirk_mirror_write(opaque, addr, data, size); 820 821 /* 822 * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the 823 * MSI capability ID register. Both the ID and next register are 824 * read-only, so we allow writes covering either of those to real hw. 825 */ 826 if ((pdev->cap_present & QEMU_PCI_CAP_MSI) && 827 vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) { 828 vfio_region_write(&vdev->bars[mirror->bar].region, 829 addr + mirror->offset, data, size); 830 trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name); 831 } 832 833 /* 834 * Automatically add an ioeventfd to handle any repeated write with the 835 * same data and size above the standard PCI config space header. This is 836 * primarily expected to accelerate the MSI-ACK behavior, such as noted 837 * above. Current hardware/drivers should trigger an ioeventfd at config 838 * offset 0x704 (region offset 0x88704), with data 0x0, size 4. 839 * 840 * The criteria of 10 successive hits is arbitrary but reliably adds the 841 * MSI-ACK region. Note that as some writes are bypassed via the ioeventfd, 842 * the remaining ones have a greater chance of being seen successively. 843 * To avoid the pathological case of burning up all of QEMU's open file 844 * handles, arbitrarily limit this algorithm from adding no more than 10 845 * ioeventfds, print an error if we would have added an 11th, and then 846 * stop counting. 847 */ 848 if (!vdev->no_kvm_ioeventfd && 849 addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) { 850 if (addr != last->addr || data != last->data || size != last->size) { 851 last->addr = addr; 852 last->data = data; 853 last->size = size; 854 last->hits = 1; 855 } else if (++last->hits >= HITS_FOR_IOEVENTFD) { 856 if (last->added < MAX_DYN_IOEVENTFD) { 857 VFIOIOEventFD *ioeventfd; 858 ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size, 859 data, &vdev->bars[mirror->bar].region, 860 mirror->offset + addr, true); 861 if (ioeventfd) { 862 VFIOQuirk *quirk = last->quirk; 863 864 QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next); 865 last->added++; 866 } 867 } else { 868 last->added++; 869 warn_report("NVIDIA ioeventfd queue full for %s, unable to " 870 "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", " 871 "size %u", vdev->vbasedev.name, addr, data, size); 872 } 873 } 874 } 875 } 876 877 static const MemoryRegionOps vfio_nvidia_mirror_quirk = { 878 .read = vfio_generic_quirk_mirror_read, 879 .write = vfio_nvidia_quirk_mirror_write, 880 .endianness = DEVICE_LITTLE_ENDIAN, 881 }; 882 883 static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk) 884 { 885 VFIOConfigMirrorQuirk *mirror = quirk->data; 886 LastDataSet *last = (LastDataSet *)&mirror->data; 887 888 last->addr = last->data = last->size = last->hits = last->added = 0; 889 890 vfio_drop_dynamic_eventfds(vdev, quirk); 891 } 892 893 static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr) 894 { 895 VFIOQuirk *quirk; 896 VFIOConfigMirrorQuirk *mirror; 897 LastDataSet *last; 898 899 if (vdev->no_geforce_quirks || 900 !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) || 901 !vfio_is_vga(vdev) || nr != 0) { 902 return; 903 } 904 905 quirk = vfio_quirk_alloc(1); 906 quirk->reset = vfio_nvidia_bar0_quirk_reset; 907 mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet)); 908 mirror->mem = quirk->mem; 909 mirror->vdev = vdev; 910 mirror->offset = 0x88000; 911 mirror->bar = nr; 912 last = (LastDataSet *)&mirror->data; 913 last->quirk = quirk; 914 915 memory_region_init_io(mirror->mem, OBJECT(vdev), 916 &vfio_nvidia_mirror_quirk, mirror, 917 "vfio-nvidia-bar0-88000-mirror-quirk", 918 vdev->config_size); 919 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 920 mirror->offset, mirror->mem, 1); 921 922 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 923 924 /* The 0x1800 offset mirror only seems to get used by legacy VGA */ 925 if (vdev->vga) { 926 quirk = vfio_quirk_alloc(1); 927 quirk->reset = vfio_nvidia_bar0_quirk_reset; 928 mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet)); 929 mirror->mem = quirk->mem; 930 mirror->vdev = vdev; 931 mirror->offset = 0x1800; 932 mirror->bar = nr; 933 last = (LastDataSet *)&mirror->data; 934 last->quirk = quirk; 935 936 memory_region_init_io(mirror->mem, OBJECT(vdev), 937 &vfio_nvidia_mirror_quirk, mirror, 938 "vfio-nvidia-bar0-1800-mirror-quirk", 939 PCI_CONFIG_SPACE_SIZE); 940 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 941 mirror->offset, mirror->mem, 1); 942 943 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 944 } 945 946 trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name); 947 } 948 949 /* 950 * TODO - Some Nvidia devices provide config access to their companion HDA 951 * device and even to their parent bridge via these config space mirrors. 952 * Add quirks for those regions. 953 */ 954 955 #define PCI_VENDOR_ID_REALTEK 0x10ec 956 957 /* 958 * RTL8168 devices have a backdoor that can access the MSI-X table. At BAR2 959 * offset 0x70 there is a dword data register, offset 0x74 is a dword address 960 * register. According to the Linux r8169 driver, the MSI-X table is addressed 961 * when the "type" portion of the address register is set to 0x1. This appears 962 * to be bits 16:30. Bit 31 is both a write indicator and some sort of 963 * "address latched" indicator. Bits 12:15 are a mask field, which we can 964 * ignore because the MSI-X table should always be accessed as a dword (full 965 * mask). Bits 0:11 is offset within the type. 966 * 967 * Example trace: 968 * 969 * Read from MSI-X table offset 0 970 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr 971 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch 972 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data 973 * 974 * Write 0xfee00000 to MSI-X table offset 0 975 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data 976 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write 977 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete 978 */ 979 typedef struct VFIOrtl8168Quirk { 980 VFIOPCIDevice *vdev; 981 uint32_t addr; 982 uint32_t data; 983 bool enabled; 984 } VFIOrtl8168Quirk; 985 986 static uint64_t vfio_rtl8168_quirk_address_read(void *opaque, 987 hwaddr addr, unsigned size) 988 { 989 VFIOrtl8168Quirk *rtl = opaque; 990 VFIOPCIDevice *vdev = rtl->vdev; 991 uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size); 992 993 if (rtl->enabled) { 994 data = rtl->addr ^ 0x80000000U; /* latch/complete */ 995 trace_vfio_quirk_rtl8168_fake_latch(vdev->vbasedev.name, data); 996 } 997 998 return data; 999 } 1000 1001 static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr, 1002 uint64_t data, unsigned size) 1003 { 1004 VFIOrtl8168Quirk *rtl = opaque; 1005 VFIOPCIDevice *vdev = rtl->vdev; 1006 1007 rtl->enabled = false; 1008 1009 if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */ 1010 rtl->enabled = true; 1011 rtl->addr = (uint32_t)data; 1012 1013 if (data & 0x80000000U) { /* Do write */ 1014 if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) { 1015 hwaddr offset = data & 0xfff; 1016 uint64_t val = rtl->data; 1017 1018 trace_vfio_quirk_rtl8168_msix_write(vdev->vbasedev.name, 1019 (uint16_t)offset, val); 1020 1021 /* Write to the proper guest MSI-X table instead */ 1022 memory_region_dispatch_write(&vdev->pdev.msix_table_mmio, 1023 offset, val, 1024 size_memop(size) | MO_LE, 1025 MEMTXATTRS_UNSPECIFIED); 1026 } 1027 return; /* Do not write guest MSI-X data to hardware */ 1028 } 1029 } 1030 1031 vfio_region_write(&vdev->bars[2].region, addr + 0x74, data, size); 1032 } 1033 1034 static const MemoryRegionOps vfio_rtl_address_quirk = { 1035 .read = vfio_rtl8168_quirk_address_read, 1036 .write = vfio_rtl8168_quirk_address_write, 1037 .valid = { 1038 .min_access_size = 4, 1039 .max_access_size = 4, 1040 .unaligned = false, 1041 }, 1042 .endianness = DEVICE_LITTLE_ENDIAN, 1043 }; 1044 1045 static uint64_t vfio_rtl8168_quirk_data_read(void *opaque, 1046 hwaddr addr, unsigned size) 1047 { 1048 VFIOrtl8168Quirk *rtl = opaque; 1049 VFIOPCIDevice *vdev = rtl->vdev; 1050 uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x70, size); 1051 1052 if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) { 1053 hwaddr offset = rtl->addr & 0xfff; 1054 memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, offset, 1055 &data, size_memop(size) | MO_LE, 1056 MEMTXATTRS_UNSPECIFIED); 1057 trace_vfio_quirk_rtl8168_msix_read(vdev->vbasedev.name, offset, data); 1058 } 1059 1060 return data; 1061 } 1062 1063 static void vfio_rtl8168_quirk_data_write(void *opaque, hwaddr addr, 1064 uint64_t data, unsigned size) 1065 { 1066 VFIOrtl8168Quirk *rtl = opaque; 1067 VFIOPCIDevice *vdev = rtl->vdev; 1068 1069 rtl->data = (uint32_t)data; 1070 1071 vfio_region_write(&vdev->bars[2].region, addr + 0x70, data, size); 1072 } 1073 1074 static const MemoryRegionOps vfio_rtl_data_quirk = { 1075 .read = vfio_rtl8168_quirk_data_read, 1076 .write = vfio_rtl8168_quirk_data_write, 1077 .valid = { 1078 .min_access_size = 4, 1079 .max_access_size = 4, 1080 .unaligned = false, 1081 }, 1082 .endianness = DEVICE_LITTLE_ENDIAN, 1083 }; 1084 1085 static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr) 1086 { 1087 VFIOQuirk *quirk; 1088 VFIOrtl8168Quirk *rtl; 1089 1090 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_REALTEK, 0x8168) || nr != 2) { 1091 return; 1092 } 1093 1094 quirk = vfio_quirk_alloc(2); 1095 quirk->data = rtl = g_malloc0(sizeof(*rtl)); 1096 rtl->vdev = vdev; 1097 1098 memory_region_init_io(&quirk->mem[0], OBJECT(vdev), 1099 &vfio_rtl_address_quirk, rtl, 1100 "vfio-rtl8168-window-address-quirk", 4); 1101 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 1102 0x74, &quirk->mem[0], 1); 1103 1104 memory_region_init_io(&quirk->mem[1], OBJECT(vdev), 1105 &vfio_rtl_data_quirk, rtl, 1106 "vfio-rtl8168-window-data-quirk", 4); 1107 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 1108 0x70, &quirk->mem[1], 1); 1109 1110 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 1111 1112 trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name); 1113 } 1114 1115 #define IGD_ASLS 0xfc /* ASL Storage Register */ 1116 1117 /* 1118 * The OpRegion includes the Video BIOS Table, which seems important for 1119 * telling the driver what sort of outputs it has. Without this, the device 1120 * may work in the guest, but we may not get output. This also requires BIOS 1121 * support to reserve and populate a section of guest memory sufficient for 1122 * the table and to write the base address of that memory to the ASLS register 1123 * of the IGD device. 1124 */ 1125 bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, 1126 struct vfio_region_info *info, Error **errp) 1127 { 1128 int ret; 1129 1130 vdev->igd_opregion = g_malloc0(info->size); 1131 ret = pread(vdev->vbasedev.fd, vdev->igd_opregion, 1132 info->size, info->offset); 1133 if (ret != info->size) { 1134 error_setg(errp, "failed to read IGD OpRegion"); 1135 g_free(vdev->igd_opregion); 1136 vdev->igd_opregion = NULL; 1137 return false; 1138 } 1139 1140 /* 1141 * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to 1142 * allocate 32bit reserved memory for, copy these contents into, and write 1143 * the reserved memory base address to the device ASLS register at 0xFC. 1144 * Alignment of this reserved region seems flexible, but using a 4k page 1145 * alignment seems to work well. This interface assumes a single IGD 1146 * device, which may be at VM address 00:02.0 in legacy mode or another 1147 * address in UPT mode. 1148 * 1149 * NB, there may be future use cases discovered where the VM should have 1150 * direct interaction with the host OpRegion, in which case the write to 1151 * the ASLS register would trigger MemoryRegion setup to enable that. 1152 */ 1153 fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion", 1154 vdev->igd_opregion, info->size); 1155 1156 trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name); 1157 1158 pci_set_long(vdev->pdev.config + IGD_ASLS, 0); 1159 pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0); 1160 pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0); 1161 1162 return true; 1163 } 1164 1165 /* 1166 * Common quirk probe entry points. 1167 */ 1168 void vfio_vga_quirk_setup(VFIOPCIDevice *vdev) 1169 { 1170 vfio_vga_probe_ati_3c3_quirk(vdev); 1171 vfio_vga_probe_nvidia_3d0_quirk(vdev); 1172 } 1173 1174 void vfio_vga_quirk_exit(VFIOPCIDevice *vdev) 1175 { 1176 VFIOQuirk *quirk; 1177 int i, j; 1178 1179 for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) { 1180 QLIST_FOREACH(quirk, &vdev->vga->region[i].quirks, next) { 1181 for (j = 0; j < quirk->nr_mem; j++) { 1182 memory_region_del_subregion(&vdev->vga->region[i].mem, 1183 &quirk->mem[j]); 1184 } 1185 } 1186 } 1187 } 1188 1189 void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev) 1190 { 1191 int i, j; 1192 1193 for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) { 1194 while (!QLIST_EMPTY(&vdev->vga->region[i].quirks)) { 1195 VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga->region[i].quirks); 1196 QLIST_REMOVE(quirk, next); 1197 for (j = 0; j < quirk->nr_mem; j++) { 1198 object_unparent(OBJECT(&quirk->mem[j])); 1199 } 1200 g_free(quirk->mem); 1201 g_free(quirk->data); 1202 g_free(quirk); 1203 } 1204 } 1205 } 1206 1207 void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) 1208 { 1209 vfio_probe_ati_bar4_quirk(vdev, nr); 1210 vfio_probe_ati_bar2_quirk(vdev, nr); 1211 vfio_probe_nvidia_bar5_quirk(vdev, nr); 1212 vfio_probe_nvidia_bar0_quirk(vdev, nr); 1213 vfio_probe_rtl8168_bar2_quirk(vdev, nr); 1214 #ifdef CONFIG_VFIO_IGD 1215 vfio_probe_igd_bar0_quirk(vdev, nr); 1216 vfio_probe_igd_bar4_quirk(vdev, nr); 1217 #endif 1218 } 1219 1220 void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr) 1221 { 1222 VFIOBAR *bar = &vdev->bars[nr]; 1223 VFIOQuirk *quirk; 1224 int i; 1225 1226 QLIST_FOREACH(quirk, &bar->quirks, next) { 1227 while (!QLIST_EMPTY(&quirk->ioeventfds)) { 1228 vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds)); 1229 } 1230 1231 for (i = 0; i < quirk->nr_mem; i++) { 1232 memory_region_del_subregion(bar->region.mem, &quirk->mem[i]); 1233 } 1234 } 1235 } 1236 1237 void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr) 1238 { 1239 VFIOBAR *bar = &vdev->bars[nr]; 1240 int i; 1241 1242 while (!QLIST_EMPTY(&bar->quirks)) { 1243 VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks); 1244 QLIST_REMOVE(quirk, next); 1245 for (i = 0; i < quirk->nr_mem; i++) { 1246 object_unparent(OBJECT(&quirk->mem[i])); 1247 } 1248 g_free(quirk->mem); 1249 g_free(quirk->data); 1250 g_free(quirk); 1251 } 1252 } 1253 1254 /* 1255 * Reset quirks 1256 */ 1257 void vfio_quirk_reset(VFIOPCIDevice *vdev) 1258 { 1259 int i; 1260 1261 for (i = 0; i < PCI_ROM_SLOT; i++) { 1262 VFIOQuirk *quirk; 1263 VFIOBAR *bar = &vdev->bars[i]; 1264 1265 QLIST_FOREACH(quirk, &bar->quirks, next) { 1266 if (quirk->reset) { 1267 quirk->reset(vdev, quirk); 1268 } 1269 } 1270 } 1271 } 1272 1273 /* 1274 * AMD Radeon PCI config reset, based on Linux: 1275 * drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running() 1276 * drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset 1277 * drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc() 1278 * drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock() 1279 * IDs: include/drm/drm_pciids.h 1280 * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0 1281 * 1282 * Bonaire and Hawaii GPUs do not respond to a bus reset. This is a bug in the 1283 * hardware that should be fixed on future ASICs. The symptom of this is that 1284 * once the accerlated driver loads, Windows guests will bsod on subsequent 1285 * attmpts to load the driver, such as after VM reset or shutdown/restart. To 1286 * work around this, we do an AMD specific PCI config reset, followed by an SMC 1287 * reset. The PCI config reset only works if SMC firmware is running, so we 1288 * have a dependency on the state of the device as to whether this reset will 1289 * be effective. There are still cases where we won't be able to kick the 1290 * device into working, but this greatly improves the usability overall. The 1291 * config reset magic is relatively common on AMD GPUs, but the setup and SMC 1292 * poking is largely ASIC specific. 1293 */ 1294 static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev) 1295 { 1296 uint32_t clk, pc_c; 1297 1298 /* 1299 * Registers 200h and 204h are index and data registers for accessing 1300 * indirect configuration registers within the device. 1301 */ 1302 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4); 1303 clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 1304 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4); 1305 pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 1306 1307 return (!(clk & 1) && (0x20100 <= pc_c)); 1308 } 1309 1310 /* 1311 * The scope of a config reset is controlled by a mode bit in the misc register 1312 * and a fuse, exposed as a bit in another register. The fuse is the default 1313 * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the formula 1314 * scope = !(misc ^ fuse), where the resulting scope is defined the same as 1315 * the fuse. A truth table therefore tells us that if misc == fuse, we need 1316 * to flip the value of the bit in the misc register. 1317 */ 1318 static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev) 1319 { 1320 uint32_t misc, fuse; 1321 bool a, b; 1322 1323 vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4); 1324 fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 1325 b = fuse & 64; 1326 1327 vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4); 1328 misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 1329 a = misc & 2; 1330 1331 if (a == b) { 1332 vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4); 1333 vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */ 1334 } 1335 } 1336 1337 static int vfio_radeon_reset(VFIOPCIDevice *vdev) 1338 { 1339 PCIDevice *pdev = &vdev->pdev; 1340 int i, ret = 0; 1341 uint32_t data; 1342 1343 /* Defer to a kernel implemented reset */ 1344 if (vdev->vbasedev.reset_works) { 1345 trace_vfio_quirk_ati_bonaire_reset_skipped(vdev->vbasedev.name); 1346 return -ENODEV; 1347 } 1348 1349 /* Enable only memory BAR access */ 1350 vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2); 1351 1352 /* Reset only works if SMC firmware is loaded and running */ 1353 if (!vfio_radeon_smc_is_running(vdev)) { 1354 ret = -EINVAL; 1355 trace_vfio_quirk_ati_bonaire_reset_no_smc(vdev->vbasedev.name); 1356 goto out; 1357 } 1358 1359 /* Make sure only the GFX function is reset */ 1360 vfio_radeon_set_gfx_only_reset(vdev); 1361 1362 /* AMD PCI config reset */ 1363 vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4); 1364 usleep(100); 1365 1366 /* Read back the memory size to make sure we're out of reset */ 1367 for (i = 0; i < 100000; i++) { 1368 if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) { 1369 goto reset_smc; 1370 } 1371 usleep(1); 1372 } 1373 1374 trace_vfio_quirk_ati_bonaire_reset_timeout(vdev->vbasedev.name); 1375 1376 reset_smc: 1377 /* Reset SMC */ 1378 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4); 1379 data = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 1380 data |= 1; 1381 vfio_region_write(&vdev->bars[5].region, 0x204, data, 4); 1382 1383 /* Disable SMC clock */ 1384 vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4); 1385 data = vfio_region_read(&vdev->bars[5].region, 0x204, 4); 1386 data |= 1; 1387 vfio_region_write(&vdev->bars[5].region, 0x204, data, 4); 1388 1389 trace_vfio_quirk_ati_bonaire_reset_done(vdev->vbasedev.name); 1390 1391 out: 1392 /* Restore PCI command register */ 1393 vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2); 1394 1395 return ret; 1396 } 1397 1398 void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev) 1399 { 1400 switch (vdev->vendor_id) { 1401 case 0x1002: 1402 switch (vdev->device_id) { 1403 /* Bonaire */ 1404 case 0x6649: /* Bonaire [FirePro W5100] */ 1405 case 0x6650: 1406 case 0x6651: 1407 case 0x6658: /* Bonaire XTX [Radeon R7 260X] */ 1408 case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */ 1409 case 0x665d: /* Bonaire [Radeon R7 200 Series] */ 1410 /* Hawaii */ 1411 case 0x67A0: /* Hawaii XT GL [FirePro W9100] */ 1412 case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */ 1413 case 0x67A2: 1414 case 0x67A8: 1415 case 0x67A9: 1416 case 0x67AA: 1417 case 0x67B0: /* Hawaii XT [Radeon R9 290X] */ 1418 case 0x67B1: /* Hawaii PRO [Radeon R9 290] */ 1419 case 0x67B8: 1420 case 0x67B9: 1421 case 0x67BA: 1422 case 0x67BE: 1423 vdev->resetfn = vfio_radeon_reset; 1424 trace_vfio_quirk_ati_bonaire_reset(vdev->vbasedev.name); 1425 break; 1426 } 1427 break; 1428 } 1429 } 1430 1431 /* 1432 * The NVIDIA GPUDirect P2P Vendor capability allows the user to specify 1433 * devices as a member of a clique. Devices within the same clique ID 1434 * are capable of direct P2P. It's the user's responsibility that this 1435 * is correct. The spec says that this may reside at any unused config 1436 * offset, but reserves and recommends hypervisors place this at C8h. 1437 * The spec also states that the hypervisor should place this capability 1438 * at the end of the capability list, thus next is defined as 0h. 1439 * 1440 * +----------------+----------------+----------------+----------------+ 1441 * | sig 7:0 ('P') | vndr len (8h) | next (0h) | cap id (9h) | 1442 * +----------------+----------------+----------------+----------------+ 1443 * | rsvd 15:7(0h),id 6:3,ver 2:0(0h)| sig 23:8 ('P2') | 1444 * +---------------------------------+---------------------------------+ 1445 * 1446 * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf 1447 * 1448 * Specification for Turning and later GPU architectures: 1449 * https://lists.gnu.org/archive/html/qemu-devel/2023-06/pdf142OR4O4c2.pdf 1450 */ 1451 static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v, 1452 const char *name, void *opaque, 1453 Error **errp) 1454 { 1455 const Property *prop = opaque; 1456 uint8_t *ptr = object_field_prop_ptr(obj, prop); 1457 1458 visit_type_uint8(v, name, ptr, errp); 1459 } 1460 1461 static void set_nv_gpudirect_clique_id(Object *obj, Visitor *v, 1462 const char *name, void *opaque, 1463 Error **errp) 1464 { 1465 const Property *prop = opaque; 1466 uint8_t value, *ptr = object_field_prop_ptr(obj, prop); 1467 1468 if (!visit_type_uint8(v, name, &value, errp)) { 1469 return; 1470 } 1471 1472 if (value & ~0xF) { 1473 error_setg(errp, "Property %s: valid range 0-15", name); 1474 return; 1475 } 1476 1477 *ptr = value; 1478 } 1479 1480 const PropertyInfo qdev_prop_nv_gpudirect_clique = { 1481 .name = "uint4", 1482 .description = "NVIDIA GPUDirect Clique ID (0 - 15)", 1483 .get = get_nv_gpudirect_clique_id, 1484 .set = set_nv_gpudirect_clique_id, 1485 }; 1486 1487 static bool is_valid_std_cap_offset(uint8_t pos) 1488 { 1489 return (pos >= PCI_STD_HEADER_SIZEOF && 1490 pos <= (PCI_CFG_SPACE_SIZE - PCI_CAP_SIZEOF)); 1491 } 1492 1493 static bool vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp) 1494 { 1495 ERRP_GUARD(); 1496 PCIDevice *pdev = &vdev->pdev; 1497 int ret, pos; 1498 bool c8_conflict = false, d4_conflict = false; 1499 uint8_t tmp; 1500 1501 if (vdev->nv_gpudirect_clique == 0xFF) { 1502 return true; 1503 } 1504 1505 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) { 1506 error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid device vendor"); 1507 return false; 1508 } 1509 1510 if (pci_get_byte(pdev->config + PCI_CLASS_DEVICE + 1) != 1511 PCI_BASE_CLASS_DISPLAY) { 1512 error_setg(errp, "NVIDIA GPUDirect Clique ID: unsupported PCI class"); 1513 return false; 1514 } 1515 1516 /* 1517 * Per the updated specification above, it's recommended to use offset 1518 * D4h for Turing and later GPU architectures due to a conflict of the 1519 * MSI-X capability at C8h. We don't know how to determine the GPU 1520 * architecture, instead we walk the capability chain to mark conflicts 1521 * and choose one or error based on the result. 1522 * 1523 * NB. Cap list head in pdev->config is already cleared, read from device. 1524 */ 1525 ret = pread(vdev->vbasedev.fd, &tmp, 1, 1526 vdev->config_offset + PCI_CAPABILITY_LIST); 1527 if (ret != 1 || !is_valid_std_cap_offset(tmp)) { 1528 error_setg(errp, "NVIDIA GPUDirect Clique ID: error getting cap list"); 1529 return false; 1530 } 1531 1532 do { 1533 if (tmp == 0xC8) { 1534 c8_conflict = true; 1535 } else if (tmp == 0xD4) { 1536 d4_conflict = true; 1537 } 1538 tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT]; 1539 } while (is_valid_std_cap_offset(tmp)); 1540 1541 if (!c8_conflict) { 1542 pos = 0xC8; 1543 } else if (!d4_conflict) { 1544 pos = 0xD4; 1545 } else { 1546 error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid config space"); 1547 return false; 1548 } 1549 1550 ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp); 1551 if (ret < 0) { 1552 error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: "); 1553 return false; 1554 } 1555 1556 memset(vdev->emulated_config_bits + pos, 0xFF, 8); 1557 pos += PCI_CAP_FLAGS; 1558 pci_set_byte(pdev->config + pos++, 8); 1559 pci_set_byte(pdev->config + pos++, 'P'); 1560 pci_set_byte(pdev->config + pos++, '2'); 1561 pci_set_byte(pdev->config + pos++, 'P'); 1562 pci_set_byte(pdev->config + pos++, vdev->nv_gpudirect_clique << 3); 1563 pci_set_byte(pdev->config + pos, 0); 1564 1565 return true; 1566 } 1567 1568 /* 1569 * The VMD endpoint provides a real PCIe domain to the guest and the guest 1570 * kernel performs enumeration of the VMD sub-device domain. Guest transactions 1571 * to VMD sub-devices go through MMU translation from guest addresses to 1572 * physical addresses. When MMIO goes to an endpoint after being translated to 1573 * physical addresses, the bridge rejects the transaction because the window 1574 * has been programmed with guest addresses. 1575 * 1576 * VMD can use the Host Physical Address in order to correctly program the 1577 * bridge windows in its PCIe domain. VMD device 28C0 has HPA shadow registers 1578 * located at offset 0x2000 in MEMBAR2 (BAR 4). This quirk provides the HPA 1579 * shadow registers in a vendor-specific capability register for devices 1580 * without native support. The position of 0xE8-0xFF is in the reserved range 1581 * of the VMD device capability space following the Power Management 1582 * Capability. 1583 */ 1584 #define VMD_SHADOW_CAP_VER 1 1585 #define VMD_SHADOW_CAP_LEN 24 1586 static bool vfio_add_vmd_shadow_cap(VFIOPCIDevice *vdev, Error **errp) 1587 { 1588 ERRP_GUARD(); 1589 uint8_t membar_phys[16]; 1590 int ret, pos = 0xE8; 1591 1592 if (!(vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x201D) || 1593 vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x467F) || 1594 vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x4C3D) || 1595 vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x9A0B))) { 1596 return true; 1597 } 1598 1599 ret = pread(vdev->vbasedev.fd, membar_phys, 16, 1600 vdev->config_offset + PCI_BASE_ADDRESS_2); 1601 if (ret != 16) { 1602 error_report("VMD %s cannot read MEMBARs (%d)", 1603 vdev->vbasedev.name, ret); 1604 return false; 1605 } 1606 1607 ret = pci_add_capability(&vdev->pdev, PCI_CAP_ID_VNDR, pos, 1608 VMD_SHADOW_CAP_LEN, errp); 1609 if (ret < 0) { 1610 error_prepend(errp, "Failed to add VMD MEMBAR Shadow cap: "); 1611 return false; 1612 } 1613 1614 memset(vdev->emulated_config_bits + pos, 0xFF, VMD_SHADOW_CAP_LEN); 1615 pos += PCI_CAP_FLAGS; 1616 pci_set_byte(vdev->pdev.config + pos++, VMD_SHADOW_CAP_LEN); 1617 pci_set_byte(vdev->pdev.config + pos++, VMD_SHADOW_CAP_VER); 1618 pci_set_long(vdev->pdev.config + pos, 0x53484457); /* SHDW */ 1619 memcpy(vdev->pdev.config + pos + 4, membar_phys, 16); 1620 1621 return true; 1622 } 1623 1624 bool vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp) 1625 { 1626 if (!vfio_add_nv_gpudirect_cap(vdev, errp)) { 1627 return false; 1628 } 1629 1630 if (!vfio_add_vmd_shadow_cap(vdev, errp)) { 1631 return false; 1632 } 1633 1634 return true; 1635 } 1636