/*
 * device quirks for PCI devices
 *
 * Copyright Red Hat, Inc. 2012-2015
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include CONFIG_DEVICES
#include "exec/memop.h"
#include "qemu/units.h"
#include "qemu/log.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/module.h"
#include "qemu/range.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include <sys/ioctl.h>
#include "hw/nvram/fw_cfg.h"
#include "hw/qdev-properties.h"
#include "pci.h"
#include "pci-quirks.h"
#include "trace.h"

/*
 * List of device ids/vendor ids for which to disable
 * option rom loading.  This avoids guest hangs during rom
 * execution, as noticed with the BCM 57810 card, for lack of a
 * better way to handle such issues.
 * The user can still override by specifying a romfile or
 * rombar=1.
 * Please see https://bugs.launchpad.net/qemu/+bug/1284874
 * for an analysis of the 57810 card hang.
When adding 40 * a new vendor id/device id combination below, please also add 41 * your card/environment details and information that could 42 * help in debugging to the bug tracking this issue 43 */ 44 static const struct { 45 uint32_t vendor; 46 uint32_t device; 47 } rom_denylist[] = { 48 { 0x14e4, 0x168e }, /* Broadcom BCM 57810 */ 49 }; 50 51 bool vfio_opt_rom_in_denylist(VFIOPCIDevice *vdev) 52 { 53 int i; 54 55 for (i = 0 ; i < ARRAY_SIZE(rom_denylist); i++) { 56 if (vfio_pci_is(vdev, rom_denylist[i].vendor, rom_denylist[i].device)) { 57 trace_vfio_quirk_rom_in_denylist(vdev->vbasedev.name, 58 rom_denylist[i].vendor, 59 rom_denylist[i].device); 60 return true; 61 } 62 } 63 return false; 64 } 65 66 /* 67 * Device specific region quirks (mostly backdoors to PCI config space) 68 */ 69 70 static uint64_t vfio_generic_window_quirk_address_read(void *opaque, 71 hwaddr addr, 72 unsigned size) 73 { 74 VFIOConfigWindowQuirk *window = opaque; 75 VFIOPCIDevice *vdev = window->vdev; 76 77 return vfio_region_read(&vdev->bars[window->bar].region, 78 addr + window->address_offset, size); 79 } 80 81 static void vfio_generic_window_quirk_address_write(void *opaque, hwaddr addr, 82 uint64_t data, 83 unsigned size) 84 { 85 VFIOConfigWindowQuirk *window = opaque; 86 VFIOPCIDevice *vdev = window->vdev; 87 int i; 88 89 window->window_enabled = false; 90 91 vfio_region_write(&vdev->bars[window->bar].region, 92 addr + window->address_offset, data, size); 93 94 for (i = 0; i < window->nr_matches; i++) { 95 if ((data & ~window->matches[i].mask) == window->matches[i].match) { 96 window->window_enabled = true; 97 window->address_val = data & window->matches[i].mask; 98 trace_vfio_quirk_generic_window_address_write(vdev->vbasedev.name, 99 memory_region_name(window->addr_mem), data); 100 break; 101 } 102 } 103 } 104 105 const MemoryRegionOps vfio_generic_window_address_quirk = { 106 .read = vfio_generic_window_quirk_address_read, 107 .write = vfio_generic_window_quirk_address_write, 108 
.endianness = DEVICE_LITTLE_ENDIAN, 109 }; 110 111 static uint64_t vfio_generic_window_quirk_data_read(void *opaque, 112 hwaddr addr, unsigned size) 113 { 114 VFIOConfigWindowQuirk *window = opaque; 115 VFIOPCIDevice *vdev = window->vdev; 116 uint64_t data; 117 118 /* Always read data reg, discard if window enabled */ 119 data = vfio_region_read(&vdev->bars[window->bar].region, 120 addr + window->data_offset, size); 121 122 if (window->window_enabled) { 123 data = vfio_pci_read_config(&vdev->pdev, window->address_val, size); 124 trace_vfio_quirk_generic_window_data_read(vdev->vbasedev.name, 125 memory_region_name(window->data_mem), data); 126 } 127 128 return data; 129 } 130 131 static void vfio_generic_window_quirk_data_write(void *opaque, hwaddr addr, 132 uint64_t data, unsigned size) 133 { 134 VFIOConfigWindowQuirk *window = opaque; 135 VFIOPCIDevice *vdev = window->vdev; 136 137 if (window->window_enabled) { 138 vfio_pci_write_config(&vdev->pdev, window->address_val, data, size); 139 trace_vfio_quirk_generic_window_data_write(vdev->vbasedev.name, 140 memory_region_name(window->data_mem), data); 141 return; 142 } 143 144 vfio_region_write(&vdev->bars[window->bar].region, 145 addr + window->data_offset, data, size); 146 } 147 148 const MemoryRegionOps vfio_generic_window_data_quirk = { 149 .read = vfio_generic_window_quirk_data_read, 150 .write = vfio_generic_window_quirk_data_write, 151 .endianness = DEVICE_LITTLE_ENDIAN, 152 }; 153 154 static uint64_t vfio_generic_quirk_mirror_read(void *opaque, 155 hwaddr addr, unsigned size) 156 { 157 VFIOConfigMirrorQuirk *mirror = opaque; 158 VFIOPCIDevice *vdev = mirror->vdev; 159 uint64_t data; 160 161 /* Read and discard in case the hardware cares */ 162 (void)vfio_region_read(&vdev->bars[mirror->bar].region, 163 addr + mirror->offset, size); 164 165 addr += mirror->config_offset; 166 data = vfio_pci_read_config(&vdev->pdev, addr, size); 167 trace_vfio_quirk_generic_mirror_read(vdev->vbasedev.name, 168 
memory_region_name(mirror->mem), 169 addr, data); 170 return data; 171 } 172 173 static void vfio_generic_quirk_mirror_write(void *opaque, hwaddr addr, 174 uint64_t data, unsigned size) 175 { 176 VFIOConfigMirrorQuirk *mirror = opaque; 177 VFIOPCIDevice *vdev = mirror->vdev; 178 179 addr += mirror->config_offset; 180 vfio_pci_write_config(&vdev->pdev, addr, data, size); 181 trace_vfio_quirk_generic_mirror_write(vdev->vbasedev.name, 182 memory_region_name(mirror->mem), 183 addr, data); 184 } 185 186 const MemoryRegionOps vfio_generic_mirror_quirk = { 187 .read = vfio_generic_quirk_mirror_read, 188 .write = vfio_generic_quirk_mirror_write, 189 .endianness = DEVICE_LITTLE_ENDIAN, 190 }; 191 192 /* Is range1 fully contained within range2? */ 193 static bool vfio_range_contained(uint64_t first1, uint64_t len1, 194 uint64_t first2, uint64_t len2) { 195 return (first1 >= first2 && first1 + len1 <= first2 + len2); 196 } 197 198 #define PCI_VENDOR_ID_ATI 0x1002 199 200 /* 201 * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR 202 * through VGA register 0x3c3. On newer cards, the I/O port BAR is always 203 * BAR4 (older cards like the X550 used BAR1, but we don't care to support 204 * those). Note that on bare metal, a read of 0x3c3 doesn't always return the 205 * I/O port BAR address. Originally this was coded to return the virtual BAR 206 * address only if the physical register read returns the actual BAR address, 207 * but users have reported greater success if we return the virtual address 208 * unconditionally. 
 */
static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
                                        hwaddr addr, unsigned size)
{
    VFIOPCIDevice *vdev = opaque;
    /* Return the high byte of the virtual I/O port BAR4 address */
    uint64_t data = vfio_pci_read_config(&vdev->pdev,
                                         PCI_BASE_ADDRESS_4 + 1, size);

    trace_vfio_quirk_ati_3c3_read(vdev->vbasedev.name, data);

    return data;
}

/* The 0x3c3 mirror is read-only for this quirk; log and drop guest writes */
static void vfio_ati_3c3_quirk_write(void *opaque, hwaddr addr,
                                     uint64_t data, unsigned size)
{
    qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid access\n", __func__);
}

static const MemoryRegionOps vfio_ati_3c3_quirk = {
    .read = vfio_ati_3c3_quirk_read,
    .write = vfio_ati_3c3_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * Allocate a VFIOQuirk with an empty ioeventfd list and @nr_mem
 * zero-initialized MemoryRegions.  Callers initialize the regions and
 * insert the quirk into the appropriate BAR or VGA region quirk list.
 */
VFIOQuirk *vfio_quirk_alloc(int nr_mem)
{
    VFIOQuirk *quirk = g_new0(VFIOQuirk, 1);
    QLIST_INIT(&quirk->ioeventfds);
    quirk->mem = g_new0(MemoryRegion, nr_mem);
    quirk->nr_mem = nr_mem;

    return quirk;
}

/*
 * Tear down a single ioeventfd: unlink it from its quirk list, remove
 * the KVM eventfd binding from the MemoryRegion, then release whichever
 * backend was in use — the in-kernel vfio ioeventfd, or the QEMU
 * userspace fd handler — before cleaning up the notifier itself.
 */
static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
{
    QLIST_REMOVE(ioeventfd, next);
    memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
                              true, ioeventfd->data, &ioeventfd->e);

    if (ioeventfd->vfio) {
        struct vfio_device_ioeventfd vfio_ioeventfd;

        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
        vfio_ioeventfd.flags = ioeventfd->size;
        vfio_ioeventfd.data = ioeventfd->data;
        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
                                ioeventfd->region_addr;
        /* fd = -1 requests removal of the in-kernel binding */
        vfio_ioeventfd.fd = -1;

        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd)) {
            error_report("Failed to remove vfio ioeventfd for %s+0x%"
                         HWADDR_PRIx"[%d]:0x%"PRIx64" (%m)",
                         memory_region_name(ioeventfd->mr), ioeventfd->addr,
                         ioeventfd->size, ioeventfd->data);
        }
    } else {
        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
                            NULL, NULL, NULL);
    }

    event_notifier_cleanup(&ioeventfd->e);
trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr), 273 (uint64_t)ioeventfd->addr, ioeventfd->size, 274 ioeventfd->data); 275 g_free(ioeventfd); 276 } 277 278 static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk) 279 { 280 VFIOIOEventFD *ioeventfd, *tmp; 281 282 QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) { 283 if (ioeventfd->dynamic) { 284 vfio_ioeventfd_exit(vdev, ioeventfd); 285 } 286 } 287 } 288 289 static void vfio_ioeventfd_handler(void *opaque) 290 { 291 VFIOIOEventFD *ioeventfd = opaque; 292 293 if (event_notifier_test_and_clear(&ioeventfd->e)) { 294 vfio_region_write(ioeventfd->region, ioeventfd->region_addr, 295 ioeventfd->data, ioeventfd->size); 296 trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr), 297 (uint64_t)ioeventfd->addr, ioeventfd->size, 298 ioeventfd->data); 299 } 300 } 301 302 static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev, 303 MemoryRegion *mr, hwaddr addr, 304 unsigned size, uint64_t data, 305 VFIORegion *region, 306 hwaddr region_addr, bool dynamic) 307 { 308 VFIOIOEventFD *ioeventfd; 309 310 if (vdev->no_kvm_ioeventfd) { 311 return NULL; 312 } 313 314 ioeventfd = g_malloc0(sizeof(*ioeventfd)); 315 316 if (event_notifier_init(&ioeventfd->e, 0)) { 317 g_free(ioeventfd); 318 return NULL; 319 } 320 321 /* 322 * MemoryRegion and relative offset, plus additional ioeventfd setup 323 * parameters for configuring and later tearing down KVM ioeventfd. 324 */ 325 ioeventfd->mr = mr; 326 ioeventfd->addr = addr; 327 ioeventfd->size = size; 328 ioeventfd->data = data; 329 ioeventfd->dynamic = dynamic; 330 /* 331 * VFIORegion and relative offset for implementing the userspace 332 * handler. data & size fields shared for both uses. 
333 */ 334 ioeventfd->region = region; 335 ioeventfd->region_addr = region_addr; 336 337 if (!vdev->no_vfio_ioeventfd) { 338 struct vfio_device_ioeventfd vfio_ioeventfd; 339 340 vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd); 341 vfio_ioeventfd.flags = ioeventfd->size; 342 vfio_ioeventfd.data = ioeventfd->data; 343 vfio_ioeventfd.offset = ioeventfd->region->fd_offset + 344 ioeventfd->region_addr; 345 vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e); 346 347 ioeventfd->vfio = !ioctl(vdev->vbasedev.fd, 348 VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd); 349 } 350 351 if (!ioeventfd->vfio) { 352 qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), 353 vfio_ioeventfd_handler, NULL, ioeventfd); 354 } 355 356 memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size, 357 true, ioeventfd->data, &ioeventfd->e); 358 trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr, 359 size, data, ioeventfd->vfio); 360 361 return ioeventfd; 362 } 363 364 static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev) 365 { 366 VFIOQuirk *quirk; 367 368 /* 369 * As long as the BAR is >= 256 bytes it will be aligned such that the 370 * lower byte is always zero. Filter out anything else, if it exists. 371 */ 372 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || 373 !vdev->bars[4].ioport || vdev->bars[4].region.size < 256) { 374 return; 375 } 376 377 quirk = vfio_quirk_alloc(1); 378 379 memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev, 380 "vfio-ati-3c3-quirk", 1); 381 memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, 382 3 /* offset 3 bytes from 0x3c0 */, quirk->mem); 383 384 QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks, 385 quirk, next); 386 387 trace_vfio_quirk_ati_3c3_probe(vdev->vbasedev.name); 388 } 389 390 /* 391 * Newer ATI/AMD devices, including HD5450 and HD7850, have a mirror to PCI 392 * config space through MMIO BAR2 at offset 0x4000. 
Nothing seems to access 393 * the MMIO space directly, but a window to this space is provided through 394 * I/O port BAR4. Offset 0x0 is the address register and offset 0x4 is the 395 * data register. When the address is programmed to a range of 0x4000-0x4fff 396 * PCI configuration space is available. Experimentation seems to indicate 397 * that read-only may be provided by hardware. 398 */ 399 static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr) 400 { 401 VFIOQuirk *quirk; 402 VFIOConfigWindowQuirk *window; 403 404 /* This windows doesn't seem to be used except by legacy VGA code */ 405 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || 406 !vdev->vga || nr != 4) { 407 return; 408 } 409 410 quirk = vfio_quirk_alloc(2); 411 window = quirk->data = g_malloc0(sizeof(*window) + 412 sizeof(VFIOConfigWindowMatch)); 413 window->vdev = vdev; 414 window->address_offset = 0; 415 window->data_offset = 4; 416 window->nr_matches = 1; 417 window->matches[0].match = 0x4000; 418 window->matches[0].mask = vdev->config_size - 1; 419 window->bar = nr; 420 window->addr_mem = &quirk->mem[0]; 421 window->data_mem = &quirk->mem[1]; 422 423 memory_region_init_io(window->addr_mem, OBJECT(vdev), 424 &vfio_generic_window_address_quirk, window, 425 "vfio-ati-bar4-window-address-quirk", 4); 426 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 427 window->address_offset, 428 window->addr_mem, 1); 429 430 memory_region_init_io(window->data_mem, OBJECT(vdev), 431 &vfio_generic_window_data_quirk, window, 432 "vfio-ati-bar4-window-data-quirk", 4); 433 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 434 window->data_offset, 435 window->data_mem, 1); 436 437 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 438 439 trace_vfio_quirk_ati_bar4_probe(vdev->vbasedev.name); 440 } 441 442 /* 443 * Trap the BAR2 MMIO mirror to config space as well. 
 */
static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOConfigMirrorQuirk *mirror;

    /* Only enable on newer devices where BAR2 is 64bit */
    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
        !vdev->vga || nr != 2 || !vdev->bars[2].mem64) {
        return;
    }

    quirk = vfio_quirk_alloc(1);
    mirror = quirk->data = g_malloc0(sizeof(*mirror));
    mirror->mem = quirk->mem;
    mirror->vdev = vdev;
    mirror->offset = 0x4000; /* config space mirror lives at BAR2 + 0x4000 */
    mirror->bar = nr;

    /* Overlay a standard-config-space sized window onto the BAR */
    memory_region_init_io(mirror->mem, OBJECT(vdev),
                          &vfio_generic_mirror_quirk, mirror,
                          "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        mirror->offset, mirror->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_quirk_ati_bar2_probe(vdev->vbasedev.name);
}

/*
 * Older ATI/AMD cards like the X550 have a similar window to that above.
 * I/O port BAR1 provides a window to a mirror of PCI config space located
 * in BAR2 at offset 0xf00.  We don't care to support such older cards, but
 * note it for future reference.
 */

/*
 * Nvidia has several different methods to get to config space, the
 * nouveu project has several of these documented here:
 * https://github.com/pathscale/envytools/tree/master/hwdocs
 *
 * The first quirk is actually not documented in envytools and is found
 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]).  This is an
 * NV46 chipset.  The backdoor uses the legacy VGA I/O ports to access
 * the mirror of PCI config space found at BAR0 offset 0x1800.  The access
 * sequence first writes 0x338 to I/O port 0x3d4.  The target offset is
 * then written to 0x3d0.  Finally 0x538 is written for a read and 0x738
 * is written for a write to 0x3d4.  The BAR0 offset is then accessible
 * through 0x3d0.
This quirk doesn't seem to be necessary on newer cards 494 * that use the I/O port BAR5 window but it doesn't hurt to leave it. 495 */ 496 typedef enum {NONE = 0, SELECT, WINDOW, READ, WRITE} VFIONvidia3d0State; 497 static const char *nv3d0_states[] = { "NONE", "SELECT", 498 "WINDOW", "READ", "WRITE" }; 499 500 typedef struct VFIONvidia3d0Quirk { 501 VFIOPCIDevice *vdev; 502 VFIONvidia3d0State state; 503 uint32_t offset; 504 } VFIONvidia3d0Quirk; 505 506 static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque, 507 hwaddr addr, unsigned size) 508 { 509 VFIONvidia3d0Quirk *quirk = opaque; 510 VFIOPCIDevice *vdev = quirk->vdev; 511 512 quirk->state = NONE; 513 514 return vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], 515 addr + 0x14, size); 516 } 517 518 static void vfio_nvidia_3d4_quirk_write(void *opaque, hwaddr addr, 519 uint64_t data, unsigned size) 520 { 521 VFIONvidia3d0Quirk *quirk = opaque; 522 VFIOPCIDevice *vdev = quirk->vdev; 523 VFIONvidia3d0State old_state = quirk->state; 524 525 quirk->state = NONE; 526 527 switch (data) { 528 case 0x338: 529 if (old_state == NONE) { 530 quirk->state = SELECT; 531 trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name, 532 nv3d0_states[quirk->state]); 533 } 534 break; 535 case 0x538: 536 if (old_state == WINDOW) { 537 quirk->state = READ; 538 trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name, 539 nv3d0_states[quirk->state]); 540 } 541 break; 542 case 0x738: 543 if (old_state == WINDOW) { 544 quirk->state = WRITE; 545 trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name, 546 nv3d0_states[quirk->state]); 547 } 548 break; 549 } 550 551 vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], 552 addr + 0x14, data, size); 553 } 554 555 static const MemoryRegionOps vfio_nvidia_3d4_quirk = { 556 .read = vfio_nvidia_3d4_quirk_read, 557 .write = vfio_nvidia_3d4_quirk_write, 558 .endianness = DEVICE_LITTLE_ENDIAN, 559 }; 560 561 static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque, 562 hwaddr addr, unsigned size) 563 
{ 564 VFIONvidia3d0Quirk *quirk = opaque; 565 VFIOPCIDevice *vdev = quirk->vdev; 566 VFIONvidia3d0State old_state = quirk->state; 567 uint64_t data = vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], 568 addr + 0x10, size); 569 570 quirk->state = NONE; 571 572 if (old_state == READ && 573 (quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) { 574 uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1); 575 576 data = vfio_pci_read_config(&vdev->pdev, offset, size); 577 trace_vfio_quirk_nvidia_3d0_read(vdev->vbasedev.name, 578 offset, size, data); 579 } 580 581 return data; 582 } 583 584 static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr, 585 uint64_t data, unsigned size) 586 { 587 VFIONvidia3d0Quirk *quirk = opaque; 588 VFIOPCIDevice *vdev = quirk->vdev; 589 VFIONvidia3d0State old_state = quirk->state; 590 591 quirk->state = NONE; 592 593 if (old_state == SELECT) { 594 quirk->offset = (uint32_t)data; 595 quirk->state = WINDOW; 596 trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name, 597 nv3d0_states[quirk->state]); 598 } else if (old_state == WRITE) { 599 if ((quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) { 600 uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1); 601 602 vfio_pci_write_config(&vdev->pdev, offset, data, size); 603 trace_vfio_quirk_nvidia_3d0_write(vdev->vbasedev.name, 604 offset, data, size); 605 return; 606 } 607 } 608 609 vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI], 610 addr + 0x10, data, size); 611 } 612 613 static const MemoryRegionOps vfio_nvidia_3d0_quirk = { 614 .read = vfio_nvidia_3d0_quirk_read, 615 .write = vfio_nvidia_3d0_quirk_write, 616 .endianness = DEVICE_LITTLE_ENDIAN, 617 }; 618 619 static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev) 620 { 621 VFIOQuirk *quirk; 622 VFIONvidia3d0Quirk *data; 623 624 if (vdev->no_geforce_quirks || 625 !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) || 626 !vdev->bars[1].region.size) { 627 return; 628 } 629 630 quirk = 
vfio_quirk_alloc(2); 631 quirk->data = data = g_malloc0(sizeof(*data)); 632 data->vdev = vdev; 633 634 memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk, 635 data, "vfio-nvidia-3d4-quirk", 2); 636 memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, 637 0x14 /* 0x3c0 + 0x14 */, &quirk->mem[0]); 638 639 memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_nvidia_3d0_quirk, 640 data, "vfio-nvidia-3d0-quirk", 2); 641 memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, 642 0x10 /* 0x3c0 + 0x10 */, &quirk->mem[1]); 643 644 QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks, 645 quirk, next); 646 647 trace_vfio_quirk_nvidia_3d0_probe(vdev->vbasedev.name); 648 } 649 650 /* 651 * The second quirk is documented in envytools. The I/O port BAR5 is just 652 * a set of address/data ports to the MMIO BARs. The BAR we care about is 653 * again BAR0. This backdoor is apparently a bit newer than the one above 654 * so we need to not only trap 256 bytes @0x1800, but all of PCI config 655 * space, including extended space is available at the 4k @0x88000. 656 */ 657 typedef struct VFIONvidiaBAR5Quirk { 658 uint32_t master; 659 uint32_t enable; 660 MemoryRegion *addr_mem; 661 MemoryRegion *data_mem; 662 bool enabled; 663 VFIOConfigWindowQuirk window; /* last for match data */ 664 } VFIONvidiaBAR5Quirk; 665 666 static void vfio_nvidia_bar5_enable(VFIONvidiaBAR5Quirk *bar5) 667 { 668 VFIOPCIDevice *vdev = bar5->window.vdev; 669 670 if (((bar5->master & bar5->enable) & 0x1) == bar5->enabled) { 671 return; 672 } 673 674 bar5->enabled = !bar5->enabled; 675 trace_vfio_quirk_nvidia_bar5_state(vdev->vbasedev.name, 676 bar5->enabled ? 
"Enable" : "Disable"); 677 memory_region_set_enabled(bar5->addr_mem, bar5->enabled); 678 memory_region_set_enabled(bar5->data_mem, bar5->enabled); 679 } 680 681 static uint64_t vfio_nvidia_bar5_quirk_master_read(void *opaque, 682 hwaddr addr, unsigned size) 683 { 684 VFIONvidiaBAR5Quirk *bar5 = opaque; 685 VFIOPCIDevice *vdev = bar5->window.vdev; 686 687 return vfio_region_read(&vdev->bars[5].region, addr, size); 688 } 689 690 static void vfio_nvidia_bar5_quirk_master_write(void *opaque, hwaddr addr, 691 uint64_t data, unsigned size) 692 { 693 VFIONvidiaBAR5Quirk *bar5 = opaque; 694 VFIOPCIDevice *vdev = bar5->window.vdev; 695 696 vfio_region_write(&vdev->bars[5].region, addr, data, size); 697 698 bar5->master = data; 699 vfio_nvidia_bar5_enable(bar5); 700 } 701 702 static const MemoryRegionOps vfio_nvidia_bar5_quirk_master = { 703 .read = vfio_nvidia_bar5_quirk_master_read, 704 .write = vfio_nvidia_bar5_quirk_master_write, 705 .endianness = DEVICE_LITTLE_ENDIAN, 706 }; 707 708 static uint64_t vfio_nvidia_bar5_quirk_enable_read(void *opaque, 709 hwaddr addr, unsigned size) 710 { 711 VFIONvidiaBAR5Quirk *bar5 = opaque; 712 VFIOPCIDevice *vdev = bar5->window.vdev; 713 714 return vfio_region_read(&vdev->bars[5].region, addr + 4, size); 715 } 716 717 static void vfio_nvidia_bar5_quirk_enable_write(void *opaque, hwaddr addr, 718 uint64_t data, unsigned size) 719 { 720 VFIONvidiaBAR5Quirk *bar5 = opaque; 721 VFIOPCIDevice *vdev = bar5->window.vdev; 722 723 vfio_region_write(&vdev->bars[5].region, addr + 4, data, size); 724 725 bar5->enable = data; 726 vfio_nvidia_bar5_enable(bar5); 727 } 728 729 static const MemoryRegionOps vfio_nvidia_bar5_quirk_enable = { 730 .read = vfio_nvidia_bar5_quirk_enable_read, 731 .write = vfio_nvidia_bar5_quirk_enable_write, 732 .endianness = DEVICE_LITTLE_ENDIAN, 733 }; 734 735 static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr) 736 { 737 VFIOQuirk *quirk; 738 VFIONvidiaBAR5Quirk *bar5; 739 VFIOConfigWindowQuirk *window; 
740 741 if (vdev->no_geforce_quirks || 742 !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) || 743 !vdev->vga || nr != 5 || !vdev->bars[5].ioport) { 744 return; 745 } 746 747 quirk = vfio_quirk_alloc(4); 748 bar5 = quirk->data = g_malloc0(sizeof(*bar5) + 749 (sizeof(VFIOConfigWindowMatch) * 2)); 750 window = &bar5->window; 751 752 window->vdev = vdev; 753 window->address_offset = 0x8; 754 window->data_offset = 0xc; 755 window->nr_matches = 2; 756 window->matches[0].match = 0x1800; 757 window->matches[0].mask = PCI_CONFIG_SPACE_SIZE - 1; 758 window->matches[1].match = 0x88000; 759 window->matches[1].mask = vdev->config_size - 1; 760 window->bar = nr; 761 window->addr_mem = bar5->addr_mem = &quirk->mem[0]; 762 window->data_mem = bar5->data_mem = &quirk->mem[1]; 763 764 memory_region_init_io(window->addr_mem, OBJECT(vdev), 765 &vfio_generic_window_address_quirk, window, 766 "vfio-nvidia-bar5-window-address-quirk", 4); 767 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 768 window->address_offset, 769 window->addr_mem, 1); 770 memory_region_set_enabled(window->addr_mem, false); 771 772 memory_region_init_io(window->data_mem, OBJECT(vdev), 773 &vfio_generic_window_data_quirk, window, 774 "vfio-nvidia-bar5-window-data-quirk", 4); 775 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 776 window->data_offset, 777 window->data_mem, 1); 778 memory_region_set_enabled(window->data_mem, false); 779 780 memory_region_init_io(&quirk->mem[2], OBJECT(vdev), 781 &vfio_nvidia_bar5_quirk_master, bar5, 782 "vfio-nvidia-bar5-master-quirk", 4); 783 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 784 0, &quirk->mem[2], 1); 785 786 memory_region_init_io(&quirk->mem[3], OBJECT(vdev), 787 &vfio_nvidia_bar5_quirk_enable, bar5, 788 "vfio-nvidia-bar5-enable-quirk", 4); 789 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 790 4, &quirk->mem[3], 1); 791 792 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 793 794 
trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name); 795 } 796 797 typedef struct LastDataSet { 798 VFIOQuirk *quirk; 799 hwaddr addr; 800 uint64_t data; 801 unsigned size; 802 int hits; 803 int added; 804 } LastDataSet; 805 806 #define MAX_DYN_IOEVENTFD 10 807 #define HITS_FOR_IOEVENTFD 10 808 809 /* 810 * Finally, BAR0 itself. We want to redirect any accesses to either 811 * 0x1800 or 0x88000 through the PCI config space access functions. 812 */ 813 static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr, 814 uint64_t data, unsigned size) 815 { 816 VFIOConfigMirrorQuirk *mirror = opaque; 817 VFIOPCIDevice *vdev = mirror->vdev; 818 PCIDevice *pdev = &vdev->pdev; 819 LastDataSet *last = (LastDataSet *)&mirror->data; 820 821 vfio_generic_quirk_mirror_write(opaque, addr, data, size); 822 823 /* 824 * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the 825 * MSI capability ID register. Both the ID and next register are 826 * read-only, so we allow writes covering either of those to real hw. 827 */ 828 if ((pdev->cap_present & QEMU_PCI_CAP_MSI) && 829 vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) { 830 vfio_region_write(&vdev->bars[mirror->bar].region, 831 addr + mirror->offset, data, size); 832 trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name); 833 } 834 835 /* 836 * Automatically add an ioeventfd to handle any repeated write with the 837 * same data and size above the standard PCI config space header. This is 838 * primarily expected to accelerate the MSI-ACK behavior, such as noted 839 * above. Current hardware/drivers should trigger an ioeventfd at config 840 * offset 0x704 (region offset 0x88704), with data 0x0, size 4. 841 * 842 * The criteria of 10 successive hits is arbitrary but reliably adds the 843 * MSI-ACK region. Note that as some writes are bypassed via the ioeventfd, 844 * the remaining ones have a greater chance of being seen successively. 
     * To avoid the pathological case of burning up all of QEMU's open file
     * handles, arbitrarily limit this algorithm from adding no more than 10
     * ioeventfds, print an error if we would have added an 11th, and then
     * stop counting.
     */
    if (!vdev->no_kvm_ioeventfd &&
        addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) {
        if (addr != last->addr || data != last->data || size != last->size) {
            /* Different access pattern than last time; restart the counter */
            last->addr = addr;
            last->data = data;
            last->size = size;
            last->hits = 1;
        } else if (++last->hits >= HITS_FOR_IOEVENTFD) {
            if (last->added < MAX_DYN_IOEVENTFD) {
                VFIOIOEventFD *ioeventfd;
                ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size,
                                        data, &vdev->bars[mirror->bar].region,
                                        mirror->offset + addr, true);
                if (ioeventfd) {
                    VFIOQuirk *quirk = last->quirk;

                    QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next);
                    last->added++;
                }
            } else {
                /*
                 * Bumping added to MAX_DYN_IOEVENTFD + 1 makes the outer
                 * added <= MAX_DYN_IOEVENTFD test fail on subsequent
                 * writes, so this warning is emitted only once.
                 */
                last->added++;
                warn_report("NVIDIA ioeventfd queue full for %s, unable to "
                            "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", "
                            "size %u", vdev->vbasedev.name, addr, data, size);
            }
        }
    }
}

static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
    .read = vfio_generic_quirk_mirror_read,
    .write = vfio_nvidia_quirk_mirror_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * Quirk reset hook: clear the last-write tracking state and drop any
 * dynamically added ioeventfds so a reset guest starts from a clean slate.
 */
static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
{
    VFIOConfigMirrorQuirk *mirror = quirk->data;
    LastDataSet *last = (LastDataSet *)&mirror->data;

    last->addr = last->data = last->size = last->hits = last->added = 0;

    vfio_drop_dynamic_eventfds(vdev, quirk);
}

static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOConfigMirrorQuirk *mirror;
    LastDataSet *last;

    if (vdev->no_geforce_quirks ||
        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
        !vfio_is_vga(vdev) || nr != 0)
{ 904 return; 905 } 906 907 quirk = vfio_quirk_alloc(1); 908 quirk->reset = vfio_nvidia_bar0_quirk_reset; 909 mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet)); 910 mirror->mem = quirk->mem; 911 mirror->vdev = vdev; 912 mirror->offset = 0x88000; 913 mirror->bar = nr; 914 last = (LastDataSet *)&mirror->data; 915 last->quirk = quirk; 916 917 memory_region_init_io(mirror->mem, OBJECT(vdev), 918 &vfio_nvidia_mirror_quirk, mirror, 919 "vfio-nvidia-bar0-88000-mirror-quirk", 920 vdev->config_size); 921 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 922 mirror->offset, mirror->mem, 1); 923 924 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 925 926 /* The 0x1800 offset mirror only seems to get used by legacy VGA */ 927 if (vdev->vga) { 928 quirk = vfio_quirk_alloc(1); 929 quirk->reset = vfio_nvidia_bar0_quirk_reset; 930 mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet)); 931 mirror->mem = quirk->mem; 932 mirror->vdev = vdev; 933 mirror->offset = 0x1800; 934 mirror->bar = nr; 935 last = (LastDataSet *)&mirror->data; 936 last->quirk = quirk; 937 938 memory_region_init_io(mirror->mem, OBJECT(vdev), 939 &vfio_nvidia_mirror_quirk, mirror, 940 "vfio-nvidia-bar0-1800-mirror-quirk", 941 PCI_CONFIG_SPACE_SIZE); 942 memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, 943 mirror->offset, mirror->mem, 1); 944 945 QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); 946 } 947 948 trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name); 949 } 950 951 /* 952 * TODO - Some Nvidia devices provide config access to their companion HDA 953 * device and even to their parent bridge via these config space mirrors. 954 * Add quirks for those regions. 955 */ 956 957 #define PCI_VENDOR_ID_REALTEK 0x10ec 958 959 /* 960 * RTL8168 devices have a backdoor that can access the MSI-X table. At BAR2 961 * offset 0x70 there is a dword data register, offset 0x74 is a dword address 962 * register. 
 * According to the Linux r8169 driver, the MSI-X table is addressed
 * when the "type" portion of the address register is set to 0x1. This appears
 * to be bits 16:30. Bit 31 is both a write indicator and some sort of
 * "address latched" indicator. Bits 12:15 are a mask field, which we can
 * ignore because the MSI-X table should always be accessed as a dword (full
 * mask). Bits 0:11 is offset within the type.
 *
 * Example trace:
 *
 * Read from MSI-X table offset 0
 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
 *
 * Write 0xfee00000 to MSI-X table offset 0
 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
 */
typedef struct VFIOrtl8168Quirk {
    VFIOPCIDevice *vdev;
    uint32_t addr;      /* last address-register value written by the guest */
    uint32_t data;      /* last data-register value written by the guest */
    bool enabled;       /* address register currently selects the MSI-X table */
} VFIOrtl8168Quirk;

/*
 * Guest read of the backdoor address register (BAR2 + 0x74).  When the
 * guest last selected the MSI-X table, fake the latch/complete handshake
 * (bit 31 toggled relative to the stored address) rather than exposing
 * whatever the hardware returns.
 */
static uint64_t vfio_rtl8168_quirk_address_read(void *opaque,
                                                hwaddr addr, unsigned size)
{
    VFIOrtl8168Quirk *rtl = opaque;
    VFIOPCIDevice *vdev = rtl->vdev;
    uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size);

    if (rtl->enabled) {
        data = rtl->addr ^ 0x80000000U; /* latch/complete */
        trace_vfio_quirk_rtl8168_fake_latch(vdev->vbasedev.name, data);
    }

    return data;
}

/*
 * Guest write of the backdoor address register.  A "type" field of 1
 * (bits 16:30) targets the MSI-X table; when bit 31 (write indicator) is
 * also set, the previously latched data is redirected to the emulated
 * guest MSI-X table and is never written to hardware.  All other writes
 * pass straight through to the device.
 */
static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr,
                                             uint64_t data, unsigned size)
{
    VFIOrtl8168Quirk *rtl = opaque;
    VFIOPCIDevice *vdev = rtl->vdev;

    rtl->enabled = false;

    if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */
        rtl->enabled = true;
        rtl->addr = (uint32_t)data;

        if (data & 0x80000000U) { /* Do write */
            if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
                hwaddr offset = data & 0xfff;
                uint64_t val = rtl->data;

                trace_vfio_quirk_rtl8168_msix_write(vdev->vbasedev.name,
                                                    (uint16_t)offset, val);

                /* Write to the proper guest MSI-X table instead */
                memory_region_dispatch_write(&vdev->pdev.msix_table_mmio,
                                             offset, val,
                                             size_memop(size) | MO_LE,
                                             MEMTXATTRS_UNSPECIFIED);
            }
            return; /* Do not write guest MSI-X data to hardware */
        }
    }

    vfio_region_write(&vdev->bars[2].region, addr + 0x74, data, size);
}

static const MemoryRegionOps vfio_rtl_address_quirk = {
    .read = vfio_rtl8168_quirk_address_read,
    .write = vfio_rtl8168_quirk_address_write,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
        .unaligned = false,
    },
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * Guest read of the backdoor data register (BAR2 + 0x70).  When the address
 * register selects the MSI-X table, return data from the emulated guest
 * MSI-X table rather than from the device.
 */
static uint64_t vfio_rtl8168_quirk_data_read(void *opaque,
                                             hwaddr addr, unsigned size)
{
    VFIOrtl8168Quirk *rtl = opaque;
    VFIOPCIDevice *vdev = rtl->vdev;
    uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x70, size);

    if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
        hwaddr offset = rtl->addr & 0xfff;
        memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, offset,
                                    &data, size_memop(size) | MO_LE,
                                    MEMTXATTRS_UNSPECIFIED);
        trace_vfio_quirk_rtl8168_msix_read(vdev->vbasedev.name, offset, data);
    }

    return data;
}

/*
 * Guest write of the backdoor data register.  Latch the value for a later
 * address-register write and pass it through to hardware (the pass-through
 * write is harmless for the MSI-X case since the actual table write is
 * intercepted above).
 */
static void vfio_rtl8168_quirk_data_write(void *opaque, hwaddr addr,
                                          uint64_t data, unsigned size)
{
    VFIOrtl8168Quirk *rtl = opaque;
    VFIOPCIDevice *vdev = rtl->vdev;

    rtl->data = (uint32_t)data;

    vfio_region_write(&vdev->bars[2].region, addr + 0x70, data, size);
}

static const MemoryRegionOps vfio_rtl_data_quirk = {
    .read = vfio_rtl8168_quirk_data_read,
    .write = vfio_rtl8168_quirk_data_write,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
        .unaligned = false,
    },
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * Install the RTL8168 backdoor quirk: two 4-byte windows over the address
 * (0x74) and data (0x70) registers in BAR2.  Only applies to Realtek 8168
 * devices and only to BAR2.
 */
static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOrtl8168Quirk *rtl;

    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_REALTEK, 0x8168) || nr != 2) {
        return;
    }

    quirk = vfio_quirk_alloc(2);
    quirk->data = rtl = g_malloc0(sizeof(*rtl));
    rtl->vdev = vdev;

    memory_region_init_io(&quirk->mem[0], OBJECT(vdev),
                          &vfio_rtl_address_quirk, rtl,
                          "vfio-rtl8168-window-address-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        0x74, &quirk->mem[0], 1);

    memory_region_init_io(&quirk->mem[1], OBJECT(vdev),
                          &vfio_rtl_data_quirk, rtl,
                          "vfio-rtl8168-window-data-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        0x70, &quirk->mem[1], 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name);
}

#define IGD_ASLS 0xfc /* ASL Storage Register */

/*
 * The OpRegion includes the Video BIOS Table, which seems important for
 * telling the driver what sort of outputs it has. Without this, the device
 * may work in the guest, but we may not get output. This also requires BIOS
 * support to reserve and populate a section of guest memory sufficient for
 * the table and to write the base address of that memory to the ASLS register
 * of the IGD device.
 */
bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
                                struct vfio_region_info *info, Error **errp)
{
    int ret;

    /* Snapshot the host OpRegion; the guest gets a copy, not direct access */
    vdev->igd_opregion = g_malloc0(info->size);
    ret = pread(vdev->vbasedev.fd, vdev->igd_opregion,
                info->size, info->offset);
    if (ret != info->size) {
        error_setg(errp, "failed to read IGD OpRegion");
        g_free(vdev->igd_opregion);
        vdev->igd_opregion = NULL;
        return false;
    }

    /*
     * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to
     * allocate 32bit reserved memory for, copy these contents into, and write
     * the reserved memory base address to the device ASLS register at 0xFC.
     * Alignment of this reserved region seems flexible, but using a 4k page
     * alignment seems to work well. This interface assumes a single IGD
     * device, which may be at VM address 00:02.0 in legacy mode or another
     * address in UPT mode.
     *
     * NB, there may be future use cases discovered where the VM should have
     * direct interaction with the host OpRegion, in which case the write to
     * the ASLS register would trigger MemoryRegion setup to enable that.
     */
    fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion",
                    vdev->igd_opregion, info->size);

    trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name);

    /*
     * ASLS initially reads as zero, is fully guest-writable, and is fully
     * emulated, so the base address the firmware programs never reaches the
     * physical device.
     */
    pci_set_long(vdev->pdev.config + IGD_ASLS, 0);
    pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0);
    pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0);

    return true;
}

/*
 * Common quirk probe entry points.
1169 */ 1170 void vfio_vga_quirk_setup(VFIOPCIDevice *vdev) 1171 { 1172 vfio_vga_probe_ati_3c3_quirk(vdev); 1173 vfio_vga_probe_nvidia_3d0_quirk(vdev); 1174 } 1175 1176 void vfio_vga_quirk_exit(VFIOPCIDevice *vdev) 1177 { 1178 VFIOQuirk *quirk; 1179 int i, j; 1180 1181 for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) { 1182 QLIST_FOREACH(quirk, &vdev->vga->region[i].quirks, next) { 1183 for (j = 0; j < quirk->nr_mem; j++) { 1184 memory_region_del_subregion(&vdev->vga->region[i].mem, 1185 &quirk->mem[j]); 1186 } 1187 } 1188 } 1189 } 1190 1191 void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev) 1192 { 1193 int i, j; 1194 1195 for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) { 1196 while (!QLIST_EMPTY(&vdev->vga->region[i].quirks)) { 1197 VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga->region[i].quirks); 1198 QLIST_REMOVE(quirk, next); 1199 for (j = 0; j < quirk->nr_mem; j++) { 1200 object_unparent(OBJECT(&quirk->mem[j])); 1201 } 1202 g_free(quirk->mem); 1203 g_free(quirk->data); 1204 g_free(quirk); 1205 } 1206 } 1207 } 1208 1209 void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) 1210 { 1211 vfio_probe_ati_bar4_quirk(vdev, nr); 1212 vfio_probe_ati_bar2_quirk(vdev, nr); 1213 vfio_probe_nvidia_bar5_quirk(vdev, nr); 1214 vfio_probe_nvidia_bar0_quirk(vdev, nr); 1215 vfio_probe_rtl8168_bar2_quirk(vdev, nr); 1216 #ifdef CONFIG_VFIO_IGD 1217 vfio_probe_igd_bar0_quirk(vdev, nr); 1218 vfio_probe_igd_bar4_quirk(vdev, nr); 1219 #endif 1220 } 1221 1222 void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr) 1223 { 1224 VFIOBAR *bar = &vdev->bars[nr]; 1225 VFIOQuirk *quirk; 1226 int i; 1227 1228 QLIST_FOREACH(quirk, &bar->quirks, next) { 1229 while (!QLIST_EMPTY(&quirk->ioeventfds)) { 1230 vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds)); 1231 } 1232 1233 for (i = 0; i < quirk->nr_mem; i++) { 1234 memory_region_del_subregion(bar->region.mem, &quirk->mem[i]); 1235 } 1236 } 1237 } 1238 1239 void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr) 1240 { 1241 VFIOBAR *bar 
= &vdev->bars[nr]; 1242 int i; 1243 1244 while (!QLIST_EMPTY(&bar->quirks)) { 1245 VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks); 1246 QLIST_REMOVE(quirk, next); 1247 for (i = 0; i < quirk->nr_mem; i++) { 1248 object_unparent(OBJECT(&quirk->mem[i])); 1249 } 1250 g_free(quirk->mem); 1251 g_free(quirk->data); 1252 g_free(quirk); 1253 } 1254 } 1255 1256 /* 1257 * Reset quirks 1258 */ 1259 void vfio_quirk_reset(VFIOPCIDevice *vdev) 1260 { 1261 int i; 1262 1263 for (i = 0; i < PCI_ROM_SLOT; i++) { 1264 VFIOQuirk *quirk; 1265 VFIOBAR *bar = &vdev->bars[i]; 1266 1267 QLIST_FOREACH(quirk, &bar->quirks, next) { 1268 if (quirk->reset) { 1269 quirk->reset(vdev, quirk); 1270 } 1271 } 1272 } 1273 } 1274 1275 /* 1276 * AMD Radeon PCI config reset, based on Linux: 1277 * drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running() 1278 * drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset 1279 * drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc() 1280 * drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock() 1281 * IDs: include/drm/drm_pciids.h 1282 * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0 1283 * 1284 * Bonaire and Hawaii GPUs do not respond to a bus reset. This is a bug in the 1285 * hardware that should be fixed on future ASICs. The symptom of this is that 1286 * once the accerlated driver loads, Windows guests will bsod on subsequent 1287 * attmpts to load the driver, such as after VM reset or shutdown/restart. To 1288 * work around this, we do an AMD specific PCI config reset, followed by an SMC 1289 * reset. The PCI config reset only works if SMC firmware is running, so we 1290 * have a dependency on the state of the device as to whether this reset will 1291 * be effective. There are still cases where we won't be able to kick the 1292 * device into working, but this greatly improves the usability overall. The 1293 * config reset magic is relatively common on AMD GPUs, but the setup and SMC 1294 * poking is largely ASIC specific. 
 */
/*
 * Check whether the SMC (System Management Controller) firmware is running,
 * mirroring Linux's ci_is_smc_running().  The config reset below only works
 * while it is.
 */
static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev)
{
    uint32_t clk, pc_c;

    /*
     * Registers 200h and 204h are index and data registers for accessing
     * indirect configuration registers within the device.
     */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
    clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4);
    pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4);

    /*
     * Running == SMC clock not disabled (bit 0 of clk clear) and program
     * counter advanced past 0x20100; thresholds taken from the Linux driver
     * referenced above — TODO confirm register semantics against it.
     */
    return (!(clk & 1) && (0x20100 <= pc_c));
}

/*
 * The scope of a config reset is controlled by a mode bit in the misc register
 * and a fuse, exposed as a bit in another register. The fuse is the default
 * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the formula
 * scope = !(misc ^ fuse), where the resulting scope is defined the same as
 * the fuse. A truth table therefore tells us that if misc == fuse, we need
 * to flip the value of the bit in the misc register.
 */
static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev)
{
    uint32_t misc, fuse;
    bool a, b;

    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4);
    fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    b = fuse & 64;  /* fuse bit: default reset scope */

    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4);
    misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    a = misc & 2;   /* misc bit: scope toggle */

    /* Per the truth table above, flip the toggle to confine reset to GFX */
    if (a == b) {
        vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4);
        vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */
    }
}

/*
 * Device-specific reset for Bonaire/Hawaii.  Returns 0 on success, -ENODEV
 * when a kernel-implemented reset exists (preferred over this userspace
 * workaround), or -EINVAL when SMC firmware isn't running, in which case
 * the config reset cannot take effect.
 */
static int vfio_radeon_reset(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    int i, ret = 0;
    uint32_t data;

    /* Defer to a kernel implemented reset */
    if (vdev->vbasedev.reset_works) {
        trace_vfio_quirk_ati_bonaire_reset_skipped(vdev->vbasedev.name);
        return -ENODEV;
    }

    /* Enable only memory BAR access */
    vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2);

    /* Reset only works if SMC firmware is loaded and running */
    if (!vfio_radeon_smc_is_running(vdev)) {
        ret = -EINVAL;
        trace_vfio_quirk_ati_bonaire_reset_no_smc(vdev->vbasedev.name);
        goto out;
    }

    /* Make sure only the GFX function is reset */
    vfio_radeon_set_gfx_only_reset(vdev);

    /* AMD PCI config reset */
    vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4);
    usleep(100);

    /* Read back the memory size to make sure we're out of reset */
    for (i = 0; i < 100000; i++) {
        if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) {
            goto reset_smc;
        }
        usleep(1);
    }

    trace_vfio_quirk_ati_bonaire_reset_timeout(vdev->vbasedev.name);

reset_smc:
    /* Reset SMC */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4);
    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    data |= 1;
    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);

    /* Disable SMC clock */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    data |= 1;
    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);

    trace_vfio_quirk_ati_bonaire_reset_done(vdev->vbasedev.name);

out:
    /* Restore PCI command register */
    vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2);

    return ret;
}

/*
 * Install a device-specific reset function where the generic mechanisms are
 * known not to work.  Currently only AMD/ATI (0x1002) Bonaire and Hawaii
 * GPUs, which do not respond to a bus reset (see comment above).
 */
void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev)
{
    switch (vdev->vendor_id) {
    case 0x1002:
        switch (vdev->device_id) {
        /* Bonaire */
        case 0x6649: /* Bonaire [FirePro W5100] */
        case 0x6650:
        case 0x6651:
        case 0x6658: /* Bonaire XTX [Radeon R7 260X] */
        case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */
        case 0x665d: /* Bonaire [Radeon R7 200 Series] */
        /* Hawaii */
        case 0x67A0: /* Hawaii XT GL [FirePro W9100] */
        case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */
        case 0x67A2:
        case 0x67A8:
        case 0x67A9:
        case 0x67AA:
        case 0x67B0: /* Hawaii XT [Radeon R9 290X] */
        case 0x67B1: /* Hawaii PRO [Radeon R9 290] */
        case 0x67B8:
        case 0x67B9:
        case 0x67BA:
        case 0x67BE:
            vdev->resetfn = vfio_radeon_reset;
            trace_vfio_quirk_ati_bonaire_reset(vdev->vbasedev.name);
            break;
        }
        break;
    }
}

/*
 * The NVIDIA GPUDirect P2P Vendor capability allows the user to specify
 * devices as a member of a clique. Devices within the same clique ID
 * are capable of direct P2P. It's the user's responsibility that this
 * is correct. The spec says that this may reside at any unused config
 * offset, but reserves and recommends hypervisors place this at C8h.
1439 * The spec also states that the hypervisor should place this capability 1440 * at the end of the capability list, thus next is defined as 0h. 1441 * 1442 * +----------------+----------------+----------------+----------------+ 1443 * | sig 7:0 ('P') | vndr len (8h) | next (0h) | cap id (9h) | 1444 * +----------------+----------------+----------------+----------------+ 1445 * | rsvd 15:7(0h),id 6:3,ver 2:0(0h)| sig 23:8 ('P2') | 1446 * +---------------------------------+---------------------------------+ 1447 * 1448 * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf 1449 * 1450 * Specification for Turning and later GPU architectures: 1451 * https://lists.gnu.org/archive/html/qemu-devel/2023-06/pdf142OR4O4c2.pdf 1452 */ 1453 static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v, 1454 const char *name, void *opaque, 1455 Error **errp) 1456 { 1457 const Property *prop = opaque; 1458 uint8_t *ptr = object_field_prop_ptr(obj, prop); 1459 1460 visit_type_uint8(v, name, ptr, errp); 1461 } 1462 1463 static void set_nv_gpudirect_clique_id(Object *obj, Visitor *v, 1464 const char *name, void *opaque, 1465 Error **errp) 1466 { 1467 const Property *prop = opaque; 1468 uint8_t value, *ptr = object_field_prop_ptr(obj, prop); 1469 1470 if (!visit_type_uint8(v, name, &value, errp)) { 1471 return; 1472 } 1473 1474 if (value & ~0xF) { 1475 error_setg(errp, "Property %s: valid range 0-15", name); 1476 return; 1477 } 1478 1479 *ptr = value; 1480 } 1481 1482 const PropertyInfo qdev_prop_nv_gpudirect_clique = { 1483 .name = "uint4", 1484 .description = "NVIDIA GPUDirect Clique ID (0 - 15)", 1485 .get = get_nv_gpudirect_clique_id, 1486 .set = set_nv_gpudirect_clique_id, 1487 }; 1488 1489 static bool is_valid_std_cap_offset(uint8_t pos) 1490 { 1491 return (pos >= PCI_STD_HEADER_SIZEOF && 1492 pos <= (PCI_CFG_SPACE_SIZE - PCI_CAP_SIZEOF)); 1493 } 1494 1495 static bool vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp) 1496 { 1497 ERRP_GUARD(); 
1498 PCIDevice *pdev = &vdev->pdev; 1499 int ret, pos; 1500 bool c8_conflict = false, d4_conflict = false; 1501 uint8_t tmp; 1502 1503 if (vdev->nv_gpudirect_clique == 0xFF) { 1504 return true; 1505 } 1506 1507 if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) { 1508 error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid device vendor"); 1509 return false; 1510 } 1511 1512 if (pci_get_byte(pdev->config + PCI_CLASS_DEVICE + 1) != 1513 PCI_BASE_CLASS_DISPLAY) { 1514 error_setg(errp, "NVIDIA GPUDirect Clique ID: unsupported PCI class"); 1515 return false; 1516 } 1517 1518 /* 1519 * Per the updated specification above, it's recommended to use offset 1520 * D4h for Turing and later GPU architectures due to a conflict of the 1521 * MSI-X capability at C8h. We don't know how to determine the GPU 1522 * architecture, instead we walk the capability chain to mark conflicts 1523 * and choose one or error based on the result. 1524 * 1525 * NB. Cap list head in pdev->config is already cleared, read from device. 
1526 */ 1527 ret = pread(vdev->vbasedev.fd, &tmp, 1, 1528 vdev->config_offset + PCI_CAPABILITY_LIST); 1529 if (ret != 1 || !is_valid_std_cap_offset(tmp)) { 1530 error_setg(errp, "NVIDIA GPUDirect Clique ID: error getting cap list"); 1531 return false; 1532 } 1533 1534 do { 1535 if (tmp == 0xC8) { 1536 c8_conflict = true; 1537 } else if (tmp == 0xD4) { 1538 d4_conflict = true; 1539 } 1540 tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT]; 1541 } while (is_valid_std_cap_offset(tmp)); 1542 1543 if (!c8_conflict) { 1544 pos = 0xC8; 1545 } else if (!d4_conflict) { 1546 pos = 0xD4; 1547 } else { 1548 error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid config space"); 1549 return false; 1550 } 1551 1552 ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp); 1553 if (ret < 0) { 1554 error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: "); 1555 return false; 1556 } 1557 1558 memset(vdev->emulated_config_bits + pos, 0xFF, 8); 1559 pos += PCI_CAP_FLAGS; 1560 pci_set_byte(pdev->config + pos++, 8); 1561 pci_set_byte(pdev->config + pos++, 'P'); 1562 pci_set_byte(pdev->config + pos++, '2'); 1563 pci_set_byte(pdev->config + pos++, 'P'); 1564 pci_set_byte(pdev->config + pos++, vdev->nv_gpudirect_clique << 3); 1565 pci_set_byte(pdev->config + pos, 0); 1566 1567 return true; 1568 } 1569 1570 /* 1571 * The VMD endpoint provides a real PCIe domain to the guest and the guest 1572 * kernel performs enumeration of the VMD sub-device domain. Guest transactions 1573 * to VMD sub-devices go through MMU translation from guest addresses to 1574 * physical addresses. When MMIO goes to an endpoint after being translated to 1575 * physical addresses, the bridge rejects the transaction because the window 1576 * has been programmed with guest addresses. 1577 * 1578 * VMD can use the Host Physical Address in order to correctly program the 1579 * bridge windows in its PCIe domain. VMD device 28C0 has HPA shadow registers 1580 * located at offset 0x2000 in MEMBAR2 (BAR 4). 
This quirk provides the HPA 1581 * shadow registers in a vendor-specific capability register for devices 1582 * without native support. The position of 0xE8-0xFF is in the reserved range 1583 * of the VMD device capability space following the Power Management 1584 * Capability. 1585 */ 1586 #define VMD_SHADOW_CAP_VER 1 1587 #define VMD_SHADOW_CAP_LEN 24 1588 static bool vfio_add_vmd_shadow_cap(VFIOPCIDevice *vdev, Error **errp) 1589 { 1590 ERRP_GUARD(); 1591 uint8_t membar_phys[16]; 1592 int ret, pos = 0xE8; 1593 1594 if (!(vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x201D) || 1595 vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x467F) || 1596 vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x4C3D) || 1597 vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x9A0B))) { 1598 return true; 1599 } 1600 1601 ret = pread(vdev->vbasedev.fd, membar_phys, 16, 1602 vdev->config_offset + PCI_BASE_ADDRESS_2); 1603 if (ret != 16) { 1604 error_report("VMD %s cannot read MEMBARs (%d)", 1605 vdev->vbasedev.name, ret); 1606 return false; 1607 } 1608 1609 ret = pci_add_capability(&vdev->pdev, PCI_CAP_ID_VNDR, pos, 1610 VMD_SHADOW_CAP_LEN, errp); 1611 if (ret < 0) { 1612 error_prepend(errp, "Failed to add VMD MEMBAR Shadow cap: "); 1613 return false; 1614 } 1615 1616 memset(vdev->emulated_config_bits + pos, 0xFF, VMD_SHADOW_CAP_LEN); 1617 pos += PCI_CAP_FLAGS; 1618 pci_set_byte(vdev->pdev.config + pos++, VMD_SHADOW_CAP_LEN); 1619 pci_set_byte(vdev->pdev.config + pos++, VMD_SHADOW_CAP_VER); 1620 pci_set_long(vdev->pdev.config + pos, 0x53484457); /* SHDW */ 1621 memcpy(vdev->pdev.config + pos + 4, membar_phys, 16); 1622 1623 return true; 1624 } 1625 1626 bool vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp) 1627 { 1628 if (!vfio_add_nv_gpudirect_cap(vdev, errp)) { 1629 return false; 1630 } 1631 1632 if (!vfio_add_vmd_shadow_cap(vdev, errp)) { 1633 return false; 1634 } 1635 1636 return true; 1637 } 1638