// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2025 Ant Group
 * Author: Tiwei Bie <tiwei.btw@antgroup.com>
 */

#define pr_fmt(fmt) "vfio-uml: " fmt

#include <linux/module.h>
#include <linux/logic_iomem.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/string.h>
#include <linux/unaligned.h>
#include <irq_kern.h>
#include <init.h>
#include <os.h>

#include "virt-pci.h"
#include "vfio_user.h"

#define to_vdev(_pdev) container_of(_pdev, struct uml_vfio_device, pdev)

struct uml_vfio_intr_ctx {
	struct uml_vfio_device *dev;
	int irq;
};

struct uml_vfio_device {
	const char *name;
	int group;

	struct um_pci_device pdev;
	struct uml_vfio_user_device udev;
	struct uml_vfio_intr_ctx *intr_ctx;

	int msix_cap;
	int msix_bar;
	int msix_offset;
	int msix_size;
	u32 *msix_data;

	struct list_head list;
};

struct uml_vfio_group {
	int id;
	int fd;
	int users;
	struct list_head list;
};

static struct {
	int fd;
	int users;
} uml_vfio_container = { .fd = -1 };
static DEFINE_MUTEX(uml_vfio_container_mtx);

static LIST_HEAD(uml_vfio_groups);
static DEFINE_MUTEX(uml_vfio_groups_mtx);

static LIST_HEAD(uml_vfio_devices);

static int uml_vfio_set_container(int group_fd)
{
	int err;

	guard(mutex)(&uml_vfio_container_mtx);

	err = uml_vfio_user_set_container(uml_vfio_container.fd, group_fd);
	if (err)
		return err;

	uml_vfio_container.users++;
	if (uml_vfio_container.users > 1)
		return 0;

	/* Only the first group attached to the container sets up the IOMMU. */
	err = uml_vfio_user_setup_iommu(uml_vfio_container.fd);
	if (err) {
		uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
		uml_vfio_container.users--;
	}
	return err;
}

static void uml_vfio_unset_container(int group_fd)
{
	guard(mutex)(&uml_vfio_container_mtx);

	uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
	uml_vfio_container.users--;
}

static int uml_vfio_open_group(int group_id)
{
	struct uml_vfio_group *group;
	int err;

	guard(mutex)(&uml_vfio_groups_mtx);

	/* Reuse the group fd if the group is already open. */
	list_for_each_entry(group, &uml_vfio_groups, list) {
		if (group->id == group_id) {
			group->users++;
			return group->fd;
		}
	}

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return -ENOMEM;

	group->fd = uml_vfio_user_open_group(group_id);
	if (group->fd < 0) {
		err = group->fd;
		goto free_group;
	}

	err = uml_vfio_set_container(group->fd);
	if (err)
		goto close_group;

	group->id = group_id;
	group->users = 1;

	list_add(&group->list, &uml_vfio_groups);

	return group->fd;

close_group:
	os_close_file(group->fd);
free_group:
	kfree(group);
	return err;
}

static int uml_vfio_release_group(int group_fd)
{
	struct uml_vfio_group *group;

	guard(mutex)(&uml_vfio_groups_mtx);

	list_for_each_entry(group, &uml_vfio_groups, list) {
		if (group->fd == group_fd) {
			group->users--;
			if (group->users == 0) {
				uml_vfio_unset_container(group_fd);
				os_close_file(group_fd);
				list_del(&group->list);
				kfree(group);
			}
			return 0;
		}
	}

	return -ENOENT;
}

static irqreturn_t uml_vfio_interrupt(int unused, void *opaque)
{
	struct uml_vfio_intr_ctx *ctx = opaque;
	struct uml_vfio_device *dev = ctx->dev;
	int index = ctx - dev->intr_ctx;
	int irqfd = dev->udev.irqfd[index];
	int irq = dev->msix_data[index];
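	/*
	 * The value the guest wrote into this MSI-X entry's data word is
	 * used directly as its IRQ number; the loop below drains the irqfd
	 * and dispatches that IRQ once per successful read.
	 */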
	uint64_t v;
	int r;

	do {
		r = os_read_file(irqfd, &v, sizeof(v));
		if (r == sizeof(v))
			generic_handle_irq(irq);
	} while (r == sizeof(v) || r == -EINTR);
	WARN(r != -EAGAIN, "read returned %d\n", r);

	return IRQ_HANDLED;
}

static int uml_vfio_activate_irq(struct uml_vfio_device *dev, int index)
{
	struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];
	int err, irqfd;

	if (ctx->irq >= 0)
		return 0;

	irqfd = uml_vfio_user_activate_irq(&dev->udev, index);
	if (irqfd < 0)
		return irqfd;

	ctx->irq = um_request_irq(UM_IRQ_ALLOC, irqfd, IRQ_READ,
				  uml_vfio_interrupt, 0,
				  "vfio-uml", ctx);
	if (ctx->irq < 0) {
		err = ctx->irq;
		goto deactivate;
	}

	err = add_sigio_fd(irqfd);
	if (err)
		goto free_irq;

	return 0;

free_irq:
	um_free_irq(ctx->irq, ctx);
	ctx->irq = -1;
deactivate:
	uml_vfio_user_deactivate_irq(&dev->udev, index);
	return err;
}

static int uml_vfio_deactivate_irq(struct uml_vfio_device *dev, int index)
{
	struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];

	if (ctx->irq >= 0) {
		ignore_sigio_fd(dev->udev.irqfd[index]);
		um_free_irq(ctx->irq, ctx);
		uml_vfio_user_deactivate_irq(&dev->udev, index);
		ctx->irq = -1;
	}
	return 0;
}

static int uml_vfio_update_msix_cap(struct uml_vfio_device *dev,
				    unsigned int offset, int size,
				    unsigned long val)
{
	/*
	 * Here, we handle only the operations we care about,
	 * ignoring the rest.
	 */
	if (size == 2 && offset == dev->msix_cap + PCI_MSIX_FLAGS) {
		switch (val & ~PCI_MSIX_FLAGS_QSIZE) {
		case PCI_MSIX_FLAGS_ENABLE:
		case 0:
			return uml_vfio_user_update_irqs(&dev->udev);
		}
	}
	return 0;
}

static int uml_vfio_update_msix_table(struct uml_vfio_device *dev,
				      unsigned int offset, int size,
				      unsigned long val)
{
	int index;

	/*
	 * Here, we handle only the operations we care about: naturally
	 * aligned 4-byte writes to an entry's data word, ignoring the rest.
	 */
	offset -= dev->msix_offset + PCI_MSIX_ENTRY_DATA;

	if (size != 4 || offset % PCI_MSIX_ENTRY_SIZE != 0)
		return 0;

	index = offset / PCI_MSIX_ENTRY_SIZE;
	if (index >= dev->udev.irq_count)
		return -EINVAL;

	dev->msix_data[index] = val;

	return val ? uml_vfio_activate_irq(dev, index) :
	       uml_vfio_deactivate_irq(dev, index);
}

static unsigned long __uml_vfio_cfgspace_read(struct uml_vfio_device *dev,
					      unsigned int offset, int size)
{
	u8 data[8];

	memset(data, 0xff, sizeof(data));

	if (uml_vfio_user_cfgspace_read(&dev->udev, offset, data, size))
		return ULONG_MAX;

	switch (size) {
	case 1:
		return data[0];
	case 2:
		return le16_to_cpup((void *)data);
	case 4:
		return le32_to_cpup((void *)data);
#ifdef CONFIG_64BIT
	case 8:
		return le64_to_cpup((void *)data);
#endif
	default:
		return ULONG_MAX;
	}
}

static unsigned long uml_vfio_cfgspace_read(struct um_pci_device *pdev,
					    unsigned int offset, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	return __uml_vfio_cfgspace_read(dev, offset, size);
}

static void __uml_vfio_cfgspace_write(struct uml_vfio_device *dev,
				      unsigned int offset, int size,
				      unsigned long val)
{
	u8 data[8];

	switch (size) {
	case 1:
		data[0] = (u8)val;
		break;
	case 2:
		put_unaligned_le16(val, (void *)data);
		break;
	case 4:
		put_unaligned_le32(val, (void *)data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		put_unaligned_le64(val, (void *)data);
		break;
#endif
	}

	WARN_ON(uml_vfio_user_cfgspace_write(&dev->udev, offset, data, size));
}

static void uml_vfio_cfgspace_write(struct um_pci_device *pdev,
				    unsigned int offset, int size,
				    unsigned long val)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	/* Snoop writes that overlap the MSI-X capability. */
	if (offset < dev->msix_cap + PCI_CAP_MSIX_SIZEOF &&
	    offset + size > dev->msix_cap)
		WARN_ON(uml_vfio_update_msix_cap(dev, offset, size, val));

	__uml_vfio_cfgspace_write(dev, offset, size, val);
}

static void uml_vfio_bar_copy_from(struct um_pci_device *pdev, int bar,
				   void *buffer, unsigned int offset, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	memset(buffer, 0xff, size);
	uml_vfio_user_bar_read(&dev->udev, bar, offset, buffer, size);
}

static unsigned long uml_vfio_bar_read(struct um_pci_device *pdev, int bar,
				       unsigned int offset, int size)
{
	u8 data[8];

	uml_vfio_bar_copy_from(pdev, bar, data, offset, size);

	switch (size) {
	case 1:
		return data[0];
	case 2:
		return le16_to_cpup((void *)data);
	case 4:
		return le32_to_cpup((void *)data);
#ifdef CONFIG_64BIT
	case 8:
		return le64_to_cpup((void *)data);
#endif
	default:
		return ULONG_MAX;
	}
}

static void uml_vfio_bar_copy_to(struct um_pci_device *pdev, int bar,
				 unsigned int offset, const void *buffer,
				 int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	uml_vfio_user_bar_write(&dev->udev, bar, offset, buffer, size);
}

static void uml_vfio_bar_write(struct um_pci_device *pdev, int bar,
			       unsigned int offset, int size,
			       unsigned long val)
{
	struct uml_vfio_device *dev = to_vdev(pdev);
	u8 data[8];

	/* Snoop writes that overlap the MSI-X table. */
	if (bar == dev->msix_bar && offset + size > dev->msix_offset &&
	    offset < dev->msix_offset + dev->msix_size)
		WARN_ON(uml_vfio_update_msix_table(dev, offset, size, val));

	switch (size) {
	case 1:
		data[0] = (u8)val;
		break;
	case 2:
		put_unaligned_le16(val, (void *)data);
		break;
	case 4:
		put_unaligned_le32(val, (void *)data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		put_unaligned_le64(val, (void *)data);
		break;
#endif
	}

	uml_vfio_bar_copy_to(pdev, bar, offset, data, size);
}

static void uml_vfio_bar_set(struct um_pci_device *pdev, int bar,
			     unsigned int offset, u8 value, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);
	int i;

	for (i = 0; i < size; i++)
		uml_vfio_user_bar_write(&dev->udev, bar, offset + i, &value, 1);
}

static const struct um_pci_ops uml_vfio_um_pci_ops = {
	.cfgspace_read = uml_vfio_cfgspace_read,
	.cfgspace_write = uml_vfio_cfgspace_write,
	.bar_read = uml_vfio_bar_read,
	.bar_write = uml_vfio_bar_write,
	.bar_copy_from = uml_vfio_bar_copy_from,
	.bar_copy_to = uml_vfio_bar_copy_to,
	.bar_set = uml_vfio_bar_set,
};

static u8 uml_vfio_find_capability(struct uml_vfio_device *dev, u8 cap)
{
	u8 id, pos;
	u16 ent;
	int ttl = 48; /* PCI_FIND_CAP_TTL */

	pos = __uml_vfio_cfgspace_read(dev, PCI_CAPABILITY_LIST, sizeof(pos));

	while (pos && ttl--) {
		ent = __uml_vfio_cfgspace_read(dev, pos, sizeof(ent));

		id = ent & 0xff;
		if (id == 0xff)
			break;
		if (id == cap)
			return pos;

		pos = ent >> 8;
	}

	return 0;
}

static int uml_vfio_read_msix_table(struct uml_vfio_device *dev)
{
	unsigned int off;
	u16 flags;
	u32 tbl;

	off = uml_vfio_find_capability(dev, PCI_CAP_ID_MSIX);
	if (!off)
		return -ENOTSUPP;

	dev->msix_cap = off;

	tbl = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_TABLE, sizeof(tbl));
	flags = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_FLAGS, sizeof(flags));

	dev->msix_bar = tbl & PCI_MSIX_TABLE_BIR;
	dev->msix_offset = tbl & PCI_MSIX_TABLE_OFFSET;
	dev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * PCI_MSIX_ENTRY_SIZE;

	dev->msix_data = kzalloc(dev->msix_size, GFP_KERNEL);
	if (!dev->msix_data)
		return -ENOMEM;

	return 0;
}

static void uml_vfio_open_device(struct uml_vfio_device *dev)
{
	struct uml_vfio_intr_ctx *ctx;
	int err, group_id, i;

	group_id = uml_vfio_user_get_group_id(dev->name);
	if (group_id < 0) {
		pr_err("Failed to get group id (%s), error %d\n",
		       dev->name, group_id);
		goto free_dev;
	}

	dev->group = uml_vfio_open_group(group_id);
	if (dev->group < 0) {
		pr_err("Failed to open group %d (%s), error %d\n",
		       group_id, dev->name, dev->group);
		goto free_dev;
	}

	err = uml_vfio_user_setup_device(&dev->udev, dev->group, dev->name);
	if (err) {
		pr_err("Failed to setup device (%s), error %d\n",
		       dev->name, err);
		goto release_group;
	}

	err = uml_vfio_read_msix_table(dev);
	if (err) {
		pr_err("Failed to read MSI-X table (%s), error %d\n",
		       dev->name, err);
		goto teardown_udev;
	}

	dev->intr_ctx = kmalloc_array(dev->udev.irq_count,
				      sizeof(struct uml_vfio_intr_ctx),
				      GFP_KERNEL);
	if (!dev->intr_ctx) {
		pr_err("Failed to allocate interrupt context (%s)\n",
		       dev->name);
		goto free_msix;
	}

	for (i = 0; i < dev->udev.irq_count; i++) {
		ctx = &dev->intr_ctx[i];
		ctx->dev = dev;
		ctx->irq = -1;
	}

	dev->pdev.ops = &uml_vfio_um_pci_ops;

	err = um_pci_device_register(&dev->pdev);
	if (err) {
		pr_err("Failed to register UM PCI device (%s), error %d\n",
		       dev->name, err);
		goto free_intr_ctx;
	}

	return;

free_intr_ctx:
	kfree(dev->intr_ctx);
free_msix:
	kfree(dev->msix_data);
teardown_udev:
	uml_vfio_user_teardown_device(&dev->udev);
release_group:
	uml_vfio_release_group(dev->group);
free_dev:
	list_del(&dev->list);
	kfree(dev->name);
	kfree(dev);
}

static void uml_vfio_release_device(struct uml_vfio_device *dev)
{
	int i;

	for (i = 0; i < dev->udev.irq_count; i++)
		uml_vfio_deactivate_irq(dev, i);
	uml_vfio_user_update_irqs(&dev->udev);

	um_pci_device_unregister(&dev->pdev);
	kfree(dev->intr_ctx);
	kfree(dev->msix_data);
	uml_vfio_user_teardown_device(&dev->udev);
	uml_vfio_release_group(dev->group);
	list_del(&dev->list);
	kfree(dev->name);
	kfree(dev);
}

static struct uml_vfio_device *uml_vfio_find_device(const char *device)
{
	struct uml_vfio_device *dev;

	list_for_each_entry(dev, &uml_vfio_devices, list) {
		if (!strcmp(dev->name, device))
			return dev;
	}
	return NULL;
}

static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp)
{
	struct uml_vfio_device *dev;
	int fd;

	/* Open the VFIO container lazily, when the first device is added. */
	if (uml_vfio_container.fd < 0) {
		fd = uml_vfio_user_open_container();
		if (fd < 0)
			return fd;
		uml_vfio_container.fd = fd;
	}

	if (uml_vfio_find_device(device))
		return -EEXIST;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;

	dev->name = kstrdup(device, GFP_KERNEL);
	if (!dev->name) {
		kfree(dev);
		return -ENOMEM;
	}

	list_add_tail(&dev->list, &uml_vfio_devices);
	return 0;
}

static int uml_vfio_cmdline_get(char *buffer, const struct kernel_param *kp)
{
	return 0;
}

static const struct kernel_param_ops uml_vfio_cmdline_param_ops = {
	.set = uml_vfio_cmdline_set,
	.get = uml_vfio_cmdline_get,
};

device_param_cb(device, &uml_vfio_cmdline_param_ops, NULL, 0400);
__uml_help(uml_vfio_cmdline_param_ops,
"vfio_uml.device=<domain:bus:slot.function>\n"
"    Pass through a PCI device to UML via VFIO. Currently, only MSI-X\n"
"    capable devices are supported, and it is assumed that drivers will\n"
"    use MSI-X. This parameter can be specified multiple times to pass\n"
"    through multiple PCI devices to UML.\n\n"
);

static int __init uml_vfio_init(void)
{
	struct uml_vfio_device *dev, *n;

	sigio_broken();

	/* If opening a device fails, uml_vfio_open_device() releases it. */
	list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
		uml_vfio_open_device(dev);

	return 0;
}
late_initcall(uml_vfio_init);

static void __exit uml_vfio_exit(void)
{
	struct uml_vfio_device *dev, *n;

	list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
		uml_vfio_release_device(dev);

	if (uml_vfio_container.fd >= 0)
		os_close_file(uml_vfio_container.fd);
}
module_exit(uml_vfio_exit);
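
/*
 * Usage sketch (not part of the driver): before booting UML, bind the
 * host device to vfio-pci; the device address below is hypothetical.
 *
 *   # echo 0000:00:1f.6 > /sys/bus/pci/devices/0000:00:1f.6/driver/unbind
 *   # echo vfio-pci > /sys/bus/pci/devices/0000:00:1f.6/driver_override
 *   # echo 0000:00:1f.6 > /sys/bus/pci/drivers_probe
 *
 * Then pass the device through on the UML command line:
 *
 *   vfio_uml.device=0000:00:1f.6
 */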