// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//

use crate::{
    msi_num_enabled_vectors, BarReprogrammingParams, MsiConfig, MsixCap, MsixConfig,
    PciBarConfiguration, PciBarRegionType, PciCapabilityId, PciClassCode, PciConfiguration,
    PciDevice, PciDeviceError, PciHeaderType, PciSubclass, MSIX_TABLE_ENTRY_SIZE,
};
use byteorder::{ByteOrder, LittleEndian};
use std::any::Any;
use std::os::unix::io::AsRawFd;
use std::ptr::null_mut;
use std::sync::{Arc, Barrier};
use std::{fmt, io, result};
use vfio_bindings::bindings::vfio::*;
use vfio_ioctls::{VfioContainer, VfioDevice, VfioError};
use vm_allocator::SystemAllocator;
use vm_device::interrupt::{
    InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
};
use vm_device::BusDevice;
use vm_memory::{Address, GuestAddress, GuestUsize};
use vmm_sys_util::eventfd::EventFd;

#[derive(Debug)]
pub enum VfioPciError {
    AllocateGsi,
    DmaMap(VfioError),
    DmaUnmap(VfioError),
    EnableIntx(VfioError),
    EnableMsi(VfioError),
    EnableMsix(VfioError),
    EventFd(io::Error),
    InterruptSourceGroupCreate(io::Error),
    IrqFd(hypervisor::HypervisorVmError),
    MapRegionGuest(anyhow::Error),
    MissingNotifier,
    MsiNotConfigured,
    MsixNotConfigured,
    NewVfioPciDevice,
    SetGsiRouting(hypervisor::HypervisorVmError),
}
pub type Result<T> = std::result::Result<T, VfioPciError>;

impl fmt::Display for VfioPciError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            VfioPciError::AllocateGsi => write!(f, "failed to allocate GSI"),
            VfioPciError::DmaMap(e) => write!(f, "failed to DMA map: {}", e),
            VfioPciError::DmaUnmap(e) => write!(f, "failed to DMA unmap: {}", e),
            VfioPciError::EnableIntx(e) => write!(f, "failed to enable INTx: {}", e),
            VfioPciError::EnableMsi(e) => write!(f, "failed to enable MSI: {}", e),
            VfioPciError::EnableMsix(e) => write!(f, "failed to enable MSI-X: {}", e),
            VfioPciError::EventFd(e) => write!(f, "failed to create eventfd: {}", e),
            VfioPciError::InterruptSourceGroupCreate(e) => {
                write!(f, "failed to create interrupt source group: {}", e)
            }
            VfioPciError::IrqFd(e) => write!(f, "failed to register irqfd: {}", e),
            VfioPciError::MapRegionGuest(e) => {
                write!(f, "failed to map VFIO PCI region into guest: {}", e)
            }
            VfioPciError::MissingNotifier => {
                write!(f, "failed to retrieve the notifier's eventfd")
            }
            VfioPciError::MsiNotConfigured => write!(f, "MSI interrupt not yet configured"),
            VfioPciError::MsixNotConfigured => write!(f, "MSI-X interrupt not yet configured"),
            VfioPciError::NewVfioPciDevice => write!(f, "failed to create VFIO PCI device"),
            VfioPciError::SetGsiRouting(e) => write!(f, "failed to set GSI routes: {}", e),
        }
    }
}

#[derive(Copy, Clone)]
enum PciVfioSubclass {
    VfioSubclass = 0xff,
}

impl PciSubclass for PciVfioSubclass {
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}

enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}

struct VfioIntx {
    interrupt_source_group: Arc<Box<dyn InterruptSourceGroup>>,
    enabled: bool,
}

struct VfioMsi {
    cfg: MsiConfig,
    cap_offset: u32,
    interrupt_source_group: Arc<Box<dyn InterruptSourceGroup>>,
}
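// The guest controls MSI through the "Message Control" word of the MSI
// capability. By comparing the enabled state before and after each config
// space write touching the capability, update() detects when the VFIO
// interrupts must be switched on or off.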
impl VfioMsi {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.cfg.enabled();

        self.cfg.update(offset, data);

        let new_enabled = self.cfg.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsi);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsi);
        }

        None
    }
}

struct VfioMsix {
    bar: MsixConfig,
    cap: MsixCap,
    cap_offset: u32,
    interrupt_source_group: Arc<Box<dyn InterruptSourceGroup>>,
}

impl VfioMsix {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.bar.enabled();

        // Update "Message Control" word
        if offset == 2 && data.len() == 2 {
            self.bar.set_msg_ctl(LittleEndian::read_u16(data));
        }

        let new_enabled = self.bar.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsix);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsix);
        }

        None
    }

    fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        let table_offset: u64 = u64::from(self.cap.table_offset());
        let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
        let table_bir: u32 = self.cap.table_bir();

        bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
    }
}

struct Interrupt {
    intx: Option<VfioIntx>,
    msi: Option<VfioMsi>,
    msix: Option<VfioMsix>,
}

impl Interrupt {
    fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msi) = &mut self.msi {
            return msi.update(offset, data);
        }

        None
    }

    fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msix) = &mut self.msix {
            return msix.update(offset, data);
        }

        None
    }

    fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
        if let Some(msi) = &self.msi {
            if offset >= u64::from(msi.cap_offset)
                && offset < u64::from(msi.cap_offset) + msi.cfg.size()
            {
                return Some((
                    PciCapabilityId::MessageSignalledInterrupts,
                    u64::from(msi.cap_offset),
                ));
            }
        }

        if let Some(msix) = &self.msix {
            if offset == u64::from(msix.cap_offset) {
                return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
            }
        }

        None
    }

    fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        if let Some(msix) = &self.msix {
            return msix.table_accessed(bar_index, offset);
        }

        false
    }

    fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
        if let Some(ref mut msix) = &mut self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.write_table(offset, data)
        }
    }

    fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
        if let Some(msix) = &self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.read_table(offset, data)
        }
    }

    fn intx_in_use(&self) -> bool {
        if let Some(intx) = &self.intx {
            return intx.enabled;
        }

        false
    }
}
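// A BAR (or ROM) region as allocated in the guest address space. Once the
// region has been mmap()ed into the VMM and registered as a user memory
// region, mem_slot, host_addr and mmap_size record that mapping so it can
// later be moved (move_bar()) or torn down (unmap_mmio_regions()).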
#[derive(Copy, Clone)]
pub struct MmioRegion {
    pub start: GuestAddress,
    pub length: GuestUsize,
    type_: PciBarRegionType,
    index: u32,
    mem_slot: Option<u32>,
    host_addr: Option<u64>,
    mmap_size: Option<usize>,
}

struct VfioPciConfig {
    device: Arc<VfioDevice>,
}

impl VfioPciConfig {
    fn new(device: Arc<VfioDevice>) -> Self {
        VfioPciConfig { device }
    }

    fn read_config_byte(&self, offset: u32) -> u8 {
        let mut data: [u8; 1] = [0];
        self.device
            .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into());

        data[0]
    }

    fn read_config_word(&self, offset: u32) -> u16 {
        let mut data: [u8; 2] = [0, 0];
        self.device
            .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into());

        u16::from_le_bytes(data)
    }

    fn read_config_dword(&self, offset: u32) -> u32 {
        let mut data: [u8; 4] = [0, 0, 0, 0];
        self.device
            .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into());

        u32::from_le_bytes(data)
    }

    fn write_config_dword(&self, buf: u32, offset: u32) {
        let data: [u8; 4] = buf.to_le_bytes();
        self.device
            .region_write(VFIO_PCI_CONFIG_REGION_INDEX, &data, offset.into())
    }
}

/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
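///
/// A minimal construction sketch (illustrative only; `vm`, `vfio_device`,
/// `container` and `interrupt_manager` are assumed to exist, and
/// `next_mem_slot` is a hypothetical memory slot provider):
///
/// ```ignore
/// let mut vfio_pci_device = VfioPciDevice::new(
///     &vm,
///     vfio_device,
///     container,
///     &interrupt_manager,
///     None,  // no legacy INTx interrupt group
///     false, // not attached to a virtual IOMMU
/// )?;
/// // Once the PCI bus has allocated the BARs, back the mappable ones with
/// // user memory regions to avoid VM exits:
/// vfio_pci_device.map_mmio_regions(&vm, || next_mem_slot())?;
/// ```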
pub struct VfioPciDevice {
    vm: Arc<dyn hypervisor::Vm>,
    device: Arc<VfioDevice>,
    container: Arc<VfioContainer>,
    vfio_pci_configuration: VfioPciConfig,
    configuration: PciConfiguration,
    mmio_regions: Vec<MmioRegion>,
    interrupt: Interrupt,
    iommu_attached: bool,
}

impl VfioPciDevice {
    /// Constructs a new VFIO PCI device for the given VFIO device
    pub fn new(
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<Box<dyn InterruptSourceGroup>>>,
        iommu_attached: bool,
    ) -> Result<Self> {
        let device = Arc::new(device);
        device.reset();

        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            &PciVfioSubclass::VfioSubclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
        );

        let vfio_pci_configuration = VfioPciConfig::new(Arc::clone(&device));

        let mut vfio_pci_device = VfioPciDevice {
            vm: vm.clone(),
            device,
            container,
            configuration,
            vfio_pci_configuration,
            mmio_regions: Vec::new(),
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
            iommu_attached,
        };

        vfio_pci_device.parse_capabilities(msi_interrupt_manager);

        vfio_pci_device.initialize_legacy_interrupt(legacy_interrupt_group)?;

        Ok(vfio_pci_device)
    }

    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }

    fn enable_intx(&mut self) -> Result<()> {
        if let Some(intx) = &mut self.interrupt.intx {
            if !intx.enabled {
                if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
                    self.device
                        .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
                        .map_err(VfioPciError::EnableIntx)?;

                    intx.enabled = true;
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }
        }

        Ok(())
    }

    fn disable_intx(&mut self) {
        if let Some(intx) = &mut self.interrupt.intx {
            if intx.enabled {
                if let Err(e) = self.device.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                    error!("Could not disable INTx: {}", e);
                } else {
                    intx.enabled = false;
                }
            }
        }
    }

    fn enable_msi(&self) -> Result<()> {
        if let Some(msi) = &self.interrupt.msi {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msi.cfg.num_enabled_vectors() {
                if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.device
                .enable_msi(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsi)?;
        }

        Ok(())
    }

    fn disable_msi(&self) {
        if let Err(e) = self.device.disable_msi() {
            error!("Could not disable MSI: {}", e);
        }
    }

    fn enable_msix(&self) -> Result<()> {
        if let Some(msix) = &self.interrupt.msix {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msix.bar.table_entries.len() {
                if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.device
                .enable_msix(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsix)?;
        }

        Ok(())
    }

    fn disable_msix(&self) {
        if let Err(e) = self.device.disable_msix() {
            error!("Could not disable MSI-X: {}", e);
        }
    }
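    // All three interrupt flavors follow the same pattern: one eventfd per
    // vector is obtained from the interrupt source group and handed to VFIO
    // (VFIO_DEVICE_SET_IRQS under the hood), so the device can signal the
    // guest through the hypervisor's irqfd mechanism without involving the
    // VMM on the interrupt path.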
    fn initialize_legacy_interrupt(
        &mut self,
        legacy_interrupt_group: Option<Arc<Box<dyn InterruptSourceGroup>>>,
    ) -> Result<()> {
        if let Some(irq_info) = self.device.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
            if irq_info.count == 0 {
                // A count of 0 means the INTx IRQ is not supported, therefore
                // it shouldn't be initialized.
                return Ok(());
            }
        }

        if let Some(interrupt_source_group) = legacy_interrupt_group {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            self.enable_intx()?;
        }

        Ok(())
    }

    fn parse_msix_capabilities(
        &mut self,
        cap: u8,
        interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    ) {
        let msg_ctl = self
            .vfio_pci_configuration
            .read_config_word((cap + 2).into());

        let table = self
            .vfio_pci_configuration
            .read_config_dword((cap + 4).into());

        let pba = self
            .vfio_pci_configuration
            .read_config_dword((cap + 8).into());

        let msix_cap = MsixCap {
            msg_ctl,
            table,
            pba,
        };

        let interrupt_source_group = interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msix_cap.table_size() as InterruptIndex,
            })
            .unwrap();

        let msix_config = MsixConfig::new(msix_cap.table_size(), interrupt_source_group.clone(), 0);

        self.interrupt.msix = Some(VfioMsix {
            bar: msix_config,
            cap: msix_cap,
            cap_offset: cap.into(),
            interrupt_source_group,
        });
    }

    fn parse_msi_capabilities(
        &mut self,
        cap: u8,
        interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    ) {
        let msg_ctl = self
            .vfio_pci_configuration
            .read_config_word((cap + 2).into());

        let interrupt_source_group = interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
            })
            .unwrap();

        let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone());

        self.interrupt.msi = Some(VfioMsi {
            cfg: msi_config,
            cap_offset: cap.into(),
            interrupt_source_group,
        });
    }
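    // PCI capabilities form a linked list in configuration space: the byte
    // at PCI_CONFIG_CAPABILITY_OFFSET (0x34) holds the offset of the first
    // capability, each capability stores its ID in its first byte and the
    // offset of the next capability in its second byte, and an offset of 0
    // terminates the list.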
    fn parse_capabilities(
        &mut self,
        interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    ) {
        let mut cap_next = self
            .vfio_pci_configuration
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        while cap_next != 0 {
            let cap_id = self
                .vfio_pci_configuration
                .read_config_byte(cap_next.into());

            match PciCapabilityId::from(cap_id) {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Some(irq_info) = self.device.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse the capability only if the VFIO device
                            // supports MSI.
                            self.parse_msi_capabilities(cap_next, interrupt_manager);
                        }
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Some(irq_info) = self.device.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse the capability only if the VFIO device
                            // supports MSI-X.
                            self.parse_msix_capabilities(cap_next, interrupt_manager);
                        }
                    }
                }
                _ => {}
            };

            cap_next = self
                .vfio_pci_configuration
                .read_config_byte((cap_next + 1).into());
        }
    }

    fn update_msi_capabilities(&mut self, offset: u64, data: &[u8]) -> Result<()> {
        match self.interrupt.update_msi(offset, data) {
            Some(InterruptUpdateAction::EnableMsi) => {
                // Disable INTx before we can enable MSI
                self.disable_intx();
                self.enable_msi()?;
            }
            Some(InterruptUpdateAction::DisableMsi) => {
                // Fall back to INTx when disabling MSI
                self.disable_msi();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    fn update_msix_capabilities(&mut self, offset: u64, data: &[u8]) -> Result<()> {
        match self.interrupt.update_msix(offset, data) {
            Some(InterruptUpdateAction::EnableMsix) => {
                // Disable INTx before we can enable MSI-X
                self.disable_intx();
                self.enable_msix()?;
            }
            Some(InterruptUpdateAction::DisableMsix) => {
                // Fall back to INTx when disabling MSI-X
                self.disable_msix();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    fn find_region(&self, addr: u64) -> Option<MmioRegion> {
        for region in self.mmio_regions.iter() {
            if addr >= region.start.raw_value()
                && addr < region.start.unchecked_add(region.length).raw_value()
            {
                return Some(*region);
            }
        }
        None
    }

    /// Map MMIO regions into the guest, and avoid VM exits when the guest
    /// tries to reach those regions.
    ///
    /// # Arguments
    ///
    /// * `vm` - The VM object. It is used to set the VFIO MMIO regions
    ///          as user memory regions.
    /// * `mem_slot` - The closure to return a memory slot.
    pub fn map_mmio_regions<F>(&mut self, vm: &Arc<dyn hypervisor::Vm>, mem_slot: F) -> Result<()>
    where
        F: Fn() -> u32,
    {
        let fd = self.device.as_raw_fd();

        for region in self.mmio_regions.iter_mut() {
            // We want to skip the mapping of the BAR containing the MSI-X
            // table even if it is mappable. The reason is we need to trap
            // any access to the MSI-X table and update the GSI routing
            // accordingly.
            if let Some(msix) = &self.interrupt.msix {
                if region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir() {
                    continue;
                }
            }

            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }
                let (mmap_offset, mmap_size) = self.device.get_region_mmap(region.index);
                let offset = self.device.get_region_offset(region.index) + mmap_offset;

                let host_addr = unsafe {
                    libc::mmap(
                        null_mut(),
                        mmap_size as usize,
                        prot,
                        libc::MAP_SHARED,
                        fd,
                        offset as libc::off_t,
                    )
                };

                if host_addr == libc::MAP_FAILED {
                    error!(
                        "Could not mmap region, error: {}",
                        io::Error::last_os_error()
                    );
                    continue;
                }

                let slot = mem_slot();
                let mem_region = vm.make_user_memory_region(
                    slot,
                    region.start.raw_value() + mmap_offset,
                    mmap_size as u64,
                    host_addr as u64,
                    false,
                    false,
                );

                vm.create_user_memory_region(mem_region)
                    .map_err(|e| VfioPciError::MapRegionGuest(e.into()))?;

                // Update the region with the memory mapping info.
                region.mem_slot = Some(slot);
                region.host_addr = Some(host_addr as u64);
                region.mmap_size = Some(mmap_size as usize);
            }
        }

        Ok(())
    }
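    // Design note: registering the mmap()ed BARs as user memory regions lets
    // the guest access them directly, without VM exits. Only the BAR hosting
    // the MSI-X table and PBA keeps trapping, since writes to the table must
    // be observed to keep the GSI routing up to date.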
    pub fn unmap_mmio_regions(&mut self) {
        for region in self.mmio_regions.iter() {
            if let (Some(host_addr), Some(mmap_size), Some(mem_slot)) =
                (region.host_addr, region.mmap_size, region.mem_slot)
            {
                let (mmap_offset, _) = self.device.get_region_mmap(region.index);

                // Remove region
                let r = self.vm.make_user_memory_region(
                    mem_slot,
                    region.start.raw_value() + mmap_offset,
                    mmap_size as u64,
                    host_addr as u64,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                let ret = unsafe { libc::munmap(host_addr as *mut libc::c_void, mmap_size) };
                if ret != 0 {
                    error!(
                        "Could not unmap region {}, error: {}",
                        region.index,
                        io::Error::last_os_error()
                    );
                }
            }
        }
    }

    pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<()> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_map(iova, size, user_addr)
                .map_err(VfioPciError::DmaMap)?;
        }

        Ok(())
    }

    pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_unmap(iova, size)
                .map_err(VfioPciError::DmaUnmap)?;
        }

        Ok(())
    }

    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.mmio_regions.clone()
    }
}

impl Drop for VfioPciDevice {
    fn drop(&mut self) {
        self.unmap_mmio_regions();

        if let Some(msix) = &self.interrupt.msix {
            if msix.bar.enabled() {
                self.disable_msix();
            }
        }

        if let Some(msi) = &self.interrupt.msi {
            if msi.cfg.enabled() {
                self.disable_msi();
            }
        }

        if self.interrupt.intx_in_use() {
            self.disable_intx();
        }
    }
}

impl BusDevice for VfioPciDevice {
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}

// First BAR offset in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// IO BAR when first BAR bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory BAR flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device.
const BAR_NUMS: usize = 6;
// PCI Header Type register index.
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index.
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index.
const PCI_ROM_EXP_BAR_INDEX: usize = 12;
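// BAR sizing refresher: writing all 1s to a BAR register and reading it back
// leaves only the size-aligned address bits set. After masking the flag bits,
// the size is the two's complement of the value, i.e. !value + 1. For
// example, a masked read-back of 0xffff_e000 yields !0xffff_e000 + 1 =
// 0x2000, an 8 KiB BAR.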
impl PciDevice for VfioPciDevice {
    fn allocate_bars(
        &mut self,
        allocator: &mut SystemAllocator,
    ) -> std::result::Result<Vec<(GuestAddress, GuestUsize, PciBarRegionType)>, PciDeviceError>
    {
        let mut ranges = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX as u32;

        // Go through all the regular regions to compute the BAR sizes.
        // We're not saving the BAR address to restore it, because we
        // are going to allocate a guest address for each BAR and write
        // that new address back.
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let region_size: u64;
            let bar_addr: GuestAddress;

            let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                (PCI_ROM_EXP_BAR_INDEX * 4) as u32
            } else {
                PCI_CONFIG_BAR_OFFSET + bar_id * 4
            };

            // First read the flags.
            let flags = self.vfio_pci_configuration.read_config_dword(bar_offset);

            // Is this an IO BAR?
            let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
            } else {
                false
            };

            // Is this a 64-bit BAR?
            let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                matches!(
                    flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                    PCI_CONFIG_MEMORY_BAR_64BIT
                )
            } else {
                false
            };

            // By default, the region type is a 32-bit memory BAR.
            let mut region_type = PciBarRegionType::Memory32BitRegion;

            // To get the size, write all 1s.
            self.vfio_pci_configuration
                .write_config_dword(0xffff_ffff, bar_offset);

            // And read back the BAR value. The device will write zeros for
            // bits it doesn't care about.
            let mut lower = self.vfio_pci_configuration.read_config_dword(bar_offset);

            if io_bar {
                #[cfg(target_arch = "x86_64")]
                {
                    // IO BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Mask the flag bits (lowest 2 for I/O BARs).
                    lower &= !0b11;

                    // BAR is not enabled.
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert the bits and add 1 to calculate the size.
                    region_size = (!lower + 1) as u64;

                    // The address needs to be 4 bytes aligned.
                    bar_addr = allocator
                        .allocate_io_addresses(None, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
                }
                #[cfg(target_arch = "aarch64")]
                unimplemented!()
            } else if is_64bit_bar {
                // 64-bit memory BAR
                region_type = PciBarRegionType::Memory64BitRegion;

                // Query the size of the upper half of the 64-bit BAR.
                let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                self.vfio_pci_configuration
                    .write_config_dword(0xffff_ffff, upper_offset);
                let upper = self.vfio_pci_configuration.read_config_dword(upper_offset);

                let mut combined_size = u64::from(upper) << 32 | u64::from(lower);

                // Mask out the flag bits (lowest 4 for memory BARs).
                combined_size &= !0b1111;

                // BAR is not enabled.
                if combined_size == 0 {
                    bar_id += 1;
                    continue;
                }

                // Invert the bits and add 1 to calculate the size.
                region_size = (!combined_size + 1) as u64;

                // BAR allocation must be naturally aligned.
                bar_addr = allocator
                    .allocate_mmio_addresses(None, region_size, Some(region_size))
                    .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
            } else {
                // Mask out the flag bits (lowest 4 for memory BARs).
                lower &= !0b1111;

                if lower == 0 {
                    bar_id += 1;
                    continue;
                }

                // Invert the bits and add 1 to calculate the size.
                region_size = (!lower + 1) as u64;

                // BAR allocation must be naturally aligned.
                bar_addr = allocator
                    .allocate_mmio_hole_addresses(None, region_size, Some(region_size))
                    .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
            }

            let reg_idx = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                PCI_ROM_EXP_BAR_INDEX
            } else {
                bar_id as usize
            };

            // We can now build our BAR configuration block.
            let config = PciBarConfiguration::default()
                .set_register_index(reg_idx)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                self.configuration
                    .add_pci_rom_bar(&config, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&config)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            ranges.push((bar_addr, region_size, region_type));
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id as u32,
                mem_slot: None,
                host_addr: None,
                mmap_size: None,
            });

            bar_id += 1;
            if is_64bit_bar {
                // A 64-bit BAR spans two consecutive registers, so skip the
                // upper half.
                bar_id += 1;
            }
        }

        Ok(ranges)
    }
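    // Note on the allocators used above: I/O BARs come from the PIO space,
    // 32-bit memory BARs from the 32-bit MMIO hole and 64-bit memory BARs
    // from the full MMIO range. Passing the region size as the alignment
    // enforces the natural (size) alignment that PCI BARs require.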
    fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
    ) -> std::result::Result<(), PciDeviceError> {
        for region in self.mmio_regions.iter() {
            match region.type_ {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "x86_64")]
                    allocator.free_io_addresses(region.start, region.length);
                    #[cfg(target_arch = "aarch64")]
                    error!("I/O region is not supported");
                }
                PciBarRegionType::Memory32BitRegion => {
                    allocator.free_mmio_hole_addresses(region.start, region.length);
                }
                PciBarRegionType::Memory64BitRegion => {
                    allocator.free_mmio_addresses(region.start, region.length);
                }
            }
        }
        Ok(())
    }

    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming the
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through the VFIO_DEVICE_SET_IRQS ioctl, and only then can we
        // write to the device region to update the MSI Enable bit.
        self.device
            .region_write(VFIO_PCI_CONFIG_REGION_INDEX, data, reg + offset);

        None
    }
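    // Register 3 packs, from LSB to MSB: Cache Line Size, Latency Timer,
    // Header Type and BIST. The 0xff7f_ffff mask below clears bit 23, i.e.
    // bit 7 of the Header Type byte, which is the multi-function flag.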
    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        // When reading the BARs, we trap it and return what comes
        // from our local configuration space. We want the guest to
        // use that and not the VFIO device BARs, as those don't map
        // into the guest address space.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            return self.configuration.read_reg(reg_idx);
        }

        // Since we don't support passing multi-function devices, we should
        // mask the multi-function bit, bit 7 of the Header Type byte on
        // register 3.
        let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
            0xff7f_ffff
        } else {
            0xffff_ffff
        };

        // The config register read comes from the VFIO device itself.
        self.vfio_pci_configuration
            .read_config_dword((reg_idx * 4) as u32)
            & mask
    }

    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.configuration.detect_bar_reprogramming(reg_idx, data)
    }

    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_read_table(offset, data);
            } else {
                self.device.region_read(region.index, data, offset);
            }
        }

        // INTx EOI
        // The guest reading from the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.device.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }
    }

    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            // If the MSI-X table is written to, we need to update our cache.
            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_write_table(offset, data);
            } else {
                self.device.region_write(region.index, data, offset);
            }
        }

        // INTx EOI
        // The guest writing to the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.device.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }

        None
    }
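    // When the guest reprograms a BAR, detect_bar_reprogramming() (above)
    // reports the old and new addresses, and move_bar() re-registers the
    // mmap()ed user memory region, if any, at the new guest address.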
    fn move_bar(&mut self, old_base: u64, new_base: u64) -> result::Result<(), io::Error> {
        for region in self.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                if let Some(mem_slot) = region.mem_slot {
                    if let Some(host_addr) = region.host_addr {
                        let (mmap_offset, mmap_size) = self.device.get_region_mmap(region.index);

                        // Remove old region
                        let old_mem_region = self.vm.make_user_memory_region(
                            mem_slot,
                            old_base + mmap_offset,
                            mmap_size as u64,
                            host_addr as u64,
                            false,
                            false,
                        );

                        self.vm
                            .remove_user_memory_region(old_mem_region)
                            .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

                        // Insert new region
                        let new_mem_region = self.vm.make_user_memory_region(
                            mem_slot,
                            new_base + mmap_offset,
                            mmap_size as u64,
                            host_addr as u64,
                            false,
                            false,
                        );

                        self.vm
                            .create_user_memory_region(new_mem_region)
                            .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
                    }
                }
            }
        }

        Ok(())
    }

    fn as_any(&mut self) -> &mut dyn Any {
        self
    }
}