// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//

use std::any::Any;
use std::collections::{BTreeMap, HashMap};
use std::io;
use std::os::unix::io::AsRawFd;
use std::path::PathBuf;
use std::ptr::null_mut;
use std::sync::{Arc, Barrier, Mutex};

use anyhow::anyhow;
use byteorder::{ByteOrder, LittleEndian};
use hypervisor::HypervisorVmError;
use libc::{sysconf, _SC_PAGESIZE};
use serde::{Deserialize, Serialize};
use thiserror::Error;
use vfio_bindings::bindings::vfio::*;
use vfio_ioctls::{
    VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
};
use vm_allocator::page_size::{
    align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned,
};
use vm_allocator::{AddressAllocator, MemorySlotAllocator, SystemAllocator};
use vm_device::dma_mapping::ExternalDmaMapping;
use vm_device::interrupt::{
    InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
};
use vm_device::{BusDevice, Resource};
use vm_memory::{Address, GuestAddress, GuestAddressSpace, GuestMemory, GuestUsize};
use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable};
use vmm_sys_util::eventfd::EventFd;

use crate::msi::{MsiConfigState, MSI_CONFIG_ID};
use crate::msix::MsixConfigState;
use crate::{
    msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
    PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
    PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
    PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID,
};

pub(crate) const VFIO_COMMON_ID: &str = "vfio_common";

#[derive(Debug, Error)]
pub enum VfioPciError {
    #[error("Failed to create user memory region: {0}")]
    CreateUserMemoryRegion(#[source] HypervisorVmError),
    #[error("Failed to DMA map: {0} for device {1} (guest BDF: {2})")]
    DmaMap(#[source] vfio_ioctls::VfioError, PathBuf, PciBdf),
    #[error("Failed to DMA unmap: {0} for device {1} (guest BDF: {2})")]
    DmaUnmap(#[source] vfio_ioctls::VfioError, PathBuf, PciBdf),
    #[error("Failed to enable INTx: {0}")]
    EnableIntx(#[source] VfioError),
    #[error("Failed to enable MSI: {0}")]
    EnableMsi(#[source] VfioError),
    #[error("Failed to enable MSI-X: {0}")]
    EnableMsix(#[source] VfioError),
    #[error("Failed to mmap the area")]
    MmapArea,
    #[error("Failed to get the notifier's eventfd")]
    MissingNotifier,
    #[error("Invalid region alignment")]
    RegionAlignment,
    #[error("Invalid region size")]
    RegionSize,
    #[error("Failed to retrieve MsiConfigState: {0}")]
    RetrieveMsiConfigState(#[source] anyhow::Error),
    #[error("Failed to retrieve MsixConfigState: {0}")]
    RetrieveMsixConfigState(#[source] anyhow::Error),
    #[error("Failed to retrieve PciConfigurationState: {0}")]
    RetrievePciConfigurationState(#[source] anyhow::Error),
    #[error("Failed to retrieve VfioCommonState: {0}")]
    RetrieveVfioCommonState(#[source] anyhow::Error),
}

#[derive(Copy, Clone)]
enum PciVfioSubclass {
    VfioSubclass = 0xff,
}

impl PciSubclass for PciVfioSubclass {
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}

enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}

#[derive(Serialize, Deserialize)]
struct IntxState {
    enabled: bool,
}

pub(crate) struct VfioIntx {
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    enabled: bool,
}

#[derive(Serialize, Deserialize)]
struct MsiState {
    cap: MsiCap,
    cap_offset: u32,
}

pub(crate) struct VfioMsi {
    pub(crate) cfg: MsiConfig,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsi {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.cfg.enabled();

        self.cfg.update(offset, data);

        let new_enabled = self.cfg.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsi);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsi);
        }

        None
    }
}

#[derive(Serialize, Deserialize)]
struct MsixState {
    cap: MsixCap,
    cap_offset: u32,
    bdf: u32,
}

pub(crate) struct VfioMsix {
    pub(crate) bar: MsixConfig,
    cap: MsixCap,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsix {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.bar.enabled();

        // Update "Message Control" word
        if offset == 2 && data.len() == 2 {
            self.bar.set_msg_ctl(LittleEndian::read_u16(data));
        }

        let new_enabled = self.bar.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsix);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsix);
        }

        None
    }

    fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        let table_offset: u64 = u64::from(self.cap.table_offset());
        let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
        let table_bir: u32 = self.cap.table_bir();

        bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
    }
}

pub(crate) struct Interrupt {
    pub(crate) intx: Option<VfioIntx>,
    pub(crate) msi: Option<VfioMsi>,
    pub(crate) msix: Option<VfioMsix>,
}

impl Interrupt {
    fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msi) = &mut self.msi {
            let action = msi.update(offset, data);
            return action;
        }

        None
    }

    fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msix) = &mut self.msix {
            let action = msix.update(offset, data);
            return action;
        }

        None
    }

    fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
        if let Some(msi) = &self.msi {
            if offset >= u64::from(msi.cap_offset)
                && offset < u64::from(msi.cap_offset) + msi.cfg.size()
            {
                return Some((
                    PciCapabilityId::MessageSignalledInterrupts,
                    u64::from(msi.cap_offset),
                ));
            }
        }

        if let Some(msix) = &self.msix {
            if offset == u64::from(msix.cap_offset) {
                return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
            }
        }

        None
    }

    fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        if let Some(msix) = &self.msix {
            return msix.table_accessed(bar_index, offset);
        }

        false
    }

    fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
        if let Some(ref mut msix) = &mut self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.write_table(offset, data)
        }
    }

    fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
        if let Some(msix) = &self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.read_table(offset, data)
        }
    }

    pub(crate) fn intx_in_use(&self) -> bool {
        if let Some(intx) = &self.intx {
            return intx.enabled;
        }

        false
    }
}

#[derive(Copy, Clone)]
pub struct UserMemoryRegion {
    pub slot: u32,
    pub start: u64,
    pub size: u64,
    pub host_addr: u64,
}

#[derive(Clone)]
pub struct MmioRegion {
    pub start: GuestAddress,
    pub length: GuestUsize,
    pub(crate) type_: PciBarRegionType,
    pub(crate) index: u32,
    pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
}

trait MmioRegionRange {
    fn check_range(&self, guest_addr: u64, size: u64) -> bool;
    fn find_user_address(&self, guest_addr: u64) -> Result<u64, io::Error>;
}

impl MmioRegionRange for Vec<MmioRegion> {
    // Check if a guest address is within the range of mmio regions
    fn check_range(&self, guest_addr: u64, size: u64) -> bool {
        for region in self.iter() {
            let Some(guest_addr_end) = guest_addr.checked_add(size) else {
                return false;
            };
            let Some(region_end) = region.start.raw_value().checked_add(region.length) else {
                return false;
            };
            if guest_addr >= region.start.raw_value() && guest_addr_end <= region_end {
                return true;
            }
        }
        false
    }

    // Locate the user region address for a guest address within all mmio regions
    fn find_user_address(&self, guest_addr: u64) -> Result<u64, io::Error> {
        for region in self.iter() {
            for user_region in region.user_memory_regions.iter() {
                if guest_addr >= user_region.start
                    && guest_addr < user_region.start + user_region.size
                {
                    return Ok(user_region.host_addr + (guest_addr - user_region.start));
                }
            }
        }

        Err(io::Error::other(format!(
            "unable to find user address: 0x{guest_addr:x}"
        )))
    }
}

#[derive(Debug, Error)]
pub enum VfioError {
    #[error("Kernel VFIO error: {0}")]
    KernelVfio(#[source] vfio_ioctls::VfioError),
    #[error("VFIO user error: {0}")]
    VfioUser(#[source] vfio_user::Error),
}

pub(crate) trait Vfio: Send + Sync {
    fn read_config_byte(&self, offset: u32) -> u8 {
        let mut data: [u8; 1] = [0];
        self.read_config(offset, &mut data);
        data[0]
    }

    fn read_config_word(&self, offset: u32) -> u16 {
        let mut data: [u8; 2] = [0, 0];
        self.read_config(offset, &mut data);
        u16::from_le_bytes(data)
    }

    fn read_config_dword(&self, offset: u32) -> u32 {
        let mut data: [u8; 4] = [0, 0, 0, 0];
        self.read_config(offset, &mut data);
        u32::from_le_bytes(data)
    }

    fn write_config_dword(&self, offset: u32, buf: u32) {
        let data: [u8; 4] = buf.to_le_bytes();
        self.write_config(offset, &data)
    }

    fn read_config(&self, offset: u32, data: &mut [u8]) {
        self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
    }

    fn write_config(&self, offset: u32, data: &[u8]) {
        self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
    }

    fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
    }

    fn disable_msi(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
    }

    fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
    }

    fn disable_msix(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
    }

    fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
        unimplemented!()
    }

    fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
        unimplemented!()
    }

    fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
        unimplemented!()
    }

    fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }
}

struct VfioDeviceWrapper {
    device: Arc<VfioDevice>,
}

impl VfioDeviceWrapper {
    fn new(device: Arc<VfioDevice>) -> Self {
        Self { device }
    }
}

impl Vfio for VfioDeviceWrapper {
    fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
        self.device.region_read(index, data, offset)
    }

    fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
        self.device.region_write(index, data, offset)
    }

    fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
        self.device.get_irq_info(irq_index).copied()
    }

    fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.device
            .enable_irq(irq_index, event_fds)
            .map_err(VfioError::KernelVfio)
    }

    fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .disable_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }

    fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .unmask_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }
}

#[derive(Serialize, Deserialize)]
struct VfioCommonState {
    intx_state: Option<IntxState>,
    msi_state: Option<MsiState>,
    msix_state: Option<MsixState>,
}

pub(crate) struct ConfigPatch {
    mask: u32,
    patch: u32,
}

pub(crate) struct VfioCommon {
    pub(crate) configuration: PciConfiguration,
    pub(crate) mmio_regions: Vec<MmioRegion>,
    pub(crate) interrupt: Interrupt,
    pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
    pub(crate) vfio_wrapper: Arc<dyn Vfio>,
    pub(crate) patches: HashMap<usize, ConfigPatch>,
    x_nv_gpudirect_clique: Option<u8>,
}

impl VfioCommon {
    pub(crate) fn new(
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        vfio_wrapper: Arc<dyn Vfio>,
        subclass: &dyn PciSubclass,
        bdf: PciBdf,
        snapshot: Option<Snapshot>,
        x_nv_gpudirect_clique: Option<u8>,
    ) -> Result<Self, VfioPciError> {
        let pci_configuration_state =
            vm_migration::state_from_id(snapshot.as_ref(), PCI_CONFIGURATION_ID).map_err(|e| {
                VfioPciError::RetrievePciConfigurationState(anyhow!(
                    "Failed to get PciConfigurationState from Snapshot: {}",
                    e
                ))
            })?;

        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            subclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
            pci_configuration_state,
        );

        let mut vfio_common = VfioCommon {
            mmio_regions: Vec::new(),
            configuration,
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
            msi_interrupt_manager,
            legacy_interrupt_group,
            vfio_wrapper,
            patches: HashMap::new(),
            x_nv_gpudirect_clique,
        };

        let state: Option<VfioCommonState> = snapshot
            .as_ref()
            .map(|s| s.to_state())
            .transpose()
            .map_err(|e| {
                VfioPciError::RetrieveVfioCommonState(anyhow!(
                    "Failed to get VfioCommonState from Snapshot: {}",
                    e
                ))
            })?;
        let msi_state =
            vm_migration::state_from_id(snapshot.as_ref(), MSI_CONFIG_ID).map_err(|e| {
                VfioPciError::RetrieveMsiConfigState(anyhow!(
                    "Failed to get MsiConfigState from Snapshot: {}",
                    e
                ))
            })?;
        let msix_state =
            vm_migration::state_from_id(snapshot.as_ref(), MSIX_CONFIG_ID).map_err(|e| {
                VfioPciError::RetrieveMsixConfigState(anyhow!(
                    "Failed to get MsixConfigState from Snapshot: {}",
                    e
                ))
            })?;

        if let Some(state) = state.as_ref() {
            vfio_common.set_state(state, msi_state, msix_state)?;
        } else {
            vfio_common.parse_capabilities(bdf);
            vfio_common.initialize_legacy_interrupt()?;
        }

        Ok(vfio_common)
    }

    /// If the MSI-X table offset is not page-size aligned, fix it up so that
    /// the MMIO RW region and the trap region don't overlap each other.
    fn fixup_msix_region(&mut self, bar_id: u32, region_size: u64) -> u64 {
        if let Some(msix) = self.interrupt.msix.as_mut() {
            let msix_cap = &mut msix.cap;

            // We assume table_bir equals pba_bir here; this fixup only
            // handles that layout.
            let (table_offset, table_size) = msix_cap.table_range();
            if is_page_size_aligned(table_offset) || msix_cap.table_bir() != bar_id {
                return region_size;
            }

            let (pba_offset, pba_size) = msix_cap.pba_range();
            let msix_sz = align_page_size_up(table_size + pba_size);
            // Expand the region so it can hold both the RW region and the
            // trap region, each page-size aligned.
            let size = std::cmp::max(region_size * 2, msix_sz * 2);
            // Let the table start from the middle of the region.
            msix_cap.table_set_offset((size / 2) as u32);
            msix_cap.pba_set_offset((size / 2 + pba_offset - table_offset) as u32);

            size
        } else {
            // MSI-X not supported for this device
            region_size
        }
    }

    // The `allocator` argument is unused on `aarch64`
    #[allow(unused_variables)]
    pub(crate) fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        let mut bars = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX;

        // Going through all regular regions to compute the BAR size.
        // We're not saving the BAR address to restore it, because we
        // are going to allocate a guest address for each BAR and write
        // that new address back.
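        // Sizing follows the standard PCI BAR probe: write all 1s to the BAR,
        // read it back, mask the flag bits, and compute size = !value + 1.
        // For example (illustrative value), a read-back of 0xffff_c00c masks
        // to 0xffff_c000 and yields a 16 KiB (0x4000) BAR.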
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let mut region_size: u64 = 0;
            let mut region_type = PciBarRegionType::Memory32BitRegion;
            let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
            let mut flags: u32 = 0;

            let mut restored_bar_addr = None;
            if let Some(resources) = &resources {
                for resource in resources {
                    if let Resource::PciBar {
                        index,
                        base,
                        size,
                        type_,
                        ..
                    } = resource
                    {
                        if *index == bar_id as usize {
                            restored_bar_addr = Some(GuestAddress(*base));
                            region_size = *size;
                            region_type = PciBarRegionType::from(*type_);
                            break;
                        }
                    }
                }
                if restored_bar_addr.is_none() {
                    bar_id += 1;
                    continue;
                }
            } else {
                let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                    (PCI_ROM_EXP_BAR_INDEX * 4) as u32
                } else {
                    PCI_CONFIG_BAR_OFFSET + bar_id * 4
                };

                // First read the flags
                flags = self.vfio_wrapper.read_config_dword(bar_offset);

                // Is this an IO BAR?
                let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
                } else {
                    false
                };

                // Is this a 64-bit BAR?
                let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(
                        flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                        PCI_CONFIG_MEMORY_BAR_64BIT
                    )
                } else {
                    false
                };

                if matches!(
                    flags & PCI_CONFIG_BAR_PREFETCHABLE,
                    PCI_CONFIG_BAR_PREFETCHABLE
                ) {
                    prefetchable = PciBarPrefetchable::Prefetchable
                };

                // To get the size, write all 1s
                self.vfio_wrapper
                    .write_config_dword(bar_offset, 0xffff_ffff);

                // And read back the BAR value. The device will write zeros for bits it doesn't care about
                let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);

                if io_bar {
                    // Mask flag bits (lowest 2 for I/O BARs)
                    lower &= !0b11;

                    // BAR is not enabled
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // IO BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Invert bits and add 1 to calculate the size
                    region_size = (!lower + 1) as u64;
                } else if is_64bit_bar {
                    // 64-bit memory BAR
                    region_type = PciBarRegionType::Memory64BitRegion;

                    // Query the size of the upper half of the 64-bit BAR
                    let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                    self.vfio_wrapper
                        .write_config_dword(upper_offset, 0xffff_ffff);
                    let upper = self.vfio_wrapper.read_config_dword(upper_offset);

                    let mut combined_size = (u64::from(upper) << 32) | u64::from(lower);

                    // Mask out flag bits (lowest 4 for memory BARs)
                    combined_size &= !0b1111;

                    // BAR is not enabled
                    if combined_size == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find the size
                    region_size = !combined_size + 1;
                } else {
                    region_type = PciBarRegionType::Memory32BitRegion;

                    // Mask out flag bits (lowest 4 for memory BARs)
                    lower &= !0b1111;

                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find the size
                    region_size = (!lower + 1) as u64;
                }
            }

            let bar_addr = match region_type {
                PciBarRegionType::IoRegion => {
                    // The address needs to be 4 bytes aligned.
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory32BitRegion => {
                    // BAR allocation must be naturally aligned
                    mmio32_allocator
                        .allocate(restored_bar_addr, region_size, Some(region_size))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory64BitRegion => {
                    // We need to do some fixup to keep the MMIO RW region and
                    // the MSI-X cap region page-size aligned.
                    region_size = self.fixup_msix_region(bar_id, region_size);
                    mmio64_allocator
                        .allocate(
                            restored_bar_addr,
                            region_size,
                            Some(std::cmp::max(
                                // SAFETY: FFI call. Trivially safe.
                                unsafe { sysconf(_SC_PAGESIZE) as GuestUsize },
                                region_size,
                            )),
                        )
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
            };

            // We can now build our BAR configuration block.
            let bar = PciBarConfiguration::default()
                .set_index(bar_id as usize)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type)
                .set_prefetchable(prefetchable);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                self.configuration
                    .add_pci_rom_bar(&bar, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&bar)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            bars.push(bar);
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id,
                user_memory_regions: Vec::new(),
            });

            bar_id += 1;
            if region_type == PciBarRegionType::Memory64BitRegion {
                bar_id += 1;
            }
        }

        Ok(bars)
    }

    // The `allocator` argument is unused on `aarch64`
    #[allow(unused_variables)]
    pub(crate) fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        for region in self.mmio_regions.iter() {
            match region.type_ {
                PciBarRegionType::IoRegion => {
                    allocator.free_io_addresses(region.start, region.length);
                }
                PciBarRegionType::Memory32BitRegion => {
                    mmio32_allocator.free(region.start, region.length);
                }
                PciBarRegionType::Memory64BitRegion => {
                    mmio64_allocator.free(region.start, region.length);
                }
            }
        }
        Ok(())
    }

    fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
        let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());

        let table = self.vfio_wrapper.read_config_dword((cap + 4).into());

        let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());

        MsixCap {
            msg_ctl,
            table,
            pba,
        }
    }

    fn initialize_msix(
        &mut self,
        msix_cap: MsixCap,
        cap_offset: u32,
        bdf: PciBdf,
        state: Option<MsixConfigState>,
    ) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msix_cap.table_size() as InterruptIndex,
            })
            .unwrap();

        let msix_config = MsixConfig::new(
            msix_cap.table_size(),
            interrupt_source_group.clone(),
            bdf.into(),
            state,
        )
        .unwrap();

        self.interrupt.msix = Some(VfioMsix {
            bar: msix_config,
            cap: msix_cap,
            cap_offset,
            interrupt_source_group,
        });
    }

    fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
        self.vfio_wrapper.read_config_word((cap + 2).into())
    }

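    // Per the PCI spec, the Multiple Message Enable field (bits 6:4 of the
    // MSI Message Control word) encodes the number of enabled vectors as a
    // power of two; msi_num_enabled_vectors() decodes that field into a
    // plain vector count for the interrupt group created below.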
    fn initialize_msi(&mut self, msg_ctl: u16, cap_offset: u32, state: Option<MsiConfigState>) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
            })
            .unwrap();

        let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap();

        self.interrupt.msi = Some(VfioMsi {
            cfg: msi_config,
            cap_offset,
            interrupt_source_group,
        });
    }

    /// Returns true if the device claims to have a PCI capability list.
    fn has_capabilities(&self) -> bool {
        let status = self.vfio_wrapper.read_config_word(PCI_CONFIG_STATUS_OFFSET);
        status & PCI_CONFIG_STATUS_CAPABILITIES_LIST != 0
    }

    fn get_msix_cap_idx(&self) -> Option<usize> {
        if !self.has_capabilities() {
            return None;
        }

        let mut cap_next = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET)
            & PCI_CONFIG_CAPABILITY_PTR_MASK;

        while cap_next != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());
            if PciCapabilityId::from(cap_id) == PciCapabilityId::MsiX {
                return Some(cap_next as usize);
            } else {
                let cap_ptr = self.vfio_wrapper.read_config_byte((cap_next + 1).into())
                    & PCI_CONFIG_CAPABILITY_PTR_MASK;

                // See parse_capabilities below for an explanation.
                if cap_ptr != cap_next {
                    cap_next = cap_ptr;
                } else {
                    break;
                }
            }
        }

        None
    }

    fn parse_capabilities(&mut self, bdf: PciBdf) {
        if !self.has_capabilities() {
            return;
        }

        let mut cap_iter = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET)
            & PCI_CONFIG_CAPABILITY_PTR_MASK;

        let mut pci_express_cap_found = false;
        let mut power_management_cap_found = false;

        while cap_iter != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_iter.into());

            match PciCapabilityId::from(cap_id) {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse the capability only if the VFIO device
                            // supports MSI.
                            let msg_ctl = self.parse_msi_capabilities(cap_iter);
                            self.initialize_msi(msg_ctl, cap_iter as u32, None);
                        }
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
                    {
                        if irq_info.count > 0 {
                            // Parse the capability only if the VFIO device
                            // supports MSI-X.
                            let msix_cap = self.parse_msix_capabilities(cap_iter);
                            self.initialize_msix(msix_cap, cap_iter as u32, bdf, None);
                        }
                    }
                }
                PciCapabilityId::PciExpress => pci_express_cap_found = true,
                PciCapabilityId::PowerManagement => power_management_cap_found = true,
                _ => {}
            };

            let cap_next = self.vfio_wrapper.read_config_byte((cap_iter + 1).into())
                & PCI_CONFIG_CAPABILITY_PTR_MASK;

            // Break out of the loop if we either find the end or we have a broken device. This
            // doesn't handle all cases where a device might send us in a loop here, but it
            // handles the case of a device returning 0xFF instead of implementing a real
            // capabilities list.
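            // For example, a device answering every config read with 0xFF
            // produces cap_next == 0xfc == cap_iter after masking on each
            // iteration, which trips the equality check below.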
            if cap_next == 0 || cap_next == cap_iter {
                break;
            }

            cap_iter = cap_next;
        }

        if let Some(clique_id) = self.x_nv_gpudirect_clique {
            self.add_nv_gpudirect_clique_cap(cap_iter, clique_id);
        }

        if pci_express_cap_found && power_management_cap_found {
            self.parse_extended_capabilities();
        }
    }

    fn add_nv_gpudirect_clique_cap(&mut self, cap_iter: u8, clique_id: u8) {
        // Turing, Ampere, Hopper, and Lovelace GPUs have dedicated space
        // at 0xD4 for this capability.
        let cap_offset = 0xd4u32;

        let reg_idx = (cap_iter / 4) as usize;
        self.patches.insert(
            reg_idx,
            ConfigPatch {
                mask: 0x0000_ff00,
                patch: cap_offset << 8,
            },
        );

        let reg_idx = (cap_offset / 4) as usize;
        self.patches.insert(
            reg_idx,
            ConfigPatch {
                mask: 0xffff_ffff,
                patch: 0x50080009u32,
            },
        );
        self.patches.insert(
            reg_idx + 1,
            ConfigPatch {
                mask: 0xffff_ffff,
                patch: (u32::from(clique_id) << 19) | 0x5032,
            },
        );
    }

    fn parse_extended_capabilities(&mut self) {
        let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;

        loop {
            let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);

            let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
            let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;

            match PciExpressCapabilityId::from(cap_id) {
                PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation
                | PciExpressCapabilityId::ResizeableBar
                | PciExpressCapabilityId::SingleRootIoVirtualization => {
                    let reg_idx = (current_offset / 4) as usize;
                    self.patches.insert(
                        reg_idx,
                        ConfigPatch {
                            mask: 0x0000_ffff,
                            patch: PciExpressCapabilityId::NullCapability as u32,
                        },
                    );
                }
                _ => {}
            }

            if cap_next == 0 {
                break;
            }

            current_offset = cap_next.into();
        }
    }

    pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
        if let Some(intx) = &mut self.interrupt.intx {
            if !intx.enabled {
                if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
                    self.vfio_wrapper
                        .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
                        .map_err(VfioPciError::EnableIntx)?;

                    intx.enabled = true;
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }
        }

        Ok(())
    }

    pub(crate) fn disable_intx(&mut self) {
        if let Some(intx) = &mut self.interrupt.intx {
            if intx.enabled {
                if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                    error!("Could not disable INTx: {}", e);
                } else {
                    intx.enabled = false;
                }
            }
        }
    }

    pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
        if let Some(msi) = &self.interrupt.msi {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msi.cfg.num_enabled_vectors() {
                if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msi(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsi)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msi(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msi() {
            error!("Could not disable MSI: {}", e);
        }
    }

    pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
        if let Some(msix) = &self.interrupt.msix {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msix.bar.table_entries.len() {
                if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msix(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsix)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msix(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msix() {
            error!("Could not disable MSI-X: {}", e);
        }
    }

    fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
        if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
            if irq_info.count == 0 {
                // A count of 0 means the INTx IRQ is not supported, therefore
                // it shouldn't be initialized.
                return Ok(());
            }
        }

        if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            self.enable_intx()?;
        }

        Ok(())
    }

    fn update_msi_capabilities(&mut self, offset: u64, data: &[u8]) -> Result<(), VfioPciError> {
        match self.interrupt.update_msi(offset, data) {
            Some(InterruptUpdateAction::EnableMsi) => {
                // Disable INTx before we can enable MSI
                self.disable_intx();
                self.enable_msi()?;
            }
            Some(InterruptUpdateAction::DisableMsi) => {
                // Fall back to INTx when disabling MSI
                self.disable_msi();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    fn update_msix_capabilities(&mut self, offset: u64, data: &[u8]) -> Result<(), VfioPciError> {
        match self.interrupt.update_msix(offset, data) {
            Some(InterruptUpdateAction::EnableMsix) => {
                // Disable INTx before we can enable MSI-X
                self.disable_intx();
                self.enable_msix()?;
            }
            Some(InterruptUpdateAction::DisableMsix) => {
                // Fall back to INTx when disabling MSI-X
                self.disable_msix();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    fn find_region(&self, addr: u64) -> Option<MmioRegion> {
        for region in self.mmio_regions.iter() {
            if addr >= region.start.raw_value()
                && addr < region.start.unchecked_add(region.length).raw_value()
            {
                return Some(region.clone());
            }
        }
        None
    }

    pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_read_table(offset, data);
            } else {
                self.vfio_wrapper.region_read(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest reading from the BAR potentially means the interrupt has
        // been received and can be acknowledged.
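        // VFIO masks the (level-triggered) INTx line once it fires; unmasking
        // it here re-arms the interrupt so the next one can be delivered.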
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }
    }

    pub(crate) fn write_bar(
        &mut self,
        base: u64,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            // If the MSI-X table is written to, we need to update our cache.
            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_write_table(offset, data);
            } else {
                self.vfio_wrapper.region_write(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest writing to the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }

        None
    }

    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> (Vec<BarReprogrammingParams>, Option<Arc<Barrier>>) {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming the
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            return (
                self.configuration
                    .write_config_register(reg_idx, offset, data),
                None,
            );
        }

        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through the VFIO_DEVICE_SET_IRQS ioctl, and only then can we
        // write to the device region to update the MSI Enable bit.
        self.vfio_wrapper.write_config((reg + offset) as u32, data);

        // Return pending BAR reprogramming if the MSE bit is set
        let mut ret_param = self.configuration.pending_bar_reprogram();
        if !ret_param.is_empty() {
            if self.read_config_register(crate::configuration::COMMAND_REG)
                & crate::configuration::COMMAND_REG_MEMORY_SPACE_MASK
                == crate::configuration::COMMAND_REG_MEMORY_SPACE_MASK
            {
                info!("BAR reprogramming parameter is returned: {:x?}", ret_param);
                self.configuration.clear_pending_bar_reprogram();
            } else {
                info!(
                    "MSE bit is disabled. No BAR reprogramming parameter is returned: {:x?}",
                    ret_param
                );

                ret_param = Vec::new();
            }
        }

        (ret_param, None)
    }

    pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        // When reading the BARs, we trap it and return what comes
        // from our local configuration space. We want the guest to
        // use that and not the VFIO device BARs, as those do not map
        // into the guest address space.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            return self.configuration.read_reg(reg_idx);
        }

        if let Some(id) = self.get_msix_cap_idx() {
            let msix = self.interrupt.msix.as_mut().unwrap();
            if reg_idx * 4 == id + 4 {
                return msix.cap.table;
            } else if reg_idx * 4 == id + 8 {
                return msix.cap.pba;
            }
        }

        // Since we don't support passing multi-function devices, we should
        // mask the multi-function bit, bit 7 of the Header Type byte in
        // register 3.
        let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
            0xff7f_ffff
        } else {
            0xffff_ffff
        };

        // The config register read comes from the VFIO device itself.
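        // Registers covered by a ConfigPatch get their masked bits overlaid
        // with the patched value, e.g. the synthetic GPUDirect clique
        // capability injected by add_nv_gpudirect_clique_cap() or the
        // extended capabilities hidden by parse_extended_capabilities().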
        let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;

        if let Some(config_patch) = self.patches.get(&reg_idx) {
            value = (value & !config_patch.mask) | config_patch.patch;
        }

        value
    }

    fn state(&self) -> VfioCommonState {
        let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
            enabled: intx.enabled,
        });

        let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
            cap: msi.cfg.cap,
            cap_offset: msi.cap_offset,
        });

        let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
            cap: msix.cap,
            cap_offset: msix.cap_offset,
            bdf: msix.bar.devid,
        });

        VfioCommonState {
            intx_state,
            msi_state,
            msix_state,
        }
    }

    fn set_state(
        &mut self,
        state: &VfioCommonState,
        msi_state: Option<MsiConfigState>,
        msix_state: Option<MsixConfigState>,
    ) -> Result<(), VfioPciError> {
        if let (Some(intx), Some(interrupt_source_group)) =
            (&state.intx_state, self.legacy_interrupt_group.clone())
        {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            if intx.enabled {
                self.enable_intx()?;
            }
        }

        if let Some(msi) = &state.msi_state {
            self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state);
        }

        if let Some(msix) = &state.msix_state {
            self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state);
        }

        Ok(())
    }
}

impl Pausable for VfioCommon {}

impl Snapshottable for VfioCommon {
    fn id(&self) -> String {
        String::from(VFIO_COMMON_ID)
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_common_snapshot = Snapshot::new_from_state(&self.state())?;

        // Snapshot PciConfiguration
        vfio_common_snapshot.add_snapshot(self.configuration.id(), self.configuration.snapshot()?);

        // Snapshot MSI
        if let Some(msi) = &mut self.interrupt.msi {
            vfio_common_snapshot.add_snapshot(msi.cfg.id(), msi.cfg.snapshot()?);
        }

        // Snapshot MSI-X
        if let Some(msix) = &mut self.interrupt.msix {
            vfio_common_snapshot.add_snapshot(msix.bar.id(), msix.bar.snapshot()?);
        }

        Ok(vfio_common_snapshot)
    }
}

/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
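///
/// Dropping a VfioPciDevice unmaps its MMIO regions and disables any MSI,
/// MSI-X, or INTx interrupt that is still enabled (see the Drop
/// implementation below).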
pub struct VfioPciDevice {
    id: String,
    vm: Arc<dyn hypervisor::Vm>,
    device: Arc<VfioDevice>,
    container: Arc<VfioContainer>,
    common: VfioCommon,
    iommu_attached: bool,
    memory_slot_allocator: MemorySlotAllocator,
    bdf: PciBdf,
    device_path: PathBuf,
}

impl VfioPciDevice {
    /// Constructs a new VFIO PCI device for the given VFIO device
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        id: String,
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        iommu_attached: bool,
        bdf: PciBdf,
        memory_slot_allocator: MemorySlotAllocator,
        snapshot: Option<Snapshot>,
        x_nv_gpudirect_clique: Option<u8>,
        device_path: PathBuf,
    ) -> Result<Self, VfioPciError> {
        let device = Arc::new(device);
        device.reset();

        let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));

        let common = VfioCommon::new(
            msi_interrupt_manager,
            legacy_interrupt_group,
            Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
            &PciVfioSubclass::VfioSubclass,
            bdf,
            vm_migration::snapshot_from_id(snapshot.as_ref(), VFIO_COMMON_ID),
            x_nv_gpudirect_clique,
        )?;

        let vfio_pci_device = VfioPciDevice {
            id,
            vm: vm.clone(),
            device,
            container,
            common,
            iommu_attached,
            memory_slot_allocator,
            bdf,
            device_path: device_path.clone(),
        };

        Ok(vfio_pci_device)
    }

    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }

    fn generate_sparse_areas(
        caps: &[VfioRegionInfoCap],
        region_index: u32,
        region_start: u64,
        region_size: u64,
        vfio_msix: Option<&VfioMsix>,
    ) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
        for cap in caps {
            match cap {
                VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
                VfioRegionInfoCap::MsixMappable => {
                    if !is_4k_aligned(region_start) {
                        error!(
                            "Region start address 0x{:x} must be at least aligned on 4KiB",
                            region_start
                        );
                        return Err(VfioPciError::RegionAlignment);
                    }
                    if !is_4k_multiple(region_size) {
                        error!(
                            "Region size 0x{:x} must be at least a multiple of 4KiB",
                            region_size
                        );
                        return Err(VfioPciError::RegionSize);
                    }

                    // In case the region contains the MSI-X vectors table or
                    // the MSI-X PBA table, we must calculate the subregions
                    // around them, leading to a list of sparse areas.
                    // We want to make sure we will still trap MMIO accesses
                    // to these MSI-X specific ranges. If these ranges don't
                    // align with the page size, we can achieve that by
                    // enlarging them.
                    //
                    // Using a BTreeMap so that the list provided through the
                    // iterator is sorted by key. This ensures a proper split
                    // of the whole region.
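                    // For example (illustrative numbers): a 0x10000-byte
                    // region with a page-aligned MSI-X table at
                    // [0x8000, 0x9000) is split into the mmap'able sparse
                    // areas [0x0, 0x8000) and [0x9000, 0x10000), while
                    // [0x8000, 0x9000) remains trapped.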
                    let mut inter_ranges = BTreeMap::new();
                    if let Some(msix) = vfio_msix {
                        if region_index == msix.cap.table_bir() {
                            let (offset, size) = msix.cap.table_range();
                            let offset = align_page_size_down(offset);
                            let size = align_page_size_up(size);
                            inter_ranges.insert(offset, size);
                        }
                        if region_index == msix.cap.pba_bir() {
                            let (offset, size) = msix.cap.pba_range();
                            let offset = align_page_size_down(offset);
                            let size = align_page_size_up(size);
                            inter_ranges.insert(offset, size);
                        }
                    }

                    let mut sparse_areas = Vec::new();
                    let mut current_offset = 0;
                    for (range_offset, range_size) in inter_ranges {
                        if range_offset > current_offset {
                            sparse_areas.push(VfioRegionSparseMmapArea {
                                offset: current_offset,
                                size: range_offset - current_offset,
                            });
                        }
                        current_offset = align_page_size_down(range_offset + range_size);
                    }

                    if region_size > current_offset {
                        sparse_areas.push(VfioRegionSparseMmapArea {
                            offset: current_offset,
                            size: region_size - current_offset,
                        });
                    }

                    return Ok(sparse_areas);
                }
                _ => {}
            }
        }

        // In case no relevant capabilities have been found, create a single
        // sparse area corresponding to the entire MMIO region.
        Ok(vec![VfioRegionSparseMmapArea {
            offset: 0,
            size: region_size,
        }])
    }

    /// Map MMIO regions into the guest, and avoid VM exits when the guest
    /// tries to reach those regions.
    ///
    /// The regions are registered as user memory regions with `self.vm`,
    /// using memory slots provided by `self.memory_slot_allocator`.
    pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
        let fd = self.device.as_raw_fd();

        for region in self.common.mmio_regions.iter_mut() {
            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }

                // Retrieve the list of capabilities found on the region
                let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
                    self.device.get_region_caps(region.index)
                } else {
                    Vec::new()
                };

                // Don't try to mmap the region if it contains the MSI-X table
                // or the MSI-X PBA subregion and MSIX_MAPPABLE wasn't found
                // in the list of supported capabilities.
                if let Some(msix) = self.common.interrupt.msix.as_ref() {
                    if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
                        && !caps.contains(&VfioRegionInfoCap::MsixMappable)
                    {
                        continue;
                    }
                }

                let mmap_size = self.device.get_region_size(region.index);
                let mmap_offset = self.device.get_region_offset(region.index);

                let sparse_areas = Self::generate_sparse_areas(
                    &caps,
                    region.index,
                    region.start.0,
                    mmap_size,
                    self.common.interrupt.msix.as_ref(),
                )?;

                for area in sparse_areas.iter() {
                    // SAFETY: FFI call with correct arguments
                    let host_addr = unsafe {
                        libc::mmap(
                            null_mut(),
                            area.size as usize,
                            prot,
                            libc::MAP_SHARED,
                            fd,
                            mmap_offset as libc::off_t + area.offset as libc::off_t,
                        )
                    };

                    if std::ptr::eq(host_addr, libc::MAP_FAILED) {
                        error!(
                            "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
                            area.offset,
                            area.size,
                            std::io::Error::last_os_error()
                        );
                        return Err(VfioPciError::MmapArea);
                    }

                    if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) {
                        warn!(
                            "Could not mmap sparse area that is not page size aligned (offset = 0x{:x}, size = 0x{:x})",
                            area.offset,
                            area.size,
                        );
                        return Ok(());
                    }

                    let user_memory_region = UserMemoryRegion {
                        slot: self.memory_slot_allocator.next_memory_slot(),
                        start: region.start.0 + area.offset,
                        size: area.size,
                        host_addr: host_addr as u64,
                    };

                    region.user_memory_regions.push(user_memory_region);

                    let mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(mem_region)
                        .map_err(VfioPciError::CreateUserMemoryRegion)?;

                    if !self.iommu_attached {
                        self.container
                            .vfio_dma_map(
                                user_memory_region.start,
                                user_memory_region.size,
                                user_memory_region.host_addr,
                            )
                            .map_err(|e| {
                                VfioPciError::DmaMap(e, self.device_path.clone(), self.bdf)
                            })?;
                    }
                }
            }
        }

        Ok(())
    }

    pub fn unmap_mmio_regions(&mut self) {
        for region in self.common.mmio_regions.iter() {
            for user_memory_region in region.user_memory_regions.iter() {
                // Unmap from the vfio container
                if !self.iommu_attached {
                    if let Err(e) = self
                        .container
                        .vfio_dma_unmap(user_memory_region.start, user_memory_region.size)
                        .map_err(|e| VfioPciError::DmaUnmap(e, self.device_path.clone(), self.bdf))
                    {
                        error!(
                            "Could not unmap mmio region from vfio container: \
                            iova 0x{:x}, size 0x{:x}: {}, ",
                            user_memory_region.start, user_memory_region.size, e
                        );
                    }
                }

                // Remove the region
                let r = self.vm.make_user_memory_region(
                    user_memory_region.slot,
                    user_memory_region.start,
                    user_memory_region.size,
                    user_memory_region.host_addr,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                self.memory_slot_allocator
                    .free_memory_slot(user_memory_region.slot);

                // SAFETY: FFI call with correct arguments
                let ret = unsafe {
                    libc::munmap(
                        user_memory_region.host_addr as *mut libc::c_void,
                        user_memory_region.size as usize,
                    )
                };
                if ret != 0 {
                    error!(
                        "Could not unmap region {}, error: {}",
                        region.index,
                        io::Error::last_os_error()
                    );
                }
            }
        }
    }

    pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_map(iova, size, user_addr)
                .map_err(|e| VfioPciError::DmaMap(e, self.device_path.clone(), self.bdf))?;
        }

        Ok(())
    }

    pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_unmap(iova, size)
                .map_err(|e| VfioPciError::DmaUnmap(e, self.device_path.clone(), self.bdf))?;
        }

        Ok(())
    }

    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.common.mmio_regions.clone()
    }
}

impl Drop for VfioPciDevice {
    fn drop(&mut self) {
        self.unmap_mmio_regions();

        if let Some(msix) = &self.common.interrupt.msix {
            if msix.bar.enabled() {
                self.common.disable_msix();
            }
        }

        if let Some(msi) = &self.common.interrupt.msi {
            if msi.cfg.enabled() {
                self.common.disable_msi()
            }
        }

        if self.common.interrupt.intx_in_use() {
            self.common.disable_intx();
        }
    }
}

impl BusDevice for VfioPciDevice {
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}

// Offset of the 16-bit status register in the PCI configuration space.
const PCI_CONFIG_STATUS_OFFSET: u32 = 0x06;
// Status bit indicating the presence of a capabilities list.
const PCI_CONFIG_STATUS_CAPABILITIES_LIST: u16 = 1 << 4;
// First BAR offset in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// The valid bits for the capabilities pointer.
const PCI_CONFIG_CAPABILITY_PTR_MASK: u8 = !0b11;
// Extended capabilities register offset in the PCI config space.
const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
// IO BAR when the first BAR bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory BAR flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// Prefetchable BAR bit.
const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device.
const BAR_NUMS: usize = 6;
// PCI Header Type register index.
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index.
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index.
const PCI_ROM_EXP_BAR_INDEX: usize = 12;

impl PciDevice for VfioPciDevice {
    fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        self.common
            .allocate_bars(allocator, mmio32_allocator, mmio64_allocator, resources)
    }

    fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        self.common
            .free_bars(allocator, mmio32_allocator, mmio64_allocator)
    }

    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> (Vec<BarReprogrammingParams>, Option<Arc<Barrier>>) {
        self.common.write_config_register(reg_idx, offset, data)
    }

    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        self.common.read_config_register(reg_idx)
    }

    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.common.read_bar(base, offset, data)
    }

    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.common.write_bar(base, offset, data)
    }

    fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
        for region in self.common.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                for user_memory_region in region.user_memory_regions.iter_mut() {
                    // Unmap the old MMIO region from the vfio container
                    if !self.iommu_attached {
                        if let Err(e) = self
                            .container
                            .vfio_dma_unmap(user_memory_region.start, user_memory_region.size)
                            .map_err(|e| {
                                VfioPciError::DmaUnmap(e, self.device_path.clone(), self.bdf)
                            })
                        {
                            error!(
                                "Could not unmap mmio region from vfio container: \
                                iova 0x{:x}, size 0x{:x}: {}, ",
                                user_memory_region.start, user_memory_region.size, e
                            );
                        }
                    }

                    // Remove the old region
                    let old_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .remove_user_memory_region(old_mem_region)
                        .map_err(io::Error::other)?;

                    // Update the user memory region with the correct start address.
                    if new_base > old_base {
                        user_memory_region.start += new_base - old_base;
                    } else {
                        user_memory_region.start -= old_base - new_base;
                    }

                    // Insert the new region
                    let new_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(new_mem_region)
                        .map_err(io::Error::other)?;

                    // Map the moved mmio region into the vfio container
                    if !self.iommu_attached {
                        self.container
                            .vfio_dma_map(
                                user_memory_region.start,
                                user_memory_region.size,
                                user_memory_region.host_addr,
                            )
                            .map_err(|e| {
                                VfioPciError::DmaMap(e, self.device_path.clone(), self.bdf)
                            })
                            .map_err(|e| {
                                io::Error::other(format!(
                                    "Could not map mmio region to vfio container: \
                                    iova 0x{:x}, size 0x{:x}: {}, ",
                                    user_memory_region.start, user_memory_region.size, e
                                ))
                            })?;
                    }
                }
            }
        }

        Ok(())
    }

    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }

    fn id(&self) -> Option<String> {
        Some(self.id.clone())
    }
}

impl Pausable for VfioPciDevice {}

impl Snapshottable for VfioPciDevice {
    fn id(&self) -> String {
        self.id.clone()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_pci_dev_snapshot = Snapshot::default();

        // Snapshot VfioCommon
        vfio_pci_dev_snapshot.add_snapshot(self.common.id(), self.common.snapshot()?);

        Ok(vfio_pci_dev_snapshot)
    }
}

impl Transportable for VfioPciDevice {}
impl Migratable for VfioPciDevice {}

/// This structure implements the ExternalDmaMapping trait. It is meant to
/// be used when the caller tries to provide a way to update the mappings
/// associated with a specific VFIO container.
pub struct VfioDmaMapping<M: GuestAddressSpace> {
    container: Arc<VfioContainer>,
    memory: Arc<M>,
    mmio_regions: Arc<Mutex<Vec<MmioRegion>>>,
}

impl<M: GuestAddressSpace> VfioDmaMapping<M> {
    /// Create a DmaMapping object.
    ///
    /// # Parameters
    /// * `container`: VFIO container object.
    /// * `memory`: guest memory to map.
    /// * `mmio_regions`: MMIO regions to map.
    pub fn new(
        container: Arc<VfioContainer>,
        memory: Arc<M>,
        mmio_regions: Arc<Mutex<Vec<MmioRegion>>>,
    ) -> Self {
        VfioDmaMapping {
            container,
            memory,
            mmio_regions,
        }
    }
}

impl<M: GuestAddressSpace + Sync + Send> ExternalDmaMapping for VfioDmaMapping<M> {
    fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), io::Error> {
        let mem = self.memory.memory();
        let guest_addr = GuestAddress(gpa);
        let user_addr = if mem.check_range(guest_addr, size as usize) {
            match mem.get_host_address(guest_addr) {
                Ok(t) => t as u64,
                Err(e) => {
                    return Err(io::Error::other(format!(
                        "unable to retrieve user address for gpa 0x{gpa:x} from guest memory region: {e}"
                    )));
                }
            }
        } else if self.mmio_regions.lock().unwrap().check_range(gpa, size) {
            self.mmio_regions.lock().unwrap().find_user_address(gpa)?
        } else {
            return Err(io::Error::other(format!(
                "failed to locate guest address 0x{gpa:x} in guest memory"
            )));
        };

        self.container
            .vfio_dma_map(iova, size, user_addr)
            .map_err(|e| {
                io::Error::other(format!(
                    "failed to map memory for VFIO container, \
                    iova 0x{iova:x}, gpa 0x{gpa:x}, size 0x{size:x}: {e:?}"
                ))
            })
    }

    fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), io::Error> {
        self.container.vfio_dma_unmap(iova, size).map_err(|e| {
            io::Error::other(format!(
                "failed to unmap memory for VFIO container, \
                iova 0x{iova:x}, size 0x{size:x}: {e:?}"
            ))
        })
    }
}
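
#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch (synthetic, illustrative values only) of how the
    // MmioRegionRange helpers resolve guest addresses: a single 8 KiB BAR at
    // GPA 0x1000 backed by a host mapping at 0x9000.
    #[test]
    fn mmio_region_range_lookup() {
        let regions = vec![MmioRegion {
            start: GuestAddress(0x1000),
            length: 0x2000,
            type_: PciBarRegionType::Memory64BitRegion,
            index: 0,
            user_memory_regions: vec![UserMemoryRegion {
                slot: 0,
                start: 0x1000,
                size: 0x2000,
                host_addr: 0x9000,
            }],
        }];

        // Entirely inside the region.
        assert!(regions.check_range(0x1800, 0x100));
        // Crosses the end of the region.
        assert!(!regions.check_range(0x2f00, 0x200));
        // GPA 0x1800 is 0x800 into the region, so it resolves to 0x9800.
        assert_eq!(regions.find_user_address(0x1800).unwrap(), 0x9800);
    }
}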