// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//

use crate::msi::{MsiConfigState, MSI_CONFIG_ID};
use crate::msix::MsixConfigState;
use crate::{
    msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
    PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
    PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
    PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID,
};
use anyhow::anyhow;
use byteorder::{ByteOrder, LittleEndian};
use hypervisor::HypervisorVmError;
use libc::{sysconf, _SC_PAGESIZE};
use std::any::Any;
use std::collections::{BTreeMap, HashMap};
use std::io;
use std::os::unix::io::AsRawFd;
use std::ptr::null_mut;
use std::sync::{Arc, Barrier, Mutex};
use thiserror::Error;
use versionize::{VersionMap, Versionize, VersionizeResult};
use versionize_derive::Versionize;
use vfio_bindings::bindings::vfio::*;
use vfio_ioctls::{
    VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
};
use vm_allocator::page_size::{
    align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned,
};
use vm_allocator::{AddressAllocator, SystemAllocator};
use vm_device::dma_mapping::ExternalDmaMapping;
use vm_device::interrupt::{
    InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
};
use vm_device::{BusDevice, Resource};
use vm_memory::{Address, GuestAddress, GuestAddressSpace, GuestMemory, GuestUsize};
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
};
use vmm_sys_util::eventfd::EventFd;

pub(crate) const VFIO_COMMON_ID: &str = "vfio_common";

#[derive(Debug, Error)]
pub enum VfioPciError {
    #[error("Failed to create user memory region: {0}")]
    CreateUserMemoryRegion(#[source] HypervisorVmError),
    #[error("Failed to DMA map: {0}")]
    DmaMap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to DMA unmap: {0}")]
    DmaUnmap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to enable INTx: {0}")]
    EnableIntx(#[source] VfioError),
    #[error("Failed to enable MSI: {0}")]
    EnableMsi(#[source] VfioError),
    #[error("Failed to enable MSI-X: {0}")]
    EnableMsix(#[source] VfioError),
    #[error("Failed to mmap the area")]
    MmapArea,
    #[error("Failed to find the notifier's eventfd")]
    MissingNotifier,
    #[error("Invalid region alignment")]
    RegionAlignment,
    #[error("Invalid region size")]
    RegionSize,
    #[error("Failed to retrieve MsiConfigState: {0}")]
    RetrieveMsiConfigState(#[source] anyhow::Error),
    #[error("Failed to retrieve MsixConfigState: {0}")]
    RetrieveMsixConfigState(#[source] anyhow::Error),
    #[error("Failed to retrieve PciConfigurationState: {0}")]
    RetrievePciConfigurationState(#[source] anyhow::Error),
    #[error("Failed to retrieve VfioCommonState: {0}")]
    RetrieveVfioCommonState(#[source] anyhow::Error),
}

#[derive(Copy, Clone)]
enum PciVfioSubclass {
    VfioSubclass = 0xff,
}

impl PciSubclass for PciVfioSubclass {
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}

enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}

#[derive(Versionize)]
struct IntxState {
    enabled: bool,
}

pub(crate) struct VfioIntx {
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    enabled: bool,
}

#[derive(Versionize)]
struct MsiState {
    cap: MsiCap,
    cap_offset: u32,
}

pub(crate) struct VfioMsi {
    pub(crate) cfg: MsiConfig,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsi {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.cfg.enabled();

        self.cfg.update(offset, data);

        let new_enabled = self.cfg.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsi);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsi);
        }

        None
    }
}

#[derive(Versionize)]
struct MsixState {
    cap: MsixCap,
    cap_offset: u32,
    bdf: u32,
}

pub(crate) struct VfioMsix {
    pub(crate) bar: MsixConfig,
    cap: MsixCap,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsix {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.bar.enabled();

        // Update "Message Control" word
        if offset == 2 && data.len() == 2 {
            self.bar.set_msg_ctl(LittleEndian::read_u16(data));
        }

        let new_enabled = self.bar.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsix);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsix);
        }

        None
    }

    fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        let table_offset: u64 = u64::from(self.cap.table_offset());
        let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
        let table_bir: u32 = self.cap.table_bir();

        bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
    }
}

pub(crate) struct Interrupt {
    pub(crate) intx: Option<VfioIntx>,
    pub(crate) msi: Option<VfioMsi>,
    pub(crate) msix: Option<VfioMsix>,
}

impl Interrupt {
    fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msi) = &mut self.msi {
            let action = msi.update(offset, data);
            return action;
        }

        None
    }

    fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msix) = &mut self.msix {
            let action = msix.update(offset, data);
            return action;
        }

        None
    }

    fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
        if let Some(msi) = &self.msi {
            if offset >= u64::from(msi.cap_offset)
                && offset < u64::from(msi.cap_offset) + msi.cfg.size()
            {
                return Some((
                    PciCapabilityId::MessageSignalledInterrupts,
                    u64::from(msi.cap_offset),
                ));
            }
        }

        if let Some(msix) = &self.msix {
            if offset == u64::from(msix.cap_offset) {
                return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
            }
        }

        None
    }

    fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        if let Some(msix) = &self.msix {
            return msix.table_accessed(bar_index, offset);
        }

        false
    }

    fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
        if let Some(ref mut msix) = &mut self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.write_table(offset, data)
        }
    }

    fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
        if let Some(msix) = &self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.read_table(offset, data)
        }
    }

    pub(crate) fn intx_in_use(&self) -> bool {
        if let Some(intx) = &self.intx {
            return intx.enabled;
        }

        false
    }
}

#[derive(Copy, Clone)]
pub struct UserMemoryRegion {
    pub slot: u32,
    pub start: u64,
    pub size: u64,
    pub host_addr: u64,
}

#[derive(Clone)]
pub struct MmioRegion {
    pub start: GuestAddress,
    pub length: GuestUsize,
    pub(crate) type_: PciBarRegionType,
    pub(crate) index: u32,
    pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
}

trait MmioRegionRange {
    fn check_range(&self, guest_addr: u64, size: u64) -> bool;
    fn find_user_address(&self, guest_addr: u64) -> Result<u64, io::Error>;
}

impl MmioRegionRange for Vec<MmioRegion> {
    // Check if a guest address is within the range of mmio regions
    fn check_range(&self, guest_addr: u64, size: u64) -> bool {
        for region in self.iter() {
            let Some(guest_addr_end) = guest_addr.checked_add(size) else {
                return false;
            };
            let Some(region_end) = region.start.raw_value().checked_add(region.length) else {
                return false;
            };
            if guest_addr >= region.start.raw_value() && guest_addr_end <= region_end {
                return true;
            }
        }
        false
    }

    // Locate the user region address for a guest address within all mmio regions
    fn find_user_address(&self, guest_addr: u64) -> Result<u64, io::Error> {
        for region in self.iter() {
            for user_region in region.user_memory_regions.iter() {
                if guest_addr >= user_region.start
                    && guest_addr < user_region.start + user_region.size
                {
                    return Ok(user_region.host_addr + (guest_addr - user_region.start));
                }
            }
        }

        Err(io::Error::new(
            io::ErrorKind::Other,
            format!("unable to find user address: 0x{guest_addr:x}"),
        ))
    }
}

#[derive(Debug, Error)]
pub enum VfioError {
    #[error("Kernel VFIO error: {0}")]
    KernelVfio(#[source] vfio_ioctls::VfioError),
    #[error("VFIO user error: {0}")]
    VfioUser(#[source] vfio_user::Error),
}

pub(crate) trait Vfio: Send + Sync {
    fn read_config_byte(&self, offset: u32) -> u8 {
        let mut data: [u8; 1] = [0];
        self.read_config(offset, &mut data);
        data[0]
    }

    fn read_config_word(&self, offset: u32) -> u16 {
        let mut data: [u8; 2] = [0, 0];
        self.read_config(offset, &mut data);
        u16::from_le_bytes(data)
    }

    fn read_config_dword(&self, offset: u32) -> u32 {
        let mut data: [u8; 4] = [0, 0, 0, 0];
        self.read_config(offset, &mut data);
        u32::from_le_bytes(data)
    }

    fn write_config_dword(&self, offset: u32, buf: u32) {
        let data: [u8; 4] = buf.to_le_bytes();
        self.write_config(offset, &data)
    }

    fn read_config(&self, offset: u32, data: &mut [u8]) {
        self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
    }

    fn write_config(&self, offset: u32, data: &[u8]) {
        self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
    }

    fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
    }

    fn disable_msi(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
    }
    fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
    }

    fn disable_msix(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
    }

    fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
        unimplemented!()
    }

    fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
        unimplemented!()
    }

    fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
        unimplemented!()
    }

    fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }
}

struct VfioDeviceWrapper {
    device: Arc<VfioDevice>,
}

impl VfioDeviceWrapper {
    fn new(device: Arc<VfioDevice>) -> Self {
        Self { device }
    }
}

impl Vfio for VfioDeviceWrapper {
    fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
        self.device.region_read(index, data, offset)
    }

    fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
        self.device.region_write(index, data, offset)
    }

    fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
        self.device.get_irq_info(irq_index).copied()
    }

    fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.device
            .enable_irq(irq_index, event_fds)
            .map_err(VfioError::KernelVfio)
    }

    fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .disable_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }

    fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .unmask_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }
}

#[derive(Versionize)]
struct VfioCommonState {
    intx_state: Option<IntxState>,
    msi_state: Option<MsiState>,
    msix_state: Option<MsixState>,
}

impl VersionMapped for VfioCommonState {}

pub(crate) struct ConfigPatch {
    mask: u32,
    patch: u32,
}

pub(crate) struct VfioCommon {
    pub(crate) configuration: PciConfiguration,
    pub(crate) mmio_regions: Vec<MmioRegion>,
    pub(crate) interrupt: Interrupt,
    pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
    pub(crate) vfio_wrapper: Arc<dyn Vfio>,
    pub(crate) patches: HashMap<usize, ConfigPatch>,
    x_nv_gpudirect_clique: Option<u8>,
}

impl VfioCommon {
    pub(crate) fn new(
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        vfio_wrapper: Arc<dyn Vfio>,
        subclass: &dyn PciSubclass,
        bdf: PciBdf,
        snapshot: Option<Snapshot>,
        x_nv_gpudirect_clique: Option<u8>,
    ) -> Result<Self, VfioPciError> {
        let pci_configuration_state =
            vm_migration::versioned_state_from_id(snapshot.as_ref(), PCI_CONFIGURATION_ID)
                .map_err(|e| {
                    VfioPciError::RetrievePciConfigurationState(anyhow!(
                        "Failed to get PciConfigurationState from Snapshot: {}",
                        e
                    ))
                })?;

        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            subclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
            pci_configuration_state,
        );

        let mut vfio_common = VfioCommon {
            mmio_regions: Vec::new(),
            configuration,
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
            msi_interrupt_manager,
            legacy_interrupt_group,
            vfio_wrapper,
            patches: HashMap::new(),
            x_nv_gpudirect_clique,
        };

        let state: Option<VfioCommonState> = snapshot
            .as_ref()
            .map(|s| s.to_versioned_state())
            .transpose()
            .map_err(|e| {
                VfioPciError::RetrieveVfioCommonState(anyhow!(
                    "Failed to get VfioCommonState from Snapshot: {}",
                    e
                ))
            })?;
        let msi_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSI_CONFIG_ID)
            .map_err(|e| {
                VfioPciError::RetrieveMsiConfigState(anyhow!(
                    "Failed to get MsiConfigState from Snapshot: {}",
                    e
                ))
            })?;
        let msix_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSIX_CONFIG_ID)
            .map_err(|e| {
                VfioPciError::RetrieveMsixConfigState(anyhow!(
                    "Failed to get MsixConfigState from Snapshot: {}",
                    e
                ))
            })?;

        if let Some(state) = state.as_ref() {
            vfio_common.set_state(state, msi_state, msix_state)?;
        } else {
            vfio_common.parse_capabilities(bdf);
            vfio_common.initialize_legacy_interrupt()?;
        }

        Ok(vfio_common)
    }

    /// In case the MSI-X table offset is not page size aligned, we need to do
    /// some fixup to achieve it, because we don't want the MMIO RW region and
    /// the trap region to overlap each other.
    fn fixup_msix_region(&mut self, bar_id: u32, region_size: u64) -> u64 {
        if let Some(msix) = self.interrupt.msix.as_mut() {
            let msix_cap = &mut msix.cap;

            // This assumes table_bir equals pba_bir here.
            let (table_offset, table_size) = msix_cap.table_range();
            if is_page_size_aligned(table_offset) || msix_cap.table_bir() != bar_id {
                return region_size;
            }

            let (pba_offset, pba_size) = msix_cap.pba_range();
            let msix_sz = align_page_size_up(table_size + pba_size);
            // Expand the region so it can hold both the RW region and the
            // trap region, each of them page size aligned.
            let size = std::cmp::max(region_size * 2, msix_sz * 2);
            // Let the table start from the middle of the region.
            msix_cap.table_set_offset((size / 2) as u32);
            msix_cap.pba_set_offset((size / 2 + pba_offset - table_offset) as u32);

            size
        } else {
            // MSI-X is not supported by this device.
            region_size
        }
    }

    // The `allocator` argument is unused on `aarch64`
    #[allow(unused_variables)]
    pub(crate) fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        let mut bars = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX;

        // Going through all regular regions to compute the BAR size.
        // We're not saving the BAR address to restore it, because we
        // are going to allocate a guest address for each BAR and write
        // that new address back.
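        //
        // BAR sizing below follows the standard PCI probe: write all 1s to
        // the BAR, read it back, mask out the flag bits, then invert and add
        // one. For example (illustrative values only), a 32-bit memory BAR
        // reading back 0xffff_f000 encodes !0xffff_f000 + 1 = 0x1000, i.e.
        // a 4 KiB BAR.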
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let mut region_size: u64 = 0;
            let mut region_type = PciBarRegionType::Memory32BitRegion;
            let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
            let mut flags: u32 = 0;

            let mut restored_bar_addr = None;
            if let Some(resources) = &resources {
                for resource in resources {
                    if let Resource::PciBar {
                        index,
                        base,
                        size,
                        type_,
                        ..
                    } = resource
                    {
                        if *index == bar_id as usize {
                            restored_bar_addr = Some(GuestAddress(*base));
                            region_size = *size;
                            region_type = PciBarRegionType::from(*type_);
                            break;
                        }
                    }
                }
                if restored_bar_addr.is_none() {
                    bar_id += 1;
                    continue;
                }
            } else {
                let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                    (PCI_ROM_EXP_BAR_INDEX * 4) as u32
                } else {
                    PCI_CONFIG_BAR_OFFSET + bar_id * 4
                };

                // First read flags
                flags = self.vfio_wrapper.read_config_dword(bar_offset);

                // Is this an IO BAR?
                let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
                } else {
                    false
                };

                // Is this a 64-bit BAR?
                let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(
                        flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                        PCI_CONFIG_MEMORY_BAR_64BIT
                    )
                } else {
                    false
                };

                if matches!(
                    flags & PCI_CONFIG_BAR_PREFETCHABLE,
                    PCI_CONFIG_BAR_PREFETCHABLE
                ) {
                    prefetchable = PciBarPrefetchable::Prefetchable
                };

                // To get size write all 1s
                self.vfio_wrapper
                    .write_config_dword(bar_offset, 0xffff_ffff);

                // And read back BAR value. The device will write zeros for bits it doesn't care about
                let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);

                if io_bar {
                    // Mask flag bits (lowest 2 for I/O bars)
                    lower &= !0b11;

                    // BAR is not enabled
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // IO BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Invert bits and add 1 to calculate size
                    region_size = (!lower + 1) as u64;
                } else if is_64bit_bar {
                    // 64 bits Memory BAR
                    region_type = PciBarRegionType::Memory64BitRegion;

                    // Query size of upper BAR of 64-bit BAR
                    let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                    self.vfio_wrapper
                        .write_config_dword(upper_offset, 0xffff_ffff);
                    let upper = self.vfio_wrapper.read_config_dword(upper_offset);

                    let mut combined_size = u64::from(upper) << 32 | u64::from(lower);

                    // Mask out flag bits (lowest 4 for memory bars)
                    combined_size &= !0b1111;

                    // BAR is not enabled
                    if combined_size == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find size
                    region_size = !combined_size + 1;
                } else {
                    region_type = PciBarRegionType::Memory32BitRegion;

                    // Mask out flag bits (lowest 4 for memory bars)
                    lower &= !0b1111;

                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find size
                    region_size = (!lower + 1) as u64;
                }
            }

            let bar_addr = match region_type {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "aarch64")]
                    unimplemented!();

                    // The address needs to be 4 bytes aligned.
                    #[cfg(not(target_arch = "aarch64"))]
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory32BitRegion => {
                    // BAR allocation must be naturally aligned
                    mmio32_allocator
                        .allocate(restored_bar_addr, region_size, Some(region_size))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory64BitRegion => {
                    // We need to do some fixup to keep the MMIO RW region and
                    // the MSI-X cap region page size aligned.
                    region_size = self.fixup_msix_region(bar_id, region_size);
                    mmio64_allocator
                        .allocate(
                            restored_bar_addr,
                            region_size,
                            Some(std::cmp::max(
                                // SAFETY: FFI call. Trivially safe.
                                unsafe { sysconf(_SC_PAGESIZE) as GuestUsize },
                                region_size,
                            )),
                        )
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
            };

            // We can now build our BAR configuration block.
            let bar = PciBarConfiguration::default()
                .set_index(bar_id as usize)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type)
                .set_prefetchable(prefetchable);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                self.configuration
                    .add_pci_rom_bar(&bar, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&bar)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            bars.push(bar);
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id,
                user_memory_regions: Vec::new(),
            });

            bar_id += 1;
            if region_type == PciBarRegionType::Memory64BitRegion {
                bar_id += 1;
            }
        }

        Ok(bars)
    }

    // The `allocator` argument is unused on `aarch64`
    #[allow(unused_variables)]
    pub(crate) fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        for region in self.mmio_regions.iter() {
            match region.type_ {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "x86_64")]
                    allocator.free_io_addresses(region.start, region.length);
                    #[cfg(target_arch = "aarch64")]
                    error!("I/O region is not supported");
                }
                PciBarRegionType::Memory32BitRegion => {
                    mmio32_allocator.free(region.start, region.length);
                }
                PciBarRegionType::Memory64BitRegion => {
                    mmio64_allocator.free(region.start, region.length);
                }
            }
        }
        Ok(())
    }

    pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
        let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());

        let table = self.vfio_wrapper.read_config_dword((cap + 4).into());

        let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());

        MsixCap {
            msg_ctl,
            table,
            pba,
        }
    }

    pub(crate) fn initialize_msix(
        &mut self,
        msix_cap: MsixCap,
        cap_offset: u32,
        bdf: PciBdf,
        state: Option<MsixConfigState>,
    ) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msix_cap.table_size() as InterruptIndex,
            })
            .unwrap();

        let msix_config = MsixConfig::new(
            msix_cap.table_size(),
            interrupt_source_group.clone(),
            bdf.into(),
            state,
        )
        .unwrap();

        self.interrupt.msix = Some(VfioMsix {
            bar: msix_config,
            cap: msix_cap,
            cap_offset,
            interrupt_source_group,
        });
    }

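    /// Read the Message Control word of the MSI capability, which sits two
    /// bytes past the capability header in the device's PCI config space.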
    pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
        self.vfio_wrapper.read_config_word((cap + 2).into())
    }

    pub(crate) fn initialize_msi(
        &mut self,
        msg_ctl: u16,
        cap_offset: u32,
        state: Option<MsiConfigState>,
    ) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
            })
            .unwrap();

        let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap();

        self.interrupt.msi = Some(VfioMsi {
            cfg: msi_config,
            cap_offset,
            interrupt_source_group,
        });
    }

    pub(crate) fn get_msix_cap_idx(&self) -> Option<usize> {
        let mut cap_next = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        while cap_next != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());
            if PciCapabilityId::from(cap_id) == PciCapabilityId::MsiX {
                return Some(cap_next as usize);
            } else {
                cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
            }
        }

        None
    }

    pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) {
        let mut cap_iter = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        let mut pci_express_cap_found = false;
        let mut power_management_cap_found = false;

        while cap_iter != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_iter.into());

            match PciCapabilityId::from(cap_id) {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI.
                            let msg_ctl = self.parse_msi_capabilities(cap_iter);
                            self.initialize_msi(msg_ctl, cap_iter as u32, None);
                        }
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
                    {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI-X.
                            let msix_cap = self.parse_msix_capabilities(cap_iter);
                            self.initialize_msix(msix_cap, cap_iter as u32, bdf, None);
                        }
                    }
                }
                PciCapabilityId::PciExpress => pci_express_cap_found = true,
                PciCapabilityId::PowerManagement => power_management_cap_found = true,
                _ => {}
            };

            let cap_next = self.vfio_wrapper.read_config_byte((cap_iter + 1).into());
            if cap_next == 0 {
                break;
            }

            cap_iter = cap_next;
        }

        if let Some(clique_id) = self.x_nv_gpudirect_clique {
            self.add_nv_gpudirect_clique_cap(cap_iter, clique_id);
        }

        if pci_express_cap_found && power_management_cap_found {
            self.parse_extended_capabilities();
        }
    }

    fn add_nv_gpudirect_clique_cap(&mut self, cap_iter: u8, clique_id: u8) {
        // Turing, Ampere, Hopper, and Lovelace GPUs have dedicated space
        // at 0xD4 for this capability.
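        // The capability is exposed to the guest purely through config space
        // patches: the last capability's next pointer is redirected to 0xD4,
        // and the two dwords of the vendor-specific capability are then
        // synthesized at that offset below.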
        let cap_offset = 0xd4u32;

        let reg_idx = (cap_iter / 4) as usize;
        self.patches.insert(
            reg_idx,
            ConfigPatch {
                mask: 0x0000_ff00,
                patch: cap_offset << 8,
            },
        );

        let reg_idx = (cap_offset / 4) as usize;
        self.patches.insert(
            reg_idx,
            ConfigPatch {
                mask: 0xffff_ffff,
                patch: 0x50080009u32,
            },
        );
        self.patches.insert(
            reg_idx + 1,
            ConfigPatch {
                mask: 0xffff_ffff,
                patch: u32::from(clique_id) << 19 | 0x5032,
            },
        );
    }

    fn parse_extended_capabilities(&mut self) {
        let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;

        loop {
            let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);

            let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
            let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;

            match PciExpressCapabilityId::from(cap_id) {
                PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation
                | PciExpressCapabilityId::ResizeableBar
                | PciExpressCapabilityId::SingleRootIoVirtualization => {
                    let reg_idx = (current_offset / 4) as usize;
                    self.patches.insert(
                        reg_idx,
                        ConfigPatch {
                            mask: 0x0000_ffff,
                            patch: PciExpressCapabilityId::NullCapability as u32,
                        },
                    );
                }
                _ => {}
            }

            if cap_next == 0 {
                break;
            }

            current_offset = cap_next.into();
        }
    }

    pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
        if let Some(intx) = &mut self.interrupt.intx {
            if !intx.enabled {
                if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
                    self.vfio_wrapper
                        .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
                        .map_err(VfioPciError::EnableIntx)?;

                    intx.enabled = true;
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }
        }

        Ok(())
    }

    pub(crate) fn disable_intx(&mut self) {
        if let Some(intx) = &mut self.interrupt.intx {
            if intx.enabled {
                if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                    error!("Could not disable INTx: {}", e);
                } else {
                    intx.enabled = false;
                }
            }
        }
    }

    pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
        if let Some(msi) = &self.interrupt.msi {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msi.cfg.num_enabled_vectors() {
                if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msi(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsi)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msi(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msi() {
            error!("Could not disable MSI: {}", e);
        }
    }

    pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
        if let Some(msix) = &self.interrupt.msix {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msix.bar.table_entries.len() {
                if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msix(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsix)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msix(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msix() {
            error!("Could not disable MSI-X: {}", e);
        }
    }

    pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
        if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
            if irq_info.count == 0 {
                // A count of 0 means the INTx IRQ is not supported, therefore
                // it shouldn't be initialized.
                return Ok(());
            }
        }

        if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            self.enable_intx()?;
        }

        Ok(())
    }

    pub(crate) fn update_msi_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msi(offset, data) {
            Some(InterruptUpdateAction::EnableMsi) => {
                // Disable INTx before we can enable MSI
                self.disable_intx();
                self.enable_msi()?;
            }
            Some(InterruptUpdateAction::DisableMsi) => {
                // Fall back to INTx when disabling MSI
                self.disable_msi();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn update_msix_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msix(offset, data) {
            Some(InterruptUpdateAction::EnableMsix) => {
                // Disable INTx before we can enable MSI-X
                self.disable_intx();
                self.enable_msix()?;
            }
            Some(InterruptUpdateAction::DisableMsix) => {
                // Fall back to INTx when disabling MSI-X
                self.disable_msix();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
        for region in self.mmio_regions.iter() {
            if addr >= region.start.raw_value()
                && addr < region.start.unchecked_add(region.length).raw_value()
            {
                return Some(region.clone());
            }
        }
        None
    }

    pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_read_table(offset, data);
            } else {
                self.vfio_wrapper.region_read(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest reading from the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }
    }

    pub(crate) fn write_bar(
        &mut self,
        base: u64,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            // If the MSI-X table is written to, we need to update our cache.
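            // Table writes are absorbed by the local MsixConfig cache and are
            // never forwarded to the device region.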
            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_write_table(offset, data);
            } else {
                self.vfio_wrapper.region_write(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest writing to the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }

        None
    }

    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming the
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through the VFIO_DEVICE_SET_IRQS ioctl, and only then can we
        // write to the device region to update the MSI Enable bit.
        self.vfio_wrapper.write_config((reg + offset) as u32, data);

        None
    }

    pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        // When reading the BARs, we trap it and return what comes
        // from our local configuration space. We want the guest to
        // use that and not the VFIO device BARs, as they do not map
        // into the guest address space.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            return self.configuration.read_reg(reg_idx);
        }

        if let Some(id) = self.get_msix_cap_idx() {
            let msix = self.interrupt.msix.as_mut().unwrap();
            if reg_idx * 4 == id + 4 {
                return msix.cap.table;
            } else if reg_idx * 4 == id + 8 {
                return msix.cap.pba;
            }
        }

        // Since we don't support passing multi-function devices, we should
        // mask the multi-function bit, bit 7 of the Header Type byte of
        // register 3.
        let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
            0xff7f_ffff
        } else {
            0xffff_ffff
        };

        // The config register read comes from the VFIO device itself.
        let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;

        if let Some(config_patch) = self.patches.get(&reg_idx) {
            value = (value & !config_patch.mask) | config_patch.patch;
        }

        value
    }

    fn state(&self) -> VfioCommonState {
        let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
            enabled: intx.enabled,
        });

        let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
            cap: msi.cfg.cap,
            cap_offset: msi.cap_offset,
        });

        let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
            cap: msix.cap,
            cap_offset: msix.cap_offset,
            bdf: msix.bar.devid,
        });

        VfioCommonState {
            intx_state,
            msi_state,
            msix_state,
        }
    }

    fn set_state(
        &mut self,
        state: &VfioCommonState,
        msi_state: Option<MsiConfigState>,
        msix_state: Option<MsixConfigState>,
    ) -> Result<(), VfioPciError> {
        if let (Some(intx), Some(interrupt_source_group)) =
            (&state.intx_state, self.legacy_interrupt_group.clone())
        {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            if intx.enabled {
                self.enable_intx()?;
            }
        }

        if let Some(msi) = &state.msi_state {
            self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state);
        }

        if let Some(msix) = &state.msix_state {
            self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state);
        }

        Ok(())
    }
}

impl Pausable for VfioCommon {}

impl Snapshottable for VfioCommon {
    fn id(&self) -> String {
        String::from(VFIO_COMMON_ID)
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_common_snapshot = Snapshot::new_from_versioned_state(&self.state())?;

        // Snapshot PciConfiguration
        vfio_common_snapshot.add_snapshot(self.configuration.id(), self.configuration.snapshot()?);

        // Snapshot MSI
        if let Some(msi) = &mut self.interrupt.msi {
            vfio_common_snapshot.add_snapshot(msi.cfg.id(), msi.cfg.snapshot()?);
        }

        // Snapshot MSI-X
        if let Some(msix) = &mut self.interrupt.msix {
            vfio_common_snapshot.add_snapshot(msix.bar.id(), msix.bar.snapshot()?);
        }

        Ok(vfio_common_snapshot)
    }
}

/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
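///
/// On drop, the device unmaps its MMIO regions from the guest and disables
/// any MSI-X, MSI or INTx interrupts that are still enabled (see the Drop
/// implementation further down).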
pub struct VfioPciDevice {
    id: String,
    vm: Arc<dyn hypervisor::Vm>,
    device: Arc<VfioDevice>,
    container: Arc<VfioContainer>,
    common: VfioCommon,
    iommu_attached: bool,
    memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
}

impl VfioPciDevice {
    /// Constructs a new VFIO PCI device for the given VFIO device
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        id: String,
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        iommu_attached: bool,
        bdf: PciBdf,
        memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
        snapshot: Option<Snapshot>,
        x_nv_gpudirect_clique: Option<u8>,
    ) -> Result<Self, VfioPciError> {
        let device = Arc::new(device);
        device.reset();

        let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));

        let common = VfioCommon::new(
            msi_interrupt_manager,
            legacy_interrupt_group,
            Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
            &PciVfioSubclass::VfioSubclass,
            bdf,
            vm_migration::snapshot_from_id(snapshot.as_ref(), VFIO_COMMON_ID),
            x_nv_gpudirect_clique,
        )?;

        let vfio_pci_device = VfioPciDevice {
            id,
            vm: vm.clone(),
            device,
            container,
            common,
            iommu_attached,
            memory_slot,
        };

        Ok(vfio_pci_device)
    }

    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }

    fn generate_sparse_areas(
        caps: &[VfioRegionInfoCap],
        region_index: u32,
        region_start: u64,
        region_size: u64,
        vfio_msix: Option<&VfioMsix>,
    ) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
        for cap in caps {
            match cap {
                VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
                VfioRegionInfoCap::MsixMappable => {
                    if !is_4k_aligned(region_start) {
                        error!(
                            "Region start address 0x{:x} must be at least aligned on 4KiB",
                            region_start
                        );
                        return Err(VfioPciError::RegionAlignment);
                    }
                    if !is_4k_multiple(region_size) {
                        error!(
                            "Region size 0x{:x} must be at least a multiple of 4KiB",
                            region_size
                        );
                        return Err(VfioPciError::RegionSize);
                    }

                    // In case the region contains the MSI-X vectors table or
                    // the MSI-X PBA table, we must calculate the subregions
                    // around them, leading to a list of sparse areas.
                    // We want to make sure we will still trap MMIO accesses
                    // to these MSI-X specific ranges. If these ranges don't
                    // align with the page size, we can achieve that by
                    // enlarging them.
                    //
                    // Using a BTreeMap, as the list provided through the
                    // iterator is sorted by key. This ensures a proper split
                    // of the whole region.
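                    //
                    // For example (illustrative values only), a 16 KiB
                    // mappable region whose MSI-X table occupies
                    // [0x2000, 0x3000) is split into two sparse areas,
                    // [0x0, 0x2000) and [0x3000, 0x4000), so the table
                    // itself stays trapped.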
                    let mut inter_ranges = BTreeMap::new();
                    if let Some(msix) = vfio_msix {
                        if region_index == msix.cap.table_bir() {
                            let (offset, size) = msix.cap.table_range();
                            let offset = align_page_size_down(offset);
                            let size = align_page_size_up(size);
                            inter_ranges.insert(offset, size);
                        }
                        if region_index == msix.cap.pba_bir() {
                            let (offset, size) = msix.cap.pba_range();
                            let offset = align_page_size_down(offset);
                            let size = align_page_size_up(size);
                            inter_ranges.insert(offset, size);
                        }
                    }

                    let mut sparse_areas = Vec::new();
                    let mut current_offset = 0;
                    for (range_offset, range_size) in inter_ranges {
                        if range_offset > current_offset {
                            sparse_areas.push(VfioRegionSparseMmapArea {
                                offset: current_offset,
                                size: range_offset - current_offset,
                            });
                        }
                        current_offset = align_page_size_down(range_offset + range_size);
                    }

                    if region_size > current_offset {
                        sparse_areas.push(VfioRegionSparseMmapArea {
                            offset: current_offset,
                            size: region_size - current_offset,
                        });
                    }

                    return Ok(sparse_areas);
                }
                _ => {}
            }
        }

        // In case no relevant capabilities have been found, create a single
        // sparse area corresponding to the entire MMIO region.
        Ok(vec![VfioRegionSparseMmapArea {
            offset: 0,
            size: region_size,
        }])
    }

    /// Map MMIO regions into the guest, and avoid VM exits when the guest
    /// tries to reach those regions.
    ///
    /// The regions are mapped as user memory regions into the `vm` this
    /// device was created with, using memory slots provided by the
    /// `memory_slot` closure.
    pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
        let fd = self.device.as_raw_fd();

        for region in self.common.mmio_regions.iter_mut() {
            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }

                // Retrieve the list of capabilities found on the region
                let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
                    self.device.get_region_caps(region.index)
                } else {
                    Vec::new()
                };

                // Don't try to mmap the region if it contains the MSI-X table
                // or the MSI-X PBA subregion, and we couldn't find
                // MSIX_MAPPABLE in the list of supported capabilities.
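                // Accesses to such a region keep trapping and are emulated
                // through read_bar()/write_bar() instead.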
                if let Some(msix) = self.common.interrupt.msix.as_ref() {
                    if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
                        && !caps.contains(&VfioRegionInfoCap::MsixMappable)
                    {
                        continue;
                    }
                }

                let mmap_size = self.device.get_region_size(region.index);
                let mmap_offset = self.device.get_region_offset(region.index);

                let sparse_areas = Self::generate_sparse_areas(
                    &caps,
                    region.index,
                    region.start.0,
                    mmap_size,
                    self.common.interrupt.msix.as_ref(),
                )?;

                for area in sparse_areas.iter() {
                    // SAFETY: FFI call with correct arguments
                    let host_addr = unsafe {
                        libc::mmap(
                            null_mut(),
                            area.size as usize,
                            prot,
                            libc::MAP_SHARED,
                            fd,
                            mmap_offset as libc::off_t + area.offset as libc::off_t,
                        )
                    };

                    if host_addr == libc::MAP_FAILED {
                        error!(
                            "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
                            area.offset,
                            area.size,
                            std::io::Error::last_os_error()
                        );
                        return Err(VfioPciError::MmapArea);
                    }

                    if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) {
                        warn!(
                            "Could not mmap sparse area that is not page size aligned (offset = 0x{:x}, size = 0x{:x})",
                            area.offset,
                            area.size,
                        );
                        return Ok(());
                    }

                    let user_memory_region = UserMemoryRegion {
                        slot: (self.memory_slot)(),
                        start: region.start.0 + area.offset,
                        size: area.size,
                        host_addr: host_addr as u64,
                    };

                    region.user_memory_regions.push(user_memory_region);

                    let mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(mem_region)
                        .map_err(VfioPciError::CreateUserMemoryRegion)?;

                    if !self.iommu_attached {
                        self.container
                            .vfio_dma_map(
                                user_memory_region.start,
                                user_memory_region.size,
                                user_memory_region.host_addr,
                            )
                            .map_err(VfioPciError::DmaMap)?;
                    }
                }
            }
        }

        Ok(())
    }

    pub fn unmap_mmio_regions(&mut self) {
        for region in self.common.mmio_regions.iter() {
            for user_memory_region in region.user_memory_regions.iter() {
                // Unmap from vfio container
                if !self.iommu_attached {
                    if let Err(e) = self
                        .container
                        .vfio_dma_unmap(user_memory_region.start, user_memory_region.size)
                    {
                        error!("Could not unmap mmio region from vfio container: {}", e);
                    }
                }

                // Remove region
                let r = self.vm.make_user_memory_region(
                    user_memory_region.slot,
                    user_memory_region.start,
                    user_memory_region.size,
                    user_memory_region.host_addr,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                // SAFETY: FFI call with correct arguments
                let ret = unsafe {
                    libc::munmap(
                        user_memory_region.host_addr as *mut libc::c_void,
                        user_memory_region.size as usize,
                    )
                };
                if ret != 0 {
                    error!(
                        "Could not unmap region {}, error: {}",
                        region.index,
                        io::Error::last_os_error()
                    );
                }
            }
        }
    }

    pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_map(iova, size, user_addr)
                .map_err(VfioPciError::DmaMap)?;
        }

        Ok(())
    }

    pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_unmap(iova, size)
                .map_err(VfioPciError::DmaUnmap)?;
        }

        Ok(())
    }

    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.common.mmio_regions.clone()
    }
}

impl Drop for VfioPciDevice {
    fn drop(&mut self) {
        self.unmap_mmio_regions();

        if let Some(msix) = &self.common.interrupt.msix {
            if msix.bar.enabled() {
                self.common.disable_msix();
            }
        }

        if let Some(msi) = &self.common.interrupt.msi {
            if msi.cfg.enabled() {
                self.common.disable_msi()
            }
        }

        if self.common.interrupt.intx_in_use() {
            self.common.disable_intx();
        }
    }
}

impl BusDevice for VfioPciDevice {
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}

// First BAR offset in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// Extended capabilities register offset in the PCI config space.
const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
// IO BAR when first BAR bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory bar flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// Prefetchable BAR bit
const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device
const BAR_NUMS: usize = 6;
// PCI Header Type register index
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index
const PCI_ROM_EXP_BAR_INDEX: usize = 12;

impl PciDevice for VfioPciDevice {
    fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        self.common
            .allocate_bars(allocator, mmio32_allocator, mmio64_allocator, resources)
    }

    fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        self.common
            .free_bars(allocator, mmio32_allocator, mmio64_allocator)
    }

    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        self.common.write_config_register(reg_idx, offset, data)
    }

    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        self.common.read_config_register(reg_idx)
    }

    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.common
            .configuration
            .detect_bar_reprogramming(reg_idx, data)
    }

    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.common.read_bar(base, offset, data)
    }

    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.common.write_bar(base, offset, data)
    }

    fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
        for region in self.common.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                for user_memory_region in region.user_memory_regions.iter_mut() {
                    // Remove old region
                    let old_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .remove_user_memory_region(old_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

                    // Update the user memory region with the correct start address.
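                    // Only the guest-physical base of the window moves; the
                    // host-side mapping (host_addr) stays the same.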
                    if new_base > old_base {
                        user_memory_region.start += new_base - old_base;
                    } else {
                        user_memory_region.start -= old_base - new_base;
                    }

                    // Insert new region
                    let new_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(new_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
                }
            }
        }

        Ok(())
    }

    fn as_any(&mut self) -> &mut dyn Any {
        self
    }

    fn id(&self) -> Option<String> {
        Some(self.id.clone())
    }
}

impl Pausable for VfioPciDevice {}

impl Snapshottable for VfioPciDevice {
    fn id(&self) -> String {
        self.id.clone()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_pci_dev_snapshot = Snapshot::default();

        // Snapshot VfioCommon
        vfio_pci_dev_snapshot.add_snapshot(self.common.id(), self.common.snapshot()?);

        Ok(vfio_pci_dev_snapshot)
    }
}

impl Transportable for VfioPciDevice {}
impl Migratable for VfioPciDevice {}

/// This structure implements the ExternalDmaMapping trait. It is meant to
/// be used when the caller tries to provide a way to update the mappings
/// associated with a specific VFIO container.
pub struct VfioDmaMapping<M: GuestAddressSpace> {
    container: Arc<VfioContainer>,
    memory: Arc<M>,
    mmio_regions: Arc<Mutex<Vec<MmioRegion>>>,
}

impl<M: GuestAddressSpace> VfioDmaMapping<M> {
    /// Create a DmaMapping object.
    ///
    /// # Parameters
    /// * `container`: VFIO container object.
    /// * `memory`: guest memory to mmap.
    /// * `mmio_regions`: mmio_regions to mmap.
    pub fn new(
        container: Arc<VfioContainer>,
        memory: Arc<M>,
        mmio_regions: Arc<Mutex<Vec<MmioRegion>>>,
    ) -> Self {
        VfioDmaMapping {
            container,
            memory,
            mmio_regions,
        }
    }
}

impl<M: GuestAddressSpace + Sync + Send> ExternalDmaMapping for VfioDmaMapping<M> {
    fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), io::Error> {
        let mem = self.memory.memory();
        let guest_addr = GuestAddress(gpa);
        let user_addr = if mem.check_range(guest_addr, size as usize) {
            match mem.get_host_address(guest_addr) {
                Ok(t) => t as u64,
                Err(e) => {
                    return Err(io::Error::new(
                        io::ErrorKind::Other,
                        format!("unable to retrieve user address for gpa 0x{gpa:x} from guest memory region: {e}")
                    ));
                }
            }
        } else if self.mmio_regions.lock().unwrap().check_range(gpa, size) {
            self.mmio_regions.lock().unwrap().find_user_address(gpa)?
        } else {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                format!("failed to locate guest address 0x{gpa:x} in guest memory"),
            ));
        };

        self.container
            .vfio_dma_map(iova, size, user_addr)
            .map_err(|e| {
                io::Error::new(
                    io::ErrorKind::Other,
                    format!(
                        "failed to map memory for VFIO container, \
                        iova 0x{iova:x}, gpa 0x{gpa:x}, size 0x{size:x}: {e:?}"
                    ),
                )
            })
    }

    fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), io::Error> {
        self.container.vfio_dma_unmap(iova, size).map_err(|e| {
            io::Error::new(
                io::ErrorKind::Other,
                format!(
                    "failed to unmap memory for VFIO container, \
                    iova 0x{iova:x}, size 0x{size:x}: {e:?}"
                ),
            )
        })
    }
}
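
// A minimal sketch of how the MmioRegionRange helpers behave, using synthetic
// addresses (0x1000, 0xdead_0000, etc. are illustrative values, not real
// device mappings).
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_mmio_region_range() {
        // One 8 KiB MMIO region at GPA 0x1000, backed by a single user
        // mapping at host address 0xdead_0000.
        let regions = vec![MmioRegion {
            start: GuestAddress(0x1000),
            length: 0x2000,
            type_: PciBarRegionType::Memory32BitRegion,
            index: 0,
            user_memory_regions: vec![UserMemoryRegion {
                slot: 0,
                start: 0x1000,
                size: 0x2000,
                host_addr: 0xdead_0000,
            }],
        }];

        // A range fully contained in the region is accepted, one spilling
        // past the end of the region is not.
        assert!(regions.check_range(0x1000, 0x2000));
        assert!(!regions.check_range(0x2000, 0x2000));

        // Guest-to-user translation adds the offset into the mapping.
        assert_eq!(regions.find_user_address(0x1800).unwrap(), 0xdead_0800);
        assert!(regions.find_user_address(0x4000).is_err());
    }
}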