// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//

use std::any::Any;
use std::collections::{BTreeMap, HashMap};
use std::io;
use std::os::unix::io::AsRawFd;
use std::ptr::null_mut;
use std::sync::{Arc, Barrier, Mutex};

use anyhow::anyhow;
use byteorder::{ByteOrder, LittleEndian};
use hypervisor::HypervisorVmError;
use libc::{sysconf, _SC_PAGESIZE};
use serde::{Deserialize, Serialize};
use thiserror::Error;
use vfio_bindings::bindings::vfio::*;
use vfio_ioctls::{
    VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
};
use vm_allocator::page_size::{
    align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned,
};
use vm_allocator::{AddressAllocator, SystemAllocator};
use vm_device::dma_mapping::ExternalDmaMapping;
use vm_device::interrupt::{
    InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
};
use vm_device::{BusDevice, Resource};
use vm_memory::{Address, GuestAddress, GuestAddressSpace, GuestMemory, GuestUsize};
use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable};
use vmm_sys_util::eventfd::EventFd;

use crate::msi::{MsiConfigState, MSI_CONFIG_ID};
use crate::msix::MsixConfigState;
use crate::{
    msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
    PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
    PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
    PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID,
};

pub(crate) const VFIO_COMMON_ID: &str = "vfio_common";

#[derive(Debug, Error)]
pub enum VfioPciError {
    #[error("Failed to create user memory region: {0}")]
    CreateUserMemoryRegion(#[source] HypervisorVmError),
    #[error("Failed to DMA map: {0}")]
    DmaMap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to DMA unmap: {0}")]
    DmaUnmap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to enable INTx: {0}")]
    EnableIntx(#[source] VfioError),
    #[error("Failed to enable MSI: {0}")]
    EnableMsi(#[source] VfioError),
    #[error("Failed to enable MSI-X: {0}")]
    EnableMsix(#[source] VfioError),
    #[error("Failed to mmap the area")]
    MmapArea,
    #[error("Failed to find the notifier's eventfd")]
    MissingNotifier,
    #[error("Invalid region alignment")]
    RegionAlignment,
    #[error("Invalid region size")]
    RegionSize,
    #[error("Failed to retrieve MsiConfigState: {0}")]
    RetrieveMsiConfigState(#[source] anyhow::Error),
    #[error("Failed to retrieve MsixConfigState: {0}")]
    RetrieveMsixConfigState(#[source] anyhow::Error),
    #[error("Failed to retrieve PciConfigurationState: {0}")]
    RetrievePciConfigurationState(#[source] anyhow::Error),
    #[error("Failed to retrieve VfioCommonState: {0}")]
    RetrieveVfioCommonState(#[source] anyhow::Error),
}

#[derive(Copy, Clone)]
enum PciVfioSubclass {
    VfioSubclass = 0xff,
}

impl PciSubclass for PciVfioSubclass {
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}

enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}

#[derive(Serialize, Deserialize)]
struct IntxState {
    enabled: bool,
}

pub(crate) struct VfioIntx {
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    enabled: bool,
}
#[derive(Serialize, Deserialize)]
struct MsiState {
    cap: MsiCap,
    cap_offset: u32,
}

pub(crate) struct VfioMsi {
    pub(crate) cfg: MsiConfig,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsi {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.cfg.enabled();

        self.cfg.update(offset, data);

        let new_enabled = self.cfg.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsi);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsi);
        }

        None
    }
}

#[derive(Serialize, Deserialize)]
struct MsixState {
    cap: MsixCap,
    cap_offset: u32,
    bdf: u32,
}

pub(crate) struct VfioMsix {
    pub(crate) bar: MsixConfig,
    cap: MsixCap,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsix {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.bar.enabled();

        // Update "Message Control" word
        if offset == 2 && data.len() == 2 {
            self.bar.set_msg_ctl(LittleEndian::read_u16(data));
        }

        let new_enabled = self.bar.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsix);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsix);
        }

        None
    }

    fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        let table_offset: u64 = u64::from(self.cap.table_offset());
        let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
        let table_bir: u32 = self.cap.table_bir();

        bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
    }
}
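// Worked example for `table_accessed()` above (illustrative values, not from
// any real device): with `table_bir() == 0`, `table_offset() == 0x2000` and
// `table_size() == 8` entries, the table spans 8 * MSIX_TABLE_ENTRY_SIZE =
// 0x80 bytes, i.e. [0x2000, 0x2080) of BAR 0. An access at offset 0x2040 of
// BAR 0 therefore hits the table, while offsets 0x1fff and 0x2080 do not.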
pub(crate) struct Interrupt {
    pub(crate) intx: Option<VfioIntx>,
    pub(crate) msi: Option<VfioMsi>,
    pub(crate) msix: Option<VfioMsix>,
}

impl Interrupt {
    fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msi) = &mut self.msi {
            let action = msi.update(offset, data);
            return action;
        }

        None
    }

    fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msix) = &mut self.msix {
            let action = msix.update(offset, data);
            return action;
        }

        None
    }

    fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
        if let Some(msi) = &self.msi {
            if offset >= u64::from(msi.cap_offset)
                && offset < u64::from(msi.cap_offset) + msi.cfg.size()
            {
                return Some((
                    PciCapabilityId::MessageSignalledInterrupts,
                    u64::from(msi.cap_offset),
                ));
            }
        }

        if let Some(msix) = &self.msix {
            if offset == u64::from(msix.cap_offset) {
                return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
            }
        }

        None
    }

    fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        if let Some(msix) = &self.msix {
            return msix.table_accessed(bar_index, offset);
        }

        false
    }

    fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
        if let Some(ref mut msix) = &mut self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.write_table(offset, data)
        }
    }

    fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
        if let Some(msix) = &self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.read_table(offset, data)
        }
    }

    pub(crate) fn intx_in_use(&self) -> bool {
        if let Some(intx) = &self.intx {
            return intx.enabled;
        }

        false
    }
}

#[derive(Copy, Clone)]
pub struct UserMemoryRegion {
    pub slot: u32,
    pub start: u64,
    pub size: u64,
    pub host_addr: u64,
}

#[derive(Clone)]
pub struct MmioRegion {
    pub start: GuestAddress,
    pub length: GuestUsize,
    pub(crate) type_: PciBarRegionType,
    pub(crate) index: u32,
    pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
}

trait MmioRegionRange {
    fn check_range(&self, guest_addr: u64, size: u64) -> bool;
    fn find_user_address(&self, guest_addr: u64) -> Result<u64, io::Error>;
}

impl MmioRegionRange for Vec<MmioRegion> {
    // Check if a guest address is within the range of mmio regions
    fn check_range(&self, guest_addr: u64, size: u64) -> bool {
        for region in self.iter() {
            let Some(guest_addr_end) = guest_addr.checked_add(size) else {
                return false;
            };
            let Some(region_end) = region.start.raw_value().checked_add(region.length) else {
                return false;
            };
            if guest_addr >= region.start.raw_value() && guest_addr_end <= region_end {
                return true;
            }
        }
        false
    }

    // Locate the user region address for a guest address within all mmio regions
    fn find_user_address(&self, guest_addr: u64) -> Result<u64, io::Error> {
        for region in self.iter() {
            for user_region in region.user_memory_regions.iter() {
                if guest_addr >= user_region.start
                    && guest_addr < user_region.start + user_region.size
                {
                    return Ok(user_region.host_addr + (guest_addr - user_region.start));
                }
            }
        }

        Err(io::Error::new(
            io::ErrorKind::Other,
            format!("unable to find user address: 0x{guest_addr:x}"),
        ))
    }
}
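// Illustrative sketch (not part of the original source): a minimal unit test
// showing the intended semantics of `check_range()` and `find_user_address()`
// for a single MMIO region. All field values are made up for the example.
#[cfg(test)]
mod mmio_region_range_tests {
    use super::*;

    #[test]
    fn check_range_and_find_user_address() {
        let regions = vec![MmioRegion {
            start: GuestAddress(0x1000),
            length: 0x2000,
            type_: PciBarRegionType::Memory64BitRegion,
            index: 0,
            user_memory_regions: vec![UserMemoryRegion {
                slot: 0,
                start: 0x1000,
                size: 0x2000,
                host_addr: 0x9000,
            }],
        }];

        // [0x1000, 0x3000) is fully contained in the region.
        assert!(regions.check_range(0x1000, 0x2000));
        // A range crossing the end of the region is rejected.
        assert!(!regions.check_range(0x2000, 0x2000));
        // Guest address 0x1800 maps to host address 0x9000 + 0x800.
        assert_eq!(regions.find_user_address(0x1800).unwrap(), 0x9800);
    }
}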
#[derive(Debug, Error)]
pub enum VfioError {
    #[error("Kernel VFIO error: {0}")]
    KernelVfio(#[source] vfio_ioctls::VfioError),
    #[error("VFIO user error: {0}")]
    VfioUser(#[source] vfio_user::Error),
}

pub(crate) trait Vfio: Send + Sync {
    fn read_config_byte(&self, offset: u32) -> u8 {
        let mut data: [u8; 1] = [0];
        self.read_config(offset, &mut data);
        data[0]
    }

    fn read_config_word(&self, offset: u32) -> u16 {
        let mut data: [u8; 2] = [0, 0];
        self.read_config(offset, &mut data);
        u16::from_le_bytes(data)
    }

    fn read_config_dword(&self, offset: u32) -> u32 {
        let mut data: [u8; 4] = [0, 0, 0, 0];
        self.read_config(offset, &mut data);
        u32::from_le_bytes(data)
    }

    fn write_config_dword(&self, offset: u32, buf: u32) {
        let data: [u8; 4] = buf.to_le_bytes();
        self.write_config(offset, &data)
    }

    fn read_config(&self, offset: u32, data: &mut [u8]) {
        self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
    }

    fn write_config(&self, offset: u32, data: &[u8]) {
        self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
    }

    fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
    }

    fn disable_msi(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
    }

    fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
    }

    fn disable_msix(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
    }

    fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
        unimplemented!()
    }

    fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
        unimplemented!()
    }

    fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
        unimplemented!()
    }

    fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }
}
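// Illustrative sketch (not part of the original source): a trivial in-memory
// `Vfio` test double backing only the PCI config region. It shows how the
// little-endian `read_config_*`/`write_config_*` helpers above compose on top
// of `region_read()`/`region_write()`.
#[cfg(test)]
mod vfio_trait_tests {
    use super::*;

    struct DummyVfio {
        // 256 bytes of fake PCI config space.
        config: std::sync::Mutex<Vec<u8>>,
    }

    impl Vfio for DummyVfio {
        fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
            assert_eq!(index, VFIO_PCI_CONFIG_REGION_INDEX);
            let config = self.config.lock().unwrap();
            let offset = offset as usize;
            data.copy_from_slice(&config[offset..offset + data.len()]);
        }

        fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
            assert_eq!(index, VFIO_PCI_CONFIG_REGION_INDEX);
            let mut config = self.config.lock().unwrap();
            let offset = offset as usize;
            config[offset..offset + data.len()].copy_from_slice(data);
        }
    }

    #[test]
    fn config_helpers_are_little_endian() {
        let vfio = DummyVfio {
            config: std::sync::Mutex::new(vec![0u8; 256]),
        };

        // 0x1234_5678 is stored as [0x78, 0x56, 0x34, 0x12].
        vfio.write_config_dword(0, 0x1234_5678);
        assert_eq!(vfio.read_config_byte(0), 0x78);
        assert_eq!(vfio.read_config_word(2), 0x1234);
        assert_eq!(vfio.read_config_dword(0), 0x1234_5678);
    }
}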
struct VfioDeviceWrapper {
    device: Arc<VfioDevice>,
}

impl VfioDeviceWrapper {
    fn new(device: Arc<VfioDevice>) -> Self {
        Self { device }
    }
}

impl Vfio for VfioDeviceWrapper {
    fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
        self.device.region_read(index, data, offset)
    }

    fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
        self.device.region_write(index, data, offset)
    }

    fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
        self.device.get_irq_info(irq_index).copied()
    }

    fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.device
            .enable_irq(irq_index, event_fds)
            .map_err(VfioError::KernelVfio)
    }

    fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .disable_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }

    fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .unmask_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }
}

#[derive(Serialize, Deserialize)]
struct VfioCommonState {
    intx_state: Option<IntxState>,
    msi_state: Option<MsiState>,
    msix_state: Option<MsixState>,
}

pub(crate) struct ConfigPatch {
    mask: u32,
    patch: u32,
}

pub(crate) struct VfioCommon {
    pub(crate) configuration: PciConfiguration,
    pub(crate) mmio_regions: Vec<MmioRegion>,
    pub(crate) interrupt: Interrupt,
    pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
    pub(crate) vfio_wrapper: Arc<dyn Vfio>,
    pub(crate) patches: HashMap<usize, ConfigPatch>,
    x_nv_gpudirect_clique: Option<u8>,
}

impl VfioCommon {
    pub(crate) fn new(
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        vfio_wrapper: Arc<dyn Vfio>,
        subclass: &dyn PciSubclass,
        bdf: PciBdf,
        snapshot: Option<Snapshot>,
        x_nv_gpudirect_clique: Option<u8>,
    ) -> Result<Self, VfioPciError> {
        let pci_configuration_state =
            vm_migration::state_from_id(snapshot.as_ref(), PCI_CONFIGURATION_ID).map_err(|e| {
                VfioPciError::RetrievePciConfigurationState(anyhow!(
                    "Failed to get PciConfigurationState from Snapshot: {}",
                    e
                ))
            })?;

        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            subclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
            pci_configuration_state,
        );

        let mut vfio_common = VfioCommon {
            mmio_regions: Vec::new(),
            configuration,
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
            msi_interrupt_manager,
            legacy_interrupt_group,
            vfio_wrapper,
            patches: HashMap::new(),
            x_nv_gpudirect_clique,
        };

        let state: Option<VfioCommonState> = snapshot
            .as_ref()
            .map(|s| s.to_state())
            .transpose()
            .map_err(|e| {
                VfioPciError::RetrieveVfioCommonState(anyhow!(
                    "Failed to get VfioCommonState from Snapshot: {}",
                    e
                ))
            })?;
        let msi_state =
            vm_migration::state_from_id(snapshot.as_ref(), MSI_CONFIG_ID).map_err(|e| {
                VfioPciError::RetrieveMsiConfigState(anyhow!(
                    "Failed to get MsiConfigState from Snapshot: {}",
                    e
                ))
            })?;
        let msix_state =
            vm_migration::state_from_id(snapshot.as_ref(), MSIX_CONFIG_ID).map_err(|e| {
                VfioPciError::RetrieveMsixConfigState(anyhow!(
                    "Failed to get MsixConfigState from Snapshot: {}",
                    e
                ))
            })?;

        if let Some(state) = state.as_ref() {
            vfio_common.set_state(state, msi_state, msix_state)?;
        } else {
            vfio_common.parse_capabilities(bdf);
            vfio_common.initialize_legacy_interrupt()?;
        }

        Ok(vfio_common)
    }

    /// If the MSI-X table offset is not page size aligned, do some fixup to
    /// achieve it, because we don't want the MMIO RW region and the trap
    /// region to overlap each other.
    fn fixup_msix_region(&mut self, bar_id: u32, region_size: u64) -> u64 {
        if let Some(msix) = self.interrupt.msix.as_mut() {
            let msix_cap = &mut msix.cap;

            // We assume table_bir equals pba_bir here.
            let (table_offset, table_size) = msix_cap.table_range();
            if is_page_size_aligned(table_offset) || msix_cap.table_bir() != bar_id {
                return region_size;
            }

            let (pba_offset, pba_size) = msix_cap.pba_range();
            let msix_sz = align_page_size_up(table_size + pba_size);
            // Expand the region so it can hold both the RW region and the
            // trap region, each of them page size aligned.
            let size = std::cmp::max(region_size * 2, msix_sz * 2);
            // Let the table start from the middle of the region.
            msix_cap.table_set_offset((size / 2) as u32);
            msix_cap.pba_set_offset((size / 2 + pba_offset - table_offset) as u32);

            size
        } else {
            // MSI-X is not supported by this device.
            region_size
        }
    }
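    // Worked example for `fixup_msix_region()` above (illustrative numbers,
    // assuming a 4 KiB page size): for a 0x2000-byte BAR whose MSI-X table
    // starts at the unaligned offset 0x800, the region is doubled to 0x4000
    // bytes, the table is moved to offset 0x2000 (the middle of the region),
    // and the PBA keeps its original distance from the table. The first half
    // can then be mmap'ed while the page-aligned second half stays trapped.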
    // The `allocator` argument is unused on `aarch64`
    #[allow(unused_variables)]
    pub(crate) fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        let mut bars = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX;

        // Going through all regular regions to compute the BAR size.
        // We're not saving the BAR address to restore it, because we
        // are going to allocate a guest address for each BAR and write
        // that new address back.
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let mut region_size: u64 = 0;
            let mut region_type = PciBarRegionType::Memory32BitRegion;
            let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
            let mut flags: u32 = 0;

            let mut restored_bar_addr = None;
            if let Some(resources) = &resources {
                for resource in resources {
                    if let Resource::PciBar {
                        index,
                        base,
                        size,
                        type_,
                        ..
                    } = resource
                    {
                        if *index == bar_id as usize {
                            restored_bar_addr = Some(GuestAddress(*base));
                            region_size = *size;
                            region_type = PciBarRegionType::from(*type_);
                            break;
                        }
                    }
                }
                if restored_bar_addr.is_none() {
                    bar_id += 1;
                    continue;
                }
            } else {
                let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                    (PCI_ROM_EXP_BAR_INDEX * 4) as u32
                } else {
                    PCI_CONFIG_BAR_OFFSET + bar_id * 4
                };

                // First read the flags.
                flags = self.vfio_wrapper.read_config_dword(bar_offset);

                // Is this an IO BAR?
                let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
                } else {
                    false
                };

                // Is this a 64-bit BAR?
                let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(
                        flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                        PCI_CONFIG_MEMORY_BAR_64BIT
                    )
                } else {
                    false
                };

                if matches!(
                    flags & PCI_CONFIG_BAR_PREFETCHABLE,
                    PCI_CONFIG_BAR_PREFETCHABLE
                ) {
                    prefetchable = PciBarPrefetchable::Prefetchable
                };

                // To get the size, write all 1s.
                self.vfio_wrapper
                    .write_config_dword(bar_offset, 0xffff_ffff);

                // And read back the BAR value. The device will write zeros for bits it doesn't care about.
                let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);

                if io_bar {
                    // Mask flag bits (lowest 2 for I/O BARs).
                    lower &= !0b11;

                    // BAR is not enabled.
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // IO BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Invert bits and add 1 to calculate the size.
                    region_size = (!lower + 1) as u64;
                } else if is_64bit_bar {
                    // 64-bit memory BAR
                    region_type = PciBarRegionType::Memory64BitRegion;

                    // Query the size of the upper half of the 64-bit BAR.
                    let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                    self.vfio_wrapper
                        .write_config_dword(upper_offset, 0xffff_ffff);
                    let upper = self.vfio_wrapper.read_config_dword(upper_offset);

                    let mut combined_size = u64::from(upper) << 32 | u64::from(lower);

                    // Mask out flag bits (lowest 4 for memory BARs).
                    combined_size &= !0b1111;

                    // BAR is not enabled.
                    if combined_size == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find the size.
                    region_size = !combined_size + 1;
                } else {
                    region_type = PciBarRegionType::Memory32BitRegion;

                    // Mask out flag bits (lowest 4 for memory BARs).
                    lower &= !0b1111;

                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find the size.
                    region_size = (!lower + 1) as u64;
                }
            }
            let bar_addr = match region_type {
                PciBarRegionType::IoRegion => {
                    #[cfg(not(target_arch = "x86_64"))]
                    unimplemented!();

                    // The address needs to be 4 bytes aligned.
                    #[cfg(target_arch = "x86_64")]
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory32BitRegion => {
                    // BAR allocation must be naturally aligned
                    mmio32_allocator
                        .allocate(restored_bar_addr, region_size, Some(region_size))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory64BitRegion => {
                    // We need to do some fixup to keep the MMIO RW region and
                    // the MSI-X cap region page size aligned.
                    region_size = self.fixup_msix_region(bar_id, region_size);
                    mmio64_allocator
                        .allocate(
                            restored_bar_addr,
                            region_size,
                            Some(std::cmp::max(
                                // SAFETY: FFI call. Trivially safe.
                                unsafe { sysconf(_SC_PAGESIZE) as GuestUsize },
                                region_size,
                            )),
                        )
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
            };

            // We can now build our BAR configuration block.
            let bar = PciBarConfiguration::default()
                .set_index(bar_id as usize)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type)
                .set_prefetchable(prefetchable);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                self.configuration
                    .add_pci_rom_bar(&bar, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&bar)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            bars.push(bar);
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id,
                user_memory_regions: Vec::new(),
            });

            bar_id += 1;
            if region_type == PciBarRegionType::Memory64BitRegion {
                bar_id += 1;
            }
        }

        Ok(bars)
    }
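    // Worked example of the BAR sizing protocol used above (illustrative
    // value): after writing 0xffff_ffff to a 32-bit memory BAR, a read-back
    // of 0xffff_e008 means the device hardwired the low address bits to zero
    // (the 0x8 is just the prefetchable flag bit). Masking the four flag bits
    // gives 0xffff_e000, and !0xffff_e000 + 1 = 0x2000, i.e. an 8 KiB BAR.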
    // The `allocator` argument is unused on `aarch64`
    #[allow(unused_variables)]
    pub(crate) fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        for region in self.mmio_regions.iter() {
            match region.type_ {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "x86_64")]
                    allocator.free_io_addresses(region.start, region.length);
                    #[cfg(not(target_arch = "x86_64"))]
                    error!("I/O region is not supported");
                }
                PciBarRegionType::Memory32BitRegion => {
                    mmio32_allocator.free(region.start, region.length);
                }
                PciBarRegionType::Memory64BitRegion => {
                    mmio64_allocator.free(region.start, region.length);
                }
            }
        }
        Ok(())
    }

    pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
        let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());

        let table = self.vfio_wrapper.read_config_dword((cap + 4).into());

        let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());

        MsixCap {
            msg_ctl,
            table,
            pba,
        }
    }

    pub(crate) fn initialize_msix(
        &mut self,
        msix_cap: MsixCap,
        cap_offset: u32,
        bdf: PciBdf,
        state: Option<MsixConfigState>,
    ) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msix_cap.table_size() as InterruptIndex,
            })
            .unwrap();

        let msix_config = MsixConfig::new(
            msix_cap.table_size(),
            interrupt_source_group.clone(),
            bdf.into(),
            state,
        )
        .unwrap();

        self.interrupt.msix = Some(VfioMsix {
            bar: msix_config,
            cap: msix_cap,
            cap_offset,
            interrupt_source_group,
        });
    }

    pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
        self.vfio_wrapper.read_config_word((cap + 2).into())
    }

    pub(crate) fn initialize_msi(
        &mut self,
        msg_ctl: u16,
        cap_offset: u32,
        state: Option<MsiConfigState>,
    ) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
            })
            .unwrap();

        let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap();

        self.interrupt.msi = Some(VfioMsi {
            cfg: msi_config,
            cap_offset,
            interrupt_source_group,
        });
    }

    pub(crate) fn get_msix_cap_idx(&self) -> Option<usize> {
        let mut cap_next = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        while cap_next != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());
            if PciCapabilityId::from(cap_id) == PciCapabilityId::MsiX {
                return Some(cap_next as usize);
            } else {
                cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
            }
        }

        None
    }

    pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) {
        let mut cap_iter = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        let mut pci_express_cap_found = false;
        let mut power_management_cap_found = false;

        while cap_iter != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_iter.into());

            match PciCapabilityId::from(cap_id) {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI.
                            let msg_ctl = self.parse_msi_capabilities(cap_iter);
                            self.initialize_msi(msg_ctl, cap_iter as u32, None);
                        }
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
                    {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI-X.
                            let msix_cap = self.parse_msix_capabilities(cap_iter);
                            self.initialize_msix(msix_cap, cap_iter as u32, bdf, None);
                        }
                    }
                }
                PciCapabilityId::PciExpress => pci_express_cap_found = true,
                PciCapabilityId::PowerManagement => power_management_cap_found = true,
                _ => {}
            };

            let cap_next = self.vfio_wrapper.read_config_byte((cap_iter + 1).into());
            if cap_next == 0 {
                break;
            }

            cap_iter = cap_next;
        }

        if let Some(clique_id) = self.x_nv_gpudirect_clique {
            self.add_nv_gpudirect_clique_cap(cap_iter, clique_id);
        }

        if pci_express_cap_found && power_management_cap_found {
            self.parse_extended_capabilities();
        }
    }
    fn add_nv_gpudirect_clique_cap(&mut self, cap_iter: u8, clique_id: u8) {
        // Turing, Ampere, Hopper, and Lovelace GPUs have dedicated space
        // at 0xD4 for this capability.
        let cap_offset = 0xd4u32;

        // Patch the next-capability pointer (byte 1 of the dword-aligned
        // capability header at `cap_iter`) so the list now points at 0xd4.
        let reg_idx = (cap_iter / 4) as usize;
        self.patches.insert(
            reg_idx,
            ConfigPatch {
                mask: 0x0000_ff00,
                patch: cap_offset << 8,
            },
        );

        // Vendor-specific capability header: ID 0x09, next pointer 0x00
        // (end of list), length 0x08.
        let reg_idx = (cap_offset / 4) as usize;
        self.patches.insert(
            reg_idx,
            ConfigPatch {
                mask: 0xffff_ffff,
                patch: 0x50080009u32,
            },
        );
        self.patches.insert(
            reg_idx + 1,
            ConfigPatch {
                mask: 0xffff_ffff,
                patch: u32::from(clique_id) << 19 | 0x5032,
            },
        );
    }
    fn parse_extended_capabilities(&mut self) {
        let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;

        loop {
            let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);

            // Extended capability header: bits [15:0] hold the capability ID
            // and bits [31:20] the offset of the next capability.
            let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
            let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;

            match PciExpressCapabilityId::from(cap_id) {
                PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation
                | PciExpressCapabilityId::ResizeableBar
                | PciExpressCapabilityId::SingleRootIoVirtualization => {
                    let reg_idx = (current_offset / 4) as usize;
                    self.patches.insert(
                        reg_idx,
                        ConfigPatch {
                            mask: 0x0000_ffff,
                            patch: PciExpressCapabilityId::NullCapability as u32,
                        },
                    );
                }
                _ => {}
            }

            if cap_next == 0 {
                break;
            }

            current_offset = cap_next.into();
        }
    }

    pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
        if let Some(intx) = &mut self.interrupt.intx {
            if !intx.enabled {
                if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
                    self.vfio_wrapper
                        .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
                        .map_err(VfioPciError::EnableIntx)?;

                    intx.enabled = true;
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }
        }

        Ok(())
    }

    pub(crate) fn disable_intx(&mut self) {
        if let Some(intx) = &mut self.interrupt.intx {
            if intx.enabled {
                if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                    error!("Could not disable INTx: {}", e);
                } else {
                    intx.enabled = false;
                }
            }
        }
    }

    pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
        if let Some(msi) = &self.interrupt.msi {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msi.cfg.num_enabled_vectors() {
                if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msi(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsi)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msi(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msi() {
            error!("Could not disable MSI: {}", e);
        }
    }

    pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
        if let Some(msix) = &self.interrupt.msix {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msix.bar.table_entries.len() {
                if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msix(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsix)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msix(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msix() {
            error!("Could not disable MSI-X: {}", e);
        }
    }

    pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
        if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
            if irq_info.count == 0 {
                // A count of 0 means the INTx IRQ is not supported, therefore
                // it shouldn't be initialized.
                return Ok(());
            }
        }

        if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            self.enable_intx()?;
        }

        Ok(())
    }

    pub(crate) fn update_msi_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msi(offset, data) {
            Some(InterruptUpdateAction::EnableMsi) => {
                // Disable INTx before we can enable MSI
                self.disable_intx();
                self.enable_msi()?;
            }
            Some(InterruptUpdateAction::DisableMsi) => {
                // Fall back onto INTx when disabling MSI
                self.disable_msi();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn update_msix_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msix(offset, data) {
            Some(InterruptUpdateAction::EnableMsix) => {
                // Disable INTx before we can enable MSI-X
                self.disable_intx();
                self.enable_msix()?;
            }
            Some(InterruptUpdateAction::DisableMsix) => {
                // Fall back onto INTx when disabling MSI-X
                self.disable_msix();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
        for region in self.mmio_regions.iter() {
            if addr >= region.start.raw_value()
                && addr < region.start.unchecked_add(region.length).raw_value()
            {
                return Some(region.clone());
            }
        }
        None
    }

    pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_read_table(offset, data);
            } else {
                self.vfio_wrapper.region_read(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest reading from the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }
    }
    pub(crate) fn write_bar(
        &mut self,
        base: u64,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            // If the MSI-X table is written to, we need to update our cache.
            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_write_table(offset, data);
            } else {
                self.vfio_wrapper.region_write(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest writing to the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }

        None
    }

    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming the
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through the VFIO_DEVICE_SET_IRQS ioctl, and only then can we
        // write to the device region to update the MSI Enable bit.
        self.vfio_wrapper.write_config((reg + offset) as u32, data);

        None
    }
    pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        // When reading the BARs, we trap it and return what comes
        // from our local configuration space. We want the guest to
        // use that and not the VFIO device BARs, as those do not map
        // into the guest address space.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            return self.configuration.read_reg(reg_idx);
        }

        if let Some(id) = self.get_msix_cap_idx() {
            let msix = self.interrupt.msix.as_mut().unwrap();
            if reg_idx * 4 == id + 4 {
                return msix.cap.table;
            } else if reg_idx * 4 == id + 8 {
                return msix.cap.pba;
            }
        }

        // Since we don't support passing multi-function devices, we should
        // mask the multi-function bit, bit 7 of the Header Type byte in
        // register 3.
        let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
            0xff7f_ffff
        } else {
            0xffff_ffff
        };

        // The config register read comes from the VFIO device itself.
        let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;

        if let Some(config_patch) = self.patches.get(&reg_idx) {
            value = (value & !config_patch.mask) | config_patch.patch;
        }

        value
    }

    fn state(&self) -> VfioCommonState {
        let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
            enabled: intx.enabled,
        });

        let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
            cap: msi.cfg.cap,
            cap_offset: msi.cap_offset,
        });

        let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
            cap: msix.cap,
            cap_offset: msix.cap_offset,
            bdf: msix.bar.devid,
        });

        VfioCommonState {
            intx_state,
            msi_state,
            msix_state,
        }
    }

    fn set_state(
        &mut self,
        state: &VfioCommonState,
        msi_state: Option<MsiConfigState>,
        msix_state: Option<MsixConfigState>,
    ) -> Result<(), VfioPciError> {
        if let (Some(intx), Some(interrupt_source_group)) =
            (&state.intx_state, self.legacy_interrupt_group.clone())
        {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            if intx.enabled {
                self.enable_intx()?;
            }
        }

        if let Some(msi) = &state.msi_state {
            self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state);
        }

        if let Some(msix) = &state.msix_state {
            self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state);
        }

        Ok(())
    }
}

impl Pausable for VfioCommon {}

impl Snapshottable for VfioCommon {
    fn id(&self) -> String {
        String::from(VFIO_COMMON_ID)
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_common_snapshot = Snapshot::new_from_state(&self.state())?;

        // Snapshot PciConfiguration
        vfio_common_snapshot.add_snapshot(self.configuration.id(), self.configuration.snapshot()?);

        // Snapshot MSI
        if let Some(msi) = &mut self.interrupt.msi {
            vfio_common_snapshot.add_snapshot(msi.cfg.id(), msi.cfg.snapshot()?);
        }

        // Snapshot MSI-X
        if let Some(msix) = &mut self.interrupt.msix {
            vfio_common_snapshot.add_snapshot(msix.bar.id(), msix.bar.snapshot()?);
        }

        Ok(vfio_common_snapshot)
    }
}

/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
pub struct VfioPciDevice {
    id: String,
    vm: Arc<dyn hypervisor::Vm>,
    device: Arc<VfioDevice>,
    container: Arc<VfioContainer>,
    common: VfioCommon,
    iommu_attached: bool,
    memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
}

impl VfioPciDevice {
    /// Constructs a new VFIO PCI device for the given VFIO device
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        id: String,
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        iommu_attached: bool,
        bdf: PciBdf,
        memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
        snapshot: Option<Snapshot>,
        x_nv_gpudirect_clique: Option<u8>,
    ) -> Result<Self, VfioPciError> {
        let device = Arc::new(device);
        device.reset();

        let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));

        let common = VfioCommon::new(
            msi_interrupt_manager,
            legacy_interrupt_group,
            Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
            &PciVfioSubclass::VfioSubclass,
            bdf,
            vm_migration::snapshot_from_id(snapshot.as_ref(), VFIO_COMMON_ID),
            x_nv_gpudirect_clique,
        )?;

        let vfio_pci_device = VfioPciDevice {
            id,
            vm: vm.clone(),
            device,
            container,
            common,
            iommu_attached,
            memory_slot,
        };

        Ok(vfio_pci_device)
    }

    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }
    fn generate_sparse_areas(
        caps: &[VfioRegionInfoCap],
        region_index: u32,
        region_start: u64,
        region_size: u64,
        vfio_msix: Option<&VfioMsix>,
    ) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
        for cap in caps {
            match cap {
                VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
                VfioRegionInfoCap::MsixMappable => {
                    if !is_4k_aligned(region_start) {
                        error!(
                            "Region start address 0x{:x} must be at least 4KiB aligned",
                            region_start
                        );
                        return Err(VfioPciError::RegionAlignment);
                    }
                    if !is_4k_multiple(region_size) {
                        error!(
                            "Region size 0x{:x} must be at least a multiple of 4KiB",
                            region_size
                        );
                        return Err(VfioPciError::RegionSize);
                    }

                    // In case the region contains the MSI-X vectors table or
                    // the MSI-X PBA table, we must calculate the subregions
                    // around them, leading to a list of sparse areas.
                    // We want to make sure we will still trap MMIO accesses
                    // to these MSI-X specific ranges. If these ranges are not
                    // page size aligned, we achieve that by enlarging them.
                    //
                    // Using a BTreeMap, as the list provided through the
                    // iterator is sorted by key. This ensures a proper split
                    // of the whole region.
                    let mut inter_ranges = BTreeMap::new();
                    if let Some(msix) = vfio_msix {
                        if region_index == msix.cap.table_bir() {
                            let (offset, size) = msix.cap.table_range();
                            let offset = align_page_size_down(offset);
                            let size = align_page_size_up(size);
                            inter_ranges.insert(offset, size);
                        }
                        if region_index == msix.cap.pba_bir() {
                            let (offset, size) = msix.cap.pba_range();
                            let offset = align_page_size_down(offset);
                            let size = align_page_size_up(size);
                            inter_ranges.insert(offset, size);
                        }
                    }

                    let mut sparse_areas = Vec::new();
                    let mut current_offset = 0;
                    for (range_offset, range_size) in inter_ranges {
                        if range_offset > current_offset {
                            sparse_areas.push(VfioRegionSparseMmapArea {
                                offset: current_offset,
                                size: range_offset - current_offset,
                            });
                        }
                        current_offset = align_page_size_down(range_offset + range_size);
                    }

                    if region_size > current_offset {
                        sparse_areas.push(VfioRegionSparseMmapArea {
                            offset: current_offset,
                            size: region_size - current_offset,
                        });
                    }

                    return Ok(sparse_areas);
                }
                _ => {}
            }
        }

        // In case no relevant capabilities have been found, create a single
        // sparse area corresponding to the entire MMIO region.
        Ok(vec![VfioRegionSparseMmapArea {
            offset: 0,
            size: region_size,
        }])
    }
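    // Worked example for `generate_sparse_areas()` above (illustrative
    // values, assuming 4 KiB pages): for a 0x10000-byte region with the
    // MSI-X table in [0x3000, 0x3800) and no separate PBA range, the
    // page-aligned hole is [0x3000, 0x4000), so two sparse areas are
    // produced: offset 0x0 with size 0x3000, and offset 0x4000 with size
    // 0xc000. Only those two areas get mmap'ed; accesses to the hole keep
    // trapping into the VMM.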
    /// Map MMIO regions into the guest, and avoid VM exits when the guest
    /// tries to reach those regions.
    ///
    /// The regions are registered as user memory regions on the VM object
    /// held by `self`, using memory slots returned by the `memory_slot`
    /// closure the device was created with.
    pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
        let fd = self.device.as_raw_fd();

        for region in self.common.mmio_regions.iter_mut() {
            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }

                // Retrieve the list of capabilities found on the region
                let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
                    self.device.get_region_caps(region.index)
                } else {
                    Vec::new()
                };

                // Don't try to mmap the region if it contains the MSI-X table
                // or the MSI-X PBA subregion, and we couldn't find
                // MSIX_MAPPABLE in the list of supported capabilities.
                if let Some(msix) = self.common.interrupt.msix.as_ref() {
                    if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
                        && !caps.contains(&VfioRegionInfoCap::MsixMappable)
                    {
                        continue;
                    }
                }

                let mmap_size = self.device.get_region_size(region.index);
                let mmap_offset = self.device.get_region_offset(region.index);

                let sparse_areas = Self::generate_sparse_areas(
                    &caps,
                    region.index,
                    region.start.0,
                    mmap_size,
                    self.common.interrupt.msix.as_ref(),
                )?;

                for area in sparse_areas.iter() {
                    // SAFETY: FFI call with correct arguments
                    let host_addr = unsafe {
                        libc::mmap(
                            null_mut(),
                            area.size as usize,
                            prot,
                            libc::MAP_SHARED,
                            fd,
                            mmap_offset as libc::off_t + area.offset as libc::off_t,
                        )
                    };

                    if host_addr == libc::MAP_FAILED {
                        error!(
                            "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
                            area.offset,
                            area.size,
                            std::io::Error::last_os_error()
                        );
                        return Err(VfioPciError::MmapArea);
                    }

                    if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) {
                        warn!(
                            "Could not mmap sparse area that is not page size aligned (offset = 0x{:x}, size = 0x{:x})",
                            area.offset,
                            area.size,
                        );
                        return Ok(());
                    }

                    let user_memory_region = UserMemoryRegion {
                        slot: (self.memory_slot)(),
                        start: region.start.0 + area.offset,
                        size: area.size,
                        host_addr: host_addr as u64,
                    };

                    region.user_memory_regions.push(user_memory_region);

                    let mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(mem_region)
                        .map_err(VfioPciError::CreateUserMemoryRegion)?;

                    if !self.iommu_attached {
                        self.container
                            .vfio_dma_map(
                                user_memory_region.start,
                                user_memory_region.size,
                                user_memory_region.host_addr,
                            )
                            .map_err(VfioPciError::DmaMap)?;
                    }
                }
            }
        }

        Ok(())
    }
    pub fn unmap_mmio_regions(&mut self) {
        for region in self.common.mmio_regions.iter() {
            for user_memory_region in region.user_memory_regions.iter() {
                // Unmap from the vfio container
                if !self.iommu_attached {
                    if let Err(e) = self
                        .container
                        .vfio_dma_unmap(user_memory_region.start, user_memory_region.size)
                    {
                        error!("Could not unmap mmio region from vfio container: {}", e);
                    }
                }

                // Remove the region
                let r = self.vm.make_user_memory_region(
                    user_memory_region.slot,
                    user_memory_region.start,
                    user_memory_region.size,
                    user_memory_region.host_addr,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                // SAFETY: FFI call with correct arguments
                let ret = unsafe {
                    libc::munmap(
                        user_memory_region.host_addr as *mut libc::c_void,
                        user_memory_region.size as usize,
                    )
                };
                if ret != 0 {
                    error!(
                        "Could not unmap region {}, error: {}",
                        region.index,
                        io::Error::last_os_error()
                    );
                }
            }
        }
    }

    pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_map(iova, size, user_addr)
                .map_err(VfioPciError::DmaMap)?;
        }

        Ok(())
    }

    pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_unmap(iova, size)
                .map_err(VfioPciError::DmaUnmap)?;
        }

        Ok(())
    }

    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.common.mmio_regions.clone()
    }
}

impl Drop for VfioPciDevice {
    fn drop(&mut self) {
        self.unmap_mmio_regions();

        if let Some(msix) = &self.common.interrupt.msix {
            if msix.bar.enabled() {
                self.common.disable_msix();
            }
        }

        if let Some(msi) = &self.common.interrupt.msi {
            if msi.cfg.enabled() {
                self.common.disable_msi()
            }
        }

        if self.common.interrupt.intx_in_use() {
            self.common.disable_intx();
        }
    }
}

impl BusDevice for VfioPciDevice {
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}

// First BAR offset in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// Extended capabilities register offset in the PCI config space.
const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
// IO BAR when the first BAR bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory BAR flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// Prefetchable BAR bit.
const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device.
const BAR_NUMS: usize = 6;
// PCI Header Type register index.
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index.
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index.
const PCI_ROM_EXP_BAR_INDEX: usize = 12;
impl PciDevice for VfioPciDevice {
    fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        self.common
            .allocate_bars(allocator, mmio32_allocator, mmio64_allocator, resources)
    }

    fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        self.common
            .free_bars(allocator, mmio32_allocator, mmio64_allocator)
    }

    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        self.common.write_config_register(reg_idx, offset, data)
    }

    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        self.common.read_config_register(reg_idx)
    }

    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.common
            .configuration
            .detect_bar_reprogramming(reg_idx, data)
    }

    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.common.read_bar(base, offset, data)
    }

    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.common.write_bar(base, offset, data)
    }

    fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
        for region in self.common.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                for user_memory_region in region.user_memory_regions.iter_mut() {
                    // Remove old region
                    let old_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .remove_user_memory_region(old_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

                    // Update the user memory region with the correct start address.
                    if new_base > old_base {
                        user_memory_region.start += new_base - old_base;
                    } else {
                        user_memory_region.start -= old_base - new_base;
                    }

                    // Insert new region
                    let new_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(new_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
                }
            }
        }

        Ok(())
    }

    fn as_any(&mut self) -> &mut dyn Any {
        self
    }

    fn id(&self) -> Option<String> {
        Some(self.id.clone())
    }
}

impl Pausable for VfioPciDevice {}

impl Snapshottable for VfioPciDevice {
    fn id(&self) -> String {
        self.id.clone()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_pci_dev_snapshot = Snapshot::default();

        // Snapshot VfioCommon
        vfio_pci_dev_snapshot.add_snapshot(self.common.id(), self.common.snapshot()?);

        Ok(vfio_pci_dev_snapshot)
    }
}

impl Transportable for VfioPciDevice {}
impl Migratable for VfioPciDevice {}

/// This structure implements the ExternalDmaMapping trait. It is meant to
/// be used when the caller tries to provide a way to update the mappings
/// associated with a specific VFIO container.
pub struct VfioDmaMapping<M: GuestAddressSpace> {
    container: Arc<VfioContainer>,
    memory: Arc<M>,
    mmio_regions: Arc<Mutex<Vec<MmioRegion>>>,
}

impl<M: GuestAddressSpace> VfioDmaMapping<M> {
    /// Create a DmaMapping object.
    /// # Parameters
    /// * `container`: VFIO container object.
    /// * `memory`: guest memory to mmap.
    /// * `mmio_regions`: mmio_regions to mmap.
    pub fn new(
        container: Arc<VfioContainer>,
        memory: Arc<M>,
        mmio_regions: Arc<Mutex<Vec<MmioRegion>>>,
    ) -> Self {
        VfioDmaMapping {
            container,
            memory,
            mmio_regions,
        }
    }
}
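// Hypothetical usage sketch (the surrounding setup is assumed, not part of
// this crate): a `VfioDmaMapping` is typically handed out as an
// `Arc<dyn ExternalDmaMapping>` so that an IOMMU implementation can relay
// guest-driven map/unmap requests to the VFIO container, e.g.:
//
//     let mapping = Arc::new(VfioDmaMapping::new(
//         container.clone(),
//         guest_memory.clone(),
//         mmio_regions.clone(),
//     )) as Arc<dyn ExternalDmaMapping>;
//     mapping.map(iova, gpa, size)?;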
impl<M: GuestAddressSpace + Sync + Send> ExternalDmaMapping for VfioDmaMapping<M> {
    fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), io::Error> {
        let mem = self.memory.memory();
        let guest_addr = GuestAddress(gpa);
        let user_addr = if mem.check_range(guest_addr, size as usize) {
            match mem.get_host_address(guest_addr) {
                Ok(t) => t as u64,
                Err(e) => {
                    return Err(io::Error::new(
                        io::ErrorKind::Other,
                        format!("unable to retrieve user address for gpa 0x{gpa:x} from guest memory region: {e}")
                    ));
                }
            }
        } else if self.mmio_regions.lock().unwrap().check_range(gpa, size) {
            self.mmio_regions.lock().unwrap().find_user_address(gpa)?
        } else {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                format!("failed to locate guest address 0x{gpa:x} in guest memory"),
            ));
        };

        self.container
            .vfio_dma_map(iova, size, user_addr)
            .map_err(|e| {
                io::Error::new(
                    io::ErrorKind::Other,
                    format!(
                        "failed to map memory for VFIO container, \
                         iova 0x{iova:x}, gpa 0x{gpa:x}, size 0x{size:x}: {e:?}"
                    ),
                )
            })
    }

    fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), io::Error> {
        self.container.vfio_dma_unmap(iova, size).map_err(|e| {
            io::Error::new(
                io::ErrorKind::Other,
                format!(
                    "failed to unmap memory for VFIO container, \
                     iova 0x{iova:x}, size 0x{size:x}: {e:?}"
                ),
            )
        })
    }
}