// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//

use crate::msi::{MsiConfigState, MSI_CONFIG_ID};
use crate::msix::MsixConfigState;
use crate::{
    msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
    PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
    PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
    PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID,
};
use anyhow::anyhow;
use byteorder::{ByteOrder, LittleEndian};
use hypervisor::HypervisorVmError;
use libc::{sysconf, _SC_PAGESIZE};
use serde::{Deserialize, Serialize};
use std::any::Any;
use std::collections::{BTreeMap, HashMap};
use std::io;
use std::os::unix::io::AsRawFd;
use std::ptr::null_mut;
use std::sync::{Arc, Barrier, Mutex};
use thiserror::Error;
use vfio_bindings::bindings::vfio::*;
use vfio_ioctls::{
    VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
};
use vm_allocator::page_size::{
    align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned,
};
use vm_allocator::{AddressAllocator, SystemAllocator};
use vm_device::dma_mapping::ExternalDmaMapping;
use vm_device::interrupt::{
    InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
};
use vm_device::{BusDevice, Resource};
use vm_memory::{Address, GuestAddress, GuestAddressSpace, GuestMemory, GuestUsize};
use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable};
use vmm_sys_util::eventfd::EventFd;

pub(crate) const VFIO_COMMON_ID: &str = "vfio_common";

#[derive(Debug, Error)]
pub enum VfioPciError {
    #[error("Failed to create user memory region: {0}")]
    CreateUserMemoryRegion(#[source] HypervisorVmError),
    #[error("Failed to DMA map: {0}")]
    DmaMap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to DMA unmap: {0}")]
    DmaUnmap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to enable INTx: {0}")]
    EnableIntx(#[source] VfioError),
    #[error("Failed to enable MSI: {0}")]
    EnableMsi(#[source] VfioError),
    #[error("Failed to enable MSI-X: {0}")]
    EnableMsix(#[source] VfioError),
    #[error("Failed to mmap the area")]
    MmapArea,
    #[error("Missing notifier eventfd")]
    MissingNotifier,
    #[error("Invalid region alignment")]
    RegionAlignment,
    #[error("Invalid region size")]
    RegionSize,
    #[error("Failed to retrieve MsiConfigState: {0}")]
    RetrieveMsiConfigState(#[source] anyhow::Error),
    #[error("Failed to retrieve MsixConfigState: {0}")]
    RetrieveMsixConfigState(#[source] anyhow::Error),
    #[error("Failed to retrieve PciConfigurationState: {0}")]
    RetrievePciConfigurationState(#[source] anyhow::Error),
    #[error("Failed to retrieve VfioCommonState: {0}")]
    RetrieveVfioCommonState(#[source] anyhow::Error),
}

#[derive(Copy, Clone)]
enum PciVfioSubclass {
    VfioSubclass = 0xff,
}

impl PciSubclass for PciVfioSubclass {
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}

enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}

#[derive(Serialize, Deserialize)]
struct IntxState {
    enabled: bool,
}

pub(crate) struct VfioIntx {
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    enabled: bool,
}

#[derive(Serialize, Deserialize)]
struct MsiState {
    cap: MsiCap,
    cap_offset: u32,
}

pub(crate) struct VfioMsi {
    pub(crate) cfg: MsiConfig,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsi {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.cfg.enabled();

        self.cfg.update(offset, data);

        let new_enabled = self.cfg.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsi);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsi);
        }

        None
    }
}

#[derive(Serialize, Deserialize)]
struct MsixState {
    cap: MsixCap,
    cap_offset: u32,
    bdf: u32,
}

pub(crate) struct VfioMsix {
    pub(crate) bar: MsixConfig,
    cap: MsixCap,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsix {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.bar.enabled();

        // Update "Message Control" word
        if offset == 2 && data.len() == 2 {
            self.bar.set_msg_ctl(LittleEndian::read_u16(data));
        }

        let new_enabled = self.bar.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsix);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsix);
        }

        None
    }

    fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        let table_offset: u64 = u64::from(self.cap.table_offset());
        let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
        let table_bir: u32 = self.cap.table_bir();

        bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
    }
}

pub(crate) struct Interrupt {
    pub(crate) intx: Option<VfioIntx>,
    pub(crate) msi: Option<VfioMsi>,
    pub(crate) msix: Option<VfioMsix>,
}

impl Interrupt {
    fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msi) = &mut self.msi {
            return msi.update(offset, data);
        }

        None
    }

    fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msix) = &mut self.msix {
            return msix.update(offset, data);
        }

        None
    }

    fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
        if let Some(msi) = &self.msi {
            if offset >= u64::from(msi.cap_offset)
                && offset < u64::from(msi.cap_offset) + msi.cfg.size()
            {
                return Some((
                    PciCapabilityId::MessageSignalledInterrupts,
                    u64::from(msi.cap_offset),
                ));
            }
        }

        if let Some(msix) = &self.msix {
            if offset == u64::from(msix.cap_offset) {
                return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
            }
        }

        None
    }

    fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        if let Some(msix) = &self.msix {
            return msix.table_accessed(bar_index, offset);
        }

        false
    }

    fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
        if let Some(ref mut msix) = &mut self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.write_table(offset, data)
        }
    }
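
    // The `offset` handled by msix_write_table() above and msix_read_table()
    // below is relative to the BAR holding the table; it is rebased against
    // the capability's table offset before indexing the cached entries. For
    // example, with table_offset = 0x2000, a guest access at BAR offset
    // 0x2010 lands on table offset 0x10, i.e. the first dword of entry 1
    // (entries are MSIX_TABLE_ENTRY_SIZE bytes, 16 per the PCI spec).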
    fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
        if let Some(msix) = &self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.read_table(offset, data)
        }
    }

    pub(crate) fn intx_in_use(&self) -> bool {
        if let Some(intx) = &self.intx {
            return intx.enabled;
        }

        false
    }
}

#[derive(Copy, Clone)]
pub struct UserMemoryRegion {
    pub slot: u32,
    pub start: u64,
    pub size: u64,
    pub host_addr: u64,
}

#[derive(Clone)]
pub struct MmioRegion {
    pub start: GuestAddress,
    pub length: GuestUsize,
    pub(crate) type_: PciBarRegionType,
    pub(crate) index: u32,
    pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
}

trait MmioRegionRange {
    fn check_range(&self, guest_addr: u64, size: u64) -> bool;
    fn find_user_address(&self, guest_addr: u64) -> Result<u64, io::Error>;
}

impl MmioRegionRange for Vec<MmioRegion> {
    // Check if a guest address range is fully contained within one of the MMIO regions
    fn check_range(&self, guest_addr: u64, size: u64) -> bool {
        for region in self.iter() {
            let Some(guest_addr_end) = guest_addr.checked_add(size) else {
                return false;
            };
            let Some(region_end) = region.start.raw_value().checked_add(region.length) else {
                return false;
            };
            if guest_addr >= region.start.raw_value() && guest_addr_end <= region_end {
                return true;
            }
        }
        false
    }

    // Locate the user (host) address for a guest address within all MMIO regions
    fn find_user_address(&self, guest_addr: u64) -> Result<u64, io::Error> {
        for region in self.iter() {
            for user_region in region.user_memory_regions.iter() {
                if guest_addr >= user_region.start
                    && guest_addr < user_region.start + user_region.size
                {
                    return Ok(user_region.host_addr + (guest_addr - user_region.start));
                }
            }
        }

        Err(io::Error::new(
            io::ErrorKind::Other,
            format!("unable to find user address: 0x{guest_addr:x}"),
        ))
    }
}

#[derive(Debug, Error)]
pub enum VfioError {
    #[error("Kernel VFIO error: {0}")]
    KernelVfio(#[source] vfio_ioctls::VfioError),
    #[error("VFIO user error: {0}")]
    VfioUser(#[source] vfio_user::Error),
}

pub(crate) trait Vfio: Send + Sync {
    fn read_config_byte(&self, offset: u32) -> u8 {
        let mut data: [u8; 1] = [0];
        self.read_config(offset, &mut data);
        data[0]
    }

    fn read_config_word(&self, offset: u32) -> u16 {
        let mut data: [u8; 2] = [0, 0];
        self.read_config(offset, &mut data);
        u16::from_le_bytes(data)
    }

    fn read_config_dword(&self, offset: u32) -> u32 {
        let mut data: [u8; 4] = [0, 0, 0, 0];
        self.read_config(offset, &mut data);
        u32::from_le_bytes(data)
    }

    fn write_config_dword(&self, offset: u32, buf: u32) {
        let data: [u8; 4] = buf.to_le_bytes();
        self.write_config(offset, &data)
    }

    fn read_config(&self, offset: u32, data: &mut [u8]) {
        self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
    }

    fn write_config(&self, offset: u32, data: &[u8]) {
        self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
    }

    fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
    }

    fn disable_msi(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
    }

    fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
    }

    fn disable_msix(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
    }

    fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
        unimplemented!()
    }

    fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
        unimplemented!()
    }

    fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
        unimplemented!()
    }

    fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }
}

struct VfioDeviceWrapper {
    device: Arc<VfioDevice>,
}

impl VfioDeviceWrapper {
    fn new(device: Arc<VfioDevice>) -> Self {
        Self { device }
    }
}

impl Vfio for VfioDeviceWrapper {
    fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
        self.device.region_read(index, data, offset)
    }

    fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
        self.device.region_write(index, data, offset)
    }

    fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
        self.device.get_irq_info(irq_index).copied()
    }

    fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.device
            .enable_irq(irq_index, event_fds)
            .map_err(VfioError::KernelVfio)
    }

    fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .disable_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }

    fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .unmask_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }
}

#[derive(Serialize, Deserialize)]
struct VfioCommonState {
    intx_state: Option<IntxState>,
    msi_state: Option<MsiState>,
    msix_state: Option<MsixState>,
}

pub(crate) struct ConfigPatch {
    mask: u32,
    patch: u32,
}

pub(crate) struct VfioCommon {
    pub(crate) configuration: PciConfiguration,
    pub(crate) mmio_regions: Vec<MmioRegion>,
    pub(crate) interrupt: Interrupt,
    pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
    pub(crate) vfio_wrapper: Arc<dyn Vfio>,
    pub(crate) patches: HashMap<usize, ConfigPatch>,
    x_nv_gpudirect_clique: Option<u8>,
}

impl VfioCommon {
    pub(crate) fn new(
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        vfio_wrapper: Arc<dyn Vfio>,
        subclass: &dyn PciSubclass,
        bdf: PciBdf,
        snapshot: Option<Snapshot>,
        x_nv_gpudirect_clique: Option<u8>,
    ) -> Result<Self, VfioPciError> {
        let pci_configuration_state =
            vm_migration::state_from_id(snapshot.as_ref(), PCI_CONFIGURATION_ID).map_err(|e| {
                VfioPciError::RetrievePciConfigurationState(anyhow!(
                    "Failed to get PciConfigurationState from Snapshot: {}",
                    e
                ))
            })?;

        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            subclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
            pci_configuration_state,
        );

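        // Build the common state first; any snapshot state retrieved below
        // is applied through set_state(), otherwise the capabilities are
        // parsed fresh from the device.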
        let mut vfio_common = VfioCommon {
            mmio_regions: Vec::new(),
            configuration,
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
            msi_interrupt_manager,
            legacy_interrupt_group,
            vfio_wrapper,
            patches: HashMap::new(),
            x_nv_gpudirect_clique,
        };

        let state: Option<VfioCommonState> = snapshot
            .as_ref()
            .map(|s| s.to_state())
            .transpose()
            .map_err(|e| {
                VfioPciError::RetrieveVfioCommonState(anyhow!(
                    "Failed to get VfioCommonState from Snapshot: {}",
                    e
                ))
            })?;
        let msi_state =
            vm_migration::state_from_id(snapshot.as_ref(), MSI_CONFIG_ID).map_err(|e| {
                VfioPciError::RetrieveMsiConfigState(anyhow!(
                    "Failed to get MsiConfigState from Snapshot: {}",
                    e
                ))
            })?;
        let msix_state =
            vm_migration::state_from_id(snapshot.as_ref(), MSIX_CONFIG_ID).map_err(|e| {
                VfioPciError::RetrieveMsixConfigState(anyhow!(
                    "Failed to get MsixConfigState from Snapshot: {}",
                    e
                ))
            })?;

        if let Some(state) = state.as_ref() {
            vfio_common.set_state(state, msi_state, msix_state)?;
        } else {
            vfio_common.parse_capabilities(bdf);
            vfio_common.initialize_legacy_interrupt()?;
        }

        Ok(vfio_common)
    }

    /// If the MSI-X table offset is not page size aligned, fix it up so that
    /// the mmap'able MMIO region and the trapped region don't overlap.
    fn fixup_msix_region(&mut self, bar_id: u32, region_size: u64) -> u64 {
        if let Some(msix) = self.interrupt.msix.as_mut() {
            let msix_cap = &mut msix.cap;

            // This assumes table_bir equals pba_bir, i.e. the table and the
            // PBA live in the same BAR.
            let (table_offset, table_size) = msix_cap.table_range();
            if is_page_size_aligned(table_offset) || msix_cap.table_bir() != bar_id {
                return region_size;
            }

            let (pba_offset, pba_size) = msix_cap.pba_range();
            let msix_sz = align_page_size_up(table_size + pba_size);
            // Expand the region so it can hold both the RW and the trap
            // region, each page size aligned
            let size = std::cmp::max(region_size * 2, msix_sz * 2);
            // Let the table start from the middle of the region
            msix_cap.table_set_offset((size / 2) as u32);
            msix_cap.pba_set_offset((size / 2 + pba_offset - table_offset) as u32);

            size
        } else {
            // MSI-X is not supported by this device
            region_size
        }
    }

    // The `allocator` argument is unused on `aarch64`
    #[allow(unused_variables)]
    pub(crate) fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        let mut bars = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX;

        // Go through all regular regions to compute the BAR size.
        // We're not saving the BAR address to restore it, because we
        // are going to allocate a guest address for each BAR and write
        // that new address back.
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let mut region_size: u64 = 0;
            let mut region_type = PciBarRegionType::Memory32BitRegion;
            let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
            let mut flags: u32 = 0;

            let mut restored_bar_addr = None;
            if let Some(resources) = &resources {
                for resource in resources {
                    if let Resource::PciBar {
                        index,
                        base,
                        size,
                        type_,
                        ..
                    } = resource
                    {
                        if *index == bar_id as usize {
                            restored_bar_addr = Some(GuestAddress(*base));
                            region_size = *size;
                            region_type = PciBarRegionType::from(*type_);
                            break;
                        }
                    }
                }
                if restored_bar_addr.is_none() {
                    bar_id += 1;
                    continue;
                }
            } else {
                let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                    (PCI_ROM_EXP_BAR_INDEX * 4) as u32
                } else {
                    PCI_CONFIG_BAR_OFFSET + bar_id * 4
                };

                // First read the flags
                flags = self.vfio_wrapper.read_config_dword(bar_offset);

                // Is this an IO BAR?
                let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
                } else {
                    false
                };

                // Is this a 64-bit BAR?
                let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(
                        flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                        PCI_CONFIG_MEMORY_BAR_64BIT
                    )
                } else {
                    false
                };

                if matches!(
                    flags & PCI_CONFIG_BAR_PREFETCHABLE,
                    PCI_CONFIG_BAR_PREFETCHABLE
                ) {
                    prefetchable = PciBarPrefetchable::Prefetchable
                };

                // To get the size, write all 1s
                self.vfio_wrapper
                    .write_config_dword(bar_offset, 0xffff_ffff);

                // And read the BAR value back. The device will write zeros for bits it doesn't care about
                let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);

                if io_bar {
                    // Mask flag bits (lowest 2 for I/O BARs)
                    lower &= !0b11;

                    // BAR is not enabled
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // IO BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Invert bits and add 1 to calculate size
                    region_size = (!lower + 1) as u64;
                } else if is_64bit_bar {
                    // 64-bit Memory BAR
                    region_type = PciBarRegionType::Memory64BitRegion;

                    // Query size of upper BAR of 64-bit BAR
                    let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                    self.vfio_wrapper
                        .write_config_dword(upper_offset, 0xffff_ffff);
                    let upper = self.vfio_wrapper.read_config_dword(upper_offset);

                    let mut combined_size = u64::from(upper) << 32 | u64::from(lower);

                    // Mask out flag bits (lowest 4 for memory BARs)
                    combined_size &= !0b1111;

                    // BAR is not enabled
                    if combined_size == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find size
                    region_size = !combined_size + 1;
                } else {
                    region_type = PciBarRegionType::Memory32BitRegion;

                    // Mask out flag bits (lowest 4 for memory BARs)
                    lower &= !0b1111;

                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find size
                    region_size = (!lower + 1) as u64;
                }
            }

            let bar_addr = match region_type {
                PciBarRegionType::IoRegion => {
                    #[cfg(not(target_arch = "x86_64"))]
                    unimplemented!();

                    // The address needs to be 4 bytes aligned.
                    #[cfg(target_arch = "x86_64")]
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory32BitRegion => {
                    // BAR allocation must be naturally aligned
                    mmio32_allocator
                        .allocate(restored_bar_addr, region_size, Some(region_size))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory64BitRegion => {
                    // We may need to fix up the region size to keep the MMIO
                    // RW region and the MSI-X trap region page size aligned.
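                    // Worked example with hypothetical numbers: for a
                    // 0x1000-byte BAR whose MSI-X table starts at offset
                    // 0x800 (not page aligned), fixup_msix_region() doubles
                    // the region to 0x2000 and moves the table to offset
                    // 0x1000, so the lower half can be mmap'd while the
                    // upper half stays trapped.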
                    region_size = self.fixup_msix_region(bar_id, region_size);
                    mmio64_allocator
                        .allocate(
                            restored_bar_addr,
                            region_size,
                            Some(std::cmp::max(
                                // SAFETY: FFI call. Trivially safe.
                                unsafe { sysconf(_SC_PAGESIZE) as GuestUsize },
                                region_size,
                            )),
                        )
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
            };

            // We can now build our BAR configuration block.
            let bar = PciBarConfiguration::default()
                .set_index(bar_id as usize)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type)
                .set_prefetchable(prefetchable);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                self.configuration
                    .add_pci_rom_bar(&bar, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&bar)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            bars.push(bar);
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id,
                user_memory_regions: Vec::new(),
            });

            bar_id += 1;
            if region_type == PciBarRegionType::Memory64BitRegion {
                bar_id += 1;
            }
        }

        Ok(bars)
    }

    // The `allocator` argument is unused on `aarch64`
    #[allow(unused_variables)]
    pub(crate) fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        for region in self.mmio_regions.iter() {
            match region.type_ {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "x86_64")]
                    allocator.free_io_addresses(region.start, region.length);
                    #[cfg(not(target_arch = "x86_64"))]
                    error!("I/O region is not supported");
                }
                PciBarRegionType::Memory32BitRegion => {
                    mmio32_allocator.free(region.start, region.length);
                }
                PciBarRegionType::Memory64BitRegion => {
                    mmio64_allocator.free(region.start, region.length);
                }
            }
        }
        Ok(())
    }

    pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
        let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());

        let table = self.vfio_wrapper.read_config_dword((cap + 4).into());

        let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());

        MsixCap {
            msg_ctl,
            table,
            pba,
        }
    }

    pub(crate) fn initialize_msix(
        &mut self,
        msix_cap: MsixCap,
        cap_offset: u32,
        bdf: PciBdf,
        state: Option<MsixConfigState>,
    ) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msix_cap.table_size() as InterruptIndex,
            })
            .unwrap();

        let msix_config = MsixConfig::new(
            msix_cap.table_size(),
            interrupt_source_group.clone(),
            bdf.into(),
            state,
        )
        .unwrap();

        self.interrupt.msix = Some(VfioMsix {
            bar: msix_config,
            cap: msix_cap,
            cap_offset,
            interrupt_source_group,
        });
    }

    pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
        self.vfio_wrapper.read_config_word((cap + 2).into())
    }

    pub(crate) fn initialize_msi(
        &mut self,
        msg_ctl: u16,
        cap_offset: u32,
        state: Option<MsiConfigState>,
    ) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
            })
            .unwrap();

        let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap();

        self.interrupt.msi = Some(VfioMsi {
            cfg: msi_config,
            cap_offset,
            interrupt_source_group,
        });
    }

    pub(crate) fn get_msix_cap_idx(&self) -> Option<usize> {
        let mut cap_next = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        while cap_next != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());
            if PciCapabilityId::from(cap_id) == PciCapabilityId::MsiX {
                return Some(cap_next as usize);
            } else {
                cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
            }
        }

        None
    }

    pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) {
        let mut cap_iter = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        let mut pci_express_cap_found = false;
        let mut power_management_cap_found = false;

        while cap_iter != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_iter.into());

            match PciCapabilityId::from(cap_id) {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse the capability only if the VFIO device
                            // supports MSI.
                            let msg_ctl = self.parse_msi_capabilities(cap_iter);
                            self.initialize_msi(msg_ctl, cap_iter as u32, None);
                        }
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
                    {
                        if irq_info.count > 0 {
                            // Parse the capability only if the VFIO device
                            // supports MSI-X.
                            let msix_cap = self.parse_msix_capabilities(cap_iter);
                            self.initialize_msix(msix_cap, cap_iter as u32, bdf, None);
                        }
                    }
                }
                PciCapabilityId::PciExpress => pci_express_cap_found = true,
                PciCapabilityId::PowerManagement => power_management_cap_found = true,
                _ => {}
            };

            let cap_next = self.vfio_wrapper.read_config_byte((cap_iter + 1).into());
            if cap_next == 0 {
                break;
            }

            cap_iter = cap_next;
        }

        if let Some(clique_id) = self.x_nv_gpudirect_clique {
            self.add_nv_gpudirect_clique_cap(cap_iter, clique_id);
        }

        if pci_express_cap_found && power_management_cap_found {
            self.parse_extended_capabilities();
        }
    }

    fn add_nv_gpudirect_clique_cap(&mut self, cap_iter: u8, clique_id: u8) {
        // Turing, Ampere, Hopper, and Lovelace GPUs have dedicated space
        // at 0xD4 for this capability.
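        // The capability is spliced into the chain via config-space patches
        // (applied in read_config_register()): the first patch rewrites the
        // next-capability pointer of the last capability found above so it
        // points at 0xD4, and the two following patches fabricate the
        // vendor-specific capability header and the clique ID there.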
        let cap_offset = 0xd4u32;

        let reg_idx = (cap_iter / 4) as usize;
        self.patches.insert(
            reg_idx,
            ConfigPatch {
                mask: 0x0000_ff00,
                patch: cap_offset << 8,
            },
        );

        let reg_idx = (cap_offset / 4) as usize;
        self.patches.insert(
            reg_idx,
            ConfigPatch {
                mask: 0xffff_ffff,
                patch: 0x50080009u32,
            },
        );
        self.patches.insert(
            reg_idx + 1,
            ConfigPatch {
                mask: 0xffff_ffff,
                patch: u32::from(clique_id) << 19 | 0x5032,
            },
        );
    }

    fn parse_extended_capabilities(&mut self) {
        let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;

        loop {
            let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);

            let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
            let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;

            match PciExpressCapabilityId::from(cap_id) {
                PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation
                | PciExpressCapabilityId::ResizeableBar
                | PciExpressCapabilityId::SingleRootIoVirtualization => {
                    let reg_idx = (current_offset / 4) as usize;
                    self.patches.insert(
                        reg_idx,
                        ConfigPatch {
                            mask: 0x0000_ffff,
                            patch: PciExpressCapabilityId::NullCapability as u32,
                        },
                    );
                }
                _ => {}
            }

            if cap_next == 0 {
                break;
            }

            current_offset = cap_next.into();
        }
    }

    pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
        if let Some(intx) = &mut self.interrupt.intx {
            if !intx.enabled {
                if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
                    self.vfio_wrapper
                        .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
                        .map_err(VfioPciError::EnableIntx)?;

                    intx.enabled = true;
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }
        }

        Ok(())
    }

    pub(crate) fn disable_intx(&mut self) {
        if let Some(intx) = &mut self.interrupt.intx {
            if intx.enabled {
                if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                    error!("Could not disable INTx: {}", e);
                } else {
                    intx.enabled = false;
                }
            }
        }
    }

    pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
        if let Some(msi) = &self.interrupt.msi {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msi.cfg.num_enabled_vectors() {
                if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msi(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsi)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msi(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msi() {
            error!("Could not disable MSI: {}", e);
        }
    }

    pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
        if let Some(msix) = &self.interrupt.msix {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msix.bar.table_entries.len() {
                if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msix(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsix)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msix(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msix() {
            error!("Could not disable MSI-X: {}", e);
        }
    }

    pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
        if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
            if irq_info.count == 0 {
                // A count of 0 means the INTx IRQ is not supported, therefore
                // it shouldn't be initialized.
                return Ok(());
            }
        }

        if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            self.enable_intx()?;
        }

        Ok(())
    }

    pub(crate) fn update_msi_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msi(offset, data) {
            Some(InterruptUpdateAction::EnableMsi) => {
                // Disable INTx before we can enable MSI
                self.disable_intx();
                self.enable_msi()?;
            }
            Some(InterruptUpdateAction::DisableMsi) => {
                // Fall back onto INTx when disabling MSI
                self.disable_msi();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn update_msix_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msix(offset, data) {
            Some(InterruptUpdateAction::EnableMsix) => {
                // Disable INTx before we can enable MSI-X
                self.disable_intx();
                self.enable_msix()?;
            }
            Some(InterruptUpdateAction::DisableMsix) => {
                // Fall back onto INTx when disabling MSI-X
                self.disable_msix();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
        for region in self.mmio_regions.iter() {
            if addr >= region.start.raw_value()
                && addr < region.start.unchecked_add(region.length).raw_value()
            {
                return Some(region.clone());
            }
        }
        None
    }

    pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_read_table(offset, data);
            } else {
                self.vfio_wrapper.region_read(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest reading from the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }
    }

    pub(crate) fn write_bar(
        &mut self,
        base: u64,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            // If the MSI-X table is written to, we need to update our cache.
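            // Table writes are emulated against the local cache rather than
            // forwarded to the device region: VFIO delivers the vectors
            // through the irqfds registered in enable_msix(), so the guest's
            // programmed entries only need to live in the cached table.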
            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_write_table(offset, data);
            } else {
                self.vfio_wrapper.region_write(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest writing to the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }

        None
    }

    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming the
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through the VFIO_DEVICE_SET_IRQS ioctl, and only then can we
        // write to the device region to update the MSI Enable bit.
        self.vfio_wrapper.write_config((reg + offset) as u32, data);

        None
    }

    pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        // When reading the BARs, we trap the access and return what comes
        // from our local configuration space. We want the guest to use
        // that and not the VFIO device BARs, as those do not map into
        // the guest address space.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            return self.configuration.read_reg(reg_idx);
        }

        if let Some(id) = self.get_msix_cap_idx() {
            let msix = self.interrupt.msix.as_mut().unwrap();
            if reg_idx * 4 == id + 4 {
                return msix.cap.table;
            } else if reg_idx * 4 == id + 8 {
                return msix.cap.pba;
            }
        }

        // Since we don't support passing multi-function devices, we should
        // mask the multi-function bit, bit 7 of the Header Type byte of
        // register 3.
        let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
            0xff7f_ffff
        } else {
            0xffff_ffff
        };

        // The config register read comes from the VFIO device itself.
        let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;

        if let Some(config_patch) = self.patches.get(&reg_idx) {
            value = (value & !config_patch.mask) | config_patch.patch;
        }

        value
    }

    fn state(&self) -> VfioCommonState {
        let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
            enabled: intx.enabled,
        });

        let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
            cap: msi.cfg.cap,
            cap_offset: msi.cap_offset,
        });

        let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
            cap: msix.cap,
            cap_offset: msix.cap_offset,
            bdf: msix.bar.devid,
        });

        VfioCommonState {
            intx_state,
            msi_state,
            msix_state,
        }
    }

    fn set_state(
        &mut self,
        state: &VfioCommonState,
        msi_state: Option<MsiConfigState>,
        msix_state: Option<MsixConfigState>,
    ) -> Result<(), VfioPciError> {
        if let (Some(intx), Some(interrupt_source_group)) =
            (&state.intx_state, self.legacy_interrupt_group.clone())
        {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            if intx.enabled {
                self.enable_intx()?;
            }
        }

        if let Some(msi) = &state.msi_state {
            self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state);
        }

        if let Some(msix) = &state.msix_state {
            self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state);
        }

        Ok(())
    }
}

impl Pausable for VfioCommon {}

impl Snapshottable for VfioCommon {
    fn id(&self) -> String {
        String::from(VFIO_COMMON_ID)
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_common_snapshot = Snapshot::new_from_state(&self.state())?;

        // Snapshot PciConfiguration
        vfio_common_snapshot.add_snapshot(self.configuration.id(), self.configuration.snapshot()?);

        // Snapshot MSI
        if let Some(msi) = &mut self.interrupt.msi {
            vfio_common_snapshot.add_snapshot(msi.cfg.id(), msi.cfg.snapshot()?);
        }

        // Snapshot MSI-X
        if let Some(msix) = &mut self.interrupt.msix {
            vfio_common_snapshot.add_snapshot(msix.bar.id(), msix.bar.snapshot()?);
        }

        Ok(vfio_common_snapshot)
    }
}

/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
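///
/// A typical lifecycle, as a sketch (the steps around this crate are
/// illustrative): the VMM creates the `VfioDevice`, wraps it with
/// `VfioPciDevice::new()`, calls `allocate_bars()` to assign guest addresses,
/// calls `map_mmio_regions()` to mmap the mappable BARs into the guest, and
/// finally registers the device on the PCI bus as a `BusDevice`.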
pub struct VfioPciDevice {
    id: String,
    vm: Arc<dyn hypervisor::Vm>,
    device: Arc<VfioDevice>,
    container: Arc<VfioContainer>,
    common: VfioCommon,
    iommu_attached: bool,
    memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
}

impl VfioPciDevice {
    /// Constructs a new VfioPciDevice for the given VfioDevice
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        id: String,
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        iommu_attached: bool,
        bdf: PciBdf,
        memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
        snapshot: Option<Snapshot>,
        x_nv_gpudirect_clique: Option<u8>,
    ) -> Result<Self, VfioPciError> {
        let device = Arc::new(device);
        device.reset();

        let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));

        let common = VfioCommon::new(
            msi_interrupt_manager,
            legacy_interrupt_group,
            Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
            &PciVfioSubclass::VfioSubclass,
            bdf,
            vm_migration::snapshot_from_id(snapshot.as_ref(), VFIO_COMMON_ID),
            x_nv_gpudirect_clique,
        )?;

        let vfio_pci_device = VfioPciDevice {
            id,
            vm: vm.clone(),
            device,
            container,
            common,
            iommu_attached,
            memory_slot,
        };

        Ok(vfio_pci_device)
    }

    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }

    fn generate_sparse_areas(
        caps: &[VfioRegionInfoCap],
        region_index: u32,
        region_start: u64,
        region_size: u64,
        vfio_msix: Option<&VfioMsix>,
    ) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
        for cap in caps {
            match cap {
                VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
                VfioRegionInfoCap::MsixMappable => {
                    if !is_4k_aligned(region_start) {
                        error!(
                            "Region start address 0x{:x} must be at least aligned on 4KiB",
                            region_start
                        );
                        return Err(VfioPciError::RegionAlignment);
                    }
                    if !is_4k_multiple(region_size) {
                        error!(
                            "Region size 0x{:x} must be at least a multiple of 4KiB",
                            region_size
                        );
                        return Err(VfioPciError::RegionSize);
                    }

                    // In case the region contains the MSI-X vectors table or
                    // the MSI-X PBA table, we must calculate the subregions
                    // around them, leading to a list of sparse areas.
                    // We want to make sure we will still trap MMIO accesses
                    // to these MSI-X specific ranges. If these ranges don't
                    // align with the page size, we enlarge them until they do.
                    //
                    // Using a BTreeMap so that iteration is sorted by key,
                    // which ensures a proper split of the whole region.
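                    // Worked example with hypothetical numbers: for a 0x4000
                    // byte region with 4KiB pages and the MSI-X table at
                    // [0x1000, 0x1800), the trap range rounds out to
                    // [0x1000, 0x2000), producing sparse mmap areas
                    // [0, 0x1000) and [0x2000, 0x4000).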
                    let mut inter_ranges = BTreeMap::new();
                    if let Some(msix) = vfio_msix {
                        if region_index == msix.cap.table_bir() {
                            let (offset, size) = msix.cap.table_range();
                            let offset = align_page_size_down(offset);
                            let size = align_page_size_up(size);
                            inter_ranges.insert(offset, size);
                        }
                        if region_index == msix.cap.pba_bir() {
                            let (offset, size) = msix.cap.pba_range();
                            let offset = align_page_size_down(offset);
                            let size = align_page_size_up(size);
                            inter_ranges.insert(offset, size);
                        }
                    }

                    let mut sparse_areas = Vec::new();
                    let mut current_offset = 0;
                    for (range_offset, range_size) in inter_ranges {
                        if range_offset > current_offset {
                            sparse_areas.push(VfioRegionSparseMmapArea {
                                offset: current_offset,
                                size: range_offset - current_offset,
                            });
                        }
                        current_offset = align_page_size_down(range_offset + range_size);
                    }

                    if region_size > current_offset {
                        sparse_areas.push(VfioRegionSparseMmapArea {
                            offset: current_offset,
                            size: region_size - current_offset,
                        });
                    }

                    return Ok(sparse_areas);
                }
                _ => {}
            }
        }

        // In case no relevant capabilities have been found, create a single
        // sparse area corresponding to the entire MMIO region.
        Ok(vec![VfioRegionSparseMmapArea {
            offset: 0,
            size: region_size,
        }])
    }

    /// Map MMIO regions into the guest, and avoid VM exits when the guest
    /// tries to reach those regions.
    pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
        let fd = self.device.as_raw_fd();

        for region in self.common.mmio_regions.iter_mut() {
            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }

                // Retrieve the list of capabilities found on the region
                let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
                    self.device.get_region_caps(region.index)
                } else {
                    Vec::new()
                };

                // Don't try to mmap the region if it contains the MSI-X table
                // or the MSI-X PBA subregion, and we couldn't find
                // MSIX_MAPPABLE in the list of supported capabilities.
                if let Some(msix) = self.common.interrupt.msix.as_ref() {
                    if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
                        && !caps.contains(&VfioRegionInfoCap::MsixMappable)
                    {
                        continue;
                    }
                }

                let mmap_size = self.device.get_region_size(region.index);
                let mmap_offset = self.device.get_region_offset(region.index);

                let sparse_areas = Self::generate_sparse_areas(
                    &caps,
                    region.index,
                    region.start.0,
                    mmap_size,
                    self.common.interrupt.msix.as_ref(),
                )?;

                for area in sparse_areas.iter() {
                    // SAFETY: FFI call with correct arguments
                    let host_addr = unsafe {
                        libc::mmap(
                            null_mut(),
                            area.size as usize,
                            prot,
                            libc::MAP_SHARED,
                            fd,
                            mmap_offset as libc::off_t + area.offset as libc::off_t,
                        )
                    };

                    if host_addr == libc::MAP_FAILED {
                        error!(
                            "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
                            area.offset,
                            area.size,
                            std::io::Error::last_os_error()
                        );
                        return Err(VfioPciError::MmapArea);
                    }

                    if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) {
                        warn!(
                            "Could not mmap sparse area that is not page size aligned (offset = 0x{:x}, size = 0x{:x})",
                            area.offset,
                            area.size,
                        );
                        return Ok(());
                    }

                    let user_memory_region = UserMemoryRegion {
                        slot: (self.memory_slot)(),
                        start: region.start.0 + area.offset,
                        size: area.size,
                        host_addr: host_addr as u64,
                    };

                    region.user_memory_regions.push(user_memory_region);

                    let mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(mem_region)
                        .map_err(VfioPciError::CreateUserMemoryRegion)?;

                    if !self.iommu_attached {
                        self.container
                            .vfio_dma_map(
                                user_memory_region.start,
                                user_memory_region.size,
                                user_memory_region.host_addr,
                            )
                            .map_err(VfioPciError::DmaMap)?;
                    }
                }
            }
        }

        Ok(())
    }

    pub fn unmap_mmio_regions(&mut self) {
        for region in self.common.mmio_regions.iter() {
            for user_memory_region in region.user_memory_regions.iter() {
                // Unmap from the vfio container
                if !self.iommu_attached {
                    if let Err(e) = self
                        .container
                        .vfio_dma_unmap(user_memory_region.start, user_memory_region.size)
                    {
                        error!("Could not unmap mmio region from vfio container: {}", e);
                    }
                }

                // Remove the region
                let r = self.vm.make_user_memory_region(
                    user_memory_region.slot,
                    user_memory_region.start,
                    user_memory_region.size,
                    user_memory_region.host_addr,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                // SAFETY: FFI call with correct arguments
                let ret = unsafe {
                    libc::munmap(
                        user_memory_region.host_addr as *mut libc::c_void,
                        user_memory_region.size as usize,
                    )
                };
                if ret != 0 {
                    error!(
                        "Could not unmap region {}, error: {}",
                        region.index,
                        io::Error::last_os_error()
                    );
                }
            }
        }
    }

    pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_map(iova, size, user_addr)
                .map_err(VfioPciError::DmaMap)?;
        }

        Ok(())
    }

    pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_unmap(iova, size)
                .map_err(VfioPciError::DmaUnmap)?;
        }

        Ok(())
    }

    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.common.mmio_regions.clone()
    }
}

impl Drop for VfioPciDevice {
    fn drop(&mut self) {
        self.unmap_mmio_regions();

        if let Some(msix) = &self.common.interrupt.msix {
            if msix.bar.enabled() {
                self.common.disable_msix();
            }
        }

        if let Some(msi) = &self.common.interrupt.msi {
            if msi.cfg.enabled() {
                self.common.disable_msi()
            }
        }

        if self.common.interrupt.intx_in_use() {
            self.common.disable_intx();
        }
    }
}

impl BusDevice for VfioPciDevice {
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}

// First BAR offset in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// Extended capabilities register offset in the PCI config space.
const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
// IO BAR when first BAR bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory BAR flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// Prefetchable BAR bit.
const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device.
const BAR_NUMS: usize = 6;
// PCI Header Type register index.
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index.
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index.
const PCI_ROM_EXP_BAR_INDEX: usize = 12;

impl PciDevice for VfioPciDevice {
    fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        self.common
            .allocate_bars(allocator, mmio32_allocator, mmio64_allocator, resources)
    }

    fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        self.common
            .free_bars(allocator, mmio32_allocator, mmio64_allocator)
    }

    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        self.common.write_config_register(reg_idx, offset, data)
    }

    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        self.common.read_config_register(reg_idx)
    }

    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.common
            .configuration
            .detect_bar_reprogramming(reg_idx, data)
    }

    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.common.read_bar(base, offset, data)
    }

    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.common.write_bar(base, offset, data)
    }

    fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
        for region in self.common.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                for user_memory_region in region.user_memory_regions.iter_mut() {
                    // Remove the old region
                    let old_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .remove_user_memory_region(old_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

                    // Update the user memory region with the correct start address.
                    if new_base > old_base {
                        user_memory_region.start += new_base - old_base;
                    } else {
                        user_memory_region.start -= old_base - new_base;
                    }

                    // Insert the new region
                    let new_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(new_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
                }
            }
        }

        Ok(())
    }

    fn as_any(&mut self) -> &mut dyn Any {
        self
    }

    fn id(&self) -> Option<String> {
        Some(self.id.clone())
    }
}

impl Pausable for VfioPciDevice {}

impl Snapshottable for VfioPciDevice {
    fn id(&self) -> String {
        self.id.clone()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_pci_dev_snapshot = Snapshot::default();

        // Snapshot VfioCommon
        vfio_pci_dev_snapshot.add_snapshot(self.common.id(), self.common.snapshot()?);

        Ok(vfio_pci_dev_snapshot)
    }
}

impl Transportable for VfioPciDevice {}
impl Migratable for VfioPciDevice {}

/// This structure implements the ExternalDmaMapping trait. It is meant to
/// be used when the caller wants a way to update the mappings associated
/// with a specific VFIO container.
pub struct VfioDmaMapping<M: GuestAddressSpace> {
    container: Arc<VfioContainer>,
    memory: Arc<M>,
    mmio_regions: Arc<Mutex<Vec<MmioRegion>>>,
}

impl<M: GuestAddressSpace> VfioDmaMapping<M> {
    /// Create a DmaMapping object.
    ///
    /// # Parameters
    /// * `container`: VFIO container object.
    /// * `memory`: guest memory to map.
    /// * `mmio_regions`: MMIO regions to map.
    pub fn new(
        container: Arc<VfioContainer>,
        memory: Arc<M>,
        mmio_regions: Arc<Mutex<Vec<MmioRegion>>>,
    ) -> Self {
        VfioDmaMapping {
            container,
            memory,
            mmio_regions,
        }
    }
}

impl<M: GuestAddressSpace + Sync + Send> ExternalDmaMapping for VfioDmaMapping<M> {
    fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), io::Error> {
        let mem = self.memory.memory();
        let guest_addr = GuestAddress(gpa);
        let user_addr = if mem.check_range(guest_addr, size as usize) {
            match mem.get_host_address(guest_addr) {
                Ok(t) => t as u64,
                Err(e) => {
                    return Err(io::Error::new(
                        io::ErrorKind::Other,
                        format!("unable to retrieve user address for gpa 0x{gpa:x} from guest memory region: {e}")
                    ));
                }
            }
        } else if self.mmio_regions.lock().unwrap().check_range(gpa, size) {
            self.mmio_regions.lock().unwrap().find_user_address(gpa)?
        } else {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                format!("failed to locate guest address 0x{gpa:x} in guest memory"),
            ));
        };

        self.container
            .vfio_dma_map(iova, size, user_addr)
            .map_err(|e| {
                io::Error::new(
                    io::ErrorKind::Other,
                    format!(
                        "failed to map memory for VFIO container, \
                        iova 0x{iova:x}, gpa 0x{gpa:x}, size 0x{size:x}: {e:?}"
                    ),
                )
            })
    }

    fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), io::Error> {
        self.container.vfio_dma_unmap(iova, size).map_err(|e| {
            io::Error::new(
                io::ErrorKind::Other,
                format!(
                    "failed to unmap memory for VFIO container, \
                    iova 0x{iova:x}, size 0x{size:x}: {e:?}"
                ),
            )
        })
    }
}
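
// A minimal usage sketch for VfioDmaMapping (`container`, `guest_memory` and
// `mmio_regions` are assumed to be provided by the surrounding VMM; this is
// illustrative, not an API guarantee):
//
//     let mapping = VfioDmaMapping::new(container, guest_memory, mmio_regions);
//     mapping.map(iova, gpa, size)?;  // program a DMA window through the IOMMU
//     mapping.unmap(iova, size)?;     // tear the mapping down again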