// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//

use crate::msi::{MsiConfigState, MSI_CONFIG_ID};
use crate::msix::MsixConfigState;
use crate::{
    msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
    PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
    PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
    PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID,
};
use anyhow::anyhow;
use byteorder::{ByteOrder, LittleEndian};
use hypervisor::HypervisorVmError;
use libc::{sysconf, _SC_PAGESIZE};
use std::any::Any;
use std::collections::{BTreeMap, HashMap};
use std::io;
use std::os::unix::io::AsRawFd;
use std::ptr::null_mut;
use std::sync::{Arc, Barrier, Mutex};
use thiserror::Error;
use versionize::{VersionMap, Versionize, VersionizeResult};
use versionize_derive::Versionize;
use vfio_bindings::bindings::vfio::*;
use vfio_ioctls::{
    VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
};
use vm_allocator::page_size::{
    align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned,
};
use vm_allocator::{AddressAllocator, SystemAllocator};
use vm_device::interrupt::{
    InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
};
use vm_device::{BusDevice, Resource};
use vm_memory::{Address, GuestAddress, GuestUsize};
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
};
use vmm_sys_util::eventfd::EventFd;

pub(crate) const VFIO_COMMON_ID: &str = "vfio_common";

#[derive(Debug, Error)]
pub enum VfioPciError {
    #[error("Failed to create user memory region: {0}")]
    CreateUserMemoryRegion(#[source] HypervisorVmError),
    #[error("Failed to DMA map: {0}")]
    DmaMap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to DMA unmap: {0}")]
    DmaUnmap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to enable INTx: {0}")]
    EnableIntx(#[source] VfioError),
    #[error("Failed to enable MSI: {0}")]
    EnableMsi(#[source] VfioError),
    #[error("Failed to enable MSI-X: {0}")]
    EnableMsix(#[source] VfioError),
    #[error("Failed to mmap the area")]
    MmapArea,
    #[error("Failed to get the notifier's eventfd")]
    MissingNotifier,
    #[error("Invalid region alignment")]
    RegionAlignment,
    #[error("Invalid region size")]
    RegionSize,
    #[error("Failed to retrieve MsiConfigState: {0}")]
    RetrieveMsiConfigState(#[source] anyhow::Error),
    #[error("Failed to retrieve MsixConfigState: {0}")]
    RetrieveMsixConfigState(#[source] anyhow::Error),
    #[error("Failed to retrieve PciConfigurationState: {0}")]
    RetrievePciConfigurationState(#[source] anyhow::Error),
    #[error("Failed to retrieve VfioCommonState: {0}")]
    RetrieveVfioCommonState(#[source] anyhow::Error),
}

#[derive(Copy, Clone)]
enum PciVfioSubclass {
    VfioSubclass = 0xff,
}

impl PciSubclass for PciVfioSubclass {
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}

enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}

#[derive(Versionize)]
struct IntxState {
    enabled: bool,
}

pub(crate) struct VfioIntx {
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    enabled: bool,
}

#[derive(Versionize)]
struct MsiState {
    cap: MsiCap,
    cap_offset: u32,
}

pub(crate) struct VfioMsi {
    pub(crate) cfg: MsiConfig,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsi {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.cfg.enabled();

        self.cfg.update(offset, data);

        let new_enabled = self.cfg.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsi);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsi);
        }

        None
    }
}

#[derive(Versionize)]
struct MsixState {
    cap: MsixCap,
    cap_offset: u32,
    bdf: u32,
}

pub(crate) struct VfioMsix {
    pub(crate) bar: MsixConfig,
    cap: MsixCap,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsix {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.bar.enabled();

        // Update "Message Control" word
        if offset == 2 && data.len() == 2 {
            self.bar.set_msg_ctl(LittleEndian::read_u16(data));
        }

        let new_enabled = self.bar.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsix);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsix);
        }

        None
    }

    fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        let table_offset: u64 = u64::from(self.cap.table_offset());
        let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
        let table_bir: u32 = self.cap.table_bir();

        bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
    }
}

pub(crate) struct Interrupt {
    pub(crate) intx: Option<VfioIntx>,
    pub(crate) msi: Option<VfioMsi>,
    pub(crate) msix: Option<VfioMsix>,
}

impl Interrupt {
    fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msi) = &mut self.msi {
            let action = msi.update(offset, data);
            return action;
        }

        None
    }

    fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msix) = &mut self.msix {
            let action = msix.update(offset, data);
            return action;
        }

        None
    }

    fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
        if let Some(msi) = &self.msi {
            if offset >= u64::from(msi.cap_offset)
                && offset < u64::from(msi.cap_offset) + msi.cfg.size()
            {
                return Some((
                    PciCapabilityId::MessageSignalledInterrupts,
                    u64::from(msi.cap_offset),
                ));
            }
        }

        if let Some(msix) = &self.msix {
            if offset == u64::from(msix.cap_offset) {
                return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
            }
        }

        None
    }

    fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        if let Some(msix) = &self.msix {
            return msix.table_accessed(bar_index, offset);
        }

        false
    }

    fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
        if let Some(ref mut msix) = &mut self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.write_table(offset, data)
        }
    }

    fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
        if let Some(msix) = &self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.read_table(offset, data)
        }
    }

    pub(crate) fn intx_in_use(&self) -> bool {
        if let Some(intx) = &self.intx {
            return intx.enabled;
        }

        false
    }
}

#[derive(Copy, Clone)]
pub struct UserMemoryRegion {
    pub slot: u32,
    pub start: u64,
    pub size: u64,
    pub host_addr: u64,
}

#[derive(Clone)]
pub struct MmioRegion {
    pub start: GuestAddress,
    pub length: GuestUsize,
    pub(crate) type_: PciBarRegionType,
    pub(crate) index: u32,
    pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
}

#[derive(Debug, Error)]
pub enum VfioError {
    #[error("Kernel VFIO error: {0}")]
    KernelVfio(#[source] vfio_ioctls::VfioError),
    #[error("VFIO user error: {0}")]
    VfioUser(#[source] vfio_user::Error),
}

pub(crate) trait Vfio: Send + Sync {
    fn read_config_byte(&self, offset: u32) -> u8 {
        let mut data: [u8; 1] = [0];
        self.read_config(offset, &mut data);
        data[0]
    }

    fn read_config_word(&self, offset: u32) -> u16 {
        let mut data: [u8; 2] = [0, 0];
        self.read_config(offset, &mut data);
        u16::from_le_bytes(data)
    }

    fn read_config_dword(&self, offset: u32) -> u32 {
        let mut data: [u8; 4] = [0, 0, 0, 0];
        self.read_config(offset, &mut data);
        u32::from_le_bytes(data)
    }

    fn write_config_dword(&self, offset: u32, buf: u32) {
        let data: [u8; 4] = buf.to_le_bytes();
        self.write_config(offset, &data)
    }

    fn read_config(&self, offset: u32, data: &mut [u8]) {
        self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
    }

    fn write_config(&self, offset: u32, data: &[u8]) {
        self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
    }

    fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
    }

    fn disable_msi(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
    }

    fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
    }

    fn disable_msix(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
    }

    fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
        unimplemented!()
    }

    fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
        unimplemented!()
    }

    fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
        unimplemented!()
    }

    fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }
}

struct VfioDeviceWrapper {
    device: Arc<VfioDevice>,
}

impl VfioDeviceWrapper {
    fn new(device: Arc<VfioDevice>) -> Self {
        Self { device }
    }
}
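
// A thin adapter from the `Vfio` trait used throughout this module to the
// `vfio_ioctls::VfioDevice` API. Note that the argument order differs: the
// trait takes (index, offset, data) while vfio-ioctls takes
// (index, data, offset).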
impl Vfio for VfioDeviceWrapper {
    fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
        self.device.region_read(index, data, offset)
    }

    fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
        self.device.region_write(index, data, offset)
    }

    fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
        self.device.get_irq_info(irq_index).copied()
    }

    fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.device
            .enable_irq(irq_index, event_fds)
            .map_err(VfioError::KernelVfio)
    }

    fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .disable_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }

    fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .unmask_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }
}

#[derive(Versionize)]
struct VfioCommonState {
    intx_state: Option<IntxState>,
    msi_state: Option<MsiState>,
    msix_state: Option<MsixState>,
}

impl VersionMapped for VfioCommonState {}

pub(crate) struct ConfigPatch {
    mask: u32,
    patch: u32,
}

pub(crate) struct VfioCommon {
    pub(crate) configuration: PciConfiguration,
    pub(crate) mmio_regions: Vec<MmioRegion>,
    pub(crate) interrupt: Interrupt,
    pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
    pub(crate) vfio_wrapper: Arc<dyn Vfio>,
    pub(crate) patches: HashMap<usize, ConfigPatch>,
    x_nv_gpudirect_clique: Option<u8>,
}

impl VfioCommon {
    pub(crate) fn new(
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        vfio_wrapper: Arc<dyn Vfio>,
        subclass: &dyn PciSubclass,
        bdf: PciBdf,
        snapshot: Option<Snapshot>,
        x_nv_gpudirect_clique: Option<u8>,
    ) -> Result<Self, VfioPciError> {
        let pci_configuration_state =
            vm_migration::versioned_state_from_id(snapshot.as_ref(), PCI_CONFIGURATION_ID)
                .map_err(|e| {
                    VfioPciError::RetrievePciConfigurationState(anyhow!(
                        "Failed to get PciConfigurationState from Snapshot: {}",
                        e
                    ))
                })?;

        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            subclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
            pci_configuration_state,
        );

        let mut vfio_common = VfioCommon {
            mmio_regions: Vec::new(),
            configuration,
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
            msi_interrupt_manager,
            legacy_interrupt_group,
            vfio_wrapper,
            patches: HashMap::new(),
            x_nv_gpudirect_clique,
        };

        let state: Option<VfioCommonState> = snapshot
            .as_ref()
            .map(|s| s.to_versioned_state())
            .transpose()
            .map_err(|e| {
                VfioPciError::RetrieveVfioCommonState(anyhow!(
                    "Failed to get VfioCommonState from Snapshot: {}",
                    e
                ))
            })?;
        let msi_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSI_CONFIG_ID)
            .map_err(|e| {
                VfioPciError::RetrieveMsiConfigState(anyhow!(
                    "Failed to get MsiConfigState from Snapshot: {}",
                    e
                ))
            })?;
        let msix_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSIX_CONFIG_ID)
            .map_err(|e| {
                VfioPciError::RetrieveMsixConfigState(anyhow!(
                    "Failed to get MsixConfigState from Snapshot: {}",
                    e
                ))
            })?;

        if let Some(state) = state.as_ref() {
            vfio_common.set_state(state, msi_state, msix_state)?;
        } else {
            vfio_common.parse_capabilities(bdf);
            vfio_common.initialize_legacy_interrupt()?;
        }

        Ok(vfio_common)
    }

    /// In case the MSI-X table offset is not page size aligned, we need to
    /// do some fixup to achieve it, because we don't want the MMIO RW region
    /// and the trapped region to overlap each other.
    fn fixup_msix_region(&mut self, bar_id: u32, region_size: u64) -> u64 {
        if let Some(msix) = self.interrupt.msix.as_mut() {
            let msix_cap = &mut msix.cap;

            // We assume that table_bir equals pba_bir here, i.e. that the
            // MSI-X table and the PBA live in the same BAR.
            let (table_offset, table_size) = msix_cap.table_range();
            if is_page_size_aligned(table_offset) || msix_cap.table_bir() != bar_id {
                return region_size;
            }

            let (pba_offset, pba_size) = msix_cap.pba_range();
            let msix_sz = align_page_size_up(table_size + pba_size);
            // Expand the region so that it can hold both the RW and the
            // trapped regions, each page size aligned.
            let size = std::cmp::max(region_size * 2, msix_sz * 2);
            // Let the table start from the middle of the region.
            msix_cap.table_set_offset((size / 2) as u32);
            msix_cap.pba_set_offset((size / 2 + pba_offset - table_offset) as u32);

            size
        } else {
            // MSI-X is not supported by this device.
            region_size
        }
    }

    // The `allocator` argument is unused on `aarch64`
    #[allow(unused_variables)]
    pub(crate) fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        let mut bars = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX;

        // Go through all regular regions to compute the BAR size.
        // We're not saving the BAR address to restore it, because we
        // are going to allocate a guest address for each BAR and write
        // that new address back.
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let mut region_size: u64 = 0;
            let mut region_type = PciBarRegionType::Memory32BitRegion;
            let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
            let mut flags: u32 = 0;

            let mut restored_bar_addr = None;
            if let Some(resources) = &resources {
                for resource in resources {
                    if let Resource::PciBar {
                        index,
                        base,
                        size,
                        type_,
                        ..
                    } = resource
                    {
                        if *index == bar_id as usize {
                            restored_bar_addr = Some(GuestAddress(*base));
                            region_size = *size;
                            region_type = PciBarRegionType::from(*type_);
                            break;
                        }
                    }
                }
                if restored_bar_addr.is_none() {
                    bar_id += 1;
                    continue;
                }
            } else {
                let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                    (PCI_ROM_EXP_BAR_INDEX * 4) as u32
                } else {
                    PCI_CONFIG_BAR_OFFSET + bar_id * 4
                };

                // First read the flags.
                flags = self.vfio_wrapper.read_config_dword(bar_offset);

                // Is this an IO BAR?
                let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
                } else {
                    false
                };

                // Is this a 64-bit BAR?
                let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(
                        flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                        PCI_CONFIG_MEMORY_BAR_64BIT
                    )
                } else {
                    false
                };

                if matches!(
                    flags & PCI_CONFIG_BAR_PREFETCHABLE,
                    PCI_CONFIG_BAR_PREFETCHABLE
                ) {
                    prefetchable = PciBarPrefetchable::Prefetchable
                };

                // To get the size, write all 1s.
                self.vfio_wrapper
                    .write_config_dword(bar_offset, 0xffff_ffff);
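
                // As an illustration, after the all-1s write a device
                // exposing a 1 MiB 32-bit memory BAR returns 0xfff0_0000
                // (plus flag bits in the low nibble) from the read-back;
                // masking the flags and computing !0xfff0_0000 + 1 yields
                // 0x0010_0000, i.e. the 1 MiB size. The decode below applies
                // this to I/O, 32-bit and 64-bit BARs.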

                // And read back the BAR value. The device will write zeros
                // for bits it doesn't care about.
                let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);

                if io_bar {
                    // Mask out the flag bits (lowest 2 for I/O BARs).
                    lower &= !0b11;

                    // BAR is not enabled.
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // IO BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Invert the bits and add 1 to calculate the size.
                    region_size = (!lower + 1) as u64;
                } else if is_64bit_bar {
                    // 64-bit Memory BAR
                    region_type = PciBarRegionType::Memory64BitRegion;

                    // Query the size of the upper half of the 64-bit BAR.
                    let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                    self.vfio_wrapper
                        .write_config_dword(upper_offset, 0xffff_ffff);
                    let upper = self.vfio_wrapper.read_config_dword(upper_offset);

                    let mut combined_size = u64::from(upper) << 32 | u64::from(lower);

                    // Mask out the flag bits (lowest 4 for memory BARs).
                    combined_size &= !0b1111;

                    // BAR is not enabled.
                    if combined_size == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find the size.
                    region_size = !combined_size + 1;
                } else {
                    region_type = PciBarRegionType::Memory32BitRegion;

                    // Mask out the flag bits (lowest 4 for memory BARs).
                    lower &= !0b1111;

                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find the size.
                    region_size = (!lower + 1) as u64;
                }
            }

            let bar_addr = match region_type {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "aarch64")]
                    unimplemented!();

                    // The address needs to be 4 bytes aligned.
                    #[cfg(not(target_arch = "aarch64"))]
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory32BitRegion => {
                    // BAR allocation must be naturally aligned
                    mmio32_allocator
                        .allocate(restored_bar_addr, region_size, Some(region_size))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory64BitRegion => {
                    // We need to do some fixup to keep the MMIO RW region and
                    // the MSI-X cap region page size aligned.
                    region_size = self.fixup_msix_region(bar_id, region_size);
                    mmio64_allocator
                        .allocate(
                            restored_bar_addr,
                            region_size,
                            Some(std::cmp::max(
                                // SAFETY: FFI call. Trivially safe.
                                unsafe { sysconf(_SC_PAGESIZE) as GuestUsize },
                                region_size,
                            )),
                        )
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
            };
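
            // Note that a 64-bit memory BAR spans two consecutive BAR
            // registers, which is why `bar_id` is advanced a second time at
            // the bottom of this loop for such regions.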

            // We can now build our BAR configuration block.
            let bar = PciBarConfiguration::default()
                .set_index(bar_id as usize)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type)
                .set_prefetchable(prefetchable);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                self.configuration
                    .add_pci_rom_bar(&bar, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&bar)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            bars.push(bar);
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id,
                user_memory_regions: Vec::new(),
            });

            bar_id += 1;
            if region_type == PciBarRegionType::Memory64BitRegion {
                bar_id += 1;
            }
        }

        Ok(bars)
    }

    // The `allocator` argument is unused on `aarch64`
    #[allow(unused_variables)]
    pub(crate) fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        for region in self.mmio_regions.iter() {
            match region.type_ {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "x86_64")]
                    allocator.free_io_addresses(region.start, region.length);
                    #[cfg(target_arch = "aarch64")]
                    error!("I/O region is not supported");
                }
                PciBarRegionType::Memory32BitRegion => {
                    mmio32_allocator.free(region.start, region.length);
                }
                PciBarRegionType::Memory64BitRegion => {
                    mmio64_allocator.free(region.start, region.length);
                }
            }
        }
        Ok(())
    }

    pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
        let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());

        let table = self.vfio_wrapper.read_config_dword((cap + 4).into());

        let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());

        MsixCap {
            msg_ctl,
            table,
            pba,
        }
    }

    pub(crate) fn initialize_msix(
        &mut self,
        msix_cap: MsixCap,
        cap_offset: u32,
        bdf: PciBdf,
        state: Option<MsixConfigState>,
    ) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msix_cap.table_size() as InterruptIndex,
            })
            .unwrap();

        let msix_config = MsixConfig::new(
            msix_cap.table_size(),
            interrupt_source_group.clone(),
            bdf.into(),
            state,
        )
        .unwrap();

        self.interrupt.msix = Some(VfioMsix {
            bar: msix_config,
            cap: msix_cap,
            cap_offset,
            interrupt_source_group,
        });
    }

    pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
        self.vfio_wrapper.read_config_word((cap + 2).into())
    }

    pub(crate) fn initialize_msi(
        &mut self,
        msg_ctl: u16,
        cap_offset: u32,
        state: Option<MsiConfigState>,
    ) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
            })
            .unwrap();

        let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap();

        self.interrupt.msi = Some(VfioMsi {
            cfg: msi_config,
            cap_offset,
            interrupt_source_group,
        });
    }
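
    // Walk the standard capability list looking for the MSI-X capability:
    // PCI_CONFIG_CAPABILITY_OFFSET (0x34) holds the offset of the first
    // capability, and each capability header stores its ID at offset 0 and
    // the offset of the next capability at offset 1.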
    pub(crate) fn get_msix_cap_idx(&self) -> Option<usize> {
        let mut cap_next = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        while cap_next != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());
            if PciCapabilityId::from(cap_id) == PciCapabilityId::MsiX {
                return Some(cap_next as usize);
            } else {
                cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
            }
        }

        None
    }

    pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) {
        let mut cap_iter = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        let mut pci_express_cap_found = false;
        let mut power_management_cap_found = false;

        while cap_iter != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_iter.into());

            match PciCapabilityId::from(cap_id) {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI.
                            let msg_ctl = self.parse_msi_capabilities(cap_iter);
                            self.initialize_msi(msg_ctl, cap_iter as u32, None);
                        }
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
                    {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI-X.
                            let msix_cap = self.parse_msix_capabilities(cap_iter);
                            self.initialize_msix(msix_cap, cap_iter as u32, bdf, None);
                        }
                    }
                }
                PciCapabilityId::PciExpress => pci_express_cap_found = true,
                PciCapabilityId::PowerManagement => power_management_cap_found = true,
                _ => {}
            };

            let cap_next = self.vfio_wrapper.read_config_byte((cap_iter + 1).into());
            if cap_next == 0 {
                break;
            }

            cap_iter = cap_next;
        }

        if let Some(clique_id) = self.x_nv_gpudirect_clique {
            self.add_nv_gpudirect_clique_cap(cap_iter, clique_id);
        }

        if pci_express_cap_found && power_management_cap_found {
            self.parse_extended_capabilities();
        }
    }

    fn add_nv_gpudirect_clique_cap(&mut self, cap_iter: u8, clique_id: u8) {
        // Turing, Ampere, Hopper, and Lovelace GPUs have dedicated space
        // at 0xD4 for this capability.
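        // `cap_iter` points to the last capability of the standard list at
        // this point, so patching bits 15:8 of its register (the "next
        // capability" pointer) links the chain to the vendor-specific
        // capability we expose at 0xd4.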
        let cap_offset = 0xd4u32;

        let reg_idx = (cap_iter / 4) as usize;
        self.patches.insert(
            reg_idx,
            ConfigPatch {
                mask: 0x0000_ff00,
                patch: cap_offset << 8,
            },
        );

        let reg_idx = (cap_offset / 4) as usize;
        self.patches.insert(
            reg_idx,
            ConfigPatch {
                mask: 0xffff_ffff,
                patch: 0x50080009u32,
            },
        );
        self.patches.insert(
            reg_idx + 1,
            ConfigPatch {
                mask: 0xffff_ffff,
                patch: u32::from(clique_id) << 19 | 0x5032,
            },
        );
    }

    fn parse_extended_capabilities(&mut self) {
        let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;

        loop {
            let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);

            // The extended capability header carries the capability ID in
            // bits [15:0] and the offset of the next capability in bits
            // [31:20].
            let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
            let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;

            match PciExpressCapabilityId::from(cap_id) {
                PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation
                | PciExpressCapabilityId::ResizeableBar
                | PciExpressCapabilityId::SingleRootIoVirtualization => {
                    // Hide capabilities we cannot support from the guest by
                    // patching their ID to the null capability.
                    let reg_idx = (current_offset / 4) as usize;
                    self.patches.insert(
                        reg_idx,
                        ConfigPatch {
                            mask: 0x0000_ffff,
                            patch: PciExpressCapabilityId::NullCapability as u32,
                        },
                    );
                }
                _ => {}
            }

            if cap_next == 0 {
                break;
            }

            current_offset = cap_next.into();
        }
    }

    pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
        if let Some(intx) = &mut self.interrupt.intx {
            if !intx.enabled {
                if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
                    self.vfio_wrapper
                        .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
                        .map_err(VfioPciError::EnableIntx)?;

                    intx.enabled = true;
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }
        }

        Ok(())
    }

    pub(crate) fn disable_intx(&mut self) {
        if let Some(intx) = &mut self.interrupt.intx {
            if intx.enabled {
                if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                    error!("Could not disable INTx: {}", e);
                } else {
                    intx.enabled = false;
                }
            }
        }
    }

    pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
        if let Some(msi) = &self.interrupt.msi {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msi.cfg.num_enabled_vectors() {
                if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msi(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsi)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msi(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msi() {
            error!("Could not disable MSI: {}", e);
        }
    }

    pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
        if let Some(msix) = &self.interrupt.msix {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msix.bar.table_entries.len() {
                if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msix(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsix)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msix(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msix() {
            error!("Could not disable MSI-X: {}", e);
        }
    }

    pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
        if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
            if irq_info.count == 0 {
                // A count of 0 means the INTx IRQ is not supported, therefore
                // it shouldn't be initialized.
                return Ok(());
            }
        }

        if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            self.enable_intx()?;
        }

        Ok(())
    }

    pub(crate) fn update_msi_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msi(offset, data) {
            Some(InterruptUpdateAction::EnableMsi) => {
                // Disable INTx before we can enable MSI
                self.disable_intx();
                self.enable_msi()?;
            }
            Some(InterruptUpdateAction::DisableMsi) => {
                // Fall back to INTx when disabling MSI
                self.disable_msi();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn update_msix_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msix(offset, data) {
            Some(InterruptUpdateAction::EnableMsix) => {
                // Disable INTx before we can enable MSI-X
                self.disable_intx();
                self.enable_msix()?;
            }
            Some(InterruptUpdateAction::DisableMsix) => {
                // Fall back to INTx when disabling MSI-X
                self.disable_msix();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
        for region in self.mmio_regions.iter() {
            if addr >= region.start.raw_value()
                && addr < region.start.unchecked_add(region.length).raw_value()
            {
                return Some(region.clone());
            }
        }
        None
    }

    pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_read_table(offset, data);
            } else {
                self.vfio_wrapper.region_read(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest reading from the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }
    }

    pub(crate) fn write_bar(
        &mut self,
        base: u64,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            // If the MSI-X table is written to, we need to update our cache.
            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_write_table(offset, data);
            } else {
                self.vfio_wrapper.region_write(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest writing to the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }

        None
    }

    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming the
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why, when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through the VFIO_DEVICE_SET_IRQS ioctl, and only then can we
        // write to the device region to update the MSI Enable bit.
        self.vfio_wrapper.write_config((reg + offset) as u32, data);

        None
    }

    pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        // When reading the BARs, we trap it and return what comes
        // from our local configuration space. We want the guest to
        // use that and not the VFIO device BARs, as those do not map
        // to the guest address space.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            return self.configuration.read_reg(reg_idx);
        }

        if let Some(id) = self.get_msix_cap_idx() {
            let msix = self.interrupt.msix.as_mut().unwrap();
            if reg_idx * 4 == id + 4 {
                return msix.cap.table;
            } else if reg_idx * 4 == id + 8 {
                return msix.cap.pba;
            }
        }

        // Since we don't support passing multi-function devices, we should
        // mask the multi-function bit, bit 7 of the Header Type byte of
        // register 3.
        let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
            0xff7f_ffff
        } else {
            0xffff_ffff
        };

        // The config register read comes from the VFIO device itself.
        let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;

        if let Some(config_patch) = self.patches.get(&reg_idx) {
            value = (value & !config_patch.mask) | config_patch.patch;
        }

        value
    }

    fn state(&self) -> VfioCommonState {
        let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
            enabled: intx.enabled,
        });

        let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
            cap: msi.cfg.cap,
            cap_offset: msi.cap_offset,
        });

        let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
            cap: msix.cap,
            cap_offset: msix.cap_offset,
            bdf: msix.bar.devid,
        });

        VfioCommonState {
            intx_state,
            msi_state,
            msix_state,
        }
    }

    fn set_state(
        &mut self,
        state: &VfioCommonState,
        msi_state: Option<MsiConfigState>,
        msix_state: Option<MsixConfigState>,
    ) -> Result<(), VfioPciError> {
        if let (Some(intx), Some(interrupt_source_group)) =
            (&state.intx_state, self.legacy_interrupt_group.clone())
        {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            if intx.enabled {
                self.enable_intx()?;
            }
        }

        if let Some(msi) = &state.msi_state {
            self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state);
        }

        if let Some(msix) = &state.msix_state {
            self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state);
        }

        Ok(())
    }
}

impl Pausable for VfioCommon {}

impl Snapshottable for VfioCommon {
    fn id(&self) -> String {
        String::from(VFIO_COMMON_ID)
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_common_snapshot = Snapshot::new_from_versioned_state(&self.state())?;

        // Snapshot PciConfiguration
        vfio_common_snapshot.add_snapshot(self.configuration.id(), self.configuration.snapshot()?);

        // Snapshot MSI
        if let Some(msi) = &mut self.interrupt.msi {
            vfio_common_snapshot.add_snapshot(msi.cfg.id(), msi.cfg.snapshot()?);
        }

        // Snapshot MSI-X
        if let Some(msix) = &mut self.interrupt.msix {
            vfio_common_snapshot.add_snapshot(msix.bar.id(), msix.bar.snapshot()?);
        }

        Ok(vfio_common_snapshot)
    }
}

/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
pub struct VfioPciDevice {
    id: String,
    vm: Arc<dyn hypervisor::Vm>,
    device: Arc<VfioDevice>,
    container: Arc<VfioContainer>,
    common: VfioCommon,
    iommu_attached: bool,
    memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
}

impl VfioPciDevice {
    /// Constructs a new VFIO PCI device for the given VFIO device
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        id: String,
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        iommu_attached: bool,
        bdf: PciBdf,
        memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
        snapshot: Option<Snapshot>,
        x_nv_gpudirect_clique: Option<u8>,
    ) -> Result<Self, VfioPciError> {
        let device = Arc::new(device);
        device.reset();

        let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));

        let common = VfioCommon::new(
            msi_interrupt_manager,
            legacy_interrupt_group,
            Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
            &PciVfioSubclass::VfioSubclass,
            bdf,
            vm_migration::snapshot_from_id(snapshot.as_ref(), VFIO_COMMON_ID),
            x_nv_gpudirect_clique,
        )?;

        let vfio_pci_device = VfioPciDevice {
            id,
            vm: vm.clone(),
            device,
            container,
            common,
            iommu_attached,
            memory_slot,
        };

        Ok(vfio_pci_device)
    }

    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }

    fn generate_sparse_areas(
        caps: &[VfioRegionInfoCap],
        region_index: u32,
        region_start: u64,
        region_size: u64,
        vfio_msix: Option<&VfioMsix>,
    ) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
        for cap in caps {
            match cap {
                VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
                VfioRegionInfoCap::MsixMappable => {
                    if !is_4k_aligned(region_start) {
                        error!(
                            "Region start address 0x{:x} must be at least aligned on 4KiB",
                            region_start
                        );
                        return Err(VfioPciError::RegionAlignment);
                    }
                    if !is_4k_multiple(region_size) {
                        error!(
                            "Region size 0x{:x} must be at least a multiple of 4KiB",
                            region_size
                        );
                        return Err(VfioPciError::RegionSize);
                    }

                    // In case the region contains the MSI-X vectors table or
                    // the MSI-X PBA table, we must calculate the subregions
                    // around them, leading to a list of sparse areas.
                    // We want to make sure we will still trap MMIO accesses
                    // to these MSI-X specific ranges. If these ranges don't
                    // align with the page size, we can achieve this by
                    // enlarging them.
                    //
                    // A BTreeMap is used so that the list provided through
                    // the iterator is sorted by key. This ensures a proper
                    // split of the whole region.
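                    //
                    // As an example, a 0x4000-byte region whose page-aligned
                    // MSI-X table range is [0x1000, 0x2000) gets split into
                    // two sparse areas, [0x0, 0x1000) and [0x2000, 0x4000),
                    // leaving the table itself trapped.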
                    let mut inter_ranges = BTreeMap::new();
                    if let Some(msix) = vfio_msix {
                        if region_index == msix.cap.table_bir() {
                            let (offset, size) = msix.cap.table_range();
                            let offset = align_page_size_down(offset);
                            let size = align_page_size_up(size);
                            inter_ranges.insert(offset, size);
                        }
                        if region_index == msix.cap.pba_bir() {
                            let (offset, size) = msix.cap.pba_range();
                            let offset = align_page_size_down(offset);
                            let size = align_page_size_up(size);
                            inter_ranges.insert(offset, size);
                        }
                    }

                    let mut sparse_areas = Vec::new();
                    let mut current_offset = 0;
                    for (range_offset, range_size) in inter_ranges {
                        if range_offset > current_offset {
                            sparse_areas.push(VfioRegionSparseMmapArea {
                                offset: current_offset,
                                size: range_offset - current_offset,
                            });
                        }
                        current_offset = align_page_size_down(range_offset + range_size);
                    }

                    if region_size > current_offset {
                        sparse_areas.push(VfioRegionSparseMmapArea {
                            offset: current_offset,
                            size: region_size - current_offset,
                        });
                    }

                    return Ok(sparse_areas);
                }
                _ => {}
            }
        }

        // In case no relevant capabilities have been found, create a single
        // sparse area corresponding to the entire MMIO region.
        Ok(vec![VfioRegionSparseMmapArea {
            offset: 0,
            size: region_size,
        }])
    }

    /// Map MMIO regions into the guest, and avoid VM exits when the guest
    /// tries to reach those regions. The VFIO MMIO regions are registered
    /// with the VM object as user memory regions, using memory slots
    /// returned by the `memory_slot` closure.
    pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
        let fd = self.device.as_raw_fd();

        for region in self.common.mmio_regions.iter_mut() {
            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }

                // Retrieve the list of capabilities found on the region
                let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
                    self.device.get_region_caps(region.index)
                } else {
                    Vec::new()
                };

                // Don't try to mmap the region if it contains the MSI-X table
                // or the MSI-X PBA subregion, and we couldn't find
                // MSIX_MAPPABLE in the list of supported capabilities.
                if let Some(msix) = self.common.interrupt.msix.as_ref() {
                    if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
                        && !caps.contains(&VfioRegionInfoCap::MsixMappable)
                    {
                        continue;
                    }
                }

                let mmap_size = self.device.get_region_size(region.index);
                let mmap_offset = self.device.get_region_offset(region.index);

                let sparse_areas = Self::generate_sparse_areas(
                    &caps,
                    region.index,
                    region.start.0,
                    mmap_size,
                    self.common.interrupt.msix.as_ref(),
                )?;

                for area in sparse_areas.iter() {
                    // SAFETY: FFI call with correct arguments
                    let host_addr = unsafe {
                        libc::mmap(
                            null_mut(),
                            area.size as usize,
                            prot,
                            libc::MAP_SHARED,
                            fd,
                            mmap_offset as libc::off_t + area.offset as libc::off_t,
                        )
                    };

                    if host_addr == libc::MAP_FAILED {
                        error!(
                            "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
                            area.offset,
                            area.size,
                            std::io::Error::last_os_error()
                        );
                        return Err(VfioPciError::MmapArea);
                    }

                    if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) {
                        warn!(
                            "Could not mmap sparse area that is not page size aligned (offset = 0x{:x}, size = 0x{:x})",
                            area.offset,
                            area.size,
                        );
                        return Ok(());
                    }

                    let user_memory_region = UserMemoryRegion {
                        slot: (self.memory_slot)(),
                        start: region.start.0 + area.offset,
                        size: area.size,
                        host_addr: host_addr as u64,
                    };

                    region.user_memory_regions.push(user_memory_region);

                    let mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(mem_region)
                        .map_err(VfioPciError::CreateUserMemoryRegion)?;

                    if !self.iommu_attached {
                        self.container
                            .vfio_dma_map(
                                user_memory_region.start,
                                user_memory_region.size,
                                user_memory_region.host_addr,
                            )
                            .map_err(VfioPciError::DmaMap)?;
                    }
                }
            }
        }

        Ok(())
    }

    pub fn unmap_mmio_regions(&mut self) {
        for region in self.common.mmio_regions.iter() {
            for user_memory_region in region.user_memory_regions.iter() {
                // Unmap from the vfio container
                if !self.iommu_attached {
                    if let Err(e) = self
                        .container
                        .vfio_dma_unmap(user_memory_region.start, user_memory_region.size)
                    {
                        error!("Could not unmap mmio region from vfio container: {}", e);
                    }
                }

                // Remove the region
                let r = self.vm.make_user_memory_region(
                    user_memory_region.slot,
                    user_memory_region.start,
                    user_memory_region.size,
                    user_memory_region.host_addr,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                // SAFETY: FFI call with correct arguments
                let ret = unsafe {
                    libc::munmap(
                        user_memory_region.host_addr as *mut libc::c_void,
                        user_memory_region.size as usize,
                    )
                };
                if ret != 0 {
                    error!(
                        "Could not unmap region {}, error: {}",
                        region.index,
                        io::Error::last_os_error()
                    );
                }
            }
        }
    }
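
    // DMA mapping requests are only forwarded to the VFIO container when the
    // device is not attached to a virtual IOMMU. When it is, the mappings
    // are expected to be established through the IOMMU path instead.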
    pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_map(iova, size, user_addr)
                .map_err(VfioPciError::DmaMap)?;
        }

        Ok(())
    }

    pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_unmap(iova, size)
                .map_err(VfioPciError::DmaUnmap)?;
        }

        Ok(())
    }

    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.common.mmio_regions.clone()
    }
}

impl Drop for VfioPciDevice {
    fn drop(&mut self) {
        self.unmap_mmio_regions();

        if let Some(msix) = &self.common.interrupt.msix {
            if msix.bar.enabled() {
                self.common.disable_msix();
            }
        }

        if let Some(msi) = &self.common.interrupt.msi {
            if msi.cfg.enabled() {
                self.common.disable_msi()
            }
        }

        if self.common.interrupt.intx_in_use() {
            self.common.disable_intx();
        }
    }
}

impl BusDevice for VfioPciDevice {
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}

// First BAR offset in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// Extended capabilities register offset in the PCI config space.
const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
// IO BAR when first BAR bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory BAR flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// Prefetchable BAR bit.
const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device.
const BAR_NUMS: usize = 6;
// PCI Header Type register index.
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index.
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index.
const PCI_ROM_EXP_BAR_INDEX: usize = 12;

impl PciDevice for VfioPciDevice {
    fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        self.common
            .allocate_bars(allocator, mmio32_allocator, mmio64_allocator, resources)
    }

    fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        self.common
            .free_bars(allocator, mmio32_allocator, mmio64_allocator)
    }

    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        self.common.write_config_register(reg_idx, offset, data)
    }

    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        self.common.read_config_register(reg_idx)
    }

    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.common
            .configuration
            .detect_bar_reprogramming(reg_idx, data)
    }

    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.common.read_bar(base, offset, data)
    }

    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.common.write_bar(base, offset, data)
    }

    fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
        for region in self.common.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                for user_memory_region in region.user_memory_regions.iter_mut() {
                    // Remove the old region
                    let old_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .remove_user_memory_region(old_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

                    // Update the user memory region with the correct start address.
                    if new_base > old_base {
                        user_memory_region.start += new_base - old_base;
                    } else {
                        user_memory_region.start -= old_base - new_base;
                    }

                    // Insert the new region
                    let new_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(new_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
                }
            }
        }

        Ok(())
    }

    fn as_any(&mut self) -> &mut dyn Any {
        self
    }

    fn id(&self) -> Option<String> {
        Some(self.id.clone())
    }
}

impl Pausable for VfioPciDevice {}

impl Snapshottable for VfioPciDevice {
    fn id(&self) -> String {
        self.id.clone()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_pci_dev_snapshot = Snapshot::default();

        // Snapshot VfioCommon
        vfio_pci_dev_snapshot.add_snapshot(self.common.id(), self.common.snapshot()?);

        Ok(vfio_pci_dev_snapshot)
    }
}

impl Transportable for VfioPciDevice {}
impl Migratable for VfioPciDevice {}
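
#[cfg(test)]
mod tests {
    // Minimal, self-contained sketches of the bit manipulation performed
    // above. They only exercise illustrative values, not a real VFIO device.

    // Mirrors the BAR sizing decode in `VfioCommon::allocate_bars()`: after
    // writing all 1s to a BAR, the read-back value (with its flag bits
    // masked out) is inverted and incremented to recover the region size.
    #[test]
    fn bar_size_decoding() {
        // Hypothetical read-back for a 1 MiB, 32-bit, prefetchable memory BAR.
        let mut lower: u32 = 0xfff0_0008;
        // Mask out the flag bits (lowest 4 for memory BARs).
        lower &= !0b1111;
        // Invert and add 1 to find the size.
        let region_size = u64::from(!lower) + 1;
        assert_eq!(region_size, 0x10_0000);
    }

    // Mirrors the header decode in `parse_extended_capabilities()`: the
    // capability ID lives in bits [15:0] of the header and the offset of the
    // next capability in bits [31:20].
    #[test]
    fn extended_capability_header_decoding() {
        // Hypothetical header: capability ID 0x000b, version 1, next
        // capability at offset 0x148.
        let ext_cap_hdr: u32 = 0x148 << 20 | 0x1 << 16 | 0x000b;
        assert_eq!((ext_cap_hdr & 0xffff) as u16, 0x000b);
        assert_eq!(((ext_cap_hdr >> 20) & 0xfff) as u16, 0x148);
    }
}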