// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//

use crate::msi::{MsiConfigState, MSI_CONFIG_ID};
use crate::msix::MsixConfigState;
use crate::{
    msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
    PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
    PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
    PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID,
};
use anyhow::anyhow;
use byteorder::{ByteOrder, LittleEndian};
use hypervisor::HypervisorVmError;
use libc::{sysconf, _SC_PAGESIZE};
use std::any::Any;
use std::collections::{BTreeMap, HashMap};
use std::io;
use std::os::unix::io::AsRawFd;
use std::ptr::null_mut;
use std::sync::{Arc, Barrier, Mutex};
use thiserror::Error;
use versionize::{VersionMap, Versionize, VersionizeResult};
use versionize_derive::Versionize;
use vfio_bindings::bindings::vfio::*;
use vfio_ioctls::{
    VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
};
use vm_allocator::page_size::{
    align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned,
};
use vm_allocator::{AddressAllocator, SystemAllocator};
use vm_device::interrupt::{
    InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
};
use vm_device::{BusDevice, Resource};
use vm_memory::{Address, GuestAddress, GuestUsize};
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
};
use vmm_sys_util::eventfd::EventFd;

pub(crate) const VFIO_COMMON_ID: &str = "vfio_common";

#[derive(Debug, Error)]
pub enum VfioPciError {
    #[error("Failed to create user memory region: {0}")]
    CreateUserMemoryRegion(#[source] HypervisorVmError),
    #[error("Failed to DMA map: {0}")]
    DmaMap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to DMA unmap: {0}")]
    DmaUnmap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to enable INTx: {0}")]
    EnableIntx(#[source] VfioError),
    #[error("Failed to enable MSI: {0}")]
    EnableMsi(#[source] VfioError),
    #[error("Failed to enable MSI-X: {0}")]
    EnableMsix(#[source] VfioError),
    #[error("Failed to mmap the area")]
    MmapArea,
    #[error("Failed to find the notifier's eventfd")]
    MissingNotifier,
    #[error("Invalid region alignment")]
    RegionAlignment,
    #[error("Invalid region size")]
    RegionSize,
    #[error("Failed to retrieve MsiConfigState: {0}")]
    RetrieveMsiConfigState(#[source] anyhow::Error),
    #[error("Failed to retrieve MsixConfigState: {0}")]
    RetrieveMsixConfigState(#[source] anyhow::Error),
    #[error("Failed to retrieve PciConfigurationState: {0}")]
    RetrievePciConfigurationState(#[source] anyhow::Error),
    #[error("Failed to retrieve VfioCommonState: {0}")]
    RetrieveVfioCommonState(#[source] anyhow::Error),
}

#[derive(Copy, Clone)]
enum PciVfioSubclass {
    VfioSubclass = 0xff,
}

impl PciSubclass for PciVfioSubclass {
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}

enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}

#[derive(Versionize)]
struct IntxState {
    enabled: bool,
}

pub(crate) struct VfioIntx {
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    enabled: bool,
}
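// Illustrative only: a minimal sketch of the MSI Message Control layout that
// the MSI structures below rely on (PCI spec): bit 0 is MSI Enable, and bits
// [6:4] (Multiple Message Enable) encode the number of enabled vectors as a
// power of two. The value used here is an assumption for the example, not
// data read from a device.
#[cfg(test)]
mod msi_msg_ctl_example {
    #[test]
    fn decode_message_control() {
        let msg_ctl: u16 = 0b0010_0001; // MSI Enable set, MME field = 2
        assert_ne!(msg_ctl & 0x1, 0); // MSI Enable bit
        let mme = (msg_ctl >> 4) & 0b111;
        assert_eq!(1 << mme, 4); // 2^2 = 4 vectors enabled
    }
}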
#[derive(Versionize)]
struct MsiState {
    cap: MsiCap,
    cap_offset: u32,
}

pub(crate) struct VfioMsi {
    pub(crate) cfg: MsiConfig,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsi {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.cfg.enabled();

        self.cfg.update(offset, data);

        let new_enabled = self.cfg.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsi);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsi);
        }

        None
    }
}

#[derive(Versionize)]
struct MsixState {
    cap: MsixCap,
    cap_offset: u32,
    bdf: u32,
}

pub(crate) struct VfioMsix {
    pub(crate) bar: MsixConfig,
    cap: MsixCap,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsix {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.bar.enabled();

        // Update "Message Control" word
        if offset == 2 && data.len() == 2 {
            self.bar.set_msg_ctl(LittleEndian::read_u16(data));
        }

        let new_enabled = self.bar.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsix);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsix);
        }

        None
    }

    fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        let table_offset: u64 = u64::from(self.cap.table_offset());
        let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
        let table_bir: u32 = self.cap.table_bir();

        bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
    }
}

pub(crate) struct Interrupt {
    pub(crate) intx: Option<VfioIntx>,
    pub(crate) msi: Option<VfioMsi>,
    pub(crate) msix: Option<VfioMsix>,
}

impl Interrupt {
    fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msi) = &mut self.msi {
            let action = msi.update(offset, data);
            return action;
        }

        None
    }

    fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msix) = &mut self.msix {
            let action = msix.update(offset, data);
            return action;
        }

        None
    }

    fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
        if let Some(msi) = &self.msi {
            if offset >= u64::from(msi.cap_offset)
                && offset < u64::from(msi.cap_offset) + msi.cfg.size()
            {
                return Some((
                    PciCapabilityId::MessageSignalledInterrupts,
                    u64::from(msi.cap_offset),
                ));
            }
        }

        if let Some(msix) = &self.msix {
            if offset == u64::from(msix.cap_offset) {
                return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
            }
        }

        None
    }

    fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        if let Some(msix) = &self.msix {
            return msix.table_accessed(bar_index, offset);
        }

        false
    }

    fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
        if let Some(ref mut msix) = &mut self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.write_table(offset, data)
        }
    }

    fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
        if let Some(msix) = &self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.read_table(offset, data)
        }
    }

    pub(crate) fn intx_in_use(&self) -> bool {
        if let Some(intx) = &self.intx {
            return intx.enabled;
        }

        false
    }
}
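// Illustrative only: a minimal sketch of the MSI-X fields driving the update
// and table-interception paths above (PCI spec): Message Control bit 15 is
// MSI-X Enable, bits [10:0] encode the table size as N - 1, and each table
// entry is MSIX_TABLE_ENTRY_SIZE (16) bytes. All values are assumptions for
// the example, not data read from a device.
#[cfg(test)]
mod msix_layout_example {
    use super::MSIX_TABLE_ENTRY_SIZE;

    #[test]
    fn decode_message_control_and_table_span() {
        let msg_ctl: u16 = 0x8007; // MSI-X Enable set, table size field = 7
        assert_ne!(msg_ctl & 0x8000, 0); // MSI-X Enable bit
        let table_entries = u64::from(msg_ctl & 0x07ff) + 1;
        assert_eq!(table_entries, 8);

        // With a table offset of 0x0, the 8-entry table spans [0x0, 0x80),
        // so a BAR access at offset 0x10 hits entry 1 and gets routed to the
        // table read/write helpers rather than to the device.
        let table_size = table_entries * MSIX_TABLE_ENTRY_SIZE as u64;
        assert_eq!(table_size, 0x80);
    }
}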
#[derive(Copy, Clone)]
pub struct UserMemoryRegion {
    pub slot: u32,
    pub start: u64,
    pub size: u64,
    pub host_addr: u64,
}

#[derive(Clone)]
pub struct MmioRegion {
    pub start: GuestAddress,
    pub length: GuestUsize,
    pub(crate) type_: PciBarRegionType,
    pub(crate) index: u32,
    pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
}

#[derive(Debug, Error)]
pub enum VfioError {
    #[error("Kernel VFIO error: {0}")]
    KernelVfio(#[source] vfio_ioctls::VfioError),
    #[error("VFIO user error: {0}")]
    VfioUser(#[source] vfio_user::Error),
}

pub(crate) trait Vfio: Send + Sync {
    fn read_config_byte(&self, offset: u32) -> u8 {
        let mut data: [u8; 1] = [0];
        self.read_config(offset, &mut data);
        data[0]
    }

    fn read_config_word(&self, offset: u32) -> u16 {
        let mut data: [u8; 2] = [0, 0];
        self.read_config(offset, &mut data);
        u16::from_le_bytes(data)
    }

    fn read_config_dword(&self, offset: u32) -> u32 {
        let mut data: [u8; 4] = [0, 0, 0, 0];
        self.read_config(offset, &mut data);
        u32::from_le_bytes(data)
    }

    fn write_config_dword(&self, offset: u32, buf: u32) {
        let data: [u8; 4] = buf.to_le_bytes();
        self.write_config(offset, &data)
    }

    fn read_config(&self, offset: u32, data: &mut [u8]) {
        self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
    }

    fn write_config(&self, offset: u32, data: &[u8]) {
        self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
    }

    fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
    }

    fn disable_msi(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
    }

    fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
    }

    fn disable_msix(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
    }

    fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
        unimplemented!()
    }

    fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
        unimplemented!()
    }

    fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
        unimplemented!()
    }

    fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }
}
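// Illustrative only: a minimal in-memory Vfio implementation showing how the
// trait's default config-space helpers compose over region_read() and
// region_write(). The fake config-space contents below are assumptions for
// the example, not real device state.
#[cfg(test)]
mod vfio_trait_example {
    use super::Vfio;
    use std::sync::Mutex;

    struct FakeVfio {
        // A flat buffer standing in for the PCI config region.
        config: Mutex<Vec<u8>>,
    }

    impl Vfio for FakeVfio {
        fn region_read(&self, _index: u32, offset: u64, data: &mut [u8]) {
            let config = self.config.lock().unwrap();
            let offset = offset as usize;
            data.copy_from_slice(&config[offset..offset + data.len()]);
        }

        fn region_write(&self, _index: u32, offset: u64, data: &[u8]) {
            let mut config = self.config.lock().unwrap();
            let offset = offset as usize;
            config[offset..offset + data.len()].copy_from_slice(data);
        }
    }

    #[test]
    fn config_helpers_compose_over_region_accessors() {
        let fake = FakeVfio {
            config: Mutex::new(vec![0u8; 256]),
        };
        // Device/vendor ID dword, little endian in config space.
        fake.write_config_dword(0, 0x1234_8086);
        assert_eq!(fake.read_config_word(0), 0x8086); // vendor ID
        assert_eq!(fake.read_config_word(2), 0x1234); // device ID
        assert_eq!(fake.read_config_byte(0), 0x86);
    }
}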
struct VfioDeviceWrapper {
    device: Arc<VfioDevice>,
}

impl VfioDeviceWrapper {
    fn new(device: Arc<VfioDevice>) -> Self {
        Self { device }
    }
}

impl Vfio for VfioDeviceWrapper {
    fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
        self.device.region_read(index, data, offset)
    }

    fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
        self.device.region_write(index, data, offset)
    }

    fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
        self.device.get_irq_info(irq_index).copied()
    }

    fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.device
            .enable_irq(irq_index, event_fds)
            .map_err(VfioError::KernelVfio)
    }

    fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .disable_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }

    fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .unmask_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }
}

#[derive(Versionize)]
struct VfioCommonState {
    intx_state: Option<IntxState>,
    msi_state: Option<MsiState>,
    msix_state: Option<MsixState>,
}

impl VersionMapped for VfioCommonState {}

pub(crate) struct ConfigPatch {
    mask: u32,
    patch: u32,
}

pub(crate) struct VfioCommon {
    pub(crate) configuration: PciConfiguration,
    pub(crate) mmio_regions: Vec<MmioRegion>,
    pub(crate) interrupt: Interrupt,
    pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
    pub(crate) vfio_wrapper: Arc<dyn Vfio>,
    pub(crate) patches: HashMap<usize, ConfigPatch>,
}

impl VfioCommon {
    pub(crate) fn new(
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        vfio_wrapper: Arc<dyn Vfio>,
        subclass: &dyn PciSubclass,
        bdf: PciBdf,
        snapshot: Option<Snapshot>,
    ) -> Result<Self, VfioPciError> {
        let pci_configuration_state =
            vm_migration::versioned_state_from_id(snapshot.as_ref(), PCI_CONFIGURATION_ID)
                .map_err(|e| {
                    VfioPciError::RetrievePciConfigurationState(anyhow!(
                        "Failed to get PciConfigurationState from Snapshot: {}",
                        e
                    ))
                })?;

        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            subclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
            pci_configuration_state,
        );

        let mut vfio_common = VfioCommon {
            mmio_regions: Vec::new(),
            configuration,
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
            msi_interrupt_manager,
            legacy_interrupt_group,
            vfio_wrapper,
            patches: HashMap::new(),
        };

        let state: Option<VfioCommonState> = snapshot
            .as_ref()
            .map(|s| s.to_versioned_state())
            .transpose()
            .map_err(|e| {
                VfioPciError::RetrieveVfioCommonState(anyhow!(
                    "Failed to get VfioCommonState from Snapshot: {}",
                    e
                ))
            })?;
        let msi_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSI_CONFIG_ID)
            .map_err(|e| {
                VfioPciError::RetrieveMsiConfigState(anyhow!(
                    "Failed to get MsiConfigState from Snapshot: {}",
                    e
                ))
            })?;
        let msix_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSIX_CONFIG_ID)
            .map_err(|e| {
                VfioPciError::RetrieveMsixConfigState(anyhow!(
                    "Failed to get MsixConfigState from Snapshot: {}",
                    e
                ))
            })?;

        if let Some(state) = state.as_ref() {
            vfio_common.set_state(state, msi_state, msix_state)?;
        } else {
            vfio_common.parse_capabilities(bdf);
            vfio_common.initialize_legacy_interrupt()?;
        }

        Ok(vfio_common)
    }
    /// If the MSI-X table offset is not page-size aligned, fix it up so that
    /// the mmap'ed MMIO RW region and the trapped region do not overlap each
    /// other.
    fn fixup_msix_region(&mut self, bar_id: u32, region_size: u64) -> u64 {
        if let Some(msix) = self.interrupt.msix.as_mut() {
            let msix_cap = &mut msix.cap;

            // We assume table_bir equals pba_bir here; the fixup below
            // relies on it.
            let (table_offset, table_size) = msix_cap.table_range();
            if is_page_size_aligned(table_offset) || msix_cap.table_bir() != bar_id {
                return region_size;
            }

            let (pba_offset, pba_size) = msix_cap.pba_range();
            let msix_sz = align_page_size_up(table_size + pba_size);
            // Expand the region so it can hold both the RW and the trap
            // regions, each of them page-size aligned.
            let size = std::cmp::max(region_size * 2, msix_sz * 2);
            // Let the table start from the middle of the region.
            msix_cap.table_set_offset((size / 2) as u32);
            msix_cap.pba_set_offset((size / 2 + pba_offset - table_offset) as u32);

            size
        } else {
            // MSI-X is not supported by this device
            region_size
        }
    }
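    // Worked example for the fixup above (illustrative numbers, assuming a
    // 4 KiB page size): a 16 KiB BAR with a 2-entry MSI-X table at the
    // unaligned offset 0x800 gets expanded to max(0x4000 * 2, 0x1000 * 2) =
    // 0x8000, and the table is moved to offset 0x4000. The first half of the
    // BAR can then be mmap'ed while the second half stays trapped, with a
    // page-aligned boundary between them.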
    // The `allocator` argument is unused on `aarch64`
    #[allow(unused_variables)]
    pub(crate) fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        let mut bars = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX;

        // Go through all the regular regions to compute the BAR size.
        // We're not saving the BAR address to restore it, because we
        // are going to allocate a guest address for each BAR and write
        // that new address back.
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let mut region_size: u64 = 0;
            let mut region_type = PciBarRegionType::Memory32BitRegion;
            let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
            let mut flags: u32 = 0;

            let mut restored_bar_addr = None;
            if let Some(resources) = &resources {
                for resource in resources {
                    if let Resource::PciBar {
                        index,
                        base,
                        size,
                        type_,
                        ..
                    } = resource
                    {
                        if *index == bar_id as usize {
                            restored_bar_addr = Some(GuestAddress(*base));
                            region_size = *size;
                            region_type = PciBarRegionType::from(*type_);
                            break;
                        }
                    }
                }
                if restored_bar_addr.is_none() {
                    bar_id += 1;
                    continue;
                }
            } else {
                let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                    (PCI_ROM_EXP_BAR_INDEX * 4) as u32
                } else {
                    PCI_CONFIG_BAR_OFFSET + bar_id * 4
                };

                // First read the flags.
                flags = self.vfio_wrapper.read_config_dword(bar_offset);

                // Is this an I/O BAR?
                let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
                } else {
                    false
                };

                // Is this a 64-bit BAR?
                let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(
                        flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                        PCI_CONFIG_MEMORY_BAR_64BIT
                    )
                } else {
                    false
                };

                if matches!(
                    flags & PCI_CONFIG_BAR_PREFETCHABLE,
                    PCI_CONFIG_BAR_PREFETCHABLE
                ) {
                    prefetchable = PciBarPrefetchable::Prefetchable
                };

                // To get the size, write all 1s to the BAR register.
                self.vfio_wrapper
                    .write_config_dword(bar_offset, 0xffff_ffff);

                // And read back the BAR value. The device will write zeros
                // for the bits it doesn't care about.
                let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);

                if io_bar {
                    // Mask the flag bits (lowest 2 for I/O BARs).
                    lower &= !0b11;

                    // BAR is not enabled
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // I/O BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Invert the bits and add 1 to calculate the size.
                    region_size = (!lower + 1) as u64;
                } else if is_64bit_bar {
                    // 64-bit memory BAR
                    region_type = PciBarRegionType::Memory64BitRegion;

                    // Query the size of the upper half of the 64-bit BAR.
                    let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                    self.vfio_wrapper
                        .write_config_dword(upper_offset, 0xffff_ffff);
                    let upper = self.vfio_wrapper.read_config_dword(upper_offset);

                    let mut combined_size = u64::from(upper) << 32 | u64::from(lower);

                    // Mask out the flag bits (lowest 4 for memory BARs).
                    combined_size &= !0b1111;

                    // BAR is not enabled
                    if combined_size == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find the size.
                    region_size = !combined_size + 1;
                } else {
                    region_type = PciBarRegionType::Memory32BitRegion;

                    // Mask out the flag bits (lowest 4 for memory BARs).
                    lower &= !0b1111;

                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find the size.
                    region_size = (!lower + 1) as u64;
                }
            }
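            // Worked sizing example (illustrative read-back values): if a
            // 32-bit memory BAR reads back 0xffff_f000 after the all-1s
            // write, masking the flag bits leaves 0xffff_f000, and
            // !0xffff_f000 + 1 = 0x1000, i.e. a 4 KiB BAR. For a 64-bit BAR
            // reading back lower = 0xfe00_0000 and upper = 0xffff_ffff,
            // !0xffff_ffff_fe00_0000 + 1 = 0x200_0000, i.e. a 32 MiB BAR.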
            let bar_addr = match region_type {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "aarch64")]
                    unimplemented!();

                    // The address needs to be 4 bytes aligned.
                    #[cfg(not(target_arch = "aarch64"))]
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory32BitRegion => {
                    // BAR allocation must be naturally aligned
                    mmio32_allocator
                        .allocate(restored_bar_addr, region_size, Some(region_size))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory64BitRegion => {
                    // We need to do some fixup to keep the MMIO RW region and
                    // the MSI-X cap region page-size aligned.
                    region_size = self.fixup_msix_region(bar_id, region_size);
                    mmio64_allocator
                        .allocate(
                            restored_bar_addr,
                            region_size,
                            // SAFETY: FFI call. Trivially safe.
                            Some(unsafe { sysconf(_SC_PAGESIZE) as GuestUsize }),
                        )
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
            };

            // We can now build our BAR configuration block.
            let bar = PciBarConfiguration::default()
                .set_index(bar_id as usize)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type)
                .set_prefetchable(prefetchable);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                self.configuration
                    .add_pci_rom_bar(&bar, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&bar)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            bars.push(bar);
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id,
                user_memory_regions: Vec::new(),
            });

            bar_id += 1;
            if region_type == PciBarRegionType::Memory64BitRegion {
                bar_id += 1;
            }
        }

        Ok(bars)
    }

    // The `allocator` argument is unused on `aarch64`
    #[allow(unused_variables)]
    pub(crate) fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        for region in self.mmio_regions.iter() {
            match region.type_ {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "x86_64")]
                    allocator.free_io_addresses(region.start, region.length);
                    #[cfg(target_arch = "aarch64")]
                    error!("I/O region is not supported");
                }
                PciBarRegionType::Memory32BitRegion => {
                    mmio32_allocator.free(region.start, region.length);
                }
                PciBarRegionType::Memory64BitRegion => {
                    mmio64_allocator.free(region.start, region.length);
                }
            }
        }
        Ok(())
    }

    pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
        let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());

        let table = self.vfio_wrapper.read_config_dword((cap + 4).into());

        let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());

        MsixCap {
            msg_ctl,
            table,
            pba,
        }
    }

    pub(crate) fn initialize_msix(
        &mut self,
        msix_cap: MsixCap,
        cap_offset: u32,
        bdf: PciBdf,
        state: Option<MsixConfigState>,
    ) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msix_cap.table_size() as InterruptIndex,
            })
            .unwrap();

        let msix_config = MsixConfig::new(
            msix_cap.table_size(),
            interrupt_source_group.clone(),
            bdf.into(),
            state,
        )
        .unwrap();

        self.interrupt.msix = Some(VfioMsix {
            bar: msix_config,
            cap: msix_cap,
            cap_offset,
            interrupt_source_group,
        });
    }

    pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
        self.vfio_wrapper.read_config_word((cap + 2).into())
    }

    pub(crate) fn initialize_msi(
        &mut self,
        msg_ctl: u16,
        cap_offset: u32,
        state: Option<MsiConfigState>,
    ) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
            })
            .unwrap();

        let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap();

        self.interrupt.msi = Some(VfioMsi {
            cfg: msi_config,
            cap_offset,
            interrupt_source_group,
        });
    }
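    // Illustrative capability walk (made-up offsets): if config offset 0x34
    // (PCI_CONFIG_CAPABILITY_OFFSET) reads 0x40, the first capability header
    // lives at 0x40, with its ID byte at 0x40 and its next pointer at 0x41.
    // A chain such as 0x40 -> 0x60 -> 0x00 terminates on the NULL pointer,
    // which is exactly how the loops below walk the list.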
    pub(crate) fn get_msix_cap_idx(&self) -> Option<usize> {
        let mut cap_next = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        while cap_next != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());
            if PciCapabilityId::from(cap_id) == PciCapabilityId::MsiX {
                return Some(cap_next as usize);
            } else {
                cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
            }
        }

        None
    }

    pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) {
        let mut cap_next = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        let mut pci_express_cap_found = false;
        let mut power_management_cap_found = false;

        while cap_next != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());

            match PciCapabilityId::from(cap_id) {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI.
                            let msg_ctl = self.parse_msi_capabilities(cap_next);
                            self.initialize_msi(msg_ctl, cap_next as u32, None);
                        }
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
                    {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI-X.
                            let msix_cap = self.parse_msix_capabilities(cap_next);
                            self.initialize_msix(msix_cap, cap_next as u32, bdf, None);
                        }
                    }
                }
                PciCapabilityId::PciExpress => pci_express_cap_found = true,
                PciCapabilityId::PowerManagement => power_management_cap_found = true,
                _ => {}
            };

            cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
        }

        if pci_express_cap_found && power_management_cap_found {
            self.parse_extended_capabilities();
        }
    }

    fn parse_extended_capabilities(&mut self) {
        let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;

        loop {
            let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);

            let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
            let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;

            match PciExpressCapabilityId::from(cap_id) {
                PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation
                | PciExpressCapabilityId::ResizeableBar
                | PciExpressCapabilityId::SingleRootIoVirtualization => {
                    let reg_idx = (current_offset / 4) as usize;
                    self.patches.insert(
                        reg_idx,
                        ConfigPatch {
                            mask: 0x0000_ffff,
                            patch: PciExpressCapabilityId::NullCapability as u32,
                        },
                    );
                }
                _ => {}
            }

            if cap_next == 0 {
                break;
            }

            current_offset = cap_next.into();
        }
    }
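    // Illustrative extended-capability header decode (made-up value): a dword
    // of 0x1401_000e read at offset 0x100 means capability ID 0x000e (ARI),
    // version 1, and next pointer (0x1401_000e >> 20) & 0xfff = 0x140. For
    // the capability IDs patched above, read_config_register() later replaces
    // the ID field with the NullCapability ID so the guest skips them.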
    pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
        if let Some(intx) = &mut self.interrupt.intx {
            if !intx.enabled {
                if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
                    self.vfio_wrapper
                        .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
                        .map_err(VfioPciError::EnableIntx)?;

                    intx.enabled = true;
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }
        }

        Ok(())
    }

    pub(crate) fn disable_intx(&mut self) {
        if let Some(intx) = &mut self.interrupt.intx {
            if intx.enabled {
                if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                    error!("Could not disable INTx: {}", e);
                } else {
                    intx.enabled = false;
                }
            }
        }
    }

    pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
        if let Some(msi) = &self.interrupt.msi {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msi.cfg.num_enabled_vectors() {
                if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msi(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsi)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msi(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msi() {
            error!("Could not disable MSI: {}", e);
        }
    }

    pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
        if let Some(msix) = &self.interrupt.msix {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msix.bar.table_entries.len() {
                if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msix(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsix)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msix(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msix() {
            error!("Could not disable MSI-X: {}", e);
        }
    }

    pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
        if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
            if irq_info.count == 0 {
                // A count of 0 means the INTx IRQ is not supported, therefore
                // it shouldn't be initialized.
                return Ok(());
            }
        }

        if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            self.enable_intx()?;
        }

        Ok(())
    }

    pub(crate) fn update_msi_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msi(offset, data) {
            Some(InterruptUpdateAction::EnableMsi) => {
                // Disable INTx before we can enable MSI
                self.disable_intx();
                self.enable_msi()?;
            }
            Some(InterruptUpdateAction::DisableMsi) => {
                // Fall back onto INTx when disabling MSI
                self.disable_msi();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn update_msix_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msix(offset, data) {
            Some(InterruptUpdateAction::EnableMsix) => {
                // Disable INTx before we can enable MSI-X
                self.disable_intx();
                self.enable_msix()?;
            }
            Some(InterruptUpdateAction::DisableMsix) => {
                // Fall back onto INTx when disabling MSI-X
                self.disable_msix();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
        for region in self.mmio_regions.iter() {
            if addr >= region.start.raw_value()
                && addr < region.start.unchecked_add(region.length).raw_value()
            {
                return Some(region.clone());
            }
        }
        None
    }
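    // Illustrative lookup (made-up addresses): with a region starting at
    // GuestAddress(0xe000_0000) and a length of 0x4000, a guest access to
    // 0xe000_2010 resolves to that region, and read_bar()/write_bar() below
    // then work with offset 0x2010 within it.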
    pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_read_table(offset, data);
            } else {
                self.vfio_wrapper.region_read(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest reading from the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }
    }

    pub(crate) fn write_bar(
        &mut self,
        base: u64,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            // If the MSI-X table is written to, we need to update our cache.
            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_write_table(offset, data);
            } else {
                self.vfio_wrapper.region_write(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest writing to the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }

        None
    }
    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming the
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why, when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through the VFIO_DEVICE_SET_IRQS ioctl, and only then can we
        // write to the device region to update the MSI Enable bit.
        self.vfio_wrapper.write_config((reg + offset) as u32, data);

        None
    }

    pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        // When reading the BARs, we trap it and return what comes
        // from our local configuration space. We want the guest to
        // use that and not the VFIO device BARs, as the latter do not
        // map to the guest address space.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            return self.configuration.read_reg(reg_idx);
        }

        if let Some(id) = self.get_msix_cap_idx() {
            let msix = self.interrupt.msix.as_mut().unwrap();
            if reg_idx * 4 == id + 4 {
                return msix.cap.table;
            } else if reg_idx * 4 == id + 8 {
                return msix.cap.pba;
            }
        }

        // Since we don't support passing multi-function devices, we should
        // mask the multi-function bit, bit 7 of the Header Type byte in
        // register 3.
        let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
            0xff7f_ffff
        } else {
            0xffff_ffff
        };

        // The config register read comes from the VFIO device itself.
        let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;

        if let Some(config_patch) = self.patches.get(&reg_idx) {
            value = (value & !config_patch.mask) | config_patch.patch;
        }

        value
    }

    fn state(&self) -> VfioCommonState {
        let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
            enabled: intx.enabled,
        });

        let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
            cap: msi.cfg.cap,
            cap_offset: msi.cap_offset,
        });

        let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
            cap: msix.cap,
            cap_offset: msix.cap_offset,
            bdf: msix.bar.devid,
        });

        VfioCommonState {
            intx_state,
            msi_state,
            msix_state,
        }
    }

    fn set_state(
        &mut self,
        state: &VfioCommonState,
        msi_state: Option<MsiConfigState>,
        msix_state: Option<MsixConfigState>,
    ) -> Result<(), VfioPciError> {
        if let (Some(intx), Some(interrupt_source_group)) =
            (&state.intx_state, self.legacy_interrupt_group.clone())
        {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            if intx.enabled {
                self.enable_intx()?;
            }
        }

        if let Some(msi) = &state.msi_state {
            self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state);
        }

        if let Some(msix) = &state.msix_state {
            self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state);
        }

        Ok(())
    }
}

impl Pausable for VfioCommon {}
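// Illustrative only: the multi-function masking applied in
// read_config_register() above. Register 3 holds BIST/Header Type/Latency
// Timer/Cache Line Size; clearing bit 7 of the Header Type byte (bit 23 of
// the register) hides the multi-function flag from the guest. The register
// value is an assumption for the example, not data read from a device.
#[cfg(test)]
mod header_type_mask_example {
    #[test]
    fn multi_function_bit_is_masked() {
        let reg3: u32 = 0x0080_0010; // Header Type 0x80: multi-function device
        let masked = reg3 & 0xff7f_ffff;
        assert_eq!(masked, 0x0000_0010); // guest sees a single-function device
    }
}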
impl Snapshottable for VfioCommon {
    fn id(&self) -> String {
        String::from(VFIO_COMMON_ID)
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_common_snapshot = Snapshot::new_from_versioned_state(&self.state())?;

        // Snapshot PciConfiguration
        vfio_common_snapshot.add_snapshot(self.configuration.id(), self.configuration.snapshot()?);

        // Snapshot MSI
        if let Some(msi) = &mut self.interrupt.msi {
            vfio_common_snapshot.add_snapshot(msi.cfg.id(), msi.cfg.snapshot()?);
        }

        // Snapshot MSI-X
        if let Some(msix) = &mut self.interrupt.msix {
            vfio_common_snapshot.add_snapshot(msix.bar.id(), msix.bar.snapshot()?);
        }

        Ok(vfio_common_snapshot)
    }
}

/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
pub struct VfioPciDevice {
    id: String,
    vm: Arc<dyn hypervisor::Vm>,
    device: Arc<VfioDevice>,
    container: Arc<VfioContainer>,
    common: VfioCommon,
    iommu_attached: bool,
    memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
}

impl VfioPciDevice {
    /// Constructs a new VFIO PCI device for the given VFIO device
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        id: String,
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        iommu_attached: bool,
        bdf: PciBdf,
        memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
        snapshot: Option<Snapshot>,
    ) -> Result<Self, VfioPciError> {
        let device = Arc::new(device);
        device.reset();

        let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));

        let common = VfioCommon::new(
            msi_interrupt_manager,
            legacy_interrupt_group,
            Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
            &PciVfioSubclass::VfioSubclass,
            bdf,
            vm_migration::snapshot_from_id(snapshot.as_ref(), VFIO_COMMON_ID),
        )?;

        let vfio_pci_device = VfioPciDevice {
            id,
            vm: vm.clone(),
            device,
            container,
            common,
            iommu_attached,
            memory_slot,
        };

        Ok(vfio_pci_device)
    }

    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }
    fn generate_sparse_areas(
        caps: &[VfioRegionInfoCap],
        region_index: u32,
        region_start: u64,
        region_size: u64,
        vfio_msix: Option<&VfioMsix>,
    ) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
        for cap in caps {
            match cap {
                VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
                VfioRegionInfoCap::MsixMappable => {
                    if !is_4k_aligned(region_start) {
                        error!(
                            "Region start address 0x{:x} must be at least aligned on 4KiB",
                            region_start
                        );
                        return Err(VfioPciError::RegionAlignment);
                    }
                    if !is_4k_multiple(region_size) {
                        error!(
                            "Region size 0x{:x} must be at least a multiple of 4KiB",
                            region_size
                        );
                        return Err(VfioPciError::RegionSize);
                    }

                    // In case the region contains the MSI-X vectors table or
                    // the MSI-X PBA table, we must calculate the subregions
                    // around them, leading to a list of sparse areas.
                    // We want to make sure we will still trap MMIO accesses
                    // to these MSI-X specific ranges. If these regions don't
                    // align with the page size, we can achieve that by
                    // enlarging their range.
                    //
                    // Using a BTreeMap, as the list provided through the
                    // iterator is sorted by key. This ensures a proper split
                    // of the whole region.
                    let mut inter_ranges = BTreeMap::new();
                    if let Some(msix) = vfio_msix {
                        if region_index == msix.cap.table_bir() {
                            let (offset, size) = msix.cap.table_range();
                            let offset = align_page_size_down(offset);
                            let size = align_page_size_up(size);
                            inter_ranges.insert(offset, size);
                        }
                        if region_index == msix.cap.pba_bir() {
                            let (offset, size) = msix.cap.pba_range();
                            let offset = align_page_size_down(offset);
                            let size = align_page_size_up(size);
                            inter_ranges.insert(offset, size);
                        }
                    }

                    let mut sparse_areas = Vec::new();
                    let mut current_offset = 0;
                    for (range_offset, range_size) in inter_ranges {
                        if range_offset > current_offset {
                            sparse_areas.push(VfioRegionSparseMmapArea {
                                offset: current_offset,
                                size: range_offset - current_offset,
                            });
                        }
                        current_offset = align_page_size_down(range_offset + range_size);
                    }

                    if region_size > current_offset {
                        sparse_areas.push(VfioRegionSparseMmapArea {
                            offset: current_offset,
                            size: region_size - current_offset,
                        });
                    }

                    return Ok(sparse_areas);
                }
                _ => {}
            }
        }

        // In case no relevant capabilities have been found, create a single
        // sparse area corresponding to the entire MMIO region.
        Ok(vec![VfioRegionSparseMmapArea {
            offset: 0,
            size: region_size,
        }])
    }
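    // Worked example for generate_sparse_areas() above (illustrative numbers,
    // assuming a 4 KiB page size): for a 0x4000-byte region whose page-aligned
    // MSI-X table range is [0x1000, 0x2000), the loop produces two mmap'able
    // sparse areas, [0x0, 0x1000) and [0x2000, 0x4000), leaving the table
    // hole unmapped so MSI-X accesses are still trapped.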
    /// Map the MMIO regions into the guest, and avoid VM exits when the guest
    /// tries to reach those regions.
    pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
        let fd = self.device.as_raw_fd();

        for region in self.common.mmio_regions.iter_mut() {
            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }

                // Retrieve the list of capabilities found on the region
                let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
                    self.device.get_region_caps(region.index)
                } else {
                    Vec::new()
                };

                // Don't try to mmap the region if it contains the MSI-X table
                // or the MSI-X PBA subregion, and we couldn't find
                // MSIX_MAPPABLE in the list of supported capabilities.
                if let Some(msix) = self.common.interrupt.msix.as_ref() {
                    if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
                        && !caps.contains(&VfioRegionInfoCap::MsixMappable)
                    {
                        continue;
                    }
                }

                let mmap_size = self.device.get_region_size(region.index);
                let mmap_offset = self.device.get_region_offset(region.index);

                let sparse_areas = Self::generate_sparse_areas(
                    &caps,
                    region.index,
                    region.start.0,
                    mmap_size,
                    self.common.interrupt.msix.as_ref(),
                )?;

                for area in sparse_areas.iter() {
                    // SAFETY: FFI call with correct arguments
                    let host_addr = unsafe {
                        libc::mmap(
                            null_mut(),
                            area.size as usize,
                            prot,
                            libc::MAP_SHARED,
                            fd,
                            mmap_offset as libc::off_t + area.offset as libc::off_t,
                        )
                    };

                    if host_addr == libc::MAP_FAILED {
                        error!(
                            "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
                            area.offset,
                            area.size,
                            std::io::Error::last_os_error()
                        );
                        return Err(VfioPciError::MmapArea);
                    }

                    if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) {
                        warn!(
                            "Could not mmap sparse area that is not page size aligned (offset = 0x{:x}, size = 0x{:x})",
                            area.offset,
                            area.size,
                        );
                        return Ok(());
                    }

                    let user_memory_region = UserMemoryRegion {
                        slot: (self.memory_slot)(),
                        start: region.start.0 + area.offset,
                        size: area.size,
                        host_addr: host_addr as u64,
                    };

                    region.user_memory_regions.push(user_memory_region);

                    let mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(mem_region)
                        .map_err(VfioPciError::CreateUserMemoryRegion)?;
                }
            }
        }

        Ok(())
    }

    pub fn unmap_mmio_regions(&mut self) {
        for region in self.common.mmio_regions.iter() {
            for user_memory_region in region.user_memory_regions.iter() {
                // Remove the region
                let r = self.vm.make_user_memory_region(
                    user_memory_region.slot,
                    user_memory_region.start,
                    user_memory_region.size,
                    user_memory_region.host_addr,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                // SAFETY: FFI call with correct arguments
                let ret = unsafe {
                    libc::munmap(
                        user_memory_region.host_addr as *mut libc::c_void,
                        user_memory_region.size as usize,
                    )
                };
                if ret != 0 {
                    error!(
                        "Could not unmap region {}, error: {}",
                        region.index,
                        io::Error::last_os_error()
                    );
                }
            }
        }
    }

    pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_map(iova, size, user_addr)
                .map_err(VfioPciError::DmaMap)?;
        }

        Ok(())
    }

    pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_unmap(iova, size)
                .map_err(VfioPciError::DmaUnmap)?;
        }

        Ok(())
    }

    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.common.mmio_regions.clone()
    }
}
impl Drop for VfioPciDevice {
    fn drop(&mut self) {
        self.unmap_mmio_regions();

        if let Some(msix) = &self.common.interrupt.msix {
            if msix.bar.enabled() {
                self.common.disable_msix();
            }
        }

        if let Some(msi) = &self.common.interrupt.msi {
            if msi.cfg.enabled() {
                self.common.disable_msi()
            }
        }

        if self.common.interrupt.intx_in_use() {
            self.common.disable_intx();
        }
    }
}

impl BusDevice for VfioPciDevice {
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}

// First BAR offset in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// Extended capabilities register offset in the PCI config space.
const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
// I/O BAR flag: the first BAR bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory BAR flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// Prefetchable BAR bit.
const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device.
const BAR_NUMS: usize = 6;
// PCI Header Type register index.
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index.
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index.
const PCI_ROM_EXP_BAR_INDEX: usize = 12;
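// Illustrative only: how the register indices above map to config-space byte
// offsets (offset = index * PCI_CONFIG_REGISTER_SIZE). These relations are
// fixed by the PCI spec and are restated here as a self-check.
#[cfg(test)]
mod config_register_layout_example {
    use super::{BAR_NUMS, PCI_CONFIG_BAR0_INDEX, PCI_ROM_EXP_BAR_INDEX};

    #[test]
    fn bar_register_offsets() {
        // BAR0..BAR5 occupy registers 4..=9, i.e. offsets 0x10..=0x24.
        assert_eq!(PCI_CONFIG_BAR0_INDEX * 4, 0x10);
        assert_eq!((PCI_CONFIG_BAR0_INDEX + BAR_NUMS - 1) * 4, 0x24);
        // The expansion ROM BAR lives at register 12, offset 0x30.
        assert_eq!(PCI_ROM_EXP_BAR_INDEX * 4, 0x30);
    }
}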
impl PciDevice for VfioPciDevice {
    fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        self.common
            .allocate_bars(allocator, mmio32_allocator, mmio64_allocator, resources)
    }

    fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        self.common
            .free_bars(allocator, mmio32_allocator, mmio64_allocator)
    }

    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        self.common.write_config_register(reg_idx, offset, data)
    }

    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        self.common.read_config_register(reg_idx)
    }

    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.common
            .configuration
            .detect_bar_reprogramming(reg_idx, data)
    }

    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.common.read_bar(base, offset, data)
    }

    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.common.write_bar(base, offset, data)
    }

    fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
        for region in self.common.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                for user_memory_region in region.user_memory_regions.iter_mut() {
                    // Remove the old region
                    let old_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .remove_user_memory_region(old_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

                    // Update the user memory region with the correct start address.
                    if new_base > old_base {
                        user_memory_region.start += new_base - old_base;
                    } else {
                        user_memory_region.start -= old_base - new_base;
                    }

                    // Insert the new region
                    let new_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(new_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
                }
            }
        }

        Ok(())
    }

    fn as_any(&mut self) -> &mut dyn Any {
        self
    }

    fn id(&self) -> Option<String> {
        Some(self.id.clone())
    }
}

impl Pausable for VfioPciDevice {}

impl Snapshottable for VfioPciDevice {
    fn id(&self) -> String {
        self.id.clone()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_pci_dev_snapshot = Snapshot::default();

        // Snapshot VfioCommon
        vfio_pci_dev_snapshot.add_snapshot(self.common.id(), self.common.snapshot()?);

        Ok(vfio_pci_dev_snapshot)
    }
}

impl Transportable for VfioPciDevice {}
impl Migratable for VfioPciDevice {}