1 // Copyright © 2019 Intel Corporation 2 // 3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause 4 // 5 6 use crate::msi::{MsiConfigState, MSI_CONFIG_ID}; 7 use crate::msix::MsixConfigState; 8 use crate::{ 9 msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig, 10 PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId, 11 PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId, 12 PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID, 13 }; 14 use anyhow::anyhow; 15 use byteorder::{ByteOrder, LittleEndian}; 16 use hypervisor::HypervisorVmError; 17 use std::any::Any; 18 use std::collections::{BTreeMap, HashMap}; 19 use std::io; 20 use std::os::unix::io::AsRawFd; 21 use std::ptr::null_mut; 22 use std::sync::{Arc, Barrier, Mutex}; 23 use thiserror::Error; 24 use versionize::{VersionMap, Versionize, VersionizeResult}; 25 use versionize_derive::Versionize; 26 use vfio_bindings::bindings::vfio::*; 27 use vfio_ioctls::{ 28 VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea, 29 }; 30 use vm_allocator::{AddressAllocator, SystemAllocator}; 31 use vm_device::interrupt::{ 32 InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig, 33 }; 34 use vm_device::{BusDevice, Resource}; 35 use vm_memory::{Address, GuestAddress, GuestUsize}; 36 use vm_migration::{ 37 Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped, 38 }; 39 use vmm_sys_util::eventfd::EventFd; 40 41 pub(crate) const VFIO_COMMON_ID: &str = "vfio_common"; 42 43 #[derive(Debug, Error)] 44 pub enum VfioPciError { 45 #[error("Failed to create user memory region: {0}")] 46 CreateUserMemoryRegion(#[source] HypervisorVmError), 47 #[error("Failed to DMA map: {0}")] 48 DmaMap(#[source] vfio_ioctls::VfioError), 49 #[error("Failed to DMA unmap: {0}")] 50 DmaUnmap(#[source] vfio_ioctls::VfioError), 51 #[error("Failed 
to enable INTx: {0}")] 52 EnableIntx(#[source] VfioError), 53 #[error("Failed to enable MSI: {0}")] 54 EnableMsi(#[source] VfioError), 55 #[error("Failed to enable MSI-x: {0}")] 56 EnableMsix(#[source] VfioError), 57 #[error("Failed to mmap the area")] 58 MmapArea, 59 #[error("Failed to notifier's eventfd")] 60 MissingNotifier, 61 #[error("Invalid region alignment")] 62 RegionAlignment, 63 #[error("Invalid region size")] 64 RegionSize, 65 #[error("Failed to retrieve MsiConfigState: {0}")] 66 RetrieveMsiConfigState(#[source] anyhow::Error), 67 #[error("Failed to retrieve MsixConfigState: {0}")] 68 RetrieveMsixConfigState(#[source] anyhow::Error), 69 #[error("Failed to retrieve PciConfigurationState: {0}")] 70 RetrievePciConfigurationState(#[source] anyhow::Error), 71 #[error("Failed to retrieve VfioCommonState: {0}")] 72 RetrieveVfioCommonState(#[source] anyhow::Error), 73 } 74 75 #[derive(Copy, Clone)] 76 enum PciVfioSubclass { 77 VfioSubclass = 0xff, 78 } 79 80 impl PciSubclass for PciVfioSubclass { 81 fn get_register_value(&self) -> u8 { 82 *self as u8 83 } 84 } 85 86 enum InterruptUpdateAction { 87 EnableMsi, 88 DisableMsi, 89 EnableMsix, 90 DisableMsix, 91 } 92 93 #[derive(Versionize)] 94 struct IntxState { 95 enabled: bool, 96 } 97 98 pub(crate) struct VfioIntx { 99 interrupt_source_group: Arc<dyn InterruptSourceGroup>, 100 enabled: bool, 101 } 102 103 #[derive(Versionize)] 104 struct MsiState { 105 cap: MsiCap, 106 cap_offset: u32, 107 } 108 109 pub(crate) struct VfioMsi { 110 pub(crate) cfg: MsiConfig, 111 cap_offset: u32, 112 interrupt_source_group: Arc<dyn InterruptSourceGroup>, 113 } 114 115 impl VfioMsi { 116 fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> { 117 let old_enabled = self.cfg.enabled(); 118 119 self.cfg.update(offset, data); 120 121 let new_enabled = self.cfg.enabled(); 122 123 if !old_enabled && new_enabled { 124 return Some(InterruptUpdateAction::EnableMsi); 125 } 126 127 if old_enabled && !new_enabled { 128 
return Some(InterruptUpdateAction::DisableMsi); 129 } 130 131 None 132 } 133 } 134 135 #[derive(Versionize)] 136 struct MsixState { 137 cap: MsixCap, 138 cap_offset: u32, 139 bdf: u32, 140 } 141 142 pub(crate) struct VfioMsix { 143 pub(crate) bar: MsixConfig, 144 cap: MsixCap, 145 cap_offset: u32, 146 interrupt_source_group: Arc<dyn InterruptSourceGroup>, 147 } 148 149 impl VfioMsix { 150 fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> { 151 let old_enabled = self.bar.enabled(); 152 153 // Update "Message Control" word 154 if offset == 2 && data.len() == 2 { 155 self.bar.set_msg_ctl(LittleEndian::read_u16(data)); 156 } 157 158 let new_enabled = self.bar.enabled(); 159 160 if !old_enabled && new_enabled { 161 return Some(InterruptUpdateAction::EnableMsix); 162 } 163 164 if old_enabled && !new_enabled { 165 return Some(InterruptUpdateAction::DisableMsix); 166 } 167 168 None 169 } 170 171 fn table_accessed(&self, bar_index: u32, offset: u64) -> bool { 172 let table_offset: u64 = u64::from(self.cap.table_offset()); 173 let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64); 174 let table_bir: u32 = self.cap.table_bir(); 175 176 bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size 177 } 178 } 179 180 pub(crate) struct Interrupt { 181 pub(crate) intx: Option<VfioIntx>, 182 pub(crate) msi: Option<VfioMsi>, 183 pub(crate) msix: Option<VfioMsix>, 184 } 185 186 impl Interrupt { 187 fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> { 188 if let Some(ref mut msi) = &mut self.msi { 189 let action = msi.update(offset, data); 190 return action; 191 } 192 193 None 194 } 195 196 fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> { 197 if let Some(ref mut msix) = &mut self.msix { 198 let action = msix.update(offset, data); 199 return action; 200 } 201 202 None 203 } 204 205 fn accessed(&self, offset: u64) -> 
Option<(PciCapabilityId, u64)> { 206 if let Some(msi) = &self.msi { 207 if offset >= u64::from(msi.cap_offset) 208 && offset < u64::from(msi.cap_offset) + msi.cfg.size() 209 { 210 return Some(( 211 PciCapabilityId::MessageSignalledInterrupts, 212 u64::from(msi.cap_offset), 213 )); 214 } 215 } 216 217 if let Some(msix) = &self.msix { 218 if offset == u64::from(msix.cap_offset) { 219 return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset))); 220 } 221 } 222 223 None 224 } 225 226 fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool { 227 if let Some(msix) = &self.msix { 228 return msix.table_accessed(bar_index, offset); 229 } 230 231 false 232 } 233 234 fn msix_write_table(&mut self, offset: u64, data: &[u8]) { 235 if let Some(ref mut msix) = &mut self.msix { 236 let offset = offset - u64::from(msix.cap.table_offset()); 237 msix.bar.write_table(offset, data) 238 } 239 } 240 241 fn msix_read_table(&self, offset: u64, data: &mut [u8]) { 242 if let Some(msix) = &self.msix { 243 let offset = offset - u64::from(msix.cap.table_offset()); 244 msix.bar.read_table(offset, data) 245 } 246 } 247 248 pub(crate) fn intx_in_use(&self) -> bool { 249 if let Some(intx) = &self.intx { 250 return intx.enabled; 251 } 252 253 false 254 } 255 } 256 257 #[derive(Copy, Clone)] 258 pub struct UserMemoryRegion { 259 pub slot: u32, 260 pub start: u64, 261 pub size: u64, 262 pub host_addr: u64, 263 } 264 265 #[derive(Clone)] 266 pub struct MmioRegion { 267 pub start: GuestAddress, 268 pub length: GuestUsize, 269 pub(crate) type_: PciBarRegionType, 270 pub(crate) index: u32, 271 pub(crate) user_memory_regions: Vec<UserMemoryRegion>, 272 } 273 #[derive(Debug, Error)] 274 pub enum VfioError { 275 #[error("Kernel VFIO error: {0}")] 276 KernelVfio(#[source] vfio_ioctls::VfioError), 277 #[error("VFIO user error: {0}")] 278 VfioUser(#[source] vfio_user::Error), 279 } 280 281 pub(crate) trait Vfio: Send + Sync { 282 fn read_config_byte(&self, offset: u32) -> u8 { 283 let mut data: 
[u8; 1] = [0]; 284 self.read_config(offset, &mut data); 285 data[0] 286 } 287 288 fn read_config_word(&self, offset: u32) -> u16 { 289 let mut data: [u8; 2] = [0, 0]; 290 self.read_config(offset, &mut data); 291 u16::from_le_bytes(data) 292 } 293 294 fn read_config_dword(&self, offset: u32) -> u32 { 295 let mut data: [u8; 4] = [0, 0, 0, 0]; 296 self.read_config(offset, &mut data); 297 u32::from_le_bytes(data) 298 } 299 300 fn write_config_dword(&self, offset: u32, buf: u32) { 301 let data: [u8; 4] = buf.to_le_bytes(); 302 self.write_config(offset, &data) 303 } 304 305 fn read_config(&self, offset: u32, data: &mut [u8]) { 306 self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut()); 307 } 308 309 fn write_config(&self, offset: u32, data: &[u8]) { 310 self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data) 311 } 312 313 fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> { 314 self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds) 315 } 316 317 fn disable_msi(&self) -> Result<(), VfioError> { 318 self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX) 319 } 320 321 fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> { 322 self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds) 323 } 324 325 fn disable_msix(&self) -> Result<(), VfioError> { 326 self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX) 327 } 328 329 fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) { 330 unimplemented!() 331 } 332 333 fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) { 334 unimplemented!() 335 } 336 337 fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> { 338 unimplemented!() 339 } 340 341 fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> { 342 unimplemented!() 343 } 344 345 fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> { 346 unimplemented!() 347 } 348 349 fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> { 350 unimplemented!() 351 } 352 } 353 354 struct 
VfioDeviceWrapper { 355 device: Arc<VfioDevice>, 356 } 357 358 impl VfioDeviceWrapper { 359 fn new(device: Arc<VfioDevice>) -> Self { 360 Self { device } 361 } 362 } 363 364 impl Vfio for VfioDeviceWrapper { 365 fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) { 366 self.device.region_read(index, data, offset) 367 } 368 369 fn region_write(&self, index: u32, offset: u64, data: &[u8]) { 370 self.device.region_write(index, data, offset) 371 } 372 373 fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> { 374 self.device.get_irq_info(irq_index).copied() 375 } 376 377 fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> { 378 self.device 379 .enable_irq(irq_index, event_fds) 380 .map_err(VfioError::KernelVfio) 381 } 382 383 fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> { 384 self.device 385 .disable_irq(irq_index) 386 .map_err(VfioError::KernelVfio) 387 } 388 389 fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> { 390 self.device 391 .unmask_irq(irq_index) 392 .map_err(VfioError::KernelVfio) 393 } 394 } 395 396 #[derive(Versionize)] 397 struct VfioCommonState { 398 intx_state: Option<IntxState>, 399 msi_state: Option<MsiState>, 400 msix_state: Option<MsixState>, 401 } 402 403 impl VersionMapped for VfioCommonState {} 404 405 pub(crate) struct ConfigPatch { 406 mask: u32, 407 patch: u32, 408 } 409 410 pub(crate) struct VfioCommon { 411 pub(crate) configuration: PciConfiguration, 412 pub(crate) mmio_regions: Vec<MmioRegion>, 413 pub(crate) interrupt: Interrupt, 414 pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>, 415 pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>, 416 pub(crate) vfio_wrapper: Arc<dyn Vfio>, 417 pub(crate) patches: HashMap<usize, ConfigPatch>, 418 } 419 420 impl VfioCommon { 421 pub(crate) fn new( 422 msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>, 423 
legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>, 424 vfio_wrapper: Arc<dyn Vfio>, 425 subclass: &dyn PciSubclass, 426 bdf: PciBdf, 427 snapshot: Option<Snapshot>, 428 ) -> Result<Self, VfioPciError> { 429 let pci_configuration_state = 430 vm_migration::versioned_state_from_id(snapshot.as_ref(), PCI_CONFIGURATION_ID) 431 .map_err(|e| { 432 VfioPciError::RetrievePciConfigurationState(anyhow!( 433 "Failed to get PciConfigurationState from Snapshot: {}", 434 e 435 )) 436 })?; 437 438 let configuration = PciConfiguration::new( 439 0, 440 0, 441 0, 442 PciClassCode::Other, 443 subclass, 444 None, 445 PciHeaderType::Device, 446 0, 447 0, 448 None, 449 pci_configuration_state, 450 ); 451 452 let mut vfio_common = VfioCommon { 453 mmio_regions: Vec::new(), 454 configuration, 455 interrupt: Interrupt { 456 intx: None, 457 msi: None, 458 msix: None, 459 }, 460 msi_interrupt_manager, 461 legacy_interrupt_group, 462 vfio_wrapper, 463 patches: HashMap::new(), 464 }; 465 466 let state: Option<VfioCommonState> = snapshot 467 .as_ref() 468 .map(|s| s.to_versioned_state()) 469 .transpose() 470 .map_err(|e| { 471 VfioPciError::RetrieveVfioCommonState(anyhow!( 472 "Failed to get VfioCommonState from Snapshot: {}", 473 e 474 )) 475 })?; 476 let msi_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSI_CONFIG_ID) 477 .map_err(|e| { 478 VfioPciError::RetrieveMsiConfigState(anyhow!( 479 "Failed to get MsiConfigState from Snapshot: {}", 480 e 481 )) 482 })?; 483 let msix_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSIX_CONFIG_ID) 484 .map_err(|e| { 485 VfioPciError::RetrieveMsixConfigState(anyhow!( 486 "Failed to get MsixConfigState from Snapshot: {}", 487 e 488 )) 489 })?; 490 491 if let Some(state) = state.as_ref() { 492 vfio_common.set_state(state, msi_state, msix_state)?; 493 } else { 494 vfio_common.parse_capabilities(bdf); 495 vfio_common.initialize_legacy_interrupt()?; 496 } 497 498 Ok(vfio_common) 499 } 500 501 pub(crate) fn 
allocate_bars( 502 &mut self, 503 allocator: &Arc<Mutex<SystemAllocator>>, 504 mmio_allocator: &mut AddressAllocator, 505 resources: Option<Vec<Resource>>, 506 ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> { 507 let mut bars = Vec::new(); 508 let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX; 509 510 // Going through all regular regions to compute the BAR size. 511 // We're not saving the BAR address to restore it, because we 512 // are going to allocate a guest address for each BAR and write 513 // that new address back. 514 while bar_id < VFIO_PCI_CONFIG_REGION_INDEX { 515 let mut region_size: u64 = 0; 516 let mut region_type = PciBarRegionType::Memory32BitRegion; 517 let mut prefetchable = PciBarPrefetchable::NotPrefetchable; 518 let mut flags: u32 = 0; 519 520 let mut restored_bar_addr = None; 521 if let Some(resources) = &resources { 522 for resource in resources { 523 if let Resource::PciBar { 524 index, 525 base, 526 size, 527 type_, 528 .. 529 } = resource 530 { 531 if *index == bar_id as usize { 532 restored_bar_addr = Some(GuestAddress(*base)); 533 region_size = *size; 534 region_type = PciBarRegionType::from(*type_); 535 break; 536 } 537 } 538 } 539 if restored_bar_addr.is_none() { 540 bar_id += 1; 541 continue; 542 } 543 } else { 544 let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX { 545 (PCI_ROM_EXP_BAR_INDEX * 4) as u32 546 } else { 547 PCI_CONFIG_BAR_OFFSET + bar_id * 4 548 }; 549 550 // First read flags 551 flags = self.vfio_wrapper.read_config_dword(bar_offset); 552 553 // Is this an IO BAR? 554 let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX { 555 matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR) 556 } else { 557 false 558 }; 559 560 // Is this a 64-bit BAR? 
561 let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX { 562 matches!( 563 flags & PCI_CONFIG_MEMORY_BAR_64BIT, 564 PCI_CONFIG_MEMORY_BAR_64BIT 565 ) 566 } else { 567 false 568 }; 569 570 if matches!( 571 flags & PCI_CONFIG_BAR_PREFETCHABLE, 572 PCI_CONFIG_BAR_PREFETCHABLE 573 ) { 574 prefetchable = PciBarPrefetchable::Prefetchable 575 }; 576 577 // To get size write all 1s 578 self.vfio_wrapper 579 .write_config_dword(bar_offset, 0xffff_ffff); 580 581 // And read back BAR value. The device will write zeros for bits it doesn't care about 582 let mut lower = self.vfio_wrapper.read_config_dword(bar_offset); 583 584 if io_bar { 585 // Mask flag bits (lowest 2 for I/O bars) 586 lower &= !0b11; 587 588 // BAR is not enabled 589 if lower == 0 { 590 bar_id += 1; 591 continue; 592 } 593 594 // IO BAR 595 region_type = PciBarRegionType::IoRegion; 596 597 // Invert bits and add 1 to calculate size 598 region_size = (!lower + 1) as u64; 599 } else if is_64bit_bar { 600 // 64 bits Memory BAR 601 region_type = PciBarRegionType::Memory64BitRegion; 602 603 // Query size of upper BAR of 64-bit BAR 604 let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4; 605 self.vfio_wrapper 606 .write_config_dword(upper_offset, 0xffff_ffff); 607 let upper = self.vfio_wrapper.read_config_dword(upper_offset); 608 609 let mut combined_size = u64::from(upper) << 32 | u64::from(lower); 610 611 // Mask out flag bits (lowest 4 for memory bars) 612 combined_size &= !0b1111; 613 614 // BAR is not enabled 615 if combined_size == 0 { 616 bar_id += 1; 617 continue; 618 } 619 620 // Invert and add 1 to to find size 621 region_size = !combined_size + 1; 622 } else { 623 region_type = PciBarRegionType::Memory32BitRegion; 624 625 // Mask out flag bits (lowest 4 for memory bars) 626 lower &= !0b1111; 627 628 if lower == 0 { 629 bar_id += 1; 630 continue; 631 } 632 633 // Invert and add 1 to to find size 634 region_size = (!lower + 1) as u64; 635 } 636 } 637 638 let bar_addr = match region_type 
{ 639 PciBarRegionType::IoRegion => { 640 #[cfg(target_arch = "aarch64")] 641 unimplemented!(); 642 643 // The address needs to be 4 bytes aligned. 644 #[cfg(not(target_arch = "aarch64"))] 645 allocator 646 .lock() 647 .unwrap() 648 .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4)) 649 .ok_or(PciDeviceError::IoAllocationFailed(region_size))? 650 } 651 PciBarRegionType::Memory32BitRegion => { 652 // BAR allocation must be naturally aligned 653 allocator 654 .lock() 655 .unwrap() 656 .allocate_mmio_hole_addresses( 657 restored_bar_addr, 658 region_size, 659 Some(region_size), 660 ) 661 .ok_or(PciDeviceError::IoAllocationFailed(region_size))? 662 } 663 PciBarRegionType::Memory64BitRegion => { 664 // BAR allocation must be naturally aligned 665 mmio_allocator 666 .allocate(restored_bar_addr, region_size, Some(region_size)) 667 .ok_or(PciDeviceError::IoAllocationFailed(region_size))? 668 } 669 }; 670 671 // We can now build our BAR configuration block. 672 let bar = PciBarConfiguration::default() 673 .set_index(bar_id as usize) 674 .set_address(bar_addr.raw_value()) 675 .set_size(region_size) 676 .set_region_type(region_type) 677 .set_prefetchable(prefetchable); 678 679 if bar_id == VFIO_PCI_ROM_REGION_INDEX { 680 self.configuration 681 .add_pci_rom_bar(&bar, flags & 0x1) 682 .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?; 683 } else { 684 self.configuration 685 .add_pci_bar(&bar) 686 .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?; 687 } 688 689 bars.push(bar); 690 self.mmio_regions.push(MmioRegion { 691 start: bar_addr, 692 length: region_size, 693 type_: region_type, 694 index: bar_id, 695 user_memory_regions: Vec::new(), 696 }); 697 698 bar_id += 1; 699 if region_type == PciBarRegionType::Memory64BitRegion { 700 bar_id += 1; 701 } 702 } 703 704 Ok(bars) 705 } 706 707 pub(crate) fn free_bars( 708 &mut self, 709 allocator: &mut SystemAllocator, 710 mmio_allocator: &mut AddressAllocator, 711 ) 
-> Result<(), PciDeviceError> { 712 for region in self.mmio_regions.iter() { 713 match region.type_ { 714 PciBarRegionType::IoRegion => { 715 #[cfg(target_arch = "x86_64")] 716 allocator.free_io_addresses(region.start, region.length); 717 #[cfg(target_arch = "aarch64")] 718 error!("I/O region is not supported"); 719 } 720 PciBarRegionType::Memory32BitRegion => { 721 allocator.free_mmio_hole_addresses(region.start, region.length); 722 } 723 PciBarRegionType::Memory64BitRegion => { 724 mmio_allocator.free(region.start, region.length); 725 } 726 } 727 } 728 Ok(()) 729 } 730 731 pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap { 732 let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into()); 733 734 let table = self.vfio_wrapper.read_config_dword((cap + 4).into()); 735 736 let pba = self.vfio_wrapper.read_config_dword((cap + 8).into()); 737 738 MsixCap { 739 msg_ctl, 740 table, 741 pba, 742 } 743 } 744 745 pub(crate) fn initialize_msix( 746 &mut self, 747 msix_cap: MsixCap, 748 cap_offset: u32, 749 bdf: PciBdf, 750 state: Option<MsixConfigState>, 751 ) { 752 let interrupt_source_group = self 753 .msi_interrupt_manager 754 .create_group(MsiIrqGroupConfig { 755 base: 0, 756 count: msix_cap.table_size() as InterruptIndex, 757 }) 758 .unwrap(); 759 760 let msix_config = MsixConfig::new( 761 msix_cap.table_size(), 762 interrupt_source_group.clone(), 763 bdf.into(), 764 state, 765 ) 766 .unwrap(); 767 768 self.interrupt.msix = Some(VfioMsix { 769 bar: msix_config, 770 cap: msix_cap, 771 cap_offset, 772 interrupt_source_group, 773 }); 774 } 775 776 pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 { 777 self.vfio_wrapper.read_config_word((cap + 2).into()) 778 } 779 780 pub(crate) fn initialize_msi( 781 &mut self, 782 msg_ctl: u16, 783 cap_offset: u32, 784 state: Option<MsiConfigState>, 785 ) { 786 let interrupt_source_group = self 787 .msi_interrupt_manager 788 .create_group(MsiIrqGroupConfig { 789 base: 0, 790 count: 
msi_num_enabled_vectors(msg_ctl) as InterruptIndex, 791 }) 792 .unwrap(); 793 794 let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap(); 795 796 self.interrupt.msi = Some(VfioMsi { 797 cfg: msi_config, 798 cap_offset, 799 interrupt_source_group, 800 }); 801 } 802 803 pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) { 804 let mut cap_next = self 805 .vfio_wrapper 806 .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET); 807 808 let mut pci_express_cap_found = false; 809 let mut power_management_cap_found = false; 810 811 while cap_next != 0 { 812 let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into()); 813 814 match PciCapabilityId::from(cap_id) { 815 PciCapabilityId::MessageSignalledInterrupts => { 816 if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) { 817 if irq_info.count > 0 { 818 // Parse capability only if the VFIO device 819 // supports MSI. 820 let msg_ctl = self.parse_msi_capabilities(cap_next); 821 self.initialize_msi(msg_ctl, cap_next as u32, None); 822 } 823 } 824 } 825 PciCapabilityId::MsiX => { 826 if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX) 827 { 828 if irq_info.count > 0 { 829 // Parse capability only if the VFIO device 830 // supports MSI-X. 
831 let msix_cap = self.parse_msix_capabilities(cap_next); 832 self.initialize_msix(msix_cap, cap_next as u32, bdf, None); 833 } 834 } 835 } 836 PciCapabilityId::PciExpress => pci_express_cap_found = true, 837 PciCapabilityId::PowerManagement => power_management_cap_found = true, 838 _ => {} 839 }; 840 841 cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into()); 842 } 843 844 if pci_express_cap_found && power_management_cap_found { 845 self.parse_extended_capabilities(); 846 } 847 } 848 849 fn parse_extended_capabilities(&mut self) { 850 let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET; 851 852 loop { 853 let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset); 854 855 let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16; 856 let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16; 857 858 match PciExpressCapabilityId::from(cap_id) { 859 PciExpressCapabilityId::AlternativeRoutingIdentificationIntepretation 860 | PciExpressCapabilityId::ResizeableBar 861 | PciExpressCapabilityId::SingleRootIoVirtualization => { 862 let reg_idx = (current_offset / 4) as usize; 863 self.patches.insert( 864 reg_idx, 865 ConfigPatch { 866 mask: 0x0000_ffff, 867 patch: PciExpressCapabilityId::NullCapability as u32, 868 }, 869 ); 870 } 871 _ => {} 872 } 873 874 if cap_next == 0 { 875 break; 876 } 877 878 current_offset = cap_next.into(); 879 } 880 } 881 882 pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> { 883 if let Some(intx) = &mut self.interrupt.intx { 884 if !intx.enabled { 885 if let Some(eventfd) = intx.interrupt_source_group.notifier(0) { 886 self.vfio_wrapper 887 .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd]) 888 .map_err(VfioPciError::EnableIntx)?; 889 890 intx.enabled = true; 891 } else { 892 return Err(VfioPciError::MissingNotifier); 893 } 894 } 895 } 896 897 Ok(()) 898 } 899 900 pub(crate) fn disable_intx(&mut self) { 901 if let Some(intx) = &mut self.interrupt.intx { 902 if intx.enabled { 903 if let Err(e) = 
self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) { 904 error!("Could not disable INTx: {}", e); 905 } else { 906 intx.enabled = false; 907 } 908 } 909 } 910 } 911 912 pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> { 913 if let Some(msi) = &self.interrupt.msi { 914 let mut irq_fds: Vec<EventFd> = Vec::new(); 915 for i in 0..msi.cfg.num_enabled_vectors() { 916 if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) { 917 irq_fds.push(eventfd); 918 } else { 919 return Err(VfioPciError::MissingNotifier); 920 } 921 } 922 923 self.vfio_wrapper 924 .enable_msi(irq_fds.iter().collect()) 925 .map_err(VfioPciError::EnableMsi)?; 926 } 927 928 Ok(()) 929 } 930 931 pub(crate) fn disable_msi(&self) { 932 if let Err(e) = self.vfio_wrapper.disable_msi() { 933 error!("Could not disable MSI: {}", e); 934 } 935 } 936 937 pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> { 938 if let Some(msix) = &self.interrupt.msix { 939 let mut irq_fds: Vec<EventFd> = Vec::new(); 940 for i in 0..msix.bar.table_entries.len() { 941 if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) { 942 irq_fds.push(eventfd); 943 } else { 944 return Err(VfioPciError::MissingNotifier); 945 } 946 } 947 948 self.vfio_wrapper 949 .enable_msix(irq_fds.iter().collect()) 950 .map_err(VfioPciError::EnableMsix)?; 951 } 952 953 Ok(()) 954 } 955 956 pub(crate) fn disable_msix(&self) { 957 if let Err(e) = self.vfio_wrapper.disable_msix() { 958 error!("Could not disable MSI-X: {}", e); 959 } 960 } 961 962 pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> { 963 if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) { 964 if irq_info.count == 0 { 965 // A count of 0 means the INTx IRQ is not supported, therefore 966 // it shouldn't be initialized. 
967 return Ok(()); 968 } 969 } 970 971 if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() { 972 self.interrupt.intx = Some(VfioIntx { 973 interrupt_source_group, 974 enabled: false, 975 }); 976 977 self.enable_intx()?; 978 } 979 980 Ok(()) 981 } 982 983 pub(crate) fn update_msi_capabilities( 984 &mut self, 985 offset: u64, 986 data: &[u8], 987 ) -> Result<(), VfioPciError> { 988 match self.interrupt.update_msi(offset, data) { 989 Some(InterruptUpdateAction::EnableMsi) => { 990 // Disable INTx before we can enable MSI 991 self.disable_intx(); 992 self.enable_msi()?; 993 } 994 Some(InterruptUpdateAction::DisableMsi) => { 995 // Fallback onto INTx when disabling MSI 996 self.disable_msi(); 997 self.enable_intx()?; 998 } 999 _ => {} 1000 } 1001 1002 Ok(()) 1003 } 1004 1005 pub(crate) fn update_msix_capabilities( 1006 &mut self, 1007 offset: u64, 1008 data: &[u8], 1009 ) -> Result<(), VfioPciError> { 1010 match self.interrupt.update_msix(offset, data) { 1011 Some(InterruptUpdateAction::EnableMsix) => { 1012 // Disable INTx before we can enable MSI-X 1013 self.disable_intx(); 1014 self.enable_msix()?; 1015 } 1016 Some(InterruptUpdateAction::DisableMsix) => { 1017 // Fallback onto INTx when disabling MSI-X 1018 self.disable_msix(); 1019 self.enable_intx()?; 1020 } 1021 _ => {} 1022 } 1023 1024 Ok(()) 1025 } 1026 1027 pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> { 1028 for region in self.mmio_regions.iter() { 1029 if addr >= region.start.raw_value() 1030 && addr < region.start.unchecked_add(region.length).raw_value() 1031 { 1032 return Some(region.clone()); 1033 } 1034 } 1035 None 1036 } 1037 1038 pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) { 1039 let addr = base + offset; 1040 if let Some(region) = self.find_region(addr) { 1041 let offset = addr - region.start.raw_value(); 1042 1043 if self.interrupt.msix_table_accessed(region.index, offset) { 1044 self.interrupt.msix_read_table(offset, data); 1045 
            } else {
                // Not an MSI-X table access: read straight from the device region.
                self.vfio_wrapper.region_read(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest reading from the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }
    }

    /// Handles a guest write to a BAR.
    ///
    /// Writes targeting the MSI-X table are applied to the local MSI-X cache
    /// only, while any other access is forwarded to the VFIO device region.
    /// Always returns `None`: no barrier is needed to complete the write.
    pub(crate) fn write_bar(
        &mut self,
        base: u64,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            // Offset relative to the start of the matching MMIO region.
            let offset = addr - region.start.raw_value();

            // If the MSI-X table is written to, we need to update our cache.
            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_write_table(offset, data);
            } else {
                self.vfio_wrapper.region_write(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest writing to the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }

        None
    }

    /// Handles a guest write to the PCI configuration space.
    ///
    /// BAR (and expansion ROM BAR) writes are trapped into the local
    /// `PciConfiguration` cache and never forwarded to the device. Writes
    /// touching the MSI/MSI-X capabilities update the local interrupt caches
    /// (possibly toggling VFIO interrupts) before the write is forwarded.
    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        // Byte offset of the register within the PCI config space.
        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through VFIO_DEVICE_SET_IRQS ioctl, and only after we can write
        // to the device region to update the MSI Enable bit.
        self.vfio_wrapper.write_config((reg + offset) as u32, data);

        None
    }

    /// Handles a guest read from the PCI configuration space.
    ///
    /// BAR registers are served from the local cache; everything else comes
    /// from the VFIO device, with the multi-function bit masked out and any
    /// registered per-register patch applied on top.
    pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        // When reading the BARs, we trap it and return what comes
        // from our local configuration space. We want the guest to
        // use that and not the VFIO device BARs as it does not map
        // with the guest address space.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            return self.configuration.read_reg(reg_idx);
        }

        // Since we don't support passing multi-functions devices, we should
        // mask the multi-function bit, bit 7 of the Header Type byte on the
        // register 3 (i.e. bit 23 of the 32-bit register value).
        let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
            0xff7f_ffff
        } else {
            0xffff_ffff
        };

        // The config register read comes from the VFIO device itself.
        let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;

        // Apply any per-register patch (clear patch.mask bits, set patch bits).
        if let Some(config_patch) = self.patches.get(&reg_idx) {
            value = (value & !config_patch.mask) | config_patch.patch;
        }

        value
    }

    /// Captures the migratable state of the interrupt configuration
    /// (INTx/MSI/MSI-X). Used by the `Snapshottable` implementation below.
    fn state(&self) -> VfioCommonState {
        let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
            enabled: intx.enabled,
        });

        let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
            cap: msi.cfg.cap,
            cap_offset: msi.cap_offset,
        });

        let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
            cap: msix.cap,
            cap_offset: msix.cap_offset,
            bdf: msix.bar.devid,
        });

        VfioCommonState {
            intx_state,
            msi_state,
            msix_state,
        }
    }

    /// Restores the interrupt configuration from a snapshotted
    /// `VfioCommonState`, re-enabling INTx and re-initializing MSI/MSI-X
    /// from their saved capability state as needed.
    fn set_state(
        &mut self,
        state: &VfioCommonState,
        msi_state: Option<MsiConfigState>,
        msix_state: Option<MsixConfigState>,
    ) -> Result<(), VfioPciError> {
        // INTx can only be restored if a legacy interrupt group is available.
        if let (Some(intx), Some(interrupt_source_group)) =
            (&state.intx_state, self.legacy_interrupt_group.clone())
        {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            if intx.enabled {
                self.enable_intx()?;
            }
        }

        if let Some(msi) = &state.msi_state {
            self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state);
        }

        if let Some(msix) = &state.msix_state {
            self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state);
        }

        Ok(())
    }
}

impl Pausable for VfioCommon {}

impl Snapshottable for VfioCommon {
    fn id(&self) -> String {
        String::from(VFIO_COMMON_ID)
    }

    // Snapshots the interrupt state plus the nested PciConfiguration,
    // MSI and MSI-X sub-states.
    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_common_snapshot = Snapshot::new_from_versioned_state(&self.state())?;

        // Snapshot PciConfiguration
        vfio_common_snapshot.add_snapshot(self.configuration.id(), self.configuration.snapshot()?);

        // Snapshot MSI
        if let Some(msi) = &mut self.interrupt.msi {
            vfio_common_snapshot.add_snapshot(msi.cfg.id(), msi.cfg.snapshot()?);
        }

        // Snapshot MSI-X
        if let Some(msix) = &mut self.interrupt.msix {
            vfio_common_snapshot.add_snapshot(msix.bar.id(), msix.bar.snapshot()?);
        }

        Ok(vfio_common_snapshot)
    }
}

/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
pub struct VfioPciDevice {
    // Unique device id, also used as the snapshot id.
    id: String,
    vm: Arc<dyn hypervisor::Vm>,
    device: Arc<VfioDevice>,
    container: Arc<VfioContainer>,
    // Shared PCI/interrupt logic common to VFIO and vfio-user devices.
    common: VfioCommon,
    // When true, DMA mappings are handled by an IOMMU instead of dma_map/dma_unmap.
    iommu_attached: bool,
    // Closure providing a fresh guest memory slot for each mapped region.
    memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
}

impl VfioPciDevice {
    /// Constructs a new Vfio Pci device for the given Vfio device
    ///
    /// The device is reset, wrapped into a `VfioDeviceWrapper`, and its PCI
    /// plumbing (configuration space, interrupts) is built through
    /// `VfioCommon::new`, optionally restoring from a snapshot.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        id: String,
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        iommu_attached: bool,
        bdf: PciBdf,
        memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
        snapshot: Option<Snapshot>,
    ) -> Result<Self, VfioPciError> {
        let device = Arc::new(device);
        device.reset();

        let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));

        let common = VfioCommon::new(
            msi_interrupt_manager,
            legacy_interrupt_group,
            Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
            &PciVfioSubclass::VfioSubclass,
            bdf,
            vm_migration::snapshot_from_id(snapshot.as_ref(), VFIO_COMMON_ID),
        )?;

        let vfio_pci_device = VfioPciDevice {
            id,
            vm: vm.clone(),
            device,
            container,
            common,
            iommu_attached,
            memory_slot,
        };

        Ok(vfio_pci_device)
    }

    /// Returns whether this device is behind an IOMMU.
    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }

    // Rounds `address` up to the next 4KiB boundary.
    fn align_4k(address: u64) -> u64 {
        (address + 0xfff) & 0xffff_ffff_ffff_f000
    }

    // Returns true if `address` is 4KiB aligned.
    fn is_4k_aligned(address: u64) -> bool {
        (address & 0xfff) == 0
    }

    // Returns true if `size` is a multiple of 4KiB.
    fn is_4k_multiple(size: u64) -> bool {
        (size & 0xfff) == 0
    }

    /// Computes the list of mmap'able sparse areas for a region.
    ///
    /// If the region reports a sparse mmap capability, that list is used
    /// as-is. If it reports `MsixMappable`, the region (which must be 4KiB
    /// aligned and sized) is split around the MSI-X table/PBA ranges so those
    /// keep being trapped. Otherwise a single area covering the whole region
    /// is returned.
    fn generate_sparse_areas(
        caps: &[VfioRegionInfoCap],
        region_index: u32,
        region_start: u64,
        region_size: u64,
        vfio_msix: Option<&VfioMsix>,
    ) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
        for cap in caps {
            match cap {
                VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
                VfioRegionInfoCap::MsixMappable => {
                    if !Self::is_4k_aligned(region_start) {
                        error!(
                            "Region start address 0x{:x} must be at least aligned on 4KiB",
                            region_start
                        );
                        return Err(VfioPciError::RegionAlignment);
                    }
                    if !Self::is_4k_multiple(region_size) {
                        error!(
                            "Region size 0x{:x} must be at least a multiple of 4KiB",
                            region_size
                        );
                        return Err(VfioPciError::RegionSize);
                    }

                    // In case the region contains the MSI-X vectors table or
                    // the MSI-X PBA table, we must calculate the subregions
                    // around them, leading to a list of sparse areas.
                    // We want to make sure we will still trap MMIO accesses
                    // to these MSI-X specific ranges.
                    //
                    // Using a BtreeMap as the list provided through the iterator is sorted
                    // by key. This ensures proper split of the whole region.
                    let mut inter_ranges = BTreeMap::new();
                    if let Some(msix) = vfio_msix {
                        if region_index == msix.cap.table_bir() {
                            let (offset, size) = msix.cap.table_range();
                            inter_ranges.insert(offset, size);
                        }
                        if region_index == msix.cap.pba_bir() {
                            let (offset, size) = msix.cap.pba_range();
                            inter_ranges.insert(offset, size);
                        }
                    }

                    let mut sparse_areas = Vec::new();
                    let mut current_offset = 0;
                    for (range_offset, range_size) in inter_ranges {
                        // Emit the gap preceding this excluded range, if any.
                        if range_offset > current_offset {
                            sparse_areas.push(VfioRegionSparseMmapArea {
                                offset: current_offset,
                                size: range_offset - current_offset,
                            });
                        }

                        // Skip past the excluded range, 4KiB aligned up.
                        current_offset = Self::align_4k(range_offset + range_size);
                    }

                    // Emit the tail of the region after the last excluded range.
                    if region_size > current_offset {
                        sparse_areas.push(VfioRegionSparseMmapArea {
                            offset: current_offset,
                            size: region_size - current_offset,
                        });
                    }

                    return Ok(sparse_areas);
                }
                _ => {}
            }
        }

        // In case no relevant capabilities have been found, create a single
        // sparse area corresponding to the entire MMIO region.
        Ok(vec![VfioRegionSparseMmapArea {
            offset: 0,
            size: region_size,
        }])
    }

    /// Map MMIO regions into the guest, and avoid VM exits when the guest
    /// tries to reach those regions.
    ///
    /// The mmap'able areas of each region are mapped from the VFIO device
    /// file descriptor and registered with the VM (`self.vm`) as user memory
    /// regions, each using a memory slot obtained from the `self.memory_slot`
    /// closure.
    pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
        let fd = self.device.as_raw_fd();

        for region in self.common.mmio_regions.iter_mut() {
            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                // Build mmap protection flags from the region's capabilities.
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }

                // Retrieve the list of capabilities found on the region
                let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
                    self.device.get_region_caps(region.index)
                } else {
                    Vec::new()
                };

                // Don't try to mmap the region if it contains MSI-X table or
                // MSI-X PBA subregion, and if we couldn't find MSIX_MAPPABLE
                // in the list of supported capabilities.
                if let Some(msix) = self.common.interrupt.msix.as_ref() {
                    if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
                        && !caps.contains(&VfioRegionInfoCap::MsixMappable)
                    {
                        continue;
                    }
                }

                let mmap_size = self.device.get_region_size(region.index);
                let mmap_offset = self.device.get_region_offset(region.index);

                // Split the region into mmap'able sparse areas (excluding any
                // MSI-X table/PBA ranges that must keep being trapped).
                let sparse_areas = Self::generate_sparse_areas(
                    &caps,
                    region.index,
                    region.start.0,
                    mmap_size,
                    self.common.interrupt.msix.as_ref(),
                )?;

                for area in sparse_areas.iter() {
                    // SAFETY: FFI call with correct arguments
                    let host_addr = unsafe {
                        libc::mmap(
                            null_mut(),
                            area.size as usize,
                            prot,
                            libc::MAP_SHARED,
                            fd,
                            mmap_offset as libc::off_t + area.offset as libc::off_t,
                        )
                    };

                    if host_addr == libc::MAP_FAILED {
                        error!(
                            "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
                            area.offset,
                            area.size,
                            std::io::Error::last_os_error()
                        );
                        return Err(VfioPciError::MmapArea);
                    }

                    let user_memory_region = UserMemoryRegion {
                        slot: (self.memory_slot)(),
                        start: region.start.0 + area.offset,
                        size: area.size,
                        host_addr: host_addr as u64,
                    };

                    // Track the mapping so it can be removed/relocated later
                    // (see unmap_mmio_regions() and move_bar()).
                    region.user_memory_regions.push(user_memory_region);

                    let mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(mem_region)
                        .map_err(VfioPciError::CreateUserMemoryRegion)?;
                }
            }
        }

        Ok(())
    }

    /// Tears down every user memory region previously created by
    /// map_mmio_regions(): removes it from the VM, then munmap()s the
    /// corresponding host mapping. Failures are logged, not propagated.
    pub fn unmap_mmio_regions(&mut self) {
        for region in self.common.mmio_regions.iter() {
            for user_memory_region in region.user_memory_regions.iter() {
                // Remove region
                let r = self.vm.make_user_memory_region(
                    user_memory_region.slot,
                    user_memory_region.start,
                    user_memory_region.size,
                    user_memory_region.host_addr,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                // SAFETY: FFI call with correct arguments
                let ret = unsafe {
                    libc::munmap(
                        user_memory_region.host_addr as *mut libc::c_void,
                        user_memory_region.size as usize,
                    )
                };
                if ret != 0 {
                    error!(
                        "Could not unmap region {}, error:{}",
                        region.index,
                        io::Error::last_os_error()
                    );
                }
            }
        }
    }

    /// Maps `iova` -> `user_addr` through the VFIO container, unless the
    /// device is behind an IOMMU (in which case mappings are handled there).
    pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_map(iova, size, user_addr)
                .map_err(VfioPciError::DmaMap)?;
        }

        Ok(())
    }

    /// Removes a DMA mapping from the VFIO container, unless the device is
    /// behind an IOMMU.
    pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_unmap(iova, size)
                .map_err(VfioPciError::DmaUnmap)?;
        }

        Ok(())
    }

    /// Returns a copy of the device's MMIO regions.
    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.common.mmio_regions.clone()
    }
}

impl Drop for VfioPciDevice {
    // Unmap the guest mappings first, then disable whichever interrupt
    // mechanism (MSI-X, MSI, INTx) is still active.
    fn drop(&mut self) {
        self.unmap_mmio_regions();

        if let Some(msix) = &self.common.interrupt.msix {
            if msix.bar.enabled() {
                self.common.disable_msix();
            }
        }

        if let Some(msi) = &self.common.interrupt.msi {
            if msi.cfg.enabled() {
                self.common.disable_msi()
            }
        }

        if self.common.interrupt.intx_in_use() {
            self.common.disable_intx();
        }
    }
}

// Bus accesses are plain BAR accesses, forwarded to the PciDevice
// read_bar/write_bar implementations below.
impl BusDevice for VfioPciDevice {
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}

// First BAR offset in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// Extended capabilities register offset in the PCI config space.
const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
// IO BAR when first BAR bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory bar flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// Prefetchable BAR bit
const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device
const BAR_NUMS: usize = 6;
// PCI Header Type register index
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index
const PCI_ROM_EXP_BAR_INDEX: usize = 12;

// Most of the PciDevice trait is delegated to the shared VfioCommon logic.
impl PciDevice for VfioPciDevice {
    fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        self.common
            .allocate_bars(allocator, mmio_allocator, resources)
    }

    fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        self.common.free_bars(allocator, mmio_allocator)
    }

    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        self.common.write_config_register(reg_idx, offset, data)
    }

    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        self.common.read_config_register(reg_idx)
    }

    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.common
            .configuration
            .detect_bar_reprogramming(reg_idx, data)
    }

    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.common.read_bar(base, offset, data)
    }

    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.common.write_bar(base, offset, data)
    }

    // Relocates the MMIO region starting at `old_base` to `new_base`, by
    // removing each associated user memory region from the VM, shifting its
    // guest start address, and re-inserting it.
    fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
        for region in self.common.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                for user_memory_region in region.user_memory_regions.iter_mut() {
                    // Remove old region
                    let old_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .remove_user_memory_region(old_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

                    // Update the user memory region with the correct start address.
                    if new_base > old_base {
                        user_memory_region.start += new_base - old_base;
                    } else {
                        user_memory_region.start -= old_base - new_base;
                    }

                    // Insert new region
                    let new_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(new_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
                }
            }
        }

        Ok(())
    }

    fn as_any(&mut self) -> &mut dyn Any {
        self
    }

    fn id(&self) -> Option<String> {
        Some(self.id.clone())
    }
}

impl Pausable for VfioPciDevice {}

impl Snapshottable for VfioPciDevice {
    fn id(&self) -> String {
        self.id.clone()
    }

    // The device snapshot only wraps the VfioCommon sub-snapshot.
    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_pci_dev_snapshot = Snapshot::default();

        // Snapshot VfioCommon
        vfio_pci_dev_snapshot.add_snapshot(self.common.id(), self.common.snapshot()?);

        Ok(vfio_pci_dev_snapshot)
    }
}
impl Transportable for VfioPciDevice {}
impl Migratable for VfioPciDevice {}