// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//

use crate::{
    msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
    PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
    PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
    PciHeaderType, PciSubclass, MSIX_TABLE_ENTRY_SIZE,
};
use anyhow::anyhow;
use byteorder::{ByteOrder, LittleEndian};
use hypervisor::HypervisorVmError;
use std::any::Any;
use std::collections::{BTreeMap, HashMap};
use std::io;
use std::os::unix::io::AsRawFd;
use std::ptr::null_mut;
use std::sync::{Arc, Barrier, Mutex};
use thiserror::Error;
use versionize::{VersionMap, Versionize, VersionizeResult};
use versionize_derive::Versionize;
use vfio_bindings::bindings::vfio::*;
use vfio_ioctls::{
    VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
};
use vm_allocator::{AddressAllocator, SystemAllocator};
use vm_device::interrupt::{
    InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
};
use vm_device::{BusDevice, Resource};
use vm_memory::{Address, GuestAddress, GuestUsize};
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
};
use vmm_sys_util::eventfd::EventFd;

#[derive(Debug, Error)]
pub enum VfioPciError {
    #[error("Failed to create user memory region: {0}")]
    CreateUserMemoryRegion(#[source] HypervisorVmError),
    #[error("Failed to DMA map: {0}")]
    DmaMap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to DMA unmap: {0}")]
    DmaUnmap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to enable INTx: {0}")]
    EnableIntx(#[source] VfioError),
    #[error("Failed to enable MSI: {0}")]
    EnableMsi(#[source] VfioError),
    #[error("Failed to enable MSI-X: {0}")]
    EnableMsix(#[source] VfioError),
    #[error("Failed to mmap the area")]
    MmapArea,
    #[error("Failed to get the notifier's eventfd")]
    MissingNotifier,
    #[error("Invalid region alignment")]
    RegionAlignment,
    #[error("Invalid region size")]
    RegionSize,
}

#[derive(Copy, Clone)]
enum PciVfioSubclass {
    VfioSubclass = 0xff,
}

impl PciSubclass for PciVfioSubclass {
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}

enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}

#[derive(Versionize)]
struct IntxState {
    enabled: bool,
}

pub(crate) struct VfioIntx {
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    enabled: bool,
}

#[derive(Versionize)]
struct MsiState {
    cap: MsiCap,
    cap_offset: u32,
}

pub(crate) struct VfioMsi {
    pub(crate) cfg: MsiConfig,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsi {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.cfg.enabled();

        self.cfg.update(offset, data);

        let new_enabled = self.cfg.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsi);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsi);
        }

        None
    }
}

#[derive(Versionize)]
struct MsixState {
    cap: MsixCap,
    cap_offset: u32,
    bdf: u32,
}

pub(crate) struct VfioMsix {
    pub(crate) bar: MsixConfig,
    cap: MsixCap,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsix {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.bar.enabled();

        // Update "Message Control" word
        if offset == 2 && data.len() == 2 {
            self.bar.set_msg_ctl(LittleEndian::read_u16(data));
        }

        let new_enabled = self.bar.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsix);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsix);
        }

        None
    }

    fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        let table_offset: u64 = u64::from(self.cap.table_offset());
        let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
        let table_bir: u32 = self.cap.table_bir();

        bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
    }
}

pub(crate) struct Interrupt {
    pub(crate) intx: Option<VfioIntx>,
    pub(crate) msi: Option<VfioMsi>,
    pub(crate) msix: Option<VfioMsix>,
}

impl Interrupt {
    fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msi) = &mut self.msi {
            let action = msi.update(offset, data);
            return action;
        }

        None
    }

    fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msix) = &mut self.msix {
            let action = msix.update(offset, data);
            return action;
        }

        None
    }

    fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
        if let Some(msi) = &self.msi {
            if offset >= u64::from(msi.cap_offset)
                && offset < u64::from(msi.cap_offset) + msi.cfg.size()
            {
                return Some((
                    PciCapabilityId::MessageSignalledInterrupts,
                    u64::from(msi.cap_offset),
                ));
            }
        }

        if let Some(msix) = &self.msix {
            if offset == u64::from(msix.cap_offset) {
                return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
            }
        }

        None
    }

    fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        if let Some(msix) = &self.msix {
            return msix.table_accessed(bar_index, offset);
        }

        false
    }

    fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
        if let Some(ref mut msix) = &mut self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.write_table(offset, data)
        }
    }

    fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
        if let Some(msix) = &self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.read_table(offset, data)
        }
    }

    pub(crate) fn intx_in_use(&self) -> bool {
        if let Some(intx) = &self.intx {
            return intx.enabled;
        }

        false
    }
}

#[derive(Copy, Clone)]
pub struct UserMemoryRegion {
    pub slot: u32,
    pub start: u64,
    pub size: u64,
    pub host_addr: u64,
}

#[derive(Clone)]
pub struct MmioRegion {
    pub start: GuestAddress,
    pub length: GuestUsize,
    pub(crate) type_: PciBarRegionType,
    pub(crate) index: u32,
    pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
}

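// Added note: a single MmioRegion can be backed by several UserMemoryRegion
// entries, because generate_sparse_areas() may split a BAR mapping around
// the MSI-X table and PBA so that accesses to those ranges keep trapping
// into the VMM.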
#[derive(Debug, Error)]
pub enum VfioError {
    #[error("Kernel VFIO error: {0}")]
    KernelVfio(#[source] vfio_ioctls::VfioError),
    #[error("VFIO user error: {0}")]
    VfioUser(#[source] vfio_user::Error),
}

pub(crate) trait Vfio: Send + Sync {
    fn read_config_byte(&self, offset: u32) -> u8 {
        let mut data: [u8; 1] = [0];
        self.read_config(offset, &mut data);
        data[0]
    }

    fn read_config_word(&self, offset: u32) -> u16 {
        let mut data: [u8; 2] = [0, 0];
        self.read_config(offset, &mut data);
        u16::from_le_bytes(data)
    }

    fn read_config_dword(&self, offset: u32) -> u32 {
        let mut data: [u8; 4] = [0, 0, 0, 0];
        self.read_config(offset, &mut data);
        u32::from_le_bytes(data)
    }

    fn write_config_dword(&self, offset: u32, buf: u32) {
        let data: [u8; 4] = buf.to_le_bytes();
        self.write_config(offset, &data)
    }

    fn read_config(&self, offset: u32, data: &mut [u8]) {
        self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
    }

    fn write_config(&self, offset: u32, data: &[u8]) {
        self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
    }

    fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
    }

    fn disable_msi(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
    }

    fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
    }

    fn disable_msix(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
    }

    fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
        unimplemented!()
    }

    fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
        unimplemented!()
    }

    fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
        unimplemented!()
    }

    fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }
}

struct VfioDeviceWrapper {
    device: Arc<VfioDevice>,
}

impl VfioDeviceWrapper {
    fn new(device: Arc<VfioDevice>) -> Self {
        Self { device }
    }
}

impl Vfio for VfioDeviceWrapper {
    fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
        self.device.region_read(index, data, offset)
    }

    fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
        self.device.region_write(index, data, offset)
    }

    fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
        self.device.get_irq_info(irq_index).copied()
    }

    fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.device
            .enable_irq(irq_index, event_fds)
            .map_err(VfioError::KernelVfio)
    }

    fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .disable_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }

    fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .unmask_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }
}

#[derive(Versionize)]
struct VfioCommonState {
    intx_state: Option<IntxState>,
    msi_state: Option<MsiState>,
    msix_state: Option<MsixState>,
}

impl VersionMapped for VfioCommonState {}

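// Added note (illustrative): a ConfigPatch is applied on the config space
// read path as `(value & !mask) | patch` (see read_config_register()). For
// example, parse_extended_capabilities() hides an SR-IOV capability with
// mask = 0x0000_ffff and patch = NullCapability, replacing the capability
// ID in the low 16 bits while preserving the version and next-capability
// pointer in the upper bits.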
pub(crate) struct ConfigPatch {
    mask: u32,
    patch: u32,
}

pub(crate) struct VfioCommon {
    pub(crate) configuration: PciConfiguration,
    pub(crate) mmio_regions: Vec<MmioRegion>,
    pub(crate) interrupt: Interrupt,
    pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
    pub(crate) vfio_wrapper: Arc<dyn Vfio>,
    pub(crate) patches: HashMap<usize, ConfigPatch>,
}

impl VfioCommon {
    pub(crate) fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        let mut bars = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX;

        // Going through all regular regions to compute the BAR size.
        // We're not saving the BAR address to restore it, because we
        // are going to allocate a guest address for each BAR and write
        // that new address back.
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let mut region_size: u64 = 0;
            let mut region_type = PciBarRegionType::Memory32BitRegion;
            let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
            let mut flags: u32 = 0;

            let mut restored_bar_addr = None;
            if let Some(resources) = &resources {
                for resource in resources {
                    if let Resource::PciBar {
                        index,
                        base,
                        size,
                        type_,
                        ..
                    } = resource
                    {
                        if *index == bar_id as usize {
                            restored_bar_addr = Some(GuestAddress(*base));
                            region_size = *size;
                            region_type = PciBarRegionType::from(*type_);
                            break;
                        }
                    }
                }
                if restored_bar_addr.is_none() {
                    bar_id += 1;
                    continue;
                }
            } else {
                let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                    (PCI_ROM_EXP_BAR_INDEX * 4) as u32
                } else {
                    PCI_CONFIG_BAR_OFFSET + bar_id * 4
                };

                // First read flags
                flags = self.vfio_wrapper.read_config_dword(bar_offset);

                // Is this an IO BAR?
                let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
                } else {
                    false
                };

                // Is this a 64-bit BAR?
                let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(
                        flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                        PCI_CONFIG_MEMORY_BAR_64BIT
                    )
                } else {
                    false
                };

                if matches!(
                    flags & PCI_CONFIG_BAR_PREFETCHABLE,
                    PCI_CONFIG_BAR_PREFETCHABLE
                ) {
                    prefetchable = PciBarPrefetchable::Prefetchable
                };

                // To get size write all 1s
                self.vfio_wrapper
                    .write_config_dword(bar_offset, 0xffff_ffff);

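                // Illustrative example (added): for a 16 KiB 32-bit memory
                // BAR, the read-back below could return 0xffff_c008
                // (prefetchable). Masking the low four flag bits gives
                // 0xffff_c000, and !0xffff_c000 + 1 = 0x4000 (16 KiB).
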
                // And read back BAR value. The device will write zeros for
                // bits it doesn't care about
                let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);

                if io_bar {
                    // Mask flag bits (lowest 2 for I/O bars)
                    lower &= !0b11;

                    // BAR is not enabled
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // IO BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Invert bits and add 1 to calculate size
                    region_size = (!lower + 1) as u64;
                } else if is_64bit_bar {
                    // 64-bit memory BAR
                    region_type = PciBarRegionType::Memory64BitRegion;

                    // Query size of upper BAR of 64-bit BAR
                    let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                    self.vfio_wrapper
                        .write_config_dword(upper_offset, 0xffff_ffff);
                    let upper = self.vfio_wrapper.read_config_dword(upper_offset);

                    let mut combined_size = u64::from(upper) << 32 | u64::from(lower);

                    // Mask out flag bits (lowest 4 for memory bars)
                    combined_size &= !0b1111;

                    // BAR is not enabled
                    if combined_size == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find size
                    region_size = !combined_size + 1;
                } else {
                    region_type = PciBarRegionType::Memory32BitRegion;

                    // Mask out flag bits (lowest 4 for memory bars)
                    lower &= !0b1111;

                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find size
                    region_size = (!lower + 1) as u64;
                }
            }

            let bar_addr = match region_type {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "aarch64")]
                    unimplemented!();

                    // The address needs to be 4 bytes aligned.
                    #[cfg(not(target_arch = "aarch64"))]
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory32BitRegion => {
                    // BAR allocation must be naturally aligned
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_mmio_hole_addresses(
                            restored_bar_addr,
                            region_size,
                            Some(region_size),
                        )
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory64BitRegion => {
                    // BAR allocation must be naturally aligned
                    mmio_allocator
                        .allocate(restored_bar_addr, region_size, Some(region_size))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
            };

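            // Illustrative example (added): for a 32 MiB 64-bit BAR, the
            // probe could read back upper = 0xffff_ffff and
            // lower = 0xfe00_000c. Masking the flag bits gives
            // 0xffff_ffff_fe00_0000, and inverting plus one yields
            // 0x0200_0000 (32 MiB), which is also the natural alignment
            // requested from the allocator above.
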
            // We can now build our BAR configuration block.
            let bar = PciBarConfiguration::default()
                .set_index(bar_id as usize)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type)
                .set_prefetchable(prefetchable);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                self.configuration
                    .add_pci_rom_bar(&bar, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&bar)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            bars.push(bar);
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id,
                user_memory_regions: Vec::new(),
            });

            bar_id += 1;
            if region_type == PciBarRegionType::Memory64BitRegion {
                bar_id += 1;
            }
        }

        Ok(bars)
    }

    pub(crate) fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        for region in self.mmio_regions.iter() {
            match region.type_ {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "x86_64")]
                    allocator.free_io_addresses(region.start, region.length);
                    #[cfg(target_arch = "aarch64")]
                    error!("I/O region is not supported");
                }
                PciBarRegionType::Memory32BitRegion => {
                    allocator.free_mmio_hole_addresses(region.start, region.length);
                }
                PciBarRegionType::Memory64BitRegion => {
                    mmio_allocator.free(region.start, region.length);
                }
            }
        }
        Ok(())
    }

    pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
        let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());

        let table = self.vfio_wrapper.read_config_dword((cap + 4).into());

        let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());

        MsixCap {
            msg_ctl,
            table,
            pba,
        }
    }

    pub(crate) fn initialize_msix(&mut self, msix_cap: MsixCap, cap_offset: u32, bdf: PciBdf) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msix_cap.table_size() as InterruptIndex,
            })
            .unwrap();

        let msix_config = MsixConfig::new(
            msix_cap.table_size(),
            interrupt_source_group.clone(),
            bdf.into(),
            None,
        )
        .unwrap();

        self.interrupt.msix = Some(VfioMsix {
            bar: msix_config,
            cap: msix_cap,
            cap_offset,
            interrupt_source_group,
        });
    }

    pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
        self.vfio_wrapper.read_config_word((cap + 2).into())
    }

    pub(crate) fn initialize_msi(&mut self, msg_ctl: u16, cap_offset: u32) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
            })
            .unwrap();

        let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone());

        self.interrupt.msi = Some(VfioMsi {
            cfg: msi_config,
            cap_offset,
            interrupt_source_group,
        });
    }

    pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) {
        let mut cap_next = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        let mut pci_express_cap_found = false;
        let mut power_management_cap_found = false;

        while cap_next != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());

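            // Illustrative example (added): with a capability list starting
            // at 0x40, this loop could see MSI at 0x40 (next pointer at
            // 0x41 -> 0x50), then MSI-X at 0x50 (next pointer -> 0x00),
            // which terminates the walk.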
            match PciCapabilityId::from(cap_id) {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI.
                            let msg_ctl = self.parse_msi_capabilities(cap_next);
                            self.initialize_msi(msg_ctl, cap_next as u32);
                        }
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
                    {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI-X.
                            let msix_cap = self.parse_msix_capabilities(cap_next);
                            self.initialize_msix(msix_cap, cap_next as u32, bdf);
                        }
                    }
                }
                PciCapabilityId::PciExpress => pci_express_cap_found = true,
                PciCapabilityId::PowerManagement => power_management_cap_found = true,
                _ => {}
            };

            cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
        }

        if pci_express_cap_found && power_management_cap_found {
            self.parse_extended_capabilities();
        }
    }

    fn parse_extended_capabilities(&mut self) {
        let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;

        loop {
            let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);

            let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
            let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;

            match PciExpressCapabilityId::from(cap_id) {
                PciExpressCapabilityId::AlternativeRoutingIdentificationIntepretation
                | PciExpressCapabilityId::ResizeableBar
                | PciExpressCapabilityId::SingleRootIoVirtualization => {
                    let reg_idx = (current_offset / 4) as usize;
                    self.patches.insert(
                        reg_idx,
                        ConfigPatch {
                            mask: 0x0000_ffff,
                            patch: PciExpressCapabilityId::NullCapability as u32,
                        },
                    );
                }
                _ => {}
            }

            if cap_next == 0 {
                break;
            }

            current_offset = cap_next.into();
        }
    }

    pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
        if let Some(intx) = &mut self.interrupt.intx {
            if !intx.enabled {
                if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
                    self.vfio_wrapper
                        .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
                        .map_err(VfioPciError::EnableIntx)?;

                    intx.enabled = true;
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }
        }

        Ok(())
    }

    pub(crate) fn disable_intx(&mut self) {
        if let Some(intx) = &mut self.interrupt.intx {
            if intx.enabled {
                if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                    error!("Could not disable INTx: {}", e);
                } else {
                    intx.enabled = false;
                }
            }
        }
    }

    pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
        if let Some(msi) = &self.interrupt.msi {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msi.cfg.num_enabled_vectors() {
                if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msi(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsi)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msi(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msi() {
            error!("Could not disable MSI: {}", e);
        }
    }

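    // Added note (illustrative): enabling MSI-X below hands VFIO one eventfd
    // per MSI-X table entry, so the host can signal the matching guest
    // vector directly when the device fires an interrupt.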
    pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
        if let Some(msix) = &self.interrupt.msix {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msix.bar.table_entries.len() {
                if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msix(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsix)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msix(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msix() {
            error!("Could not disable MSI-X: {}", e);
        }
    }

    pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
        if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
            if irq_info.count == 0 {
                // A count of 0 means the INTx IRQ is not supported, therefore
                // it shouldn't be initialized.
                return Ok(());
            }
        }

        if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            self.enable_intx()?;
        }

        Ok(())
    }

    pub(crate) fn update_msi_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msi(offset, data) {
            Some(InterruptUpdateAction::EnableMsi) => {
                // Disable INTx before we can enable MSI
                self.disable_intx();
                self.enable_msi()?;
            }
            Some(InterruptUpdateAction::DisableMsi) => {
                // Fall back onto INTx when disabling MSI
                self.disable_msi();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn update_msix_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msix(offset, data) {
            Some(InterruptUpdateAction::EnableMsix) => {
                // Disable INTx before we can enable MSI-X
                self.disable_intx();
                self.enable_msix()?;
            }
            Some(InterruptUpdateAction::DisableMsix) => {
                // Fall back onto INTx when disabling MSI-X
                self.disable_msix();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
        for region in self.mmio_regions.iter() {
            if addr >= region.start.raw_value()
                && addr < region.start.unchecked_add(region.length).raw_value()
            {
                return Some(region.clone());
            }
        }
        None
    }

    pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_read_table(offset, data);
            } else {
                self.vfio_wrapper.region_read(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest reading from the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }
    }

    pub(crate) fn write_bar(
        &mut self,
        base: u64,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            // If the MSI-X table is written to, we need to update our cache.
            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_write_table(offset, data);
            } else {
                self.vfio_wrapper.region_write(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest writing to the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }

        None
    }

    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming the
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through the VFIO_DEVICE_SET_IRQS ioctl, and only after that can
        // we write to the device region to update the MSI Enable bit.
        self.vfio_wrapper.write_config((reg + offset) as u32, data);

        None
    }

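    // Illustrative example (added): assuming the device exposes its MSI
    // capability at config offset 0x60, a guest write to the Message Control
    // word arrives as reg_idx 0x18 (reg = 0x60) with offset 2, so
    // update_msi_capabilities() above receives
    // cap_offset = 0x60 - 0x60 + 2 = 2, an offset relative to the start of
    // the capability.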
    pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        // When reading the BARs, we trap it and return what comes
        // from our local configuration space. We want the guest to
        // use that and not the VFIO device BARs, as those do not map
        // to the guest address space.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            return self.configuration.read_reg(reg_idx);
        }

        // Since we don't support passing multi-function devices, we should
        // mask the multi-function bit, bit 7 of the Header Type byte, in
        // register 3.
        let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
            0xff7f_ffff
        } else {
            0xffff_ffff
        };

        // The config register read comes from the VFIO device itself.
        let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;

        if let Some(config_patch) = self.patches.get(&reg_idx) {
            value = (value & !config_patch.mask) | config_patch.patch;
        }

        value
    }

    fn state(&self) -> VfioCommonState {
        let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
            enabled: intx.enabled,
        });

        let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
            cap: msi.cfg.cap,
            cap_offset: msi.cap_offset,
        });

        let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
            cap: msix.cap,
            cap_offset: msix.cap_offset,
            bdf: msix.bar.devid,
        });

        VfioCommonState {
            intx_state,
            msi_state,
            msix_state,
        }
    }

    fn set_state(&mut self, state: &VfioCommonState) -> Result<(), VfioPciError> {
        if let (Some(intx), Some(interrupt_source_group)) =
            (&state.intx_state, self.legacy_interrupt_group.clone())
        {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            if intx.enabled {
                self.enable_intx()?;
            }
        }

        if let Some(msi) = &state.msi_state {
            self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset);
        }

        if let Some(msix) = &state.msix_state {
            self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into());
        }

        Ok(())
    }
}

impl Pausable for VfioCommon {}

impl Snapshottable for VfioCommon {
    fn id(&self) -> String {
        String::from("vfio_common")
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_common_snapshot =
            Snapshot::new_from_versioned_state(&self.id(), &self.state())?;

        // Snapshot PciConfiguration
        vfio_common_snapshot.add_snapshot(self.configuration.snapshot()?);

        // Snapshot MSI
        if let Some(msi) = &mut self.interrupt.msi {
            vfio_common_snapshot.add_snapshot(msi.cfg.snapshot()?);
        }

        // Snapshot MSI-X
        if let Some(msix) = &mut self.interrupt.msix {
            vfio_common_snapshot.add_snapshot(msix.bar.snapshot()?);
        }

        Ok(vfio_common_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        if let Some(vfio_common_section) = snapshot
            .snapshot_data
            .get(&format!("{}-section", self.id()))
        {
            // This has to be invoked first, as we want the Interrupt object
            // to be initialized correctly before we try to restore the MSI
            // and MSI-X configurations.
            self.set_state(&vfio_common_section.to_versioned_state()?)
                .map_err(|e| {
                    MigratableError::Restore(anyhow!("Could not restore VFIO_COMMON state {:?}", e))
                })?;

            // Restore PciConfiguration
            if let Some(pci_config_snapshot) = snapshot.snapshots.get(&self.configuration.id()) {
                self.configuration.restore(*pci_config_snapshot.clone())?;
            }

            // Restore MSI
            if let Some(msi) = &mut self.interrupt.msi {
                if let Some(msi_snapshot) = snapshot.snapshots.get(&msi.cfg.id()) {
                    msi.cfg.restore(*msi_snapshot.clone())?;
                }
                if msi.cfg.enabled() {
                    self.enable_msi().unwrap();
                }
            }

            // Restore MSI-X
            if let Some(msix) = &mut self.interrupt.msix {
                if let Some(msix_snapshot) = snapshot.snapshots.get(&msix.bar.id()) {
                    msix.bar.restore(*msix_snapshot.clone())?;
                }
                if msix.bar.enabled() {
                    self.enable_msix().unwrap();
                }
            }

            return Ok(());
        }

        Err(MigratableError::Restore(anyhow!(
            "Could not find VFIO_COMMON snapshot section"
        )))
    }
}

/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
pub struct VfioPciDevice {
    id: String,
    vm: Arc<dyn hypervisor::Vm>,
    device: Arc<VfioDevice>,
    container: Arc<VfioContainer>,
    common: VfioCommon,
    iommu_attached: bool,
    memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
}

impl VfioPciDevice {
    /// Constructs a new VFIO PCI device for the given VFIO device.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        id: String,
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        iommu_attached: bool,
        bdf: PciBdf,
        restoring: bool,
        memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
    ) -> Result<Self, VfioPciError> {
        let device = Arc::new(device);
        device.reset();

        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            &PciVfioSubclass::VfioSubclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
            None,
        );

        let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));

        let mut common = VfioCommon {
            mmio_regions: Vec::new(),
            configuration,
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
            msi_interrupt_manager,
            legacy_interrupt_group,
            vfio_wrapper: Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
            patches: HashMap::new(),
        };

        // No need to parse the capabilities from the device when on the
        // restore path. The initialization will be performed later, when
        // restore() is called.
        if !restoring {
            common.parse_capabilities(bdf);
            common.initialize_legacy_interrupt()?;
        }

        let vfio_pci_device = VfioPciDevice {
            id,
            vm: vm.clone(),
            device,
            container,
            common,
            iommu_attached,
            memory_slot,
        };

        Ok(vfio_pci_device)
    }

    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }

    fn align_4k(address: u64) -> u64 {
        (address + 0xfff) & 0xffff_ffff_ffff_f000
    }

    fn is_4k_aligned(address: u64) -> bool {
        (address & 0xfff) == 0
    }

    fn is_4k_multiple(size: u64) -> bool {
        (size & 0xfff) == 0
    }

    fn generate_sparse_areas(
        caps: &[VfioRegionInfoCap],
        region_index: u32,
        region_start: u64,
        region_size: u64,
        vfio_msix: Option<&VfioMsix>,
    ) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
        for cap in caps {
            match cap {
                VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
                VfioRegionInfoCap::MsixMappable => {
                    if !Self::is_4k_aligned(region_start) {
                        error!(
                            "Region start address 0x{:x} must be at least aligned on 4KiB",
                            region_start
                        );
                        return Err(VfioPciError::RegionAlignment);
                    }
                    if !Self::is_4k_multiple(region_size) {
                        error!(
                            "Region size 0x{:x} must be at least a multiple of 4KiB",
                            region_size
                        );
                        return Err(VfioPciError::RegionSize);
                    }

                    // In case the region contains the MSI-X vectors table or
                    // the MSI-X PBA table, we must calculate the subregions
                    // around them, leading to a list of sparse areas.
                    // We want to make sure we will still trap MMIO accesses
                    // to these MSI-X specific ranges.
                    //
                    // Using a BTreeMap, as the list provided through the
                    // iterator is sorted by key. This ensures a proper split
                    // of the whole region.
                    let mut inter_ranges = BTreeMap::new();
                    if let Some(msix) = vfio_msix {
                        if region_index == msix.cap.table_bir() {
                            let (offset, size) = msix.cap.table_range();
                            inter_ranges.insert(offset, size);
                        }
                        if region_index == msix.cap.pba_bir() {
                            let (offset, size) = msix.cap.pba_range();
                            inter_ranges.insert(offset, size);
                        }
                    }

                    let mut sparse_areas = Vec::new();
                    let mut current_offset = 0;
                    for (range_offset, range_size) in inter_ranges {
                        if range_offset > current_offset {
                            sparse_areas.push(VfioRegionSparseMmapArea {
                                offset: current_offset,
                                size: range_offset - current_offset,
                            });
                        }

                        current_offset = Self::align_4k(range_offset + range_size);
                    }

                    if region_size > current_offset {
                        sparse_areas.push(VfioRegionSparseMmapArea {
                            offset: current_offset,
                            size: region_size - current_offset,
                        });
                    }

                    return Ok(sparse_areas);
                }
                _ => {}
            }
        }

        // In case no relevant capabilities have been found, create a single
        // sparse area corresponding to the entire MMIO region.
        Ok(vec![VfioRegionSparseMmapArea {
            offset: 0,
            size: region_size,
        }])
    }

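    // Illustrative example (added): for a 16 KiB (0x4000) BAR holding an
    // MSI-X table at offset 0x2000 with size 0x100, generate_sparse_areas()
    // returns two mmap'able areas, [0x0, 0x2000) and [0x3000, 0x4000),
    // since align_4k(0x2100) = 0x3000. The resulting hole keeps the table
    // trapped by the VMM.
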
    /// Map MMIO regions into the guest, and avoid VM exits when the guest
    /// tries to reach those regions.
    ///
    /// The VFIO MMIO regions are registered as user memory regions with the
    /// VM through the `vm` handle, and each one is assigned a memory slot
    /// returned by the `memory_slot` closure.
    pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
        let fd = self.device.as_raw_fd();

        for region in self.common.mmio_regions.iter_mut() {
            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }

                // Retrieve the list of capabilities found on the region
                let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
                    self.device.get_region_caps(region.index)
                } else {
                    Vec::new()
                };

                // Don't try to mmap the region if it contains the MSI-X table
                // or the MSI-X PBA subregion, and we couldn't find
                // MSIX_MAPPABLE in the list of supported capabilities.
                if let Some(msix) = self.common.interrupt.msix.as_ref() {
                    if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
                        && !caps.contains(&VfioRegionInfoCap::MsixMappable)
                    {
                        continue;
                    }
                }

                let mmap_size = self.device.get_region_size(region.index);
                let mmap_offset = self.device.get_region_offset(region.index);

                let sparse_areas = Self::generate_sparse_areas(
                    &caps,
                    region.index,
                    region.start.0,
                    mmap_size,
                    self.common.interrupt.msix.as_ref(),
                )?;

                for area in sparse_areas.iter() {
                    // SAFETY: FFI call with correct arguments
                    let host_addr = unsafe {
                        libc::mmap(
                            null_mut(),
                            area.size as usize,
                            prot,
                            libc::MAP_SHARED,
                            fd,
                            mmap_offset as libc::off_t + area.offset as libc::off_t,
                        )
                    };

                    if host_addr == libc::MAP_FAILED {
                        error!(
                            "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
                            area.offset,
                            area.size,
                            std::io::Error::last_os_error()
                        );
                        return Err(VfioPciError::MmapArea);
                    }

                    let user_memory_region = UserMemoryRegion {
                        slot: (self.memory_slot)(),
                        start: region.start.0 + area.offset,
                        size: area.size,
                        host_addr: host_addr as u64,
                    };

                    region.user_memory_regions.push(user_memory_region);

                    let mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(mem_region)
                        .map_err(VfioPciError::CreateUserMemoryRegion)?;
                }
            }
        }

        Ok(())
    }

    pub fn unmap_mmio_regions(&mut self) {
        for region in self.common.mmio_regions.iter() {
            for user_memory_region in region.user_memory_regions.iter() {
                // Remove region
                let r = self.vm.make_user_memory_region(
                    user_memory_region.slot,
                    user_memory_region.start,
                    user_memory_region.size,
                    user_memory_region.host_addr,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                // SAFETY: FFI call with correct arguments
                let ret = unsafe {
                    libc::munmap(
                        user_memory_region.host_addr as *mut libc::c_void,
                        user_memory_region.size as usize,
                    )
                };
                if ret != 0 {
                    error!(
                        "Could not unmap region {}, error: {}",
                        region.index,
                        io::Error::last_os_error()
                    );
                }
            }
        }
    }

    pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_map(iova, size, user_addr)
                .map_err(VfioPciError::DmaMap)?;
        }

        Ok(())
    }

    pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_unmap(iova, size)
                .map_err(VfioPciError::DmaUnmap)?;
        }

        Ok(())
    }

    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.common.mmio_regions.clone()
    }
}

impl Drop for VfioPciDevice {
    fn drop(&mut self) {
        self.unmap_mmio_regions();

        if let Some(msix) = &self.common.interrupt.msix {
            if msix.bar.enabled() {
                self.common.disable_msix();
            }
        }

        if let Some(msi) = &self.common.interrupt.msi {
            if msi.cfg.enabled() {
                self.common.disable_msi()
            }
        }

        if self.common.interrupt.intx_in_use() {
            self.common.disable_intx();
        }
    }
}

impl BusDevice for VfioPciDevice {
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}

// First BAR offset in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// Extended capabilities register offset in the PCI config space.
const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
// IO BAR when the first BAR bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory BAR flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// Prefetchable BAR bit.
const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device.
const BAR_NUMS: usize = 6;
// PCI Header Type register index.
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index.
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index.
const PCI_ROM_EXP_BAR_INDEX: usize = 12;

impl PciDevice for VfioPciDevice {
    fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        self.common
            .allocate_bars(allocator, mmio_allocator, resources)
    }

    fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        self.common.free_bars(allocator, mmio_allocator)
    }

    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        self.common.write_config_register(reg_idx, offset, data)
    }

    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        self.common.read_config_register(reg_idx)
    }

    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.common
            .configuration
            .detect_bar_reprogramming(reg_idx, data)
    }

    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.common.read_bar(base, offset, data)
    }

    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.common.write_bar(base, offset, data)
    }

    fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
        for region in self.common.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                for user_memory_region in region.user_memory_regions.iter_mut() {
                    // Remove old region
                    let old_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .remove_user_memory_region(old_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

                    // Update the user memory region with the correct start address.
                    if new_base > old_base {
                        user_memory_region.start += new_base - old_base;
                    } else {
                        user_memory_region.start -= old_base - new_base;
                    }

                    // Insert new region
                    let new_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(new_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
                }
            }
        }

        Ok(())
    }

    fn as_any(&mut self) -> &mut dyn Any {
        self
    }

    fn id(&self) -> Option<String> {
        Some(self.id.clone())
    }
}

impl Pausable for VfioPciDevice {}

impl Snapshottable for VfioPciDevice {
    fn id(&self) -> String {
        self.id.clone()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_pci_dev_snapshot = Snapshot::new(&self.id);

        // Snapshot VfioCommon
        vfio_pci_dev_snapshot.add_snapshot(self.common.snapshot()?);

        Ok(vfio_pci_dev_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        // Restore VfioCommon
        if let Some(vfio_common_snapshot) = snapshot.snapshots.get(&self.common.id()) {
            self.common.restore(*vfio_common_snapshot.clone())?;
            self.map_mmio_regions().map_err(|e| {
                MigratableError::Restore(anyhow!(
                    "Could not map MMIO regions for VfioPciDevice on restore {:?}",
                    e
                ))
            })?;
        }

        Ok(())
    }
}

impl Transportable for VfioPciDevice {}
impl Migratable for VfioPciDevice {}
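
#[cfg(test)]
mod tests {
    use super::*;

    // Minimal sanity checks (added, not part of the original device model)
    // for the 4 KiB helpers that generate_sparse_areas() relies on when
    // splitting a BAR around the MSI-X table and PBA.
    #[test]
    fn test_4k_helpers() {
        // align_4k() rounds up to the next 4 KiB boundary and is idempotent
        // on already-aligned addresses.
        assert_eq!(VfioPciDevice::align_4k(0x2100), 0x3000);
        assert_eq!(VfioPciDevice::align_4k(0x3000), 0x3000);

        assert!(VfioPciDevice::is_4k_aligned(0x3000));
        assert!(!VfioPciDevice::is_4k_aligned(0x3001));

        assert!(VfioPciDevice::is_4k_multiple(0x8000));
        assert!(!VfioPciDevice::is_4k_multiple(0x8010));
    }
}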