// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//

use crate::{
    msi_num_enabled_vectors, BarReprogrammingParams, MsiConfig, MsixCap, MsixConfig,
    PciBarConfiguration, PciBarRegionType, PciCapabilityId, PciClassCode, PciConfiguration,
    PciDevice, PciDeviceError, PciHeaderType, PciSubclass, MSIX_TABLE_ENTRY_SIZE,
};
use byteorder::{ByteOrder, LittleEndian};
use hypervisor::HypervisorVmError;
use std::any::Any;
use std::io;
use std::os::unix::io::AsRawFd;
use std::ptr::null_mut;
use std::sync::{Arc, Barrier};
use thiserror::Error;
use vfio_bindings::bindings::vfio::*;
use vfio_ioctls::{VfioContainer, VfioDevice, VfioIrq};
use vm_allocator::SystemAllocator;
use vm_device::interrupt::{
    InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
};
use vm_device::BusDevice;
use vm_memory::{Address, GuestAddress, GuestUsize};
use vmm_sys_util::eventfd::EventFd;

#[derive(Debug, Error)]
pub enum VfioPciError {
    #[error("Failed to DMA map: {0}")]
    DmaMap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to DMA unmap: {0}")]
    DmaUnmap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to enable INTx: {0}")]
    EnableIntx(#[source] VfioError),
    #[error("Failed to enable MSI: {0}")]
    EnableMsi(#[source] VfioError),
    #[error("Failed to enable MSI-X: {0}")]
    EnableMsix(#[source] VfioError),
    #[error("Failed to map VFIO PCI region into guest: {0}")]
    MapRegionGuest(#[source] HypervisorVmError),
    #[error("Failed to find the notifier's eventfd")]
    MissingNotifier,
}

#[derive(Copy, Clone)]
enum PciVfioSubclass {
    VfioSubclass = 0xff,
}

impl PciSubclass for PciVfioSubclass {
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}

enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}

pub(crate) struct VfioIntx {
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    enabled: bool,
}

pub(crate) struct VfioMsi {
    pub(crate) cfg: MsiConfig,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsi {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.cfg.enabled();

        self.cfg.update(offset, data);

        let new_enabled = self.cfg.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsi);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsi);
        }

        None
    }
}

pub(crate) struct VfioMsix {
    pub(crate) bar: MsixConfig,
    cap: MsixCap,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsix {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.bar.enabled();

        // Update the "Message Control" word.
        if offset == 2 && data.len() == 2 {
            self.bar.set_msg_ctl(LittleEndian::read_u16(data));
        }

        let new_enabled = self.bar.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsix);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsix);
        }

        None
    }

    fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        let table_offset: u64 = u64::from(self.cap.table_offset());
        let table_size: u64 =
            u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
        let table_bir: u32 = self.cap.table_bir();

        bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
    }
}

pub(crate) struct Interrupt {
    pub(crate) intx: Option<VfioIntx>,
    pub(crate) msi: Option<VfioMsi>,
    pub(crate) msix: Option<VfioMsix>,
}

impl Interrupt {
    fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msi) = &mut self.msi {
            let action = msi.update(offset, data);
            return action;
        }

        None
    }

    fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msix) = &mut self.msix {
            let action = msix.update(offset, data);
            return action;
        }

        None
    }

    fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
        if let Some(msi) = &self.msi {
            if offset >= u64::from(msi.cap_offset)
                && offset < u64::from(msi.cap_offset) + msi.cfg.size()
            {
                return Some((
                    PciCapabilityId::MessageSignalledInterrupts,
                    u64::from(msi.cap_offset),
                ));
            }
        }

        if let Some(msix) = &self.msix {
            if offset == u64::from(msix.cap_offset) {
                return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
            }
        }

        None
    }

    fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        if let Some(msix) = &self.msix {
            return msix.table_accessed(bar_index, offset);
        }

        false
    }

    fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
        if let Some(ref mut msix) = &mut self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.write_table(offset, data)
        }
    }

    fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
        if let Some(msix) = &self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.read_table(offset, data)
        }
    }

    pub(crate) fn intx_in_use(&self) -> bool {
        if let Some(intx) = &self.intx {
            return intx.enabled;
        }

        false
    }
}

#[derive(Copy, Clone)]
pub struct MmioRegion {
    pub start: GuestAddress,
    pub length: GuestUsize,
    pub(crate) type_: PciBarRegionType,
    pub(crate) index: u32,
    pub(crate) mem_slot: Option<u32>,
    pub(crate) host_addr: Option<u64>,
    pub(crate) mmap_size: Option<usize>,
}

#[derive(Debug, Error)]
pub enum VfioError {
    #[error("Kernel VFIO error: {0}")]
    KernelVfio(#[source] vfio_ioctls::VfioError),
}

pub(crate) trait Vfio {
    fn read_config_byte(&self, offset: u32) -> u8 {
        let mut data: [u8; 1] = [0];
        self.read_config(offset, &mut data);
        data[0]
    }

    fn read_config_word(&self, offset: u32) -> u16 {
        let mut data: [u8; 2] = [0, 0];
        self.read_config(offset, &mut data);
        u16::from_le_bytes(data)
    }

    fn read_config_dword(&self, offset: u32) -> u32 {
        let mut data: [u8; 4] = [0, 0, 0, 0];
        self.read_config(offset, &mut data);
        u32::from_le_bytes(data)
    }

    fn write_config_dword(&self, offset: u32, buf: u32) {
        let data: [u8; 4] = buf.to_le_bytes();
        self.write_config(offset, &data)
    }

    fn read_config(&self, offset: u32, data: &mut [u8]) {
        self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
    }

    fn write_config(&self, offset: u32, data: &[u8]) {
        self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
    }
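
    // The helpers above simply layer little-endian accessors over
    // region_read()/region_write() against the PCI configuration region.
    // As an illustrative (hypothetical) use, reading the 32-bit register at
    // offset 0 yields the vendor and device IDs:
    //
    //     let id = vfio.read_config_dword(0);
    //     let vendor_id = (id & 0xffff) as u16; // low word
    //     let device_id = (id >> 16) as u16; // high word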

    fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
    }

    fn disable_msi(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
    }

    fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
    }

    fn disable_msix(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
    }

    fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
        unimplemented!()
    }

    fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
        unimplemented!()
    }

    fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
        unimplemented!()
    }

    fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }
}

struct VfioDeviceWrapper {
    device: Arc<VfioDevice>,
}

impl VfioDeviceWrapper {
    fn new(device: Arc<VfioDevice>) -> Self {
        Self { device }
    }
}

impl Vfio for VfioDeviceWrapper {
    fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
        self.device.region_read(index, data, offset)
    }

    fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
        self.device.region_write(index, data, offset)
    }

    fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
        self.device.get_irq_info(irq_index).copied()
    }

    fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.device
            .enable_irq(irq_index, event_fds)
            .map_err(VfioError::KernelVfio)
    }

    fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .disable_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }

    fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .unmask_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }
}

pub(crate) struct VfioCommon {
    pub(crate) configuration: PciConfiguration,
    pub(crate) mmio_regions: Vec<MmioRegion>,
    pub(crate) interrupt: Interrupt,
}

impl VfioCommon {
    pub(crate) fn allocate_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        vfio_wrapper: &dyn Vfio,
    ) -> Result<Vec<(GuestAddress, GuestUsize, PciBarRegionType)>, PciDeviceError> {
        let mut ranges = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX as u32;

        // Go through all regular regions to compute the BAR sizes.
        // We do not save the BAR addresses in order to restore them,
        // because we are going to allocate a guest address for each BAR
        // and write that new address back.
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let region_size: u64;
            let bar_addr: GuestAddress;

            let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                (PCI_ROM_EXP_BAR_INDEX * 4) as u32
            } else {
                PCI_CONFIG_BAR_OFFSET + bar_id * 4
            };

            // First read the flags.
            let flags = vfio_wrapper.read_config_dword(bar_offset);

            // Is this an I/O BAR?
            let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
            } else {
                false
            };

            // Is this a 64-bit BAR?
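            // Bits 2:1 of a memory BAR encode its type; a value of 0b10
            // (the 0x4 mask tested below) marks a 64-bit BAR, which also
            // consumes the following BAR slot for the upper address bits.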
            let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                matches!(
                    flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                    PCI_CONFIG_MEMORY_BAR_64BIT
                )
            } else {
                false
            };

            // By default, the region type is a 32-bit memory BAR.
            let mut region_type = PciBarRegionType::Memory32BitRegion;

            // To get the size, write all 1s to the register.
            vfio_wrapper.write_config_dword(bar_offset, 0xffff_ffff);

            // And read back the BAR value. The device will write zeros for
            // the bits it doesn't care about.
            let mut lower = vfio_wrapper.read_config_dword(bar_offset);

            if io_bar {
                #[cfg(target_arch = "x86_64")]
                {
                    // I/O BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Mask the flag bits (the lowest 2 for I/O BARs).
                    lower &= !0b11;

                    // BAR is not enabled.
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert the bits and add 1 to calculate the size.
                    region_size = (!lower + 1) as u64;

                    // The address needs to be 4-byte aligned.
                    bar_addr = allocator
                        .allocate_io_addresses(None, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
                }
                #[cfg(target_arch = "aarch64")]
                unimplemented!()
            } else if is_64bit_bar {
                // 64-bit memory BAR
                region_type = PciBarRegionType::Memory64BitRegion;

                // Query the size of the upper half of the 64-bit BAR.
                let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                vfio_wrapper.write_config_dword(upper_offset, 0xffff_ffff);
                let upper = vfio_wrapper.read_config_dword(upper_offset);

                let mut combined_size = u64::from(upper) << 32 | u64::from(lower);

                // Mask out the flag bits (the lowest 4 for memory BARs).
                combined_size &= !0b1111;

                // BAR is not enabled.
                if combined_size == 0 {
                    bar_id += 1;
                    continue;
                }

                // Invert and add 1 to find the size.
                region_size = !combined_size + 1;

                // BAR allocation must be naturally aligned.
                bar_addr = allocator
                    .allocate_mmio_addresses(None, region_size, Some(region_size))
                    .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
            } else {
                // Mask out the flag bits (the lowest 4 for memory BARs).
                lower &= !0b1111;

                if lower == 0 {
                    bar_id += 1;
                    continue;
                }

                // Invert and add 1 to find the size.
                region_size = (!lower + 1) as u64;

                // BAR allocation must be naturally aligned.
                bar_addr = allocator
                    .allocate_mmio_hole_addresses(None, region_size, Some(region_size))
                    .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
            }

            let reg_idx = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                PCI_ROM_EXP_BAR_INDEX
            } else {
                bar_id as usize
            };

            // We can now build our BAR configuration block.
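            // It records the register index, the guest address we just
            // allocated, the decoded size (as a worked example, a 4 KiB
            // 32-bit BAR reads back 0xffff_f000 after masking, and
            // !0xffff_f000 + 1 == 0x1000) and the region type.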
            let config = PciBarConfiguration::default()
                .set_register_index(reg_idx)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                self.configuration
                    .add_pci_rom_bar(&config, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&config)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            ranges.push((bar_addr, region_size, region_type));
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id,
                mem_slot: None,
                host_addr: None,
                mmap_size: None,
            });

            bar_id += 1;
            if is_64bit_bar {
                bar_id += 1;
            }
        }

        Ok(ranges)
    }

    pub(crate) fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
    ) -> Result<(), PciDeviceError> {
        for region in self.mmio_regions.iter() {
            match region.type_ {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "x86_64")]
                    allocator.free_io_addresses(region.start, region.length);
                    #[cfg(target_arch = "aarch64")]
                    error!("I/O region is not supported");
                }
                PciBarRegionType::Memory32BitRegion => {
                    allocator.free_mmio_hole_addresses(region.start, region.length);
                }
                PciBarRegionType::Memory64BitRegion => {
                    allocator.free_mmio_addresses(region.start, region.length);
                }
            }
        }
        Ok(())
    }

    pub(crate) fn parse_msix_capabilities(
        &mut self,
        cap: u8,
        interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        vfio_wrapper: &dyn Vfio,
    ) {
        let msg_ctl = vfio_wrapper.read_config_word((cap + 2).into());

        let table = vfio_wrapper.read_config_dword((cap + 4).into());

        let pba = vfio_wrapper.read_config_dword((cap + 8).into());

        let msix_cap = MsixCap {
            msg_ctl,
            table,
            pba,
        };

        let interrupt_source_group = interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msix_cap.table_size() as InterruptIndex,
            })
            .unwrap();

        let msix_config = MsixConfig::new(msix_cap.table_size(), interrupt_source_group.clone(), 0);

        self.interrupt.msix = Some(VfioMsix {
            bar: msix_config,
            cap: msix_cap,
            cap_offset: cap.into(),
            interrupt_source_group,
        });
    }

    pub(crate) fn parse_msi_capabilities(
        &mut self,
        cap: u8,
        interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        vfio_wrapper: &dyn Vfio,
    ) {
        let msg_ctl = vfio_wrapper.read_config_word((cap + 2).into());

        let interrupt_source_group = interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
            })
            .unwrap();

        let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone());

        self.interrupt.msi = Some(VfioMsi {
            cfg: msi_config,
            cap_offset: cap.into(),
            interrupt_source_group,
        });
    }

    pub(crate) fn parse_capabilities(
        &mut self,
        interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        vfio_wrapper: &dyn Vfio,
    ) {
        let mut cap_next = vfio_wrapper.read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        while cap_next != 0 {
            let cap_id = vfio_wrapper.read_config_byte(cap_next.into());

            match PciCapabilityId::from(cap_id) {
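                // Only the MSI and MSI-X capabilities are parsed and mirrored
                // locally; every other capability is passed through untouched
                // (see the catch-all arm below).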
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Some(irq_info) = vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse the capability only if the VFIO device
                            // supports MSI.
                            self.parse_msi_capabilities(cap_next, interrupt_manager, vfio_wrapper);
                        }
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Some(irq_info) = vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse the capability only if the VFIO device
                            // supports MSI-X.
                            self.parse_msix_capabilities(cap_next, interrupt_manager, vfio_wrapper);
                        }
                    }
                }
                _ => {}
            };

            cap_next = vfio_wrapper.read_config_byte((cap_next + 1).into());
        }
    }

    pub(crate) fn enable_intx(&mut self, wrapper: &dyn Vfio) -> Result<(), VfioPciError> {
        if let Some(intx) = &mut self.interrupt.intx {
            if !intx.enabled {
                if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
                    wrapper
                        .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
                        .map_err(VfioPciError::EnableIntx)?;

                    intx.enabled = true;
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }
        }

        Ok(())
    }

    pub(crate) fn disable_intx(&mut self, wrapper: &dyn Vfio) {
        if let Some(intx) = &mut self.interrupt.intx {
            if intx.enabled {
                if let Err(e) = wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                    error!("Could not disable INTx: {}", e);
                } else {
                    intx.enabled = false;
                }
            }
        }
    }

    pub(crate) fn enable_msi(&self, wrapper: &dyn Vfio) -> Result<(), VfioPciError> {
        if let Some(msi) = &self.interrupt.msi {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msi.cfg.num_enabled_vectors() {
                if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            wrapper
                .enable_msi(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsi)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msi(&self, wrapper: &dyn Vfio) {
        if let Err(e) = wrapper.disable_msi() {
            error!("Could not disable MSI: {}", e);
        }
    }

    pub(crate) fn enable_msix(&self, wrapper: &dyn Vfio) -> Result<(), VfioPciError> {
        if let Some(msix) = &self.interrupt.msix {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msix.bar.table_entries.len() {
                if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            wrapper
                .enable_msix(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsix)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msix(&self, wrapper: &dyn Vfio) {
        if let Err(e) = wrapper.disable_msix() {
            error!("Could not disable MSI-X: {}", e);
        }
    }

    pub(crate) fn initialize_legacy_interrupt(
        &mut self,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        wrapper: &dyn Vfio,
    ) -> Result<(), VfioPciError> {
        if let Some(irq_info) = wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
            if irq_info.count == 0 {
                // A count of 0 means the INTx IRQ is not supported,
                // therefore it shouldn't be initialized.
                return Ok(());
            }
        }

        if let Some(interrupt_source_group) = legacy_interrupt_group {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            self.enable_intx(wrapper)?;
        }

        Ok(())
    }

    pub(crate) fn update_msi_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
        wrapper: &dyn Vfio,
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msi(offset, data) {
            Some(InterruptUpdateAction::EnableMsi) => {
                // Disable INTx before we can enable MSI.
                self.disable_intx(wrapper);
                self.enable_msi(wrapper)?;
            }
            Some(InterruptUpdateAction::DisableMsi) => {
                // Fall back onto INTx when disabling MSI.
                self.disable_msi(wrapper);
                self.enable_intx(wrapper)?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn update_msix_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
        wrapper: &dyn Vfio,
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msix(offset, data) {
            Some(InterruptUpdateAction::EnableMsix) => {
                // Disable INTx before we can enable MSI-X.
                self.disable_intx(wrapper);
                self.enable_msix(wrapper)?;
            }
            Some(InterruptUpdateAction::DisableMsix) => {
                // Fall back onto INTx when disabling MSI-X.
                self.disable_msix(wrapper);
                self.enable_intx(wrapper)?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
        for region in self.mmio_regions.iter() {
            if addr >= region.start.raw_value()
                && addr < region.start.unchecked_add(region.length).raw_value()
            {
                return Some(*region);
            }
        }
        None
    }

    pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8], wrapper: &dyn Vfio) {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_read_table(offset, data);
            } else {
                wrapper.region_read(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest reading from the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }
    }

    pub(crate) fn write_bar(
        &mut self,
        base: u64,
        offset: u64,
        data: &[u8],
        wrapper: &dyn Vfio,
    ) -> Option<Arc<Barrier>> {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            // If the MSI-X table is written to, we need to update our cache.
            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_write_table(offset, data);
            } else {
                wrapper.region_write(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest writing to the BAR potentially means the interrupt has
        // been received and can be acknowledged.
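        // VFIO typically auto-masks a level-triggered INTx when it fires, so
        // unmasking it here is what effectively re-arms the interrupt line.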
        if self.interrupt.intx_in_use() {
            if let Err(e) = wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }

        None
    }

    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
        wrapper: &dyn Vfio,
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming the
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data, wrapper) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data, wrapper) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why, when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through the VFIO_DEVICE_SET_IRQS ioctl, and only then can we
        // write to the device region to update the MSI Enable bit.
        wrapper.write_config((reg + offset) as u32, data);

        None
    }

    pub(crate) fn read_config_register(&mut self, reg_idx: usize, wrapper: &dyn Vfio) -> u32 {
        // When reading the BARs, we trap it and return what comes
        // from our local configuration space. We want the guest to use
        // that and not the VFIO device BARs, as those do not map into
        // the guest address space.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            return self.configuration.read_reg(reg_idx);
        }

        // Since we don't support passing multi-function devices, we should
        // mask the multi-function bit, bit 7 of the Header Type byte in
        // register 3.
        let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
            0xff7f_ffff
        } else {
            0xffff_ffff
        };

        // The config register read comes from the VFIO device itself.
        wrapper.read_config_dword((reg_idx * 4) as u32) & mask
    }
}

/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
pub struct VfioPciDevice {
    vm: Arc<dyn hypervisor::Vm>,
    device: Arc<VfioDevice>,
    container: Arc<VfioContainer>,
    vfio_wrapper: VfioDeviceWrapper,
    common: VfioCommon,
    iommu_attached: bool,
}

impl VfioPciDevice {
    /// Constructs a new VFIO PCI device for the given VFIO device.
    pub fn new(
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        iommu_attached: bool,
    ) -> Result<Self, VfioPciError> {
        let device = Arc::new(device);
        device.reset();

        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            &PciVfioSubclass::VfioSubclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
        );

        let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));

        let mut common = VfioCommon {
            mmio_regions: Vec::new(),
            configuration,
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
        };

        common.parse_capabilities(msi_interrupt_manager, &vfio_wrapper);
        common.initialize_legacy_interrupt(legacy_interrupt_group, &vfio_wrapper)?;

        let vfio_pci_device = VfioPciDevice {
            vm: vm.clone(),
            device,
            container,
            vfio_wrapper,
            common,
            iommu_attached,
        };

        Ok(vfio_pci_device)
    }

    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }

    /// Map MMIO regions into the guest, and avoid VM exits when the guest
    /// tries to reach those regions.
    ///
    /// # Arguments
    ///
    /// * `vm` - The VM object. It is used to set the VFIO MMIO regions
    ///          as user memory regions.
    /// * `mem_slot` - The closure to return a memory slot.
    pub fn map_mmio_regions<F>(
        &mut self,
        vm: &Arc<dyn hypervisor::Vm>,
        mem_slot: F,
    ) -> Result<(), VfioPciError>
    where
        F: Fn() -> u32,
    {
        let fd = self.device.as_raw_fd();

        for region in self.common.mmio_regions.iter_mut() {
            // We want to skip the mapping of the BAR containing the MSI-X
            // table even if it is mappable. The reason is we need to trap
            // any access to the MSI-X table and update the GSI routing
            // accordingly.
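            // The same applies to the BAR holding the Pending Bit Array
            // (PBA), which the guest must also read through our emulation.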
            if let Some(msix) = &self.common.interrupt.msix {
                if region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir() {
                    continue;
                }
            }

            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }
                let (mmap_offset, mmap_size) = self.device.get_region_mmap(region.index);
                let offset = self.device.get_region_offset(region.index) + mmap_offset;

                let host_addr = unsafe {
                    libc::mmap(
                        null_mut(),
                        mmap_size as usize,
                        prot,
                        libc::MAP_SHARED,
                        fd,
                        offset as libc::off_t,
                    )
                };

                if host_addr == libc::MAP_FAILED {
                    error!(
                        "Could not mmap region, error: {}",
                        io::Error::last_os_error()
                    );
                    continue;
                }

                let slot = mem_slot();
                let mem_region = vm.make_user_memory_region(
                    slot,
                    region.start.raw_value() + mmap_offset,
                    mmap_size as u64,
                    host_addr as u64,
                    false,
                    false,
                );

                vm.create_user_memory_region(mem_region)
                    .map_err(VfioPciError::MapRegionGuest)?;

                // Update the region with the memory mapping info.
                region.mem_slot = Some(slot);
                region.host_addr = Some(host_addr as u64);
                region.mmap_size = Some(mmap_size as usize);
            }
        }

        Ok(())
    }

    pub fn unmap_mmio_regions(&mut self) {
        for region in self.common.mmio_regions.iter() {
            if let (Some(host_addr), Some(mmap_size), Some(mem_slot)) =
                (region.host_addr, region.mmap_size, region.mem_slot)
            {
                let (mmap_offset, _) = self.device.get_region_mmap(region.index);

                // Remove the region from the guest.
                let r = self.vm.make_user_memory_region(
                    mem_slot,
                    region.start.raw_value() + mmap_offset,
                    mmap_size as u64,
                    host_addr as u64,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                let ret = unsafe { libc::munmap(host_addr as *mut libc::c_void, mmap_size) };
                if ret != 0 {
                    error!(
                        "Could not unmap region {}, error: {}",
                        region.index,
                        io::Error::last_os_error()
                    );
                }
            }
        }
    }

    pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_map(iova, size, user_addr)
                .map_err(VfioPciError::DmaMap)?;
        }

        Ok(())
    }

    pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_unmap(iova, size)
                .map_err(VfioPciError::DmaUnmap)?;
        }

        Ok(())
    }

    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.common.mmio_regions.clone()
    }
}

impl Drop for VfioPciDevice {
    fn drop(&mut self) {
        self.unmap_mmio_regions();

        if let Some(msix) = &self.common.interrupt.msix {
            if msix.bar.enabled() {
                self.common.disable_msix(&self.vfio_wrapper);
            }
        }

        if let Some(msi) = &self.common.interrupt.msi {
            if msi.cfg.enabled() {
                self.common.disable_msi(&self.vfio_wrapper)
            }
        }

        if self.common.interrupt.intx_in_use() {
            self.common.disable_intx(&self.vfio_wrapper);
        }
    }
}

impl BusDevice for VfioPciDevice {
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}

// First BAR offset in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// I/O BAR when the first BAR bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory BAR flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device.
const BAR_NUMS: usize = 6;
// PCI Header Type register index.
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index.
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index.
const PCI_ROM_EXP_BAR_INDEX: usize = 12;

impl PciDevice for VfioPciDevice {
    fn allocate_bars(
        &mut self,
        allocator: &mut SystemAllocator,
    ) -> Result<Vec<(GuestAddress, GuestUsize, PciBarRegionType)>, PciDeviceError> {
        self.common.allocate_bars(allocator, &self.vfio_wrapper)
    }

    fn free_bars(&mut self, allocator: &mut SystemAllocator) -> Result<(), PciDeviceError> {
        self.common.free_bars(allocator)
    }

    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        self.common
            .write_config_register(reg_idx, offset, data, &self.vfio_wrapper)
    }

    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        self.common
            .read_config_register(reg_idx, &self.vfio_wrapper)
    }

    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.common
            .configuration
            .detect_bar_reprogramming(reg_idx, data)
    }

    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.common.read_bar(base, offset, data, &self.vfio_wrapper)
    }

    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.common
            .write_bar(base, offset, data, &self.vfio_wrapper)
    }

    fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
        for region in self.common.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                if let Some(mem_slot) = region.mem_slot {
                    if let Some(host_addr) = region.host_addr {
                        let (mmap_offset, mmap_size) = self.device.get_region_mmap(region.index);

                        // Remove the old region.
                        let old_mem_region = self.vm.make_user_memory_region(
                            mem_slot,
                            old_base + mmap_offset,
                            mmap_size as u64,
                            host_addr as u64,
                            false,
                            false,
                        );

                        self.vm
                            .remove_user_memory_region(old_mem_region)
                            .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

                        // Insert the new region.
                        let new_mem_region = self.vm.make_user_memory_region(
                            mem_slot,
                            new_base + mmap_offset,
                            mmap_size as u64,
                            host_addr as u64,
                            false,
                            false,
                        );

                        self.vm
                            .create_user_memory_region(new_mem_region)
                            .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
                    }
                }
            }
        }

        Ok(())
    }

    fn as_any(&mut self) -> &mut dyn Any {
        self
    }
}