// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//

use crate::{
    msi_num_enabled_vectors, BarReprogrammingParams, MsiConfig, MsixCap, MsixConfig,
    PciBarConfiguration, PciBarRegionType, PciCapabilityId, PciClassCode, PciConfiguration,
    PciDevice, PciDeviceError, PciHeaderType, PciSubclass, MSIX_TABLE_ENTRY_SIZE,
};
use byteorder::{ByteOrder, LittleEndian};
use hypervisor::HypervisorVmError;
use std::any::Any;
use std::io;
use std::os::unix::io::AsRawFd;
use std::ptr::null_mut;
use std::sync::{Arc, Barrier};
use thiserror::Error;
use vfio_bindings::bindings::vfio::*;
use vfio_ioctls::{VfioContainer, VfioDevice, VfioIrq};
use vm_allocator::SystemAllocator;
use vm_device::interrupt::{
    InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
};
use vm_device::BusDevice;
use vm_memory::{Address, GuestAddress, GuestUsize};
use vmm_sys_util::eventfd::EventFd;

#[derive(Debug, Error)]
pub enum VfioPciError {
    #[error("Failed to DMA map: {0}")]
    DmaMap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to DMA unmap: {0}")]
    DmaUnmap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to enable INTx: {0}")]
    EnableIntx(#[source] VfioError),
    #[error("Failed to enable MSI: {0}")]
    EnableMsi(#[source] VfioError),
    #[error("Failed to enable MSI-X: {0}")]
    EnableMsix(#[source] VfioError),
    #[error("Failed to map VFIO PCI region into guest: {0}")]
    MapRegionGuest(#[source] HypervisorVmError),
    #[error("Failed to get the notifier's eventfd")]
    MissingNotifier,
}

#[derive(Copy, Clone)]
enum PciVfioSubclass {
    VfioSubclass = 0xff,
}

impl PciSubclass for PciVfioSubclass {
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}

enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}

pub(crate) struct VfioIntx {
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    enabled: bool,
}

pub(crate) struct VfioMsi {
    pub(crate) cfg: MsiConfig,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsi {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.cfg.enabled();

        self.cfg.update(offset, data);

        let new_enabled = self.cfg.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsi);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsi);
        }

        None
    }
}

pub(crate) struct VfioMsix {
    pub(crate) bar: MsixConfig,
    cap: MsixCap,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsix {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.bar.enabled();

        // Update the "Message Control" word.
        if offset == 2 && data.len() == 2 {
            self.bar.set_msg_ctl(LittleEndian::read_u16(data));
        }

        let new_enabled = self.bar.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsix);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsix);
        }

        None
    }

    fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        let table_offset: u64 = u64::from(self.cap.table_offset());
        let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
        let table_bir: u32 = self.cap.table_bir();

        bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
    }
}
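
#[cfg(test)]
mod msix_table_range_tests {
    use super::*;

    // A minimal sketch of the range check performed by
    // `VfioMsix::table_accessed()` above, using hypothetical capability
    // values instead of a real `MsixCap`: an offset hits the MSI-X table
    // iff it falls within [table_offset, table_offset + entries * entry size).
    #[test]
    fn offset_within_msix_table() {
        let table_offset: u64 = 0;
        let entries: u64 = 4;
        let table_size = entries * MSIX_TABLE_ENTRY_SIZE as u64;

        let accessed = |offset: u64| offset >= table_offset && offset < table_offset + table_size;

        assert!(accessed(0)); // first byte of the table
        assert!(accessed(table_size - 1)); // last byte of the table
        assert!(!accessed(table_size)); // first byte past the table
    }
}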

pub(crate) struct Interrupt {
    pub(crate) intx: Option<VfioIntx>,
    pub(crate) msi: Option<VfioMsi>,
    pub(crate) msix: Option<VfioMsix>,
}

impl Interrupt {
    fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msi) = &mut self.msi {
            return msi.update(offset, data);
        }

        None
    }

    fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msix) = &mut self.msix {
            return msix.update(offset, data);
        }

        None
    }

    fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
        if let Some(msi) = &self.msi {
            if offset >= u64::from(msi.cap_offset)
                && offset < u64::from(msi.cap_offset) + msi.cfg.size()
            {
                return Some((
                    PciCapabilityId::MessageSignalledInterrupts,
                    u64::from(msi.cap_offset),
                ));
            }
        }

        if let Some(msix) = &self.msix {
            if offset == u64::from(msix.cap_offset) {
                return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
            }
        }

        None
    }

    fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        if let Some(msix) = &self.msix {
            return msix.table_accessed(bar_index, offset);
        }

        false
    }

    fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
        if let Some(ref mut msix) = &mut self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.write_table(offset, data)
        }
    }

    fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
        if let Some(msix) = &self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.read_table(offset, data)
        }
    }

    pub(crate) fn intx_in_use(&self) -> bool {
        if let Some(intx) = &self.intx {
            return intx.enabled;
        }

        false
    }
}

#[derive(Copy, Clone)]
pub struct MmioRegion {
    pub start: GuestAddress,
    pub length: GuestUsize,
    pub(crate) type_: PciBarRegionType,
    pub(crate) index: u32,
    pub(crate) mem_slot: Option<u32>,
    pub(crate) host_addr: Option<u64>,
    pub(crate) mmap_size: Option<usize>,
}

#[derive(Debug, Error)]
pub enum VfioError {
    #[error("Kernel VFIO error: {0}")]
    KernelVfio(#[source] vfio_ioctls::VfioError),
    #[error("VFIO user error: {0}")]
    VfioUser(#[source] vfio_user::Error),
}

pub(crate) trait Vfio {
    fn read_config_byte(&self, offset: u32) -> u8 {
        let mut data: [u8; 1] = [0];
        self.read_config(offset, &mut data);
        data[0]
    }

    fn read_config_word(&self, offset: u32) -> u16 {
        let mut data: [u8; 2] = [0, 0];
        self.read_config(offset, &mut data);
        u16::from_le_bytes(data)
    }

    fn read_config_dword(&self, offset: u32) -> u32 {
        let mut data: [u8; 4] = [0, 0, 0, 0];
        self.read_config(offset, &mut data);
        u32::from_le_bytes(data)
    }

    fn write_config_dword(&self, offset: u32, buf: u32) {
        let data: [u8; 4] = buf.to_le_bytes();
        self.write_config(offset, &data)
    }

    fn read_config(&self, offset: u32, data: &mut [u8]) {
        self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
    }

    fn write_config(&self, offset: u32, data: &[u8]) {
        self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
    }

    fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
    }

    fn disable_msi(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
    }

    fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
    }

    fn disable_msix(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
    }

    fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
        unimplemented!()
    }

    fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
        unimplemented!()
    }

    fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
        unimplemented!()
    }

    fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }
}
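
#[cfg(test)]
mod vfio_trait_tests {
    use super::*;
    use std::cell::RefCell;

    // A minimal in-memory stand-in for a VFIO device, used only to
    // exercise the default config accessors of the `Vfio` trait above.
    // `FakeVfio` is hypothetical and not part of the crate.
    struct FakeVfio {
        config: RefCell<Vec<u8>>,
    }

    impl Vfio for FakeVfio {
        fn region_read(&self, _index: u32, offset: u64, data: &mut [u8]) {
            let config = self.config.borrow();
            let start = offset as usize;
            data.copy_from_slice(&config[start..start + data.len()]);
        }

        fn region_write(&self, _index: u32, offset: u64, data: &[u8]) {
            let mut config = self.config.borrow_mut();
            let start = offset as usize;
            config[start..start + data.len()].copy_from_slice(data);
        }
    }

    // The default byte/word/dword accessors must compose little-endian
    // values out of the underlying region bytes.
    #[test]
    fn little_endian_config_accessors() {
        let fake = FakeVfio {
            config: RefCell::new(vec![0u8; 256]),
        };

        fake.write_config_dword(0, 0x1234_5678);
        assert_eq!(fake.read_config_byte(0), 0x78);
        assert_eq!(fake.read_config_word(0), 0x5678);
        assert_eq!(fake.read_config_dword(0), 0x1234_5678);
    }
}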

struct VfioDeviceWrapper {
    device: Arc<VfioDevice>,
}

impl VfioDeviceWrapper {
    fn new(device: Arc<VfioDevice>) -> Self {
        Self { device }
    }
}

impl Vfio for VfioDeviceWrapper {
    fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
        self.device.region_read(index, data, offset)
    }

    fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
        self.device.region_write(index, data, offset)
    }

    fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
        self.device.get_irq_info(irq_index).copied()
    }

    fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.device
            .enable_irq(irq_index, event_fds)
            .map_err(VfioError::KernelVfio)
    }

    fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .disable_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }

    fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .unmask_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }
}

pub(crate) struct VfioCommon {
    pub(crate) configuration: PciConfiguration,
    pub(crate) mmio_regions: Vec<MmioRegion>,
    pub(crate) interrupt: Interrupt,
}

impl VfioCommon {
    pub(crate) fn allocate_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        vfio_wrapper: &dyn Vfio,
    ) -> Result<Vec<(GuestAddress, GuestUsize, PciBarRegionType)>, PciDeviceError> {
        let mut ranges = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX as u32;

        // Go through all the regular regions to compute each BAR size.
        // We're not saving the BAR address to restore it, because we
        // are going to allocate a guest address for each BAR and write
        // that new address back.
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let region_size: u64;
            let bar_addr: GuestAddress;

            let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                (PCI_ROM_EXP_BAR_INDEX * 4) as u32
            } else {
                PCI_CONFIG_BAR_OFFSET + bar_id * 4
            };

            // First read the flags.
            let flags = vfio_wrapper.read_config_dword(bar_offset);

            // Is this an I/O BAR?
            let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
            } else {
                false
            };

            // Is this a 64-bit BAR?
            let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                matches!(
                    flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                    PCI_CONFIG_MEMORY_BAR_64BIT
                )
            } else {
                false
            };

            // By default, the region type is a 32-bit memory BAR.
            let mut region_type = PciBarRegionType::Memory32BitRegion;

            // To get the size, write all 1s to the BAR register.
            vfio_wrapper.write_config_dword(bar_offset, 0xffff_ffff);

            // And read back the BAR value. The device will write zeros
            // for the bits it doesn't care about.
            let mut lower = vfio_wrapper.read_config_dword(bar_offset);

            if io_bar {
                #[cfg(target_arch = "x86_64")]
                {
                    // I/O BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Mask the flag bits (lowest 2 for I/O BARs).
                    lower &= !0b11;

                    // BAR is not enabled.
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert the bits and add 1 to calculate the size.
                    region_size = (!lower + 1) as u64;

                    // The address needs to be 4 bytes aligned.
                    bar_addr = allocator
                        .allocate_io_addresses(None, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
                }
                #[cfg(target_arch = "aarch64")]
                unimplemented!()
            } else if is_64bit_bar {
                // 64-bit memory BAR
                region_type = PciBarRegionType::Memory64BitRegion;

                // Query the size of the upper half of the 64-bit BAR.
                let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                vfio_wrapper.write_config_dword(upper_offset, 0xffff_ffff);
                let upper = vfio_wrapper.read_config_dword(upper_offset);

                let mut combined_size = u64::from(upper) << 32 | u64::from(lower);

                // Mask out the flag bits (lowest 4 for memory BARs).
                combined_size &= !0b1111;

                // BAR is not enabled.
                if combined_size == 0 {
                    bar_id += 1;
                    continue;
                }

                // Invert and add 1 to find the size.
                region_size = !combined_size + 1;

                // BAR allocation must be naturally aligned.
                bar_addr = allocator
                    .allocate_mmio_addresses(None, region_size, Some(region_size))
                    .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
            } else {
                // Mask out the flag bits (lowest 4 for memory BARs).
                lower &= !0b1111;

                // BAR is not enabled.
                if lower == 0 {
                    bar_id += 1;
                    continue;
                }

                // Invert and add 1 to find the size.
                region_size = (!lower + 1) as u64;

                // BAR allocation must be naturally aligned.
                bar_addr = allocator
                    .allocate_mmio_hole_addresses(None, region_size, Some(region_size))
                    .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
            }
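
            // Worked example of the sizing above, with a hypothetical
            // read-back value: a 32-bit memory BAR returning 0xfff0_0004
            // after the all-1s write gives 0xfff0_0000 once the four low
            // flag bits are masked, and !0xfff0_0000 + 1 = 0x0010_0000,
            // i.e. a 1 MiB region.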

            let reg_idx = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                PCI_ROM_EXP_BAR_INDEX
            } else {
                bar_id as usize
            };

            // We can now build our BAR configuration block.
            let config = PciBarConfiguration::default()
                .set_register_index(reg_idx)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                self.configuration
                    .add_pci_rom_bar(&config, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&config)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            ranges.push((bar_addr, region_size, region_type));
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id as u32,
                mem_slot: None,
                host_addr: None,
                mmap_size: None,
            });

            bar_id += 1;
            if is_64bit_bar {
                bar_id += 1;
            }
        }

        Ok(ranges)
    }

    pub(crate) fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
    ) -> Result<(), PciDeviceError> {
        for region in self.mmio_regions.iter() {
            match region.type_ {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "x86_64")]
                    allocator.free_io_addresses(region.start, region.length);
                    #[cfg(target_arch = "aarch64")]
                    error!("I/O region is not supported");
                }
                PciBarRegionType::Memory32BitRegion => {
                    allocator.free_mmio_hole_addresses(region.start, region.length);
                }
                PciBarRegionType::Memory64BitRegion => {
                    allocator.free_mmio_addresses(region.start, region.length);
                }
            }
        }
        Ok(())
    }

    pub(crate) fn parse_msix_capabilities(
        &mut self,
        cap: u8,
        interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        vfio_wrapper: &dyn Vfio,
    ) {
        // MSI-X capability layout: the "Message Control" word at cap + 2,
        // the table offset/BIR dword at cap + 4, and the PBA offset/BIR
        // dword at cap + 8.
        let msg_ctl = vfio_wrapper.read_config_word((cap + 2).into());

        let table = vfio_wrapper.read_config_dword((cap + 4).into());

        let pba = vfio_wrapper.read_config_dword((cap + 8).into());

        let msix_cap = MsixCap {
            msg_ctl,
            table,
            pba,
        };

        let interrupt_source_group = interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msix_cap.table_size() as InterruptIndex,
            })
            .unwrap();

        let msix_config = MsixConfig::new(msix_cap.table_size(), interrupt_source_group.clone(), 0);

        self.interrupt.msix = Some(VfioMsix {
            bar: msix_config,
            cap: msix_cap,
            cap_offset: cap.into(),
            interrupt_source_group,
        });
    }

    pub(crate) fn parse_msi_capabilities(
        &mut self,
        cap: u8,
        interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        vfio_wrapper: &dyn Vfio,
    ) {
        let msg_ctl = vfio_wrapper.read_config_word((cap + 2).into());

        let interrupt_source_group = interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
            })
            .unwrap();

        let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone());

        self.interrupt.msi = Some(VfioMsi {
            cfg: msi_config,
            cap_offset: cap.into(),
            interrupt_source_group,
        });
    }

    pub(crate) fn parse_capabilities(
        &mut self,
        interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        vfio_wrapper: &dyn Vfio,
    ) {
        let mut cap_next = vfio_wrapper.read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        while cap_next != 0 {
            let cap_id = vfio_wrapper.read_config_byte(cap_next.into());

            match PciCapabilityId::from(cap_id) {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Some(irq_info) = vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse the capability only if the VFIO device
                            // supports MSI.
                            self.parse_msi_capabilities(cap_next, interrupt_manager, vfio_wrapper);
                        }
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Some(irq_info) = vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse the capability only if the VFIO device
                            // supports MSI-X.
                            self.parse_msix_capabilities(cap_next, interrupt_manager, vfio_wrapper);
                        }
                    }
                }
                _ => {}
            };

            // The next capability pointer lives one byte past the
            // capability ID.
            cap_next = vfio_wrapper.read_config_byte((cap_next + 1).into());
        }
    }

    pub(crate) fn enable_intx(&mut self, wrapper: &dyn Vfio) -> Result<(), VfioPciError> {
        if let Some(intx) = &mut self.interrupt.intx {
            if !intx.enabled {
                if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
                    wrapper
                        .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
                        .map_err(VfioPciError::EnableIntx)?;

                    intx.enabled = true;
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }
        }

        Ok(())
    }

    pub(crate) fn disable_intx(&mut self, wrapper: &dyn Vfio) {
        if let Some(intx) = &mut self.interrupt.intx {
            if intx.enabled {
                if let Err(e) = wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                    error!("Could not disable INTx: {}", e);
                } else {
                    intx.enabled = false;
                }
            }
        }
    }

    pub(crate) fn enable_msi(&self, wrapper: &dyn Vfio) -> Result<(), VfioPciError> {
        if let Some(msi) = &self.interrupt.msi {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msi.cfg.num_enabled_vectors() {
                if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            wrapper
                .enable_msi(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsi)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msi(&self, wrapper: &dyn Vfio) {
        if let Err(e) = wrapper.disable_msi() {
            error!("Could not disable MSI: {}", e);
        }
    }

    pub(crate) fn enable_msix(&self, wrapper: &dyn Vfio) -> Result<(), VfioPciError> {
        if let Some(msix) = &self.interrupt.msix {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msix.bar.table_entries.len() {
                if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            wrapper
                .enable_msix(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsix)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msix(&self, wrapper: &dyn Vfio) {
        if let Err(e) = wrapper.disable_msix() {
            error!("Could not disable MSI-X: {}", e);
        }
    }

    pub(crate) fn initialize_legacy_interrupt(
        &mut self,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        wrapper: &dyn Vfio,
    ) -> Result<(), VfioPciError> {
        if let Some(irq_info) = wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
            if irq_info.count == 0 {
                // A count of 0 means the INTx IRQ is not supported,
                // therefore it shouldn't be initialized.
                return Ok(());
            }
        }

        if let Some(interrupt_source_group) = legacy_interrupt_group {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            self.enable_intx(wrapper)?;
        }

        Ok(())
    }

    pub(crate) fn update_msi_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
        wrapper: &dyn Vfio,
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msi(offset, data) {
            Some(InterruptUpdateAction::EnableMsi) => {
                // Disable INTx before we can enable MSI.
                self.disable_intx(wrapper);
                self.enable_msi(wrapper)?;
            }
            Some(InterruptUpdateAction::DisableMsi) => {
                // Fall back onto INTx when disabling MSI.
                self.disable_msi(wrapper);
                self.enable_intx(wrapper)?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn update_msix_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
        wrapper: &dyn Vfio,
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msix(offset, data) {
            Some(InterruptUpdateAction::EnableMsix) => {
                // Disable INTx before we can enable MSI-X.
                self.disable_intx(wrapper);
                self.enable_msix(wrapper)?;
            }
            Some(InterruptUpdateAction::DisableMsix) => {
                // Fall back onto INTx when disabling MSI-X.
                self.disable_msix(wrapper);
                self.enable_intx(wrapper)?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
        for region in self.mmio_regions.iter() {
            if addr >= region.start.raw_value()
                && addr < region.start.unchecked_add(region.length).raw_value()
            {
                return Some(*region);
            }
        }
        None
    }

    pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8], wrapper: &dyn Vfio) {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_read_table(offset, data);
            } else {
                wrapper.region_read(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest reading from the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }
    }

    pub(crate) fn write_bar(
        &mut self,
        base: u64,
        offset: u64,
        data: &[u8],
        wrapper: &dyn Vfio,
    ) -> Option<Arc<Barrier>> {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            // If the MSI-X table is written to, we need to update our cache.
            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_write_table(offset, data);
            } else {
                wrapper.region_write(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest writing to the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }

        None
    }

    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
        wrapper: &dyn Vfio,
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming the
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            // Offset of the access relative to the capability base.
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data, wrapper) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data, wrapper) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through the VFIO_DEVICE_SET_IRQS ioctl, and only then can we
        // write to the device region to update the MSI Enable bit.
        wrapper.write_config((reg + offset) as u32, data);

        None
    }

    pub(crate) fn read_config_register(&mut self, reg_idx: usize, wrapper: &dyn Vfio) -> u32 {
        // When reading the BARs, we trap it and return what comes
        // from our local configuration space. We want the guest to
        // use that and not the VFIO device BARs, as those do not map
        // into the guest address space.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            return self.configuration.read_reg(reg_idx);
        }

        // Since we don't support passing multi-function devices, we should
        // mask the multi-function bit, bit 7 of the Header Type byte in
        // register 3.
        let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
            0xff7f_ffff
        } else {
            0xffff_ffff
        };

        // The config register read comes from the VFIO device itself.
        wrapper.read_config_dword((reg_idx * 4) as u32) & mask
    }
}
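
#[cfg(test)]
mod header_type_mask_tests {
    // A minimal sketch of the masking done in read_config_register()
    // above: the multi-function flag is bit 7 of the Header Type byte,
    // which is bit 23 of the 32-bit register 3. The register value used
    // here is hypothetical.
    #[test]
    fn multifunction_bit_is_cleared() {
        let reg3: u32 = 0x00ff_0000;
        assert_eq!(reg3 & 0xff7f_ffff, 0x007f_0000);
    }
}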
/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
pub struct VfioPciDevice {
    vm: Arc<dyn hypervisor::Vm>,
    device: Arc<VfioDevice>,
    container: Arc<VfioContainer>,
    vfio_wrapper: VfioDeviceWrapper,
    common: VfioCommon,
    iommu_attached: bool,
}

impl VfioPciDevice {
    /// Constructs a new VFIO PCI device for the given VFIO device.
    pub fn new(
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        iommu_attached: bool,
    ) -> Result<Self, VfioPciError> {
        let device = Arc::new(device);
        device.reset();

        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            &PciVfioSubclass::VfioSubclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
        );

        let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));

        let mut common = VfioCommon {
            mmio_regions: Vec::new(),
            configuration,
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
        };

        common.parse_capabilities(msi_interrupt_manager, &vfio_wrapper);
        common.initialize_legacy_interrupt(legacy_interrupt_group, &vfio_wrapper)?;

        let vfio_pci_device = VfioPciDevice {
            vm: vm.clone(),
            device,
            container,
            vfio_wrapper,
            common,
            iommu_attached,
        };

        Ok(vfio_pci_device)
    }

    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }

    /// Map MMIO regions into the guest, and avoid VM exits when the guest
    /// tries to reach those regions.
    ///
    /// # Arguments
    ///
    /// * `vm` - The VM object. It is used to set the VFIO MMIO regions
    ///   as user memory regions.
    /// * `mem_slot` - The closure to return a memory slot.
    pub fn map_mmio_regions<F>(
        &mut self,
        vm: &Arc<dyn hypervisor::Vm>,
        mem_slot: F,
    ) -> Result<(), VfioPciError>
    where
        F: Fn() -> u32,
    {
        let fd = self.device.as_raw_fd();

        for region in self.common.mmio_regions.iter_mut() {
            // We want to skip the mapping of the BAR containing the MSI-X
            // table even if it is mappable. The reason is we need to trap
            // any access to the MSI-X table and update the GSI routing
            // accordingly.
            if let Some(msix) = &self.common.interrupt.msix {
                if region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir() {
                    continue;
                }
            }

            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }
                let (mmap_offset, mmap_size) = self.device.get_region_mmap(region.index);
                let offset = self.device.get_region_offset(region.index) + mmap_offset;

                let host_addr = unsafe {
                    libc::mmap(
                        null_mut(),
                        mmap_size as usize,
                        prot,
                        libc::MAP_SHARED,
                        fd,
                        offset as libc::off_t,
                    )
                };

                if host_addr == libc::MAP_FAILED {
                    error!(
                        "Could not mmap region, error: {}",
                        io::Error::last_os_error()
                    );
                    continue;
                }

                let slot = mem_slot();
                let mem_region = vm.make_user_memory_region(
                    slot,
                    region.start.raw_value() + mmap_offset,
                    mmap_size as u64,
                    host_addr as u64,
                    false,
                    false,
                );

                vm.create_user_memory_region(mem_region)
                    .map_err(VfioPciError::MapRegionGuest)?;

                // Update the region with the memory mapping info.
                region.mem_slot = Some(slot);
                region.host_addr = Some(host_addr as u64);
                region.mmap_size = Some(mmap_size as usize);
            }
        }

        Ok(())
    }

    pub fn unmap_mmio_regions(&mut self) {
        for region in self.common.mmio_regions.iter() {
            if let (Some(host_addr), Some(mmap_size), Some(mem_slot)) =
                (region.host_addr, region.mmap_size, region.mem_slot)
            {
                let (mmap_offset, _) = self.device.get_region_mmap(region.index);

                // Remove the region from the guest.
                let r = self.vm.make_user_memory_region(
                    mem_slot,
                    region.start.raw_value() + mmap_offset,
                    mmap_size as u64,
                    host_addr as u64,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                let ret = unsafe { libc::munmap(host_addr as *mut libc::c_void, mmap_size) };
                if ret != 0 {
                    error!(
                        "Could not unmap region {}, error: {}",
                        region.index,
                        io::Error::last_os_error()
                    );
                }
            }
        }
    }

    pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_map(iova, size, user_addr)
                .map_err(VfioPciError::DmaMap)?;
        }

        Ok(())
    }

    pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_unmap(iova, size)
                .map_err(VfioPciError::DmaUnmap)?;
        }

        Ok(())
    }

    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.common.mmio_regions.clone()
    }
}

impl Drop for VfioPciDevice {
    fn drop(&mut self) {
        self.unmap_mmio_regions();

        if let Some(msix) = &self.common.interrupt.msix {
            if msix.bar.enabled() {
                self.common.disable_msix(&self.vfio_wrapper);
            }
        }

        if let Some(msi) = &self.common.interrupt.msi {
            if msi.cfg.enabled() {
                self.common.disable_msi(&self.vfio_wrapper)
            }
        }

        if self.common.interrupt.intx_in_use() {
            self.common.disable_intx(&self.vfio_wrapper);
        }
    }
}

impl BusDevice for VfioPciDevice {
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}

// First BAR offset in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// I/O BAR when the first BAR bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory BAR flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device.
const BAR_NUMS: usize = 6;
// PCI Header Type register index.
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index.
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index.
const PCI_ROM_EXP_BAR_INDEX: usize = 12;

impl PciDevice for VfioPciDevice {
    fn allocate_bars(
        &mut self,
        allocator: &mut SystemAllocator,
    ) -> Result<Vec<(GuestAddress, GuestUsize, PciBarRegionType)>, PciDeviceError> {
        self.common.allocate_bars(allocator, &self.vfio_wrapper)
    }

    fn free_bars(&mut self, allocator: &mut SystemAllocator) -> Result<(), PciDeviceError> {
        self.common.free_bars(allocator)
    }

    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        self.common
            .write_config_register(reg_idx, offset, data, &self.vfio_wrapper)
    }

    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        self.common
            .read_config_register(reg_idx, &self.vfio_wrapper)
    }

    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.common
            .configuration
            .detect_bar_reprogramming(reg_idx, data)
    }

    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.common.read_bar(base, offset, data, &self.vfio_wrapper)
    }

    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.common
            .write_bar(base, offset, data, &self.vfio_wrapper)
    }

    fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
        for region in self.common.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                if let Some(mem_slot) = region.mem_slot {
                    if let Some(host_addr) = region.host_addr {
                        let (mmap_offset, mmap_size) = self.device.get_region_mmap(region.index);

                        // Remove the old region.
                        let old_mem_region = self.vm.make_user_memory_region(
                            mem_slot,
                            old_base + mmap_offset,
                            mmap_size as u64,
                            host_addr as u64,
                            false,
                            false,
                        );

                        self.vm
                            .remove_user_memory_region(old_mem_region)
                            .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

                        // Insert the new region.
                        let new_mem_region = self.vm.make_user_memory_region(
                            mem_slot,
                            new_base + mmap_offset,
                            mmap_size as u64,
                            host_addr as u64,
                            false,
                            false,
                        );

                        self.vm
                            .create_user_memory_region(new_mem_region)
                            .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
                    }
                }
            }
        }

        Ok(())
    }

    fn as_any(&mut self) -> &mut dyn Any {
        self
    }
}
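
#[cfg(test)]
mod capability_walk_tests {
    // A minimal sketch of the capability-list traversal performed by
    // VfioCommon::parse_capabilities(), run over a raw in-memory config
    // space buffer; all offsets and IDs below are hypothetical.
    #[test]
    fn walk_capability_chain() {
        let mut cfg = [0u8; 256];
        cfg[0x34] = 0x50; // capabilities pointer
        cfg[0x50] = 0x05; // MSI capability ID
        cfg[0x51] = 0x70; // next capability pointer
        cfg[0x70] = 0x11; // MSI-X capability ID
        cfg[0x71] = 0x00; // end of the list

        let mut caps = Vec::new();
        let mut cap_next = cfg[0x34];
        while cap_next != 0 {
            caps.push(cfg[cap_next as usize]);
            cap_next = cfg[cap_next as usize + 1];
        }

        assert_eq!(caps, vec![0x05, 0x11]);
    }
}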