xref: /cloud-hypervisor/pci/src/vfio.rs (revision f67b3f79ea19c9a66e04074cbbf5d292f6529e43)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 
6 use crate::{
7     msi_num_enabled_vectors, BarReprogrammingParams, MsiConfig, MsixCap, MsixConfig,
8     PciBarConfiguration, PciBarRegionType, PciCapabilityId, PciClassCode, PciConfiguration,
9     PciDevice, PciDeviceError, PciHeaderType, PciSubclass, MSIX_TABLE_ENTRY_SIZE,
10 };
11 use byteorder::{ByteOrder, LittleEndian};
12 use hypervisor::HypervisorVmError;
13 use std::any::Any;
14 use std::io;
15 use std::os::unix::io::AsRawFd;
16 use std::ptr::null_mut;
17 use std::sync::{Arc, Barrier};
18 use thiserror::Error;
19 use vfio_bindings::bindings::vfio::*;
20 use vfio_ioctls::{VfioContainer, VfioDevice, VfioIrq};
21 use vm_allocator::SystemAllocator;
22 use vm_device::interrupt::{
23     InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
24 };
25 use vm_device::BusDevice;
26 use vm_memory::{Address, GuestAddress, GuestUsize};
27 use vmm_sys_util::eventfd::EventFd;
28 
29 #[derive(Debug, Error)]
30 pub enum VfioPciError {
31     #[error("Failed to DMA map: {0}")]
32     DmaMap(#[source] vfio_ioctls::VfioError),
33     #[error("Failed to DMA unmap: {0}")]
34     DmaUnmap(#[source] vfio_ioctls::VfioError),
35     #[error("Failed to enable INTx: {0}")]
36     EnableIntx(#[source] VfioError),
37     #[error("Failed to enable MSI: {0}")]
38     EnableMsi(#[source] VfioError),
39     #[error("Failed to enable MSI-x: {0}")]
40     EnableMsix(#[source] VfioError),
41     #[error("Failed to map VFIO PCI region into guest: {0}")]
42     MapRegionGuest(#[source] HypervisorVmError),
43     #[error("Failed to notifier's eventfd")]
44     MissingNotifier,
45 }
46 
47 #[derive(Copy, Clone)]
48 enum PciVfioSubclass {
49     VfioSubclass = 0xff,
50 }
51 
52 impl PciSubclass for PciVfioSubclass {
53     fn get_register_value(&self) -> u8 {
54         *self as u8
55     }
56 }
57 
// Action implied by a guest write to an MSI or MSI-X capability: the
// VMM must mirror the enable/disable toggle onto the VFIO device.
enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}
64 
// Legacy INTx interrupt state for a VFIO PCI device.
pub(crate) struct VfioIntx {
    // Interrupt group used to trigger the INTx line into the guest.
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    // Whether the INTx IRQ is currently enabled on the VFIO device.
    enabled: bool,
}
69 
// MSI state: the cached MSI capability plus the interrupt group backing
// the MSI vectors.
pub(crate) struct VfioMsi {
    pub(crate) cfg: MsiConfig,
    // Offset of the MSI capability in the PCI configuration space.
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}
75 
76 impl VfioMsi {
77     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
78         let old_enabled = self.cfg.enabled();
79 
80         self.cfg.update(offset, data);
81 
82         let new_enabled = self.cfg.enabled();
83 
84         if !old_enabled && new_enabled {
85             return Some(InterruptUpdateAction::EnableMsi);
86         }
87 
88         if old_enabled && !new_enabled {
89             return Some(InterruptUpdateAction::DisableMsi);
90         }
91 
92         None
93     }
94 }
95 
// MSI-X state: the emulated table/PBA (`bar`), the cached capability
// and the interrupt group backing the MSI-X vectors.
pub(crate) struct VfioMsix {
    pub(crate) bar: MsixConfig,
    cap: MsixCap,
    // Offset of the MSI-X capability in the PCI configuration space.
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}
102 
103 impl VfioMsix {
104     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
105         let old_enabled = self.bar.enabled();
106 
107         // Update "Message Control" word
108         if offset == 2 && data.len() == 2 {
109             self.bar.set_msg_ctl(LittleEndian::read_u16(data));
110         }
111 
112         let new_enabled = self.bar.enabled();
113 
114         if !old_enabled && new_enabled {
115             return Some(InterruptUpdateAction::EnableMsix);
116         }
117 
118         if old_enabled && !new_enabled {
119             return Some(InterruptUpdateAction::DisableMsix);
120         }
121 
122         None
123     }
124 
125     fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
126         let table_offset: u64 = u64::from(self.cap.table_offset());
127         let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
128         let table_bir: u32 = self.cap.table_bir();
129 
130         bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
131     }
132 }
133 
// Aggregated interrupt state (INTx, MSI and MSI-X) for a VFIO device.
// Each mode is Some(_) only when the matching capability/IRQ exists.
pub(crate) struct Interrupt {
    pub(crate) intx: Option<VfioIntx>,
    pub(crate) msi: Option<VfioMsi>,
    pub(crate) msix: Option<VfioMsix>,
}
139 
140 impl Interrupt {
141     fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
142         if let Some(ref mut msi) = &mut self.msi {
143             let action = msi.update(offset, data);
144             return action;
145         }
146 
147         None
148     }
149 
150     fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
151         if let Some(ref mut msix) = &mut self.msix {
152             let action = msix.update(offset, data);
153             return action;
154         }
155 
156         None
157     }
158 
159     fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
160         if let Some(msi) = &self.msi {
161             if offset >= u64::from(msi.cap_offset)
162                 && offset < u64::from(msi.cap_offset) + msi.cfg.size()
163             {
164                 return Some((
165                     PciCapabilityId::MessageSignalledInterrupts,
166                     u64::from(msi.cap_offset),
167                 ));
168             }
169         }
170 
171         if let Some(msix) = &self.msix {
172             if offset == u64::from(msix.cap_offset) {
173                 return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
174             }
175         }
176 
177         None
178     }
179 
180     fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
181         if let Some(msix) = &self.msix {
182             return msix.table_accessed(bar_index, offset);
183         }
184 
185         false
186     }
187 
188     fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
189         if let Some(ref mut msix) = &mut self.msix {
190             let offset = offset - u64::from(msix.cap.table_offset());
191             msix.bar.write_table(offset, data)
192         }
193     }
194 
195     fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
196         if let Some(msix) = &self.msix {
197             let offset = offset - u64::from(msix.cap.table_offset());
198             msix.bar.read_table(offset, data)
199         }
200     }
201 
202     pub(crate) fn intx_in_use(&self) -> bool {
203         if let Some(intx) = &self.intx {
204             return intx.enabled;
205         }
206 
207         false
208     }
209 }
210 
// A guest MMIO (or PIO) region backing one of the device's BARs.
#[derive(Copy, Clone)]
pub struct MmioRegion {
    pub start: GuestAddress,
    pub length: GuestUsize,
    pub(crate) type_: PciBarRegionType,
    // VFIO region index this BAR corresponds to.
    pub(crate) index: u32,
    // Mapping details, populated once the region is mmap'ed into the
    // guest (None until then).
    pub(crate) mem_slot: Option<u32>,
    pub(crate) host_addr: Option<u64>,
    pub(crate) mmap_size: Option<usize>,
}
// Transport-level VFIO error: either the kernel VFIO driver or the
// vfio-user protocol backend failed.
#[derive(Debug, Error)]
pub enum VfioError {
    #[error("Kernel VFIO error: {0}")]
    KernelVfio(#[source] vfio_ioctls::VfioError),
    #[error("VFIO user error: {0}")]
    VfioUser(#[source] vfio_user::Error),
}
228 
229 pub(crate) trait Vfio {
230     fn read_config_byte(&self, offset: u32) -> u8 {
231         let mut data: [u8; 1] = [0];
232         self.read_config(offset, &mut data);
233         data[0]
234     }
235 
236     fn read_config_word(&self, offset: u32) -> u16 {
237         let mut data: [u8; 2] = [0, 0];
238         self.read_config(offset, &mut data);
239         u16::from_le_bytes(data)
240     }
241 
242     fn read_config_dword(&self, offset: u32) -> u32 {
243         let mut data: [u8; 4] = [0, 0, 0, 0];
244         self.read_config(offset, &mut data);
245         u32::from_le_bytes(data)
246     }
247 
248     fn write_config_dword(&self, offset: u32, buf: u32) {
249         let data: [u8; 4] = buf.to_le_bytes();
250         self.write_config(offset, &data)
251     }
252 
253     fn read_config(&self, offset: u32, data: &mut [u8]) {
254         self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
255     }
256 
257     fn write_config(&self, offset: u32, data: &[u8]) {
258         self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
259     }
260 
261     fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
262         self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
263     }
264 
265     fn disable_msi(&self) -> Result<(), VfioError> {
266         self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
267     }
268 
269     fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
270         self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
271     }
272 
273     fn disable_msix(&self) -> Result<(), VfioError> {
274         self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
275     }
276 
277     fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
278         unimplemented!()
279     }
280 
281     fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
282         unimplemented!()
283     }
284 
285     fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
286         unimplemented!()
287     }
288 
289     fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
290         unimplemented!()
291     }
292 
293     fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
294         unimplemented!()
295     }
296 
297     fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
298         unimplemented!()
299     }
300 }
301 
// Adapter implementing the `Vfio` transport trait on top of a kernel
// VFIO device (vfio_ioctls::VfioDevice).
struct VfioDeviceWrapper {
    device: Arc<VfioDevice>,
}

impl VfioDeviceWrapper {
    fn new(device: Arc<VfioDevice>) -> Self {
        Self { device }
    }
}
311 
312 impl Vfio for VfioDeviceWrapper {
313     fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
314         self.device.region_read(index, data, offset)
315     }
316 
317     fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
318         self.device.region_write(index, data, offset)
319     }
320 
321     fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
322         self.device.get_irq_info(irq_index).copied()
323     }
324 
325     fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
326         self.device
327             .enable_irq(irq_index, event_fds)
328             .map_err(VfioError::KernelVfio)
329     }
330 
331     fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
332         self.device
333             .disable_irq(irq_index)
334             .map_err(VfioError::KernelVfio)
335     }
336 
337     fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
338         self.device
339             .unmask_irq(irq_index)
340             .map_err(VfioError::KernelVfio)
341     }
342 }
343 
// State shared by the VFIO PCI device implementations: the emulated
// PCI configuration space, the BAR-to-guest mappings and the
// interrupt (INTx/MSI/MSI-X) state.
pub(crate) struct VfioCommon {
    pub(crate) configuration: PciConfiguration,
    pub(crate) mmio_regions: Vec<MmioRegion>,
    pub(crate) interrupt: Interrupt,
}
349 
350 impl VfioCommon {
    /// Probe and allocate guest address ranges for every BAR (and the
    /// expansion ROM) exposed by the VFIO device.
    ///
    /// For each BAR this performs the standard PCI sizing handshake
    /// (write all ones, read back the size mask), allocates a guest
    /// address from `allocator`, and records the BAR both in the
    /// emulated configuration space and in `self.mmio_regions`.
    ///
    /// Returns the list of allocated `(address, size, type)` ranges, or
    /// a `PciDeviceError` if an allocation or registration fails.
    pub(crate) fn allocate_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        vfio_wrapper: &dyn Vfio,
    ) -> Result<Vec<(GuestAddress, GuestUsize, PciBarRegionType)>, PciDeviceError> {
        let mut ranges = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX as u32;

        // Going through all regular regions to compute the BAR size.
        // We're not saving the BAR address to restore it, because we
        // are going to allocate a guest address for each BAR and write
        // that new address back.
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let region_size: u64;
            let bar_addr: GuestAddress;

            // The expansion ROM register lives at a fixed offset; the
            // regular BARs are consecutive dwords from BAR0.
            let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                (PCI_ROM_EXP_BAR_INDEX * 4) as u32
            } else {
                PCI_CONFIG_BAR_OFFSET + bar_id * 4
            };

            // First read flags
            let flags = vfio_wrapper.read_config_dword(bar_offset);

            // Is this an IO BAR? (The ROM BAR is always memory.)
            let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
            } else {
                false
            };

            // Is this a 64-bit BAR?
            let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                matches!(
                    flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                    PCI_CONFIG_MEMORY_BAR_64BIT
                )
            } else {
                false
            };

            // By default, the region type is 32 bits memory BAR.
            let mut region_type = PciBarRegionType::Memory32BitRegion;

            // To get size write all 1s
            vfio_wrapper.write_config_dword(bar_offset, 0xffff_ffff);

            // And read back BAR value. The device will write zeros for bits it doesn't care about
            let mut lower = vfio_wrapper.read_config_dword(bar_offset);

            if io_bar {
                #[cfg(target_arch = "x86_64")]
                {
                    // IO BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Mask flag bits (lowest 2 for I/O bars)
                    lower &= !0b11;

                    // BAR is not enabled
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert bits and add 1 to calculate size
                    region_size = (!lower + 1) as u64;

                    // The address needs to be 4 bytes aligned.
                    bar_addr = allocator
                        .allocate_io_addresses(None, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
                }
                #[cfg(target_arch = "aarch64")]
                unimplemented!()
            } else if is_64bit_bar {
                // 64 bits Memory BAR
                region_type = PciBarRegionType::Memory64BitRegion;

                // Query size of upper BAR of 64-bit BAR
                let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                vfio_wrapper.write_config_dword(upper_offset, 0xffff_ffff);
                let upper = vfio_wrapper.read_config_dword(upper_offset);

                let mut combined_size = u64::from(upper) << 32 | u64::from(lower);

                // Mask out flag bits (lowest 4 for memory bars)
                combined_size &= !0b1111;

                // BAR is not enabled
                if combined_size == 0 {
                    bar_id += 1;
                    continue;
                }

                // Invert and add 1 to find size
                region_size = (!combined_size + 1) as u64;

                // BAR allocation must be naturally aligned
                bar_addr = allocator
                    .allocate_mmio_addresses(None, region_size, Some(region_size))
                    .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
            } else {
                // Mask out flag bits (lowest 4 for memory bars)
                lower &= !0b1111;

                if lower == 0 {
                    bar_id += 1;
                    continue;
                }

                // Invert and add 1 to find size
                region_size = (!lower + 1) as u64;

                // BAR allocation must be naturally aligned
                bar_addr = allocator
                    .allocate_mmio_hole_addresses(None, region_size, Some(region_size))
                    .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
            }

            let reg_idx = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                PCI_ROM_EXP_BAR_INDEX
            } else {
                bar_id as usize
            };

            // We can now build our BAR configuration block.
            let config = PciBarConfiguration::default()
                .set_register_index(reg_idx)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                // Bit 0 of the ROM register is its enable bit.
                self.configuration
                    .add_pci_rom_bar(&config, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&config)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            ranges.push((bar_addr, region_size, region_type));
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id as u32,
                mem_slot: None,
                host_addr: None,
                mmap_size: None,
            });

            bar_id += 1;
            // A 64-bit BAR consumes two consecutive BAR registers.
            if is_64bit_bar {
                bar_id += 1;
            }
        }

        Ok(ranges)
    }
514 
515     pub(crate) fn free_bars(
516         &mut self,
517         allocator: &mut SystemAllocator,
518     ) -> Result<(), PciDeviceError> {
519         for region in self.mmio_regions.iter() {
520             match region.type_ {
521                 PciBarRegionType::IoRegion => {
522                     #[cfg(target_arch = "x86_64")]
523                     allocator.free_io_addresses(region.start, region.length);
524                     #[cfg(target_arch = "aarch64")]
525                     error!("I/O region is not supported");
526                 }
527                 PciBarRegionType::Memory32BitRegion => {
528                     allocator.free_mmio_hole_addresses(region.start, region.length);
529                 }
530                 PciBarRegionType::Memory64BitRegion => {
531                     allocator.free_mmio_addresses(region.start, region.length);
532                 }
533             }
534         }
535         Ok(())
536     }
537 
    /// Parse the MSI-X capability at config-space offset `cap`, create
    /// the backing interrupt source group and set up the emulated
    /// table/PBA.
    pub(crate) fn parse_msix_capabilities(
        &mut self,
        cap: u8,
        interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        vfio_wrapper: &dyn Vfio,
    ) {
        // MSI-X capability layout: message control word at +2, table
        // offset/BIR dword at +4, PBA offset/BIR dword at +8.
        let msg_ctl = vfio_wrapper.read_config_word((cap + 2).into());

        let table = vfio_wrapper.read_config_dword((cap + 4).into());

        let pba = vfio_wrapper.read_config_dword((cap + 8).into());

        let msix_cap = MsixCap {
            msg_ctl,
            table,
            pba,
        };

        // One interrupt route per MSI-X table entry.
        let interrupt_source_group = interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msix_cap.table_size() as InterruptIndex,
            })
            .unwrap();

        let msix_config = MsixConfig::new(msix_cap.table_size(), interrupt_source_group.clone(), 0);

        self.interrupt.msix = Some(VfioMsix {
            bar: msix_config,
            cap: msix_cap,
            cap_offset: cap.into(),
            interrupt_source_group,
        });
    }
572 
573     pub(crate) fn parse_msi_capabilities(
574         &mut self,
575         cap: u8,
576         interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
577         vfio_wrapper: &dyn Vfio,
578     ) {
579         let msg_ctl = vfio_wrapper.read_config_word((cap + 2).into());
580 
581         let interrupt_source_group = interrupt_manager
582             .create_group(MsiIrqGroupConfig {
583                 base: 0,
584                 count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
585             })
586             .unwrap();
587 
588         let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone());
589 
590         self.interrupt.msi = Some(VfioMsi {
591             cfg: msi_config,
592             cap_offset: cap.into(),
593             interrupt_source_group,
594         });
595     }
596 
597     pub(crate) fn parse_capabilities(
598         &mut self,
599         interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
600         vfio_wrapper: &dyn Vfio,
601     ) {
602         let mut cap_next = vfio_wrapper.read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);
603 
604         while cap_next != 0 {
605             let cap_id = vfio_wrapper.read_config_byte(cap_next.into());
606 
607             match PciCapabilityId::from(cap_id) {
608                 PciCapabilityId::MessageSignalledInterrupts => {
609                     if let Some(irq_info) = vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
610                         if irq_info.count > 0 {
611                             // Parse capability only if the VFIO device
612                             // supports MSI.
613                             self.parse_msi_capabilities(cap_next, interrupt_manager, vfio_wrapper);
614                         }
615                     }
616                 }
617                 PciCapabilityId::MsiX => {
618                     if let Some(irq_info) = vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX) {
619                         if irq_info.count > 0 {
620                             // Parse capability only if the VFIO device
621                             // supports MSI-X.
622                             self.parse_msix_capabilities(cap_next, interrupt_manager, vfio_wrapper);
623                         }
624                     }
625                 }
626                 _ => {}
627             };
628 
629             cap_next = vfio_wrapper.read_config_byte((cap_next + 1).into());
630         }
631     }
632 
633     pub(crate) fn enable_intx(&mut self, wrapper: &dyn Vfio) -> Result<(), VfioPciError> {
634         if let Some(intx) = &mut self.interrupt.intx {
635             if !intx.enabled {
636                 if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
637                     wrapper
638                         .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
639                         .map_err(VfioPciError::EnableIntx)?;
640 
641                     intx.enabled = true;
642                 } else {
643                     return Err(VfioPciError::MissingNotifier);
644                 }
645             }
646         }
647 
648         Ok(())
649     }
650 
651     pub(crate) fn disable_intx(&mut self, wrapper: &dyn Vfio) {
652         if let Some(intx) = &mut self.interrupt.intx {
653             if intx.enabled {
654                 if let Err(e) = wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
655                     error!("Could not disable INTx: {}", e);
656                 } else {
657                     intx.enabled = false;
658                 }
659             }
660         }
661     }
662 
663     pub(crate) fn enable_msi(&self, wrapper: &dyn Vfio) -> Result<(), VfioPciError> {
664         if let Some(msi) = &self.interrupt.msi {
665             let mut irq_fds: Vec<EventFd> = Vec::new();
666             for i in 0..msi.cfg.num_enabled_vectors() {
667                 if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
668                     irq_fds.push(eventfd);
669                 } else {
670                     return Err(VfioPciError::MissingNotifier);
671                 }
672             }
673 
674             wrapper
675                 .enable_msi(irq_fds.iter().collect())
676                 .map_err(VfioPciError::EnableMsi)?;
677         }
678 
679         Ok(())
680     }
681 
682     pub(crate) fn disable_msi(&self, wrapper: &dyn Vfio) {
683         if let Err(e) = wrapper.disable_msi() {
684             error!("Could not disable MSI: {}", e);
685         }
686     }
687 
688     pub(crate) fn enable_msix(&self, wrapper: &dyn Vfio) -> Result<(), VfioPciError> {
689         if let Some(msix) = &self.interrupt.msix {
690             let mut irq_fds: Vec<EventFd> = Vec::new();
691             for i in 0..msix.bar.table_entries.len() {
692                 if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
693                     irq_fds.push(eventfd);
694                 } else {
695                     return Err(VfioPciError::MissingNotifier);
696                 }
697             }
698 
699             wrapper
700                 .enable_msix(irq_fds.iter().collect())
701                 .map_err(VfioPciError::EnableMsix)?;
702         }
703 
704         Ok(())
705     }
706 
707     pub(crate) fn disable_msix(&self, wrapper: &dyn Vfio) {
708         if let Err(e) = wrapper.disable_msix() {
709             error!("Could not disable MSI-X: {}", e);
710         }
711     }
712 
713     pub(crate) fn initialize_legacy_interrupt(
714         &mut self,
715         legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
716         wrapper: &dyn Vfio,
717     ) -> Result<(), VfioPciError> {
718         if let Some(irq_info) = wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
719             if irq_info.count == 0 {
720                 // A count of 0 means the INTx IRQ is not supported, therefore
721                 // it shouldn't be initialized.
722                 return Ok(());
723             }
724         }
725 
726         if let Some(interrupt_source_group) = legacy_interrupt_group {
727             self.interrupt.intx = Some(VfioIntx {
728                 interrupt_source_group,
729                 enabled: false,
730             });
731 
732             self.enable_intx(wrapper)?;
733         }
734 
735         Ok(())
736     }
737 
738     pub(crate) fn update_msi_capabilities(
739         &mut self,
740         offset: u64,
741         data: &[u8],
742         wrapper: &dyn Vfio,
743     ) -> Result<(), VfioPciError> {
744         match self.interrupt.update_msi(offset, data) {
745             Some(InterruptUpdateAction::EnableMsi) => {
746                 // Disable INTx before we can enable MSI
747                 self.disable_intx(wrapper);
748                 self.enable_msi(wrapper)?;
749             }
750             Some(InterruptUpdateAction::DisableMsi) => {
751                 // Fallback onto INTx when disabling MSI
752                 self.disable_msi(wrapper);
753                 self.enable_intx(wrapper)?;
754             }
755             _ => {}
756         }
757 
758         Ok(())
759     }
760 
761     pub(crate) fn update_msix_capabilities(
762         &mut self,
763         offset: u64,
764         data: &[u8],
765         wrapper: &dyn Vfio,
766     ) -> Result<(), VfioPciError> {
767         match self.interrupt.update_msix(offset, data) {
768             Some(InterruptUpdateAction::EnableMsix) => {
769                 // Disable INTx before we can enable MSI-X
770                 self.disable_intx(wrapper);
771                 self.enable_msix(wrapper)?;
772             }
773             Some(InterruptUpdateAction::DisableMsix) => {
774                 // Fallback onto INTx when disabling MSI-X
775                 self.disable_msix(wrapper);
776                 self.enable_intx(wrapper)?;
777             }
778             _ => {}
779         }
780 
781         Ok(())
782     }
783 
784     pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
785         for region in self.mmio_regions.iter() {
786             if addr >= region.start.raw_value()
787                 && addr < region.start.unchecked_add(region.length).raw_value()
788             {
789                 return Some(*region);
790             }
791         }
792         None
793     }
794 
795     pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8], wrapper: &dyn Vfio) {
796         let addr = base + offset;
797         if let Some(region) = self.find_region(addr) {
798             let offset = addr - region.start.raw_value();
799 
800             if self.interrupt.msix_table_accessed(region.index, offset) {
801                 self.interrupt.msix_read_table(offset, data);
802             } else {
803                 wrapper.region_read(region.index, offset, data);
804             }
805         }
806 
807         // INTx EOI
808         // The guest reading from the BAR potentially means the interrupt has
809         // been received and can be acknowledged.
810         if self.interrupt.intx_in_use() {
811             if let Err(e) = wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
812                 error!("Failed unmasking INTx IRQ: {}", e);
813             }
814         }
815     }
816 
817     pub(crate) fn write_bar(
818         &mut self,
819         base: u64,
820         offset: u64,
821         data: &[u8],
822         wrapper: &dyn Vfio,
823     ) -> Option<Arc<Barrier>> {
824         let addr = base + offset;
825         if let Some(region) = self.find_region(addr) {
826             let offset = addr - region.start.raw_value();
827 
828             // If the MSI-X table is written to, we need to update our cache.
829             if self.interrupt.msix_table_accessed(region.index, offset) {
830                 self.interrupt.msix_write_table(offset, data);
831             } else {
832                 wrapper.region_write(region.index, offset, data);
833             }
834         }
835 
836         // INTx EOI
837         // The guest writing to the BAR potentially means the interrupt has
838         // been received and can be acknowledged.
839         if self.interrupt.intx_in_use() {
840             if let Err(e) = wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
841                 error!("Failed unmasking INTx IRQ: {}", e);
842             }
843         }
844 
845         None
846     }
847 
    /// Handle a guest write to a PCI configuration register.
    ///
    /// BAR (and expansion ROM) writes are trapped into the local,
    /// emulated configuration space. Writes hitting the MSI/MSI-X
    /// capabilities update the local caches (possibly toggling VFIO
    /// interrupts) before the write is forwarded to the device's own
    /// config space. Always returns `None` (no barrier needed).
    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
        wrapper: &dyn Vfio,
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        // Byte offset of the accessed register in the config space.
        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            // Offset of the write relative to the capability's base.
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data, wrapper) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data, wrapper) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through VFIO_DEVICE_SET_IRQS ioctl, and only after we can write
        // to the device region to update the MSI Enable bit.
        wrapper.write_config((reg + offset) as u32, data);

        None
    }
904 
905     pub(crate) fn read_config_register(&mut self, reg_idx: usize, wrapper: &dyn Vfio) -> u32 {
906         // When reading the BARs, we trap it and return what comes
907         // from our local configuration space. We want the guest to
908         // use that and not the VFIO device BARs as it does not map
909         // with the guest address space.
910         if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
911             || reg_idx == PCI_ROM_EXP_BAR_INDEX
912         {
913             return self.configuration.read_reg(reg_idx);
914         }
915 
916         // Since we don't support passing multi-functions devices, we should
917         // mask the multi-function bit, bit 7 of the Header Type byte on the
918         // register 3.
919         let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
920             0xff7f_ffff
921         } else {
922             0xffff_ffff
923         };
924 
925         // The config register read comes from the VFIO device itself.
926         wrapper.read_config_dword((reg_idx * 4) as u32) & mask
927     }
928 }
929 
/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
pub struct VfioPciDevice {
    // Hypervisor VM handle, used to create/remove the user memory regions
    // backing the device's mmap'ed MMIO regions.
    vm: Arc<dyn hypervisor::Vm>,
    // The underlying VFIO device this PCI device is bound to.
    device: Arc<VfioDevice>,
    // VFIO container through which DMA ranges are mapped/unmapped.
    container: Arc<VfioContainer>,
    // Wrapper giving the shared VfioCommon logic access to the device's
    // regions and config space.
    vfio_wrapper: VfioDeviceWrapper,
    // State shared with other VFIO-based devices: MMIO regions, the local
    // PCI configuration cache and the interrupt (INTx/MSI/MSI-X) state.
    common: VfioCommon,
    // When true, dma_map()/dma_unmap() skip the VFIO container (the mappings
    // are presumably driven through the virtual IOMMU path instead).
    iommu_attached: bool,
}
944 
impl VfioPciDevice {
    /// Constructs a new Vfio Pci device for the given Vfio device
    ///
    /// The device is reset, its capabilities (MSI/MSI-X) are parsed and the
    /// optional legacy INTx interrupt is initialized.
    ///
    /// # Arguments
    ///
    /// * `vm` - The VM object, kept for (un)mapping MMIO regions later on.
    /// * `device` - The VFIO device to wrap.
    /// * `container` - The VFIO container, used later for DMA (un)mapping.
    /// * `msi_interrupt_manager` - Interrupt manager providing the MSI/MSI-X
    ///   interrupt source groups.
    /// * `legacy_interrupt_group` - Optional interrupt group backing INTx.
    /// * `iommu_attached` - Whether the device sits behind a virtual IOMMU.
    pub fn new(
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        iommu_attached: bool,
    ) -> Result<Self, VfioPciError> {
        let device = Arc::new(device);
        device.reset();

        // Start from an empty local PCI configuration; actual register
        // values come from the device itself, except for the trapped BARs.
        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            &PciVfioSubclass::VfioSubclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
        );

        let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));

        // No interrupt mechanism is enabled yet; parse_capabilities() below
        // fills in the MSI/MSI-X state.
        let mut common = VfioCommon {
            mmio_regions: Vec::new(),
            configuration,
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
        };

        common.parse_capabilities(msi_interrupt_manager, &vfio_wrapper);
        common.initialize_legacy_interrupt(legacy_interrupt_group, &vfio_wrapper)?;

        let vfio_pci_device = VfioPciDevice {
            vm: vm.clone(),
            device,
            container,
            vfio_wrapper,
            common,
            iommu_attached,
        };

        Ok(vfio_pci_device)
    }

    /// Returns true if the device is attached to a virtual IOMMU.
    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }

    /// Map MMIO regions into the guest, and avoid VM exits when the guest tries
    /// to reach those regions.
    ///
    /// # Arguments
    ///
    /// * `vm` - The VM object. It is used to set the VFIO MMIO regions
    ///          as user memory regions.
    /// * `mem_slot` - The closure to return a memory slot.
    pub fn map_mmio_regions<F>(
        &mut self,
        vm: &Arc<dyn hypervisor::Vm>,
        mem_slot: F,
    ) -> Result<(), VfioPciError>
    where
        F: Fn() -> u32,
    {
        let fd = self.device.as_raw_fd();

        for region in self.common.mmio_regions.iter_mut() {
            // We want to skip the mapping of the BAR containing the MSI-X
            // table even if it is mappable. The reason is we need to trap
            // any access to the MSI-X table and update the GSI routing
            // accordingly.
            if let Some(msix) = &self.common.interrupt.msix {
                if region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir() {
                    continue;
                }
            }

            // Only regions VFIO reports as mmap'able can be mapped straight
            // into the guest; others keep being trapped.
            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                // Derive the mmap protection from the VFIO region flags.
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }
                let (mmap_offset, mmap_size) = self.device.get_region_mmap(region.index);
                let offset = self.device.get_region_offset(region.index) + mmap_offset;

                // SAFETY: we ask the kernel for a fresh mapping (addr is
                // NULL) backed by the VFIO device fd, with offset and size
                // reported by VFIO itself; the result is checked against
                // MAP_FAILED right below before being used.
                let host_addr = unsafe {
                    libc::mmap(
                        null_mut(),
                        mmap_size as usize,
                        prot,
                        libc::MAP_SHARED,
                        fd,
                        offset as libc::off_t,
                    )
                };

                if host_addr == libc::MAP_FAILED {
                    error!(
                        "Could not mmap regions, error:{}",
                        io::Error::last_os_error()
                    );
                    // Best effort: a region that fails to mmap stays trapped
                    // instead of failing the whole device setup.
                    continue;
                }

                // Expose the host mapping to the guest as a user memory
                // region so that guest accesses bypass the VMM.
                let slot = mem_slot();
                let mem_region = vm.make_user_memory_region(
                    slot,
                    region.start.raw_value() + mmap_offset,
                    mmap_size as u64,
                    host_addr as u64,
                    false,
                    false,
                );

                vm.create_user_memory_region(mem_region)
                    .map_err(VfioPciError::MapRegionGuest)?;

                // Update the region with memory mapped info.
                region.mem_slot = Some(slot);
                region.host_addr = Some(host_addr as u64);
                region.mmap_size = Some(mmap_size as usize);
            }
        }

        Ok(())
    }

    /// Removes the user memory regions and host mappings created by
    /// map_mmio_regions(). Regions that were never mapped are skipped.
    pub fn unmap_mmio_regions(&mut self) {
        for region in self.common.mmio_regions.iter() {
            if let (Some(host_addr), Some(mmap_size), Some(mem_slot)) =
                (region.host_addr, region.mmap_size, region.mem_slot)
            {
                let (mmap_offset, _) = self.device.get_region_mmap(region.index);

                // Remove region
                let r = self.vm.make_user_memory_region(
                    mem_slot,
                    region.start.raw_value() + mmap_offset,
                    mmap_size as u64,
                    host_addr as u64,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                // SAFETY: host_addr and mmap_size were recorded from a
                // successful mmap() in map_mmio_regions().
                let ret = unsafe { libc::munmap(host_addr as *mut libc::c_void, mmap_size) };
                if ret != 0 {
                    error!(
                        "Could not unmap region {}, error:{}",
                        region.index,
                        io::Error::last_os_error()
                    );
                }
            }
        }
    }

    /// Maps a DMA range through the VFIO container.
    ///
    /// No-op when the device is attached to a virtual IOMMU (the mapping is
    /// then expected to be handled through the IOMMU path instead).
    pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_map(iova, size, user_addr)
                .map_err(VfioPciError::DmaMap)?;
        }

        Ok(())
    }

    /// Unmaps a DMA range from the VFIO container.
    ///
    /// No-op when the device is attached to a virtual IOMMU, mirroring
    /// dma_map().
    pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_unmap(iova, size)
                .map_err(VfioPciError::DmaUnmap)?;
        }

        Ok(())
    }

    /// Returns a copy of the device's MMIO regions.
    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.common.mmio_regions.clone()
    }
}
1142 
1143 impl Drop for VfioPciDevice {
1144     fn drop(&mut self) {
1145         self.unmap_mmio_regions();
1146 
1147         if let Some(msix) = &self.common.interrupt.msix {
1148             if msix.bar.enabled() {
1149                 self.common.disable_msix(&self.vfio_wrapper);
1150             }
1151         }
1152 
1153         if let Some(msi) = &self.common.interrupt.msi {
1154             if msi.cfg.enabled() {
1155                 self.common.disable_msi(&self.vfio_wrapper)
1156             }
1157         }
1158 
1159         if self.common.interrupt.intx_in_use() {
1160             self.common.disable_intx(&self.vfio_wrapper);
1161         }
1162     }
1163 }
1164 
1165 impl BusDevice for VfioPciDevice {
1166     fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1167         self.read_bar(base, offset, data)
1168     }
1169 
1170     fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
1171         self.write_bar(base, offset, data)
1172     }
1173 }
1174 
// First BAR offset in the PCI config space (BAR0 lives at offset 0x10).
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability pointer register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// IO BAR when first BAR bit (bit 0) is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory BAR flag (bit 2 of the BAR).
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device (BAR0 through BAR5).
const BAR_NUMS: usize = 6;
// PCI Header Type register index (register 3; its multi-function bit is
// masked in read_config_register()).
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index (register 4 <=> config offset 0x10).
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index (register 12 <=> config offset 0x30).
const PCI_ROM_EXP_BAR_INDEX: usize = 12;
1193 
impl PciDevice for VfioPciDevice {
    // Most of the trait is implemented by delegating to the shared
    // VfioCommon logic, handing it the VFIO wrapper for device access.

    /// Allocates guest address ranges for the device BARs.
    fn allocate_bars(
        &mut self,
        allocator: &mut SystemAllocator,
    ) -> Result<Vec<(GuestAddress, GuestUsize, PciBarRegionType)>, PciDeviceError> {
        self.common.allocate_bars(allocator, &self.vfio_wrapper)
    }

    /// Releases the guest address ranges previously allocated for the BARs.
    fn free_bars(&mut self, allocator: &mut SystemAllocator) -> Result<(), PciDeviceError> {
        self.common.free_bars(allocator)
    }

    /// Forwards a PCI config space write to the common VFIO handling.
    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        self.common
            .write_config_register(reg_idx, offset, data, &self.vfio_wrapper)
    }

    /// Forwards a PCI config space read to the common VFIO handling.
    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        self.common
            .read_config_register(reg_idx, &self.vfio_wrapper)
    }

    /// Detects whether a config write reprograms a BAR, based on the local
    /// configuration cache.
    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.common
            .configuration
            .detect_bar_reprogramming(reg_idx, data)
    }

    /// Forwards a BAR read to the common VFIO handling.
    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.common.read_bar(base, offset, data, &self.vfio_wrapper)
    }

    /// Forwards a BAR write to the common VFIO handling.
    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.common
            .write_bar(base, offset, data, &self.vfio_wrapper)
    }

    /// Relocates a BAR after the guest reprogrammed it: the cached region
    /// start is updated and, if the region had been mmap'ed into the guest,
    /// its user memory region is removed and re-created at the new address
    /// (same slot, same host mapping).
    fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
        for region in self.common.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                // Only regions actually mapped into the guest need the
                // user memory region to be moved.
                if let Some(mem_slot) = region.mem_slot {
                    if let Some(host_addr) = region.host_addr {
                        let (mmap_offset, mmap_size) = self.device.get_region_mmap(region.index);

                        // Remove old region
                        let old_mem_region = self.vm.make_user_memory_region(
                            mem_slot,
                            old_base + mmap_offset,
                            mmap_size as u64,
                            host_addr as u64,
                            false,
                            false,
                        );

                        self.vm
                            .remove_user_memory_region(old_mem_region)
                            .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

                        // Insert new region
                        let new_mem_region = self.vm.make_user_memory_region(
                            mem_slot,
                            new_base + mmap_offset,
                            mmap_size as u64,
                            host_addr as u64,
                            false,
                            false,
                        );

                        self.vm
                            .create_user_memory_region(new_mem_region)
                            .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
                    }
                }
            }
        }

        Ok(())
    }

    fn as_any(&mut self) -> &mut dyn Any {
        self
    }
}
1288