xref: /cloud-hypervisor/pci/src/vfio.rs (revision 5e52729453cb62edbe4fb3a4aa24f8cca31e667e)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 
6 use crate::msi::{MsiConfigState, MSI_CONFIG_ID};
7 use crate::msix::MsixConfigState;
8 use crate::{
9     msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
10     PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
11     PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
12     PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID,
13 };
14 use anyhow::anyhow;
15 use byteorder::{ByteOrder, LittleEndian};
16 use hypervisor::HypervisorVmError;
17 use std::any::Any;
18 use std::collections::{BTreeMap, HashMap};
19 use std::io;
20 use std::os::unix::io::AsRawFd;
21 use std::ptr::null_mut;
22 use std::sync::{Arc, Barrier, Mutex};
23 use thiserror::Error;
24 use versionize::{VersionMap, Versionize, VersionizeResult};
25 use versionize_derive::Versionize;
26 use vfio_bindings::bindings::vfio::*;
27 use vfio_ioctls::{
28     VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
29 };
30 use vm_allocator::{AddressAllocator, SystemAllocator};
31 use vm_device::interrupt::{
32     InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
33 };
34 use vm_device::{BusDevice, Resource};
35 use vm_memory::{Address, GuestAddress, GuestUsize};
36 use vm_migration::{
37     Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
38 };
39 use vmm_sys_util::eventfd::EventFd;
40 
// Identifier under which the VfioCommon state is stored in / restored from
// a snapshot.
pub(crate) const VFIO_COMMON_ID: &str = "vfio_common";
42 
43 #[derive(Debug, Error)]
44 pub enum VfioPciError {
45     #[error("Failed to create user memory region: {0}")]
46     CreateUserMemoryRegion(#[source] HypervisorVmError),
47     #[error("Failed to DMA map: {0}")]
48     DmaMap(#[source] vfio_ioctls::VfioError),
49     #[error("Failed to DMA unmap: {0}")]
50     DmaUnmap(#[source] vfio_ioctls::VfioError),
51     #[error("Failed to enable INTx: {0}")]
52     EnableIntx(#[source] VfioError),
53     #[error("Failed to enable MSI: {0}")]
54     EnableMsi(#[source] VfioError),
55     #[error("Failed to enable MSI-x: {0}")]
56     EnableMsix(#[source] VfioError),
57     #[error("Failed to mmap the area")]
58     MmapArea,
59     #[error("Failed to notifier's eventfd")]
60     MissingNotifier,
61     #[error("Invalid region alignment")]
62     RegionAlignment,
63     #[error("Invalid region size")]
64     RegionSize,
65     #[error("Failed to retrieve MsiConfigState: {0}")]
66     RetrieveMsiConfigState(#[source] anyhow::Error),
67     #[error("Failed to retrieve MsixConfigState: {0}")]
68     RetrieveMsixConfigState(#[source] anyhow::Error),
69     #[error("Failed to retrieve PciConfigurationState: {0}")]
70     RetrievePciConfigurationState(#[source] anyhow::Error),
71     #[error("Failed to retrieve VfioCommonState: {0}")]
72     RetrieveVfioCommonState(#[source] anyhow::Error),
73 }
74 
/// PCI subclass code advertised for passthrough (VFIO) devices.
#[derive(Copy, Clone)]
enum PciVfioSubclass {
    // 0xff: vendor-specific / unknown subclass.
    VfioSubclass = 0xff,
}

impl PciSubclass for PciVfioSubclass {
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}
85 
// Action required on the VFIO device after the guest toggled the enable bit
// of an MSI or MSI-X capability through a config space write.
enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}
92 
// Serializable INTx state captured in snapshots.
#[derive(Versionize)]
struct IntxState {
    enabled: bool,
}

// Runtime bookkeeping for the legacy pin-based (INTx) interrupt.
pub(crate) struct VfioIntx {
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    enabled: bool,
}
102 
// Serializable MSI state captured in snapshots.
#[derive(Versionize)]
struct MsiState {
    cap: MsiCap,
    cap_offset: u32,
}

// Runtime MSI bookkeeping: emulated capability plus its interrupt routes.
pub(crate) struct VfioMsi {
    pub(crate) cfg: MsiConfig,
    // Offset of the MSI capability within the device's PCI config space.
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}
114 
115 impl VfioMsi {
116     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
117         let old_enabled = self.cfg.enabled();
118 
119         self.cfg.update(offset, data);
120 
121         let new_enabled = self.cfg.enabled();
122 
123         if !old_enabled && new_enabled {
124             return Some(InterruptUpdateAction::EnableMsi);
125         }
126 
127         if old_enabled && !new_enabled {
128             return Some(InterruptUpdateAction::DisableMsi);
129         }
130 
131         None
132     }
133 }
134 
// Serializable MSI-X state captured in snapshots.
#[derive(Versionize)]
struct MsixState {
    cap: MsixCap,
    cap_offset: u32,
    bdf: u32,
}

// Runtime MSI-X bookkeeping: the emulated table/PBA BAR, the raw capability
// registers and the interrupt routes backing the table entries.
pub(crate) struct VfioMsix {
    pub(crate) bar: MsixConfig,
    cap: MsixCap,
    // Offset of the MSI-X capability within the device's PCI config space.
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}
148 
149 impl VfioMsix {
150     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
151         let old_enabled = self.bar.enabled();
152 
153         // Update "Message Control" word
154         if offset == 2 && data.len() == 2 {
155             self.bar.set_msg_ctl(LittleEndian::read_u16(data));
156         }
157 
158         let new_enabled = self.bar.enabled();
159 
160         if !old_enabled && new_enabled {
161             return Some(InterruptUpdateAction::EnableMsix);
162         }
163 
164         if old_enabled && !new_enabled {
165             return Some(InterruptUpdateAction::DisableMsix);
166         }
167 
168         None
169     }
170 
171     fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
172         let table_offset: u64 = u64::from(self.cap.table_offset());
173         let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
174         let table_bir: u32 = self.cap.table_bir();
175 
176         bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
177     }
178 }
179 
// Aggregate of the interrupt mechanisms a device may expose; each field is
// `Some` only once the corresponding capability/route has been set up.
pub(crate) struct Interrupt {
    pub(crate) intx: Option<VfioIntx>,
    pub(crate) msi: Option<VfioMsi>,
    pub(crate) msix: Option<VfioMsix>,
}
185 
186 impl Interrupt {
187     fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
188         if let Some(ref mut msi) = &mut self.msi {
189             let action = msi.update(offset, data);
190             return action;
191         }
192 
193         None
194     }
195 
196     fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
197         if let Some(ref mut msix) = &mut self.msix {
198             let action = msix.update(offset, data);
199             return action;
200         }
201 
202         None
203     }
204 
205     fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
206         if let Some(msi) = &self.msi {
207             if offset >= u64::from(msi.cap_offset)
208                 && offset < u64::from(msi.cap_offset) + msi.cfg.size()
209             {
210                 return Some((
211                     PciCapabilityId::MessageSignalledInterrupts,
212                     u64::from(msi.cap_offset),
213                 ));
214             }
215         }
216 
217         if let Some(msix) = &self.msix {
218             if offset == u64::from(msix.cap_offset) {
219                 return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
220             }
221         }
222 
223         None
224     }
225 
226     fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
227         if let Some(msix) = &self.msix {
228             return msix.table_accessed(bar_index, offset);
229         }
230 
231         false
232     }
233 
234     fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
235         if let Some(ref mut msix) = &mut self.msix {
236             let offset = offset - u64::from(msix.cap.table_offset());
237             msix.bar.write_table(offset, data)
238         }
239     }
240 
241     fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
242         if let Some(msix) = &self.msix {
243             let offset = offset - u64::from(msix.cap.table_offset());
244             msix.bar.read_table(offset, data)
245         }
246     }
247 
248     pub(crate) fn intx_in_use(&self) -> bool {
249         if let Some(intx) = &self.intx {
250             return intx.enabled;
251         }
252 
253         false
254     }
255 }
256 
// A chunk of memory mapped into the guest for direct device access.
#[derive(Copy, Clone)]
pub struct UserMemoryRegion {
    // Slot identifier used when registering the region with the hypervisor.
    pub slot: u32,
    // Guest physical start address.
    pub start: u64,
    // Region size in bytes.
    pub size: u64,
    // Host-side address of the mapping.
    pub host_addr: u64,
}

// A device BAR as seen by the guest, plus any directly mapped sub-regions.
#[derive(Clone)]
pub struct MmioRegion {
    pub start: GuestAddress,
    pub length: GuestUsize,
    pub(crate) type_: PciBarRegionType,
    // VFIO region index this BAR corresponds to.
    pub(crate) index: u32,
    pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
}
// Transport-level errors, covering both the kernel VFIO and the vfio-user
// back ends.
#[derive(Debug, Error)]
pub enum VfioError {
    #[error("Kernel VFIO error: {0}")]
    KernelVfio(#[source] vfio_ioctls::VfioError),
    #[error("VFIO user error: {0}")]
    VfioUser(#[source] vfio_user::Error),
}
280 
281 pub(crate) trait Vfio: Send + Sync {
282     fn read_config_byte(&self, offset: u32) -> u8 {
283         let mut data: [u8; 1] = [0];
284         self.read_config(offset, &mut data);
285         data[0]
286     }
287 
288     fn read_config_word(&self, offset: u32) -> u16 {
289         let mut data: [u8; 2] = [0, 0];
290         self.read_config(offset, &mut data);
291         u16::from_le_bytes(data)
292     }
293 
294     fn read_config_dword(&self, offset: u32) -> u32 {
295         let mut data: [u8; 4] = [0, 0, 0, 0];
296         self.read_config(offset, &mut data);
297         u32::from_le_bytes(data)
298     }
299 
300     fn write_config_dword(&self, offset: u32, buf: u32) {
301         let data: [u8; 4] = buf.to_le_bytes();
302         self.write_config(offset, &data)
303     }
304 
305     fn read_config(&self, offset: u32, data: &mut [u8]) {
306         self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
307     }
308 
309     fn write_config(&self, offset: u32, data: &[u8]) {
310         self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
311     }
312 
313     fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
314         self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
315     }
316 
317     fn disable_msi(&self) -> Result<(), VfioError> {
318         self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
319     }
320 
321     fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
322         self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
323     }
324 
325     fn disable_msix(&self) -> Result<(), VfioError> {
326         self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
327     }
328 
329     fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
330         unimplemented!()
331     }
332 
333     fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
334         unimplemented!()
335     }
336 
337     fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
338         unimplemented!()
339     }
340 
341     fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
342         unimplemented!()
343     }
344 
345     fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
346         unimplemented!()
347     }
348 
349     fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
350         unimplemented!()
351     }
352 }
353 
// Thin wrapper giving an `Arc<VfioDevice>` (kernel VFIO) a `Vfio` trait
// implementation.
struct VfioDeviceWrapper {
    device: Arc<VfioDevice>,
}

impl VfioDeviceWrapper {
    fn new(device: Arc<VfioDevice>) -> Self {
        Self { device }
    }
}
363 
// Note: the vfio-ioctls region accessors take (index, data, offset) while
// the `Vfio` trait uses (index, offset, data) — the argument order is
// deliberately swapped in the forwarding calls below.
impl Vfio for VfioDeviceWrapper {
    fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
        self.device.region_read(index, data, offset)
    }

    fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
        self.device.region_write(index, data, offset)
    }

    fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
        self.device.get_irq_info(irq_index).copied()
    }

    fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.device
            .enable_irq(irq_index, event_fds)
            .map_err(VfioError::KernelVfio)
    }

    fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .disable_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }

    fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .unmask_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }
}
395 
// Versioned snapshot payload for the VfioCommon interrupt state.
#[derive(Versionize)]
struct VfioCommonState {
    intx_state: Option<IntxState>,
    msi_state: Option<MsiState>,
    msix_state: Option<MsixState>,
}

impl VersionMapped for VfioCommonState {}
404 
// A patch overlaid on config space register reads: bits selected by `mask`
// are replaced with `patch`. Inserted by parse_extended_capabilities() to
// hide selected extended capabilities from the guest.
// NOTE(review): the application site is outside this view — confirm the
// masking semantics where config reads are served.
pub(crate) struct ConfigPatch {
    mask: u32,
    patch: u32,
}
409 
// State shared by the VFIO and vfio-user PCI device implementations:
// emulated config space, BAR layout and interrupt plumbing.
pub(crate) struct VfioCommon {
    pub(crate) configuration: PciConfiguration,
    pub(crate) mmio_regions: Vec<MmioRegion>,
    pub(crate) interrupt: Interrupt,
    pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
    pub(crate) vfio_wrapper: Arc<dyn Vfio>,
    // Config read patches keyed by register index, see ConfigPatch.
    pub(crate) patches: HashMap<usize, ConfigPatch>,
}
419 
420 impl VfioCommon {
    /// Build a `VfioCommon`, either fresh (parsing the device's capabilities)
    /// or restored from the provided `snapshot`.
    ///
    /// On restore, the PCI configuration, MSI, MSI-X and common interrupt
    /// states are all pulled out of the snapshot; on a cold plug the
    /// capabilities are parsed from the device and legacy INTx is set up.
    pub(crate) fn new(
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        vfio_wrapper: Arc<dyn Vfio>,
        subclass: &dyn PciSubclass,
        bdf: PciBdf,
        snapshot: Option<Snapshot>,
    ) -> Result<Self, VfioPciError> {
        let pci_configuration_state =
            vm_migration::versioned_state_from_id(snapshot.as_ref(), PCI_CONFIGURATION_ID)
                .map_err(|e| {
                    VfioPciError::RetrievePciConfigurationState(anyhow!(
                        "Failed to get PciConfigurationState from Snapshot: {}",
                        e
                    ))
                })?;

        // NOTE(review): the zeroed numeric arguments are presumably
        // vendor/device/revision values overridden elsewhere — confirm
        // against PciConfiguration::new's signature.
        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            subclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
            pci_configuration_state,
        );

        let mut vfio_common = VfioCommon {
            mmio_regions: Vec::new(),
            configuration,
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
            msi_interrupt_manager,
            legacy_interrupt_group,
            vfio_wrapper,
            patches: HashMap::new(),
        };

        let state: Option<VfioCommonState> = snapshot
            .as_ref()
            .map(|s| s.to_versioned_state())
            .transpose()
            .map_err(|e| {
                VfioPciError::RetrieveVfioCommonState(anyhow!(
                    "Failed to get VfioCommonState from Snapshot: {}",
                    e
                ))
            })?;
        let msi_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSI_CONFIG_ID)
            .map_err(|e| {
                VfioPciError::RetrieveMsiConfigState(anyhow!(
                    "Failed to get MsiConfigState from Snapshot: {}",
                    e
                ))
            })?;
        let msix_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSIX_CONFIG_ID)
            .map_err(|e| {
            VfioPciError::RetrieveMsixConfigState(anyhow!(
                "Failed to get MsixConfigState from Snapshot: {}",
                e
            ))
        })?;

        if let Some(state) = state.as_ref() {
            // Restore path: rebuild the interrupt state from the snapshot.
            vfio_common.set_state(state, msi_state, msix_state)?;
        } else {
            // Cold plug: discover capabilities, then wire legacy interrupts.
            vfio_common.parse_capabilities(bdf);
            vfio_common.initialize_legacy_interrupt()?;
        }

        Ok(vfio_common)
    }
500 
    /// Discover and allocate guest address space for every BAR exposed by
    /// the device (regions BAR0..BAR5 plus the expansion ROM).
    ///
    /// When `resources` is provided (restore path), the saved BAR addresses
    /// and sizes are reused. Otherwise each BAR is sized with the standard
    /// PCI "write all 1s, read back" probing mechanism.
    ///
    /// Returns the list of programmed BAR configurations, or an
    /// allocation/registration error.
    pub(crate) fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        let mut bars = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX;

        // Going through all regular regions to compute the BAR size.
        // We're not saving the BAR address to restore it, because we
        // are going to allocate a guest address for each BAR and write
        // that new address back.
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let mut region_size: u64 = 0;
            let mut region_type = PciBarRegionType::Memory32BitRegion;
            let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
            let mut flags: u32 = 0;

            let mut restored_bar_addr = None;
            if let Some(resources) = &resources {
                // Restore path: look up this BAR in the saved resources.
                for resource in resources {
                    if let Resource::PciBar {
                        index,
                        base,
                        size,
                        type_,
                        ..
                    } = resource
                    {
                        if *index == bar_id as usize {
                            restored_bar_addr = Some(GuestAddress(*base));
                            region_size = *size;
                            region_type = PciBarRegionType::from(*type_);
                            break;
                        }
                    }
                }
                if restored_bar_addr.is_none() {
                    bar_id += 1;
                    continue;
                }
            } else {
                // Probe path: locate the BAR register in config space.
                let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                    (PCI_ROM_EXP_BAR_INDEX * 4) as u32
                } else {
                    PCI_CONFIG_BAR_OFFSET + bar_id * 4
                };

                // First read flags
                flags = self.vfio_wrapper.read_config_dword(bar_offset);

                // Is this an IO BAR?
                let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
                } else {
                    false
                };

                // Is this a 64-bit BAR?
                let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(
                        flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                        PCI_CONFIG_MEMORY_BAR_64BIT
                    )
                } else {
                    false
                };

                if matches!(
                    flags & PCI_CONFIG_BAR_PREFETCHABLE,
                    PCI_CONFIG_BAR_PREFETCHABLE
                ) {
                    prefetchable = PciBarPrefetchable::Prefetchable
                };

                // To get size write all 1s
                self.vfio_wrapper
                    .write_config_dword(bar_offset, 0xffff_ffff);

                // And read back BAR value. The device will write zeros for bits it doesn't care about
                let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);

                if io_bar {
                    // Mask flag bits (lowest 2 for I/O bars)
                    lower &= !0b11;

                    // BAR is not enabled
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // IO BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Invert bits and add 1 to calculate size
                    region_size = (!lower + 1) as u64;
                } else if is_64bit_bar {
                    // 64 bits Memory BAR
                    region_type = PciBarRegionType::Memory64BitRegion;

                    // Query size of upper BAR of 64-bit BAR
                    let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                    self.vfio_wrapper
                        .write_config_dword(upper_offset, 0xffff_ffff);
                    let upper = self.vfio_wrapper.read_config_dword(upper_offset);

                    let mut combined_size = u64::from(upper) << 32 | u64::from(lower);

                    // Mask out flag bits (lowest 4 for memory bars)
                    combined_size &= !0b1111;

                    // BAR is not enabled
                    if combined_size == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find size
                    region_size = !combined_size + 1;
                } else {
                    region_type = PciBarRegionType::Memory32BitRegion;

                    // Mask out flag bits (lowest 4 for memory bars)
                    lower &= !0b1111;

                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find size
                    region_size = (!lower + 1) as u64;
                }
            }

            let bar_addr = match region_type {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "aarch64")]
                    unimplemented!();

                    // The address needs to be 4 bytes aligned.
                    #[cfg(not(target_arch = "aarch64"))]
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory32BitRegion => {
                    // BAR allocation must be naturally aligned
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_mmio_hole_addresses(
                            restored_bar_addr,
                            region_size,
                            Some(region_size),
                        )
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory64BitRegion => {
                    // BAR allocation must be naturally aligned
                    mmio_allocator
                        .allocate(restored_bar_addr, region_size, Some(region_size))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
            };

            // We can now build our BAR configuration block.
            let bar = PciBarConfiguration::default()
                .set_index(bar_id as usize)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type)
                .set_prefetchable(prefetchable);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                self.configuration
                    .add_pci_rom_bar(&bar, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&bar)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            bars.push(bar);
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id,
                user_memory_regions: Vec::new(),
            });

            bar_id += 1;
            // A 64-bit BAR consumes two consecutive BAR slots.
            if region_type == PciBarRegionType::Memory64BitRegion {
                bar_id += 1;
            }
        }

        Ok(bars)
    }
706 
707     pub(crate) fn free_bars(
708         &mut self,
709         allocator: &mut SystemAllocator,
710         mmio_allocator: &mut AddressAllocator,
711     ) -> Result<(), PciDeviceError> {
712         for region in self.mmio_regions.iter() {
713             match region.type_ {
714                 PciBarRegionType::IoRegion => {
715                     #[cfg(target_arch = "x86_64")]
716                     allocator.free_io_addresses(region.start, region.length);
717                     #[cfg(target_arch = "aarch64")]
718                     error!("I/O region is not supported");
719                 }
720                 PciBarRegionType::Memory32BitRegion => {
721                     allocator.free_mmio_hole_addresses(region.start, region.length);
722                 }
723                 PciBarRegionType::Memory64BitRegion => {
724                     mmio_allocator.free(region.start, region.length);
725                 }
726             }
727         }
728         Ok(())
729     }
730 
731     pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
732         let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());
733 
734         let table = self.vfio_wrapper.read_config_dword((cap + 4).into());
735 
736         let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());
737 
738         MsixCap {
739             msg_ctl,
740             table,
741             pba,
742         }
743     }
744 
    /// Create the MSI-X interrupt group and the emulated MSI-X BAR state for
    /// the capability described by `msix_cap`.
    ///
    /// `state` carries a restored `MsixConfigState` on the snapshot-restore
    /// path, `None` on a cold plug.
    pub(crate) fn initialize_msix(
        &mut self,
        msix_cap: MsixCap,
        cap_offset: u32,
        bdf: PciBdf,
        state: Option<MsixConfigState>,
    ) {
        // One interrupt route per MSI-X table entry.
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msix_cap.table_size() as InterruptIndex,
            })
            .unwrap();

        let msix_config = MsixConfig::new(
            msix_cap.table_size(),
            interrupt_source_group.clone(),
            bdf.into(),
            state,
        )
        .unwrap();

        self.interrupt.msix = Some(VfioMsix {
            bar: msix_config,
            cap: msix_cap,
            cap_offset,
            interrupt_source_group,
        });
    }
775 
776     pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
777         self.vfio_wrapper.read_config_word((cap + 2).into())
778     }
779 
    /// Create the MSI interrupt group and emulated MSI capability from the
    /// Message Control value read off the device.
    ///
    /// `state` carries a restored `MsiConfigState` on the snapshot-restore
    /// path, `None` on a cold plug.
    pub(crate) fn initialize_msi(
        &mut self,
        msg_ctl: u16,
        cap_offset: u32,
        state: Option<MsiConfigState>,
    ) {
        // One interrupt route per enabled MSI vector.
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
            })
            .unwrap();

        let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap();

        self.interrupt.msi = Some(VfioMsi {
            cfg: msi_config,
            cap_offset,
            interrupt_source_group,
        });
    }
802 
    /// Walk the device's standard PCI capability list and set up the MSI /
    /// MSI-X emulation for the capabilities it advertises.
    pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) {
        let mut cap_next = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        let mut pci_express_cap_found = false;
        let mut power_management_cap_found = false;

        // NOTE(review): the chain offsets come straight from the device; a
        // malformed or cyclic capability list would keep this loop running —
        // confirm whether a termination guard is needed.
        while cap_next != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());

            match PciCapabilityId::from(cap_id) {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI.
                            let msg_ctl = self.parse_msi_capabilities(cap_next);
                            self.initialize_msi(msg_ctl, cap_next as u32, None);
                        }
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
                    {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI-X.
                            let msix_cap = self.parse_msix_capabilities(cap_next);
                            self.initialize_msix(msix_cap, cap_next as u32, bdf, None);
                        }
                    }
                }
                PciCapabilityId::PciExpress => pci_express_cap_found = true,
                PciCapabilityId::PowerManagement => power_management_cap_found = true,
                _ => {}
            };

            // The "next capability" pointer lives one byte past the cap ID.
            cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
        }

        // NOTE(review): extended config space is only walked when both the
        // PCIe and power-management capabilities were seen — presumably a
        // heuristic for a conventional PCIe endpoint; confirm the intent.
        if pci_express_cap_found && power_management_cap_found {
            self.parse_extended_capabilities();
        }
    }
848 
849     fn parse_extended_capabilities(&mut self) {
850         let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;
851 
852         loop {
853             let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);
854 
855             let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
856             let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;
857 
858             match PciExpressCapabilityId::from(cap_id) {
859                 PciExpressCapabilityId::AlternativeRoutingIdentificationIntepretation
860                 | PciExpressCapabilityId::ResizeableBar
861                 | PciExpressCapabilityId::SingleRootIoVirtualization => {
862                     let reg_idx = (current_offset / 4) as usize;
863                     self.patches.insert(
864                         reg_idx,
865                         ConfigPatch {
866                             mask: 0x0000_ffff,
867                             patch: PciExpressCapabilityId::NullCapability as u32,
868                         },
869                     );
870                 }
871                 _ => {}
872             }
873 
874             if cap_next == 0 {
875                 break;
876             }
877 
878             current_offset = cap_next.into();
879         }
880     }
881 
882     pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
883         if let Some(intx) = &mut self.interrupt.intx {
884             if !intx.enabled {
885                 if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
886                     self.vfio_wrapper
887                         .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
888                         .map_err(VfioPciError::EnableIntx)?;
889 
890                     intx.enabled = true;
891                 } else {
892                     return Err(VfioPciError::MissingNotifier);
893                 }
894             }
895         }
896 
897         Ok(())
898     }
899 
900     pub(crate) fn disable_intx(&mut self) {
901         if let Some(intx) = &mut self.interrupt.intx {
902             if intx.enabled {
903                 if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
904                     error!("Could not disable INTx: {}", e);
905                 } else {
906                     intx.enabled = false;
907                 }
908             }
909         }
910     }
911 
912     pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
913         if let Some(msi) = &self.interrupt.msi {
914             let mut irq_fds: Vec<EventFd> = Vec::new();
915             for i in 0..msi.cfg.num_enabled_vectors() {
916                 if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
917                     irq_fds.push(eventfd);
918                 } else {
919                     return Err(VfioPciError::MissingNotifier);
920                 }
921             }
922 
923             self.vfio_wrapper
924                 .enable_msi(irq_fds.iter().collect())
925                 .map_err(VfioPciError::EnableMsi)?;
926         }
927 
928         Ok(())
929     }
930 
931     pub(crate) fn disable_msi(&self) {
932         if let Err(e) = self.vfio_wrapper.disable_msi() {
933             error!("Could not disable MSI: {}", e);
934         }
935     }
936 
937     pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
938         if let Some(msix) = &self.interrupt.msix {
939             let mut irq_fds: Vec<EventFd> = Vec::new();
940             for i in 0..msix.bar.table_entries.len() {
941                 if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
942                     irq_fds.push(eventfd);
943                 } else {
944                     return Err(VfioPciError::MissingNotifier);
945                 }
946             }
947 
948             self.vfio_wrapper
949                 .enable_msix(irq_fds.iter().collect())
950                 .map_err(VfioPciError::EnableMsix)?;
951         }
952 
953         Ok(())
954     }
955 
956     pub(crate) fn disable_msix(&self) {
957         if let Err(e) = self.vfio_wrapper.disable_msix() {
958             error!("Could not disable MSI-X: {}", e);
959         }
960     }
961 
962     pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
963         if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
964             if irq_info.count == 0 {
965                 // A count of 0 means the INTx IRQ is not supported, therefore
966                 // it shouldn't be initialized.
967                 return Ok(());
968             }
969         }
970 
971         if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
972             self.interrupt.intx = Some(VfioIntx {
973                 interrupt_source_group,
974                 enabled: false,
975             });
976 
977             self.enable_intx()?;
978         }
979 
980         Ok(())
981     }
982 
983     pub(crate) fn update_msi_capabilities(
984         &mut self,
985         offset: u64,
986         data: &[u8],
987     ) -> Result<(), VfioPciError> {
988         match self.interrupt.update_msi(offset, data) {
989             Some(InterruptUpdateAction::EnableMsi) => {
990                 // Disable INTx before we can enable MSI
991                 self.disable_intx();
992                 self.enable_msi()?;
993             }
994             Some(InterruptUpdateAction::DisableMsi) => {
995                 // Fallback onto INTx when disabling MSI
996                 self.disable_msi();
997                 self.enable_intx()?;
998             }
999             _ => {}
1000         }
1001 
1002         Ok(())
1003     }
1004 
1005     pub(crate) fn update_msix_capabilities(
1006         &mut self,
1007         offset: u64,
1008         data: &[u8],
1009     ) -> Result<(), VfioPciError> {
1010         match self.interrupt.update_msix(offset, data) {
1011             Some(InterruptUpdateAction::EnableMsix) => {
1012                 // Disable INTx before we can enable MSI-X
1013                 self.disable_intx();
1014                 self.enable_msix()?;
1015             }
1016             Some(InterruptUpdateAction::DisableMsix) => {
1017                 // Fallback onto INTx when disabling MSI-X
1018                 self.disable_msix();
1019                 self.enable_intx()?;
1020             }
1021             _ => {}
1022         }
1023 
1024         Ok(())
1025     }
1026 
1027     pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
1028         for region in self.mmio_regions.iter() {
1029             if addr >= region.start.raw_value()
1030                 && addr < region.start.unchecked_add(region.length).raw_value()
1031             {
1032                 return Some(region.clone());
1033             }
1034         }
1035         None
1036     }
1037 
1038     pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1039         let addr = base + offset;
1040         if let Some(region) = self.find_region(addr) {
1041             let offset = addr - region.start.raw_value();
1042 
1043             if self.interrupt.msix_table_accessed(region.index, offset) {
1044                 self.interrupt.msix_read_table(offset, data);
1045             } else {
1046                 self.vfio_wrapper.region_read(region.index, offset, data);
1047             }
1048         }
1049 
1050         // INTx EOI
1051         // The guest reading from the BAR potentially means the interrupt has
1052         // been received and can be acknowledged.
1053         if self.interrupt.intx_in_use() {
1054             if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
1055                 error!("Failed unmasking INTx IRQ: {}", e);
1056             }
1057         }
1058     }
1059 
1060     pub(crate) fn write_bar(
1061         &mut self,
1062         base: u64,
1063         offset: u64,
1064         data: &[u8],
1065     ) -> Option<Arc<Barrier>> {
1066         let addr = base + offset;
1067         if let Some(region) = self.find_region(addr) {
1068             let offset = addr - region.start.raw_value();
1069 
1070             // If the MSI-X table is written to, we need to update our cache.
1071             if self.interrupt.msix_table_accessed(region.index, offset) {
1072                 self.interrupt.msix_write_table(offset, data);
1073             } else {
1074                 self.vfio_wrapper.region_write(region.index, offset, data);
1075             }
1076         }
1077 
1078         // INTx EOI
1079         // The guest writing to the BAR potentially means the interrupt has
1080         // been received and can be acknowledged.
1081         if self.interrupt.intx_in_use() {
1082             if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
1083                 error!("Failed unmasking INTx IRQ: {}", e);
1084             }
1085         }
1086 
1087         None
1088     }
1089 
    /// Handles a guest write to the PCI configuration space.
    ///
    /// BAR registers are written only to the local cache (the guest-visible
    /// BARs differ from the physical ones), MSI/MSI-X capability accesses
    /// update the cached capability — potentially toggling VFIO interrupts —
    /// and all non-BAR writes are finally forwarded to the device's config
    /// space. Always returns `None` (no barrier needed).
    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        // Byte offset of the register inside the config space.
        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            // Offset of the write relative to the start of the capability.
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through VFIO_DEVICE_SET_IRQS ioctl, and only after we can write
        // to the device region to update the MSI Enable bit.
        self.vfio_wrapper.write_config((reg + offset) as u32, data);

        None
    }
1145 
1146     pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
1147         // When reading the BARs, we trap it and return what comes
1148         // from our local configuration space. We want the guest to
1149         // use that and not the VFIO device BARs as it does not map
1150         // with the guest address space.
1151         if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
1152             || reg_idx == PCI_ROM_EXP_BAR_INDEX
1153         {
1154             return self.configuration.read_reg(reg_idx);
1155         }
1156 
1157         // Since we don't support passing multi-functions devices, we should
1158         // mask the multi-function bit, bit 7 of the Header Type byte on the
1159         // register 3.
1160         let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
1161             0xff7f_ffff
1162         } else {
1163             0xffff_ffff
1164         };
1165 
1166         // The config register read comes from the VFIO device itself.
1167         let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;
1168 
1169         if let Some(config_patch) = self.patches.get(&reg_idx) {
1170             value = (value & !config_patch.mask) | config_patch.patch;
1171         }
1172 
1173         value
1174     }
1175 
1176     fn state(&self) -> VfioCommonState {
1177         let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
1178             enabled: intx.enabled,
1179         });
1180 
1181         let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
1182             cap: msi.cfg.cap,
1183             cap_offset: msi.cap_offset,
1184         });
1185 
1186         let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
1187             cap: msix.cap,
1188             cap_offset: msix.cap_offset,
1189             bdf: msix.bar.devid,
1190         });
1191 
1192         VfioCommonState {
1193             intx_state,
1194             msi_state,
1195             msix_state,
1196         }
1197     }
1198 
    /// Restores the INTx/MSI/MSI-X configuration from a previously captured
    /// `VfioCommonState` (see `state()`), typically after a migration.
    ///
    /// `msi_state`/`msix_state` carry the saved MSI/MSI-X config snapshots
    /// used to re-initialize the corresponding capabilities.
    fn set_state(
        &mut self,
        state: &VfioCommonState,
        msi_state: Option<MsiConfigState>,
        msix_state: Option<MsixConfigState>,
    ) -> Result<(), VfioPciError> {
        // Recreate the INTx interrupt (initially disabled) and re-enable it
        // on the VFIO device only if it was enabled when captured.
        if let (Some(intx), Some(interrupt_source_group)) =
            (&state.intx_state, self.legacy_interrupt_group.clone())
        {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            if intx.enabled {
                self.enable_intx()?;
            }
        }

        // Rebuild the MSI configuration from the saved capability.
        if let Some(msi) = &state.msi_state {
            self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state);
        }

        // Rebuild the MSI-X configuration from the saved capability.
        if let Some(msix) = &state.msix_state {
            self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state);
        }

        Ok(())
    }
1228 }
1229 
1230 impl Pausable for VfioCommon {}
1231 
impl Snapshottable for VfioCommon {
    fn id(&self) -> String {
        String::from(VFIO_COMMON_ID)
    }

    // Builds a snapshot from the versioned interrupt state (see state()),
    // with nested snapshots for the PCI configuration and, when present,
    // the MSI and MSI-X configurations.
    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_common_snapshot = Snapshot::new_from_versioned_state(&self.state())?;

        // Snapshot PciConfiguration
        vfio_common_snapshot.add_snapshot(self.configuration.id(), self.configuration.snapshot()?);

        // Snapshot MSI
        if let Some(msi) = &mut self.interrupt.msi {
            vfio_common_snapshot.add_snapshot(msi.cfg.id(), msi.cfg.snapshot()?);
        }

        // Snapshot MSI-X
        if let Some(msix) = &mut self.interrupt.msix {
            vfio_common_snapshot.add_snapshot(msix.bar.id(), msix.bar.snapshot()?);
        }

        Ok(vfio_common_snapshot)
    }
}
1256 
/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
pub struct VfioPciDevice {
    // Unique identifier for this device.
    id: String,
    // Hypervisor VM handle, used to create/remove user memory regions.
    vm: Arc<dyn hypervisor::Vm>,
    // Underlying VFIO device.
    device: Arc<VfioDevice>,
    // VFIO container, used for DMA (un)mapping.
    container: Arc<VfioContainer>,
    // Logic shared with other VFIO-backed devices (config space, BARs,
    // interrupts).
    common: VfioCommon,
    // Whether the device sits behind a virtual IOMMU; when true, DMA
    // (un)mapping through the container is skipped.
    iommu_attached: bool,
    // Closure providing a memory slot id for new user memory regions.
    memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
}
1272 
impl VfioPciDevice {
    /// Constructs a new Vfio Pci device for the given Vfio device
    ///
    /// The VFIO device is reset first, then wrapped, and the shared
    /// `VfioCommon` logic is built from an optional snapshot so the device
    /// can be restored after a migration.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        id: String,
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        iommu_attached: bool,
        bdf: PciBdf,
        memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
        snapshot: Option<Snapshot>,
    ) -> Result<Self, VfioPciError> {
        let device = Arc::new(device);
        device.reset();

        let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));

        let common = VfioCommon::new(
            msi_interrupt_manager,
            legacy_interrupt_group,
            Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
            &PciVfioSubclass::VfioSubclass,
            bdf,
            vm_migration::snapshot_from_id(snapshot.as_ref(), VFIO_COMMON_ID),
        )?;

        let vfio_pci_device = VfioPciDevice {
            id,
            vm: vm.clone(),
            device,
            container,
            common,
            iommu_attached,
            memory_slot,
        };

        Ok(vfio_pci_device)
    }

    /// Returns true when the device is attached to a virtual IOMMU.
    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }

    // Rounds `address` up to the next 4KiB boundary (identity when already
    // aligned).
    fn align_4k(address: u64) -> u64 {
        (address + 0xfff) & 0xffff_ffff_ffff_f000
    }

    // True when `address` sits on a 4KiB boundary.
    fn is_4k_aligned(address: u64) -> bool {
        (address & 0xfff) == 0
    }

    // True when `size` is a whole number of 4KiB pages.
    fn is_4k_multiple(size: u64) -> bool {
        (size & 0xfff) == 0
    }

    // Computes the list of mmap-able sparse areas for a region:
    // - a sparse-mmap capability reported by VFIO wins as-is;
    // - otherwise, for an MSIX_MAPPABLE region, areas are computed by
    //   punching holes around the MSI-X table and PBA ranges, which must
    //   stay trapped;
    // - otherwise, the whole region is a single area.
    fn generate_sparse_areas(
        caps: &[VfioRegionInfoCap],
        region_index: u32,
        region_start: u64,
        region_size: u64,
        vfio_msix: Option<&VfioMsix>,
    ) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
        for cap in caps {
            match cap {
                VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
                VfioRegionInfoCap::MsixMappable => {
                    if !Self::is_4k_aligned(region_start) {
                        error!(
                            "Region start address 0x{:x} must be at least aligned on 4KiB",
                            region_start
                        );
                        return Err(VfioPciError::RegionAlignment);
                    }
                    if !Self::is_4k_multiple(region_size) {
                        error!(
                            "Region size 0x{:x} must be at least a multiple of 4KiB",
                            region_size
                        );
                        return Err(VfioPciError::RegionSize);
                    }

                    // In case the region contains the MSI-X vectors table or
                    // the MSI-X PBA table, we must calculate the subregions
                    // around them, leading to a list of sparse areas.
                    // We want to make sure we will still trap MMIO accesses
                    // to these MSI-X specific ranges.
                    //
                    // Using a BtreeMap as the list provided through the iterator is sorted
                    // by key. This ensures proper split of the whole region.
                    let mut inter_ranges = BTreeMap::new();
                    if let Some(msix) = vfio_msix {
                        if region_index == msix.cap.table_bir() {
                            let (offset, size) = msix.cap.table_range();
                            inter_ranges.insert(offset, size);
                        }
                        if region_index == msix.cap.pba_bir() {
                            let (offset, size) = msix.cap.pba_range();
                            inter_ranges.insert(offset, size);
                        }
                    }

                    let mut sparse_areas = Vec::new();
                    let mut current_offset = 0;
                    for (range_offset, range_size) in inter_ranges {
                        // Emit the mmap-able gap before this excluded range.
                        if range_offset > current_offset {
                            sparse_areas.push(VfioRegionSparseMmapArea {
                                offset: current_offset,
                                size: range_offset - current_offset,
                            });
                        }

                        // Skip past the excluded range, 4KiB aligned up.
                        current_offset = Self::align_4k(range_offset + range_size);
                    }

                    // Trailing area after the last excluded range.
                    if region_size > current_offset {
                        sparse_areas.push(VfioRegionSparseMmapArea {
                            offset: current_offset,
                            size: region_size - current_offset,
                        });
                    }

                    return Ok(sparse_areas);
                }
                _ => {}
            }
        }

        // In case no relevant capabilities have been found, create a single
        // sparse area corresponding to the entire MMIO region.
        Ok(vec![VfioRegionSparseMmapArea {
            offset: 0,
            size: region_size,
        }])
    }

    /// Map MMIO regions into the guest, and avoid VM exits when the guest
    /// tries to reach those regions.
    ///
    /// Each mmap-able VFIO region (or the sparse sub-areas computed around
    /// the MSI-X table/PBA) is mmap'ed from the device fd and registered with
    /// the hypervisor as a user memory region, using `self.vm` and the
    /// `self.memory_slot` closure.
    pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
        let fd = self.device.as_raw_fd();

        for region in self.common.mmio_regions.iter_mut() {
            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                // Derive mmap protection from the region's VFIO flags.
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }

                // Retrieve the list of capabilities found on the region
                let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
                    self.device.get_region_caps(region.index)
                } else {
                    Vec::new()
                };

                // Don't try to mmap the region if it contains MSI-X table or
                // MSI-X PBA subregion, and if we couldn't find MSIX_MAPPABLE
                // in the list of supported capabilities.
                if let Some(msix) = self.common.interrupt.msix.as_ref() {
                    if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
                        && !caps.contains(&VfioRegionInfoCap::MsixMappable)
                    {
                        continue;
                    }
                }

                let mmap_size = self.device.get_region_size(region.index);
                let mmap_offset = self.device.get_region_offset(region.index);

                let sparse_areas = Self::generate_sparse_areas(
                    &caps,
                    region.index,
                    region.start.0,
                    mmap_size,
                    self.common.interrupt.msix.as_ref(),
                )?;

                for area in sparse_areas.iter() {
                    // SAFETY: FFI call with correct arguments
                    let host_addr = unsafe {
                        libc::mmap(
                            null_mut(),
                            area.size as usize,
                            prot,
                            libc::MAP_SHARED,
                            fd,
                            mmap_offset as libc::off_t + area.offset as libc::off_t,
                        )
                    };

                    if host_addr == libc::MAP_FAILED {
                        error!(
                            "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
                            area.offset,
                            area.size,
                            std::io::Error::last_os_error()
                        );
                        return Err(VfioPciError::MmapArea);
                    }

                    // Track the mapping so it can be moved (move_bar) and
                    // torn down (unmap_mmio_regions) later.
                    let user_memory_region = UserMemoryRegion {
                        slot: (self.memory_slot)(),
                        start: region.start.0 + area.offset,
                        size: area.size,
                        host_addr: host_addr as u64,
                    };

                    region.user_memory_regions.push(user_memory_region);

                    let mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(mem_region)
                        .map_err(VfioPciError::CreateUserMemoryRegion)?;
                }
            }
        }

        Ok(())
    }

    /// Tears down every user memory region created by map_mmio_regions(),
    /// removing it from the hypervisor and munmap'ing the host mapping.
    /// Failures are logged but do not abort the teardown.
    pub fn unmap_mmio_regions(&mut self) {
        for region in self.common.mmio_regions.iter() {
            for user_memory_region in region.user_memory_regions.iter() {
                // Remove region
                let r = self.vm.make_user_memory_region(
                    user_memory_region.slot,
                    user_memory_region.start,
                    user_memory_region.size,
                    user_memory_region.host_addr,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                // SAFETY: FFI call with correct arguments
                let ret = unsafe {
                    libc::munmap(
                        user_memory_region.host_addr as *mut libc::c_void,
                        user_memory_region.size as usize,
                    )
                };
                if ret != 0 {
                    error!(
                        "Could not unmap region {}, error:{}",
                        region.index,
                        io::Error::last_os_error()
                    );
                }
            }
        }
    }

    /// DMA-maps `[user_addr, user_addr + size)` at `iova` through the VFIO
    /// container. Skipped when the device is attached to a virtual IOMMU
    /// (the mapping is presumably handled through the vIOMMU instead —
    /// see the VMM side for confirmation).
    pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_map(iova, size, user_addr)
                .map_err(VfioPciError::DmaMap)?;
        }

        Ok(())
    }

    /// Reverses a dma_map() for the given `iova`/`size`. Also skipped when
    /// the device is attached to a virtual IOMMU.
    pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_unmap(iova, size)
                .map_err(VfioPciError::DmaUnmap)?;
        }

        Ok(())
    }

    /// Returns a clone of the device's MMIO regions.
    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.common.mmio_regions.clone()
    }
}
1572 
1573 impl Drop for VfioPciDevice {
1574     fn drop(&mut self) {
1575         self.unmap_mmio_regions();
1576 
1577         if let Some(msix) = &self.common.interrupt.msix {
1578             if msix.bar.enabled() {
1579                 self.common.disable_msix();
1580             }
1581         }
1582 
1583         if let Some(msi) = &self.common.interrupt.msi {
1584             if msi.cfg.enabled() {
1585                 self.common.disable_msi()
1586             }
1587         }
1588 
1589         if self.common.interrupt.intx_in_use() {
1590             self.common.disable_intx();
1591         }
1592     }
1593 }
1594 
impl BusDevice for VfioPciDevice {
    // Bus MMIO read: forwarded to the PciDevice read_bar() implementation.
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    // Bus MMIO write: forwarded to the PciDevice write_bar() implementation.
    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}
1604 
// First BAR offset in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// Extended capabilities register offset in the PCI config space.
const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
// IO BAR when first BAR bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory bar flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// Prefetchable BAR bit
const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device
const BAR_NUMS: usize = 6;
// PCI Header Type register index (dword index, i.e. byte offset / 4).
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index (dword index).
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index (dword index).
const PCI_ROM_EXP_BAR_INDEX: usize = 12;
1627 
1628 impl PciDevice for VfioPciDevice {
    // BAR allocation is delegated to the shared VfioCommon logic.
    fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        self.common
            .allocate_bars(allocator, mmio_allocator, resources)
    }
1638 
    // BAR release is delegated to the shared VfioCommon logic.
    fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        self.common.free_bars(allocator, mmio_allocator)
    }
1646 
    // Config-space writes are delegated to VfioCommon, which handles BAR
    // trapping and MSI/MSI-X capability updates.
    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        self.common.write_config_register(reg_idx, offset, data)
    }
1655 
    // Config-space reads are delegated to VfioCommon (cached BARs, masking,
    // config patches).
    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        self.common.read_config_register(reg_idx)
    }
1659 
    // BAR reprogramming detection relies on the locally cached
    // PciConfiguration, which mirrors the guest-visible BARs.
    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.common
            .configuration
            .detect_bar_reprogramming(reg_idx, data)
    }
1669 
    // BAR reads are delegated to VfioCommon (MSI-X table cache or VFIO region).
    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.common.read_bar(base, offset, data)
    }
1673 
    // BAR writes are delegated to VfioCommon (MSI-X table cache or VFIO region).
    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.common.write_bar(base, offset, data)
    }
1677 
    // Relocates the MMIO region whose guest base is `old_base` to
    // `new_base`: for each associated user memory region, the old mapping is
    // removed from the hypervisor, the start address is shifted by the same
    // delta, and the region is re-registered. The host mapping itself
    // (host_addr) is unchanged.
    fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
        for region in self.common.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                for user_memory_region in region.user_memory_regions.iter_mut() {
                    // Remove old region
                    let old_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .remove_user_memory_region(old_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

                    // Update the user memory region with the correct start address.
                    if new_base > old_base {
                        user_memory_region.start += new_base - old_base;
                    } else {
                        user_memory_region.start -= old_base - new_base;
                    }

                    // Insert new region
                    let new_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(new_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
                }
            }
        }

        Ok(())
    }
1724 
1725     fn as_any(&mut self) -> &mut dyn Any {
1726         self
1727     }
1728 
1729     fn id(&self) -> Option<String> {
1730         Some(self.id.clone())
1731     }
1732 }
1733 
1734 impl Pausable for VfioPciDevice {}
1735 
1736 impl Snapshottable for VfioPciDevice {
1737     fn id(&self) -> String {
1738         self.id.clone()
1739     }
1740 
1741     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
1742         let mut vfio_pci_dev_snapshot = Snapshot::default();
1743 
1744         // Snapshot VfioCommon
1745         vfio_pci_dev_snapshot.add_snapshot(self.common.id(), self.common.snapshot()?);
1746 
1747         Ok(vfio_pci_dev_snapshot)
1748     }
1749 }
// Default trait implementations are enough: snapshot transport and the
// overall migration flow need no device-specific behavior.
impl Transportable for VfioPciDevice {}
impl Migratable for VfioPciDevice {}
1752