xref: /cloud-hypervisor/pci/src/vfio.rs (revision 190a11f2124b0b60a2d44e85b7c9988373acfb6d)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 
6 use std::any::Any;
7 use std::collections::{BTreeMap, HashMap};
8 use std::io;
9 use std::os::unix::io::AsRawFd;
10 use std::path::PathBuf;
11 use std::ptr::null_mut;
12 use std::sync::{Arc, Barrier, Mutex};
13 
14 use anyhow::anyhow;
15 use byteorder::{ByteOrder, LittleEndian};
16 use hypervisor::HypervisorVmError;
17 use libc::{sysconf, _SC_PAGESIZE};
18 use serde::{Deserialize, Serialize};
19 use thiserror::Error;
20 use vfio_bindings::bindings::vfio::*;
21 use vfio_ioctls::{
22     VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
23 };
24 use vm_allocator::page_size::{
25     align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned,
26 };
27 use vm_allocator::{AddressAllocator, MemorySlotAllocator, SystemAllocator};
28 use vm_device::dma_mapping::ExternalDmaMapping;
29 use vm_device::interrupt::{
30     InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
31 };
32 use vm_device::{BusDevice, Resource};
33 use vm_memory::{Address, GuestAddress, GuestAddressSpace, GuestMemory, GuestUsize};
34 use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable};
35 use vmm_sys_util::eventfd::EventFd;
36 
37 use crate::msi::{MsiConfigState, MSI_CONFIG_ID};
38 use crate::msix::MsixConfigState;
39 use crate::{
40     msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
41     PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
42     PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
43     PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID,
44 };
45 
46 pub(crate) const VFIO_COMMON_ID: &str = "vfio_common";
47 
48 #[derive(Debug, Error)]
49 pub enum VfioPciError {
50     #[error("Failed to create user memory region")]
51     CreateUserMemoryRegion(#[source] HypervisorVmError),
52     #[error("Failed to DMA map: {0} for device {1} (guest BDF: {2})")]
53     DmaMap(#[source] vfio_ioctls::VfioError, PathBuf, PciBdf),
54     #[error("Failed to DMA unmap: {0} for device {1} (guest BDF: {2})")]
55     DmaUnmap(#[source] vfio_ioctls::VfioError, PathBuf, PciBdf),
56     #[error("Failed to enable INTx")]
57     EnableIntx(#[source] VfioError),
58     #[error("Failed to enable MSI")]
59     EnableMsi(#[source] VfioError),
60     #[error("Failed to enable MSI-x")]
61     EnableMsix(#[source] VfioError),
62     #[error("Failed to mmap the area")]
63     MmapArea,
64     #[error("Failed to notifier's eventfd")]
65     MissingNotifier,
66     #[error("Invalid region alignment")]
67     RegionAlignment,
68     #[error("Invalid region size")]
69     RegionSize,
70     #[error("Failed to retrieve MsiConfigState")]
71     RetrieveMsiConfigState(#[source] anyhow::Error),
72     #[error("Failed to retrieve MsixConfigState")]
73     RetrieveMsixConfigState(#[source] anyhow::Error),
74     #[error("Failed to retrieve PciConfigurationState")]
75     RetrievePciConfigurationState(#[source] anyhow::Error),
76     #[error("Failed to retrieve VfioCommonState")]
77     RetrieveVfioCommonState(#[source] anyhow::Error),
78 }
79 
80 #[derive(Copy, Clone)]
81 enum PciVfioSubclass {
82     VfioSubclass = 0xff,
83 }
84 
85 impl PciSubclass for PciVfioSubclass {
86     fn get_register_value(&self) -> u8 {
87         *self as u8
88     }
89 }
90 
91 enum InterruptUpdateAction {
92     EnableMsi,
93     DisableMsi,
94     EnableMsix,
95     DisableMsix,
96 }
97 
98 #[derive(Serialize, Deserialize)]
99 struct IntxState {
100     enabled: bool,
101 }
102 
103 pub(crate) struct VfioIntx {
104     interrupt_source_group: Arc<dyn InterruptSourceGroup>,
105     enabled: bool,
106 }
107 
108 #[derive(Serialize, Deserialize)]
109 struct MsiState {
110     cap: MsiCap,
111     cap_offset: u32,
112 }
113 
114 pub(crate) struct VfioMsi {
115     pub(crate) cfg: MsiConfig,
116     cap_offset: u32,
117     interrupt_source_group: Arc<dyn InterruptSourceGroup>,
118 }
119 
120 impl VfioMsi {
121     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
122         let old_enabled = self.cfg.enabled();
123 
124         self.cfg.update(offset, data);
125 
126         let new_enabled = self.cfg.enabled();
127 
128         if !old_enabled && new_enabled {
129             return Some(InterruptUpdateAction::EnableMsi);
130         }
131 
132         if old_enabled && !new_enabled {
133             return Some(InterruptUpdateAction::DisableMsi);
134         }
135 
136         None
137     }
138 }
139 
140 #[derive(Serialize, Deserialize)]
141 struct MsixState {
142     cap: MsixCap,
143     cap_offset: u32,
144     bdf: u32,
145 }
146 
147 pub(crate) struct VfioMsix {
148     pub(crate) bar: MsixConfig,
149     cap: MsixCap,
150     cap_offset: u32,
151     interrupt_source_group: Arc<dyn InterruptSourceGroup>,
152 }
153 
154 impl VfioMsix {
155     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
156         let old_enabled = self.bar.enabled();
157 
158         // Update "Message Control" word
159         if offset == 2 && data.len() == 2 {
160             self.bar.set_msg_ctl(LittleEndian::read_u16(data));
161         }
162 
163         let new_enabled = self.bar.enabled();
164 
165         if !old_enabled && new_enabled {
166             return Some(InterruptUpdateAction::EnableMsix);
167         }
168 
169         if old_enabled && !new_enabled {
170             return Some(InterruptUpdateAction::DisableMsix);
171         }
172 
173         None
174     }
175 
176     fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
177         let table_offset: u64 = u64::from(self.cap.table_offset());
178         let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
179         let table_bir: u32 = self.cap.table_bir();
180 
181         bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
182     }
183 }
184 
185 pub(crate) struct Interrupt {
186     pub(crate) intx: Option<VfioIntx>,
187     pub(crate) msi: Option<VfioMsi>,
188     pub(crate) msix: Option<VfioMsix>,
189 }
190 
191 impl Interrupt {
192     fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
193         if let Some(ref mut msi) = &mut self.msi {
194             let action = msi.update(offset, data);
195             return action;
196         }
197 
198         None
199     }
200 
201     fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
202         if let Some(ref mut msix) = &mut self.msix {
203             let action = msix.update(offset, data);
204             return action;
205         }
206 
207         None
208     }
209 
210     fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
211         if let Some(msi) = &self.msi {
212             if offset >= u64::from(msi.cap_offset)
213                 && offset < u64::from(msi.cap_offset) + msi.cfg.size()
214             {
215                 return Some((
216                     PciCapabilityId::MessageSignalledInterrupts,
217                     u64::from(msi.cap_offset),
218                 ));
219             }
220         }
221 
222         if let Some(msix) = &self.msix {
223             if offset == u64::from(msix.cap_offset) {
224                 return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
225             }
226         }
227 
228         None
229     }
230 
231     fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
232         if let Some(msix) = &self.msix {
233             return msix.table_accessed(bar_index, offset);
234         }
235 
236         false
237     }
238 
239     fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
240         if let Some(ref mut msix) = &mut self.msix {
241             let offset = offset - u64::from(msix.cap.table_offset());
242             msix.bar.write_table(offset, data)
243         }
244     }
245 
246     fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
247         if let Some(msix) = &self.msix {
248             let offset = offset - u64::from(msix.cap.table_offset());
249             msix.bar.read_table(offset, data)
250         }
251     }
252 
253     pub(crate) fn intx_in_use(&self) -> bool {
254         if let Some(intx) = &self.intx {
255             return intx.enabled;
256         }
257 
258         false
259     }
260 }
261 
262 #[derive(Copy, Clone)]
263 pub struct UserMemoryRegion {
264     pub slot: u32,
265     pub start: u64,
266     pub size: u64,
267     pub host_addr: u64,
268 }
269 
270 #[derive(Clone)]
271 pub struct MmioRegion {
272     pub start: GuestAddress,
273     pub length: GuestUsize,
274     pub(crate) type_: PciBarRegionType,
275     pub(crate) index: u32,
276     pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
277 }
278 
279 trait MmioRegionRange {
280     fn check_range(&self, guest_addr: u64, size: u64) -> bool;
281     fn find_user_address(&self, guest_addr: u64) -> Result<u64, io::Error>;
282 }
283 
284 impl MmioRegionRange for Vec<MmioRegion> {
285     // Check if a guest address is within the range of mmio regions
286     fn check_range(&self, guest_addr: u64, size: u64) -> bool {
287         for region in self.iter() {
288             let Some(guest_addr_end) = guest_addr.checked_add(size) else {
289                 return false;
290             };
291             let Some(region_end) = region.start.raw_value().checked_add(region.length) else {
292                 return false;
293             };
294             if guest_addr >= region.start.raw_value() && guest_addr_end <= region_end {
295                 return true;
296             }
297         }
298         false
299     }
300 
301     // Locate the user region address for a guest address within all mmio regions
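    // For example (hypothetical values): a user region with start 0x1_0000, size
    // 0x1000 and host_addr 0x7f00_0000 maps guest address 0x1_0800 to host
    // address 0x7f00_0800.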
302     fn find_user_address(&self, guest_addr: u64) -> Result<u64, io::Error> {
303         for region in self.iter() {
304             for user_region in region.user_memory_regions.iter() {
305                 if guest_addr >= user_region.start
306                     && guest_addr < user_region.start + user_region.size
307                 {
308                     return Ok(user_region.host_addr + (guest_addr - user_region.start));
309                 }
310             }
311         }
312 
313         Err(io::Error::other(format!(
314             "unable to find user address: 0x{guest_addr:x}"
315         )))
316     }
317 }
318 
319 #[derive(Debug, Error)]
320 pub enum VfioError {
321     #[error("Kernel VFIO error")]
322     KernelVfio(#[source] vfio_ioctls::VfioError),
323     #[error("VFIO user error")]
324     VfioUser(#[source] vfio_user::Error),
325 }
326 
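/// Transport-agnostic access to a VFIO device. The PCI config space helpers
/// are derived from the `region_read`/`region_write` primitives, since config
/// space is exposed as the VFIO_PCI_CONFIG_REGION_INDEX region.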
327 pub(crate) trait Vfio: Send + Sync {
328     fn read_config_byte(&self, offset: u32) -> u8 {
329         let mut data: [u8; 1] = [0];
330         self.read_config(offset, &mut data);
331         data[0]
332     }
333 
334     fn read_config_word(&self, offset: u32) -> u16 {
335         let mut data: [u8; 2] = [0, 0];
336         self.read_config(offset, &mut data);
337         u16::from_le_bytes(data)
338     }
339 
340     fn read_config_dword(&self, offset: u32) -> u32 {
341         let mut data: [u8; 4] = [0, 0, 0, 0];
342         self.read_config(offset, &mut data);
343         u32::from_le_bytes(data)
344     }
345 
346     fn write_config_dword(&self, offset: u32, buf: u32) {
347         let data: [u8; 4] = buf.to_le_bytes();
348         self.write_config(offset, &data)
349     }
350 
351     fn read_config(&self, offset: u32, data: &mut [u8]) {
352         self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
353     }
354 
355     fn write_config(&self, offset: u32, data: &[u8]) {
356         self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
357     }
358 
359     fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
360         self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
361     }
362 
363     fn disable_msi(&self) -> Result<(), VfioError> {
364         self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
365     }
366 
367     fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
368         self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
369     }
370 
371     fn disable_msix(&self) -> Result<(), VfioError> {
372         self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
373     }
374 
375     fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
376         unimplemented!()
377     }
378 
379     fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
380         unimplemented!()
381     }
382 
383     fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
384         unimplemented!()
385     }
386 
387     fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
388         unimplemented!()
389     }
390 
391     fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
392         unimplemented!()
393     }
394 
395     fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
396         unimplemented!()
397     }
398 }
399 
400 struct VfioDeviceWrapper {
401     device: Arc<VfioDevice>,
402 }
403 
404 impl VfioDeviceWrapper {
405     fn new(device: Arc<VfioDevice>) -> Self {
406         Self { device }
407     }
408 }
409 
410 impl Vfio for VfioDeviceWrapper {
411     fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
412         self.device.region_read(index, data, offset)
413     }
414 
415     fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
416         self.device.region_write(index, data, offset)
417     }
418 
419     fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
420         self.device.get_irq_info(irq_index).copied()
421     }
422 
423     fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
424         self.device
425             .enable_irq(irq_index, event_fds)
426             .map_err(VfioError::KernelVfio)
427     }
428 
429     fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
430         self.device
431             .disable_irq(irq_index)
432             .map_err(VfioError::KernelVfio)
433     }
434 
435     fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
436         self.device
437             .unmask_irq(irq_index)
438             .map_err(VfioError::KernelVfio)
439     }
440 }
441 
442 #[derive(Serialize, Deserialize)]
443 struct VfioCommonState {
444     intx_state: Option<IntxState>,
445     msi_state: Option<MsiState>,
446     msix_state: Option<MsixState>,
447 }
448 
449 pub(crate) struct ConfigPatch {
450     mask: u32,
451     patch: u32,
452 }
453 
454 pub(crate) struct VfioCommon {
455     pub(crate) configuration: PciConfiguration,
456     pub(crate) mmio_regions: Vec<MmioRegion>,
457     pub(crate) interrupt: Interrupt,
458     pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
459     pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
460     pub(crate) vfio_wrapper: Arc<dyn Vfio>,
461     pub(crate) patches: HashMap<usize, ConfigPatch>,
462     x_nv_gpudirect_clique: Option<u8>,
463 }
464 
465 impl VfioCommon {
466     pub(crate) fn new(
467         msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
468         legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
469         vfio_wrapper: Arc<dyn Vfio>,
470         subclass: &dyn PciSubclass,
471         bdf: PciBdf,
472         snapshot: Option<Snapshot>,
473         x_nv_gpudirect_clique: Option<u8>,
474     ) -> Result<Self, VfioPciError> {
475         let pci_configuration_state =
476             vm_migration::state_from_id(snapshot.as_ref(), PCI_CONFIGURATION_ID).map_err(|e| {
477                 VfioPciError::RetrievePciConfigurationState(anyhow!(
478                     "Failed to get PciConfigurationState from Snapshot: {}",
479                     e
480                 ))
481             })?;
482 
483         let configuration = PciConfiguration::new(
484             0,
485             0,
486             0,
487             PciClassCode::Other,
488             subclass,
489             None,
490             PciHeaderType::Device,
491             0,
492             0,
493             None,
494             pci_configuration_state,
495         );
496 
497         let mut vfio_common = VfioCommon {
498             mmio_regions: Vec::new(),
499             configuration,
500             interrupt: Interrupt {
501                 intx: None,
502                 msi: None,
503                 msix: None,
504             },
505             msi_interrupt_manager,
506             legacy_interrupt_group,
507             vfio_wrapper,
508             patches: HashMap::new(),
509             x_nv_gpudirect_clique,
510         };
511 
512         let state: Option<VfioCommonState> = snapshot
513             .as_ref()
514             .map(|s| s.to_state())
515             .transpose()
516             .map_err(|e| {
517                 VfioPciError::RetrieveVfioCommonState(anyhow!(
518                     "Failed to get VfioCommonState from Snapshot: {}",
519                     e
520                 ))
521             })?;
522         let msi_state =
523             vm_migration::state_from_id(snapshot.as_ref(), MSI_CONFIG_ID).map_err(|e| {
524                 VfioPciError::RetrieveMsiConfigState(anyhow!(
525                     "Failed to get MsiConfigState from Snapshot: {}",
526                     e
527                 ))
528             })?;
529         let msix_state =
530             vm_migration::state_from_id(snapshot.as_ref(), MSIX_CONFIG_ID).map_err(|e| {
531                 VfioPciError::RetrieveMsixConfigState(anyhow!(
532                     "Failed to get MsixConfigState from Snapshot: {}",
533                     e
534                 ))
535             })?;
536 
537         if let Some(state) = state.as_ref() {
538             vfio_common.set_state(state, msi_state, msix_state)?;
539         } else {
540             vfio_common.parse_capabilities(bdf);
541             vfio_common.initialize_legacy_interrupt()?;
542         }
543 
544         Ok(vfio_common)
545     }
546 
547     /// In case the MSI-X table offset is not page size aligned, we need to do some fixup,
548     /// because we don't want the MMIO RW region and the trap region to overlap each other.
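    /// For example (hypothetical values): a 64KiB BAR whose MSI-X table sits at
    /// offset 0x800 is doubled to 128KiB and the table is moved to offset 0x10000,
    /// making the trapped MSI-X range page size aligned.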
549     fn fixup_msix_region(&mut self, bar_id: u32, region_size: u64) -> u64 {
550         if let Some(msix) = self.interrupt.msix.as_mut() {
551             let msix_cap = &mut msix.cap;
552 
553             // We assume table_bir equals pba_bir here, i.e. the table and the PBA share a BAR.
554             let (table_offset, table_size) = msix_cap.table_range();
555             if is_page_size_aligned(table_offset) || msix_cap.table_bir() != bar_id {
556                 return region_size;
557             }
558 
559             let (pba_offset, pba_size) = msix_cap.pba_range();
560             let msix_sz = align_page_size_up(table_size + pba_size);
561             // Expand the region to hold both the RW and the trap regions, each page size aligned
562             let size = std::cmp::max(region_size * 2, msix_sz * 2);
563             // Let the table start at the middle of the region
564             msix_cap.table_set_offset((size / 2) as u32);
565             msix_cap.pba_set_offset((size / 2 + pba_offset - table_offset) as u32);
566 
567             size
568         } else {
569             // MSI-X not supported for this device
570             region_size
571         }
572     }
573 
574     // The `allocator` argument is unused on `aarch64`
575     #[allow(unused_variables)]
576     pub(crate) fn allocate_bars(
577         &mut self,
578         allocator: &Arc<Mutex<SystemAllocator>>,
579         mmio32_allocator: &mut AddressAllocator,
580         mmio64_allocator: &mut AddressAllocator,
581         resources: Option<Vec<Resource>>,
582     ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
583         let mut bars = Vec::new();
584         let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX;
585 
586         // Going through all regular regions to compute the BAR size.
587         // We're not saving the BAR address to restore it, because we
588         // are going to allocate a guest address for each BAR and write
589         // that new address back.
590         while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
591             let mut region_size: u64 = 0;
592             let mut region_type = PciBarRegionType::Memory32BitRegion;
593             let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
594             let mut flags: u32 = 0;
595 
596             let mut restored_bar_addr = None;
597             if let Some(resources) = &resources {
598                 for resource in resources {
599                     if let Resource::PciBar {
600                         index,
601                         base,
602                         size,
603                         type_,
604                         ..
605                     } = resource
606                     {
607                         if *index == bar_id as usize {
608                             restored_bar_addr = Some(GuestAddress(*base));
609                             region_size = *size;
610                             region_type = PciBarRegionType::from(*type_);
611                             break;
612                         }
613                     }
614                 }
615                 if restored_bar_addr.is_none() {
616                     bar_id += 1;
617                     continue;
618                 }
619             } else {
620                 let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
621                     (PCI_ROM_EXP_BAR_INDEX * 4) as u32
622                 } else {
623                     PCI_CONFIG_BAR_OFFSET + bar_id * 4
624                 };
625 
626                 // First read flags
627                 flags = self.vfio_wrapper.read_config_dword(bar_offset);
628 
629                 // Is this an IO BAR?
630                 let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
631                     matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
632                 } else {
633                     false
634                 };
635 
636                 // Is this a 64-bit BAR?
637                 let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
638                     matches!(
639                         flags & PCI_CONFIG_MEMORY_BAR_64BIT,
640                         PCI_CONFIG_MEMORY_BAR_64BIT
641                     )
642                 } else {
643                     false
644                 };
645 
646                 if matches!(
647                     flags & PCI_CONFIG_BAR_PREFETCHABLE,
648                     PCI_CONFIG_BAR_PREFETCHABLE
649                 ) {
650                     prefetchable = PciBarPrefetchable::Prefetchable
651                 };
652 
653                 // To get the size, write all 1s
654                 self.vfio_wrapper
655                     .write_config_dword(bar_offset, 0xffff_ffff);
656 
657                 // And read back the BAR value. The device will write zeros for bits it doesn't care about.
658                 let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);
659 
660                 if io_bar {
661                     // Mask flag bits (lowest 2 for I/O bars)
662                     lower &= !0b11;
663 
664                     // BAR is not enabled
665                     if lower == 0 {
666                         bar_id += 1;
667                         continue;
668                     }
669 
670                     // IO BAR
671                     region_type = PciBarRegionType::IoRegion;
672 
673                     // Invert bits and add 1 to calculate size
674                     region_size = (!lower + 1) as u64;
675                 } else if is_64bit_bar {
676                     // 64-bit memory BAR
677                     region_type = PciBarRegionType::Memory64BitRegion;
678 
679                     // Query the upper 32 bits of the 64-bit BAR size
680                     let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
681                     self.vfio_wrapper
682                         .write_config_dword(upper_offset, 0xffff_ffff);
683                     let upper = self.vfio_wrapper.read_config_dword(upper_offset);
684 
685                     let mut combined_size = (u64::from(upper) << 32) | u64::from(lower);
686 
687                     // Mask out flag bits (lowest 4 for memory bars)
688                     combined_size &= !0b1111;
689 
690                     // BAR is not enabled
691                     if combined_size == 0 {
692                         bar_id += 1;
693                         continue;
694                     }
695 
696                     // Invert and add 1 to find the size
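                    // For example (hypothetical values): lower = 0xfe00_000c and
                    // upper = 0xffff_ffff give combined_size = 0xffff_ffff_fe00_0000,
                    // so region_size = 0x200_0000 (32 MiB).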
697                     region_size = !combined_size + 1;
698                 } else {
699                     region_type = PciBarRegionType::Memory32BitRegion;
700 
701                     // Mask out flag bits (lowest 4 for memory bars)
702                     lower &= !0b1111;
703 
704                     if lower == 0 {
705                         bar_id += 1;
706                         continue;
707                     }
708 
709                     // Invert and add 1 to find the size
710                     region_size = (!lower + 1) as u64;
711                 }
712             }
713 
714             let bar_addr = match region_type {
715                 PciBarRegionType::IoRegion => {
716                     // The address needs to be 4-byte aligned.
717                     allocator
718                         .lock()
719                         .unwrap()
720                         .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
721                         .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
722                 }
723                 PciBarRegionType::Memory32BitRegion => {
724                     // BAR allocation must be naturally aligned
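                    // (e.g. a hypothetical 0x2_0000-byte BAR must be placed on a
                    // 0x2_0000-byte boundary)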
725                     mmio32_allocator
726                         .allocate(restored_bar_addr, region_size, Some(region_size))
727                         .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
728                 }
729                 PciBarRegionType::Memory64BitRegion => {
730                     // We need to do some fixup to keep the MMIO RW region and the MSI-X cap
731                     // region page size aligned.
732                     region_size = self.fixup_msix_region(bar_id, region_size);
733                     mmio64_allocator
734                         .allocate(
735                             restored_bar_addr,
736                             region_size,
737                             Some(std::cmp::max(
738                                 // SAFETY: FFI call. Trivially safe.
739                                 unsafe { sysconf(_SC_PAGESIZE) as GuestUsize },
740                                 region_size,
741                             )),
742                         )
743                         .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
744                 }
745             };
746 
747             // We can now build our BAR configuration block.
748             let bar = PciBarConfiguration::default()
749                 .set_index(bar_id as usize)
750                 .set_address(bar_addr.raw_value())
751                 .set_size(region_size)
752                 .set_region_type(region_type)
753                 .set_prefetchable(prefetchable);
754 
755             if bar_id == VFIO_PCI_ROM_REGION_INDEX {
756                 self.configuration
757                     .add_pci_rom_bar(&bar, flags & 0x1)
758                     .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
759             } else {
760                 self.configuration
761                     .add_pci_bar(&bar)
762                     .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
763             }
764 
765             bars.push(bar);
766             self.mmio_regions.push(MmioRegion {
767                 start: bar_addr,
768                 length: region_size,
769                 type_: region_type,
770                 index: bar_id,
771                 user_memory_regions: Vec::new(),
772             });
773 
774             bar_id += 1;
775             if region_type == PciBarRegionType::Memory64BitRegion {
776                 bar_id += 1;
777             }
778         }
779 
780         Ok(bars)
781     }
782 
783     // The `allocator` argument is unused on `aarch64`
784     #[allow(unused_variables)]
785     pub(crate) fn free_bars(
786         &mut self,
787         allocator: &mut SystemAllocator,
788         mmio32_allocator: &mut AddressAllocator,
789         mmio64_allocator: &mut AddressAllocator,
790     ) -> Result<(), PciDeviceError> {
791         for region in self.mmio_regions.iter() {
792             match region.type_ {
793                 PciBarRegionType::IoRegion => {
794                     allocator.free_io_addresses(region.start, region.length);
795                 }
796                 PciBarRegionType::Memory32BitRegion => {
797                     mmio32_allocator.free(region.start, region.length);
798                 }
799                 PciBarRegionType::Memory64BitRegion => {
800                     mmio64_allocator.free(region.start, region.length);
801                 }
802             }
803         }
804         Ok(())
805     }
806 
807     fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
808         let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());
809 
810         let table = self.vfio_wrapper.read_config_dword((cap + 4).into());
811 
812         let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());
813 
814         MsixCap {
815             msg_ctl,
816             table,
817             pba,
818         }
819     }
820 
821     fn initialize_msix(
822         &mut self,
823         msix_cap: MsixCap,
824         cap_offset: u32,
825         bdf: PciBdf,
826         state: Option<MsixConfigState>,
827     ) {
828         let interrupt_source_group = self
829             .msi_interrupt_manager
830             .create_group(MsiIrqGroupConfig {
831                 base: 0,
832                 count: msix_cap.table_size() as InterruptIndex,
833             })
834             .unwrap();
835 
836         let msix_config = MsixConfig::new(
837             msix_cap.table_size(),
838             interrupt_source_group.clone(),
839             bdf.into(),
840             state,
841         )
842         .unwrap();
843 
844         self.interrupt.msix = Some(VfioMsix {
845             bar: msix_config,
846             cap: msix_cap,
847             cap_offset,
848             interrupt_source_group,
849         });
850     }
851 
852     fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
853         self.vfio_wrapper.read_config_word((cap + 2).into())
854     }
855 
856     fn initialize_msi(&mut self, msg_ctl: u16, cap_offset: u32, state: Option<MsiConfigState>) {
857         let interrupt_source_group = self
858             .msi_interrupt_manager
859             .create_group(MsiIrqGroupConfig {
860                 base: 0,
861                 count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
862             })
863             .unwrap();
864 
865         let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap();
866 
867         self.interrupt.msi = Some(VfioMsi {
868             cfg: msi_config,
869             cap_offset,
870             interrupt_source_group,
871         });
872     }
873 
874     /// Returns true if the device claims to have a PCI capability list.
875     fn has_capabilities(&self) -> bool {
876         let status = self.vfio_wrapper.read_config_word(PCI_CONFIG_STATUS_OFFSET);
877         status & PCI_CONFIG_STATUS_CAPABILITIES_LIST != 0
878     }
879 
880     fn get_msix_cap_idx(&self) -> Option<usize> {
881         if !self.has_capabilities() {
882             return None;
883         }
884 
885         let mut cap_next = self
886             .vfio_wrapper
887             .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET)
888             & PCI_CONFIG_CAPABILITY_PTR_MASK;
889 
890         while cap_next != 0 {
891             let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());
892             if PciCapabilityId::from(cap_id) == PciCapabilityId::MsiX {
893                 return Some(cap_next as usize);
894             } else {
895                 let cap_ptr = self.vfio_wrapper.read_config_byte((cap_next + 1).into())
896                     & PCI_CONFIG_CAPABILITY_PTR_MASK;
897 
898                 // See parse_capabilities below for an explanation.
899                 if cap_ptr != cap_next {
900                     cap_next = cap_ptr;
901                 } else {
902                     break;
903                 }
904             }
905         }
906 
907         None
908     }
909 
910     fn parse_capabilities(&mut self, bdf: PciBdf) {
911         if !self.has_capabilities() {
912             return;
913         }
914 
915         let mut cap_iter = self
916             .vfio_wrapper
917             .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET)
918             & PCI_CONFIG_CAPABILITY_PTR_MASK;
919 
920         let mut pci_express_cap_found = false;
921         let mut power_management_cap_found = false;
922 
923         while cap_iter != 0 {
924             let cap_id = self.vfio_wrapper.read_config_byte(cap_iter.into());
925 
926             match PciCapabilityId::from(cap_id) {
927                 PciCapabilityId::MessageSignalledInterrupts => {
928                     if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
929                         if irq_info.count > 0 {
930                             // Parse capability only if the VFIO device
931                             // supports MSI.
932                             let msg_ctl = self.parse_msi_capabilities(cap_iter);
933                             self.initialize_msi(msg_ctl, cap_iter as u32, None);
934                         }
935                     }
936                 }
937                 PciCapabilityId::MsiX => {
938                     if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
939                     {
940                         if irq_info.count > 0 {
941                             // Parse capability only if the VFIO device
942                             // supports MSI-X.
943                             let msix_cap = self.parse_msix_capabilities(cap_iter);
944                             self.initialize_msix(msix_cap, cap_iter as u32, bdf, None);
945                         }
946                     }
947                 }
948                 PciCapabilityId::PciExpress => pci_express_cap_found = true,
949                 PciCapabilityId::PowerManagement => power_management_cap_found = true,
950                 _ => {}
951             };
952 
953             let cap_next = self.vfio_wrapper.read_config_byte((cap_iter + 1).into())
954                 & PCI_CONFIG_CAPABILITY_PTR_MASK;
955 
956             // Break out of the loop if we either find the end or we have a broken device. This
957             // doesn't handle all cases where a device might send us in a loop here, but it
958             // handles the case of a device returning 0xFF instead of implementing a real
959             // capabilities list.
960             if cap_next == 0 || cap_next == cap_iter {
961                 break;
962             }
963 
964             cap_iter = cap_next;
965         }
966 
967         if let Some(clique_id) = self.x_nv_gpudirect_clique {
968             self.add_nv_gpudirect_clique_cap(cap_iter, clique_id);
969         }
970 
971         if pci_express_cap_found && power_management_cap_found {
972             self.parse_extended_capabilities();
973         }
974     }
975 
976     fn add_nv_gpudirect_clique_cap(&mut self, cap_iter: u8, clique_id: u8) {
977         // Turing, Ampere, Hopper, and Lovelace GPUs have dedicated space
978         // at 0xD4 for this capability.
979         let cap_offset = 0xd4u32;
980 
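        // Patch the "next capability" pointer byte (byte 1 of the header) of the
        // last capability in the list so that it links to our new capability at 0xD4.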
981         let reg_idx = (cap_iter / 4) as usize;
982         self.patches.insert(
983             reg_idx,
984             ConfigPatch {
985                 mask: 0x0000_ff00,
986                 patch: cap_offset << 8,
987             },
988         );
989 
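        // Then synthesize the vendor-specific capability at 0xD4: the first dword
        // is the capability header (ID 0x09, next pointer 0, length 8), and the
        // second dword carries the GPUDirect clique ID.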
990         let reg_idx = (cap_offset / 4) as usize;
991         self.patches.insert(
992             reg_idx,
993             ConfigPatch {
994                 mask: 0xffff_ffff,
995                 patch: 0x50080009u32,
996             },
997         );
998         self.patches.insert(
999             reg_idx + 1,
1000             ConfigPatch {
1001                 mask: 0xffff_ffff,
1002                 patch: (u32::from(clique_id) << 19) | 0x5032,
1003             },
1004         );
1005     }
1006 
1007     fn parse_extended_capabilities(&mut self) {
1008         let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;
1009 
1010         loop {
1011             let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);
1012 
1013             let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
1014             let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;
1015 
1016             match PciExpressCapabilityId::from(cap_id) {
1017                 PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation
1018                 | PciExpressCapabilityId::ResizeableBar
1019                 | PciExpressCapabilityId::SingleRootIoVirtualization => {
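                    // Hide this extended capability from the guest by patching its
                    // capability ID to the null capability ID, leaving the next
                    // capability pointer intact.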
1020                     let reg_idx = (current_offset / 4) as usize;
1021                     self.patches.insert(
1022                         reg_idx,
1023                         ConfigPatch {
1024                             mask: 0x0000_ffff,
1025                             patch: PciExpressCapabilityId::NullCapability as u32,
1026                         },
1027                     );
1028                 }
1029                 _ => {}
1030             }
1031 
1032             if cap_next == 0 {
1033                 break;
1034             }
1035 
1036             current_offset = cap_next.into();
1037         }
1038     }
1039 
1040     pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
1041         if let Some(intx) = &mut self.interrupt.intx {
1042             if !intx.enabled {
1043                 if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
1044                     self.vfio_wrapper
1045                         .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
1046                         .map_err(VfioPciError::EnableIntx)?;
1047 
1048                     intx.enabled = true;
1049                 } else {
1050                     return Err(VfioPciError::MissingNotifier);
1051                 }
1052             }
1053         }
1054 
1055         Ok(())
1056     }
1057 
1058     pub(crate) fn disable_intx(&mut self) {
1059         if let Some(intx) = &mut self.interrupt.intx {
1060             if intx.enabled {
1061                 if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
1062                     error!("Could not disable INTx: {}", e);
1063                 } else {
1064                     intx.enabled = false;
1065                 }
1066             }
1067         }
1068     }
1069 
1070     pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
1071         if let Some(msi) = &self.interrupt.msi {
1072             let mut irq_fds: Vec<EventFd> = Vec::new();
1073             for i in 0..msi.cfg.num_enabled_vectors() {
1074                 if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
1075                     irq_fds.push(eventfd);
1076                 } else {
1077                     return Err(VfioPciError::MissingNotifier);
1078                 }
1079             }
1080 
1081             self.vfio_wrapper
1082                 .enable_msi(irq_fds.iter().collect())
1083                 .map_err(VfioPciError::EnableMsi)?;
1084         }
1085 
1086         Ok(())
1087     }
1088 
1089     pub(crate) fn disable_msi(&self) {
1090         if let Err(e) = self.vfio_wrapper.disable_msi() {
1091             error!("Could not disable MSI: {}", e);
1092         }
1093     }
1094 
1095     pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
1096         if let Some(msix) = &self.interrupt.msix {
1097             let mut irq_fds: Vec<EventFd> = Vec::new();
1098             for i in 0..msix.bar.table_entries.len() {
1099                 if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
1100                     irq_fds.push(eventfd);
1101                 } else {
1102                     return Err(VfioPciError::MissingNotifier);
1103                 }
1104             }
1105 
1106             self.vfio_wrapper
1107                 .enable_msix(irq_fds.iter().collect())
1108                 .map_err(VfioPciError::EnableMsix)?;
1109         }
1110 
1111         Ok(())
1112     }
1113 
1114     pub(crate) fn disable_msix(&self) {
1115         if let Err(e) = self.vfio_wrapper.disable_msix() {
1116             error!("Could not disable MSI-X: {}", e);
1117         }
1118     }
1119 
1120     fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
1121         if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
1122             if irq_info.count == 0 {
1123                 // A count of 0 means the INTx IRQ is not supported, therefore
1124                 // it shouldn't be initialized.
1125                 return Ok(());
1126             }
1127         }
1128 
1129         if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
1130             self.interrupt.intx = Some(VfioIntx {
1131                 interrupt_source_group,
1132                 enabled: false,
1133             });
1134 
1135             self.enable_intx()?;
1136         }
1137 
1138         Ok(())
1139     }
1140 
1141     fn update_msi_capabilities(&mut self, offset: u64, data: &[u8]) -> Result<(), VfioPciError> {
1142         match self.interrupt.update_msi(offset, data) {
1143             Some(InterruptUpdateAction::EnableMsi) => {
1144                 // Disable INTx before we can enable MSI
1145                 self.disable_intx();
1146                 self.enable_msi()?;
1147             }
1148             Some(InterruptUpdateAction::DisableMsi) => {
1149             // Fall back onto INTx when disabling MSI
1150                 self.disable_msi();
1151                 self.enable_intx()?;
1152             }
1153             _ => {}
1154         }
1155 
1156         Ok(())
1157     }
1158 
1159     fn update_msix_capabilities(&mut self, offset: u64, data: &[u8]) -> Result<(), VfioPciError> {
1160         match self.interrupt.update_msix(offset, data) {
1161             Some(InterruptUpdateAction::EnableMsix) => {
1162                 // Disable INTx before we can enable MSI-X
1163                 self.disable_intx();
1164                 self.enable_msix()?;
1165             }
1166             Some(InterruptUpdateAction::DisableMsix) => {
1167             // Fall back onto INTx when disabling MSI-X
1168                 self.disable_msix();
1169                 self.enable_intx()?;
1170             }
1171             _ => {}
1172         }
1173 
1174         Ok(())
1175     }
1176 
1177     fn find_region(&self, addr: u64) -> Option<MmioRegion> {
1178         for region in self.mmio_regions.iter() {
1179             if addr >= region.start.raw_value()
1180                 && addr < region.start.unchecked_add(region.length).raw_value()
1181             {
1182                 return Some(region.clone());
1183             }
1184         }
1185         None
1186     }
1187 
1188     pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1189         let addr = base + offset;
1190         if let Some(region) = self.find_region(addr) {
1191             let offset = addr - region.start.raw_value();
1192 
1193             if self.interrupt.msix_table_accessed(region.index, offset) {
1194                 self.interrupt.msix_read_table(offset, data);
1195             } else {
1196                 self.vfio_wrapper.region_read(region.index, offset, data);
1197             }
1198         }
1199 
1200         // INTx EOI
1201         // The guest reading from the BAR potentially means the interrupt has
1202         // been received and can be acknowledged.
1203         if self.interrupt.intx_in_use() {
1204             if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
1205                 error!("Failed unmasking INTx IRQ: {}", e);
1206             }
1207         }
1208     }
1209 
1210     pub(crate) fn write_bar(
1211         &mut self,
1212         base: u64,
1213         offset: u64,
1214         data: &[u8],
1215     ) -> Option<Arc<Barrier>> {
1216         let addr = base + offset;
1217         if let Some(region) = self.find_region(addr) {
1218             let offset = addr - region.start.raw_value();
1219 
1220             // If the MSI-X table is written to, we need to update our cache.
1221             if self.interrupt.msix_table_accessed(region.index, offset) {
1222                 self.interrupt.msix_write_table(offset, data);
1223             } else {
1224                 self.vfio_wrapper.region_write(region.index, offset, data);
1225             }
1226         }
1227 
1228         // INTx EOI
1229         // The guest writing to the BAR potentially means the interrupt has
1230         // been received and can be acknowledged.
1231         if self.interrupt.intx_in_use() {
1232             if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
1233                 error!("Failed unmasking INTx IRQ: {}", e);
1234             }
1235         }
1236 
1237         None
1238     }
1239 
1240     pub(crate) fn write_config_register(
1241         &mut self,
1242         reg_idx: usize,
1243         offset: u64,
1244         data: &[u8],
1245     ) -> (Vec<BarReprogrammingParams>, Option<Arc<Barrier>>) {
1246         // When the guest wants to write to a BAR, we trap it into
1247         // our local configuration space. We're not reprogramming the
1248         // VFIO device.
1249         if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
1250             || reg_idx == PCI_ROM_EXP_BAR_INDEX
1251         {
1252             // We keep our local cache updated with the BARs.
1253             // We'll read it back from there when the guest is asking
1254             // for BARs (see read_config_register()).
1255             return (
1256                 self.configuration
1257                     .write_config_register(reg_idx, offset, data),
1258                 None,
1259             );
1260         }
1261 
1262         let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;
1263 
1264         // If the MSI or MSI-X capabilities are accessed, we need to
1265         // update our local cache accordingly.
1266         // Depending on how the capabilities are modified, this could
1267         // trigger a VFIO MSI or MSI-X toggle.
1268         if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
1269             let cap_offset: u64 = reg - cap_base + offset;
1270             match cap_id {
1271                 PciCapabilityId::MessageSignalledInterrupts => {
1272                     if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
1273                         error!("Could not update MSI capabilities: {}", e);
1274                     }
1275                 }
1276                 PciCapabilityId::MsiX => {
1277                     if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
1278                         error!("Could not update MSI-X capabilities: {}", e);
1279                     }
1280                 }
1281                 _ => {}
1282             }
1283         }
1284 
1285         // Make sure to write to the device's PCI config space after MSI/MSI-X
1286         // interrupts have been enabled/disabled. In case of MSI, when the
1287         // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
1288         // the MSI Enable bit in the MSI capability structure found in the PCI
1289         // config space is disabled by default. That's why when the guest is
1290         // enabling this bit, we first need to enable the MSI interrupts with
1291         // VFIO through the VFIO_DEVICE_SET_IRQS ioctl, and only then can we
1292         // write to the device region to update the MSI Enable bit.
1293         self.vfio_wrapper.write_config((reg + offset) as u32, data);
1294 
1295         // Return pending BAR reprogramming if the MSE bit is set
1296         let mut ret_param = self.configuration.pending_bar_reprogram();
1297         if !ret_param.is_empty() {
1298             if self.read_config_register(crate::configuration::COMMAND_REG)
1299                 & crate::configuration::COMMAND_REG_MEMORY_SPACE_MASK
1300                 == crate::configuration::COMMAND_REG_MEMORY_SPACE_MASK
1301             {
1302                 info!("BAR reprogramming parameter is returned: {:x?}", ret_param);
1303                 self.configuration.clear_pending_bar_reprogram();
1304             } else {
1305                 info!(
1306                     "MSE bit is disabled. No BAR reprogramming parameter is returned: {:x?}",
1307                     ret_param
1308                 );
1309 
1310                 ret_param = Vec::new();
1311             }
1312         }
1313 
1314         (ret_param, None)
1315     }
1316 
1317     pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
1318         // When reading the BARs, we trap it and return what comes
1319         // from our local configuration space. We want the guest to
1320         // use that and not the VFIO device BARs, as those do not map
1321         // into the guest address space.
1322         if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
1323             || reg_idx == PCI_ROM_EXP_BAR_INDEX
1324         {
1325             return self.configuration.read_reg(reg_idx);
1326         }
1327 
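        // Expose our cached MSI-X table and PBA registers (possibly modified by
        // fixup_msix_region()) instead of the device's original values.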
1328         if let Some(id) = self.get_msix_cap_idx() {
1329             let msix = self.interrupt.msix.as_mut().unwrap();
1330             if reg_idx * 4 == id + 4 {
1331                 return msix.cap.table;
1332             } else if reg_idx * 4 == id + 8 {
1333                 return msix.cap.pba;
1334             }
1335         }
1336 
1337         // Since we don't support passing through multi-function devices, we
1338         // should mask the multi-function bit, bit 7 of the Header Type byte
1339         // in register 3.
1340         let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
1341             0xff7f_ffff
1342         } else {
1343             0xffff_ffff
1344         };
1345 
1346         // The config register read comes from the VFIO device itself.
1347         let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;
1348 
1349         if let Some(config_patch) = self.patches.get(&reg_idx) {
1350             value = (value & !config_patch.mask) | config_patch.patch;
1351         }
1352 
1353         value
1354     }
1355 
1356     fn state(&self) -> VfioCommonState {
1357         let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
1358             enabled: intx.enabled,
1359         });
1360 
1361         let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
1362             cap: msi.cfg.cap,
1363             cap_offset: msi.cap_offset,
1364         });
1365 
1366         let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
1367             cap: msix.cap,
1368             cap_offset: msix.cap_offset,
1369             bdf: msix.bar.devid,
1370         });
1371 
1372         VfioCommonState {
1373             intx_state,
1374             msi_state,
1375             msix_state,
1376         }
1377     }
1378 
1379     fn set_state(
1380         &mut self,
1381         state: &VfioCommonState,
1382         msi_state: Option<MsiConfigState>,
1383         msix_state: Option<MsixConfigState>,
1384     ) -> Result<(), VfioPciError> {
1385         if let (Some(intx), Some(interrupt_source_group)) =
1386             (&state.intx_state, self.legacy_interrupt_group.clone())
1387         {
1388             self.interrupt.intx = Some(VfioIntx {
1389                 interrupt_source_group,
1390                 enabled: false,
1391             });
1392 
1393             if intx.enabled {
1394                 self.enable_intx()?;
1395             }
1396         }
1397 
1398         if let Some(msi) = &state.msi_state {
1399             self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state);
1400         }
1401 
1402         if let Some(msix) = &state.msix_state {
1403             self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state);
1404         }
1405 
1406         Ok(())
1407     }
1408 }
1409 
1410 impl Pausable for VfioCommon {}
1411 
1412 impl Snapshottable for VfioCommon {
1413     fn id(&self) -> String {
1414         String::from(VFIO_COMMON_ID)
1415     }
1416 
1417     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
1418         let mut vfio_common_snapshot = Snapshot::new_from_state(&self.state())?;
1419 
1420         // Snapshot PciConfiguration
1421         vfio_common_snapshot.add_snapshot(self.configuration.id(), self.configuration.snapshot()?);
1422 
1423         // Snapshot MSI
1424         if let Some(msi) = &mut self.interrupt.msi {
1425             vfio_common_snapshot.add_snapshot(msi.cfg.id(), msi.cfg.snapshot()?);
1426         }
1427 
1428         // Snapshot MSI-X
1429         if let Some(msix) = &mut self.interrupt.msix {
1430             vfio_common_snapshot.add_snapshot(msix.bar.id(), msix.bar.snapshot()?);
1431         }
1432 
1433         Ok(vfio_common_snapshot)
1434     }
1435 }
1436 
1437 /// VfioPciDevice represents a VFIO PCI device.
1438 /// This structure implements the BusDevice and PciDevice traits.
1439 ///
1440 /// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
1441 /// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
1442 /// which then gets added to the PCI bus.
1443 pub struct VfioPciDevice {
1444     id: String,
1445     vm: Arc<dyn hypervisor::Vm>,
1446     device: Arc<VfioDevice>,
1447     container: Arc<VfioContainer>,
1448     common: VfioCommon,
1449     iommu_attached: bool,
1450     memory_slot_allocator: MemorySlotAllocator,
1451     bdf: PciBdf,
1452     device_path: PathBuf,
1453 }
1454 
1455 impl VfioPciDevice {
1456     /// Constructs a new VFIO PCI device for the given VFIO device.
1457     #[allow(clippy::too_many_arguments)]
1458     pub fn new(
1459         id: String,
1460         vm: &Arc<dyn hypervisor::Vm>,
1461         device: VfioDevice,
1462         container: Arc<VfioContainer>,
1463         msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
1464         legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
1465         iommu_attached: bool,
1466         bdf: PciBdf,
1467         memory_slot_allocator: MemorySlotAllocator,
1468         snapshot: Option<Snapshot>,
1469         x_nv_gpudirect_clique: Option<u8>,
1470         device_path: PathBuf,
1471     ) -> Result<Self, VfioPciError> {
1472         let device = Arc::new(device);
1473         device.reset();
1474 
1475         let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));
1476 
1477         let common = VfioCommon::new(
1478             msi_interrupt_manager,
1479             legacy_interrupt_group,
1480             Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
1481             &PciVfioSubclass::VfioSubclass,
1482             bdf,
1483             vm_migration::snapshot_from_id(snapshot.as_ref(), VFIO_COMMON_ID),
1484             x_nv_gpudirect_clique,
1485         )?;
1486 
1487         let vfio_pci_device = VfioPciDevice {
1488             id,
1489             vm: vm.clone(),
1490             device,
1491             container,
1492             common,
1493             iommu_attached,
1494             memory_slot_allocator,
1495             bdf,
1496             device_path: device_path.clone(),
1497         };
1498 
1499         Ok(vfio_pci_device)
1500     }
1501 
1502     pub fn iommu_attached(&self) -> bool {
1503         self.iommu_attached
1504     }
1505 
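    /// Compute the list of mmap'able sparse areas for a region. If the
    /// region advertises a sparse mmap capability, its areas are returned
    /// as-is; if it is tagged MSIX_MAPPABLE, the areas are computed around
    /// the MSI-X table and PBA ranges; otherwise the whole region becomes
    /// a single sparse area.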
1506     fn generate_sparse_areas(
1507         caps: &[VfioRegionInfoCap],
1508         region_index: u32,
1509         region_start: u64,
1510         region_size: u64,
1511         vfio_msix: Option<&VfioMsix>,
1512     ) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
1513         for cap in caps {
1514             match cap {
1515                 VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
1516                 VfioRegionInfoCap::MsixMappable => {
1517                     if !is_4k_aligned(region_start) {
1518                         error!(
1519                             "Region start address 0x{:x} must be at least 4 KiB aligned",
1520                             region_start
1521                         );
1522                         return Err(VfioPciError::RegionAlignment);
1523                     }
1524                     if !is_4k_multiple(region_size) {
1525                         error!(
1526                             "Region size 0x{:x} must be a multiple of 4 KiB",
1527                             region_size
1528                         );
1529                         return Err(VfioPciError::RegionSize);
1530                     }
1531 
1532                     // If the region contains the MSI-X vector table or
1533                     // the MSI-X PBA table, we must compute the subregions
1534                     // around them, leading to a list of sparse areas. This
1535                     // makes sure MMIO accesses to these MSI-X specific
1536                     // ranges keep being trapped. If these ranges are not
1537                     // page size aligned, we enlarge them until they are.
1538                     //
1539                     // A BTreeMap is used so iterating yields entries sorted
1540                     // by key, which ensures a proper split of the whole region.
1541                     let mut inter_ranges = BTreeMap::new();
1542                     if let Some(msix) = vfio_msix {
1543                         if region_index == msix.cap.table_bir() {
1544                             let (offset, size) = msix.cap.table_range();
1545                             let offset = align_page_size_down(offset);
1546                             let size = align_page_size_up(size);
1547                             inter_ranges.insert(offset, size);
1548                         }
1549                         if region_index == msix.cap.pba_bir() {
1550                             let (offset, size) = msix.cap.pba_range();
1551                             let offset = align_page_size_down(offset);
1552                             let size = align_page_size_up(size);
1553                             inter_ranges.insert(offset, size);
1554                         }
1555                     }
1556 
1557                     let mut sparse_areas = Vec::new();
1558                     let mut current_offset = 0;
1559                     for (range_offset, range_size) in inter_ranges {
1560                         if range_offset > current_offset {
1561                             sparse_areas.push(VfioRegionSparseMmapArea {
1562                                 offset: current_offset,
1563                                 size: range_offset - current_offset,
1564                             });
1565                         }
1566                         current_offset = align_page_size_down(range_offset + range_size);
1567                     }
1568 
1569                     if region_size > current_offset {
1570                         sparse_areas.push(VfioRegionSparseMmapArea {
1571                             offset: current_offset,
1572                             size: region_size - current_offset,
1573                         });
1574                     }
1575 
1576                     return Ok(sparse_areas);
1577                 }
1578                 _ => {}
1579             }
1580         }
1581 
1582         // In case no relevant capabilities have been found, create a single
1583         // sparse area corresponding to the entire MMIO region.
1584         Ok(vec![VfioRegionSparseMmapArea {
1585             offset: 0,
1586             size: region_size,
1587         }])
1588     }
1589 
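    // Illustrative sketch (annotation, not part of the original source):
    // with a 4 KiB page size, a 16 KiB region whose MSI-X table occupies
    // offsets 0x1000..0x2000 is split around the trapped range like this:
    //
    //   inter_ranges   = { 0x1000 => 0x1000 }
    //   sparse area #0 = { offset: 0x0000, size: 0x1000 }
    //   current_offset = align_page_size_down(0x1000 + 0x1000) = 0x2000
    //   sparse area #1 = { offset: 0x2000, size: 0x2000 }
    //
    // Only the two sparse areas get mmap'ed; guest accesses to the
    // 0x1000..0x2000 range still exit to the VMM, which keeps emulating
    // the MSI-X table.
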
1590     /// Map MMIO regions into the guest so that accesses to those regions
1591     /// do not trigger VM exits.
1592     ///
1593     /// No arguments are needed beyond `&mut self`: the regions are
1594     /// registered as user memory regions through `self.vm`, and the
1595     /// backing memory slots come from `self.memory_slot_allocator`.
1596     /// Regions (or sparse areas) that cannot be mmap'ed keep being
1597     /// trapped and emulated through the `BusDevice` implementation.
1598     pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
1599         let fd = self.device.as_raw_fd();
1600 
1601         for region in self.common.mmio_regions.iter_mut() {
1602             let region_flags = self.device.get_region_flags(region.index);
1603             if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
1604                 let mut prot = 0;
1605                 if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
1606                     prot |= libc::PROT_READ;
1607                 }
1608                 if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
1609                     prot |= libc::PROT_WRITE;
1610                 }
1611 
1612                 // Retrieve the list of capabilities found on the region
1613                 let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
1614                     self.device.get_region_caps(region.index)
1615                 } else {
1616                     Vec::new()
1617                 };
1618 
1619                 // Don't try to mmap the region if it contains the MSI-X
1620                 // table or the MSI-X PBA subregion and MSIX_MAPPABLE was
1621                 // not found in the list of supported capabilities.
1622                 if let Some(msix) = self.common.interrupt.msix.as_ref() {
1623                     if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
1624                         && !caps.contains(&VfioRegionInfoCap::MsixMappable)
1625                     {
1626                         continue;
1627                     }
1628                 }
1629 
1630                 let mmap_size = self.device.get_region_size(region.index);
1631                 let mmap_offset = self.device.get_region_offset(region.index);
1632 
1633                 let sparse_areas = Self::generate_sparse_areas(
1634                     &caps,
1635                     region.index,
1636                     region.start.0,
1637                     mmap_size,
1638                     self.common.interrupt.msix.as_ref(),
1639                 )?;
1640 
1641                 for area in sparse_areas.iter() {
1642                     // SAFETY: FFI call with correct arguments
1643                     let host_addr = unsafe {
1644                         libc::mmap(
1645                             null_mut(),
1646                             area.size as usize,
1647                             prot,
1648                             libc::MAP_SHARED,
1649                             fd,
1650                             mmap_offset as libc::off_t + area.offset as libc::off_t,
1651                         )
1652                     };
1653 
1654                     if std::ptr::eq(host_addr, libc::MAP_FAILED) {
1655                         error!(
1656                             "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
1657                             area.offset,
1658                             area.size,
1659                             std::io::Error::last_os_error()
1660                         );
1661                         return Err(VfioPciError::MmapArea);
1662                     }
1663 
1664                     if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) {
1665                         warn!(
1666                             "Not registering sparse area that is not page size aligned (offset = 0x{:x}, size = 0x{:x})",
1667                             area.offset,
1668                             area.size,
1669                         );
1670                         return Ok(());
1671                     }
1672 
1673                     let user_memory_region = UserMemoryRegion {
1674                         slot: self.memory_slot_allocator.next_memory_slot(),
1675                         start: region.start.0 + area.offset,
1676                         size: area.size,
1677                         host_addr: host_addr as u64,
1678                     };
1679 
1680                     region.user_memory_regions.push(user_memory_region);
1681 
1682                     let mem_region = self.vm.make_user_memory_region(
1683                         user_memory_region.slot,
1684                         user_memory_region.start,
1685                         user_memory_region.size,
1686                         user_memory_region.host_addr,
1687                         false,
1688                         false,
1689                     );
1690 
1691                     self.vm
1692                         .create_user_memory_region(mem_region)
1693                         .map_err(VfioPciError::CreateUserMemoryRegion)?;
1694 
1695                     if !self.iommu_attached {
1696                         self.container
1697                             .vfio_dma_map(
1698                                 user_memory_region.start,
1699                                 user_memory_region.size,
1700                                 user_memory_region.host_addr,
1701                             )
1702                             .map_err(|e| {
1703                                 VfioPciError::DmaMap(e, self.device_path.clone(), self.bdf)
1704                             })?;
1705                     }
1706                 }
1707             }
1708         }
1709 
1710         Ok(())
1711     }
1712 
1713     pub fn unmap_mmio_regions(&mut self) {
1714         for region in self.common.mmio_regions.iter() {
1715             for user_memory_region in region.user_memory_regions.iter() {
1716                 // Unmap from vfio container
1717                 if !self.iommu_attached {
1718                     if let Err(e) = self
1719                         .container
1720                         .vfio_dma_unmap(user_memory_region.start, user_memory_region.size)
1721                         .map_err(|e| VfioPciError::DmaUnmap(e, self.device_path.clone(), self.bdf))
1722                     {
1723                         error!(
1724                             "Could not unmap mmio region from vfio container: \
1725                             iova 0x{:x}, size 0x{:x}: {}",
1726                             user_memory_region.start, user_memory_region.size, e
1727                         );
1728                     }
1729                 }
1730 
1731                 // Remove region
1732                 let r = self.vm.make_user_memory_region(
1733                     user_memory_region.slot,
1734                     user_memory_region.start,
1735                     user_memory_region.size,
1736                     user_memory_region.host_addr,
1737                     false,
1738                     false,
1739                 );
1740 
1741                 if let Err(e) = self.vm.remove_user_memory_region(r) {
1742                     error!("Could not remove the userspace memory region: {}", e);
1743                 }
1744 
1745                 self.memory_slot_allocator
1746                     .free_memory_slot(user_memory_region.slot);
1747 
1748                 // SAFETY: FFI call with correct arguments
1749                 let ret = unsafe {
1750                     libc::munmap(
1751                         user_memory_region.host_addr as *mut libc::c_void,
1752                         user_memory_region.size as usize,
1753                     )
1754                 };
1755                 if ret != 0 {
1756                     error!(
1757                         "Could not unmap region {}, error: {}",
1758                         region.index,
1759                         io::Error::last_os_error()
1760                     );
1761                 }
1762             }
1763         }
1764     }
1765 
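    /// Map a DMA range through the device's VFIO container. This is a
    /// no-op when the device is attached to a virtual IOMMU, as mappings
    /// are then expected to be driven externally (see `VfioDmaMapping`).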
1766     pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
1767         if !self.iommu_attached {
1768             self.container
1769                 .vfio_dma_map(iova, size, user_addr)
1770                 .map_err(|e| VfioPciError::DmaMap(e, self.device_path.clone(), self.bdf))?;
1771         }
1772 
1773         Ok(())
1774     }
1775 
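    /// Unmap a DMA range from the device's VFIO container. Like `dma_map`,
    /// this is a no-op when the device is attached to a virtual IOMMU.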
1776     pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
1777         if !self.iommu_attached {
1778             self.container
1779                 .vfio_dma_unmap(iova, size)
1780                 .map_err(|e| VfioPciError::DmaUnmap(e, self.device_path.clone(), self.bdf))?;
1781         }
1782 
1783         Ok(())
1784     }
1785 
1786     pub fn mmio_regions(&self) -> Vec<MmioRegion> {
1787         self.common.mmio_regions.clone()
1788     }
1789 }
1790 
1791 impl Drop for VfioPciDevice {
1792     fn drop(&mut self) {
1793         self.unmap_mmio_regions();
1794 
1795         if let Some(msix) = &self.common.interrupt.msix {
1796             if msix.bar.enabled() {
1797                 self.common.disable_msix();
1798             }
1799         }
1800 
1801         if let Some(msi) = &self.common.interrupt.msi {
1802             if msi.cfg.enabled() {
1803                 self.common.disable_msi();
1804             }
1805         }
1806 
1807         if self.common.interrupt.intx_in_use() {
1808             self.common.disable_intx();
1809         }
1810     }
1811 }
1812 
1813 impl BusDevice for VfioPciDevice {
1814     fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1815         self.read_bar(base, offset, data)
1816     }
1817 
1818     fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
1819         self.write_bar(base, offset, data)
1820     }
1821 }
1822 
1823 // Offset of the 16-bit status register in the PCI configuration space.
1824 const PCI_CONFIG_STATUS_OFFSET: u32 = 0x06;
1825 // Status bit indicating the presence of a capabilities list.
1826 const PCI_CONFIG_STATUS_CAPABILITIES_LIST: u16 = 1 << 4;
1827 // First BAR offset in the PCI config space.
1828 const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
1829 // Capability register offset in the PCI config space.
1830 const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
1831 // The valid bits for the capabilities pointer.
1832 const PCI_CONFIG_CAPABILITY_PTR_MASK: u8 = !0b11;
1833 // Extended capabilities register offset in the PCI config space.
1834 const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
1835 // I/O space BAR indicator: bit 0 of the BAR is set.
1836 const PCI_CONFIG_IO_BAR: u32 = 0x1;
1837 // 64-bit memory bar flag.
1838 const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
1839 // Prefetchable BAR bit.
1840 const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
1841 // PCI config register size (4 bytes).
1842 const PCI_CONFIG_REGISTER_SIZE: usize = 4;
1843 // Number of BARs for a PCI device.
1844 const BAR_NUMS: usize = 6;
1845 // PCI Header Type register index.
1846 const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
1847 // First BAR register index.
1848 const PCI_CONFIG_BAR0_INDEX: usize = 4;
1849 // PCI ROM expansion BAR register index.
1850 const PCI_ROM_EXP_BAR_INDEX: usize = 12;
1851 
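// Worked example (annotation, not part of the original source): a 64-bit
// prefetchable memory BAR whose low dword reads 0xfe00_000c decodes as:
//   bit 0     = 0  -> memory BAR (PCI_CONFIG_IO_BAR not set)
//   bits 2:1  = 10 -> 64-bit wide (PCI_CONFIG_MEMORY_BAR_64BIT)
//   bit 3     = 1  -> prefetchable (PCI_CONFIG_BAR_PREFETCHABLE)
//   bits 31:4 = base address bits, here 0xfe00_0000
// The register at the next BAR index holds the upper 32 bits of the base.
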
1852 impl PciDevice for VfioPciDevice {
1853     fn allocate_bars(
1854         &mut self,
1855         allocator: &Arc<Mutex<SystemAllocator>>,
1856         mmio32_allocator: &mut AddressAllocator,
1857         mmio64_allocator: &mut AddressAllocator,
1858         resources: Option<Vec<Resource>>,
1859     ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
1860         self.common
1861             .allocate_bars(allocator, mmio32_allocator, mmio64_allocator, resources)
1862     }
1863 
1864     fn free_bars(
1865         &mut self,
1866         allocator: &mut SystemAllocator,
1867         mmio32_allocator: &mut AddressAllocator,
1868         mmio64_allocator: &mut AddressAllocator,
1869     ) -> Result<(), PciDeviceError> {
1870         self.common
1871             .free_bars(allocator, mmio32_allocator, mmio64_allocator)
1872     }
1873 
1874     fn write_config_register(
1875         &mut self,
1876         reg_idx: usize,
1877         offset: u64,
1878         data: &[u8],
1879     ) -> (Vec<BarReprogrammingParams>, Option<Arc<Barrier>>) {
1880         self.common.write_config_register(reg_idx, offset, data)
1881     }
1882 
1883     fn read_config_register(&mut self, reg_idx: usize) -> u32 {
1884         self.common.read_config_register(reg_idx)
1885     }
1886 
1887     fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1888         self.common.read_bar(base, offset, data)
1889     }
1890 
1891     fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
1892         self.common.write_bar(base, offset, data)
1893     }
1894 
1895     fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
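        // Re-anchor every user memory region backing this BAR: unmap its
        // DMA mapping from the VFIO container, remove the old user memory
        // region from the VM, shift the start address, insert a region at
        // the new address, then re-establish the DMA mapping.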
1896         for region in self.common.mmio_regions.iter_mut() {
1897             if region.start.raw_value() == old_base {
1898                 region.start = GuestAddress(new_base);
1899 
1900                 for user_memory_region in region.user_memory_regions.iter_mut() {
1901                     // Unmap the old MMIO region from vfio container
1902                     if !self.iommu_attached {
1903                         if let Err(e) = self
1904                             .container
1905                             .vfio_dma_unmap(user_memory_region.start, user_memory_region.size)
1906                             .map_err(|e| {
1907                                 VfioPciError::DmaUnmap(e, self.device_path.clone(), self.bdf)
1908                             })
1909                         {
1910                             error!(
1911                                 "Could not unmap mmio region from vfio container: \
1912                                 iova 0x{:x}, size 0x{:x}: {}",
1913                                 user_memory_region.start, user_memory_region.size, e
1914                             );
1915                         }
1916                     }
1917 
1918                     // Remove old region
1919                     let old_mem_region = self.vm.make_user_memory_region(
1920                         user_memory_region.slot,
1921                         user_memory_region.start,
1922                         user_memory_region.size,
1923                         user_memory_region.host_addr,
1924                         false,
1925                         false,
1926                     );
1927 
1928                     self.vm
1929                         .remove_user_memory_region(old_mem_region)
1930                         .map_err(io::Error::other)?;
1931 
1932                     // Update the user memory region with the correct start address.
1933                     if new_base > old_base {
1934                         user_memory_region.start += new_base - old_base;
1935                     } else {
1936                         user_memory_region.start -= old_base - new_base;
1937                     }
1938 
1939                     // Insert new region
1940                     let new_mem_region = self.vm.make_user_memory_region(
1941                         user_memory_region.slot,
1942                         user_memory_region.start,
1943                         user_memory_region.size,
1944                         user_memory_region.host_addr,
1945                         false,
1946                         false,
1947                     );
1948 
1949                     self.vm
1950                         .create_user_memory_region(new_mem_region)
1951                         .map_err(io::Error::other)?;
1952 
1953                     // Map the moved mmio region to vfio container
1954                     if !self.iommu_attached {
1955                         self.container
1956                             .vfio_dma_map(
1957                                 user_memory_region.start,
1958                                 user_memory_region.size,
1959                                 user_memory_region.host_addr,
1960                             )
1961                             .map_err(|e| {
1962                                 VfioPciError::DmaMap(e, self.device_path.clone(), self.bdf)
1963                             })
1964                             .map_err(|e| {
1965                                 io::Error::other(format!(
1966                                     "Could not map mmio region to vfio container: \
1967                                     iova 0x{:x}, size 0x{:x}: {}",
1968                                     user_memory_region.start, user_memory_region.size, e
1969                                 ))
1970                             })?;
1971                     }
1972                 }
1973             }
1974         }
1975 
1976         Ok(())
1977     }
1978 
1979     fn as_any_mut(&mut self) -> &mut dyn Any {
1980         self
1981     }
1982 
1983     fn id(&self) -> Option<String> {
1984         Some(self.id.clone())
1985     }
1986 }
1987 
1988 impl Pausable for VfioPciDevice {}
1989 
1990 impl Snapshottable for VfioPciDevice {
1991     fn id(&self) -> String {
1992         self.id.clone()
1993     }
1994 
1995     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
1996         let mut vfio_pci_dev_snapshot = Snapshot::default();
1997 
1998         // Snapshot VfioCommon
1999         vfio_pci_dev_snapshot.add_snapshot(self.common.id(), self.common.snapshot()?);
2000 
2001         Ok(vfio_pci_dev_snapshot)
2002     }
2003 }
2004 impl Transportable for VfioPciDevice {}
2005 impl Migratable for VfioPciDevice {}
2006 
2007 /// This structure implements the ExternalDmaMapping trait. It is meant to
2008 /// be used by callers that need a way to update the DMA mappings
2009 /// associated with a specific VFIO container.
2010 pub struct VfioDmaMapping<M: GuestAddressSpace> {
2011     container: Arc<VfioContainer>,
2012     memory: Arc<M>,
2013     mmio_regions: Arc<Mutex<Vec<MmioRegion>>>,
2014 }
2015 
2016 impl<M: GuestAddressSpace> VfioDmaMapping<M> {
2017     /// Create a VfioDmaMapping object.
2018     /// # Parameters
2019     /// * `container`: VFIO container object.
2020     /// * `memory`: guest memory, used to translate GPAs into host addresses.
2021     /// * `mmio_regions`: device MMIO regions that DMA may also target.
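    ///
    /// # Example
    ///
    /// ```ignore
    /// // Hypothetical usage sketch; `container`, `guest_memory` and
    /// // `mmio_regions` are assumed to be built by the VMM elsewhere.
    /// let mapping = VfioDmaMapping::new(container, Arc::new(guest_memory), mmio_regions);
    /// mapping.map(0x1000, 0x1000, 0x1000)?; // identity-map one 4 KiB page
    /// ```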
2022     pub fn new(
2023         container: Arc<VfioContainer>,
2024         memory: Arc<M>,
2025         mmio_regions: Arc<Mutex<Vec<MmioRegion>>>,
2026     ) -> Self {
2027         VfioDmaMapping {
2028             container,
2029             memory,
2030             mmio_regions,
2031         }
2032     }
2033 }
2034 
2035 impl<M: GuestAddressSpace + Sync + Send> ExternalDmaMapping for VfioDmaMapping<M> {
2036     fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), io::Error> {
2037         let mem = self.memory.memory();
2038         let guest_addr = GuestAddress(gpa);
2039         let user_addr = if mem.check_range(guest_addr, size as usize) {
2040             match mem.get_host_address(guest_addr) {
2041                 Ok(t) => t as u64,
2042                 Err(e) => {
2043                     return Err(io::Error::other(
2044                         format!("unable to retrieve user address for gpa 0x{gpa:x} from guest memory region: {e}")
2045                     ));
2046                 }
2047             }
2048         } else if self.mmio_regions.lock().unwrap().check_range(gpa, size) {
2049             self.mmio_regions.lock().unwrap().find_user_address(gpa)?
2050         } else {
2051             return Err(io::Error::other(format!(
2052                 "failed to locate guest address 0x{gpa:x} in guest memory"
2053             )));
2054         };
2055 
2056         self.container
2057             .vfio_dma_map(iova, size, user_addr)
2058             .map_err(|e| {
2059                 io::Error::other(format!(
2060                     "failed to map memory for VFIO container, \
2061                          iova 0x{iova:x}, gpa 0x{gpa:x}, size 0x{size:x}: {e:?}"
2062                 ))
2063             })
2064     }
2065 
2066     fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), io::Error> {
2067         self.container.vfio_dma_unmap(iova, size).map_err(|e| {
2068             io::Error::other(format!(
2069                 "failed to unmap memory for VFIO container, \
2070                      iova 0x{iova:x}, size 0x{size:x}: {e:?}"
2071             ))
2072         })
2073     }
2074 }
2075