xref: /cloud-hypervisor/pci/src/vfio.rs (revision fa7a000dbe9637eb256af18ae8c3c4a8d5bf9c8f)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 
6 use crate::msi::{MsiConfigState, MSI_CONFIG_ID};
7 use crate::msix::MsixConfigState;
8 use crate::{
9     msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
10     PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
11     PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
12     PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID,
13 };
14 use anyhow::anyhow;
15 use byteorder::{ByteOrder, LittleEndian};
16 use hypervisor::HypervisorVmError;
17 use libc::{sysconf, _SC_PAGESIZE};
18 use std::any::Any;
19 use std::collections::{BTreeMap, HashMap};
20 use std::io;
21 use std::os::unix::io::AsRawFd;
22 use std::ptr::null_mut;
23 use std::sync::{Arc, Barrier, Mutex};
24 use thiserror::Error;
25 use versionize::{VersionMap, Versionize, VersionizeResult};
26 use versionize_derive::Versionize;
27 use vfio_bindings::bindings::vfio::*;
28 use vfio_ioctls::{
29     VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
30 };
31 use vm_allocator::page_size::{
32     align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned,
33 };
34 use vm_allocator::{AddressAllocator, SystemAllocator};
35 use vm_device::dma_mapping::ExternalDmaMapping;
36 use vm_device::interrupt::{
37     InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
38 };
39 use vm_device::{BusDevice, Resource};
40 use vm_memory::{Address, GuestAddress, GuestAddressSpace, GuestMemory, GuestUsize};
41 use vm_migration::{
42     Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
43 };
44 use vmm_sys_util::eventfd::EventFd;
45 
46 pub(crate) const VFIO_COMMON_ID: &str = "vfio_common";
47 
48 #[derive(Debug, Error)]
49 pub enum VfioPciError {
50     #[error("Failed to create user memory region: {0}")]
51     CreateUserMemoryRegion(#[source] HypervisorVmError),
52     #[error("Failed to DMA map: {0}")]
53     DmaMap(#[source] vfio_ioctls::VfioError),
54     #[error("Failed to DMA unmap: {0}")]
55     DmaUnmap(#[source] vfio_ioctls::VfioError),
56     #[error("Failed to enable INTx: {0}")]
57     EnableIntx(#[source] VfioError),
58     #[error("Failed to enable MSI: {0}")]
59     EnableMsi(#[source] VfioError),
60     #[error("Failed to enable MSI-x: {0}")]
61     EnableMsix(#[source] VfioError),
62     #[error("Failed to mmap the area")]
63     MmapArea,
64     #[error("Failed to notifier's eventfd")]
65     MissingNotifier,
66     #[error("Invalid region alignment")]
67     RegionAlignment,
68     #[error("Invalid region size")]
69     RegionSize,
70     #[error("Failed to retrieve MsiConfigState: {0}")]
71     RetrieveMsiConfigState(#[source] anyhow::Error),
72     #[error("Failed to retrieve MsixConfigState: {0}")]
73     RetrieveMsixConfigState(#[source] anyhow::Error),
74     #[error("Failed to retrieve PciConfigurationState: {0}")]
75     RetrievePciConfigurationState(#[source] anyhow::Error),
76     #[error("Failed to retrieve VfioCommonState: {0}")]
77     RetrieveVfioCommonState(#[source] anyhow::Error),
78 }
79 
80 #[derive(Copy, Clone)]
81 enum PciVfioSubclass {
82     VfioSubclass = 0xff,
83 }
84 
85 impl PciSubclass for PciVfioSubclass {
86     fn get_register_value(&self) -> u8 {
87         *self as u8
88     }
89 }
90 
91 enum InterruptUpdateAction {
92     EnableMsi,
93     DisableMsi,
94     EnableMsix,
95     DisableMsix,
96 }
97 
98 #[derive(Versionize)]
99 struct IntxState {
100     enabled: bool,
101 }
102 
103 pub(crate) struct VfioIntx {
104     interrupt_source_group: Arc<dyn InterruptSourceGroup>,
105     enabled: bool,
106 }
107 
108 #[derive(Versionize)]
109 struct MsiState {
110     cap: MsiCap,
111     cap_offset: u32,
112 }
113 
114 pub(crate) struct VfioMsi {
115     pub(crate) cfg: MsiConfig,
116     cap_offset: u32,
117     interrupt_source_group: Arc<dyn InterruptSourceGroup>,
118 }
119 
120 impl VfioMsi {
121     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
122         let old_enabled = self.cfg.enabled();
123 
124         self.cfg.update(offset, data);
125 
126         let new_enabled = self.cfg.enabled();
127 
128         if !old_enabled && new_enabled {
129             return Some(InterruptUpdateAction::EnableMsi);
130         }
131 
132         if old_enabled && !new_enabled {
133             return Some(InterruptUpdateAction::DisableMsi);
134         }
135 
136         None
137     }
138 }
139 
140 #[derive(Versionize)]
141 struct MsixState {
142     cap: MsixCap,
143     cap_offset: u32,
144     bdf: u32,
145 }
146 
147 pub(crate) struct VfioMsix {
148     pub(crate) bar: MsixConfig,
149     cap: MsixCap,
150     cap_offset: u32,
151     interrupt_source_group: Arc<dyn InterruptSourceGroup>,
152 }
153 
154 impl VfioMsix {
155     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
156         let old_enabled = self.bar.enabled();
157 
158         // Update "Message Control" word
159         if offset == 2 && data.len() == 2 {
160             self.bar.set_msg_ctl(LittleEndian::read_u16(data));
161         }
162 
163         let new_enabled = self.bar.enabled();
164 
165         if !old_enabled && new_enabled {
166             return Some(InterruptUpdateAction::EnableMsix);
167         }
168 
169         if old_enabled && !new_enabled {
170             return Some(InterruptUpdateAction::DisableMsix);
171         }
172 
173         None
174     }
175 
176     fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
177         let table_offset: u64 = u64::from(self.cap.table_offset());
178         let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
179         let table_bir: u32 = self.cap.table_bir();
180 
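        // For example, with table_bir == 1, table_offset == 0x2000 and a
        // 16-entry table (16 * MSIX_TABLE_ENTRY_SIZE = 0x100 bytes), accesses
        // to BAR 1 within [0x2000, 0x2100) hit the MSI-X table.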
181         bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
182     }
183 }
184 
185 pub(crate) struct Interrupt {
186     pub(crate) intx: Option<VfioIntx>,
187     pub(crate) msi: Option<VfioMsi>,
188     pub(crate) msix: Option<VfioMsix>,
189 }
190 
191 impl Interrupt {
192     fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
193         if let Some(ref mut msi) = &mut self.msi {
194             let action = msi.update(offset, data);
195             return action;
196         }
197 
198         None
199     }
200 
201     fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
202         if let Some(ref mut msix) = &mut self.msix {
203             let action = msix.update(offset, data);
204             return action;
205         }
206 
207         None
208     }
209 
210     fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
211         if let Some(msi) = &self.msi {
212             if offset >= u64::from(msi.cap_offset)
213                 && offset < u64::from(msi.cap_offset) + msi.cfg.size()
214             {
215                 return Some((
216                     PciCapabilityId::MessageSignalledInterrupts,
217                     u64::from(msi.cap_offset),
218                 ));
219             }
220         }
221 
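        // For MSI-X, only the first dword of the capability needs trapping: it
        // holds the Message Control word toggling enable/masking, while the
        // table and PBA dwords are served from our cache in read_config_register().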
222         if let Some(msix) = &self.msix {
223             if offset == u64::from(msix.cap_offset) {
224                 return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
225             }
226         }
227 
228         None
229     }
230 
231     fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
232         if let Some(msix) = &self.msix {
233             return msix.table_accessed(bar_index, offset);
234         }
235 
236         false
237     }
238 
239     fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
240         if let Some(ref mut msix) = &mut self.msix {
241             let offset = offset - u64::from(msix.cap.table_offset());
242             msix.bar.write_table(offset, data)
243         }
244     }
245 
246     fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
247         if let Some(msix) = &self.msix {
248             let offset = offset - u64::from(msix.cap.table_offset());
249             msix.bar.read_table(offset, data)
250         }
251     }
252 
253     pub(crate) fn intx_in_use(&self) -> bool {
254         if let Some(intx) = &self.intx {
255             return intx.enabled;
256         }
257 
258         false
259     }
260 }
261 
262 #[derive(Copy, Clone)]
263 pub struct UserMemoryRegion {
264     pub slot: u32,
265     pub start: u64,
266     pub size: u64,
267     pub host_addr: u64,
268 }
269 
270 #[derive(Clone)]
271 pub struct MmioRegion {
272     pub start: GuestAddress,
273     pub length: GuestUsize,
274     pub(crate) type_: PciBarRegionType,
275     pub(crate) index: u32,
276     pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
277 }
278 
279 trait MmioRegionRange {
280     fn check_range(&self, guest_addr: u64, size: u64) -> bool;
281     fn find_user_address(&self, guest_addr: u64) -> Result<u64, io::Error>;
282 }
283 
284 impl MmioRegionRange for Vec<MmioRegion> {
285     // Check if a guest address is within the range of mmio regions
286     fn check_range(&self, guest_addr: u64, size: u64) -> bool {
287         for region in self.iter() {
288             let Some(guest_addr_end) = guest_addr.checked_add(size) else {
289                 return false;
290             };
291             let Some(region_end) = region.start.raw_value().checked_add(region.length) else {
292                 return false;
293             };
294             if guest_addr >= region.start.raw_value() && guest_addr_end <= region_end {
295                 return true;
296             }
297         }
298         false
299     }
300 
301     // Locate the user region address for a guest address within all mmio regions
302     fn find_user_address(&self, guest_addr: u64) -> Result<u64, io::Error> {
303         for region in self.iter() {
304             for user_region in region.user_memory_regions.iter() {
305                 if guest_addr >= user_region.start
306                     && guest_addr < user_region.start + user_region.size
307                 {
308                     return Ok(user_region.host_addr + (guest_addr - user_region.start));
309                 }
310             }
311         }
312 
313         Err(io::Error::new(
314             io::ErrorKind::Other,
315             format!("unable to find user address: 0x{guest_addr:x}"),
316         ))
317     }
318 }
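
// Illustrative sketch: a minimal check of the range helpers above, assuming a
// single 4KiB region at guest address 0x1000 backed by host address 0x7000.
#[cfg(test)]
mod mmio_region_range_sketch {
    use super::*;

    #[test]
    fn check_range_and_find_user_address() {
        let regions = vec![MmioRegion {
            start: GuestAddress(0x1000),
            length: 0x1000,
            type_: PciBarRegionType::Memory32BitRegion,
            index: 0,
            user_memory_regions: vec![UserMemoryRegion {
                slot: 0,
                start: 0x1000,
                size: 0x1000,
                host_addr: 0x7000,
            }],
        }];

        // Fully contained in [0x1000, 0x2000) -> in range.
        assert!(regions.check_range(0x1800, 0x100));
        // Crosses the end of the region -> out of range.
        assert!(!regions.check_range(0x1800, 0x1000));
        // Host address = host_addr + (guest_addr - start).
        assert_eq!(regions.find_user_address(0x1800).unwrap(), 0x7800);
    }
}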
319 
320 #[derive(Debug, Error)]
321 pub enum VfioError {
322     #[error("Kernel VFIO error: {0}")]
323     KernelVfio(#[source] vfio_ioctls::VfioError),
324     #[error("VFIO user error: {0}")]
325     VfioUser(#[source] vfio_user::Error),
326 }
327 
328 pub(crate) trait Vfio: Send + Sync {
329     fn read_config_byte(&self, offset: u32) -> u8 {
330         let mut data: [u8; 1] = [0];
331         self.read_config(offset, &mut data);
332         data[0]
333     }
334 
335     fn read_config_word(&self, offset: u32) -> u16 {
336         let mut data: [u8; 2] = [0, 0];
337         self.read_config(offset, &mut data);
338         u16::from_le_bytes(data)
339     }
340 
341     fn read_config_dword(&self, offset: u32) -> u32 {
342         let mut data: [u8; 4] = [0, 0, 0, 0];
343         self.read_config(offset, &mut data);
344         u32::from_le_bytes(data)
345     }
346 
347     fn write_config_dword(&self, offset: u32, buf: u32) {
348         let data: [u8; 4] = buf.to_le_bytes();
349         self.write_config(offset, &data)
350     }
351 
352     fn read_config(&self, offset: u32, data: &mut [u8]) {
353         self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
354     }
355 
356     fn write_config(&self, offset: u32, data: &[u8]) {
357         self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
358     }
359 
360     fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
361         self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
362     }
363 
364     fn disable_msi(&self) -> Result<(), VfioError> {
365         self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
366     }
367 
368     fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
369         self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
370     }
371 
372     fn disable_msix(&self) -> Result<(), VfioError> {
373         self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
374     }
375 
376     fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
377         unimplemented!()
378     }
379 
380     fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
381         unimplemented!()
382     }
383 
384     fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
385         unimplemented!()
386     }
387 
388     fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
389         unimplemented!()
390     }
391 
392     fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
393         unimplemented!()
394     }
395 
396     fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
397         unimplemented!()
398     }
399 }
400 
401 struct VfioDeviceWrapper {
402     device: Arc<VfioDevice>,
403 }
404 
405 impl VfioDeviceWrapper {
406     fn new(device: Arc<VfioDevice>) -> Self {
407         Self { device }
408     }
409 }
410 
411 impl Vfio for VfioDeviceWrapper {
412     fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
413         self.device.region_read(index, data, offset)
414     }
415 
416     fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
417         self.device.region_write(index, data, offset)
418     }
419 
420     fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
421         self.device.get_irq_info(irq_index).copied()
422     }
423 
424     fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
425         self.device
426             .enable_irq(irq_index, event_fds)
427             .map_err(VfioError::KernelVfio)
428     }
429 
430     fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
431         self.device
432             .disable_irq(irq_index)
433             .map_err(VfioError::KernelVfio)
434     }
435 
436     fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
437         self.device
438             .unmask_irq(irq_index)
439             .map_err(VfioError::KernelVfio)
440     }
441 }
442 
443 #[derive(Versionize)]
444 struct VfioCommonState {
445     intx_state: Option<IntxState>,
446     msi_state: Option<MsiState>,
447     msix_state: Option<MsixState>,
448 }
449 
450 impl VersionMapped for VfioCommonState {}
451 
452 pub(crate) struct ConfigPatch {
453     mask: u32,
454     patch: u32,
455 }
456 
457 pub(crate) struct VfioCommon {
458     pub(crate) configuration: PciConfiguration,
459     pub(crate) mmio_regions: Vec<MmioRegion>,
460     pub(crate) interrupt: Interrupt,
461     pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
462     pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
463     pub(crate) vfio_wrapper: Arc<dyn Vfio>,
464     pub(crate) patches: HashMap<usize, ConfigPatch>,
465     x_nv_gpudirect_clique: Option<u8>,
466 }
467 
468 impl VfioCommon {
469     pub(crate) fn new(
470         msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
471         legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
472         vfio_wrapper: Arc<dyn Vfio>,
473         subclass: &dyn PciSubclass,
474         bdf: PciBdf,
475         snapshot: Option<Snapshot>,
476         x_nv_gpudirect_clique: Option<u8>,
477     ) -> Result<Self, VfioPciError> {
478         let pci_configuration_state =
479             vm_migration::versioned_state_from_id(snapshot.as_ref(), PCI_CONFIGURATION_ID)
480                 .map_err(|e| {
481                     VfioPciError::RetrievePciConfigurationState(anyhow!(
482                         "Failed to get PciConfigurationState from Snapshot: {}",
483                         e
484                     ))
485                 })?;
486 
487         let configuration = PciConfiguration::new(
488             0,
489             0,
490             0,
491             PciClassCode::Other,
492             subclass,
493             None,
494             PciHeaderType::Device,
495             0,
496             0,
497             None,
498             pci_configuration_state,
499         );
500 
501         let mut vfio_common = VfioCommon {
502             mmio_regions: Vec::new(),
503             configuration,
504             interrupt: Interrupt {
505                 intx: None,
506                 msi: None,
507                 msix: None,
508             },
509             msi_interrupt_manager,
510             legacy_interrupt_group,
511             vfio_wrapper,
512             patches: HashMap::new(),
513             x_nv_gpudirect_clique,
514         };
515 
516         let state: Option<VfioCommonState> = snapshot
517             .as_ref()
518             .map(|s| s.to_versioned_state())
519             .transpose()
520             .map_err(|e| {
521                 VfioPciError::RetrieveVfioCommonState(anyhow!(
522                     "Failed to get VfioCommonState from Snapshot: {}",
523                     e
524                 ))
525             })?;
526         let msi_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSI_CONFIG_ID)
527             .map_err(|e| {
528                 VfioPciError::RetrieveMsiConfigState(anyhow!(
529                     "Failed to get MsiConfigState from Snapshot: {}",
530                     e
531                 ))
532             })?;
533         let msix_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSIX_CONFIG_ID)
534             .map_err(|e| {
535             VfioPciError::RetrieveMsixConfigState(anyhow!(
536                 "Failed to get MsixConfigState from Snapshot: {}",
537                 e
538             ))
539         })?;
540 
541         if let Some(state) = state.as_ref() {
542             vfio_common.set_state(state, msi_state, msix_state)?;
543         } else {
544             vfio_common.parse_capabilities(bdf);
545             vfio_common.initialize_legacy_interrupt()?;
546         }
547 
548         Ok(vfio_common)
549     }
550 
551     /// If the MSI-X table offset is not page-size aligned, fix it up so that
552     /// the directly-mapped MMIO region and the trapped region don't overlap.
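    ///
    /// For example, a 16KiB BAR with the table at offset 0x100 is expanded to
    /// 32KiB: the table moves to offset 0x4000 (the middle of the new region)
    /// and the PBA shifts by the same amount, so the lower half can be mapped
    /// directly while the upper half stays trapped.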
553     fn fixup_msix_region(&mut self, bar_id: u32, region_size: u64) -> u64 {
554         if let Some(msix) = self.interrupt.msix.as_mut() {
555             let msix_cap = &mut msix.cap;
556 
557             // We assume the table and the PBA live in the same BAR (table_bir == pba_bir).
558             let (table_offset, table_size) = msix_cap.table_range();
559             if is_page_size_aligned(table_offset) || msix_cap.table_bir() != bar_id {
560                 return region_size;
561             }
562 
563             let (pba_offset, pba_size) = msix_cap.pba_range();
564             let msix_sz = align_page_size_up(table_size + pba_size);
565             // Expand the region so it can hold both the directly-mapped and trapped parts, each page-size aligned
566             let size = std::cmp::max(region_size * 2, msix_sz * 2);
567             // Let the table start from the middle of the region
568             msix_cap.table_set_offset((size / 2) as u32);
569             msix_cap.pba_set_offset((size / 2 + pba_offset - table_offset) as u32);
570 
571             size
572         } else {
573             // MSI-X not supported for this device
574             region_size
575         }
576     }
577 
578     // The `allocator` argument is unused on `aarch64`
579     #[allow(unused_variables)]
580     pub(crate) fn allocate_bars(
581         &mut self,
582         allocator: &Arc<Mutex<SystemAllocator>>,
583         mmio32_allocator: &mut AddressAllocator,
584         mmio64_allocator: &mut AddressAllocator,
585         resources: Option<Vec<Resource>>,
586     ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
587         let mut bars = Vec::new();
588         let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX;
589 
590         // Going through all regular regions to compute the BAR size.
591         // We're not saving the BAR address to restore it, because we
592         // are going to allocate a guest address for each BAR and write
593         // that new address back.
594         while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
595             let mut region_size: u64 = 0;
596             let mut region_type = PciBarRegionType::Memory32BitRegion;
597             let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
598             let mut flags: u32 = 0;
599 
600             let mut restored_bar_addr = None;
601             if let Some(resources) = &resources {
602                 for resource in resources {
603                     if let Resource::PciBar {
604                         index,
605                         base,
606                         size,
607                         type_,
608                         ..
609                     } = resource
610                     {
611                         if *index == bar_id as usize {
612                             restored_bar_addr = Some(GuestAddress(*base));
613                             region_size = *size;
614                             region_type = PciBarRegionType::from(*type_);
615                             break;
616                         }
617                     }
618                 }
619                 if restored_bar_addr.is_none() {
620                     bar_id += 1;
621                     continue;
622                 }
623             } else {
624                 let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
625                     (PCI_ROM_EXP_BAR_INDEX * 4) as u32
626                 } else {
627                     PCI_CONFIG_BAR_OFFSET + bar_id * 4
628                 };
629 
630                 // First read flags
631                 flags = self.vfio_wrapper.read_config_dword(bar_offset);
632 
633                 // Is this an IO BAR?
634                 let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
635                     matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
636                 } else {
637                     false
638                 };
639 
640                 // Is this a 64-bit BAR?
641                 let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
642                     matches!(
643                         flags & PCI_CONFIG_MEMORY_BAR_64BIT,
644                         PCI_CONFIG_MEMORY_BAR_64BIT
645                     )
646                 } else {
647                     false
648                 };
649 
650                 if matches!(
651                     flags & PCI_CONFIG_BAR_PREFETCHABLE,
652                     PCI_CONFIG_BAR_PREFETCHABLE
653                 ) {
654                     prefetchable = PciBarPrefetchable::Prefetchable
655                 };
656 
657                 // To get the BAR size, write all 1s to it
658                 self.vfio_wrapper
659                     .write_config_dword(bar_offset, 0xffff_ffff);
660 
661                 // And read back the BAR value. The device will write zeros for the bits it doesn't care about.
662                 let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);
663 
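                // For example, an 8KiB prefetchable 32-bit memory BAR reads
                // back 0xffff_e008 here: masking the low 4 flag bits gives
                // 0xffff_e000, and !0xffff_e000 + 1 == 0x2000 (8KiB).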
664                 if io_bar {
665                     // Mask flag bits (lowest 2 for I/O bars)
666                     lower &= !0b11;
667 
668                     // BAR is not enabled
669                     if lower == 0 {
670                         bar_id += 1;
671                         continue;
672                     }
673 
674                     // IO BAR
675                     region_type = PciBarRegionType::IoRegion;
676 
677                     // Invert bits and add 1 to calculate size
678                     region_size = (!lower + 1) as u64;
679                 } else if is_64bit_bar {
680                     // 64 bits Memory BAR
681                     region_type = PciBarRegionType::Memory64BitRegion;
682 
683                     // Query the size of the upper half of the 64-bit BAR
684                     let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
685                     self.vfio_wrapper
686                         .write_config_dword(upper_offset, 0xffff_ffff);
687                     let upper = self.vfio_wrapper.read_config_dword(upper_offset);
688 
689                     let mut combined_size = u64::from(upper) << 32 | u64::from(lower);
690 
691                     // Mask out flag bits (lowest 4 for memory bars)
692                     combined_size &= !0b1111;
693 
694                     // BAR is not enabled
695                     if combined_size == 0 {
696                         bar_id += 1;
697                         continue;
698                     }
699 
700                     // Invert and add 1 to find the size
701                     region_size = !combined_size + 1;
702                 } else {
703                     region_type = PciBarRegionType::Memory32BitRegion;
704 
705                     // Mask out flag bits (lowest 4 for memory bars)
706                     lower &= !0b1111;
707 
708                     if lower == 0 {
709                         bar_id += 1;
710                         continue;
711                     }
712 
713                     // Invert and add 1 to find the size
714                     region_size = (!lower + 1) as u64;
715                 }
716             }
717 
718             let bar_addr = match region_type {
719                 PciBarRegionType::IoRegion => {
720                     #[cfg(target_arch = "aarch64")]
721                     unimplemented!();
722 
723                     // The address needs to be 4-byte aligned.
724                     #[cfg(not(target_arch = "aarch64"))]
725                     allocator
726                         .lock()
727                         .unwrap()
728                         .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
729                         .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
730                 }
731                 PciBarRegionType::Memory32BitRegion => {
732                     // BAR allocation must be naturally aligned
733                     mmio32_allocator
734                         .allocate(restored_bar_addr, region_size, Some(region_size))
735                         .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
736                 }
737                 PciBarRegionType::Memory64BitRegion => {
738                     // We need to do some fixup to keep the MMIO RW region and the
739                     // MSI-X table/PBA region page-size aligned.
740                     region_size = self.fixup_msix_region(bar_id, region_size);
741                     mmio64_allocator
742                         .allocate(
743                             restored_bar_addr,
744                             region_size,
745                             Some(std::cmp::max(
746                                 // SAFETY: FFI call. Trivially safe.
747                                 unsafe { sysconf(_SC_PAGESIZE) as GuestUsize },
748                                 region_size,
749                             )),
750                         )
751                         .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
752                 }
753             };
754 
755             // We can now build our BAR configuration block.
756             let bar = PciBarConfiguration::default()
757                 .set_index(bar_id as usize)
758                 .set_address(bar_addr.raw_value())
759                 .set_size(region_size)
760                 .set_region_type(region_type)
761                 .set_prefetchable(prefetchable);
762 
763             if bar_id == VFIO_PCI_ROM_REGION_INDEX {
764                 self.configuration
765                     .add_pci_rom_bar(&bar, flags & 0x1)
766                     .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
767             } else {
768                 self.configuration
769                     .add_pci_bar(&bar)
770                     .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
771             }
772 
773             bars.push(bar);
774             self.mmio_regions.push(MmioRegion {
775                 start: bar_addr,
776                 length: region_size,
777                 type_: region_type,
778                 index: bar_id,
779                 user_memory_regions: Vec::new(),
780             });
781 
782             bar_id += 1;
783             if region_type == PciBarRegionType::Memory64BitRegion {
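                // A 64-bit BAR consumes two BAR slots, so skip the upper half.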
784                 bar_id += 1;
785             }
786         }
787 
788         Ok(bars)
789     }
790 
791     // The `allocator` argument is unused on `aarch64`
792     #[allow(unused_variables)]
793     pub(crate) fn free_bars(
794         &mut self,
795         allocator: &mut SystemAllocator,
796         mmio32_allocator: &mut AddressAllocator,
797         mmio64_allocator: &mut AddressAllocator,
798     ) -> Result<(), PciDeviceError> {
799         for region in self.mmio_regions.iter() {
800             match region.type_ {
801                 PciBarRegionType::IoRegion => {
802                     #[cfg(target_arch = "x86_64")]
803                     allocator.free_io_addresses(region.start, region.length);
804                     #[cfg(target_arch = "aarch64")]
805                     error!("I/O region is not supported");
806                 }
807                 PciBarRegionType::Memory32BitRegion => {
808                     mmio32_allocator.free(region.start, region.length);
809                 }
810                 PciBarRegionType::Memory64BitRegion => {
811                     mmio64_allocator.free(region.start, region.length);
812                 }
813             }
814         }
815         Ok(())
816     }
817 
818     pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
819         let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());
820 
821         let table = self.vfio_wrapper.read_config_dword((cap + 4).into());
822 
823         let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());
824 
825         MsixCap {
826             msg_ctl,
827             table,
828             pba,
829         }
830     }
831 
832     pub(crate) fn initialize_msix(
833         &mut self,
834         msix_cap: MsixCap,
835         cap_offset: u32,
836         bdf: PciBdf,
837         state: Option<MsixConfigState>,
838     ) {
839         let interrupt_source_group = self
840             .msi_interrupt_manager
841             .create_group(MsiIrqGroupConfig {
842                 base: 0,
843                 count: msix_cap.table_size() as InterruptIndex,
844             })
845             .unwrap();
846 
847         let msix_config = MsixConfig::new(
848             msix_cap.table_size(),
849             interrupt_source_group.clone(),
850             bdf.into(),
851             state,
852         )
853         .unwrap();
854 
855         self.interrupt.msix = Some(VfioMsix {
856             bar: msix_config,
857             cap: msix_cap,
858             cap_offset,
859             interrupt_source_group,
860         });
861     }
862 
863     pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
864         self.vfio_wrapper.read_config_word((cap + 2).into())
865     }
866 
867     pub(crate) fn initialize_msi(
868         &mut self,
869         msg_ctl: u16,
870         cap_offset: u32,
871         state: Option<MsiConfigState>,
872     ) {
873         let interrupt_source_group = self
874             .msi_interrupt_manager
875             .create_group(MsiIrqGroupConfig {
876                 base: 0,
877                 count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
878             })
879             .unwrap();
880 
881         let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap();
882 
883         self.interrupt.msi = Some(VfioMsi {
884             cfg: msi_config,
885             cap_offset,
886             interrupt_source_group,
887         });
888     }
889 
890     pub(crate) fn get_msix_cap_idx(&self) -> Option<usize> {
891         let mut cap_next = self
892             .vfio_wrapper
893             .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);
894 
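        // Walk the standard PCI capability list: for each capability, byte 0
        // holds the capability ID and byte 1 the offset of the next capability
        // (0 terminates the list).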
895         while cap_next != 0 {
896             let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());
897             if PciCapabilityId::from(cap_id) == PciCapabilityId::MsiX {
898                 return Some(cap_next as usize);
899             } else {
900                 cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
901             }
902         }
903 
904         None
905     }
906 
907     pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) {
908         let mut cap_iter = self
909             .vfio_wrapper
910             .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);
911 
912         let mut pci_express_cap_found = false;
913         let mut power_management_cap_found = false;
914 
915         while cap_iter != 0 {
916             let cap_id = self.vfio_wrapper.read_config_byte(cap_iter.into());
917 
918             match PciCapabilityId::from(cap_id) {
919                 PciCapabilityId::MessageSignalledInterrupts => {
920                     if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
921                         if irq_info.count > 0 {
922                             // Parse capability only if the VFIO device
923                             // supports MSI.
924                             let msg_ctl = self.parse_msi_capabilities(cap_iter);
925                             self.initialize_msi(msg_ctl, cap_iter as u32, None);
926                         }
927                     }
928                 }
929                 PciCapabilityId::MsiX => {
930                     if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
931                     {
932                         if irq_info.count > 0 {
933                             // Parse capability only if the VFIO device
934                             // supports MSI-X.
935                             let msix_cap = self.parse_msix_capabilities(cap_iter);
936                             self.initialize_msix(msix_cap, cap_iter as u32, bdf, None);
937                         }
938                     }
939                 }
940                 PciCapabilityId::PciExpress => pci_express_cap_found = true,
941                 PciCapabilityId::PowerManagement => power_management_cap_found = true,
942                 _ => {}
943             };
944 
945             let cap_next = self.vfio_wrapper.read_config_byte((cap_iter + 1).into());
946             if cap_next == 0 {
947                 break;
948             }
949 
950             cap_iter = cap_next;
951         }
952 
953         if let Some(clique_id) = self.x_nv_gpudirect_clique {
954             self.add_nv_gpudirect_clique_cap(cap_iter, clique_id);
955         }
956 
957         if pci_express_cap_found && power_management_cap_found {
958             self.parse_extended_capabilities();
959         }
960     }
961 
962     fn add_nv_gpudirect_clique_cap(&mut self, cap_iter: u8, clique_id: u8) {
963         // Turing, Ampere, Hopper, and Lovelace GPUs have dedicated space
964         // at 0xD4 for this capability.
965         let cap_offset = 0xd4u32;
966 
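        // Two patches are needed: first re-point the "next capability" pointer
        // (bits 15:8 of the dword holding the last capability header) at 0xd4,
        // then synthesize the vendor-specific capability dwords at 0xd4 itself.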
967         let reg_idx = (cap_iter / 4) as usize;
968         self.patches.insert(
969             reg_idx,
970             ConfigPatch {
971                 mask: 0x0000_ff00,
972                 patch: cap_offset << 8,
973             },
974         );
975 
976         let reg_idx = (cap_offset / 4) as usize;
977         self.patches.insert(
978             reg_idx,
979             ConfigPatch {
980                 mask: 0xffff_ffff,
981                 patch: 0x50080009u32,
982             },
983         );
984         self.patches.insert(
985             reg_idx + 1,
986             ConfigPatch {
987                 mask: 0xffff_ffff,
988                 patch: u32::from(clique_id) << 19 | 0x5032,
989             },
990         );
991     }
992 
993     fn parse_extended_capabilities(&mut self) {
994         let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;
995 
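        // Each PCI Express extended capability header is one dword: bits [15:0]
        // hold the capability ID, bits [19:16] the version and bits [31:20] the
        // offset of the next capability (0 terminates the list, which starts at
        // config space offset 0x100).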
996         loop {
997             let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);
998 
999             let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
1000             let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;
1001 
1002             match PciExpressCapabilityId::from(cap_id) {
1003                 PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation
1004                 | PciExpressCapabilityId::ResizeableBar
1005                 | PciExpressCapabilityId::SingleRootIoVirtualization => {
1006                     let reg_idx = (current_offset / 4) as usize;
1007                     self.patches.insert(
1008                         reg_idx,
1009                         ConfigPatch {
1010                             mask: 0x0000_ffff,
1011                             patch: PciExpressCapabilityId::NullCapability as u32,
1012                         },
1013                     );
1014                 }
1015                 _ => {}
1016             }
1017 
1018             if cap_next == 0 {
1019                 break;
1020             }
1021 
1022             current_offset = cap_next.into();
1023         }
1024     }
1025 
1026     pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
1027         if let Some(intx) = &mut self.interrupt.intx {
1028             if !intx.enabled {
1029                 if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
1030                     self.vfio_wrapper
1031                         .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
1032                         .map_err(VfioPciError::EnableIntx)?;
1033 
1034                     intx.enabled = true;
1035                 } else {
1036                     return Err(VfioPciError::MissingNotifier);
1037                 }
1038             }
1039         }
1040 
1041         Ok(())
1042     }
1043 
1044     pub(crate) fn disable_intx(&mut self) {
1045         if let Some(intx) = &mut self.interrupt.intx {
1046             if intx.enabled {
1047                 if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
1048                     error!("Could not disable INTx: {}", e);
1049                 } else {
1050                     intx.enabled = false;
1051                 }
1052             }
1053         }
1054     }
1055 
1056     pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
1057         if let Some(msi) = &self.interrupt.msi {
1058             let mut irq_fds: Vec<EventFd> = Vec::new();
1059             for i in 0..msi.cfg.num_enabled_vectors() {
1060                 if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
1061                     irq_fds.push(eventfd);
1062                 } else {
1063                     return Err(VfioPciError::MissingNotifier);
1064                 }
1065             }
1066 
1067             self.vfio_wrapper
1068                 .enable_msi(irq_fds.iter().collect())
1069                 .map_err(VfioPciError::EnableMsi)?;
1070         }
1071 
1072         Ok(())
1073     }
1074 
1075     pub(crate) fn disable_msi(&self) {
1076         if let Err(e) = self.vfio_wrapper.disable_msi() {
1077             error!("Could not disable MSI: {}", e);
1078         }
1079     }
1080 
1081     pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
1082         if let Some(msix) = &self.interrupt.msix {
1083             let mut irq_fds: Vec<EventFd> = Vec::new();
1084             for i in 0..msix.bar.table_entries.len() {
1085                 if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
1086                     irq_fds.push(eventfd);
1087                 } else {
1088                     return Err(VfioPciError::MissingNotifier);
1089                 }
1090             }
1091 
1092             self.vfio_wrapper
1093                 .enable_msix(irq_fds.iter().collect())
1094                 .map_err(VfioPciError::EnableMsix)?;
1095         }
1096 
1097         Ok(())
1098     }
1099 
1100     pub(crate) fn disable_msix(&self) {
1101         if let Err(e) = self.vfio_wrapper.disable_msix() {
1102             error!("Could not disable MSI-X: {}", e);
1103         }
1104     }
1105 
1106     pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
1107         if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
1108             if irq_info.count == 0 {
1109                 // A count of 0 means the INTx IRQ is not supported, therefore
1110                 // it shouldn't be initialized.
1111                 return Ok(());
1112             }
1113         }
1114 
1115         if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
1116             self.interrupt.intx = Some(VfioIntx {
1117                 interrupt_source_group,
1118                 enabled: false,
1119             });
1120 
1121             self.enable_intx()?;
1122         }
1123 
1124         Ok(())
1125     }
1126 
1127     pub(crate) fn update_msi_capabilities(
1128         &mut self,
1129         offset: u64,
1130         data: &[u8],
1131     ) -> Result<(), VfioPciError> {
1132         match self.interrupt.update_msi(offset, data) {
1133             Some(InterruptUpdateAction::EnableMsi) => {
1134                 // Disable INTx before we can enable MSI
1135                 self.disable_intx();
1136                 self.enable_msi()?;
1137             }
1138             Some(InterruptUpdateAction::DisableMsi) => {
1139                 // Fall back onto INTx when disabling MSI
1140                 self.disable_msi();
1141                 self.enable_intx()?;
1142             }
1143             _ => {}
1144         }
1145 
1146         Ok(())
1147     }
1148 
1149     pub(crate) fn update_msix_capabilities(
1150         &mut self,
1151         offset: u64,
1152         data: &[u8],
1153     ) -> Result<(), VfioPciError> {
1154         match self.interrupt.update_msix(offset, data) {
1155             Some(InterruptUpdateAction::EnableMsix) => {
1156                 // Disable INTx before we can enable MSI-X
1157                 self.disable_intx();
1158                 self.enable_msix()?;
1159             }
1160             Some(InterruptUpdateAction::DisableMsix) => {
1161                 // Fallback onto INTx when disabling MSI-X
1162                 self.disable_msix();
1163                 self.enable_intx()?;
1164             }
1165             _ => {}
1166         }
1167 
1168         Ok(())
1169     }
1170 
1171     pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
1172         for region in self.mmio_regions.iter() {
1173             if addr >= region.start.raw_value()
1174                 && addr < region.start.unchecked_add(region.length).raw_value()
1175             {
1176                 return Some(region.clone());
1177             }
1178         }
1179         None
1180     }
1181 
1182     pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1183         let addr = base + offset;
1184         if let Some(region) = self.find_region(addr) {
1185             let offset = addr - region.start.raw_value();
1186 
1187             if self.interrupt.msix_table_accessed(region.index, offset) {
1188                 self.interrupt.msix_read_table(offset, data);
1189             } else {
1190                 self.vfio_wrapper.region_read(region.index, offset, data);
1191             }
1192         }
1193 
1194         // INTx EOI
1195         // The guest reading from the BAR potentially means the interrupt has
1196         // been received and can be acknowledged.
1197         if self.interrupt.intx_in_use() {
1198             if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
1199                 error!("Failed unmasking INTx IRQ: {}", e);
1200             }
1201         }
1202     }
1203 
1204     pub(crate) fn write_bar(
1205         &mut self,
1206         base: u64,
1207         offset: u64,
1208         data: &[u8],
1209     ) -> Option<Arc<Barrier>> {
1210         let addr = base + offset;
1211         if let Some(region) = self.find_region(addr) {
1212             let offset = addr - region.start.raw_value();
1213 
1214             // If the MSI-X table is written to, we need to update our cache.
1215             if self.interrupt.msix_table_accessed(region.index, offset) {
1216                 self.interrupt.msix_write_table(offset, data);
1217             } else {
1218                 self.vfio_wrapper.region_write(region.index, offset, data);
1219             }
1220         }
1221 
1222         // INTx EOI
1223         // The guest writing to the BAR potentially means the interrupt has
1224         // been received and can be acknowledged.
1225         if self.interrupt.intx_in_use() {
1226             if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
1227                 error!("Failed unmasking INTx IRQ: {}", e);
1228             }
1229         }
1230 
1231         None
1232     }
1233 
1234     pub(crate) fn write_config_register(
1235         &mut self,
1236         reg_idx: usize,
1237         offset: u64,
1238         data: &[u8],
1239     ) -> Option<Arc<Barrier>> {
1240         // When the guest wants to write to a BAR, we trap it into
1241         // our local configuration space. We're not reprogramming
1242         // the VFIO device.
1243         if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
1244             || reg_idx == PCI_ROM_EXP_BAR_INDEX
1245         {
1246             // We keep our local cache updated with the BARs.
1247             // We'll read it back from there when the guest is asking
1248             // for BARs (see read_config_register()).
1249             self.configuration
1250                 .write_config_register(reg_idx, offset, data);
1251             return None;
1252         }
1253 
1254         let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;
1255 
1256         // If the MSI or MSI-X capabilities are accessed, we need to
1257         // update our local cache accordingly.
1258         // Depending on how the capabilities are modified, this could
1259         // trigger a VFIO MSI or MSI-X toggle.
1260         if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
1261             let cap_offset: u64 = reg - cap_base + offset;
1262             match cap_id {
1263                 PciCapabilityId::MessageSignalledInterrupts => {
1264                     if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
1265                         error!("Could not update MSI capabilities: {}", e);
1266                     }
1267                 }
1268                 PciCapabilityId::MsiX => {
1269                     if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
1270                         error!("Could not update MSI-X capabilities: {}", e);
1271                     }
1272                 }
1273                 _ => {}
1274             }
1275         }
1276 
1277         // Make sure to write to the device's PCI config space after MSI/MSI-X
1278         // interrupts have been enabled/disabled. In case of MSI, when the
1279         // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
1280         // the MSI Enable bit in the MSI capability structure found in the PCI
1281         // config space is disabled by default. That's why when the guest is
1282         // enabling this bit, we first need to enable the MSI interrupts with
1283         // VFIO through VFIO_DEVICE_SET_IRQS ioctl, and only after we can write
1284         // to the device region to update the MSI Enable bit.
1285         self.vfio_wrapper.write_config((reg + offset) as u32, data);
1286 
1287         None
1288     }
1289 
1290     pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
1291         // When reading the BARs, we trap it and return what comes
1292         // from our local configuration space. We want the guest to
1293         // use that and not the VFIO device BARs as it does not map
1294         // with the guest address space.
1295         if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
1296             || reg_idx == PCI_ROM_EXP_BAR_INDEX
1297         {
1298             return self.configuration.read_reg(reg_idx);
1299         }
1300 
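        // The MSI-X table and PBA dwords are returned from our cached copy,
        // since fixup_msix_region() may have moved them away from the offsets
        // the device itself reports.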
1301         if let Some(id) = self.get_msix_cap_idx() {
1302             let msix = self.interrupt.msix.as_mut().unwrap();
1303             if reg_idx * 4 == id + 4 {
1304                 return msix.cap.table;
1305             } else if reg_idx * 4 == id + 8 {
1306                 return msix.cap.pba;
1307             }
1308         }
1309 
1310         // Since we don't support passing through multi-function devices, we
1311         // should mask out the multi-function bit, bit 7 of the Header Type
1312         // byte in register 3.
1313         let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
1314             0xff7f_ffff
1315         } else {
1316             0xffff_ffff
1317         };
1318 
1319         // The config register read comes from the VFIO device itself.
1320         let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;
1321 
1322         if let Some(config_patch) = self.patches.get(&reg_idx) {
1323             value = (value & !config_patch.mask) | config_patch.patch;
1324         }
1325 
1326         value
1327     }
1328 
1329     fn state(&self) -> VfioCommonState {
1330         let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
1331             enabled: intx.enabled,
1332         });
1333 
1334         let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
1335             cap: msi.cfg.cap,
1336             cap_offset: msi.cap_offset,
1337         });
1338 
1339         let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
1340             cap: msix.cap,
1341             cap_offset: msix.cap_offset,
1342             bdf: msix.bar.devid,
1343         });
1344 
1345         VfioCommonState {
1346             intx_state,
1347             msi_state,
1348             msix_state,
1349         }
1350     }
1351 
1352     fn set_state(
1353         &mut self,
1354         state: &VfioCommonState,
1355         msi_state: Option<MsiConfigState>,
1356         msix_state: Option<MsixConfigState>,
1357     ) -> Result<(), VfioPciError> {
1358         if let (Some(intx), Some(interrupt_source_group)) =
1359             (&state.intx_state, self.legacy_interrupt_group.clone())
1360         {
1361             self.interrupt.intx = Some(VfioIntx {
1362                 interrupt_source_group,
1363                 enabled: false,
1364             });
1365 
1366             if intx.enabled {
1367                 self.enable_intx()?;
1368             }
1369         }
1370 
1371         if let Some(msi) = &state.msi_state {
1372             self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state);
1373         }
1374 
1375         if let Some(msix) = &state.msix_state {
1376             self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state);
1377         }
1378 
1379         Ok(())
1380     }
1381 }
1382 
1383 impl Pausable for VfioCommon {}
1384 
1385 impl Snapshottable for VfioCommon {
1386     fn id(&self) -> String {
1387         String::from(VFIO_COMMON_ID)
1388     }
1389 
1390     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
1391         let mut vfio_common_snapshot = Snapshot::new_from_versioned_state(&self.state())?;
1392 
1393         // Snapshot PciConfiguration
1394         vfio_common_snapshot.add_snapshot(self.configuration.id(), self.configuration.snapshot()?);
1395 
1396         // Snapshot MSI
1397         if let Some(msi) = &mut self.interrupt.msi {
1398             vfio_common_snapshot.add_snapshot(msi.cfg.id(), msi.cfg.snapshot()?);
1399         }
1400 
1401         // Snapshot MSI-X
1402         if let Some(msix) = &mut self.interrupt.msix {
1403             vfio_common_snapshot.add_snapshot(msix.bar.id(), msix.bar.snapshot()?);
1404         }
1405 
1406         Ok(vfio_common_snapshot)
1407     }
1408 }
1409 
1410 /// VfioPciDevice represents a VFIO PCI device.
1411 /// This structure implements the BusDevice and PciDevice traits.
1412 ///
1413 /// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
1414 /// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
1415 /// which then gets added to the PCI bus.
1416 pub struct VfioPciDevice {
1417     id: String,
1418     vm: Arc<dyn hypervisor::Vm>,
1419     device: Arc<VfioDevice>,
1420     container: Arc<VfioContainer>,
1421     common: VfioCommon,
1422     iommu_attached: bool,
1423     memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
1424 }
1425 
1426 impl VfioPciDevice {
1427     /// Constructs a new VFIO PCI device for the given VFIO device.
1428     #[allow(clippy::too_many_arguments)]
1429     pub fn new(
1430         id: String,
1431         vm: &Arc<dyn hypervisor::Vm>,
1432         device: VfioDevice,
1433         container: Arc<VfioContainer>,
1434         msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
1435         legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
1436         iommu_attached: bool,
1437         bdf: PciBdf,
1438         memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
1439         snapshot: Option<Snapshot>,
1440         x_nv_gpudirect_clique: Option<u8>,
1441     ) -> Result<Self, VfioPciError> {
1442         let device = Arc::new(device);
1443         device.reset();
1444 
1445         let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));
1446 
1447         let common = VfioCommon::new(
1448             msi_interrupt_manager,
1449             legacy_interrupt_group,
1450             Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
1451             &PciVfioSubclass::VfioSubclass,
1452             bdf,
1453             vm_migration::snapshot_from_id(snapshot.as_ref(), VFIO_COMMON_ID),
1454             x_nv_gpudirect_clique,
1455         )?;
1456 
1457         let vfio_pci_device = VfioPciDevice {
1458             id,
1459             vm: vm.clone(),
1460             device,
1461             container,
1462             common,
1463             iommu_attached,
1464             memory_slot,
1465         };
1466 
1467         Ok(vfio_pci_device)
1468     }
1469 
1470     pub fn iommu_attached(&self) -> bool {
1471         self.iommu_attached
1472     }
1473 
1474     fn generate_sparse_areas(
1475         caps: &[VfioRegionInfoCap],
1476         region_index: u32,
1477         region_start: u64,
1478         region_size: u64,
1479         vfio_msix: Option<&VfioMsix>,
1480     ) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
1481         for cap in caps {
1482             match cap {
1483                 VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
1484                 VfioRegionInfoCap::MsixMappable => {
1485                     if !is_4k_aligned(region_start) {
1486                         error!(
1487                             "Region start address 0x{:x} must be at least aligned on 4KiB",
1488                             region_start
1489                         );
1490                         return Err(VfioPciError::RegionAlignment);
1491                     }
1492                     if !is_4k_multiple(region_size) {
1493                         error!(
1494                             "Region size 0x{:x} must be at least a multiple of 4KiB",
1495                             region_size
1496                         );
1497                         return Err(VfioPciError::RegionSize);
1498                     }
1499 
1500                     // In case the region contains the MSI-X vectors table or
1501                     // the MSI-X PBA table, we must calculate the subregions
1502                     // around them, leading to a list of sparse areas.
1503                     // We want to make sure we will still trap MMIO accesses
1504                     // to these MSI-X specific ranges. If these ranges don't
1505                     // align with the page size, we enlarge them to page boundaries.
1506                     //
1507                     // Use a BTreeMap so that iterating over the ranges yields them
1508                     // sorted by offset, which ensures the whole region is split correctly.
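                    //
                    // For example (illustrative values): a 0x4000-byte region
                    // whose MSI-X table spans [0x1000, 0x2000) yields the
                    // sparse areas [0x0, 0x1000) and [0x2000, 0x4000), so the
                    // table range itself remains trapped.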
1509                     let mut inter_ranges = BTreeMap::new();
1510                     if let Some(msix) = vfio_msix {
1511                         if region_index == msix.cap.table_bir() {
1512                             let (offset, size) = msix.cap.table_range();
1513                             let offset = align_page_size_down(offset);
1514                             let size = align_page_size_up(size);
1515                             inter_ranges.insert(offset, size);
1516                         }
1517                         if region_index == msix.cap.pba_bir() {
1518                             let (offset, size) = msix.cap.pba_range();
1519                             let offset = align_page_size_down(offset);
1520                             let size = align_page_size_up(size);
1521                             inter_ranges.insert(offset, size);
1522                         }
1523                     }
1524 
1525                     let mut sparse_areas = Vec::new();
1526                     let mut current_offset = 0;
1527                     for (range_offset, range_size) in inter_ranges {
1528                         if range_offset > current_offset {
1529                             sparse_areas.push(VfioRegionSparseMmapArea {
1530                                 offset: current_offset,
1531                                 size: range_offset - current_offset,
1532                             });
1533                         }
1534                         current_offset = align_page_size_down(range_offset + range_size);
1535                     }
1536 
1537                     if region_size > current_offset {
1538                         sparse_areas.push(VfioRegionSparseMmapArea {
1539                             offset: current_offset,
1540                             size: region_size - current_offset,
1541                         });
1542                     }
1543 
1544                     return Ok(sparse_areas);
1545                 }
1546                 _ => {}
1547             }
1548         }
1549 
1550         // In case no relevant capabilities have been found, create a single
1551         // sparse area corresponding to the entire MMIO region.
1552         Ok(vec![VfioRegionSparseMmapArea {
1553             offset: 0,
1554             size: region_size,
1555         }])
1556     }
1557 
1558     /// Map MMIO regions into the guest, and avoid VM exits when the guest
1559     /// tries to reach those regions.
1560     ///
1561     /// Mmap'able regions are registered with the hypervisor as user memory
1562     /// regions, using the `vm` handle and the `memory_slot` closure that
1563     /// were provided at construction time. Unless the device is attached
1564     /// to a virtual IOMMU, each mapped area is also DMA mapped into the
1565     /// VFIO container.
1566     pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
1567         let fd = self.device.as_raw_fd();
1568 
1569         for region in self.common.mmio_regions.iter_mut() {
1570             let region_flags = self.device.get_region_flags(region.index);
1571             if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
1572                 let mut prot = 0;
1573                 if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
1574                     prot |= libc::PROT_READ;
1575                 }
1576                 if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
1577                     prot |= libc::PROT_WRITE;
1578                 }
1579 
1580                 // Retrieve the list of capabilities found on the region
1581                 let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
1582                     self.device.get_region_caps(region.index)
1583                 } else {
1584                     Vec::new()
1585                 };
1586 
1587                 // Don't try to mmap the region if it contains the MSI-X
1588                 // table or the MSI-X PBA subregion and MSIX_MAPPABLE wasn't
1589                 // found in the list of supported capabilities.
1590                 if let Some(msix) = self.common.interrupt.msix.as_ref() {
1591                     if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
1592                         && !caps.contains(&VfioRegionInfoCap::MsixMappable)
1593                     {
1594                         continue;
1595                     }
1596                 }
1597 
1598                 let mmap_size = self.device.get_region_size(region.index);
1599                 let mmap_offset = self.device.get_region_offset(region.index);
1600 
1601                 let sparse_areas = Self::generate_sparse_areas(
1602                     &caps,
1603                     region.index,
1604                     region.start.0,
1605                     mmap_size,
1606                     self.common.interrupt.msix.as_ref(),
1607                 )?;
1608 
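                // mmap each sparse area individually, so the carved-out
                // ranges (e.g. the MSI-X table) keep trapping to the VMM.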
1609                 for area in sparse_areas.iter() {
1610                     // SAFETY: FFI call with correct arguments
1611                     let host_addr = unsafe {
1612                         libc::mmap(
1613                             null_mut(),
1614                             area.size as usize,
1615                             prot,
1616                             libc::MAP_SHARED,
1617                             fd,
1618                             mmap_offset as libc::off_t + area.offset as libc::off_t,
1619                         )
1620                     };
1621 
1622                     if host_addr == libc::MAP_FAILED {
1623                         error!(
1624                             "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
1625                             area.offset,
1626                             area.size,
1627                             std::io::Error::last_os_error()
1628                         );
1629                         return Err(VfioPciError::MmapArea);
1630                     }
1631 
1632                     if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) {
1633                         warn!(
1634                             "Sparse area is not page size aligned (offset = 0x{:x}, size = 0x{:x}), skipping its registration",
1635                             area.offset,
1636                             area.size,
1637                         );
1638                         return Ok(());
1639                     }
1640 
1641                     let user_memory_region = UserMemoryRegion {
1642                         slot: (self.memory_slot)(),
1643                         start: region.start.0 + area.offset,
1644                         size: area.size,
1645                         host_addr: host_addr as u64,
1646                     };
1647 
1648                     region.user_memory_regions.push(user_memory_region);
1649 
1650                     let mem_region = self.vm.make_user_memory_region(
1651                         user_memory_region.slot,
1652                         user_memory_region.start,
1653                         user_memory_region.size,
1654                         user_memory_region.host_addr,
1655                         false,
1656                         false,
1657                     );
1658 
1659                     self.vm
1660                         .create_user_memory_region(mem_region)
1661                         .map_err(VfioPciError::CreateUserMemoryRegion)?;
1662 
1663                     if !self.iommu_attached {
1664                         self.container
1665                             .vfio_dma_map(
1666                                 user_memory_region.start,
1667                                 user_memory_region.size,
1668                                 user_memory_region.host_addr,
1669                             )
1670                             .map_err(VfioPciError::DmaMap)?;
1671                     }
1672                 }
1673             }
1674         }
1675 
1676         Ok(())
1677     }
1678 
1679     pub fn unmap_mmio_regions(&mut self) {
1680         for region in self.common.mmio_regions.iter() {
1681             for user_memory_region in region.user_memory_regions.iter() {
1682                 // Unmap from vfio container
1683                 if !self.iommu_attached {
1684                     if let Err(e) = self
1685                         .container
1686                         .vfio_dma_unmap(user_memory_region.start, user_memory_region.size)
1687                     {
1688                         error!("Could not unmap mmio region from vfio container: {}", e);
1689                     }
1690                 }
1691 
1692                 // Remove region
1693                 let r = self.vm.make_user_memory_region(
1694                     user_memory_region.slot,
1695                     user_memory_region.start,
1696                     user_memory_region.size,
1697                     user_memory_region.host_addr,
1698                     false,
1699                     false,
1700                 );
1701 
1702                 if let Err(e) = self.vm.remove_user_memory_region(r) {
1703                     error!("Could not remove the userspace memory region: {}", e);
1704                 }
1705 
1706                 // SAFETY: FFI call with correct arguments
1707                 let ret = unsafe {
1708                     libc::munmap(
1709                         user_memory_region.host_addr as *mut libc::c_void,
1710                         user_memory_region.size as usize,
1711                     )
1712                 };
1713                 if ret != 0 {
1714                     error!(
1715                         "Could not unmap region {}, error: {}",
1716                         region.index,
1717                         io::Error::last_os_error()
1718                     );
1719                 }
1720             }
1721         }
1722     }
1723 
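    // DMA map/unmap helpers going through the VFIO container. When the device
    // is attached to a virtual IOMMU, the mappings are driven through that
    // IOMMU instead, so these calls become no-ops.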
1724     pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
1725         if !self.iommu_attached {
1726             self.container
1727                 .vfio_dma_map(iova, size, user_addr)
1728                 .map_err(VfioPciError::DmaMap)?;
1729         }
1730 
1731         Ok(())
1732     }
1733 
1734     pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
1735         if !self.iommu_attached {
1736             self.container
1737                 .vfio_dma_unmap(iova, size)
1738                 .map_err(VfioPciError::DmaUnmap)?;
1739         }
1740 
1741         Ok(())
1742     }
1743 
1744     pub fn mmio_regions(&self) -> Vec<MmioRegion> {
1745         self.common.mmio_regions.clone()
1746     }
1747 }
1748 
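// On drop, undo the guest MMIO mappings and disable whichever interrupt mode
// (MSI-X, MSI or INTx) is still enabled, leaving the host device quiescent.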
1749 impl Drop for VfioPciDevice {
1750     fn drop(&mut self) {
1751         self.unmap_mmio_regions();
1752 
1753         if let Some(msix) = &self.common.interrupt.msix {
1754             if msix.bar.enabled() {
1755                 self.common.disable_msix();
1756             }
1757         }
1758 
1759         if let Some(msi) = &self.common.interrupt.msi {
1760             if msi.cfg.enabled() {
1761                 self.common.disable_msi()
1762             }
1763         }
1764 
1765         if self.common.interrupt.intx_in_use() {
1766             self.common.disable_intx();
1767         }
1768     }
1769 }
1770 
1771 impl BusDevice for VfioPciDevice {
1772     fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1773         self.read_bar(base, offset, data)
1774     }
1775 
1776     fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
1777         self.write_bar(base, offset, data)
1778     }
1779 }
1780 
1781 // First BAR offset in the PCI config space.
1782 const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
1783 // Capability register offset in the PCI config space.
1784 const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
1785 // Extended capabilities register offset in the PCI config space.
1786 const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
1787 // I/O space BAR flag (bit 0 of the BAR register).
1788 const PCI_CONFIG_IO_BAR: u32 = 0x1;
1789 // 64-bit memory bar flag.
1790 const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
1791 // Prefetchable BAR bit
1792 const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
1793 // PCI config register size (4 bytes).
1794 const PCI_CONFIG_REGISTER_SIZE: usize = 4;
1795 // Number of BARs for a PCI device
1796 const BAR_NUMS: usize = 6;
1797 // PCI Header Type register index
1798 const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
1799 // First BAR register index
1800 const PCI_CONFIG_BAR0_INDEX: usize = 4;
1801 // PCI ROM expansion BAR register index
1802 const PCI_ROM_EXP_BAR_INDEX: usize = 12;
1803 
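// The PciDevice implementation mostly delegates to the shared VfioCommon
// logic. Only move_bar() needs extra handling here, since the user memory
// regions backing a moved BAR must be re-registered at the new address.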
1804 impl PciDevice for VfioPciDevice {
1805     fn allocate_bars(
1806         &mut self,
1807         allocator: &Arc<Mutex<SystemAllocator>>,
1808         mmio32_allocator: &mut AddressAllocator,
1809         mmio64_allocator: &mut AddressAllocator,
1810         resources: Option<Vec<Resource>>,
1811     ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
1812         self.common
1813             .allocate_bars(allocator, mmio32_allocator, mmio64_allocator, resources)
1814     }
1815 
1816     fn free_bars(
1817         &mut self,
1818         allocator: &mut SystemAllocator,
1819         mmio32_allocator: &mut AddressAllocator,
1820         mmio64_allocator: &mut AddressAllocator,
1821     ) -> Result<(), PciDeviceError> {
1822         self.common
1823             .free_bars(allocator, mmio32_allocator, mmio64_allocator)
1824     }
1825 
1826     fn write_config_register(
1827         &mut self,
1828         reg_idx: usize,
1829         offset: u64,
1830         data: &[u8],
1831     ) -> Option<Arc<Barrier>> {
1832         self.common.write_config_register(reg_idx, offset, data)
1833     }
1834 
1835     fn read_config_register(&mut self, reg_idx: usize) -> u32 {
1836         self.common.read_config_register(reg_idx)
1837     }
1838 
1839     fn detect_bar_reprogramming(
1840         &mut self,
1841         reg_idx: usize,
1842         data: &[u8],
1843     ) -> Option<BarReprogrammingParams> {
1844         self.common
1845             .configuration
1846             .detect_bar_reprogramming(reg_idx, data)
1847     }
1848 
1849     fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1850         self.common.read_bar(base, offset, data)
1851     }
1852 
1853     fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
1854         self.common.write_bar(base, offset, data)
1855     }
1856 
1857     fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
1858         for region in self.common.mmio_regions.iter_mut() {
1859             if region.start.raw_value() == old_base {
1860                 region.start = GuestAddress(new_base);
1861 
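                // A user memory region is moved by removing it, updating its
                // start address, and re-creating it at the new guest address
                // with the same host mapping.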
1862                 for user_memory_region in region.user_memory_regions.iter_mut() {
1863                     // Remove old region
1864                     let old_mem_region = self.vm.make_user_memory_region(
1865                         user_memory_region.slot,
1866                         user_memory_region.start,
1867                         user_memory_region.size,
1868                         user_memory_region.host_addr,
1869                         false,
1870                         false,
1871                     );
1872 
1873                     self.vm
1874                         .remove_user_memory_region(old_mem_region)
1875                         .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
1876 
1877                     // Update the user memory region with the correct start address.
1878                     if new_base > old_base {
1879                         user_memory_region.start += new_base - old_base;
1880                     } else {
1881                         user_memory_region.start -= old_base - new_base;
1882                     }
1883 
1884                     // Insert new region
1885                     let new_mem_region = self.vm.make_user_memory_region(
1886                         user_memory_region.slot,
1887                         user_memory_region.start,
1888                         user_memory_region.size,
1889                         user_memory_region.host_addr,
1890                         false,
1891                         false,
1892                     );
1893 
1894                     self.vm
1895                         .create_user_memory_region(new_mem_region)
1896                         .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
1897                 }
1898             }
1899         }
1900 
1901         Ok(())
1902     }
1903 
1904     fn as_any(&mut self) -> &mut dyn Any {
1905         self
1906     }
1907 
1908     fn id(&self) -> Option<String> {
1909         Some(self.id.clone())
1910     }
1911 }
1912 
1913 impl Pausable for VfioPciDevice {}
1914 
1915 impl Snapshottable for VfioPciDevice {
1916     fn id(&self) -> String {
1917         self.id.clone()
1918     }
1919 
1920     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
1921         let mut vfio_pci_dev_snapshot = Snapshot::default();
1922 
1923         // Snapshot VfioCommon
1924         vfio_pci_dev_snapshot.add_snapshot(self.common.id(), self.common.snapshot()?);
1925 
1926         Ok(vfio_pci_dev_snapshot)
1927     }
1928 }
1929 impl Transportable for VfioPciDevice {}
1930 impl Migratable for VfioPciDevice {}
1931 
1932 /// This structure implements the ExternalDmaMapping trait. It is meant to
1933 /// be used when the caller needs a way to update the DMA mappings
1934 /// associated with a specific VFIO container.
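///
/// A minimal construction sketch (assuming a VFIO `container`, a guest
/// `memory` object and a shared `mmio_regions` list already exist):
///
/// ```ignore
/// let dma_mapping = VfioDmaMapping::new(
///     Arc::clone(&container),
///     Arc::new(memory),
///     Arc::clone(&mmio_regions),
/// );
/// ```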
1935 pub struct VfioDmaMapping<M: GuestAddressSpace> {
1936     container: Arc<VfioContainer>,
1937     memory: Arc<M>,
1938     mmio_regions: Arc<Mutex<Vec<MmioRegion>>>,
1939 }
1940 
1941 impl<M: GuestAddressSpace> VfioDmaMapping<M> {
1942     /// Create a VfioDmaMapping object.
1943     /// # Parameters
1944     /// * `container`: VFIO container object.
1945     /// * `memory`: guest memory used to translate GPAs into host addresses.
1946     /// * `mmio_regions`: device MMIO regions that can also be DMA mapped.
1947     pub fn new(
1948         container: Arc<VfioContainer>,
1949         memory: Arc<M>,
1950         mmio_regions: Arc<Mutex<Vec<MmioRegion>>>,
1951     ) -> Self {
1952         VfioDmaMapping {
1953             container,
1954             memory,
1955             mmio_regions,
1956         }
1957     }
1958 }
1959 
1960 impl<M: GuestAddressSpace + Sync + Send> ExternalDmaMapping for VfioDmaMapping<M> {
1961     fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), io::Error> {
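        // The GPA may point into guest RAM or into one of the device MMIO
        // regions; resolve the corresponding host user address before asking
        // the container to create the IOMMU mapping.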
1962         let mem = self.memory.memory();
1963         let guest_addr = GuestAddress(gpa);
1964         let user_addr = if mem.check_range(guest_addr, size as usize) {
1965             match mem.get_host_address(guest_addr) {
1966                 Ok(t) => t as u64,
1967                 Err(e) => {
1968                     return Err(io::Error::new(
1969                         io::ErrorKind::Other,
1970                         format!("unable to retrieve user address for gpa 0x{gpa:x} from guest memory region: {e}")
1971                     ));
1972                 }
1973             }
1974         } else if self.mmio_regions.lock().unwrap().check_range(gpa, size) {
1975             self.mmio_regions.lock().unwrap().find_user_address(gpa)?
1976         } else {
1977             return Err(io::Error::new(
1978                 io::ErrorKind::Other,
1979                 format!("failed to locate guest address 0x{gpa:x} in guest memory"),
1980             ));
1981         };
1982 
1983         self.container
1984             .vfio_dma_map(iova, size, user_addr)
1985             .map_err(|e| {
1986                 io::Error::new(
1987                     io::ErrorKind::Other,
1988                     format!(
1989                         "failed to map memory for VFIO container, \
1990                          iova 0x{iova:x}, gpa 0x{gpa:x}, size 0x{size:x}: {e:?}"
1991                     ),
1992                 )
1993             })
1994     }
1995 
1996     fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), io::Error> {
1997         self.container.vfio_dma_unmap(iova, size).map_err(|e| {
1998             io::Error::new(
1999                 io::ErrorKind::Other,
2000                 format!(
2001                     "failed to unmap memory for VFIO container, \
2002                      iova 0x{iova:x}, size 0x{size:x}: {e:?}"
2003                 ),
2004             )
2005         })
2006     }
2007 }
2008