xref: /cloud-hypervisor/pci/src/vfio.rs (revision 261bfac4d47e4da0a8554b0968706ce30c6cc70c)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 
6 use std::any::Any;
7 use std::collections::{BTreeMap, HashMap};
8 use std::io;
9 use std::os::unix::io::AsRawFd;
10 use std::ptr::null_mut;
11 use std::sync::{Arc, Barrier, Mutex};
12 
13 use anyhow::anyhow;
14 use byteorder::{ByteOrder, LittleEndian};
15 use hypervisor::HypervisorVmError;
16 use libc::{sysconf, _SC_PAGESIZE};
17 use serde::{Deserialize, Serialize};
18 use thiserror::Error;
19 use vfio_bindings::bindings::vfio::*;
20 use vfio_ioctls::{
21     VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
22 };
23 use vm_allocator::page_size::{
24     align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned,
25 };
26 use vm_allocator::{AddressAllocator, MemorySlotAllocator, SystemAllocator};
27 use vm_device::dma_mapping::ExternalDmaMapping;
28 use vm_device::interrupt::{
29     InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
30 };
31 use vm_device::{BusDevice, Resource};
32 use vm_memory::{Address, GuestAddress, GuestAddressSpace, GuestMemory, GuestUsize};
33 use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable};
34 use vmm_sys_util::eventfd::EventFd;
35 
36 use crate::msi::{MsiConfigState, MSI_CONFIG_ID};
37 use crate::msix::MsixConfigState;
38 use crate::{
39     msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
40     PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
41     PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
42     PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID,
43 };
44 
45 pub(crate) const VFIO_COMMON_ID: &str = "vfio_common";
46 
47 #[derive(Debug, Error)]
48 pub enum VfioPciError {
49     #[error("Failed to create user memory region: {0}")]
50     CreateUserMemoryRegion(#[source] HypervisorVmError),
51     #[error("Failed to DMA map: {0}")]
52     DmaMap(#[source] vfio_ioctls::VfioError),
53     #[error("Failed to DMA unmap: {0}")]
54     DmaUnmap(#[source] vfio_ioctls::VfioError),
55     #[error("Failed to enable INTx: {0}")]
56     EnableIntx(#[source] VfioError),
57     #[error("Failed to enable MSI: {0}")]
58     EnableMsi(#[source] VfioError),
59     #[error("Failed to enable MSI-x: {0}")]
60     EnableMsix(#[source] VfioError),
61     #[error("Failed to mmap the area")]
62     MmapArea,
63     #[error("Failed to notifier's eventfd")]
64     MissingNotifier,
65     #[error("Invalid region alignment")]
66     RegionAlignment,
67     #[error("Invalid region size")]
68     RegionSize,
69     #[error("Failed to retrieve MsiConfigState: {0}")]
70     RetrieveMsiConfigState(#[source] anyhow::Error),
71     #[error("Failed to retrieve MsixConfigState: {0}")]
72     RetrieveMsixConfigState(#[source] anyhow::Error),
73     #[error("Failed to retrieve PciConfigurationState: {0}")]
74     RetrievePciConfigurationState(#[source] anyhow::Error),
75     #[error("Failed to retrieve VfioCommonState: {0}")]
76     RetrieveVfioCommonState(#[source] anyhow::Error),
77 }
78 
79 #[derive(Copy, Clone)]
80 enum PciVfioSubclass {
81     VfioSubclass = 0xff,
82 }
83 
84 impl PciSubclass for PciVfioSubclass {
85     fn get_register_value(&self) -> u8 {
86         *self as u8
87     }
88 }
89 
90 enum InterruptUpdateAction {
91     EnableMsi,
92     DisableMsi,
93     EnableMsix,
94     DisableMsix,
95 }
96 
97 #[derive(Serialize, Deserialize)]
98 struct IntxState {
99     enabled: bool,
100 }
101 
102 pub(crate) struct VfioIntx {
103     interrupt_source_group: Arc<dyn InterruptSourceGroup>,
104     enabled: bool,
105 }
106 
107 #[derive(Serialize, Deserialize)]
108 struct MsiState {
109     cap: MsiCap,
110     cap_offset: u32,
111 }
112 
113 pub(crate) struct VfioMsi {
114     pub(crate) cfg: MsiConfig,
115     cap_offset: u32,
116     interrupt_source_group: Arc<dyn InterruptSourceGroup>,
117 }
118 
119 impl VfioMsi {
120     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
121         let old_enabled = self.cfg.enabled();
122 
123         self.cfg.update(offset, data);
124 
125         let new_enabled = self.cfg.enabled();
126 
127         if !old_enabled && new_enabled {
128             return Some(InterruptUpdateAction::EnableMsi);
129         }
130 
131         if old_enabled && !new_enabled {
132             return Some(InterruptUpdateAction::DisableMsi);
133         }
134 
135         None
136     }
137 }
138 
139 #[derive(Serialize, Deserialize)]
140 struct MsixState {
141     cap: MsixCap,
142     cap_offset: u32,
143     bdf: u32,
144 }
145 
146 pub(crate) struct VfioMsix {
147     pub(crate) bar: MsixConfig,
148     cap: MsixCap,
149     cap_offset: u32,
150     interrupt_source_group: Arc<dyn InterruptSourceGroup>,
151 }
152 
153 impl VfioMsix {
154     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
155         let old_enabled = self.bar.enabled();
156 
157         // Update "Message Control" word
158         if offset == 2 && data.len() == 2 {
159             self.bar.set_msg_ctl(LittleEndian::read_u16(data));
160         }
161 
162         let new_enabled = self.bar.enabled();
163 
164         if !old_enabled && new_enabled {
165             return Some(InterruptUpdateAction::EnableMsix);
166         }
167 
168         if old_enabled && !new_enabled {
169             return Some(InterruptUpdateAction::DisableMsix);
170         }
171 
172         None
173     }
174 
175     fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
176         let table_offset: u64 = u64::from(self.cap.table_offset());
177         let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
178         let table_bir: u32 = self.cap.table_bir();
179 
180         bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
181     }
182 }
183 
184 pub(crate) struct Interrupt {
185     pub(crate) intx: Option<VfioIntx>,
186     pub(crate) msi: Option<VfioMsi>,
187     pub(crate) msix: Option<VfioMsix>,
188 }
189 
190 impl Interrupt {
191     fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
192         if let Some(ref mut msi) = &mut self.msi {
193             let action = msi.update(offset, data);
194             return action;
195         }
196 
197         None
198     }
199 
200     fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
201         if let Some(ref mut msix) = &mut self.msix {
202             let action = msix.update(offset, data);
203             return action;
204         }
205 
206         None
207     }
208 
209     fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
210         if let Some(msi) = &self.msi {
211             if offset >= u64::from(msi.cap_offset)
212                 && offset < u64::from(msi.cap_offset) + msi.cfg.size()
213             {
214                 return Some((
215                     PciCapabilityId::MessageSignalledInterrupts,
216                     u64::from(msi.cap_offset),
217                 ));
218             }
219         }
220 
221         if let Some(msix) = &self.msix {
222             if offset == u64::from(msix.cap_offset) {
223                 return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
224             }
225         }
226 
227         None
228     }
229 
230     fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
231         if let Some(msix) = &self.msix {
232             return msix.table_accessed(bar_index, offset);
233         }
234 
235         false
236     }
237 
238     fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
239         if let Some(ref mut msix) = &mut self.msix {
240             let offset = offset - u64::from(msix.cap.table_offset());
241             msix.bar.write_table(offset, data)
242         }
243     }
244 
245     fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
246         if let Some(msix) = &self.msix {
247             let offset = offset - u64::from(msix.cap.table_offset());
248             msix.bar.read_table(offset, data)
249         }
250     }
251 
252     pub(crate) fn intx_in_use(&self) -> bool {
253         if let Some(intx) = &self.intx {
254             return intx.enabled;
255         }
256 
257         false
258     }
259 }
260 
261 #[derive(Copy, Clone)]
262 pub struct UserMemoryRegion {
263     pub slot: u32,
264     pub start: u64,
265     pub size: u64,
266     pub host_addr: u64,
267 }
268 
269 #[derive(Clone)]
270 pub struct MmioRegion {
271     pub start: GuestAddress,
272     pub length: GuestUsize,
273     pub(crate) type_: PciBarRegionType,
274     pub(crate) index: u32,
275     pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
276 }
277 
278 trait MmioRegionRange {
279     fn check_range(&self, guest_addr: u64, size: u64) -> bool;
280     fn find_user_address(&self, guest_addr: u64) -> Result<u64, io::Error>;
281 }
282 
283 impl MmioRegionRange for Vec<MmioRegion> {
284     // Check if a guest address is within the range of mmio regions
285     fn check_range(&self, guest_addr: u64, size: u64) -> bool {
286         for region in self.iter() {
287             let Some(guest_addr_end) = guest_addr.checked_add(size) else {
288                 return false;
289             };
290             let Some(region_end) = region.start.raw_value().checked_add(region.length) else {
291                 return false;
292             };
293             if guest_addr >= region.start.raw_value() && guest_addr_end <= region_end {
294                 return true;
295             }
296         }
297         false
298     }
299 
300     // Locate the user region address for a guest address within all mmio regions
301     fn find_user_address(&self, guest_addr: u64) -> Result<u64, io::Error> {
302         for region in self.iter() {
303             for user_region in region.user_memory_regions.iter() {
304                 if guest_addr >= user_region.start
305                     && guest_addr < user_region.start + user_region.size
306                 {
307                     return Ok(user_region.host_addr + (guest_addr - user_region.start));
308                 }
309             }
310         }
311 
312         Err(io::Error::new(
313             io::ErrorKind::Other,
314             format!("unable to find user address: 0x{guest_addr:x}"),
315         ))
316     }
317 }
318 
319 #[derive(Debug, Error)]
320 pub enum VfioError {
321     #[error("Kernel VFIO error: {0}")]
322     KernelVfio(#[source] vfio_ioctls::VfioError),
323     #[error("VFIO user error: {0}")]
324     VfioUser(#[source] vfio_user::Error),
325 }
326 
327 pub(crate) trait Vfio: Send + Sync {
328     fn read_config_byte(&self, offset: u32) -> u8 {
329         let mut data: [u8; 1] = [0];
330         self.read_config(offset, &mut data);
331         data[0]
332     }
333 
334     fn read_config_word(&self, offset: u32) -> u16 {
335         let mut data: [u8; 2] = [0, 0];
336         self.read_config(offset, &mut data);
337         u16::from_le_bytes(data)
338     }
339 
340     fn read_config_dword(&self, offset: u32) -> u32 {
341         let mut data: [u8; 4] = [0, 0, 0, 0];
342         self.read_config(offset, &mut data);
343         u32::from_le_bytes(data)
344     }
345 
346     fn write_config_dword(&self, offset: u32, buf: u32) {
347         let data: [u8; 4] = buf.to_le_bytes();
348         self.write_config(offset, &data)
349     }
350 
351     fn read_config(&self, offset: u32, data: &mut [u8]) {
352         self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
353     }
354 
355     fn write_config(&self, offset: u32, data: &[u8]) {
356         self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
357     }
358 
359     fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
360         self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
361     }
362 
363     fn disable_msi(&self) -> Result<(), VfioError> {
364         self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
365     }
366 
367     fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
368         self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
369     }
370 
371     fn disable_msix(&self) -> Result<(), VfioError> {
372         self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
373     }
374 
375     fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
376         unimplemented!()
377     }
378 
379     fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
380         unimplemented!()
381     }
382 
383     fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
384         unimplemented!()
385     }
386 
387     fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
388         unimplemented!()
389     }
390 
391     fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
392         unimplemented!()
393     }
394 
395     fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
396         unimplemented!()
397     }
398 }
399 
400 struct VfioDeviceWrapper {
401     device: Arc<VfioDevice>,
402 }
403 
404 impl VfioDeviceWrapper {
405     fn new(device: Arc<VfioDevice>) -> Self {
406         Self { device }
407     }
408 }
409 
410 impl Vfio for VfioDeviceWrapper {
411     fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
412         self.device.region_read(index, data, offset)
413     }
414 
415     fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
416         self.device.region_write(index, data, offset)
417     }
418 
419     fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
420         self.device.get_irq_info(irq_index).copied()
421     }
422 
423     fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
424         self.device
425             .enable_irq(irq_index, event_fds)
426             .map_err(VfioError::KernelVfio)
427     }
428 
429     fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
430         self.device
431             .disable_irq(irq_index)
432             .map_err(VfioError::KernelVfio)
433     }
434 
435     fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
436         self.device
437             .unmask_irq(irq_index)
438             .map_err(VfioError::KernelVfio)
439     }
440 }
441 
442 #[derive(Serialize, Deserialize)]
443 struct VfioCommonState {
444     intx_state: Option<IntxState>,
445     msi_state: Option<MsiState>,
446     msix_state: Option<MsixState>,
447 }
448 
449 pub(crate) struct ConfigPatch {
450     mask: u32,
451     patch: u32,
452 }
453 
454 pub(crate) struct VfioCommon {
455     pub(crate) configuration: PciConfiguration,
456     pub(crate) mmio_regions: Vec<MmioRegion>,
457     pub(crate) interrupt: Interrupt,
458     pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
459     pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
460     pub(crate) vfio_wrapper: Arc<dyn Vfio>,
461     pub(crate) patches: HashMap<usize, ConfigPatch>,
462     x_nv_gpudirect_clique: Option<u8>,
463 }
464 
465 impl VfioCommon {
466     pub(crate) fn new(
467         msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
468         legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
469         vfio_wrapper: Arc<dyn Vfio>,
470         subclass: &dyn PciSubclass,
471         bdf: PciBdf,
472         snapshot: Option<Snapshot>,
473         x_nv_gpudirect_clique: Option<u8>,
474     ) -> Result<Self, VfioPciError> {
475         let pci_configuration_state =
476             vm_migration::state_from_id(snapshot.as_ref(), PCI_CONFIGURATION_ID).map_err(|e| {
477                 VfioPciError::RetrievePciConfigurationState(anyhow!(
478                     "Failed to get PciConfigurationState from Snapshot: {}",
479                     e
480                 ))
481             })?;
482 
483         let configuration = PciConfiguration::new(
484             0,
485             0,
486             0,
487             PciClassCode::Other,
488             subclass,
489             None,
490             PciHeaderType::Device,
491             0,
492             0,
493             None,
494             pci_configuration_state,
495         );
496 
497         let mut vfio_common = VfioCommon {
498             mmio_regions: Vec::new(),
499             configuration,
500             interrupt: Interrupt {
501                 intx: None,
502                 msi: None,
503                 msix: None,
504             },
505             msi_interrupt_manager,
506             legacy_interrupt_group,
507             vfio_wrapper,
508             patches: HashMap::new(),
509             x_nv_gpudirect_clique,
510         };
511 
512         let state: Option<VfioCommonState> = snapshot
513             .as_ref()
514             .map(|s| s.to_state())
515             .transpose()
516             .map_err(|e| {
517                 VfioPciError::RetrieveVfioCommonState(anyhow!(
518                     "Failed to get VfioCommonState from Snapshot: {}",
519                     e
520                 ))
521             })?;
522         let msi_state =
523             vm_migration::state_from_id(snapshot.as_ref(), MSI_CONFIG_ID).map_err(|e| {
524                 VfioPciError::RetrieveMsiConfigState(anyhow!(
525                     "Failed to get MsiConfigState from Snapshot: {}",
526                     e
527                 ))
528             })?;
529         let msix_state =
530             vm_migration::state_from_id(snapshot.as_ref(), MSIX_CONFIG_ID).map_err(|e| {
531                 VfioPciError::RetrieveMsixConfigState(anyhow!(
532                     "Failed to get MsixConfigState from Snapshot: {}",
533                     e
534                 ))
535             })?;
536 
537         if let Some(state) = state.as_ref() {
538             vfio_common.set_state(state, msi_state, msix_state)?;
539         } else {
540             vfio_common.parse_capabilities(bdf);
541             vfio_common.initialize_legacy_interrupt()?;
542         }
543 
544         Ok(vfio_common)
545     }
546 
547     /// In case the MSI-X table offset is not page size aligned, do some fixup to achieve it,
548     /// because we don't want the MMIO RW region and the trap region to overlap each other.
549     fn fixup_msix_region(&mut self, bar_id: u32, region_size: u64) -> u64 {
550         if let Some(msix) = self.interrupt.msix.as_mut() {
551             let msix_cap = &mut msix.cap;
552 
553             // Assumption: table_bir equals pba_bir, i.e. the MSI-X table and PBA share the same BAR.
554             let (table_offset, table_size) = msix_cap.table_range();
555             if is_page_size_aligned(table_offset) || msix_cap.table_bir() != bar_id {
556                 return region_size;
557             }
558 
559             let (pba_offset, pba_size) = msix_cap.pba_range();
560             let msix_sz = align_page_size_up(table_size + pba_size);
561             // Expand the region to hold the RW and trap regions, which are both page size aligned
562             let size = std::cmp::max(region_size * 2, msix_sz * 2);
563             // Let the table start from the middle of the region
564             msix_cap.table_set_offset((size / 2) as u32);
565             msix_cap.pba_set_offset((size / 2 + pba_offset - table_offset) as u32);
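            // Worked example (assuming 4 KiB pages): for a 16 KiB BAR whose table
            // starts at the unaligned offset 0x800, `size` becomes 32 KiB, the table
            // moves to the page-aligned offset 0x4000 and the PBA keeps its relative
            // distance, so the first half of the region can be mmap'ed while the
            // second half, holding the table and PBA, stays trapped.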
566 
567             size
568         } else {
569             // MSI-X not supported for this device
570             region_size
571         }
572     }
573 
574     // The `allocator` argument is unused on `aarch64`
575     #[allow(unused_variables)]
576     pub(crate) fn allocate_bars(
577         &mut self,
578         allocator: &Arc<Mutex<SystemAllocator>>,
579         mmio32_allocator: &mut AddressAllocator,
580         mmio64_allocator: &mut AddressAllocator,
581         resources: Option<Vec<Resource>>,
582     ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
583         let mut bars = Vec::new();
584         let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX;
585 
586         // Going through all regular regions to compute the BAR size.
587         // We're not saving the BAR address to restore it, because we
588         // are going to allocate a guest address for each BAR and write
589         // that new address back.
590         while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
591             let mut region_size: u64 = 0;
592             let mut region_type = PciBarRegionType::Memory32BitRegion;
593             let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
594             let mut flags: u32 = 0;
595 
596             let mut restored_bar_addr = None;
597             if let Some(resources) = &resources {
598                 for resource in resources {
599                     if let Resource::PciBar {
600                         index,
601                         base,
602                         size,
603                         type_,
604                         ..
605                     } = resource
606                     {
607                         if *index == bar_id as usize {
608                             restored_bar_addr = Some(GuestAddress(*base));
609                             region_size = *size;
610                             region_type = PciBarRegionType::from(*type_);
611                             break;
612                         }
613                     }
614                 }
615                 if restored_bar_addr.is_none() {
616                     bar_id += 1;
617                     continue;
618                 }
619             } else {
620                 let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
621                     (PCI_ROM_EXP_BAR_INDEX * 4) as u32
622                 } else {
623                     PCI_CONFIG_BAR_OFFSET + bar_id * 4
624                 };
625 
626                 // First read flags
627                 flags = self.vfio_wrapper.read_config_dword(bar_offset);
628 
629                 // Is this an IO BAR?
630                 let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
631                     matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
632                 } else {
633                     false
634                 };
635 
636                 // Is this a 64-bit BAR?
637                 let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
638                     matches!(
639                         flags & PCI_CONFIG_MEMORY_BAR_64BIT,
640                         PCI_CONFIG_MEMORY_BAR_64BIT
641                     )
642                 } else {
643                     false
644                 };
645 
646                 if matches!(
647                     flags & PCI_CONFIG_BAR_PREFETCHABLE,
648                     PCI_CONFIG_BAR_PREFETCHABLE
649                 ) {
650                     prefetchable = PciBarPrefetchable::Prefetchable
651                 };
652 
653                 // To get the size, write all 1s
654                 self.vfio_wrapper
655                     .write_config_dword(bar_offset, 0xffff_ffff);
656 
657                 // And read back the BAR value. The device returns zeros for the bits it doesn't care about
658                 let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);
659 
660                 if io_bar {
661                     // Mask flag bits (lowest 2 for I/O bars)
662                     lower &= !0b11;
663 
664                     // BAR is not enabled
665                     if lower == 0 {
666                         bar_id += 1;
667                         continue;
668                     }
669 
670                     // IO BAR
671                     region_type = PciBarRegionType::IoRegion;
672 
673                     // Invert bits and add 1 to calculate size
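                    // e.g. a masked read-back of 0xffff_ff00 gives
                    // !0xffff_ff00 + 1 = 0x100, i.e. a 256-byte I/O BAR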
674                     region_size = (!lower + 1) as u64;
675                 } else if is_64bit_bar {
676                     // 64 bits Memory BAR
677                     region_type = PciBarRegionType::Memory64BitRegion;
678 
679                     // Query size of upper BAR of 64-bit BAR
680                     let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
681                     self.vfio_wrapper
682                         .write_config_dword(upper_offset, 0xffff_ffff);
683                     let upper = self.vfio_wrapper.read_config_dword(upper_offset);
684 
685                     let mut combined_size = u64::from(upper) << 32 | u64::from(lower);
686 
687                     // Mask out flag bits (lowest 4 for memory bars)
688                     combined_size &= !0b1111;
689 
690                     // BAR is not enabled
691                     if combined_size == 0 {
692                         bar_id += 1;
693                         continue;
694                     }
695 
696                     // Invert and add 1 to find the size
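                    // e.g. upper = 0xffff_ffff and masked lower = 0xf000_0000
                    // combine into 0xffff_ffff_f000_0000, giving
                    // !0xffff_ffff_f000_0000 + 1 = 0x1000_0000 (256 MiB)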
697                     region_size = !combined_size + 1;
698                 } else {
699                     region_type = PciBarRegionType::Memory32BitRegion;
700 
701                     // Mask out flag bits (lowest 4 for memory bars)
702                     lower &= !0b1111;
703 
704                     if lower == 0 {
705                         bar_id += 1;
706                         continue;
707                     }
708 
709                     // Invert and add 1 to find the size
710                     region_size = (!lower + 1) as u64;
711                 }
712             }
713 
714             let bar_addr = match region_type {
715                 PciBarRegionType::IoRegion => {
716                     #[cfg(not(target_arch = "x86_64"))]
717                     unimplemented!();
718 
719                     // The address needs to be 4 bytes aligned.
720                     #[cfg(target_arch = "x86_64")]
721                     allocator
722                         .lock()
723                         .unwrap()
724                         .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
725                         .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
726                 }
727                 PciBarRegionType::Memory32BitRegion => {
728                     // BAR allocation must be naturally aligned
729                     mmio32_allocator
730                         .allocate(restored_bar_addr, region_size, Some(region_size))
731                         .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
732                 }
733                 PciBarRegionType::Memory64BitRegion => {
734                     // We need to do some fixup to keep the MMIO RW region and the
735                     // MSI-X cap region page size aligned.
736                     region_size = self.fixup_msix_region(bar_id, region_size);
737                     mmio64_allocator
738                         .allocate(
739                             restored_bar_addr,
740                             region_size,
741                             Some(std::cmp::max(
742                                 // SAFETY: FFI call. Trivially safe.
743                                 unsafe { sysconf(_SC_PAGESIZE) as GuestUsize },
744                                 region_size,
745                             )),
746                         )
747                         .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
748                 }
749             };
750 
751             // We can now build our BAR configuration block.
752             let bar = PciBarConfiguration::default()
753                 .set_index(bar_id as usize)
754                 .set_address(bar_addr.raw_value())
755                 .set_size(region_size)
756                 .set_region_type(region_type)
757                 .set_prefetchable(prefetchable);
758 
759             if bar_id == VFIO_PCI_ROM_REGION_INDEX {
760                 self.configuration
761                     .add_pci_rom_bar(&bar, flags & 0x1)
762                     .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
763             } else {
764                 self.configuration
765                     .add_pci_bar(&bar)
766                     .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
767             }
768 
769             bars.push(bar);
770             self.mmio_regions.push(MmioRegion {
771                 start: bar_addr,
772                 length: region_size,
773                 type_: region_type,
774                 index: bar_id,
775                 user_memory_regions: Vec::new(),
776             });
777 
778             bar_id += 1;
779             if region_type == PciBarRegionType::Memory64BitRegion {
780                 bar_id += 1;
781             }
782         }
783 
784         Ok(bars)
785     }
786 
787     // The `allocator` argument is unused on `aarch64`
788     #[allow(unused_variables)]
789     pub(crate) fn free_bars(
790         &mut self,
791         allocator: &mut SystemAllocator,
792         mmio32_allocator: &mut AddressAllocator,
793         mmio64_allocator: &mut AddressAllocator,
794     ) -> Result<(), PciDeviceError> {
795         for region in self.mmio_regions.iter() {
796             match region.type_ {
797                 PciBarRegionType::IoRegion => {
798                     #[cfg(target_arch = "x86_64")]
799                     allocator.free_io_addresses(region.start, region.length);
800                     #[cfg(not(target_arch = "x86_64"))]
801                     error!("I/O region is not supported");
802                 }
803                 PciBarRegionType::Memory32BitRegion => {
804                     mmio32_allocator.free(region.start, region.length);
805                 }
806                 PciBarRegionType::Memory64BitRegion => {
807                     mmio64_allocator.free(region.start, region.length);
808                 }
809             }
810         }
811         Ok(())
812     }
813 
814     pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
815         let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());
816 
817         let table = self.vfio_wrapper.read_config_dword((cap + 4).into());
818 
819         let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());
820 
821         MsixCap {
822             msg_ctl,
823             table,
824             pba,
825         }
826     }
827 
828     pub(crate) fn initialize_msix(
829         &mut self,
830         msix_cap: MsixCap,
831         cap_offset: u32,
832         bdf: PciBdf,
833         state: Option<MsixConfigState>,
834     ) {
835         let interrupt_source_group = self
836             .msi_interrupt_manager
837             .create_group(MsiIrqGroupConfig {
838                 base: 0,
839                 count: msix_cap.table_size() as InterruptIndex,
840             })
841             .unwrap();
842 
843         let msix_config = MsixConfig::new(
844             msix_cap.table_size(),
845             interrupt_source_group.clone(),
846             bdf.into(),
847             state,
848         )
849         .unwrap();
850 
851         self.interrupt.msix = Some(VfioMsix {
852             bar: msix_config,
853             cap: msix_cap,
854             cap_offset,
855             interrupt_source_group,
856         });
857     }
858 
859     pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
860         self.vfio_wrapper.read_config_word((cap + 2).into())
861     }
862 
863     pub(crate) fn initialize_msi(
864         &mut self,
865         msg_ctl: u16,
866         cap_offset: u32,
867         state: Option<MsiConfigState>,
868     ) {
869         let interrupt_source_group = self
870             .msi_interrupt_manager
871             .create_group(MsiIrqGroupConfig {
872                 base: 0,
873                 count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
874             })
875             .unwrap();
876 
877         let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap();
878 
879         self.interrupt.msi = Some(VfioMsi {
880             cfg: msi_config,
881             cap_offset,
882             interrupt_source_group,
883         });
884     }
885 
886     pub(crate) fn get_msix_cap_idx(&self) -> Option<usize> {
887         let mut cap_next = self
888             .vfio_wrapper
889             .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);
890 
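        // Each standard capability starts with a 1-byte capability ID followed by
        // a 1-byte pointer to the next capability in the list (0 ends the list).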
891         while cap_next != 0 {
892             let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());
893             if PciCapabilityId::from(cap_id) == PciCapabilityId::MsiX {
894                 return Some(cap_next as usize);
895             } else {
896                 cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
897             }
898         }
899 
900         None
901     }
902 
903     pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) {
904         let mut cap_iter = self
905             .vfio_wrapper
906             .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);
907 
908         let mut pci_express_cap_found = false;
909         let mut power_management_cap_found = false;
910 
911         while cap_iter != 0 {
912             let cap_id = self.vfio_wrapper.read_config_byte(cap_iter.into());
913 
914             match PciCapabilityId::from(cap_id) {
915                 PciCapabilityId::MessageSignalledInterrupts => {
916                     if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
917                         if irq_info.count > 0 {
918                             // Parse capability only if the VFIO device
919                             // supports MSI.
920                             let msg_ctl = self.parse_msi_capabilities(cap_iter);
921                             self.initialize_msi(msg_ctl, cap_iter as u32, None);
922                         }
923                     }
924                 }
925                 PciCapabilityId::MsiX => {
926                     if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
927                     {
928                         if irq_info.count > 0 {
929                             // Parse capability only if the VFIO device
930                             // supports MSI-X.
931                             let msix_cap = self.parse_msix_capabilities(cap_iter);
932                             self.initialize_msix(msix_cap, cap_iter as u32, bdf, None);
933                         }
934                     }
935                 }
936                 PciCapabilityId::PciExpress => pci_express_cap_found = true,
937                 PciCapabilityId::PowerManagement => power_management_cap_found = true,
938                 _ => {}
939             };
940 
941             let cap_next = self.vfio_wrapper.read_config_byte((cap_iter + 1).into());
942             if cap_next == 0 {
943                 break;
944             }
945 
946             cap_iter = cap_next;
947         }
948 
949         if let Some(clique_id) = self.x_nv_gpudirect_clique {
950             self.add_nv_gpudirect_clique_cap(cap_iter, clique_id);
951         }
952 
953         if pci_express_cap_found && power_management_cap_found {
954             self.parse_extended_capabilities();
955         }
956     }
957 
958     fn add_nv_gpudirect_clique_cap(&mut self, cap_iter: u8, clique_id: u8) {
959         // Turing, Ampere, Hopper, and Lovelace GPUs have dedicated space
960         // at 0xD4 for this capability.
961         let cap_offset = 0xd4u32;
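        // The first patch below rewrites the next-capability pointer (byte 1) of
        // the last capability in the standard chain so that it points at 0xD4.
        // The second one writes the vendor-specific capability header there:
        // ID 0x09 (vendor specific), next pointer 0x00 (end of chain) and
        // length 0x08, followed by the clique ID encoding.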
962 
963         let reg_idx = (cap_iter / 4) as usize;
964         self.patches.insert(
965             reg_idx,
966             ConfigPatch {
967                 mask: 0x0000_ff00,
968                 patch: cap_offset << 8,
969             },
970         );
971 
972         let reg_idx = (cap_offset / 4) as usize;
973         self.patches.insert(
974             reg_idx,
975             ConfigPatch {
976                 mask: 0xffff_ffff,
977                 patch: 0x50080009u32,
978             },
979         );
980         self.patches.insert(
981             reg_idx + 1,
982             ConfigPatch {
983                 mask: 0xffff_ffff,
984                 patch: u32::from(clique_id) << 19 | 0x5032,
985             },
986         );
987     }
988 
989     fn parse_extended_capabilities(&mut self) {
990         let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;
991 
992         loop {
993             let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);
994 
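            // PCIe extended capability header layout: bits [15:0] hold the
            // capability ID, [19:16] the version and [31:20] the offset of the
            // next capability (0 terminates the list).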
995             let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
996             let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;
997 
998             match PciExpressCapabilityId::from(cap_id) {
999                 PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation
1000                 | PciExpressCapabilityId::ResizeableBar
1001                 | PciExpressCapabilityId::SingleRootIoVirtualization => {
1002                     let reg_idx = (current_offset / 4) as usize;
1003                     self.patches.insert(
1004                         reg_idx,
1005                         ConfigPatch {
1006                             mask: 0x0000_ffff,
1007                             patch: PciExpressCapabilityId::NullCapability as u32,
1008                         },
1009                     );
1010                 }
1011                 _ => {}
1012             }
1013 
1014             if cap_next == 0 {
1015                 break;
1016             }
1017 
1018             current_offset = cap_next.into();
1019         }
1020     }
1021 
1022     pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
1023         if let Some(intx) = &mut self.interrupt.intx {
1024             if !intx.enabled {
1025                 if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
1026                     self.vfio_wrapper
1027                         .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
1028                         .map_err(VfioPciError::EnableIntx)?;
1029 
1030                     intx.enabled = true;
1031                 } else {
1032                     return Err(VfioPciError::MissingNotifier);
1033                 }
1034             }
1035         }
1036 
1037         Ok(())
1038     }
1039 
1040     pub(crate) fn disable_intx(&mut self) {
1041         if let Some(intx) = &mut self.interrupt.intx {
1042             if intx.enabled {
1043                 if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
1044                     error!("Could not disable INTx: {}", e);
1045                 } else {
1046                     intx.enabled = false;
1047                 }
1048             }
1049         }
1050     }
1051 
1052     pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
1053         if let Some(msi) = &self.interrupt.msi {
1054             let mut irq_fds: Vec<EventFd> = Vec::new();
1055             for i in 0..msi.cfg.num_enabled_vectors() {
1056                 if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
1057                     irq_fds.push(eventfd);
1058                 } else {
1059                     return Err(VfioPciError::MissingNotifier);
1060                 }
1061             }
1062 
1063             self.vfio_wrapper
1064                 .enable_msi(irq_fds.iter().collect())
1065                 .map_err(VfioPciError::EnableMsi)?;
1066         }
1067 
1068         Ok(())
1069     }
1070 
1071     pub(crate) fn disable_msi(&self) {
1072         if let Err(e) = self.vfio_wrapper.disable_msi() {
1073             error!("Could not disable MSI: {}", e);
1074         }
1075     }
1076 
1077     pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
1078         if let Some(msix) = &self.interrupt.msix {
1079             let mut irq_fds: Vec<EventFd> = Vec::new();
1080             for i in 0..msix.bar.table_entries.len() {
1081                 if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
1082                     irq_fds.push(eventfd);
1083                 } else {
1084                     return Err(VfioPciError::MissingNotifier);
1085                 }
1086             }
1087 
1088             self.vfio_wrapper
1089                 .enable_msix(irq_fds.iter().collect())
1090                 .map_err(VfioPciError::EnableMsix)?;
1091         }
1092 
1093         Ok(())
1094     }
1095 
1096     pub(crate) fn disable_msix(&self) {
1097         if let Err(e) = self.vfio_wrapper.disable_msix() {
1098             error!("Could not disable MSI-X: {}", e);
1099         }
1100     }
1101 
1102     pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
1103         if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
1104             if irq_info.count == 0 {
1105                 // A count of 0 means the INTx IRQ is not supported, therefore
1106                 // it shouldn't be initialized.
1107                 return Ok(());
1108             }
1109         }
1110 
1111         if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
1112             self.interrupt.intx = Some(VfioIntx {
1113                 interrupt_source_group,
1114                 enabled: false,
1115             });
1116 
1117             self.enable_intx()?;
1118         }
1119 
1120         Ok(())
1121     }
1122 
1123     pub(crate) fn update_msi_capabilities(
1124         &mut self,
1125         offset: u64,
1126         data: &[u8],
1127     ) -> Result<(), VfioPciError> {
1128         match self.interrupt.update_msi(offset, data) {
1129             Some(InterruptUpdateAction::EnableMsi) => {
1130                 // Disable INTx before we can enable MSI
1131                 self.disable_intx();
1132                 self.enable_msi()?;
1133             }
1134             Some(InterruptUpdateAction::DisableMsi) => {
1135                 // Fall back onto INTx when disabling MSI
1136                 self.disable_msi();
1137                 self.enable_intx()?;
1138             }
1139             _ => {}
1140         }
1141 
1142         Ok(())
1143     }
1144 
1145     pub(crate) fn update_msix_capabilities(
1146         &mut self,
1147         offset: u64,
1148         data: &[u8],
1149     ) -> Result<(), VfioPciError> {
1150         match self.interrupt.update_msix(offset, data) {
1151             Some(InterruptUpdateAction::EnableMsix) => {
1152                 // Disable INTx before we can enable MSI-X
1153                 self.disable_intx();
1154                 self.enable_msix()?;
1155             }
1156             Some(InterruptUpdateAction::DisableMsix) => {
1157                 // Fall back onto INTx when disabling MSI-X
1158                 self.disable_msix();
1159                 self.enable_intx()?;
1160             }
1161             _ => {}
1162         }
1163 
1164         Ok(())
1165     }
1166 
1167     pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
1168         for region in self.mmio_regions.iter() {
1169             if addr >= region.start.raw_value()
1170                 && addr < region.start.unchecked_add(region.length).raw_value()
1171             {
1172                 return Some(region.clone());
1173             }
1174         }
1175         None
1176     }
1177 
1178     pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1179         let addr = base + offset;
1180         if let Some(region) = self.find_region(addr) {
1181             let offset = addr - region.start.raw_value();
1182 
1183             if self.interrupt.msix_table_accessed(region.index, offset) {
1184                 self.interrupt.msix_read_table(offset, data);
1185             } else {
1186                 self.vfio_wrapper.region_read(region.index, offset, data);
1187             }
1188         }
1189 
1190         // INTx EOI
1191         // The guest reading from the BAR potentially means the interrupt has
1192         // been received and can be acknowledged.
1193         if self.interrupt.intx_in_use() {
1194             if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
1195                 error!("Failed unmasking INTx IRQ: {}", e);
1196             }
1197         }
1198     }
1199 
1200     pub(crate) fn write_bar(
1201         &mut self,
1202         base: u64,
1203         offset: u64,
1204         data: &[u8],
1205     ) -> Option<Arc<Barrier>> {
1206         let addr = base + offset;
1207         if let Some(region) = self.find_region(addr) {
1208             let offset = addr - region.start.raw_value();
1209 
1210             // If the MSI-X table is written to, we need to update our cache.
1211             if self.interrupt.msix_table_accessed(region.index, offset) {
1212                 self.interrupt.msix_write_table(offset, data);
1213             } else {
1214                 self.vfio_wrapper.region_write(region.index, offset, data);
1215             }
1216         }
1217 
1218         // INTx EOI
1219         // The guest writing to the BAR potentially means the interrupt has
1220         // been received and can be acknowledged.
1221         if self.interrupt.intx_in_use() {
1222             if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
1223                 error!("Failed unmasking INTx IRQ: {}", e);
1224             }
1225         }
1226 
1227         None
1228     }
1229 
1230     pub(crate) fn write_config_register(
1231         &mut self,
1232         reg_idx: usize,
1233         offset: u64,
1234         data: &[u8],
1235     ) -> Option<Arc<Barrier>> {
1236         // When the guest wants to write to a BAR, we trap it into
1237         // our local configuration space. We're not reprogramming
1238         // the VFIO device.
1239         if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
1240             || reg_idx == PCI_ROM_EXP_BAR_INDEX
1241         {
1242             // We keep our local cache updated with the BARs.
1243             // We'll read it back from there when the guest is asking
1244             // for BARs (see read_config_register()).
1245             self.configuration
1246                 .write_config_register(reg_idx, offset, data);
1247             return None;
1248         }
1249 
1250         let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;
1251 
1252         // If the MSI or MSI-X capabilities are accessed, we need to
1253         // update our local cache accordingly.
1254         // Depending on how the capabilities are modified, this could
1255         // trigger a VFIO MSI or MSI-X toggle.
1256         if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
1257             let cap_offset: u64 = reg - cap_base + offset;
1258             match cap_id {
1259                 PciCapabilityId::MessageSignalledInterrupts => {
1260                     if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
1261                         error!("Could not update MSI capabilities: {}", e);
1262                     }
1263                 }
1264                 PciCapabilityId::MsiX => {
1265                     if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
1266                         error!("Could not update MSI-X capabilities: {}", e);
1267                     }
1268                 }
1269                 _ => {}
1270             }
1271         }
1272 
1273         // Make sure to write to the device's PCI config space after MSI/MSI-X
1274         // interrupts have been enabled/disabled. In case of MSI, when the
1275         // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
1276         // the MSI Enable bit in the MSI capability structure found in the PCI
1277         // config space is disabled by default. That's why when the guest is
1278         // enabling this bit, we first need to enable the MSI interrupts with
1279         // VFIO through the VFIO_DEVICE_SET_IRQS ioctl, and only then can we
1280         // write to the device region to update the MSI Enable bit.
1281         self.vfio_wrapper.write_config((reg + offset) as u32, data);
1282 
1283         None
1284     }
1285 
1286     pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
1287         // When reading the BARs, we trap it and return what comes
1288         // from our local configuration space. We want the guest to
1289         // use that and not the VFIO device BARs, as those do not map
1290         // into the guest address space.
1291         if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
1292             || reg_idx == PCI_ROM_EXP_BAR_INDEX
1293         {
1294             return self.configuration.read_reg(reg_idx);
1295         }
1296 
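        // Return the MSI-X table and PBA registers from our cached copy, since
        // fixup_msix_region() may have relocated them within the BAR.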
1297         if let Some(id) = self.get_msix_cap_idx() {
1298             let msix = self.interrupt.msix.as_mut().unwrap();
1299             if reg_idx * 4 == id + 4 {
1300                 return msix.cap.table;
1301             } else if reg_idx * 4 == id + 8 {
1302                 return msix.cap.pba;
1303             }
1304         }
1305 
1306         // Since we don't support passthrough of multi-function devices, we
1307         // should mask the multi-function bit, bit 7 of the Header Type byte
1308         // in register 3.
1309         let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
1310             0xff7f_ffff
1311         } else {
1312             0xffff_ffff
1313         };
1314 
1315         // The config register read comes from the VFIO device itself.
1316         let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;
1317 
1318         if let Some(config_patch) = self.patches.get(&reg_idx) {
1319             value = (value & !config_patch.mask) | config_patch.patch;
1320         }
1321 
1322         value
1323     }
1324 
1325     fn state(&self) -> VfioCommonState {
1326         let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
1327             enabled: intx.enabled,
1328         });
1329 
1330         let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
1331             cap: msi.cfg.cap,
1332             cap_offset: msi.cap_offset,
1333         });
1334 
1335         let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
1336             cap: msix.cap,
1337             cap_offset: msix.cap_offset,
1338             bdf: msix.bar.devid,
1339         });
1340 
1341         VfioCommonState {
1342             intx_state,
1343             msi_state,
1344             msix_state,
1345         }
1346     }
1347 
1348     fn set_state(
1349         &mut self,
1350         state: &VfioCommonState,
1351         msi_state: Option<MsiConfigState>,
1352         msix_state: Option<MsixConfigState>,
1353     ) -> Result<(), VfioPciError> {
1354         if let (Some(intx), Some(interrupt_source_group)) =
1355             (&state.intx_state, self.legacy_interrupt_group.clone())
1356         {
1357             self.interrupt.intx = Some(VfioIntx {
1358                 interrupt_source_group,
1359                 enabled: false,
1360             });
1361 
1362             if intx.enabled {
1363                 self.enable_intx()?;
1364             }
1365         }
1366 
1367         if let Some(msi) = &state.msi_state {
1368             self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state);
1369         }
1370 
1371         if let Some(msix) = &state.msix_state {
1372             self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state);
1373         }
1374 
1375         Ok(())
1376     }
1377 }
1378 
1379 impl Pausable for VfioCommon {}
1380 
1381 impl Snapshottable for VfioCommon {
1382     fn id(&self) -> String {
1383         String::from(VFIO_COMMON_ID)
1384     }
1385 
1386     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
1387         let mut vfio_common_snapshot = Snapshot::new_from_state(&self.state())?;
1388 
1389         // Snapshot PciConfiguration
1390         vfio_common_snapshot.add_snapshot(self.configuration.id(), self.configuration.snapshot()?);
1391 
1392         // Snapshot MSI
1393         if let Some(msi) = &mut self.interrupt.msi {
1394             vfio_common_snapshot.add_snapshot(msi.cfg.id(), msi.cfg.snapshot()?);
1395         }
1396 
1397         // Snapshot MSI-X
1398         if let Some(msix) = &mut self.interrupt.msix {
1399             vfio_common_snapshot.add_snapshot(msix.bar.id(), msix.bar.snapshot()?);
1400         }
1401 
1402         Ok(vfio_common_snapshot)
1403     }
1404 }
1405 
1406 /// VfioPciDevice represents a VFIO PCI device.
1407 /// This structure implements the BusDevice and PciDevice traits.
1408 ///
1409 /// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
1410 /// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
1411 /// which then gets added to the PCI bus.
1412 pub struct VfioPciDevice {
1413     id: String,
1414     vm: Arc<dyn hypervisor::Vm>,
1415     device: Arc<VfioDevice>,
1416     container: Arc<VfioContainer>,
1417     common: VfioCommon,
1418     iommu_attached: bool,
1419     memory_slot_allocator: MemorySlotAllocator,
1420 }
1421 
1422 impl VfioPciDevice {
1423     /// Constructs a new Vfio Pci device for the given Vfio device
1424     #[allow(clippy::too_many_arguments)]
1425     pub fn new(
1426         id: String,
1427         vm: &Arc<dyn hypervisor::Vm>,
1428         device: VfioDevice,
1429         container: Arc<VfioContainer>,
1430         msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
1431         legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
1432         iommu_attached: bool,
1433         bdf: PciBdf,
1434         memory_slot_allocator: MemorySlotAllocator,
1435         snapshot: Option<Snapshot>,
1436         x_nv_gpudirect_clique: Option<u8>,
1437     ) -> Result<Self, VfioPciError> {
1438         let device = Arc::new(device);
1439         device.reset();
1440 
1441         let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));
1442 
1443         let common = VfioCommon::new(
1444             msi_interrupt_manager,
1445             legacy_interrupt_group,
1446             Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
1447             &PciVfioSubclass::VfioSubclass,
1448             bdf,
1449             vm_migration::snapshot_from_id(snapshot.as_ref(), VFIO_COMMON_ID),
1450             x_nv_gpudirect_clique,
1451         )?;
1452 
1453         let vfio_pci_device = VfioPciDevice {
1454             id,
1455             vm: vm.clone(),
1456             device,
1457             container,
1458             common,
1459             iommu_attached,
1460             memory_slot_allocator,
1461         };
1462 
1463         Ok(vfio_pci_device)
1464     }
1465 
1466     pub fn iommu_attached(&self) -> bool {
1467         self.iommu_attached
1468     }
1469 
1470     fn generate_sparse_areas(
1471         caps: &[VfioRegionInfoCap],
1472         region_index: u32,
1473         region_start: u64,
1474         region_size: u64,
1475         vfio_msix: Option<&VfioMsix>,
1476     ) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
1477         for cap in caps {
1478             match cap {
1479                 VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
1480                 VfioRegionInfoCap::MsixMappable => {
1481                     if !is_4k_aligned(region_start) {
1482                         error!(
1483                             "Region start address 0x{:x} must be at least aligned on 4KiB",
1484                             region_start
1485                         );
1486                         return Err(VfioPciError::RegionAlignment);
1487                     }
1488                     if !is_4k_multiple(region_size) {
1489                         error!(
1490                             "Region size 0x{:x} must be at least a multiple of 4KiB",
1491                             region_size
1492                         );
1493                         return Err(VfioPciError::RegionSize);
1494                     }
1495 
1496                     // In case the region contains the MSI-X vectors table or
1497                     // the MSI-X PBA table, we must calculate the subregions
1498                     // around them, leading to a list of sparse areas.
1499                     // We want to make sure we will still trap MMIO accesses
1500                     // to these MSI-X specific ranges. If these ranges are not
1501                     // page size aligned, we enlarge them until they are.
1502                     //
1503                     // A BTreeMap is used so that iterating the ranges yields
1504                     // them sorted by offset, ensuring a proper split of the whole region.
1505                     let mut inter_ranges = BTreeMap::new();
1506                     if let Some(msix) = vfio_msix {
1507                         if region_index == msix.cap.table_bir() {
1508                             let (offset, size) = msix.cap.table_range();
1509                             let offset = align_page_size_down(offset);
1510                             let size = align_page_size_up(size);
1511                             inter_ranges.insert(offset, size);
1512                         }
1513                         if region_index == msix.cap.pba_bir() {
1514                             let (offset, size) = msix.cap.pba_range();
1515                             let offset = align_page_size_down(offset);
1516                             let size = align_page_size_up(size);
1517                             inter_ranges.insert(offset, size);
1518                         }
1519                     }
1520 
1521                     let mut sparse_areas = Vec::new();
1522                     let mut current_offset = 0;
1523                     for (range_offset, range_size) in inter_ranges {
1524                         if range_offset > current_offset {
1525                             sparse_areas.push(VfioRegionSparseMmapArea {
1526                                 offset: current_offset,
1527                                 size: range_offset - current_offset,
1528                             });
1529                         }
1530                         current_offset = align_page_size_down(range_offset + range_size);
1531                     }
1532 
1533                     if region_size > current_offset {
1534                         sparse_areas.push(VfioRegionSparseMmapArea {
1535                             offset: current_offset,
1536                             size: region_size - current_offset,
1537                         });
1538                     }
1539 
1540                     return Ok(sparse_areas);
1541                 }
1542                 _ => {}
1543             }
1544         }
1545 
1546         // In case no relevant capabilities have been found, create a single
1547         // sparse area corresponding to the entire MMIO region.
1548         Ok(vec![VfioRegionSparseMmapArea {
1549             offset: 0,
1550             size: region_size,
1551         }])
1552     }
1553 
1554     /// Map MMIO regions into the guest, and avoid VM exits when the guest tries
1555     /// to reach those regions.
1556     ///
1557     /// Each mmap-able sparse area of a region is memory-mapped from the VFIO
1558     /// device file descriptor, registered with the hypervisor as a user memory
1559     /// region (using `self.vm` and a slot from the memory slot allocator), and,
1560     /// unless the device is attached to a virtual IOMMU, DMA mapped into the
1561     /// VFIO container.
1562     pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
1563         let fd = self.device.as_raw_fd();
1564 
1565         for region in self.common.mmio_regions.iter_mut() {
1566             let region_flags = self.device.get_region_flags(region.index);
1567             if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
1568                 let mut prot = 0;
1569                 if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
1570                     prot |= libc::PROT_READ;
1571                 }
1572                 if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
1573                     prot |= libc::PROT_WRITE;
1574                 }
1575 
1576                 // Retrieve the list of capabilities found on the region
1577                 let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
1578                     self.device.get_region_caps(region.index)
1579                 } else {
1580                     Vec::new()
1581                 };
1582 
1583                 // Don't try to mmap the region if it contains the MSI-X table
1584                 // or the MSI-X PBA subregion and the MSIX_MAPPABLE capability
1585                 // was not found in the list of region capabilities.
1586                 if let Some(msix) = self.common.interrupt.msix.as_ref() {
1587                     if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
1588                         && !caps.contains(&VfioRegionInfoCap::MsixMappable)
1589                     {
1590                         continue;
1591                     }
1592                 }
1593 
1594                 let mmap_size = self.device.get_region_size(region.index);
1595                 let mmap_offset = self.device.get_region_offset(region.index);
1596 
1597                 let sparse_areas = Self::generate_sparse_areas(
1598                     &caps,
1599                     region.index,
1600                     region.start.0,
1601                     mmap_size,
1602                     self.common.interrupt.msix.as_ref(),
1603                 )?;
1604 
1605                 for area in sparse_areas.iter() {
1606                     // SAFETY: FFI call with correct arguments
1607                     let host_addr = unsafe {
1608                         libc::mmap(
1609                             null_mut(),
1610                             area.size as usize,
1611                             prot,
1612                             libc::MAP_SHARED,
1613                             fd,
1614                             mmap_offset as libc::off_t + area.offset as libc::off_t,
1615                         )
1616                     };
1617 
1618                     if host_addr == libc::MAP_FAILED {
1619                         error!(
1620                             "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
1621                             area.offset,
1622                             area.size,
1623                             std::io::Error::last_os_error()
1624                         );
1625                         return Err(VfioPciError::MmapArea);
1626                     }
1627 
1628                     if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) {
1629                         warn!(
1630                             "Cannot register sparse area that is not page size aligned (offset = 0x{:x}, size = 0x{:x})",
1631                             area.offset,
1632                             area.size,
1633                         );
1634                         return Ok(());
1635                     }
1636 
1637                     let user_memory_region = UserMemoryRegion {
1638                         slot: self.memory_slot_allocator.next_memory_slot(),
1639                         start: region.start.0 + area.offset,
1640                         size: area.size,
1641                         host_addr: host_addr as u64,
1642                     };
1643 
1644                     region.user_memory_regions.push(user_memory_region);
1645 
1646                     let mem_region = self.vm.make_user_memory_region(
1647                         user_memory_region.slot,
1648                         user_memory_region.start,
1649                         user_memory_region.size,
1650                         user_memory_region.host_addr,
1651                         false,
1652                         false,
1653                     );
1654 
1655                     self.vm
1656                         .create_user_memory_region(mem_region)
1657                         .map_err(VfioPciError::CreateUserMemoryRegion)?;
1658 
1659                     if !self.iommu_attached {
1660                         self.container
1661                             .vfio_dma_map(
1662                                 user_memory_region.start,
1663                                 user_memory_region.size,
1664                                 user_memory_region.host_addr,
1665                             )
1666                             .map_err(VfioPciError::DmaMap)?;
1667                     }
1668                 }
1669             }
1670         }
1671 
1672         Ok(())
1673     }
1674 
1675     pub fn unmap_mmio_regions(&mut self) {
1676         for region in self.common.mmio_regions.iter() {
1677             for user_memory_region in region.user_memory_regions.iter() {
1678                 // Unmap from vfio container
1679                 if !self.iommu_attached {
1680                     if let Err(e) = self
1681                         .container
1682                         .vfio_dma_unmap(user_memory_region.start, user_memory_region.size)
1683                     {
1684                         error!("Could not unmap mmio region from vfio container: {}", e);
1685                     }
1686                 }
1687 
1688                 // Remove region
1689                 let r = self.vm.make_user_memory_region(
1690                     user_memory_region.slot,
1691                     user_memory_region.start,
1692                     user_memory_region.size,
1693                     user_memory_region.host_addr,
1694                     false,
1695                     false,
1696                 );
1697 
1698                 if let Err(e) = self.vm.remove_user_memory_region(r) {
1699                     error!("Could not remove the userspace memory region: {}", e);
1700                 }
1701 
1702                 self.memory_slot_allocator
1703                     .free_memory_slot(user_memory_region.slot);
1704 
1705                 // SAFETY: FFI call with correct arguments
1706                 let ret = unsafe {
1707                     libc::munmap(
1708                         user_memory_region.host_addr as *mut libc::c_void,
1709                         user_memory_region.size as usize,
1710                     )
1711                 };
1712                 if ret != 0 {
1713                     error!(
1714                         "Could not unmap region {}, error:{}",
1715                         region.index,
1716                         io::Error::last_os_error()
1717                     );
1718                 }
1719             }
1720         }
1721     }
1722 
1723     pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
1724         if !self.iommu_attached {
1725             self.container
1726                 .vfio_dma_map(iova, size, user_addr)
1727                 .map_err(VfioPciError::DmaMap)?;
1728         }
1729 
1730         Ok(())
1731     }
1732 
1733     pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
1734         if !self.iommu_attached {
1735             self.container
1736                 .vfio_dma_unmap(iova, size)
1737                 .map_err(VfioPciError::DmaUnmap)?;
1738         }
1739 
1740         Ok(())
1741     }
1742 
1743     pub fn mmio_regions(&self) -> Vec<MmioRegion> {
1744         self.common.mmio_regions.clone()
1745     }
1746 }
1747 
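// The following is a hedged, self-contained sketch (not part of the device
// model) of the splitting performed by generate_sparse_areas() above. It
// assumes a 4 KiB page size and an already page-aligned MSI-X table range,
// so the align_page_size_{down,up} steps are no-ops and plain arithmetic
// can stand in for them.
#[cfg(test)]
mod sparse_area_sketch {
    use std::collections::BTreeMap;

    #[test]
    fn splits_region_around_msix_table() {
        let region_size: u64 = 0x4000; // hypothetical 16 KiB BAR
        let mut inter_ranges = BTreeMap::new();
        // MSI-X table assumed to occupy 0x1000..0x2000 within the BAR.
        inter_ranges.insert(0x1000u64, 0x1000u64);

        let mut sparse_areas = Vec::new();
        let mut current_offset = 0u64;
        for (range_offset, range_size) in inter_ranges {
            if range_offset > current_offset {
                // Everything below the trapped range remains mmap-able.
                sparse_areas.push((current_offset, range_offset - current_offset));
            }
            current_offset = range_offset + range_size;
        }
        if region_size > current_offset {
            // Everything above the trapped range remains mmap-able.
            sparse_areas.push((current_offset, region_size - current_offset));
        }

        // Two mmap-able areas remain; 0x1000..0x2000 stays trapped so MMIO
        // accesses to the MSI-X table still exit to the VMM.
        assert_eq!(sparse_areas, vec![(0x0, 0x1000), (0x2000, 0x2000)]);
    }
}
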
1748 impl Drop for VfioPciDevice {
1749     fn drop(&mut self) {
1750         self.unmap_mmio_regions();
1751 
1752         if let Some(msix) = &self.common.interrupt.msix {
1753             if msix.bar.enabled() {
1754                 self.common.disable_msix();
1755             }
1756         }
1757 
1758         if let Some(msi) = &self.common.interrupt.msi {
1759             if msi.cfg.enabled() {
1760                 self.common.disable_msi()
1761             }
1762         }
1763 
1764         if self.common.interrupt.intx_in_use() {
1765             self.common.disable_intx();
1766         }
1767     }
1768 }
1769 
1770 impl BusDevice for VfioPciDevice {
1771     fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1772         self.read_bar(base, offset, data)
1773     }
1774 
1775     fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
1776         self.write_bar(base, offset, data)
1777     }
1778 }
1779 
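// Hedged sketch of the mmap protection flags computed in map_mmio_regions()
// above: the VFIO region flags translate directly into PROT_READ/PROT_WRITE.
// The helper below is illustrative and not used by the device model.
#[cfg(test)]
mod mmap_prot_sketch {
    use super::*;

    fn prot_from_region_flags(region_flags: u32) -> i32 {
        let mut prot = 0;
        if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
            prot |= libc::PROT_READ;
        }
        if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
            prot |= libc::PROT_WRITE;
        }
        prot
    }

    #[test]
    fn read_write_region_maps_to_rw_protection() {
        let flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE;
        assert_eq!(
            prot_from_region_flags(flags),
            libc::PROT_READ | libc::PROT_WRITE
        );
    }
}
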
1780 // First BAR offset in the PCI config space.
1781 const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
1782 // Capability register offset in the PCI config space.
1783 const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
1784 // Extended capabilities register offset in the PCI config space.
1785 const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
1786 // I/O space BAR indicator: bit 0 of the first BAR is 1.
1787 const PCI_CONFIG_IO_BAR: u32 = 0x1;
1788 // 64-bit memory bar flag.
1789 const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
1790 // Prefetchable BAR bit
1791 const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
1792 // PCI config register size (4 bytes).
1793 const PCI_CONFIG_REGISTER_SIZE: usize = 4;
1794 // Number of BARs for a PCI device
1795 const BAR_NUMS: usize = 6;
1796 // PCI Header Type register index
1797 const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
1798 // First BAR register index
1799 const PCI_CONFIG_BAR0_INDEX: usize = 4;
1800 // PCI ROM expansion BAR register index
1801 const PCI_ROM_EXP_BAR_INDEX: usize = 12;
1802 
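// Hedged sketch of how the constants above decode the low bits of a raw BAR
// value (the raw value used here is hypothetical).
#[cfg(test)]
mod bar_flags_sketch {
    use super::*;

    #[test]
    fn decodes_a_64bit_prefetchable_memory_bar() {
        let bar: u32 = 0xfe00_000c; // memory BAR, 64-bit, prefetchable
        assert_eq!(bar & PCI_CONFIG_IO_BAR, 0); // bit 0 clear => memory BAR
        assert_ne!(bar & PCI_CONFIG_MEMORY_BAR_64BIT, 0);
        assert_ne!(bar & PCI_CONFIG_BAR_PREFETCHABLE, 0);
        // The base address is recovered by masking off the low flag bits.
        assert_eq!(bar & !0xfu32, 0xfe00_0000);
    }
}
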
1803 impl PciDevice for VfioPciDevice {
1804     fn allocate_bars(
1805         &mut self,
1806         allocator: &Arc<Mutex<SystemAllocator>>,
1807         mmio32_allocator: &mut AddressAllocator,
1808         mmio64_allocator: &mut AddressAllocator,
1809         resources: Option<Vec<Resource>>,
1810     ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
1811         self.common
1812             .allocate_bars(allocator, mmio32_allocator, mmio64_allocator, resources)
1813     }
1814 
1815     fn free_bars(
1816         &mut self,
1817         allocator: &mut SystemAllocator,
1818         mmio32_allocator: &mut AddressAllocator,
1819         mmio64_allocator: &mut AddressAllocator,
1820     ) -> Result<(), PciDeviceError> {
1821         self.common
1822             .free_bars(allocator, mmio32_allocator, mmio64_allocator)
1823     }
1824 
1825     fn write_config_register(
1826         &mut self,
1827         reg_idx: usize,
1828         offset: u64,
1829         data: &[u8],
1830     ) -> Option<Arc<Barrier>> {
1831         self.common.write_config_register(reg_idx, offset, data)
1832     }
1833 
1834     fn read_config_register(&mut self, reg_idx: usize) -> u32 {
1835         self.common.read_config_register(reg_idx)
1836     }
1837 
1838     fn detect_bar_reprogramming(
1839         &mut self,
1840         reg_idx: usize,
1841         data: &[u8],
1842     ) -> Option<BarReprogrammingParams> {
1843         self.common
1844             .configuration
1845             .detect_bar_reprogramming(reg_idx, data)
1846     }
1847 
1848     fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1849         self.common.read_bar(base, offset, data)
1850     }
1851 
1852     fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
1853         self.common.write_bar(base, offset, data)
1854     }
1855 
1856     fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
1857         for region in self.common.mmio_regions.iter_mut() {
1858             if region.start.raw_value() == old_base {
1859                 region.start = GuestAddress(new_base);
1860 
1861                 for user_memory_region in region.user_memory_regions.iter_mut() {
1862                     // Remove old region
1863                     let old_mem_region = self.vm.make_user_memory_region(
1864                         user_memory_region.slot,
1865                         user_memory_region.start,
1866                         user_memory_region.size,
1867                         user_memory_region.host_addr,
1868                         false,
1869                         false,
1870                     );
1871 
1872                     self.vm
1873                         .remove_user_memory_region(old_mem_region)
1874                         .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
1875 
1876                     // Update the user memory region with the correct start address.
1877                     if new_base > old_base {
1878                         user_memory_region.start += new_base - old_base;
1879                     } else {
1880                         user_memory_region.start -= old_base - new_base;
1881                     }
1882 
1883                     // Insert new region
1884                     let new_mem_region = self.vm.make_user_memory_region(
1885                         user_memory_region.slot,
1886                         user_memory_region.start,
1887                         user_memory_region.size,
1888                         user_memory_region.host_addr,
1889                         false,
1890                         false,
1891                     );
1892 
1893                     self.vm
1894                         .create_user_memory_region(new_mem_region)
1895                         .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
1896                 }
1897             }
1898         }
1899 
1900         Ok(())
1901     }
1902 
1903     fn as_any(&mut self) -> &mut dyn Any {
1904         self
1905     }
1906 
1907     fn id(&self) -> Option<String> {
1908         Some(self.id.clone())
1909     }
1910 }
1911 
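// Hedged sketch of the rebasing arithmetic in move_bar() above: every user
// memory region shifts by the BAR's delta, so its offset within the BAR is
// preserved (the addresses used here are hypothetical).
#[cfg(test)]
mod bar_rebase_sketch {
    #[test]
    fn rebase_preserves_offset_within_bar() {
        let (old_base, new_base) = (0x1_0000_0000u64, 0x0_8000_0000u64);
        let mut start = old_base + 0x2000; // region starts 0x2000 into the BAR
        if new_base > old_base {
            start += new_base - old_base;
        } else {
            start -= old_base - new_base;
        }
        assert_eq!(start - new_base, 0x2000);
    }
}
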
1912 impl Pausable for VfioPciDevice {}
1913 
1914 impl Snapshottable for VfioPciDevice {
1915     fn id(&self) -> String {
1916         self.id.clone()
1917     }
1918 
1919     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
1920         let mut vfio_pci_dev_snapshot = Snapshot::default();
1921 
1922         // Snapshot VfioCommon
1923         vfio_pci_dev_snapshot.add_snapshot(self.common.id(), self.common.snapshot()?);
1924 
1925         Ok(vfio_pci_dev_snapshot)
1926     }
1927 }
1928 impl Transportable for VfioPciDevice {}
1929 impl Migratable for VfioPciDevice {}
1930 
1931 /// This structure implements the ExternalDmaMapping trait. It is meant to
1932 /// be used when the caller needs to provide a way to update the mappings
1933 /// associated with a specific VFIO container.
1934 pub struct VfioDmaMapping<M: GuestAddressSpace> {
1935     container: Arc<VfioContainer>,
1936     memory: Arc<M>,
1937     mmio_regions: Arc<Mutex<Vec<MmioRegion>>>,
1938 }
1939 
1940 impl<M: GuestAddressSpace> VfioDmaMapping<M> {
1941     /// Create a VfioDmaMapping object.
1942     /// # Parameters
1943     /// * `container`: VFIO container object.
1944     /// * `memory`: guest memory used to translate GPAs into host addresses.
1945     /// * `mmio_regions`: MMIO regions used to translate GPAs not backed by guest RAM.
1946     pub fn new(
1947         container: Arc<VfioContainer>,
1948         memory: Arc<M>,
1949         mmio_regions: Arc<Mutex<Vec<MmioRegion>>>,
1950     ) -> Self {
1951         VfioDmaMapping {
1952             container,
1953             memory,
1954             mmio_regions,
1955         }
1956     }
1957 }
1958 
1959 impl<M: GuestAddressSpace + Sync + Send> ExternalDmaMapping for VfioDmaMapping<M> {
1960     fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), io::Error> {
1961         let mem = self.memory.memory();
1962         let guest_addr = GuestAddress(gpa);
1963         let user_addr = if mem.check_range(guest_addr, size as usize) {
1964             match mem.get_host_address(guest_addr) {
1965                 Ok(t) => t as u64,
1966                 Err(e) => {
1967                     return Err(io::Error::new(
1968                         io::ErrorKind::Other,
1969                         format!("unable to retrieve user address for gpa 0x{gpa:x} from guest memory region: {e}")
1970                     ));
1971                 }
1972             }
1973         } else if self.mmio_regions.lock().unwrap().check_range(gpa, size) {
1974             self.mmio_regions.lock().unwrap().find_user_address(gpa)?
1975         } else {
1976             return Err(io::Error::new(
1977                 io::ErrorKind::Other,
1978                 format!("failed to locate guest address 0x{gpa:x} in guest memory"),
1979             ));
1980         };
1981 
1982         self.container
1983             .vfio_dma_map(iova, size, user_addr)
1984             .map_err(|e| {
1985                 io::Error::new(
1986                     io::ErrorKind::Other,
1987                     format!(
1988                         "failed to map memory for VFIO container, \
1989                          iova 0x{iova:x}, gpa 0x{gpa:x}, size 0x{size:x}: {e:?}"
1990                     ),
1991                 )
1992             })
1993     }
1994 
1995     fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), io::Error> {
1996         self.container.vfio_dma_unmap(iova, size).map_err(|e| {
1997             io::Error::new(
1998                 io::ErrorKind::Other,
1999                 format!(
2000                     "failed to unmap memory for VFIO container, \
2001                      iova 0x{iova:x}, size 0x{size:x}: {e:?}"
2002                 ),
2003             )
2004         })
2005     }
2006 }
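
// Hedged usage sketch (comments only, since it requires a live VFIO
// container): how a virtual IOMMU style caller might drive the
// implementation above. `container`, `guest_memory` and `bar_regions` are
// assumed to be provided by the VMM; the identity iova == gpa mapping is
// purely illustrative.
//
//     let mapping = VfioDmaMapping::new(
//         Arc::clone(&container),
//         Arc::new(guest_memory),
//         Arc::new(Mutex::new(bar_regions)),
//     );
//     mapping.map(0x1_0000, 0x1_0000, 0x1000)?; // map one 4 KiB page
//     mapping.unmap(0x1_0000, 0x1000)?;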
2007