// xref: /cloud-hypervisor/pci/src/vfio.rs (revision d3fade85a725d36653dc4f636a1e55177eac2ddc)
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//

use crate::msi::{MsiConfigState, MSI_CONFIG_ID};
use crate::msix::MsixConfigState;
use crate::{
    msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
    PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
    PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
    PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID,
};
use anyhow::anyhow;
use byteorder::{ByteOrder, LittleEndian};
use hypervisor::HypervisorVmError;
use libc::{sysconf, _SC_PAGESIZE};
use std::any::Any;
use std::collections::{BTreeMap, HashMap};
use std::io;
use std::os::unix::io::AsRawFd;
use std::ptr::null_mut;
use std::sync::{Arc, Barrier, Mutex};
use thiserror::Error;
use versionize::{VersionMap, Versionize, VersionizeResult};
use versionize_derive::Versionize;
use vfio_bindings::bindings::vfio::*;
use vfio_ioctls::{
    VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
};
use vm_allocator::page_size::{
    align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned,
};
use vm_allocator::{AddressAllocator, SystemAllocator};
use vm_device::interrupt::{
    InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
};
use vm_device::{BusDevice, Resource};
use vm_memory::{Address, GuestAddress, GuestUsize};
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
};
use vmm_sys_util::eventfd::EventFd;

pub(crate) const VFIO_COMMON_ID: &str = "vfio_common";

#[derive(Debug, Error)]
pub enum VfioPciError {
    #[error("Failed to create user memory region: {0}")]
    CreateUserMemoryRegion(#[source] HypervisorVmError),
    #[error("Failed to DMA map: {0}")]
    DmaMap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to DMA unmap: {0}")]
    DmaUnmap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to enable INTx: {0}")]
    EnableIntx(#[source] VfioError),
    #[error("Failed to enable MSI: {0}")]
    EnableMsi(#[source] VfioError),
    #[error("Failed to enable MSI-X: {0}")]
    EnableMsix(#[source] VfioError),
    #[error("Failed to mmap the area")]
    MmapArea,
    #[error("Missing notifier's eventfd")]
    MissingNotifier,
    #[error("Invalid region alignment")]
    RegionAlignment,
    #[error("Invalid region size")]
    RegionSize,
    #[error("Failed to retrieve MsiConfigState: {0}")]
    RetrieveMsiConfigState(#[source] anyhow::Error),
    #[error("Failed to retrieve MsixConfigState: {0}")]
    RetrieveMsixConfigState(#[source] anyhow::Error),
    #[error("Failed to retrieve PciConfigurationState: {0}")]
    RetrievePciConfigurationState(#[source] anyhow::Error),
    #[error("Failed to retrieve VfioCommonState: {0}")]
    RetrieveVfioCommonState(#[source] anyhow::Error),
}

#[derive(Copy, Clone)]
enum PciVfioSubclass {
    VfioSubclass = 0xff,
}

impl PciSubclass for PciVfioSubclass {
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}

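// Derived from guest writes to the MSI/MSI-X capability structures: toggling
// the enable bit translates into enabling/disabling the corresponding VFIO
// IRQs through the VFIO_DEVICE_SET_IRQS ioctl further down.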
enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}

#[derive(Versionize)]
struct IntxState {
    enabled: bool,
}

pub(crate) struct VfioIntx {
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    enabled: bool,
}

#[derive(Versionize)]
struct MsiState {
    cap: MsiCap,
    cap_offset: u32,
}

pub(crate) struct VfioMsi {
    pub(crate) cfg: MsiConfig,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsi {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.cfg.enabled();

        self.cfg.update(offset, data);

        let new_enabled = self.cfg.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsi);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsi);
        }

        None
    }
}

#[derive(Versionize)]
struct MsixState {
    cap: MsixCap,
    cap_offset: u32,
    bdf: u32,
}

pub(crate) struct VfioMsix {
    pub(crate) bar: MsixConfig,
    cap: MsixCap,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsix {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.bar.enabled();

        // Update "Message Control" word
        if offset == 2 && data.len() == 2 {
            self.bar.set_msg_ctl(LittleEndian::read_u16(data));
        }

        let new_enabled = self.bar.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsix);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsix);
        }

        None
    }

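    // Illustrative example: with table_offset() = 0x2000 and table_size() = 16
    // entries of MSIX_TABLE_ENTRY_SIZE (16) bytes each, offsets in
    // [0x2000, 0x2100) of the BAR identified by table_bir() count as MSI-X
    // table accesses.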
    fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        let table_offset: u64 = u64::from(self.cap.table_offset());
        let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
        let table_bir: u32 = self.cap.table_bir();

        bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
    }
}

pub(crate) struct Interrupt {
    pub(crate) intx: Option<VfioIntx>,
    pub(crate) msi: Option<VfioMsi>,
    pub(crate) msix: Option<VfioMsix>,
}

impl Interrupt {
    fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msi) = &mut self.msi {
            return msi.update(offset, data);
        }

        None
    }

    fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msix) = &mut self.msix {
            return msix.update(offset, data);
        }

        None
    }

    fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
        if let Some(msi) = &self.msi {
            if offset >= u64::from(msi.cap_offset)
                && offset < u64::from(msi.cap_offset) + msi.cfg.size()
            {
                return Some((
                    PciCapabilityId::MessageSignalledInterrupts,
                    u64::from(msi.cap_offset),
                ));
            }
        }

        if let Some(msix) = &self.msix {
            if offset == u64::from(msix.cap_offset) {
                return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
            }
        }

        None
    }

    fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        if let Some(msix) = &self.msix {
            return msix.table_accessed(bar_index, offset);
        }

        false
    }

    fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
        if let Some(ref mut msix) = &mut self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.write_table(offset, data)
        }
    }

    fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
        if let Some(msix) = &self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.read_table(offset, data)
        }
    }

    pub(crate) fn intx_in_use(&self) -> bool {
        if let Some(intx) = &self.intx {
            return intx.enabled;
        }

        false
    }
}

#[derive(Copy, Clone)]
pub struct UserMemoryRegion {
    pub slot: u32,
    pub start: u64,
    pub size: u64,
    pub host_addr: u64,
}

#[derive(Clone)]
pub struct MmioRegion {
    pub start: GuestAddress,
    pub length: GuestUsize,
    pub(crate) type_: PciBarRegionType,
    pub(crate) index: u32,
    pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
}

#[derive(Debug, Error)]
pub enum VfioError {
    #[error("Kernel VFIO error: {0}")]
    KernelVfio(#[source] vfio_ioctls::VfioError),
    #[error("VFIO user error: {0}")]
    VfioUser(#[source] vfio_user::Error),
}

pub(crate) trait Vfio: Send + Sync {
    fn read_config_byte(&self, offset: u32) -> u8 {
        let mut data: [u8; 1] = [0];
        self.read_config(offset, &mut data);
        data[0]
    }

    fn read_config_word(&self, offset: u32) -> u16 {
        let mut data: [u8; 2] = [0, 0];
        self.read_config(offset, &mut data);
        u16::from_le_bytes(data)
    }

    fn read_config_dword(&self, offset: u32) -> u32 {
        let mut data: [u8; 4] = [0, 0, 0, 0];
        self.read_config(offset, &mut data);
        u32::from_le_bytes(data)
    }

    fn write_config_dword(&self, offset: u32, buf: u32) {
        let data: [u8; 4] = buf.to_le_bytes();
        self.write_config(offset, &data)
    }

    fn read_config(&self, offset: u32, data: &mut [u8]) {
        self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
    }

    fn write_config(&self, offset: u32, data: &[u8]) {
        self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
    }

    fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
    }

    fn disable_msi(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
    }

    fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
    }

    fn disable_msix(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
    }

    fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
        unimplemented!()
    }

    fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
        unimplemented!()
    }

    fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
        unimplemented!()
    }

    fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }
}
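// The unimplemented!() defaults above are overridden by concrete transports,
// such as VfioDeviceWrapper below for kernel VFIO; the VfioError::VfioUser
// variant suggests an equivalent wrapper exists for vfio-user devices.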

struct VfioDeviceWrapper {
    device: Arc<VfioDevice>,
}

impl VfioDeviceWrapper {
    fn new(device: Arc<VfioDevice>) -> Self {
        Self { device }
    }
}

impl Vfio for VfioDeviceWrapper {
    fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
        self.device.region_read(index, data, offset)
    }

    fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
        self.device.region_write(index, data, offset)
    }

    fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
        self.device.get_irq_info(irq_index).copied()
    }

    fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.device
            .enable_irq(irq_index, event_fds)
            .map_err(VfioError::KernelVfio)
    }

    fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .disable_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }

    fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .unmask_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }
}

#[derive(Versionize)]
struct VfioCommonState {
    intx_state: Option<IntxState>,
    msi_state: Option<MsiState>,
    msix_state: Option<MsixState>,
}

impl VersionMapped for VfioCommonState {}

pub(crate) struct ConfigPatch {
    mask: u32,
    patch: u32,
}

pub(crate) struct VfioCommon {
    pub(crate) configuration: PciConfiguration,
    pub(crate) mmio_regions: Vec<MmioRegion>,
    pub(crate) interrupt: Interrupt,
    pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
    pub(crate) vfio_wrapper: Arc<dyn Vfio>,
    pub(crate) patches: HashMap<usize, ConfigPatch>,
    x_nv_gpudirect_clique: Option<u8>,
}

impl VfioCommon {
    pub(crate) fn new(
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        vfio_wrapper: Arc<dyn Vfio>,
        subclass: &dyn PciSubclass,
        bdf: PciBdf,
        snapshot: Option<Snapshot>,
        x_nv_gpudirect_clique: Option<u8>,
    ) -> Result<Self, VfioPciError> {
        let pci_configuration_state =
            vm_migration::versioned_state_from_id(snapshot.as_ref(), PCI_CONFIGURATION_ID)
                .map_err(|e| {
                    VfioPciError::RetrievePciConfigurationState(anyhow!(
                        "Failed to get PciConfigurationState from Snapshot: {}",
                        e
                    ))
                })?;

        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            subclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
            pci_configuration_state,
        );

        let mut vfio_common = VfioCommon {
            mmio_regions: Vec::new(),
            configuration,
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
            msi_interrupt_manager,
            legacy_interrupt_group,
            vfio_wrapper,
            patches: HashMap::new(),
            x_nv_gpudirect_clique,
        };

        let state: Option<VfioCommonState> = snapshot
            .as_ref()
            .map(|s| s.to_versioned_state())
            .transpose()
            .map_err(|e| {
                VfioPciError::RetrieveVfioCommonState(anyhow!(
                    "Failed to get VfioCommonState from Snapshot: {}",
                    e
                ))
            })?;
        let msi_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSI_CONFIG_ID)
            .map_err(|e| {
                VfioPciError::RetrieveMsiConfigState(anyhow!(
                    "Failed to get MsiConfigState from Snapshot: {}",
                    e
                ))
            })?;
        let msix_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSIX_CONFIG_ID)
            .map_err(|e| {
                VfioPciError::RetrieveMsixConfigState(anyhow!(
                    "Failed to get MsixConfigState from Snapshot: {}",
                    e
                ))
            })?;

        if let Some(state) = state.as_ref() {
            vfio_common.set_state(state, msi_state, msix_state)?;
        } else {
            vfio_common.parse_capabilities(bdf);
            vfio_common.initialize_legacy_interrupt()?;
        }

        Ok(vfio_common)
    }

    /// If the MSI-X table offset is not page-size aligned, fix it up so that
    /// the MMIO RW region and the trap region don't overlap each other.
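    ///
    /// As a sketch (assuming 4 KiB pages): a BAR of size `S` whose table starts
    /// at a non-aligned offset is expanded to `max(2 * S, 2 * msix_sz)`, the
    /// table is moved to the middle of the new region, and the PBA keeps its
    /// original distance from the table.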
    fn fixup_msix_region(&mut self, bar_id: u32, region_size: u64) -> u64 {
        if let Some(msix) = self.interrupt.msix.as_mut() {
            let msix_cap = &mut msix.cap;

            // This assumes the table and the PBA share the same BAR, i.e.
            // table_bir equals pba_bir.
            let (table_offset, table_size) = msix_cap.table_range();
            if is_page_size_aligned(table_offset) || msix_cap.table_bir() != bar_id {
                return region_size;
            }

            let (pba_offset, pba_size) = msix_cap.pba_range();
            let msix_sz = align_page_size_up(table_size + pba_size);
            // Expand the region so it can hold both the RW and the trap
            // regions, each page-size aligned.
            let size = std::cmp::max(region_size * 2, msix_sz * 2);
            // Let the table start from the middle of the region.
            msix_cap.table_set_offset((size / 2) as u32);
            msix_cap.pba_set_offset((size / 2 + pba_offset - table_offset) as u32);

            size
        } else {
            // MSI-X not supported for this device
            region_size
        }
    }

    // The `allocator` argument is unused on `aarch64`
    #[allow(unused_variables)]
    pub(crate) fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        let mut bars = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX;

        // Going through all regular regions to compute the BAR size.
        // We're not saving the BAR address to restore it, because we
        // are going to allocate a guest address for each BAR and write
        // that new address back.
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let mut region_size: u64 = 0;
            let mut region_type = PciBarRegionType::Memory32BitRegion;
            let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
            let mut flags: u32 = 0;

            let mut restored_bar_addr = None;
            if let Some(resources) = &resources {
                for resource in resources {
                    if let Resource::PciBar {
                        index,
                        base,
                        size,
                        type_,
                        ..
                    } = resource
                    {
                        if *index == bar_id as usize {
                            restored_bar_addr = Some(GuestAddress(*base));
                            region_size = *size;
                            region_type = PciBarRegionType::from(*type_);
                            break;
                        }
                    }
                }
                if restored_bar_addr.is_none() {
                    bar_id += 1;
                    continue;
                }
            } else {
                let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                    (PCI_ROM_EXP_BAR_INDEX * 4) as u32
                } else {
                    PCI_CONFIG_BAR_OFFSET + bar_id * 4
                };

                // First read flags
                flags = self.vfio_wrapper.read_config_dword(bar_offset);

                // Is this an IO BAR?
                let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
                } else {
                    false
                };

                // Is this a 64-bit BAR?
                let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(
                        flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                        PCI_CONFIG_MEMORY_BAR_64BIT
                    )
                } else {
                    false
                };

                if matches!(
                    flags & PCI_CONFIG_BAR_PREFETCHABLE,
                    PCI_CONFIG_BAR_PREFETCHABLE
                ) {
                    prefetchable = PciBarPrefetchable::Prefetchable
                };

                // To get the size, write all 1s
                self.vfio_wrapper
                    .write_config_dword(bar_offset, 0xffff_ffff);

                // And read back the BAR value. The device returns zeros for bits it doesn't care about
                let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);
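                // For example, a device with a 4 KiB memory BAR would return
                // 0xffff_f000 here: masking the flag bits and computing
                // !0xffff_f000 + 1 yields 0x1000 (standard PCI BAR sizing).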

                if io_bar {
                    // Mask flag bits (lowest 2 for I/O BARs)
                    lower &= !0b11;

                    // BAR is not enabled
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // IO BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Invert bits and add 1 to calculate size
                    region_size = (!lower + 1) as u64;
                } else if is_64bit_bar {
                    // 64 bits Memory BAR
                    region_type = PciBarRegionType::Memory64BitRegion;

                    // Query size of upper BAR of 64-bit BAR
                    let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                    self.vfio_wrapper
                        .write_config_dword(upper_offset, 0xffff_ffff);
                    let upper = self.vfio_wrapper.read_config_dword(upper_offset);

                    let mut combined_size = u64::from(upper) << 32 | u64::from(lower);

                    // Mask out flag bits (lowest 4 for memory BARs)
                    combined_size &= !0b1111;

                    // BAR is not enabled
                    if combined_size == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find size
                    region_size = !combined_size + 1;
                } else {
                    region_type = PciBarRegionType::Memory32BitRegion;

                    // Mask out flag bits (lowest 4 for memory BARs)
                    lower &= !0b1111;

                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find size
                    region_size = (!lower + 1) as u64;
                }
            }

            let bar_addr = match region_type {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "aarch64")]
                    unimplemented!();

                    // The address needs to be 4-byte aligned.
                    #[cfg(not(target_arch = "aarch64"))]
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory32BitRegion => {
                    // BAR allocation must be naturally aligned
                    mmio32_allocator
                        .allocate(restored_bar_addr, region_size, Some(region_size))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory64BitRegion => {
                    // We need to do some fixup to keep the MMIO RW region and
                    // the MSI-X cap region page-size aligned.
                    region_size = self.fixup_msix_region(bar_id, region_size);
                    mmio64_allocator
                        .allocate(
                            restored_bar_addr,
                            region_size,
                            Some(std::cmp::max(
                                // SAFETY: FFI call. Trivially safe.
                                unsafe { sysconf(_SC_PAGESIZE) as GuestUsize },
                                region_size,
                            )),
                        )
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
            };

            // We can now build our BAR configuration block.
            let bar = PciBarConfiguration::default()
                .set_index(bar_id as usize)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type)
                .set_prefetchable(prefetchable);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                self.configuration
                    .add_pci_rom_bar(&bar, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&bar)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            bars.push(bar);
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id,
                user_memory_regions: Vec::new(),
            });

            bar_id += 1;
            if region_type == PciBarRegionType::Memory64BitRegion {
                bar_id += 1;
            }
        }

        Ok(bars)
    }

    // The `allocator` argument is unused on `aarch64`
    #[allow(unused_variables)]
    pub(crate) fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio32_allocator: &mut AddressAllocator,
        mmio64_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        for region in self.mmio_regions.iter() {
            match region.type_ {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "x86_64")]
                    allocator.free_io_addresses(region.start, region.length);
                    #[cfg(target_arch = "aarch64")]
                    error!("I/O region is not supported");
                }
                PciBarRegionType::Memory32BitRegion => {
                    mmio32_allocator.free(region.start, region.length);
                }
                PciBarRegionType::Memory64BitRegion => {
                    mmio64_allocator.free(region.start, region.length);
                }
            }
        }
        Ok(())
    }

    pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
        let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());

        let table = self.vfio_wrapper.read_config_dword((cap + 4).into());

        let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());

        MsixCap {
            msg_ctl,
            table,
            pba,
        }
    }

    pub(crate) fn initialize_msix(
        &mut self,
        msix_cap: MsixCap,
        cap_offset: u32,
        bdf: PciBdf,
        state: Option<MsixConfigState>,
    ) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msix_cap.table_size() as InterruptIndex,
            })
            .unwrap();

        let msix_config = MsixConfig::new(
            msix_cap.table_size(),
            interrupt_source_group.clone(),
            bdf.into(),
            state,
        )
        .unwrap();

        self.interrupt.msix = Some(VfioMsix {
            bar: msix_config,
            cap: msix_cap,
            cap_offset,
            interrupt_source_group,
        });
    }

    pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
        self.vfio_wrapper.read_config_word((cap + 2).into())
    }

    pub(crate) fn initialize_msi(
        &mut self,
        msg_ctl: u16,
        cap_offset: u32,
        state: Option<MsiConfigState>,
    ) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
            })
            .unwrap();

        let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap();

        self.interrupt.msi = Some(VfioMsi {
            cfg: msi_config,
            cap_offset,
            interrupt_source_group,
        });
    }

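    // Walks the standard PCI capability list: byte 0 of each capability holds
    // the capability ID, byte 1 the offset of the next capability, and a next
    // offset of 0 terminates the list.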
    pub(crate) fn get_msix_cap_idx(&self) -> Option<usize> {
        let mut cap_next = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        while cap_next != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());
            if PciCapabilityId::from(cap_id) == PciCapabilityId::MsiX {
                return Some(cap_next as usize);
            } else {
                cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
            }
        }

        None
    }

    pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) {
        let mut cap_iter = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        let mut pci_express_cap_found = false;
        let mut power_management_cap_found = false;

        while cap_iter != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_iter.into());

            match PciCapabilityId::from(cap_id) {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI.
                            let msg_ctl = self.parse_msi_capabilities(cap_iter);
                            self.initialize_msi(msg_ctl, cap_iter as u32, None);
                        }
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
                    {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI-X.
                            let msix_cap = self.parse_msix_capabilities(cap_iter);
                            self.initialize_msix(msix_cap, cap_iter as u32, bdf, None);
                        }
                    }
                }
                PciCapabilityId::PciExpress => pci_express_cap_found = true,
                PciCapabilityId::PowerManagement => power_management_cap_found = true,
                _ => {}
            };

            let cap_next = self.vfio_wrapper.read_config_byte((cap_iter + 1).into());
            if cap_next == 0 {
                break;
            }

            cap_iter = cap_next;
        }

        if let Some(clique_id) = self.x_nv_gpudirect_clique {
            self.add_nv_gpudirect_clique_cap(cap_iter, clique_id);
        }

        if pci_express_cap_found && power_management_cap_found {
            self.parse_extended_capabilities();
        }
    }

    fn add_nv_gpudirect_clique_cap(&mut self, cap_iter: u8, clique_id: u8) {
        // Turing, Ampere, Hopper, and Lovelace GPUs have dedicated space
        // at 0xD4 for this capability.
        let cap_offset = 0xd4u32;

        let reg_idx = (cap_iter / 4) as usize;
        self.patches.insert(
            reg_idx,
            ConfigPatch {
                mask: 0x0000_ff00,
                patch: cap_offset << 8,
            },
        );

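        // The patch above redirects the next pointer of the last standard
        // capability to 0xd4. The two dwords below appear to form the 8-byte
        // vendor-specific capability itself: following the standard
        // vendor-specific layout, 0x50080009 encodes capability ID 0x09
        // (vendor-specific), a null next pointer and a length of 0x08, while
        // the second dword carries the clique ID shifted to bit 19.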
        let reg_idx = (cap_offset / 4) as usize;
        self.patches.insert(
            reg_idx,
            ConfigPatch {
                mask: 0xffff_ffff,
                patch: 0x50080009u32,
            },
        );
        self.patches.insert(
            reg_idx + 1,
            ConfigPatch {
                mask: 0xffff_ffff,
                patch: u32::from(clique_id) << 19 | 0x5032,
            },
        );
    }

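    // Extended capabilities start at offset 0x100 of the config space. Each
    // 32-bit header packs the capability ID in bits 0..16 and the next offset
    // in bits 20..32; the capabilities matched below are hidden from the guest
    // by patching their ID to the null capability ID.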
    fn parse_extended_capabilities(&mut self) {
        let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;

        loop {
            let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);

            let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
            let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;

            match PciExpressCapabilityId::from(cap_id) {
                PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation
                | PciExpressCapabilityId::ResizeableBar
                | PciExpressCapabilityId::SingleRootIoVirtualization => {
                    let reg_idx = (current_offset / 4) as usize;
                    self.patches.insert(
                        reg_idx,
                        ConfigPatch {
                            mask: 0x0000_ffff,
                            patch: PciExpressCapabilityId::NullCapability as u32,
                        },
                    );
                }
                _ => {}
            }

            if cap_next == 0 {
                break;
            }

            current_offset = cap_next.into();
        }
    }

    pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
        if let Some(intx) = &mut self.interrupt.intx {
            if !intx.enabled {
                if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
                    self.vfio_wrapper
                        .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
                        .map_err(VfioPciError::EnableIntx)?;

                    intx.enabled = true;
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }
        }

        Ok(())
    }

    pub(crate) fn disable_intx(&mut self) {
        if let Some(intx) = &mut self.interrupt.intx {
            if intx.enabled {
                if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                    error!("Could not disable INTx: {}", e);
                } else {
                    intx.enabled = false;
                }
            }
        }
    }

    pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
        if let Some(msi) = &self.interrupt.msi {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msi.cfg.num_enabled_vectors() {
                if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msi(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsi)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msi(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msi() {
            error!("Could not disable MSI: {}", e);
        }
    }

    pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
        if let Some(msix) = &self.interrupt.msix {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msix.bar.table_entries.len() {
                if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msix(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsix)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msix(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msix() {
            error!("Could not disable MSI-X: {}", e);
        }
    }

    pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
        if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
            if irq_info.count == 0 {
                // A count of 0 means the INTx IRQ is not supported, therefore
                // it shouldn't be initialized.
                return Ok(());
            }
        }

        if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            self.enable_intx()?;
        }

        Ok(())
    }

    pub(crate) fn update_msi_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msi(offset, data) {
            Some(InterruptUpdateAction::EnableMsi) => {
                // Disable INTx before we can enable MSI
                self.disable_intx();
                self.enable_msi()?;
            }
            Some(InterruptUpdateAction::DisableMsi) => {
                // Fall back onto INTx when disabling MSI
                self.disable_msi();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn update_msix_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msix(offset, data) {
            Some(InterruptUpdateAction::EnableMsix) => {
                // Disable INTx before we can enable MSI-X
                self.disable_intx();
                self.enable_msix()?;
            }
            Some(InterruptUpdateAction::DisableMsix) => {
                // Fall back onto INTx when disabling MSI-X
                self.disable_msix();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
        for region in self.mmio_regions.iter() {
            if addr >= region.start.raw_value()
                && addr < region.start.unchecked_add(region.length).raw_value()
            {
                return Some(region.clone());
            }
        }
        None
    }

    pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_read_table(offset, data);
            } else {
                self.vfio_wrapper.region_read(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest reading from the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }
    }

    pub(crate) fn write_bar(
        &mut self,
        base: u64,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            // If the MSI-X table is written to, we need to update our cache.
            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_write_table(offset, data);
            } else {
                self.vfio_wrapper.region_write(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest writing to the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }

        None
    }

    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming the
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through the VFIO_DEVICE_SET_IRQS ioctl, and only then can we
        // write to the device region to update the MSI Enable bit.
        self.vfio_wrapper.write_config((reg + offset) as u32, data);

        None
    }

    pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        // When reading the BARs, we trap it and return what comes
        // from our local configuration space. We want the guest to
        // use that and not the VFIO device BARs, as those do not map
        // into the guest address space.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            return self.configuration.read_reg(reg_idx);
        }

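        // Serve the MSI-X table and PBA dwords from the cached capability,
        // since fixup_msix_region() may have relocated them within the BAR.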
        if let Some(id) = self.get_msix_cap_idx() {
            let msix = self.interrupt.msix.as_mut().unwrap();
            if reg_idx * 4 == id + 4 {
                return msix.cap.table;
            } else if reg_idx * 4 == id + 8 {
                return msix.cap.pba;
            }
        }

        // Since we don't support passing multi-function devices, we should
        // mask the multi-function bit (bit 7 of the Header Type byte in
        // register 3).
        let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
            0xff7f_ffff
        } else {
            0xffff_ffff
        };

        // The config register read comes from the VFIO device itself.
        let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;

        if let Some(config_patch) = self.patches.get(&reg_idx) {
            value = (value & !config_patch.mask) | config_patch.patch;
        }

        value
    }

    fn state(&self) -> VfioCommonState {
        let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
            enabled: intx.enabled,
        });

        let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
            cap: msi.cfg.cap,
            cap_offset: msi.cap_offset,
        });

        let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
            cap: msix.cap,
            cap_offset: msix.cap_offset,
            bdf: msix.bar.devid,
        });

        VfioCommonState {
            intx_state,
            msi_state,
            msix_state,
        }
    }

    fn set_state(
        &mut self,
        state: &VfioCommonState,
        msi_state: Option<MsiConfigState>,
        msix_state: Option<MsixConfigState>,
    ) -> Result<(), VfioPciError> {
        if let (Some(intx), Some(interrupt_source_group)) =
            (&state.intx_state, self.legacy_interrupt_group.clone())
        {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            if intx.enabled {
                self.enable_intx()?;
            }
        }

        if let Some(msi) = &state.msi_state {
            self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state);
        }

        if let Some(msix) = &state.msix_state {
            self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state);
        }

        Ok(())
    }
}

impl Pausable for VfioCommon {}

impl Snapshottable for VfioCommon {
    fn id(&self) -> String {
        String::from(VFIO_COMMON_ID)
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_common_snapshot = Snapshot::new_from_versioned_state(&self.state())?;

        // Snapshot PciConfiguration
        vfio_common_snapshot.add_snapshot(self.configuration.id(), self.configuration.snapshot()?);

        // Snapshot MSI
        if let Some(msi) = &mut self.interrupt.msi {
            vfio_common_snapshot.add_snapshot(msi.cfg.id(), msi.cfg.snapshot()?);
        }

        // Snapshot MSI-X
        if let Some(msix) = &mut self.interrupt.msix {
            vfio_common_snapshot.add_snapshot(msix.bar.id(), msix.bar.snapshot()?);
        }

        Ok(vfio_common_snapshot)
    }
}

/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
pub struct VfioPciDevice {
    id: String,
    vm: Arc<dyn hypervisor::Vm>,
    device: Arc<VfioDevice>,
    container: Arc<VfioContainer>,
    common: VfioCommon,
    iommu_attached: bool,
    memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
}

impl VfioPciDevice {
    /// Constructs a new VFIO PCI device for the given VFIO device
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        id: String,
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        iommu_attached: bool,
        bdf: PciBdf,
        memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
        snapshot: Option<Snapshot>,
        x_nv_gpudirect_clique: Option<u8>,
    ) -> Result<Self, VfioPciError> {
        let device = Arc::new(device);
        device.reset();

        let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));

        let common = VfioCommon::new(
            msi_interrupt_manager,
            legacy_interrupt_group,
            Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
            &PciVfioSubclass::VfioSubclass,
            bdf,
            vm_migration::snapshot_from_id(snapshot.as_ref(), VFIO_COMMON_ID),
            x_nv_gpudirect_clique,
        )?;

        let vfio_pci_device = VfioPciDevice {
            id,
            vm: vm.clone(),
            device,
            container,
            common,
            iommu_attached,
            memory_slot,
        };

        Ok(vfio_pci_device)
    }

    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }

    fn generate_sparse_areas(
        caps: &[VfioRegionInfoCap],
        region_index: u32,
        region_start: u64,
        region_size: u64,
        vfio_msix: Option<&VfioMsix>,
    ) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
        for cap in caps {
            match cap {
                VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
                VfioRegionInfoCap::MsixMappable => {
                    if !is_4k_aligned(region_start) {
                        error!(
                            "Region start address 0x{:x} must be at least 4KiB-aligned",
                            region_start
                        );
                        return Err(VfioPciError::RegionAlignment);
                    }
                    if !is_4k_multiple(region_size) {
                        error!(
                            "Region size 0x{:x} must be a multiple of 4KiB",
                            region_size
                        );
                        return Err(VfioPciError::RegionSize);
                    }

                    // In case the region contains the MSI-X vectors table or
                    // the MSI-X PBA table, we must calculate the subregions
                    // around them, leading to a list of sparse areas.
                    // We want to make sure we will still trap MMIO accesses
                    // to these MSI-X specific ranges. If these ranges don't
                    // align with the page size, we can achieve that by
                    // enlarging them.
                    //
                    // A BTreeMap is used because its iterator yields entries
                    // sorted by key, which ensures a proper split of the whole
                    // region.
                    let mut inter_ranges = BTreeMap::new();
                    if let Some(msix) = vfio_msix {
                        if region_index == msix.cap.table_bir() {
                            let (offset, size) = msix.cap.table_range();
                            let offset = align_page_size_down(offset);
                            let size = align_page_size_up(size);
                            inter_ranges.insert(offset, size);
                        }
                        if region_index == msix.cap.pba_bir() {
                            let (offset, size) = msix.cap.pba_range();
                            let offset = align_page_size_down(offset);
                            let size = align_page_size_up(size);
                            inter_ranges.insert(offset, size);
                        }
                    }

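                    // Illustrative example: for a 16 KiB region whose MSI-X
                    // table occupies [0x1000, 0x2000), the loop below yields
                    // two sparse areas, [0, 0x1000) and [0x2000, 0x4000),
                    // leaving the table itself trapped.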
1482                     let mut sparse_areas = Vec::new();
1483                     let mut current_offset = 0;
1484                     for (range_offset, range_size) in inter_ranges {
1485                         if range_offset > current_offset {
1486                             sparse_areas.push(VfioRegionSparseMmapArea {
1487                                 offset: current_offset,
1488                                 size: range_offset - current_offset,
1489                             });
1490                         }
1491                         current_offset = align_page_size_down(range_offset + range_size);
1492                     }
1493 
1494                     if region_size > current_offset {
1495                         sparse_areas.push(VfioRegionSparseMmapArea {
1496                             offset: current_offset,
1497                             size: region_size - current_offset,
1498                         });
1499                     }
1500 
1501                     return Ok(sparse_areas);
1502                 }
1503                 _ => {}
1504             }
1505         }
1506 
1507         // In case no relevant capabilities have been found, create a single
1508         // sparse area corresponding to the entire MMIO region.
1509         Ok(vec![VfioRegionSparseMmapArea {
1510             offset: 0,
1511             size: region_size,
1512         }])
1513     }
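
    // Worked example for the MSIX_MAPPABLE case (made-up numbers): for a
    // 16 KiB region whose MSI-X table spans offsets 0x1000..0x2000 (page
    // aligned), inter_ranges holds {0x1000: 0x1000}, so the splitting loop
    // in generate_sparse_areas() emits two sparse areas, [0x0, 0x1000) and
    // [0x2000, 0x4000), and the table itself remains trapped.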
1514 
    /// Map MMIO regions into the guest, and avoid VM exits when the guest
    /// tries to reach those regions.
    ///
    /// For each sparse area that can be mmap'ed, this mmaps the device
    /// region into the host address space, registers the mapping with the
    /// hypervisor as a user memory region (using a slot returned by the
    /// `memory_slot` closure), and, unless the device is attached to a
    /// virtual IOMMU, DMA maps the range through the VFIO container.
1523     pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
1524         let fd = self.device.as_raw_fd();
1525 
1526         for region in self.common.mmio_regions.iter_mut() {
1527             let region_flags = self.device.get_region_flags(region.index);
1528             if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
1529                 let mut prot = 0;
1530                 if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
1531                     prot |= libc::PROT_READ;
1532                 }
1533                 if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
1534                     prot |= libc::PROT_WRITE;
1535                 }
1536 
1537                 // Retrieve the list of capabilities found on the region
1538                 let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
1539                     self.device.get_region_caps(region.index)
1540                 } else {
1541                     Vec::new()
1542                 };
1543 
                // Don't try to mmap the region if it contains the MSI-X table
                // or the MSI-X PBA subregion and MSIX_MAPPABLE was not found
                // in the list of supported capabilities.
1547                 if let Some(msix) = self.common.interrupt.msix.as_ref() {
1548                     if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
1549                         && !caps.contains(&VfioRegionInfoCap::MsixMappable)
1550                     {
1551                         continue;
1552                     }
1553                 }
1554 
1555                 let mmap_size = self.device.get_region_size(region.index);
1556                 let mmap_offset = self.device.get_region_offset(region.index);
1557 
1558                 let sparse_areas = Self::generate_sparse_areas(
1559                     &caps,
1560                     region.index,
1561                     region.start.0,
1562                     mmap_size,
1563                     self.common.interrupt.msix.as_ref(),
1564                 )?;
1565 
1566                 for area in sparse_areas.iter() {
                    // mmap() requires a page aligned offset, and we rely on
                    // page aligned sizes further down, so check the area
                    // before mapping anything.
                    if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) {
                        warn!(
                            "Skipping sparse area which is not page size aligned (offset = 0x{:x}, size = 0x{:x})",
                            area.offset,
                            area.size,
                        );
                        return Ok(());
                    }

                    // SAFETY: FFI call with correct arguments
                    let host_addr = unsafe {
                        libc::mmap(
                            null_mut(),
                            area.size as usize,
                            prot,
                            libc::MAP_SHARED,
                            fd,
                            mmap_offset as libc::off_t + area.offset as libc::off_t,
                        )
                    };

                    if host_addr == libc::MAP_FAILED {
                        error!(
                            "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
                            area.offset,
                            area.size,
                            std::io::Error::last_os_error()
                        );
                        return Err(VfioPciError::MmapArea);
                    }
1597 
1598                     let user_memory_region = UserMemoryRegion {
1599                         slot: (self.memory_slot)(),
1600                         start: region.start.0 + area.offset,
1601                         size: area.size,
1602                         host_addr: host_addr as u64,
1603                     };
1604 
1605                     region.user_memory_regions.push(user_memory_region);
1606 
1607                     let mem_region = self.vm.make_user_memory_region(
1608                         user_memory_region.slot,
1609                         user_memory_region.start,
1610                         user_memory_region.size,
1611                         user_memory_region.host_addr,
1612                         false,
1613                         false,
1614                     );
1615 
1616                     self.vm
1617                         .create_user_memory_region(mem_region)
1618                         .map_err(VfioPciError::CreateUserMemoryRegion)?;
1619 
1620                     if !self.iommu_attached {
1621                         self.container
1622                             .vfio_dma_map(
1623                                 user_memory_region.start,
1624                                 user_memory_region.size,
1625                                 user_memory_region.host_addr,
1626                             )
1627                             .map_err(VfioPciError::DmaMap)?;
1628                     }
1629                 }
1630             }
1631         }
1632 
1633         Ok(())
1634     }
1635 
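    /// Undo map_mmio_regions(): for every mapped area, DMA unmap it from the
    /// VFIO container (unless the device is attached to a virtual IOMMU),
    /// remove the corresponding user memory region from the hypervisor, and
    /// munmap it from the host address space.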
1636     pub fn unmap_mmio_regions(&mut self) {
1637         for region in self.common.mmio_regions.iter() {
1638             for user_memory_region in region.user_memory_regions.iter() {
1639                 // Unmap from vfio container
1640                 if !self.iommu_attached {
1641                     if let Err(e) = self
1642                         .container
1643                         .vfio_dma_unmap(user_memory_region.start, user_memory_region.size)
1644                     {
1645                         error!("Could not unmap mmio region from vfio container: {}", e);
1646                     }
1647                 }
1648 
1649                 // Remove region
1650                 let r = self.vm.make_user_memory_region(
1651                     user_memory_region.slot,
1652                     user_memory_region.start,
1653                     user_memory_region.size,
1654                     user_memory_region.host_addr,
1655                     false,
1656                     false,
1657                 );
1658 
1659                 if let Err(e) = self.vm.remove_user_memory_region(r) {
1660                     error!("Could not remove the userspace memory region: {}", e);
1661                 }
1662 
1663                 // SAFETY: FFI call with correct arguments
1664                 let ret = unsafe {
1665                     libc::munmap(
1666                         user_memory_region.host_addr as *mut libc::c_void,
1667                         user_memory_region.size as usize,
1668                     )
1669                 };
1670                 if ret != 0 {
1671                     error!(
                        "Could not unmap region {}, error: {}",
1673                         region.index,
1674                         io::Error::last_os_error()
1675                     );
1676                 }
1677             }
1678         }
1679     }
1680 
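    /// DMA map a range of guest memory through the VFIO container so the
    /// device can access it. This is a no-op when the device is attached to
    /// a virtual IOMMU.
    ///
    /// Minimal usage sketch (the IOVA, size and `host_va` values are made up
    /// for illustration):
    ///
    /// ```ignore
    /// // Map 4 KiB of guest memory, backed by `host_va`, at IOVA 0x1_0000.
    /// vfio_pci_device.dma_map(0x1_0000, 0x1000, host_va)?;
    /// ```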
1681     pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
1682         if !self.iommu_attached {
1683             self.container
1684                 .vfio_dma_map(iova, size, user_addr)
1685                 .map_err(VfioPciError::DmaMap)?;
1686         }
1687 
1688         Ok(())
1689     }
1690 
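    /// DMA unmap a range previously mapped with `dma_map()`. Like `dma_map()`,
    /// this is a no-op when the device is attached to a virtual IOMMU.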
1691     pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
1692         if !self.iommu_attached {
1693             self.container
1694                 .vfio_dma_unmap(iova, size)
1695                 .map_err(VfioPciError::DmaUnmap)?;
1696         }
1697 
1698         Ok(())
1699     }
1700 
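    /// Return a copy of the device's MMIO regions.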
1701     pub fn mmio_regions(&self) -> Vec<MmioRegion> {
1702         self.common.mmio_regions.clone()
1703     }
1704 }
1705 
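// Tear the device down in a safe order on drop: unmap guest MMIO mappings
// first, then disable whichever interrupt mode (MSI-X, MSI or INTx) is
// still active.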
1706 impl Drop for VfioPciDevice {
1707     fn drop(&mut self) {
1708         self.unmap_mmio_regions();
1709 
1710         if let Some(msix) = &self.common.interrupt.msix {
1711             if msix.bar.enabled() {
1712                 self.common.disable_msix();
1713             }
1714         }
1715 
1716         if let Some(msi) = &self.common.interrupt.msi {
1717             if msi.cfg.enabled() {
1718                 self.common.disable_msi()
1719             }
1720         }
1721 
1722         if self.common.interrupt.intx_in_use() {
1723             self.common.disable_intx();
1724         }
1725     }
1726 }
1727 
1728 impl BusDevice for VfioPciDevice {
1729     fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1730         self.read_bar(base, offset, data)
1731     }
1732 
1733     fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
1734         self.write_bar(base, offset, data)
1735     }
1736 }
1737 
1738 // First BAR offset in the PCI config space.
1739 const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
1740 // Capability register offset in the PCI config space.
1741 const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
1742 // Extended capabilities register offset in the PCI config space.
1743 const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
// An I/O BAR is indicated by bit 0 of the BAR register being set.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory BAR flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// Prefetchable BAR bit.
const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
1750 // PCI config register size (4 bytes).
1751 const PCI_CONFIG_REGISTER_SIZE: usize = 4;
1752 // Number of BARs for a PCI device
1753 const BAR_NUMS: usize = 6;
1754 // PCI Header Type register index
1755 const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
1756 // First BAR register index
1757 const PCI_CONFIG_BAR0_INDEX: usize = 4;
1758 // PCI ROM expansion BAR register index
1759 const PCI_ROM_EXP_BAR_INDEX: usize = 12;
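
// Illustrative decoding using the flags above (the value is made up): a BAR
// register reading 0x0000_000c has PCI_CONFIG_MEMORY_BAR_64BIT (bit 2) and
// PCI_CONFIG_BAR_PREFETCHABLE (bit 3) set, i.e. a 64-bit prefetchable memory
// BAR, while a register with PCI_CONFIG_IO_BAR (bit 0) set is an I/O BAR.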
1760 
1761 impl PciDevice for VfioPciDevice {
1762     fn allocate_bars(
1763         &mut self,
1764         allocator: &Arc<Mutex<SystemAllocator>>,
1765         mmio32_allocator: &mut AddressAllocator,
1766         mmio64_allocator: &mut AddressAllocator,
1767         resources: Option<Vec<Resource>>,
1768     ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
1769         self.common
1770             .allocate_bars(allocator, mmio32_allocator, mmio64_allocator, resources)
1771     }
1772 
1773     fn free_bars(
1774         &mut self,
1775         allocator: &mut SystemAllocator,
1776         mmio32_allocator: &mut AddressAllocator,
1777         mmio64_allocator: &mut AddressAllocator,
1778     ) -> Result<(), PciDeviceError> {
1779         self.common
1780             .free_bars(allocator, mmio32_allocator, mmio64_allocator)
1781     }
1782 
1783     fn write_config_register(
1784         &mut self,
1785         reg_idx: usize,
1786         offset: u64,
1787         data: &[u8],
1788     ) -> Option<Arc<Barrier>> {
1789         self.common.write_config_register(reg_idx, offset, data)
1790     }
1791 
1792     fn read_config_register(&mut self, reg_idx: usize) -> u32 {
1793         self.common.read_config_register(reg_idx)
1794     }
1795 
1796     fn detect_bar_reprogramming(
1797         &mut self,
1798         reg_idx: usize,
1799         data: &[u8],
1800     ) -> Option<BarReprogrammingParams> {
1801         self.common
1802             .configuration
1803             .detect_bar_reprogramming(reg_idx, data)
1804     }
1805 
1806     fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1807         self.common.read_bar(base, offset, data)
1808     }
1809 
1810     fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
1811         self.common.write_bar(base, offset, data)
1812     }
1813 
1814     fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
1815         for region in self.common.mmio_regions.iter_mut() {
1816             if region.start.raw_value() == old_base {
1817                 region.start = GuestAddress(new_base);
1818 
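                // A user memory region cannot be updated in place: each one
                // is removed at its old guest address and re-created at the
                // new one below.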
1819                 for user_memory_region in region.user_memory_regions.iter_mut() {
1820                     // Remove old region
1821                     let old_mem_region = self.vm.make_user_memory_region(
1822                         user_memory_region.slot,
1823                         user_memory_region.start,
1824                         user_memory_region.size,
1825                         user_memory_region.host_addr,
1826                         false,
1827                         false,
1828                     );
1829 
1830                     self.vm
1831                         .remove_user_memory_region(old_mem_region)
1832                         .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
1833 
1834                     // Update the user memory region with the correct start address.
1835                     if new_base > old_base {
1836                         user_memory_region.start += new_base - old_base;
1837                     } else {
1838                         user_memory_region.start -= old_base - new_base;
1839                     }
1840 
1841                     // Insert new region
1842                     let new_mem_region = self.vm.make_user_memory_region(
1843                         user_memory_region.slot,
1844                         user_memory_region.start,
1845                         user_memory_region.size,
1846                         user_memory_region.host_addr,
1847                         false,
1848                         false,
1849                     );
1850 
1851                     self.vm
1852                         .create_user_memory_region(new_mem_region)
1853                         .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
1854                 }
1855             }
1856         }
1857 
1858         Ok(())
1859     }
1860 
1861     fn as_any(&mut self) -> &mut dyn Any {
1862         self
1863     }
1864 
1865     fn id(&self) -> Option<String> {
1866         Some(self.id.clone())
1867     }
1868 }
1869 
1870 impl Pausable for VfioPciDevice {}
1871 
1872 impl Snapshottable for VfioPciDevice {
1873     fn id(&self) -> String {
1874         self.id.clone()
1875     }
1876 
1877     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
1878         let mut vfio_pci_dev_snapshot = Snapshot::default();
1879 
1880         // Snapshot VfioCommon
1881         vfio_pci_dev_snapshot.add_snapshot(self.common.id(), self.common.snapshot()?);
1882 
1883         Ok(vfio_pci_dev_snapshot)
1884     }
1885 }

impl Transportable for VfioPciDevice {}
1887 impl Migratable for VfioPciDevice {}
1888