xref: /cloud-hypervisor/pci/src/vfio.rs (revision 2571e59438597f53aa4993cd70d6462fe1364ba7)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 
6 use crate::msi::{MsiConfigState, MSI_CONFIG_ID};
7 use crate::msix::MsixConfigState;
8 use crate::{
9     msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
10     PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
11     PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
12     PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID,
13 };
14 use anyhow::anyhow;
15 use byteorder::{ByteOrder, LittleEndian};
16 use hypervisor::HypervisorVmError;
17 use libc::{sysconf, _SC_PAGESIZE};
18 use std::any::Any;
19 use std::collections::{BTreeMap, HashMap};
20 use std::io;
21 use std::os::unix::io::AsRawFd;
22 use std::ptr::null_mut;
23 use std::sync::{Arc, Barrier, Mutex};
24 use thiserror::Error;
25 use versionize::{VersionMap, Versionize, VersionizeResult};
26 use versionize_derive::Versionize;
27 use vfio_bindings::bindings::vfio::*;
28 use vfio_ioctls::{
29     VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
30 };
31 use vm_allocator::page_size::{
32     align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned,
33 };
34 use vm_allocator::{AddressAllocator, SystemAllocator};
35 use vm_device::interrupt::{
36     InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
37 };
38 use vm_device::{BusDevice, Resource};
39 use vm_memory::{Address, GuestAddress, GuestUsize};
40 use vm_migration::{
41     Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
42 };
43 use vmm_sys_util::eventfd::EventFd;
44 
45 pub(crate) const VFIO_COMMON_ID: &str = "vfio_common";
46 
47 #[derive(Debug, Error)]
48 pub enum VfioPciError {
49     #[error("Failed to create user memory region: {0}")]
50     CreateUserMemoryRegion(#[source] HypervisorVmError),
51     #[error("Failed to DMA map: {0}")]
52     DmaMap(#[source] vfio_ioctls::VfioError),
53     #[error("Failed to DMA unmap: {0}")]
54     DmaUnmap(#[source] vfio_ioctls::VfioError),
55     #[error("Failed to enable INTx: {0}")]
56     EnableIntx(#[source] VfioError),
57     #[error("Failed to enable MSI: {0}")]
58     EnableMsi(#[source] VfioError),
59     #[error("Failed to enable MSI-x: {0}")]
60     EnableMsix(#[source] VfioError),
61     #[error("Failed to mmap the area")]
62     MmapArea,
63     #[error("Failed to notifier's eventfd")]
64     MissingNotifier,
65     #[error("Invalid region alignment")]
66     RegionAlignment,
67     #[error("Invalid region size")]
68     RegionSize,
69     #[error("Failed to retrieve MsiConfigState: {0}")]
70     RetrieveMsiConfigState(#[source] anyhow::Error),
71     #[error("Failed to retrieve MsixConfigState: {0}")]
72     RetrieveMsixConfigState(#[source] anyhow::Error),
73     #[error("Failed to retrieve PciConfigurationState: {0}")]
74     RetrievePciConfigurationState(#[source] anyhow::Error),
75     #[error("Failed to retrieve VfioCommonState: {0}")]
76     RetrieveVfioCommonState(#[source] anyhow::Error),
77 }
78 
/// PCI subclass advertised for passthrough devices: 0xff ("other/vendor
/// specific"), since the real device class is exposed through its own
/// config space.
#[derive(Copy, Clone)]
enum PciVfioSubclass {
    VfioSubclass = 0xff,
}
83 
impl PciSubclass for PciVfioSubclass {
    /// Return the raw subclass byte written into the PCI class register.
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}
89 
/// Action derived from a guest write to an MSI/MSI-X capability: whether the
/// corresponding interrupt mode must be (dis)enabled on the VFIO device.
enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}
96 
/// Snapshottable state of the INTx (legacy) interrupt: only whether it is
/// currently enabled.
#[derive(Versionize)]
struct IntxState {
    enabled: bool,
}
101 
/// Runtime representation of the legacy INTx interrupt: the interrupt source
/// group used to trigger it and its current enable state.
pub(crate) struct VfioIntx {
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    enabled: bool,
}
106 
/// Snapshottable state of the MSI capability: the capability registers and
/// the offset of the capability in PCI config space.
#[derive(Versionize)]
struct MsiState {
    cap: MsiCap,
    cap_offset: u32,
}
112 
/// Runtime representation of the MSI capability: the emulated config,
/// its config-space offset, and the interrupt source group backing it.
pub(crate) struct VfioMsi {
    pub(crate) cfg: MsiConfig,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}
118 
119 impl VfioMsi {
120     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
121         let old_enabled = self.cfg.enabled();
122 
123         self.cfg.update(offset, data);
124 
125         let new_enabled = self.cfg.enabled();
126 
127         if !old_enabled && new_enabled {
128             return Some(InterruptUpdateAction::EnableMsi);
129         }
130 
131         if old_enabled && !new_enabled {
132             return Some(InterruptUpdateAction::DisableMsi);
133         }
134 
135         None
136     }
137 }
138 
/// Snapshottable state of the MSI-X capability: capability registers, its
/// config-space offset, and the device BDF (used to rebuild MsixConfig).
#[derive(Versionize)]
struct MsixState {
    cap: MsixCap,
    cap_offset: u32,
    bdf: u32,
}
145 
/// Runtime representation of the MSI-X capability: the emulated table/PBA
/// ("bar"), the capability registers, the capability offset in config space,
/// and the interrupt source group backing the vectors.
pub(crate) struct VfioMsix {
    pub(crate) bar: MsixConfig,
    cap: MsixCap,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}
152 
153 impl VfioMsix {
154     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
155         let old_enabled = self.bar.enabled();
156 
157         // Update "Message Control" word
158         if offset == 2 && data.len() == 2 {
159             self.bar.set_msg_ctl(LittleEndian::read_u16(data));
160         }
161 
162         let new_enabled = self.bar.enabled();
163 
164         if !old_enabled && new_enabled {
165             return Some(InterruptUpdateAction::EnableMsix);
166         }
167 
168         if old_enabled && !new_enabled {
169             return Some(InterruptUpdateAction::DisableMsix);
170         }
171 
172         None
173     }
174 
175     fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
176         let table_offset: u64 = u64::from(self.cap.table_offset());
177         let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
178         let table_bir: u32 = self.cap.table_bir();
179 
180         bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
181     }
182 }
183 
/// Aggregate of the interrupt mechanisms a VFIO device may expose; each is
/// `None` when the device does not support that mechanism.
pub(crate) struct Interrupt {
    pub(crate) intx: Option<VfioIntx>,
    pub(crate) msi: Option<VfioMsi>,
    pub(crate) msix: Option<VfioMsix>,
}
189 
190 impl Interrupt {
191     fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
192         if let Some(ref mut msi) = &mut self.msi {
193             let action = msi.update(offset, data);
194             return action;
195         }
196 
197         None
198     }
199 
200     fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
201         if let Some(ref mut msix) = &mut self.msix {
202             let action = msix.update(offset, data);
203             return action;
204         }
205 
206         None
207     }
208 
209     fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
210         if let Some(msi) = &self.msi {
211             if offset >= u64::from(msi.cap_offset)
212                 && offset < u64::from(msi.cap_offset) + msi.cfg.size()
213             {
214                 return Some((
215                     PciCapabilityId::MessageSignalledInterrupts,
216                     u64::from(msi.cap_offset),
217                 ));
218             }
219         }
220 
221         if let Some(msix) = &self.msix {
222             if offset == u64::from(msix.cap_offset) {
223                 return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
224             }
225         }
226 
227         None
228     }
229 
230     fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
231         if let Some(msix) = &self.msix {
232             return msix.table_accessed(bar_index, offset);
233         }
234 
235         false
236     }
237 
238     fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
239         if let Some(ref mut msix) = &mut self.msix {
240             let offset = offset - u64::from(msix.cap.table_offset());
241             msix.bar.write_table(offset, data)
242         }
243     }
244 
245     fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
246         if let Some(msix) = &self.msix {
247             let offset = offset - u64::from(msix.cap.table_offset());
248             msix.bar.read_table(offset, data)
249         }
250     }
251 
252     pub(crate) fn intx_in_use(&self) -> bool {
253         if let Some(intx) = &self.intx {
254             return intx.enabled;
255         }
256 
257         false
258     }
259 }
260 
/// A guest memory region registered with the hypervisor for direct mapping:
/// hypervisor slot number, guest-physical start/size, and the host virtual
/// address backing it.
#[derive(Copy, Clone)]
pub struct UserMemoryRegion {
    pub slot: u32,
    pub start: u64,
    pub size: u64,
    pub host_addr: u64,
}
268 
/// One MMIO BAR of the device as seen by the guest: guest address and length,
/// the BAR type, the VFIO region index it maps to, and any user memory
/// regions created for direct (mmap) access.
#[derive(Clone)]
pub struct MmioRegion {
    pub start: GuestAddress,
    pub length: GuestUsize,
    pub(crate) type_: PciBarRegionType,
    pub(crate) index: u32,
    pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
}
/// Transport-level VFIO error: either from the kernel VFIO driver
/// (vfio_ioctls) or from a vfio-user backend.
#[derive(Debug, Error)]
pub enum VfioError {
    #[error("Kernel VFIO error: {0}")]
    KernelVfio(#[source] vfio_ioctls::VfioError),
    #[error("VFIO user error: {0}")]
    VfioUser(#[source] vfio_user::Error),
}
284 
285 pub(crate) trait Vfio: Send + Sync {
286     fn read_config_byte(&self, offset: u32) -> u8 {
287         let mut data: [u8; 1] = [0];
288         self.read_config(offset, &mut data);
289         data[0]
290     }
291 
292     fn read_config_word(&self, offset: u32) -> u16 {
293         let mut data: [u8; 2] = [0, 0];
294         self.read_config(offset, &mut data);
295         u16::from_le_bytes(data)
296     }
297 
298     fn read_config_dword(&self, offset: u32) -> u32 {
299         let mut data: [u8; 4] = [0, 0, 0, 0];
300         self.read_config(offset, &mut data);
301         u32::from_le_bytes(data)
302     }
303 
304     fn write_config_dword(&self, offset: u32, buf: u32) {
305         let data: [u8; 4] = buf.to_le_bytes();
306         self.write_config(offset, &data)
307     }
308 
309     fn read_config(&self, offset: u32, data: &mut [u8]) {
310         self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
311     }
312 
313     fn write_config(&self, offset: u32, data: &[u8]) {
314         self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
315     }
316 
317     fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
318         self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
319     }
320 
321     fn disable_msi(&self) -> Result<(), VfioError> {
322         self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
323     }
324 
325     fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
326         self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
327     }
328 
329     fn disable_msix(&self) -> Result<(), VfioError> {
330         self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
331     }
332 
333     fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
334         unimplemented!()
335     }
336 
337     fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
338         unimplemented!()
339     }
340 
341     fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
342         unimplemented!()
343     }
344 
345     fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
346         unimplemented!()
347     }
348 
349     fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
350         unimplemented!()
351     }
352 
353     fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
354         unimplemented!()
355     }
356 }
357 
/// Adapter implementing the `Vfio` trait on top of a kernel `VfioDevice`.
struct VfioDeviceWrapper {
    device: Arc<VfioDevice>,
}
361 
impl VfioDeviceWrapper {
    /// Wrap an already-opened kernel VFIO device.
    fn new(device: Arc<VfioDevice>) -> Self {
        Self { device }
    }
}
367 
impl Vfio for VfioDeviceWrapper {
    // Note: VfioDevice takes (index, data, offset) — argument order differs
    // from the trait's (index, offset, data).
    fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
        self.device.region_read(index, data, offset)
    }

    fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
        self.device.region_write(index, data, offset)
    }

    fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
        self.device.get_irq_info(irq_index).copied()
    }

    fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.device
            .enable_irq(irq_index, event_fds)
            .map_err(VfioError::KernelVfio)
    }

    fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .disable_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }

    fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .unmask_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }
}
399 
/// Snapshottable interrupt state of a VFIO device; each field is `None` when
/// the corresponding mechanism is absent.
#[derive(Versionize)]
struct VfioCommonState {
    intx_state: Option<IntxState>,
    msi_state: Option<MsiState>,
    msix_state: Option<MsixState>,
}

impl VersionMapped for VfioCommonState {}
408 
/// A patch applied to config-space reads: bits selected by `mask` are
/// replaced with the corresponding bits of `patch`.
pub(crate) struct ConfigPatch {
    mask: u32,
    patch: u32,
}
413 
/// State shared by VFIO-backed PCI devices (kernel VFIO and vfio-user):
/// emulated config space, the guest MMIO regions backing the BARs, the
/// interrupt machinery, the transport wrapper, and per-register config
/// patches keyed by dword-aligned config offset.
pub(crate) struct VfioCommon {
    pub(crate) configuration: PciConfiguration,
    pub(crate) mmio_regions: Vec<MmioRegion>,
    pub(crate) interrupt: Interrupt,
    pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
    pub(crate) vfio_wrapper: Arc<dyn Vfio>,
    pub(crate) patches: HashMap<usize, ConfigPatch>,
}
423 
424 impl VfioCommon {
    /// Build a `VfioCommon`, either fresh or restored from a snapshot.
    ///
    /// When `snapshot` is provided, the PCI configuration, VfioCommon,
    /// MSI and MSI-X states are extracted from it and the interrupt
    /// capabilities are rebuilt from that saved state; otherwise the device's
    /// capabilities are parsed from its config space and legacy INTx is
    /// initialized.
    ///
    /// # Errors
    /// Returns a `VfioPciError::Retrieve*State` variant when any expected
    /// state cannot be deserialized from the snapshot.
    pub(crate) fn new(
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        vfio_wrapper: Arc<dyn Vfio>,
        subclass: &dyn PciSubclass,
        bdf: PciBdf,
        snapshot: Option<Snapshot>,
    ) -> Result<Self, VfioPciError> {
        let pci_configuration_state =
            vm_migration::versioned_state_from_id(snapshot.as_ref(), PCI_CONFIGURATION_ID)
                .map_err(|e| {
                    VfioPciError::RetrievePciConfigurationState(anyhow!(
                        "Failed to get PciConfigurationState from Snapshot: {}",
                        e
                    ))
                })?;

        // Vendor/device IDs and BARs are zeroed here; the real values are
        // read from the physical device's config space later on.
        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            subclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
            pci_configuration_state,
        );

        let mut vfio_common = VfioCommon {
            mmio_regions: Vec::new(),
            configuration,
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
            msi_interrupt_manager,
            legacy_interrupt_group,
            vfio_wrapper,
            patches: HashMap::new(),
        };

        let state: Option<VfioCommonState> = snapshot
            .as_ref()
            .map(|s| s.to_versioned_state())
            .transpose()
            .map_err(|e| {
                VfioPciError::RetrieveVfioCommonState(anyhow!(
                    "Failed to get VfioCommonState from Snapshot: {}",
                    e
                ))
            })?;
        let msi_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSI_CONFIG_ID)
            .map_err(|e| {
                VfioPciError::RetrieveMsiConfigState(anyhow!(
                    "Failed to get MsiConfigState from Snapshot: {}",
                    e
                ))
            })?;
        let msix_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSIX_CONFIG_ID)
            .map_err(|e| {
            VfioPciError::RetrieveMsixConfigState(anyhow!(
                "Failed to get MsixConfigState from Snapshot: {}",
                e
            ))
        })?;

        // Restore path vs. cold-start path: exactly one of the two runs.
        if let Some(state) = state.as_ref() {
            vfio_common.set_state(state, msi_state, msix_state)?;
        } else {
            vfio_common.parse_capabilities(bdf);
            vfio_common.initialize_legacy_interrupt()?;
        }

        Ok(vfio_common)
    }
504 
    /// In case the MSI-X table offset is not page size aligned, do some fixup
    /// to achieve it, because we don't want the MMIO RW region and the trap
    /// region to overlap each other (both must be page aligned so the table
    /// can be trapped while the rest of the BAR stays directly mapped).
    ///
    /// Returns the (possibly enlarged) region size for `bar_id`.
    fn fixup_msix_region(&mut self, bar_id: u32, region_size: u64) -> u64 {
        if let Some(msix) = self.interrupt.msix.as_mut() {
            let msix_cap = &mut msix.cap;

            // NOTE(review): this fixup assumes the table and PBA live in the
            // same BAR (table_bir == pba_bir) — confirm for devices that
            // split them.
            let (table_offset, table_size) = msix_cap.table_range();
            // No fixup needed when the table is already page aligned, or when
            // this BAR doesn't host the table at all.
            if is_page_size_aligned(table_offset) || msix_cap.table_bir() != bar_id {
                return region_size;
            }

            let (pba_offset, pba_size) = msix_cap.pba_range();
            let msix_sz = align_page_size_up(table_size + pba_size);
            // Expand region to hold RW and trap region which both page size aligned
            let size = std::cmp::max(region_size * 2, msix_sz * 2);
            // let table starts from the middle of the region
            msix_cap.table_set_offset((size / 2) as u32);
            // Keep the PBA at the same distance from the table as before.
            msix_cap.pba_set_offset((size / 2 + pba_offset - table_offset) as u32);

            size
        } else {
            // MSI-X not supported for this device
            region_size
        }
    }
531 
    /// Discover the device's BARs, allocate guest addresses for them, and
    /// register them with the emulated PCI configuration.
    ///
    /// When `resources` is provided (restore path) the saved BAR addresses,
    /// sizes and types are reused; otherwise each BAR is probed through
    /// config space (write all-ones, read back) to determine its size/type.
    ///
    /// Returns the list of BAR configurations; errors when an address cannot
    /// be allocated or a BAR cannot be registered.
    pub(crate) fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        let mut bars = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX;

        // Going through all regular regions to compute the BAR size.
        // We're not saving the BAR address to restore it, because we
        // are going to allocate a guest address for each BAR and write
        // that new address back.
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let mut region_size: u64 = 0;
            let mut region_type = PciBarRegionType::Memory32BitRegion;
            let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
            let mut flags: u32 = 0;

            let mut restored_bar_addr = None;
            if let Some(resources) = &resources {
                // Restore path: look up this BAR index in the saved resources.
                for resource in resources {
                    if let Resource::PciBar {
                        index,
                        base,
                        size,
                        type_,
                        ..
                    } = resource
                    {
                        if *index == bar_id as usize {
                            restored_bar_addr = Some(GuestAddress(*base));
                            region_size = *size;
                            region_type = PciBarRegionType::from(*type_);
                            break;
                        }
                    }
                }
                // No saved resource for this index means the BAR was unused.
                if restored_bar_addr.is_none() {
                    bar_id += 1;
                    continue;
                }
            } else {
                // Probe path: the expansion ROM BAR lives at a fixed offset,
                // regular BARs are at PCI_CONFIG_BAR_OFFSET + 4 * index.
                let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                    (PCI_ROM_EXP_BAR_INDEX * 4) as u32
                } else {
                    PCI_CONFIG_BAR_OFFSET + bar_id * 4
                };

                // First read flags
                flags = self.vfio_wrapper.read_config_dword(bar_offset);

                // Is this an IO BAR?
                let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
                } else {
                    false
                };

                // Is this a 64-bit BAR?
                let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(
                        flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                        PCI_CONFIG_MEMORY_BAR_64BIT
                    )
                } else {
                    false
                };

                if matches!(
                    flags & PCI_CONFIG_BAR_PREFETCHABLE,
                    PCI_CONFIG_BAR_PREFETCHABLE
                ) {
                    prefetchable = PciBarPrefetchable::Prefetchable
                };

                // To get size write all 1s
                self.vfio_wrapper
                    .write_config_dword(bar_offset, 0xffff_ffff);

                // And read back BAR value. The device will write zeros for bits it doesn't care about
                let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);

                if io_bar {
                    // Mask flag bits (lowest 2 for I/O bars)
                    lower &= !0b11;

                    // BAR is not enabled
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // IO BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Invert bits and add 1 to calculate size
                    region_size = (!lower + 1) as u64;
                } else if is_64bit_bar {
                    // 64 bits Memory BAR
                    region_type = PciBarRegionType::Memory64BitRegion;

                    // Query size of upper BAR of 64-bit BAR
                    let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                    self.vfio_wrapper
                        .write_config_dword(upper_offset, 0xffff_ffff);
                    let upper = self.vfio_wrapper.read_config_dword(upper_offset);

                    let mut combined_size = u64::from(upper) << 32 | u64::from(lower);

                    // Mask out flag bits (lowest 4 for memory bars)
                    combined_size &= !0b1111;

                    // BAR is not enabled
                    if combined_size == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to to find size
                    region_size = !combined_size + 1;
                } else {
                    region_type = PciBarRegionType::Memory32BitRegion;

                    // Mask out flag bits (lowest 4 for memory bars)
                    lower &= !0b1111;

                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to to find size
                    region_size = (!lower + 1) as u64;
                }
            }

            // Allocate a guest address for the BAR from the allocator matching
            // its type (port I/O, 32-bit MMIO hole, or 64-bit MMIO space).
            let bar_addr = match region_type {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "aarch64")]
                    unimplemented!();

                    // The address needs to be 4 bytes aligned.
                    #[cfg(not(target_arch = "aarch64"))]
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory32BitRegion => {
                    // BAR allocation must be naturally aligned
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_mmio_hole_addresses(
                            restored_bar_addr,
                            region_size,
                            Some(region_size),
                        )
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory64BitRegion => {
                    // We need do some fixup to keep MMIO RW region and msix cap region page size
                    // aligned.
                    region_size = self.fixup_msix_region(bar_id, region_size);
                    mmio_allocator
                        .allocate(
                            restored_bar_addr,
                            region_size,
                            // SAFETY: FFI call. Trivially safe.
                            Some(unsafe { sysconf(_SC_PAGESIZE) as GuestUsize }),
                        )
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
            };

            // We can now build our BAR configuration block.
            let bar = PciBarConfiguration::default()
                .set_index(bar_id as usize)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type)
                .set_prefetchable(prefetchable);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                self.configuration
                    .add_pci_rom_bar(&bar, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&bar)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            bars.push(bar);
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id,
                user_memory_regions: Vec::new(),
            });

            bar_id += 1;
            // A 64-bit BAR consumes two BAR slots (lower + upper dword).
            if region_type == PciBarRegionType::Memory64BitRegion {
                bar_id += 1;
            }
        }

        Ok(bars)
    }
744 
    /// Return every allocated BAR address range to the allocator it came
    /// from (port I/O, 32-bit MMIO hole, or 64-bit MMIO space).
    pub(crate) fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        for region in self.mmio_regions.iter() {
            match region.type_ {
                PciBarRegionType::IoRegion => {
                    // Port I/O only exists on x86_64; aarch64 should never
                    // have allocated such a region (see allocate_bars).
                    #[cfg(target_arch = "x86_64")]
                    allocator.free_io_addresses(region.start, region.length);
                    #[cfg(target_arch = "aarch64")]
                    error!("I/O region is not supported");
                }
                PciBarRegionType::Memory32BitRegion => {
                    allocator.free_mmio_hole_addresses(region.start, region.length);
                }
                PciBarRegionType::Memory64BitRegion => {
                    mmio_allocator.free(region.start, region.length);
                }
            }
        }
        Ok(())
    }
768 
    /// Read the raw MSI-X capability registers from config space.
    ///
    /// `cap` is the config-space offset of the capability; Message Control
    /// is at +2, the table register at +4 and the PBA register at +8.
    pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
        let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());

        let table = self.vfio_wrapper.read_config_dword((cap + 4).into());

        let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());

        MsixCap {
            msg_ctl,
            table,
            pba,
        }
    }
782 
    /// Create the emulated MSI-X machinery for the device: one interrupt
    /// source group sized to the table, plus the `MsixConfig` that emulates
    /// the table/PBA, optionally restored from snapshot `state`.
    pub(crate) fn initialize_msix(
        &mut self,
        msix_cap: MsixCap,
        cap_offset: u32,
        bdf: PciBdf,
        state: Option<MsixConfigState>,
    ) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msix_cap.table_size() as InterruptIndex,
            })
            .unwrap();

        let msix_config = MsixConfig::new(
            msix_cap.table_size(),
            interrupt_source_group.clone(),
            bdf.into(),
            state,
        )
        .unwrap();

        self.interrupt.msix = Some(VfioMsix {
            bar: msix_config,
            cap: msix_cap,
            cap_offset,
            interrupt_source_group,
        });
    }
813 
    /// Read the MSI Message Control word (capability offset + 2).
    pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
        self.vfio_wrapper.read_config_word((cap + 2).into())
    }
817 
    /// Create the emulated MSI machinery: an interrupt source group sized
    /// from the Message Control word and the `MsiConfig` wrapper, optionally
    /// restored from snapshot `state`.
    pub(crate) fn initialize_msi(
        &mut self,
        msg_ctl: u16,
        cap_offset: u32,
        state: Option<MsiConfigState>,
    ) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
            })
            .unwrap();

        let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap();

        self.interrupt.msi = Some(VfioMsi {
            cfg: msi_config,
            cap_offset,
            interrupt_source_group,
        });
    }
840 
841     pub(crate) fn get_msix_cap_idx(&self) -> Option<usize> {
842         let mut cap_next = self
843             .vfio_wrapper
844             .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);
845 
846         while cap_next != 0 {
847             let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());
848             if PciCapabilityId::from(cap_id) == PciCapabilityId::MsiX {
849                 return Some(cap_next as usize);
850             } else {
851                 cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
852             }
853         }
854 
855         None
856     }
857 
    /// Walk the standard capability list, initializing MSI/MSI-X emulation
    /// for capabilities the VFIO device actually backs with interrupts, and
    /// noting whether PCI Express and Power Management capabilities exist.
    pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) {
        let mut cap_next = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        let mut pci_express_cap_found = false;
        let mut power_management_cap_found = false;

        // A next-pointer of 0 terminates the capability list.
        while cap_next != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());

            match PciCapabilityId::from(cap_id) {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI.
                            let msg_ctl = self.parse_msi_capabilities(cap_next);
                            self.initialize_msi(msg_ctl, cap_next as u32, None);
                        }
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
                    {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI-X.
                            let msix_cap = self.parse_msix_capabilities(cap_next);
                            self.initialize_msix(msix_cap, cap_next as u32, bdf, None);
                        }
                    }
                }
                PciCapabilityId::PciExpress => pci_express_cap_found = true,
                PciCapabilityId::PowerManagement => power_management_cap_found = true,
                _ => {}
            };

            // The next-capability pointer sits one byte past the ID.
            cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
        }

        // Extended capabilities are only parsed when both PCIe and PM
        // capabilities are present.
        if pci_express_cap_found && power_management_cap_found {
            self.parse_extended_capabilities();
        }
    }
903 
    /// Walks the PCI Express extended capability list (starting at config
    /// space offset 0x100) and registers config-space read patches that hide
    /// capabilities we do not want to expose to the guest (ARI, Resizable
    /// BAR, SR-IOV) by replacing their capability ID with the null ID.
    fn parse_extended_capabilities(&mut self) {
        let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;

        loop {
            let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);

            // Extended capability header layout: bits 15:0 hold the
            // capability ID, bits 31:20 the offset of the next capability.
            let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
            let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;

            match PciExpressCapabilityId::from(cap_id) {
                PciExpressCapabilityId::AlternativeRoutingIdentificationIntepretation
                | PciExpressCapabilityId::ResizeableBar
                | PciExpressCapabilityId::SingleRootIoVirtualization => {
                    // Patch the low 16 bits (the ID field) of this dword on
                    // future reads so the guest sees a null capability; see
                    // read_config_register() for where patches are applied.
                    let reg_idx = (current_offset / 4) as usize;
                    self.patches.insert(
                        reg_idx,
                        ConfigPatch {
                            mask: 0x0000_ffff,
                            patch: PciExpressCapabilityId::NullCapability as u32,
                        },
                    );
                }
                _ => {}
            }

            if cap_next == 0 {
                break;
            }

            current_offset = cap_next.into();
        }
    }
936 
937     pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
938         if let Some(intx) = &mut self.interrupt.intx {
939             if !intx.enabled {
940                 if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
941                     self.vfio_wrapper
942                         .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
943                         .map_err(VfioPciError::EnableIntx)?;
944 
945                     intx.enabled = true;
946                 } else {
947                     return Err(VfioPciError::MissingNotifier);
948                 }
949             }
950         }
951 
952         Ok(())
953     }
954 
955     pub(crate) fn disable_intx(&mut self) {
956         if let Some(intx) = &mut self.interrupt.intx {
957             if intx.enabled {
958                 if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
959                     error!("Could not disable INTx: {}", e);
960                 } else {
961                     intx.enabled = false;
962                 }
963             }
964         }
965     }
966 
967     pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
968         if let Some(msi) = &self.interrupt.msi {
969             let mut irq_fds: Vec<EventFd> = Vec::new();
970             for i in 0..msi.cfg.num_enabled_vectors() {
971                 if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
972                     irq_fds.push(eventfd);
973                 } else {
974                     return Err(VfioPciError::MissingNotifier);
975                 }
976             }
977 
978             self.vfio_wrapper
979                 .enable_msi(irq_fds.iter().collect())
980                 .map_err(VfioPciError::EnableMsi)?;
981         }
982 
983         Ok(())
984     }
985 
986     pub(crate) fn disable_msi(&self) {
987         if let Err(e) = self.vfio_wrapper.disable_msi() {
988             error!("Could not disable MSI: {}", e);
989         }
990     }
991 
992     pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
993         if let Some(msix) = &self.interrupt.msix {
994             let mut irq_fds: Vec<EventFd> = Vec::new();
995             for i in 0..msix.bar.table_entries.len() {
996                 if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
997                     irq_fds.push(eventfd);
998                 } else {
999                     return Err(VfioPciError::MissingNotifier);
1000                 }
1001             }
1002 
1003             self.vfio_wrapper
1004                 .enable_msix(irq_fds.iter().collect())
1005                 .map_err(VfioPciError::EnableMsix)?;
1006         }
1007 
1008         Ok(())
1009     }
1010 
1011     pub(crate) fn disable_msix(&self) {
1012         if let Err(e) = self.vfio_wrapper.disable_msix() {
1013             error!("Could not disable MSI-X: {}", e);
1014         }
1015     }
1016 
1017     pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
1018         if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
1019             if irq_info.count == 0 {
1020                 // A count of 0 means the INTx IRQ is not supported, therefore
1021                 // it shouldn't be initialized.
1022                 return Ok(());
1023             }
1024         }
1025 
1026         if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
1027             self.interrupt.intx = Some(VfioIntx {
1028                 interrupt_source_group,
1029                 enabled: false,
1030             });
1031 
1032             self.enable_intx()?;
1033         }
1034 
1035         Ok(())
1036     }
1037 
1038     pub(crate) fn update_msi_capabilities(
1039         &mut self,
1040         offset: u64,
1041         data: &[u8],
1042     ) -> Result<(), VfioPciError> {
1043         match self.interrupt.update_msi(offset, data) {
1044             Some(InterruptUpdateAction::EnableMsi) => {
1045                 // Disable INTx before we can enable MSI
1046                 self.disable_intx();
1047                 self.enable_msi()?;
1048             }
1049             Some(InterruptUpdateAction::DisableMsi) => {
1050                 // Fallback onto INTx when disabling MSI
1051                 self.disable_msi();
1052                 self.enable_intx()?;
1053             }
1054             _ => {}
1055         }
1056 
1057         Ok(())
1058     }
1059 
1060     pub(crate) fn update_msix_capabilities(
1061         &mut self,
1062         offset: u64,
1063         data: &[u8],
1064     ) -> Result<(), VfioPciError> {
1065         match self.interrupt.update_msix(offset, data) {
1066             Some(InterruptUpdateAction::EnableMsix) => {
1067                 // Disable INTx before we can enable MSI-X
1068                 self.disable_intx();
1069                 self.enable_msix()?;
1070             }
1071             Some(InterruptUpdateAction::DisableMsix) => {
1072                 // Fallback onto INTx when disabling MSI-X
1073                 self.disable_msix();
1074                 self.enable_intx()?;
1075             }
1076             _ => {}
1077         }
1078 
1079         Ok(())
1080     }
1081 
1082     pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
1083         for region in self.mmio_regions.iter() {
1084             if addr >= region.start.raw_value()
1085                 && addr < region.start.unchecked_add(region.length).raw_value()
1086             {
1087                 return Some(region.clone());
1088             }
1089         }
1090         None
1091     }
1092 
1093     pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1094         let addr = base + offset;
1095         if let Some(region) = self.find_region(addr) {
1096             let offset = addr - region.start.raw_value();
1097 
1098             if self.interrupt.msix_table_accessed(region.index, offset) {
1099                 self.interrupt.msix_read_table(offset, data);
1100             } else {
1101                 self.vfio_wrapper.region_read(region.index, offset, data);
1102             }
1103         }
1104 
1105         // INTx EOI
1106         // The guest reading from the BAR potentially means the interrupt has
1107         // been received and can be acknowledged.
1108         if self.interrupt.intx_in_use() {
1109             if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
1110                 error!("Failed unmasking INTx IRQ: {}", e);
1111             }
1112         }
1113     }
1114 
1115     pub(crate) fn write_bar(
1116         &mut self,
1117         base: u64,
1118         offset: u64,
1119         data: &[u8],
1120     ) -> Option<Arc<Barrier>> {
1121         let addr = base + offset;
1122         if let Some(region) = self.find_region(addr) {
1123             let offset = addr - region.start.raw_value();
1124 
1125             // If the MSI-X table is written to, we need to update our cache.
1126             if self.interrupt.msix_table_accessed(region.index, offset) {
1127                 self.interrupt.msix_write_table(offset, data);
1128             } else {
1129                 self.vfio_wrapper.region_write(region.index, offset, data);
1130             }
1131         }
1132 
1133         // INTx EOI
1134         // The guest writing to the BAR potentially means the interrupt has
1135         // been received and can be acknowledged.
1136         if self.interrupt.intx_in_use() {
1137             if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
1138                 error!("Failed unmasking INTx IRQ: {}", e);
1139             }
1140         }
1141 
1142         None
1143     }
1144 
    /// Handles a guest write to the PCI configuration space.
    ///
    /// BAR (and expansion ROM BAR) writes are trapped into the local
    /// configuration cache and never reach the device. MSI/MSI-X capability
    /// writes update the cached interrupt state — possibly toggling VFIO
    /// interrupts — before the write is forwarded to the device.
    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        // Byte offset of the register within the config space.
        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            // Offset of the write relative to the capability structure.
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through VFIO_DEVICE_SET_IRQS ioctl, and only after we can write
        // to the device region to update the MSI Enable bit.
        self.vfio_wrapper.write_config((reg + offset) as u32, data);

        None
    }
1200 
1201     pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
1202         // When reading the BARs, we trap it and return what comes
1203         // from our local configuration space. We want the guest to
1204         // use that and not the VFIO device BARs as it does not map
1205         // with the guest address space.
1206         if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
1207             || reg_idx == PCI_ROM_EXP_BAR_INDEX
1208         {
1209             return self.configuration.read_reg(reg_idx);
1210         }
1211 
1212         if let Some(id) = self.get_msix_cap_idx() {
1213             let msix = self.interrupt.msix.as_mut().unwrap();
1214             if reg_idx * 4 == id + 4 {
1215                 return msix.cap.table;
1216             } else if reg_idx * 4 == id + 8 {
1217                 return msix.cap.pba;
1218             }
1219         }
1220 
1221         // Since we don't support passing multi-functions devices, we should
1222         // mask the multi-function bit, bit 7 of the Header Type byte on the
1223         // register 3.
1224         let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
1225             0xff7f_ffff
1226         } else {
1227             0xffff_ffff
1228         };
1229 
1230         // The config register read comes from the VFIO device itself.
1231         let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;
1232 
1233         if let Some(config_patch) = self.patches.get(&reg_idx) {
1234             value = (value & !config_patch.mask) | config_patch.patch;
1235         }
1236 
1237         value
1238     }
1239 
1240     fn state(&self) -> VfioCommonState {
1241         let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
1242             enabled: intx.enabled,
1243         });
1244 
1245         let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
1246             cap: msi.cfg.cap,
1247             cap_offset: msi.cap_offset,
1248         });
1249 
1250         let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
1251             cap: msix.cap,
1252             cap_offset: msix.cap_offset,
1253             bdf: msix.bar.devid,
1254         });
1255 
1256         VfioCommonState {
1257             intx_state,
1258             msi_state,
1259             msix_state,
1260         }
1261     }
1262 
1263     fn set_state(
1264         &mut self,
1265         state: &VfioCommonState,
1266         msi_state: Option<MsiConfigState>,
1267         msix_state: Option<MsixConfigState>,
1268     ) -> Result<(), VfioPciError> {
1269         if let (Some(intx), Some(interrupt_source_group)) =
1270             (&state.intx_state, self.legacy_interrupt_group.clone())
1271         {
1272             self.interrupt.intx = Some(VfioIntx {
1273                 interrupt_source_group,
1274                 enabled: false,
1275             });
1276 
1277             if intx.enabled {
1278                 self.enable_intx()?;
1279             }
1280         }
1281 
1282         if let Some(msi) = &state.msi_state {
1283             self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state);
1284         }
1285 
1286         if let Some(msix) = &state.msix_state {
1287             self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state);
1288         }
1289 
1290         Ok(())
1291     }
1292 }
1293 
// VfioCommon has no runtime activity of its own to pause or resume, so the
// trait's default (no-op) implementations are sufficient.
impl Pausable for VfioCommon {}
1295 
1296 impl Snapshottable for VfioCommon {
1297     fn id(&self) -> String {
1298         String::from(VFIO_COMMON_ID)
1299     }
1300 
1301     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
1302         let mut vfio_common_snapshot = Snapshot::new_from_versioned_state(&self.state())?;
1303 
1304         // Snapshot PciConfiguration
1305         vfio_common_snapshot.add_snapshot(self.configuration.id(), self.configuration.snapshot()?);
1306 
1307         // Snapshot MSI
1308         if let Some(msi) = &mut self.interrupt.msi {
1309             vfio_common_snapshot.add_snapshot(msi.cfg.id(), msi.cfg.snapshot()?);
1310         }
1311 
1312         // Snapshot MSI-X
1313         if let Some(msix) = &mut self.interrupt.msix {
1314             vfio_common_snapshot.add_snapshot(msix.bar.id(), msix.bar.snapshot()?);
1315         }
1316 
1317         Ok(vfio_common_snapshot)
1318     }
1319 }
1320 
/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
pub struct VfioPciDevice {
    // Device identifier within the VMM.
    id: String,
    // Hypervisor VM handle, used to create/remove user memory regions
    // when (un)mapping the device's MMIO regions.
    vm: Arc<dyn hypervisor::Vm>,
    // Underlying VFIO device handle.
    device: Arc<VfioDevice>,
    // VFIO container, used for DMA (un)mapping.
    container: Arc<VfioContainer>,
    // PCI/interrupt/BAR state shared with other VFIO-based device types.
    common: VfioCommon,
    // When true, dma_map()/dma_unmap() are no-ops: DMA mappings are handled
    // elsewhere (presumably by the virtual IOMMU path — see callers).
    iommu_attached: bool,
    // Closure returning the next free guest memory slot number.
    memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
}
1336 
impl VfioPciDevice {
    /// Constructs a new Vfio Pci device for the given Vfio device.
    ///
    /// The device is reset before use. Its `VfioCommon` state is either
    /// built from scratch or restored from the provided snapshot.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        id: String,
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        iommu_attached: bool,
        bdf: PciBdf,
        memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
        snapshot: Option<Snapshot>,
    ) -> Result<Self, VfioPciError> {
        let device = Arc::new(device);
        device.reset();

        let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));

        let common = VfioCommon::new(
            msi_interrupt_manager,
            legacy_interrupt_group,
            Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
            &PciVfioSubclass::VfioSubclass,
            bdf,
            vm_migration::snapshot_from_id(snapshot.as_ref(), VFIO_COMMON_ID),
        )?;

        let vfio_pci_device = VfioPciDevice {
            id,
            vm: vm.clone(),
            device,
            container,
            common,
            iommu_attached,
            memory_slot,
        };

        Ok(vfio_pci_device)
    }

    /// Returns true when the device is attached to a (virtual) IOMMU, in
    /// which case dma_map()/dma_unmap() below are no-ops.
    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }

    /// Computes the list of mmap'able sparse areas for an MMIO region.
    ///
    /// Precedence:
    /// 1. A `SparseMmap` capability from VFIO is used as-is.
    /// 2. An `MsixMappable` capability means the region may be mapped, but
    ///    the MSI-X table and PBA ranges are punched out so MMIO accesses to
    ///    them keep trapping.
    /// 3. Otherwise, a single area covering the whole region is returned.
    fn generate_sparse_areas(
        caps: &[VfioRegionInfoCap],
        region_index: u32,
        region_start: u64,
        region_size: u64,
        vfio_msix: Option<&VfioMsix>,
    ) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
        for cap in caps {
            match cap {
                VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
                VfioRegionInfoCap::MsixMappable => {
                    if !is_4k_aligned(region_start) {
                        error!(
                            "Region start address 0x{:x} must be at least aligned on 4KiB",
                            region_start
                        );
                        return Err(VfioPciError::RegionAlignment);
                    }
                    if !is_4k_multiple(region_size) {
                        error!(
                            "Region size 0x{:x} must be at least a multiple of 4KiB",
                            region_size
                        );
                        return Err(VfioPciError::RegionSize);
                    }

                    // In case the region contains the MSI-X vectors table or
                    // the MSI-X PBA table, we must calculate the subregions
                    // around them, leading to a list of sparse areas.
                    // We want to make sure we will still trap MMIO accesses
                    // to these MSI-X specific ranges. If these regions don't
                    // align with pagesize, we can achieve it by enlarging
                    // their range.
                    //
                    // Using a BTreeMap as the list provided through the iterator is sorted
                    // by key. This ensures proper split of the whole region.
                    let mut inter_ranges = BTreeMap::new();
                    if let Some(msix) = vfio_msix {
                        if region_index == msix.cap.table_bir() {
                            let (offset, size) = msix.cap.table_range();
                            let offset = align_page_size_down(offset);
                            let size = align_page_size_up(size);
                            inter_ranges.insert(offset, size);
                        }
                        if region_index == msix.cap.pba_bir() {
                            let (offset, size) = msix.cap.pba_range();
                            let offset = align_page_size_down(offset);
                            let size = align_page_size_up(size);
                            inter_ranges.insert(offset, size);
                        }
                    }

                    // Emit the gaps between (and before) the trapped ranges.
                    let mut sparse_areas = Vec::new();
                    let mut current_offset = 0;
                    for (range_offset, range_size) in inter_ranges {
                        if range_offset > current_offset {
                            sparse_areas.push(VfioRegionSparseMmapArea {
                                offset: current_offset,
                                size: range_offset - current_offset,
                            });
                        }
                        current_offset = align_page_size_down(range_offset + range_size);
                    }

                    // Emit the tail after the last trapped range.
                    if region_size > current_offset {
                        sparse_areas.push(VfioRegionSparseMmapArea {
                            offset: current_offset,
                            size: region_size - current_offset,
                        });
                    }

                    return Ok(sparse_areas);
                }
                _ => {}
            }
        }

        // In case no relevant capabilities have been found, create a single
        // sparse area corresponding to the entire MMIO region.
        Ok(vec![VfioRegionSparseMmapArea {
            offset: 0,
            size: region_size,
        }])
    }

    /// Map MMIO regions into the guest, and avoid VM exits when the guest tries
    /// to reach those regions.
    ///
    /// Regions (or sparse areas thereof) that VFIO flags as mmap'able are
    /// mmap'ed from the device fd and registered with the hypervisor as user
    /// memory regions, so guest accesses bypass the VMM. Regions holding the
    /// MSI-X table/PBA are skipped unless VFIO advertises MSIX_MAPPABLE.
    pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
        let fd = self.device.as_raw_fd();

        for region in self.common.mmio_regions.iter_mut() {
            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                // Mirror the region's read/write flags in the mmap protection.
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }

                // Retrieve the list of capabilities found on the region
                let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
                    self.device.get_region_caps(region.index)
                } else {
                    Vec::new()
                };

                // Don't try to mmap the region if it contains MSI-X table or
                // MSI-X PBA subregion, and if we couldn't find MSIX_MAPPABLE
                // in the list of supported capabilities.
                if let Some(msix) = self.common.interrupt.msix.as_ref() {
                    if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
                        && !caps.contains(&VfioRegionInfoCap::MsixMappable)
                    {
                        continue;
                    }
                }

                let mmap_size = self.device.get_region_size(region.index);
                let mmap_offset = self.device.get_region_offset(region.index);

                let sparse_areas = Self::generate_sparse_areas(
                    &caps,
                    region.index,
                    region.start.0,
                    mmap_size,
                    self.common.interrupt.msix.as_ref(),
                )?;

                for area in sparse_areas.iter() {
                    // SAFETY: FFI call with correct arguments
                    let host_addr = unsafe {
                        libc::mmap(
                            null_mut(),
                            area.size as usize,
                            prot,
                            libc::MAP_SHARED,
                            fd,
                            mmap_offset as libc::off_t + area.offset as libc::off_t,
                        )
                    };

                    if host_addr == libc::MAP_FAILED {
                        error!(
                            "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
                            area.offset,
                            area.size,
                            std::io::Error::last_os_error()
                        );
                        return Err(VfioPciError::MmapArea);
                    }

                    // NOTE(review): this bails out with success, leaving the
                    // remaining areas/regions unmapped and the mapping just
                    // created neither registered nor munmap'ed — confirm this
                    // best-effort behavior is intended.
                    if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) {
                        warn!(
                            "Could not mmap sparse area that is not page size aligned (offset = 0x{:x}, size = 0x{:x})",
                            area.offset,
                            area.size,
                            );
                        return Ok(());
                    }

                    // Record the mapping so unmap_mmio_regions() can undo it.
                    let user_memory_region = UserMemoryRegion {
                        slot: (self.memory_slot)(),
                        start: region.start.0 + area.offset,
                        size: area.size,
                        host_addr: host_addr as u64,
                    };

                    region.user_memory_regions.push(user_memory_region);

                    let mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(mem_region)
                        .map_err(VfioPciError::CreateUserMemoryRegion)?;
                }
            }
        }

        Ok(())
    }

    /// Undoes map_mmio_regions(): removes every user memory region from the
    /// hypervisor and munmaps the corresponding host mapping. Errors are
    /// logged but do not stop the teardown.
    pub fn unmap_mmio_regions(&mut self) {
        for region in self.common.mmio_regions.iter() {
            for user_memory_region in region.user_memory_regions.iter() {
                // Remove region
                let r = self.vm.make_user_memory_region(
                    user_memory_region.slot,
                    user_memory_region.start,
                    user_memory_region.size,
                    user_memory_region.host_addr,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                // SAFETY: FFI call with correct arguments
                let ret = unsafe {
                    libc::munmap(
                        user_memory_region.host_addr as *mut libc::c_void,
                        user_memory_region.size as usize,
                    )
                };
                if ret != 0 {
                    error!(
                        "Could not unmap region {}, error:{}",
                        region.index,
                        io::Error::last_os_error()
                    );
                }
            }
        }
    }

    /// Maps `size` bytes at host address `user_addr` to guest IOVA `iova`
    /// through the VFIO container. No-op when the device is attached to an
    /// IOMMU (the mapping is handled through the IOMMU path instead).
    pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_map(iova, size, user_addr)
                .map_err(VfioPciError::DmaMap)?;
        }

        Ok(())
    }

    /// Removes the DMA mapping for `size` bytes at guest IOVA `iova`. No-op
    /// when the device is attached to an IOMMU.
    pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_unmap(iova, size)
                .map_err(VfioPciError::DmaUnmap)?;
        }

        Ok(())
    }

    /// Returns a copy of the device's MMIO regions.
    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.common.mmio_regions.clone()
    }
}
1637 
1638 impl Drop for VfioPciDevice {
1639     fn drop(&mut self) {
1640         self.unmap_mmio_regions();
1641 
1642         if let Some(msix) = &self.common.interrupt.msix {
1643             if msix.bar.enabled() {
1644                 self.common.disable_msix();
1645             }
1646         }
1647 
1648         if let Some(msi) = &self.common.interrupt.msi {
1649             if msi.cfg.enabled() {
1650                 self.common.disable_msi()
1651             }
1652         }
1653 
1654         if self.common.interrupt.intx_in_use() {
1655             self.common.disable_intx();
1656         }
1657     }
1658 }
1659 
1660 impl BusDevice for VfioPciDevice {
1661     fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1662         self.read_bar(base, offset, data)
1663     }
1664 
1665     fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
1666         self.write_bar(base, offset, data)
1667     }
1668 }
1669 
// Offset of the first BAR register (BAR0) in the PCI configuration space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Offset of the capabilities pointer register in the PCI configuration space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// Offset at which the PCI Express extended capability list starts.
const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
// Bit 0 of a BAR: set when the BAR describes an I/O region rather than memory.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// Bit 2 of a memory BAR: set when the BAR is 64 bits wide.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// Bit 3 of a memory BAR: set when the region is prefetchable.
const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
// Size in bytes of one PCI configuration register (a 32-bit dword).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs exposed by a (type 0 header) PCI device.
const BAR_NUMS: usize = 6;
// Configuration space register index holding the Header Type field.
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// Configuration space register index of BAR0.
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// Configuration space register index of the expansion ROM BAR.
const PCI_ROM_EXP_BAR_INDEX: usize = 12;
1692 
impl PciDevice for VfioPciDevice {
    /// Allocate the device BARs; delegated to the shared `VfioCommon` logic.
    fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        self.common
            .allocate_bars(allocator, mmio_allocator, resources)
    }

    /// Release the BAR ranges previously obtained from the allocators.
    fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        self.common.free_bars(allocator, mmio_allocator)
    }

    /// Handle a guest write to a PCI configuration register.
    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        self.common.write_config_register(reg_idx, offset, data)
    }

    /// Handle a guest read of a PCI configuration register.
    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        self.common.read_config_register(reg_idx)
    }

    /// Detect whether a config space write relocates one of the BARs.
    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.common
            .configuration
            .detect_bar_reprogramming(reg_idx, data)
    }

    /// Read from one of the device's BAR regions.
    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.common.read_bar(base, offset, data)
    }

    /// Write to one of the device's BAR regions.
    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.common.write_bar(base, offset, data)
    }

    /// Relocate the BAR previously based at guest address `old_base` to
    /// `new_base`.
    ///
    /// For each host mmap backing the moved BAR, the matching guest memory
    /// region is removed from the hypervisor, its guest start address is
    /// shifted by the BAR displacement, and a region is re-created at the new
    /// address. The host-side mapping (slot, size, host_addr) is reused as-is.
    fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
        for region in self.common.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                for user_memory_region in region.user_memory_regions.iter_mut() {
                    // Remove old region
                    let old_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .remove_user_memory_region(old_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

                    // Update the user memory region with the correct start address.
                    if new_base > old_base {
                        user_memory_region.start += new_base - old_base;
                    } else {
                        user_memory_region.start -= old_base - new_base;
                    }

                    // Insert new region
                    let new_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(new_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
                }
            }
        }

        Ok(())
    }

    fn as_any(&mut self) -> &mut dyn Any {
        self
    }

    /// Unique device identifier, as assigned at construction time.
    fn id(&self) -> Option<String> {
        Some(self.id.clone())
    }
}
1798 
// Rely on the default `Pausable` implementation: no device-specific work is
// done here on pause/resume.
impl Pausable for VfioPciDevice {}
1800 
1801 impl Snapshottable for VfioPciDevice {
1802     fn id(&self) -> String {
1803         self.id.clone()
1804     }
1805 
1806     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
1807         let mut vfio_pci_dev_snapshot = Snapshot::default();
1808 
1809         // Snapshot VfioCommon
1810         vfio_pci_dev_snapshot.add_snapshot(self.common.id(), self.common.snapshot()?);
1811 
1812         Ok(vfio_pci_dev_snapshot)
1813     }
1814 }
// Rely on the default `Transportable` and `Migratable` implementations; the
// device adds no behavior beyond what `Snapshottable` provides above.
impl Transportable for VfioPciDevice {}
impl Migratable for VfioPciDevice {}
1817