// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//

use crate::{
    msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
    PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
    PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
    PciHeaderType, PciSubclass, MSIX_TABLE_ENTRY_SIZE,
};
use anyhow::anyhow;
use byteorder::{ByteOrder, LittleEndian};
use hypervisor::HypervisorVmError;
use std::any::Any;
use std::collections::{BTreeMap, HashMap};
use std::io;
use std::os::unix::io::AsRawFd;
use std::ptr::null_mut;
use std::sync::{Arc, Barrier, Mutex};
use thiserror::Error;
use versionize::{VersionMap, Versionize, VersionizeResult};
use versionize_derive::Versionize;
use vfio_bindings::bindings::vfio::*;
use vfio_ioctls::{
    VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
};
use vm_allocator::{AddressAllocator, SystemAllocator};
use vm_device::interrupt::{
    InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
};
use vm_device::{BusDevice, Resource};
use vm_memory::{Address, GuestAddress, GuestUsize};
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
};
use vmm_sys_util::eventfd::EventFd;

#[derive(Debug, Error)]
pub enum VfioPciError {
    #[error("Failed to create user memory region: {0}")]
    CreateUserMemoryRegion(#[source] HypervisorVmError),
    #[error("Failed to DMA map: {0}")]
    DmaMap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to DMA unmap: {0}")]
    DmaUnmap(#[source] vfio_ioctls::VfioError),
    #[error("Failed to enable INTx: {0}")]
    EnableIntx(#[source] VfioError),
    #[error("Failed to enable MSI: {0}")]
    EnableMsi(#[source] VfioError),
    #[error("Failed to enable MSI-X: {0}")]
    EnableMsix(#[source] VfioError),
    #[error("Failed to mmap the area")]
    MmapArea,
    #[error("Failed to find the notifier's eventfd")]
    MissingNotifier,
    #[error("Invalid region alignment")]
    RegionAlignment,
    #[error("Invalid region size")]
    RegionSize,
}

#[derive(Copy, Clone)]
enum PciVfioSubclass {
    VfioSubclass = 0xff,
}

impl PciSubclass for PciVfioSubclass {
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}

enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}

#[derive(Versionize)]
struct IntxState {
    enabled: bool,
}

pub(crate) struct VfioIntx {
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    enabled: bool,
}

#[derive(Versionize)]
struct MsiState {
    cap: MsiCap,
    cap_offset: u32,
}

pub(crate) struct VfioMsi {
    pub(crate) cfg: MsiConfig,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsi {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.cfg.enabled();

        self.cfg.update(offset, data);

        let new_enabled = self.cfg.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsi);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsi);
        }

        None
    }
}

#[derive(Versionize)]
struct MsixState {
    cap: MsixCap,
    cap_offset: u32,
    bdf: u32,
}

pub(crate) struct VfioMsix {
    pub(crate) bar: MsixConfig,
    cap: MsixCap,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VfioMsix {
    fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        let old_enabled = self.bar.enabled();

        // Update "Message Control" word
        if offset == 2 && data.len() == 2 {
            self.bar.set_msg_ctl(LittleEndian::read_u16(data));
        }

        let new_enabled = self.bar.enabled();

        if !old_enabled && new_enabled {
            return Some(InterruptUpdateAction::EnableMsix);
        }

        if old_enabled && !new_enabled {
            return Some(InterruptUpdateAction::DisableMsix);
        }

        None
    }

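    // Illustrative example (hypothetical values): with table_bir 2,
    // table_offset 0x0 and table_size 0x100 entries of 16 bytes each,
    // offsets [0x0, 0x1000) within BAR 2 hit the MSI-X table.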
    fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        let table_offset: u64 = u64::from(self.cap.table_offset());
        let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
        let table_bir: u32 = self.cap.table_bir();

        bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
    }
}

pub(crate) struct Interrupt {
    pub(crate) intx: Option<VfioIntx>,
    pub(crate) msi: Option<VfioMsi>,
    pub(crate) msix: Option<VfioMsix>,
}

impl Interrupt {
    fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msi) = &mut self.msi {
            let action = msi.update(offset, data);
            return action;
        }

        None
    }

    fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
        if let Some(ref mut msix) = &mut self.msix {
            let action = msix.update(offset, data);
            return action;
        }

        None
    }

    fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
        if let Some(msi) = &self.msi {
            if offset >= u64::from(msi.cap_offset)
                && offset < u64::from(msi.cap_offset) + msi.cfg.size()
            {
                return Some((
                    PciCapabilityId::MessageSignalledInterrupts,
                    u64::from(msi.cap_offset),
                ));
            }
        }

        if let Some(msix) = &self.msix {
            if offset == u64::from(msix.cap_offset) {
                return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
            }
        }

        None
    }

    fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
        if let Some(msix) = &self.msix {
            return msix.table_accessed(bar_index, offset);
        }

        false
    }

    fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
        if let Some(ref mut msix) = &mut self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.write_table(offset, data)
        }
    }

    fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
        if let Some(msix) = &self.msix {
            let offset = offset - u64::from(msix.cap.table_offset());
            msix.bar.read_table(offset, data)
        }
    }

    pub(crate) fn intx_in_use(&self) -> bool {
        if let Some(intx) = &self.intx {
            return intx.enabled;
        }

        false
    }
}

#[derive(Copy, Clone)]
pub struct UserMemoryRegion {
    pub slot: u32,
    pub start: u64,
    pub size: u64,
    pub host_addr: u64,
}

#[derive(Clone)]
pub struct MmioRegion {
    pub start: GuestAddress,
    pub length: GuestUsize,
    pub(crate) type_: PciBarRegionType,
    pub(crate) index: u32,
    pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
}

#[derive(Debug, Error)]
pub enum VfioError {
    #[error("Kernel VFIO error: {0}")]
    KernelVfio(#[source] vfio_ioctls::VfioError),
    #[error("VFIO user error: {0}")]
    VfioUser(#[source] vfio_user::Error),
}

pub(crate) trait Vfio: Send + Sync {
    fn read_config_byte(&self, offset: u32) -> u8 {
        let mut data: [u8; 1] = [0];
        self.read_config(offset, &mut data);
        data[0]
    }

    fn read_config_word(&self, offset: u32) -> u16 {
        let mut data: [u8; 2] = [0, 0];
        self.read_config(offset, &mut data);
        u16::from_le_bytes(data)
    }

    fn read_config_dword(&self, offset: u32) -> u32 {
        let mut data: [u8; 4] = [0, 0, 0, 0];
        self.read_config(offset, &mut data);
        u32::from_le_bytes(data)
    }

    fn write_config_dword(&self, offset: u32, buf: u32) {
        let data: [u8; 4] = buf.to_le_bytes();
        self.write_config(offset, &data)
    }

    fn read_config(&self, offset: u32, data: &mut [u8]) {
        self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
    }

    fn write_config(&self, offset: u32, data: &[u8]) {
        self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
    }

    fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
    }

    fn disable_msi(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
    }

    fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
    }

    fn disable_msix(&self) -> Result<(), VfioError> {
        self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
    }

    fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
        unimplemented!()
    }

    fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
        unimplemented!()
    }

    fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
        unimplemented!()
    }

    fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }

    fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
        unimplemented!()
    }
}

struct VfioDeviceWrapper {
    device: Arc<VfioDevice>,
}

impl VfioDeviceWrapper {
    fn new(device: Arc<VfioDevice>) -> Self {
        Self { device }
    }
}

impl Vfio for VfioDeviceWrapper {
    fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
        self.device.region_read(index, data, offset)
    }

    fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
        self.device.region_write(index, data, offset)
    }

    fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
        self.device.get_irq_info(irq_index).copied()
    }

    fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.device
            .enable_irq(irq_index, event_fds)
            .map_err(VfioError::KernelVfio)
    }

    fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .disable_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }

    fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .unmask_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }
}

#[derive(Versionize)]
struct VfioCommonState {
    intx_state: Option<IntxState>,
    msi_state: Option<MsiState>,
    msix_state: Option<MsixState>,
}

impl VersionMapped for VfioCommonState {}

pub(crate) struct ConfigPatch {
    mask: u32,
    patch: u32,
}

pub(crate) struct VfioCommon {
    pub(crate) configuration: PciConfiguration,
    pub(crate) mmio_regions: Vec<MmioRegion>,
    pub(crate) interrupt: Interrupt,
    pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
    pub(crate) vfio_wrapper: Arc<dyn Vfio>,
    pub(crate) patches: HashMap<usize, ConfigPatch>,
}

impl VfioCommon {
    pub(crate) fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        let mut bars = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX;

        // Going through all regular regions to compute the BAR size.
        // We're not saving the BAR address to restore it, because we
        // are going to allocate a guest address for each BAR and write
        // that new address back.
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let mut region_size: u64 = 0;
            let mut region_type = PciBarRegionType::Memory32BitRegion;
            let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
            let mut flags: u32 = 0;

            let mut restored_bar_addr = None;
            if let Some(resources) = &resources {
                for resource in resources {
                    if let Resource::PciBar {
                        index,
                        base,
                        size,
                        type_,
                        ..
                    } = resource
                    {
                        if *index == bar_id as usize {
                            restored_bar_addr = Some(GuestAddress(*base));
                            region_size = *size;
                            region_type = PciBarRegionType::from(*type_);
                            break;
                        }
                    }
                }
                if restored_bar_addr.is_none() {
                    bar_id += 1;
                    continue;
                }
            } else {
                let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                    (PCI_ROM_EXP_BAR_INDEX * 4) as u32
                } else {
                    PCI_CONFIG_BAR_OFFSET + bar_id * 4
                };

                // First read flags
                flags = self.vfio_wrapper.read_config_dword(bar_offset);

                // Is this an IO BAR?
                let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
                } else {
                    false
                };

                // Is this a 64-bit BAR?
                let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(
                        flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                        PCI_CONFIG_MEMORY_BAR_64BIT
                    )
                } else {
                    false
                };

                if matches!(
                    flags & PCI_CONFIG_BAR_PREFETCHABLE,
                    PCI_CONFIG_BAR_PREFETCHABLE
                ) {
                    prefetchable = PciBarPrefetchable::Prefetchable
                };

                // To get the size, write all 1s
                self.vfio_wrapper
                    .write_config_dword(bar_offset, 0xffff_ffff);

                // And read back the BAR value. The device will write zeros
                // for the bits it doesn't care about.
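                // Illustrative example (hypothetical device): a 4 KiB 32-bit
                // memory BAR reads back as 0xffff_f008 (prefetchable flag set);
                // masking the low four flag bits gives 0xffff_f000, and
                // !0xffff_f000 + 1 = 0x1000 (4 KiB).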
                let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);

                if io_bar {
                    // Mask flag bits (lowest 2 for I/O bars)
                    lower &= !0b11;

                    // BAR is not enabled
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // IO BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Invert bits and add 1 to calculate size
                    region_size = (!lower + 1) as u64;
                } else if is_64bit_bar {
                    // 64 bits Memory BAR
                    region_type = PciBarRegionType::Memory64BitRegion;

                    // Query size of upper BAR of 64-bit BAR
                    let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                    self.vfio_wrapper
                        .write_config_dword(upper_offset, 0xffff_ffff);
                    let upper = self.vfio_wrapper.read_config_dword(upper_offset);

                    let mut combined_size = u64::from(upper) << 32 | u64::from(lower);

                    // Mask out flag bits (lowest 4 for memory bars)
                    combined_size &= !0b1111;

                    // BAR is not enabled
                    if combined_size == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find the size
                    region_size = !combined_size + 1;
                } else {
                    region_type = PciBarRegionType::Memory32BitRegion;

                    // Mask out flag bits (lowest 4 for memory bars)
                    lower &= !0b1111;

                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to find the size
                    region_size = (!lower + 1) as u64;
                }
            }

            let bar_addr = match region_type {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "aarch64")]
                    unimplemented!();

                    // The address needs to be 4 bytes aligned.
                    #[cfg(not(target_arch = "aarch64"))]
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory32BitRegion => {
                    // BAR allocation must be naturally aligned
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_mmio_hole_addresses(
                            restored_bar_addr,
                            region_size,
                            Some(region_size),
                        )
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory64BitRegion => {
                    // BAR allocation must be naturally aligned
                    mmio_allocator
                        .allocate(restored_bar_addr, region_size, Some(region_size))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
            };

            // We can now build our BAR configuration block.
            let bar = PciBarConfiguration::default()
                .set_index(bar_id as usize)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type)
                .set_prefetchable(prefetchable);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                self.configuration
                    .add_pci_rom_bar(&bar, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&bar)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            bars.push(bar);
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id,
                user_memory_regions: Vec::new(),
            });

            bar_id += 1;
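            // A 64-bit memory BAR consumes two BAR slots (low and high
            // dwords), so skip the next BAR index as well.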
            if region_type == PciBarRegionType::Memory64BitRegion {
                bar_id += 1;
            }
        }

        Ok(bars)
    }

    pub(crate) fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        for region in self.mmio_regions.iter() {
            match region.type_ {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "x86_64")]
                    allocator.free_io_addresses(region.start, region.length);
                    #[cfg(target_arch = "aarch64")]
                    error!("I/O region is not supported");
                }
                PciBarRegionType::Memory32BitRegion => {
                    allocator.free_mmio_hole_addresses(region.start, region.length);
                }
                PciBarRegionType::Memory64BitRegion => {
                    mmio_allocator.free(region.start, region.length);
                }
            }
        }
        Ok(())
    }

    pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
        let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());

        let table = self.vfio_wrapper.read_config_dword((cap + 4).into());

        let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());

        MsixCap {
            msg_ctl,
            table,
            pba,
        }
    }

    pub(crate) fn initialize_msix(&mut self, msix_cap: MsixCap, cap_offset: u32, bdf: PciBdf) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msix_cap.table_size() as InterruptIndex,
            })
            .unwrap();

        let msix_config = MsixConfig::new(
            msix_cap.table_size(),
            interrupt_source_group.clone(),
            bdf.into(),
            None,
        )
        .unwrap();

        self.interrupt.msix = Some(VfioMsix {
            bar: msix_config,
            cap: msix_cap,
            cap_offset,
            interrupt_source_group,
        });
    }

    pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
        self.vfio_wrapper.read_config_word((cap + 2).into())
    }

    pub(crate) fn initialize_msi(&mut self, msg_ctl: u16, cap_offset: u32) {
        let interrupt_source_group = self
            .msi_interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
            })
            .unwrap();

        let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone());

        self.interrupt.msi = Some(VfioMsi {
            cfg: msi_config,
            cap_offset,
            interrupt_source_group,
        });
    }

    pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) {
        let mut cap_next = self
            .vfio_wrapper
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        let mut pci_express_cap_found = false;
        let mut power_management_cap_found = false;

        while cap_next != 0 {
            let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());

            match PciCapabilityId::from(cap_id) {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI.
                            let msg_ctl = self.parse_msi_capabilities(cap_next);
                            self.initialize_msi(msg_ctl, cap_next as u32);
                        }
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
                    {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI-X.
                            let msix_cap = self.parse_msix_capabilities(cap_next);
                            self.initialize_msix(msix_cap, cap_next as u32, bdf);
                        }
                    }
                }
                PciCapabilityId::PciExpress => pci_express_cap_found = true,
                PciCapabilityId::PowerManagement => power_management_cap_found = true,
                _ => {}
            };

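            // The byte right after the capability ID holds the offset of the
            // next capability in the chain (0x00 terminates the list).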
            cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
        }

        if pci_express_cap_found && power_management_cap_found {
            self.parse_extended_capabilities();
        }
    }

    fn parse_extended_capabilities(&mut self) {
        let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;

        loop {
            let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);

            let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
            let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;
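            // Illustrative decode (hypothetical header value): 0x1501_000b
            // yields cap_id 0x000b and cap_next 0x150; bits 16..19 carry the
            // capability version, which is ignored here.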

            match PciExpressCapabilityId::from(cap_id) {
                PciExpressCapabilityId::AlternativeRoutingIdentificationIntepretation
                | PciExpressCapabilityId::ResizeableBar
                | PciExpressCapabilityId::SingleRootIoVirtualization => {
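                    // Hide the capability from the guest: patch its ID to the
                    // null capability while leaving the next pointer intact,
                    // so the capability chain stays walkable.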
                    let reg_idx = (current_offset / 4) as usize;
                    self.patches.insert(
                        reg_idx,
                        ConfigPatch {
                            mask: 0x0000_ffff,
                            patch: PciExpressCapabilityId::NullCapability as u32,
                        },
                    );
                }
                _ => {}
            }

            if cap_next == 0 {
                break;
            }

            current_offset = cap_next.into();
        }
    }

    pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
        if let Some(intx) = &mut self.interrupt.intx {
            if !intx.enabled {
                if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
                    self.vfio_wrapper
                        .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
                        .map_err(VfioPciError::EnableIntx)?;

                    intx.enabled = true;
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }
        }

        Ok(())
    }

    pub(crate) fn disable_intx(&mut self) {
        if let Some(intx) = &mut self.interrupt.intx {
            if intx.enabled {
                if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                    error!("Could not disable INTx: {}", e);
                } else {
                    intx.enabled = false;
                }
            }
        }
    }

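    // Collect one notifier eventfd per enabled vector and hand them all to
    // VFIO, which then signals vector i through the i-th fd.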
    pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
        if let Some(msi) = &self.interrupt.msi {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msi.cfg.num_enabled_vectors() {
                if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msi(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsi)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msi(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msi() {
            error!("Could not disable MSI: {}", e);
        }
    }

    pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
        if let Some(msix) = &self.interrupt.msix {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msix.bar.table_entries.len() {
                if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.vfio_wrapper
                .enable_msix(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsix)?;
        }

        Ok(())
    }

    pub(crate) fn disable_msix(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msix() {
            error!("Could not disable MSI-X: {}", e);
        }
    }

    pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
        if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
            if irq_info.count == 0 {
                // A count of 0 means the INTx IRQ is not supported, therefore
                // it shouldn't be initialized.
                return Ok(());
            }
        }

        if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            self.enable_intx()?;
        }

        Ok(())
    }

    pub(crate) fn update_msi_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msi(offset, data) {
            Some(InterruptUpdateAction::EnableMsi) => {
                // Disable INTx before we can enable MSI
                self.disable_intx();
                self.enable_msi()?;
            }
            Some(InterruptUpdateAction::DisableMsi) => {
                // Fall back onto INTx when disabling MSI
                self.disable_msi();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn update_msix_capabilities(
        &mut self,
        offset: u64,
        data: &[u8],
    ) -> Result<(), VfioPciError> {
        match self.interrupt.update_msix(offset, data) {
            Some(InterruptUpdateAction::EnableMsix) => {
                // Disable INTx before we can enable MSI-X
                self.disable_intx();
                self.enable_msix()?;
            }
            Some(InterruptUpdateAction::DisableMsix) => {
                // Fall back onto INTx when disabling MSI-X
                self.disable_msix();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
        for region in self.mmio_regions.iter() {
            if addr >= region.start.raw_value()
                && addr < region.start.unchecked_add(region.length).raw_value()
            {
                return Some(region.clone());
            }
        }
        None
    }

    pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_read_table(offset, data);
            } else {
                self.vfio_wrapper.region_read(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest reading from the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }
    }

    pub(crate) fn write_bar(
        &mut self,
        base: u64,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        let addr = base + offset;
        if let Some(region) = self.find_region(addr) {
            let offset = addr - region.start.raw_value();

            // If the MSI-X table is written to, we need to update our cache.
            if self.interrupt.msix_table_accessed(region.index, offset) {
                self.interrupt.msix_write_table(offset, data);
            } else {
                self.vfio_wrapper.region_write(region.index, offset, data);
            }
        }

        // INTx EOI
        // The guest writing to the BAR potentially means the interrupt has
        // been received and can be acknowledged.
        if self.interrupt.intx_in_use() {
            if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("Failed unmasking INTx IRQ: {}", e);
            }
        }

        None
    }

    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming the
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through the VFIO_DEVICE_SET_IRQS ioctl, and only then can we
        // write to the device region to update the MSI Enable bit.
        self.vfio_wrapper.write_config((reg + offset) as u32, data);

        None
    }

    pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        // When reading the BARs, we trap it and return what comes
        // from our local configuration space. We want the guest to
        // use that and not the VFIO device BARs, as they do not map
        // to the guest address space.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            return self.configuration.read_reg(reg_idx);
        }

        // Since we don't support passing multi-function devices, we
        // mask the multi-function bit: bit 7 of the Header Type byte
        // in register 3, i.e. bit 23 of the register value.
        let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
            0xff7f_ffff
        } else {
            0xffff_ffff
        };

        // The config register read comes from the VFIO device itself.
        let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;

        if let Some(config_patch) = self.patches.get(&reg_idx) {
            value = (value & !config_patch.mask) | config_patch.patch;
        }

        value
    }

    fn state(&self) -> VfioCommonState {
        let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
            enabled: intx.enabled,
        });

        let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
            cap: msi.cfg.cap,
            cap_offset: msi.cap_offset,
        });

        let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
            cap: msix.cap,
            cap_offset: msix.cap_offset,
            bdf: msix.bar.devid,
        });

        VfioCommonState {
            intx_state,
            msi_state,
            msix_state,
        }
    }

    fn set_state(&mut self, state: &VfioCommonState) -> Result<(), VfioPciError> {
        if let (Some(intx), Some(interrupt_source_group)) =
            (&state.intx_state, self.legacy_interrupt_group.clone())
        {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            if intx.enabled {
                self.enable_intx()?;
            }
        }

        if let Some(msi) = &state.msi_state {
            self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset);
        }

        if let Some(msix) = &state.msix_state {
            self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into());
        }

        Ok(())
    }
}

impl Pausable for VfioCommon {}

impl Snapshottable for VfioCommon {
    fn id(&self) -> String {
        String::from("vfio_common")
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_common_snapshot =
            Snapshot::new_from_versioned_state(&self.id(), &self.state())?;

        // Snapshot PciConfiguration
        vfio_common_snapshot.add_snapshot(self.configuration.snapshot()?);

        // Snapshot MSI
        if let Some(msi) = &mut self.interrupt.msi {
            vfio_common_snapshot.add_snapshot(msi.cfg.snapshot()?);
        }

        // Snapshot MSI-X
        if let Some(msix) = &mut self.interrupt.msix {
            vfio_common_snapshot.add_snapshot(msix.bar.snapshot()?);
        }

        Ok(vfio_common_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        if let Some(vfio_common_section) = snapshot
            .snapshot_data
            .get(&format!("{}-section", self.id()))
        {
            // It has to be invoked first as we want Interrupt to be initialized
            // correctly before we try to restore MSI and MSI-X configurations.
            self.set_state(&vfio_common_section.to_versioned_state()?)
                .map_err(|e| {
                    MigratableError::Restore(anyhow!("Could not restore VFIO_COMMON state {:?}", e))
                })?;

            // Restore PciConfiguration
            if let Some(pci_config_snapshot) = snapshot.snapshots.get(&self.configuration.id()) {
                self.configuration.restore(*pci_config_snapshot.clone())?;
            }

            // Restore MSI
            if let Some(msi) = &mut self.interrupt.msi {
                if let Some(msi_snapshot) = snapshot.snapshots.get(&msi.cfg.id()) {
                    msi.cfg.restore(*msi_snapshot.clone())?;
                }
                if msi.cfg.enabled() {
                    self.enable_msi().unwrap();
                }
            }

            // Restore MSI-X
            if let Some(msix) = &mut self.interrupt.msix {
                if let Some(msix_snapshot) = snapshot.snapshots.get(&msix.bar.id()) {
                    msix.bar.restore(*msix_snapshot.clone())?;
                }
                if msix.bar.enabled() {
                    self.enable_msix().unwrap();
                }
            }

            return Ok(());
        }

        Err(MigratableError::Restore(anyhow!(
            "Could not find VFIO_COMMON snapshot section"
        )))
    }
}

/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
pub struct VfioPciDevice {
    id: String,
    vm: Arc<dyn hypervisor::Vm>,
    device: Arc<VfioDevice>,
    container: Arc<VfioContainer>,
    common: VfioCommon,
    iommu_attached: bool,
    memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
}

impl VfioPciDevice {
    /// Constructs a new Vfio Pci device for the given Vfio device
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        id: String,
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        iommu_attached: bool,
        bdf: PciBdf,
        restoring: bool,
        memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
    ) -> Result<Self, VfioPciError> {
        let device = Arc::new(device);
        device.reset();

        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            &PciVfioSubclass::VfioSubclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
            None,
        );

        let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));

        let mut common = VfioCommon {
            mmio_regions: Vec::new(),
            configuration,
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
            msi_interrupt_manager,
            legacy_interrupt_group,
            vfio_wrapper: Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
            patches: HashMap::new(),
        };

        // No need to parse capabilities from the device if on the restore path.
        // The initialization will be performed later when restore() is called.
        if !restoring {
            common.parse_capabilities(bdf);
            common.initialize_legacy_interrupt()?;
        }

        let vfio_pci_device = VfioPciDevice {
            id,
            vm: vm.clone(),
            device,
            container,
            common,
            iommu_attached,
            memory_slot,
        };

        Ok(vfio_pci_device)
    }

    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }

    fn align_4k(address: u64) -> u64 {
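        // e.g. (illustrative) align_4k(0x1001) == 0x2000, while
        // align_4k(0x2000) == 0x2000 since it is already aligned.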
        (address + 0xfff) & 0xffff_ffff_ffff_f000
    }

    fn is_4k_aligned(address: u64) -> bool {
        (address & 0xfff) == 0
    }

    fn is_4k_multiple(size: u64) -> bool {
        (size & 0xfff) == 0
    }

    fn generate_sparse_areas(
        caps: &[VfioRegionInfoCap],
        region_index: u32,
        region_start: u64,
        region_size: u64,
        vfio_msix: Option<&VfioMsix>,
    ) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
        for cap in caps {
            match cap {
                VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
                VfioRegionInfoCap::MsixMappable => {
                    if !Self::is_4k_aligned(region_start) {
                        error!(
                            "Region start address 0x{:x} must be at least 4KiB aligned",
                            region_start
                        );
                        return Err(VfioPciError::RegionAlignment);
                    }
                    if !Self::is_4k_multiple(region_size) {
                        error!(
                            "Region size 0x{:x} must be at least a multiple of 4KiB",
                            region_size
                        );
                        return Err(VfioPciError::RegionSize);
                    }

                    // In case the region contains the MSI-X vectors table or
                    // the MSI-X PBA table, we must calculate the subregions
                    // around them, leading to a list of sparse areas.
                    // We want to make sure we will still trap MMIO accesses
                    // to these MSI-X specific ranges.
                    //
                    // Using a BTreeMap, as the list provided through the
                    // iterator is sorted by key. This ensures a proper split
                    // of the whole region.
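                    // Illustrative example (hypothetical layout): for a
                    // 0x10000-byte region with the MSI-X table at
                    // [0x3000, 0x3800) and the PBA at [0x3800, 0x3810), the
                    // resulting sparse areas are [0x0, 0x3000) and
                    // [0x4000, 0x10000); the 4 KiB pages touched by the table
                    // and the PBA stay trapped.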
                    let mut inter_ranges = BTreeMap::new();
                    if let Some(msix) = vfio_msix {
                        if region_index == msix.cap.table_bir() {
                            let (offset, size) = msix.cap.table_range();
                            inter_ranges.insert(offset, size);
                        }
                        if region_index == msix.cap.pba_bir() {
                            let (offset, size) = msix.cap.pba_range();
                            inter_ranges.insert(offset, size);
                        }
                    }

                    let mut sparse_areas = Vec::new();
                    let mut current_offset = 0;
                    for (range_offset, range_size) in inter_ranges {
                        if range_offset > current_offset {
                            sparse_areas.push(VfioRegionSparseMmapArea {
                                offset: current_offset,
                                size: range_offset - current_offset,
                            });
                        }

                        current_offset = Self::align_4k(range_offset + range_size);
                    }

                    if region_size > current_offset {
                        sparse_areas.push(VfioRegionSparseMmapArea {
                            offset: current_offset,
                            size: region_size - current_offset,
                        });
                    }

                    return Ok(sparse_areas);
                }
                _ => {}
            }
        }

        // In case no relevant capabilities have been found, create a single
        // sparse area corresponding to the entire MMIO region.
        Ok(vec![VfioRegionSparseMmapArea {
            offset: 0,
            size: region_size,
        }])
    }

    /// Map MMIO regions into the guest, and avoid VM exits when the guest
    /// tries to reach those regions.
    ///
    /// The regions are mapped as user memory regions into the VM object the
    /// device was created with, using the memory slot closure provided at
    /// construction time.
    pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
        let fd = self.device.as_raw_fd();

        for region in self.common.mmio_regions.iter_mut() {
            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }

                // Retrieve the list of capabilities found on the region
                let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
                    self.device.get_region_caps(region.index)
                } else {
                    Vec::new()
                };

                // Don't try to mmap the region if it contains the MSI-X table
                // or the MSI-X PBA subregion and we couldn't find MSIX_MAPPABLE
                // in the list of supported capabilities.
                if let Some(msix) = self.common.interrupt.msix.as_ref() {
                    if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
                        && !caps.contains(&VfioRegionInfoCap::MsixMappable)
                    {
                        continue;
                    }
                }

                let mmap_size = self.device.get_region_size(region.index);
                let mmap_offset = self.device.get_region_offset(region.index);

                let sparse_areas = Self::generate_sparse_areas(
                    &caps,
                    region.index,
                    region.start.0,
                    mmap_size,
                    self.common.interrupt.msix.as_ref(),
                )?;

                for area in sparse_areas.iter() {
                    // SAFETY: FFI call with correct arguments
                    let host_addr = unsafe {
                        libc::mmap(
                            null_mut(),
                            area.size as usize,
                            prot,
                            libc::MAP_SHARED,
                            fd,
                            mmap_offset as libc::off_t + area.offset as libc::off_t,
                        )
                    };

                    if host_addr == libc::MAP_FAILED {
                        error!(
                            "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
                            area.offset,
                            area.size,
                            std::io::Error::last_os_error()
                        );
                        return Err(VfioPciError::MmapArea);
                    }

                    let user_memory_region = UserMemoryRegion {
                        slot: (self.memory_slot)(),
                        start: region.start.0 + area.offset,
                        size: area.size,
                        host_addr: host_addr as u64,
                    };

                    region.user_memory_regions.push(user_memory_region);

                    let mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(mem_region)
                        .map_err(VfioPciError::CreateUserMemoryRegion)?;
                }
            }
        }

        Ok(())
    }

    pub fn unmap_mmio_regions(&mut self) {
        for region in self.common.mmio_regions.iter() {
            for user_memory_region in region.user_memory_regions.iter() {
                // Remove region
                let r = self.vm.make_user_memory_region(
                    user_memory_region.slot,
                    user_memory_region.start,
                    user_memory_region.size,
                    user_memory_region.host_addr,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                // SAFETY: FFI call with correct arguments
                let ret = unsafe {
                    libc::munmap(
                        user_memory_region.host_addr as *mut libc::c_void,
                        user_memory_region.size as usize,
                    )
                };
                if ret != 0 {
                    error!(
1504                         "Could not unmap region {}, error:{}",
1505                         region.index,
1506                         io::Error::last_os_error()
1507                     );
1508                 }
1509             }
1510         }
1511     }
1512 
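    /// Map `size` bytes of guest DMA space through the VFIO container,
    /// translating the IOVA `iova` to the host userspace address `user_addr`.
    /// This is a no-op when the device is attached to a virtual IOMMU, as the
    /// mappings are then driven through the vIOMMU instead.
    ///
    /// A minimal usage sketch, with hypothetical addresses:
    ///
    /// ```ignore
    /// // Identity-map 1 GiB of guest memory starting at IOVA 1 GiB.
    /// vfio_pci_device.dma_map(0x4000_0000, 0x4000_0000, host_user_addr)?;
    /// ```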
    pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_map(iova, size, user_addr)
                .map_err(VfioPciError::DmaMap)?;
        }

        Ok(())
    }

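    /// Remove a DMA mapping previously established with `dma_map()`. As with
    /// mapping, this is a no-op when the device is attached to a virtual
    /// IOMMU.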
    pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_unmap(iova, size)
                .map_err(VfioPciError::DmaUnmap)?;
        }

        Ok(())
    }

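    /// Return a copy of the MMIO regions (BARs) assigned to this device.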
    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.common.mmio_regions.clone()
    }
}

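// Tear the device down in reverse order of setup: unmap the MMIO regions
// first, then disable whichever interrupt mechanism (MSI-X, MSI or INTx) is
// still enabled.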
impl Drop for VfioPciDevice {
    fn drop(&mut self) {
        self.unmap_mmio_regions();

        if let Some(msix) = &self.common.interrupt.msix {
            if msix.bar.enabled() {
                self.common.disable_msix();
            }
        }

        if let Some(msi) = &self.common.interrupt.msi {
            if msi.cfg.enabled() {
                self.common.disable_msi();
            }
        }

        if self.common.interrupt.intx_in_use() {
            self.common.disable_intx();
        }
    }
}

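// Guest accesses to the device BARs arrive through the bus and are forwarded
// to the BAR handling logic.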
impl BusDevice for VfioPciDevice {
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}

// First BAR offset in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// Extended capabilities register offset in the PCI config space.
const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
// I/O space BAR flag: bit 0 of a BAR is set to 1 for an I/O BAR.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory BAR flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// Prefetchable BAR flag.
const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device.
const BAR_NUMS: usize = 6;
// PCI Header Type register index.
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index.
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index.
const PCI_ROM_EXP_BAR_INDEX: usize = 12;
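
// As an illustration (not used by the code below), a 64-bit prefetchable
// memory BAR based at 0xfe00_0000 would read back from config space as
// 0xfe00_0000 | PCI_CONFIG_MEMORY_BAR_64BIT | PCI_CONFIG_BAR_PREFETCHABLE,
// while bit 0 (PCI_CONFIG_IO_BAR) distinguishes I/O from memory BARs.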

impl PciDevice for VfioPciDevice {
    fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        self.common
            .allocate_bars(allocator, mmio_allocator, resources)
    }

    fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        self.common.free_bars(allocator, mmio_allocator)
    }

    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        self.common.write_config_register(reg_idx, offset, data)
    }

    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        self.common.read_config_register(reg_idx)
    }

    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.common
            .configuration
            .detect_bar_reprogramming(reg_idx, data)
    }

    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.common.read_bar(base, offset, data)
    }

    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.common.write_bar(base, offset, data)
    }

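    // When the guest reprograms a BAR, the hypervisor mapping cannot be
    // updated in place: each userspace memory region is removed, shifted by
    // the difference between the new and old base, and registered again.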
    fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
        for region in self.common.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                for user_memory_region in region.user_memory_regions.iter_mut() {
                    // Remove old region
                    let old_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .remove_user_memory_region(old_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

                    // Update the user memory region with the correct start address.
                    if new_base > old_base {
                        user_memory_region.start += new_base - old_base;
                    } else {
                        user_memory_region.start -= old_base - new_base;
                    }

                    // Insert new region
                    let new_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(new_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
                }
            }
        }

        Ok(())
    }

    fn as_any(&mut self) -> &mut dyn Any {
        self
    }

    fn id(&self) -> Option<String> {
        Some(self.id.clone())
    }
}

impl Pausable for VfioPciDevice {}

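// Only the generic PCI state (VfioCommon) is snapshotted; the MMIO mappings
// are re-created from it when the device is restored.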
impl Snapshottable for VfioPciDevice {
    fn id(&self) -> String {
        self.id.clone()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_pci_dev_snapshot = Snapshot::new(&self.id);

        // Snapshot VfioCommon
        vfio_pci_dev_snapshot.add_snapshot(self.common.snapshot()?);

        Ok(vfio_pci_dev_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        // Restore VfioCommon
        if let Some(vfio_common_snapshot) = snapshot.snapshots.get(&self.common.id()) {
            self.common.restore(*vfio_common_snapshot.clone())?;
            self.map_mmio_regions().map_err(|e| {
                MigratableError::Restore(anyhow!(
                    "Could not map MMIO regions for VfioPciDevice on restore: {:?}",
                    e
                ))
            })?;
        }

        Ok(())
    }
}

impl Transportable for VfioPciDevice {}
impl Migratable for VfioPciDevice {}