xref: /cloud-hypervisor/pci/src/vfio.rs (revision eea9bcea38e0c5649f444c829f3a4f9c22aa486c)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 
6 use crate::{
7     msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
8     PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
9     PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
10     PciHeaderType, PciSubclass, MSIX_TABLE_ENTRY_SIZE,
11 };
12 use anyhow::anyhow;
13 use byteorder::{ByteOrder, LittleEndian};
14 use hypervisor::HypervisorVmError;
15 use std::any::Any;
16 use std::collections::{BTreeMap, HashMap};
17 use std::io;
18 use std::os::unix::io::AsRawFd;
19 use std::ptr::null_mut;
20 use std::sync::{Arc, Barrier, Mutex};
21 use thiserror::Error;
22 use versionize::{VersionMap, Versionize, VersionizeResult};
23 use versionize_derive::Versionize;
24 use vfio_bindings::bindings::vfio::*;
25 use vfio_ioctls::{
26     VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
27 };
28 use vm_allocator::{AddressAllocator, SystemAllocator};
29 use vm_device::interrupt::{
30     InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
31 };
32 use vm_device::{BusDevice, Resource};
33 use vm_memory::{Address, GuestAddress, GuestUsize};
34 use vm_migration::{
35     Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
36 };
37 use vmm_sys_util::eventfd::EventFd;
38 
39 #[derive(Debug, Error)]
40 pub enum VfioPciError {
41     #[error("Failed to create user memory region: {0}")]
42     CreateUserMemoryRegion(#[source] HypervisorVmError),
43     #[error("Failed to DMA map: {0}")]
44     DmaMap(#[source] vfio_ioctls::VfioError),
45     #[error("Failed to DMA unmap: {0}")]
46     DmaUnmap(#[source] vfio_ioctls::VfioError),
47     #[error("Failed to enable INTx: {0}")]
48     EnableIntx(#[source] VfioError),
49     #[error("Failed to enable MSI: {0}")]
50     EnableMsi(#[source] VfioError),
51     #[error("Failed to enable MSI-x: {0}")]
52     EnableMsix(#[source] VfioError),
53     #[error("Failed to mmap the area")]
54     MmapArea,
55     #[error("Failed to notifier's eventfd")]
56     MissingNotifier,
57     #[error("Invalid region alignment")]
58     RegionAlignment,
59     #[error("Invalid region size")]
60     RegionSize,
61 }
62 
/// PCI subclass code advertised for VFIO passthrough devices.
#[derive(Copy, Clone)]
enum PciVfioSubclass {
    // 0xff is the vendor-specific/unassigned subclass value.
    VfioSubclass = 0xff,
}

impl PciSubclass for PciVfioSubclass {
    /// Returns the raw subclass byte to be placed in the config space.
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}
73 
/// Action required after a guest config-space write toggled the
/// MSI or MSI-X enable bit.
enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}
80 
/// Versioned snapshot state for INTx, used by save/restore.
#[derive(Versionize)]
struct IntxState {
    enabled: bool,
}

/// Runtime state of the legacy INTx (pin-based) interrupt.
pub(crate) struct VfioIntx {
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    enabled: bool,
}
90 
/// Versioned snapshot state for MSI, used by save/restore.
#[derive(Versionize)]
struct MsiState {
    cap: MsiCap,
    cap_offset: u32,
}

/// Runtime MSI state: the emulated capability plus the offset of the
/// MSI capability structure within the PCI configuration space.
pub(crate) struct VfioMsi {
    pub(crate) cfg: MsiConfig,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}
102 
103 impl VfioMsi {
104     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
105         let old_enabled = self.cfg.enabled();
106 
107         self.cfg.update(offset, data);
108 
109         let new_enabled = self.cfg.enabled();
110 
111         if !old_enabled && new_enabled {
112             return Some(InterruptUpdateAction::EnableMsi);
113         }
114 
115         if old_enabled && !new_enabled {
116             return Some(InterruptUpdateAction::DisableMsi);
117         }
118 
119         None
120     }
121 }
122 
/// Versioned snapshot state for MSI-X, used by save/restore.
#[derive(Versionize)]
struct MsixState {
    cap: MsixCap,
    cap_offset: u32,
    bdf: u32,
}

/// Runtime MSI-X state: the emulated table/PBA BAR, the raw capability
/// registers, and the capability offset in the config space.
pub(crate) struct VfioMsix {
    pub(crate) bar: MsixConfig,
    cap: MsixCap,
    cap_offset: u32,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}
136 
137 impl VfioMsix {
138     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
139         let old_enabled = self.bar.enabled();
140 
141         // Update "Message Control" word
142         if offset == 2 && data.len() == 2 {
143             self.bar.set_msg_ctl(LittleEndian::read_u16(data));
144         }
145 
146         let new_enabled = self.bar.enabled();
147 
148         if !old_enabled && new_enabled {
149             return Some(InterruptUpdateAction::EnableMsix);
150         }
151 
152         if old_enabled && !new_enabled {
153             return Some(InterruptUpdateAction::DisableMsix);
154         }
155 
156         None
157     }
158 
159     fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
160         let table_offset: u64 = u64::from(self.cap.table_offset());
161         let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
162         let table_bir: u32 = self.cap.table_bir();
163 
164         bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
165     }
166 }
167 
/// Aggregates the three interrupt mechanisms a VFIO PCI device may
/// expose; each is `None` when the device lacks that capability.
pub(crate) struct Interrupt {
    pub(crate) intx: Option<VfioIntx>,
    pub(crate) msi: Option<VfioMsi>,
    pub(crate) msix: Option<VfioMsix>,
}
173 
174 impl Interrupt {
175     fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
176         if let Some(ref mut msi) = &mut self.msi {
177             let action = msi.update(offset, data);
178             return action;
179         }
180 
181         None
182     }
183 
184     fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
185         if let Some(ref mut msix) = &mut self.msix {
186             let action = msix.update(offset, data);
187             return action;
188         }
189 
190         None
191     }
192 
193     fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
194         if let Some(msi) = &self.msi {
195             if offset >= u64::from(msi.cap_offset)
196                 && offset < u64::from(msi.cap_offset) + msi.cfg.size()
197             {
198                 return Some((
199                     PciCapabilityId::MessageSignalledInterrupts,
200                     u64::from(msi.cap_offset),
201                 ));
202             }
203         }
204 
205         if let Some(msix) = &self.msix {
206             if offset == u64::from(msix.cap_offset) {
207                 return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
208             }
209         }
210 
211         None
212     }
213 
214     fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
215         if let Some(msix) = &self.msix {
216             return msix.table_accessed(bar_index, offset);
217         }
218 
219         false
220     }
221 
222     fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
223         if let Some(ref mut msix) = &mut self.msix {
224             let offset = offset - u64::from(msix.cap.table_offset());
225             msix.bar.write_table(offset, data)
226         }
227     }
228 
229     fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
230         if let Some(msix) = &self.msix {
231             let offset = offset - u64::from(msix.cap.table_offset());
232             msix.bar.read_table(offset, data)
233         }
234     }
235 
236     pub(crate) fn intx_in_use(&self) -> bool {
237         if let Some(intx) = &self.intx {
238             return intx.enabled;
239         }
240 
241         false
242     }
243 }
244 
/// A guest memory region mapped onto host memory (one hypervisor
/// user-memory slot).
#[derive(Copy, Clone)]
pub struct UserMemoryRegion {
    pub slot: u32,
    pub start: u64,
    pub size: u64,
    pub host_addr: u64,
}

/// A device BAR region in guest address space, together with any
/// host-mmap()ed sub-regions that back it.
#[derive(Clone)]
pub struct MmioRegion {
    pub start: GuestAddress,
    pub length: GuestUsize,
    pub(crate) type_: PciBarRegionType,
    pub(crate) index: u32,
    pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
}
/// Transport-level VFIO errors, covering both the kernel VFIO driver
/// and the vfio-user protocol backend.
#[derive(Debug, Error)]
pub enum VfioError {
    #[error("Kernel VFIO error: {0}")]
    KernelVfio(#[source] vfio_ioctls::VfioError),
    #[error("VFIO user error: {0}")]
    VfioUser(#[source] vfio_user::Error),
}
268 
269 pub(crate) trait Vfio: Send + Sync {
270     fn read_config_byte(&self, offset: u32) -> u8 {
271         let mut data: [u8; 1] = [0];
272         self.read_config(offset, &mut data);
273         data[0]
274     }
275 
276     fn read_config_word(&self, offset: u32) -> u16 {
277         let mut data: [u8; 2] = [0, 0];
278         self.read_config(offset, &mut data);
279         u16::from_le_bytes(data)
280     }
281 
282     fn read_config_dword(&self, offset: u32) -> u32 {
283         let mut data: [u8; 4] = [0, 0, 0, 0];
284         self.read_config(offset, &mut data);
285         u32::from_le_bytes(data)
286     }
287 
288     fn write_config_dword(&self, offset: u32, buf: u32) {
289         let data: [u8; 4] = buf.to_le_bytes();
290         self.write_config(offset, &data)
291     }
292 
293     fn read_config(&self, offset: u32, data: &mut [u8]) {
294         self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
295     }
296 
297     fn write_config(&self, offset: u32, data: &[u8]) {
298         self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
299     }
300 
301     fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
302         self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
303     }
304 
305     fn disable_msi(&self) -> Result<(), VfioError> {
306         self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
307     }
308 
309     fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
310         self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
311     }
312 
313     fn disable_msix(&self) -> Result<(), VfioError> {
314         self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
315     }
316 
317     fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
318         unimplemented!()
319     }
320 
321     fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
322         unimplemented!()
323     }
324 
325     fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
326         unimplemented!()
327     }
328 
329     fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
330         unimplemented!()
331     }
332 
333     fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
334         unimplemented!()
335     }
336 
337     fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
338         unimplemented!()
339     }
340 }
341 
/// Adapter implementing the `Vfio` trait on top of a kernel VFIO device.
struct VfioDeviceWrapper {
    device: Arc<VfioDevice>,
}

impl VfioDeviceWrapper {
    /// Wraps an already-opened kernel VFIO device.
    fn new(device: Arc<VfioDevice>) -> Self {
        Self { device }
    }
}
351 
impl Vfio for VfioDeviceWrapper {
    // Note: the vfio-ioctls API takes (index, data, offset), which is a
    // different argument order than this trait's (index, offset, data).
    fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
        self.device.region_read(index, data, offset)
    }

    fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
        self.device.region_write(index, data, offset)
    }

    fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
        // Copy out of the borrow so the caller gets an owned VfioIrq.
        self.device.get_irq_info(irq_index).copied()
    }

    fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
        self.device
            .enable_irq(irq_index, event_fds)
            .map_err(VfioError::KernelVfio)
    }

    fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .disable_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }

    fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
        self.device
            .unmask_irq(irq_index)
            .map_err(VfioError::KernelVfio)
    }
}
383 
/// Versioned snapshot of a VFIO device's interrupt configuration,
/// used for pause/resume and live migration.
#[derive(Versionize)]
struct VfioCommonState {
    intx_state: Option<IntxState>,
    msi_state: Option<MsiState>,
    msix_state: Option<MsixState>,
}

impl VersionMapped for VfioCommonState {}
392 
/// A patch applied to a config-space register: `mask` selects the bits
/// to override and `patch` supplies their replacement value.
/// NOTE(review): the application site is outside this view — confirm
/// the exact mask/patch combination logic there.
pub(crate) struct ConfigPatch {
    mask: u32,
    patch: u32,
}
397 
/// State shared by the VFIO PCI device implementations: the emulated
/// PCI configuration, BAR mappings, interrupt plumbing, the transport
/// wrapper, and per-register config-space patches.
pub(crate) struct VfioCommon {
    pub(crate) configuration: PciConfiguration,
    pub(crate) mmio_regions: Vec<MmioRegion>,
    pub(crate) interrupt: Interrupt,
    pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
    pub(crate) vfio_wrapper: Arc<dyn Vfio>,
    // Keyed by config-space register index (offset / 4).
    pub(crate) patches: HashMap<usize, ConfigPatch>,
}
407 
408 impl VfioCommon {
    /// Discovers the device's BARs — either by sizing them through the
    /// config space (write all-ones, read back), or from restored
    /// `resources` on snapshot-restore — then allocates guest address
    /// ranges for them and registers them with the PCI configuration.
    ///
    /// Returns the list of BAR configurations programmed for the device.
    pub(crate) fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        let mut bars = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX as u32;

        // Going through all regular regions to compute the BAR size.
        // We're not saving the BAR address to restore it, because we
        // are going to allocate a guest address for each BAR and write
        // that new address back.
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let mut region_size: u64 = 0;
            let mut region_type = PciBarRegionType::Memory32BitRegion;
            let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
            let mut flags: u32 = 0;

            let mut restored_bar_addr = None;
            if let Some(resources) = &resources {
                // Restore path: take address/size/type from the saved
                // resources instead of probing the hardware.
                for resource in resources {
                    if let Resource::PciBar {
                        index,
                        base,
                        size,
                        type_,
                        ..
                    } = resource
                    {
                        if *index == bar_id as usize {
                            restored_bar_addr = Some(GuestAddress(*base));
                            region_size = *size;
                            region_type = PciBarRegionType::from(*type_);
                            break;
                        }
                    }
                }
                // No saved resource for this BAR index: nothing to allocate.
                if restored_bar_addr.is_none() {
                    bar_id += 1;
                    continue;
                }
            } else {
                // Probe path: the expansion ROM BAR lives at a fixed
                // config-space offset, regular BARs at 0x10 + 4*index.
                let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                    (PCI_ROM_EXP_BAR_INDEX * 4) as u32
                } else {
                    PCI_CONFIG_BAR_OFFSET + bar_id * 4
                };

                // First read flags
                flags = self.vfio_wrapper.read_config_dword(bar_offset);

                // Is this an IO BAR?
                let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
                } else {
                    false
                };

                // Is this a 64-bit BAR?
                let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                    matches!(
                        flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                        PCI_CONFIG_MEMORY_BAR_64BIT
                    )
                } else {
                    false
                };

                if matches!(
                    flags & PCI_CONFIG_BAR_PREFETCHABLE,
                    PCI_CONFIG_BAR_PREFETCHABLE
                ) {
                    prefetchable = PciBarPrefetchable::Prefetchable
                };

                // To get size write all 1s
                self.vfio_wrapper
                    .write_config_dword(bar_offset, 0xffff_ffff);

                // And read back BAR value. The device will write zeros for bits it doesn't care about
                let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);

                if io_bar {
                    // Mask flag bits (lowest 2 for I/O bars)
                    lower &= !0b11;

                    // BAR is not enabled
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // IO BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Invert bits and add 1 to calculate size
                    region_size = (!lower + 1) as u64;
                } else if is_64bit_bar {
                    // 64 bits Memory BAR
                    region_type = PciBarRegionType::Memory64BitRegion;

                    // Query size of upper BAR of 64-bit BAR
                    let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                    self.vfio_wrapper
                        .write_config_dword(upper_offset, 0xffff_ffff);
                    let upper = self.vfio_wrapper.read_config_dword(upper_offset);

                    let mut combined_size = u64::from(upper) << 32 | u64::from(lower);

                    // Mask out flag bits (lowest 4 for memory bars)
                    combined_size &= !0b1111;

                    // BAR is not enabled
                    if combined_size == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to to find size
                    region_size = (!combined_size + 1) as u64;
                } else {
                    region_type = PciBarRegionType::Memory32BitRegion;

                    // Mask out flag bits (lowest 4 for memory bars)
                    lower &= !0b1111;

                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert and add 1 to to find size
                    region_size = (!lower + 1) as u64;
                }
            }

            // Allocate a guest range from the allocator matching the BAR
            // type; on restore, the saved address is requested explicitly.
            let bar_addr = match region_type {
                PciBarRegionType::IoRegion => {
                    #[cfg(target_arch = "aarch64")]
                    unimplemented!();

                    // The address needs to be 4 bytes aligned.
                    #[cfg(not(target_arch = "aarch64"))]
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory32BitRegion => {
                    // BAR allocation must be naturally aligned
                    allocator
                        .lock()
                        .unwrap()
                        .allocate_mmio_hole_addresses(
                            restored_bar_addr,
                            region_size,
                            Some(region_size),
                        )
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
                PciBarRegionType::Memory64BitRegion => {
                    // BAR allocation must be naturally aligned
                    mmio_allocator
                        .allocate(restored_bar_addr, region_size, Some(region_size))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?
                }
            };

            // We can now build our BAR configuration block.
            let bar = PciBarConfiguration::default()
                .set_index(bar_id as usize)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type)
                .set_prefetchable(prefetchable);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                // Bit 0 of the ROM BAR flags is the "enable" bit.
                self.configuration
                    .add_pci_rom_bar(&bar, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&bar)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            bars.push(bar);
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id as u32,
                user_memory_regions: Vec::new(),
            });

            bar_id += 1;
            // A 64-bit BAR consumes two BAR slots (lower + upper dword).
            if region_type == PciBarRegionType::Memory64BitRegion {
                bar_id += 1;
            }
        }

        Ok(bars)
    }
614 
615     pub(crate) fn free_bars(
616         &mut self,
617         allocator: &mut SystemAllocator,
618         mmio_allocator: &mut AddressAllocator,
619     ) -> Result<(), PciDeviceError> {
620         for region in self.mmio_regions.iter() {
621             match region.type_ {
622                 PciBarRegionType::IoRegion => {
623                     #[cfg(target_arch = "x86_64")]
624                     allocator.free_io_addresses(region.start, region.length);
625                     #[cfg(target_arch = "aarch64")]
626                     error!("I/O region is not supported");
627                 }
628                 PciBarRegionType::Memory32BitRegion => {
629                     allocator.free_mmio_hole_addresses(region.start, region.length);
630                 }
631                 PciBarRegionType::Memory64BitRegion => {
632                     mmio_allocator.free(region.start, region.length);
633                 }
634             }
635         }
636         Ok(())
637     }
638 
639     pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
640         let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());
641 
642         let table = self.vfio_wrapper.read_config_dword((cap + 4).into());
643 
644         let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());
645 
646         MsixCap {
647             msg_ctl,
648             table,
649             pba,
650         }
651     }
652 
653     pub(crate) fn initialize_msix(&mut self, msix_cap: MsixCap, cap_offset: u32, bdf: PciBdf) {
654         let interrupt_source_group = self
655             .msi_interrupt_manager
656             .create_group(MsiIrqGroupConfig {
657                 base: 0,
658                 count: msix_cap.table_size() as InterruptIndex,
659             })
660             .unwrap();
661 
662         let msix_config = MsixConfig::new(
663             msix_cap.table_size(),
664             interrupt_source_group.clone(),
665             bdf.into(),
666         );
667 
668         self.interrupt.msix = Some(VfioMsix {
669             bar: msix_config,
670             cap: msix_cap,
671             cap_offset,
672             interrupt_source_group,
673         });
674     }
675 
    /// Reads the MSI "Message Control" word at capability offset `cap`.
    pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
        self.vfio_wrapper.read_config_word((cap + 2).into())
    }
679 
680     pub(crate) fn initialize_msi(&mut self, msg_ctl: u16, cap_offset: u32) {
681         let interrupt_source_group = self
682             .msi_interrupt_manager
683             .create_group(MsiIrqGroupConfig {
684                 base: 0,
685                 count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
686             })
687             .unwrap();
688 
689         let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone());
690 
691         self.interrupt.msi = Some(VfioMsi {
692             cfg: msi_config,
693             cap_offset,
694             interrupt_source_group,
695         });
696     }
697 
698     pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) {
699         let mut cap_next = self
700             .vfio_wrapper
701             .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);
702 
703         let mut pci_express_cap_found = false;
704         let mut power_management_cap_found = false;
705 
706         while cap_next != 0 {
707             let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());
708 
709             match PciCapabilityId::from(cap_id) {
710                 PciCapabilityId::MessageSignalledInterrupts => {
711                     if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
712                         if irq_info.count > 0 {
713                             // Parse capability only if the VFIO device
714                             // supports MSI.
715                             let msg_ctl = self.parse_msi_capabilities(cap_next);
716                             self.initialize_msi(msg_ctl, cap_next as u32);
717                         }
718                     }
719                 }
720                 PciCapabilityId::MsiX => {
721                     if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
722                     {
723                         if irq_info.count > 0 {
724                             // Parse capability only if the VFIO device
725                             // supports MSI-X.
726                             let msix_cap = self.parse_msix_capabilities(cap_next);
727                             self.initialize_msix(msix_cap, cap_next as u32, bdf);
728                         }
729                     }
730                 }
731                 PciCapabilityId::PciExpress => pci_express_cap_found = true,
732                 PciCapabilityId::PowerManagement => power_management_cap_found = true,
733                 _ => {}
734             };
735 
736             cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
737         }
738 
739         if pci_express_cap_found && power_management_cap_found {
740             self.parse_extended_capabilities();
741         }
742     }
743 
    /// Walks the PCI Express extended capability list (config space at
    /// 0x100 and above) and records a config patch that replaces the
    /// capability ID of ARI, Resizeable BAR and SR-IOV capabilities with
    /// the null capability ID — presumably applied on config-space
    /// reads elsewhere in this file.
    fn parse_extended_capabilities(&mut self) {
        let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;

        loop {
            let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);

            // Extended capability header: ID in bits 0-15, version in
            // bits 16-19, next-capability offset in bits 20-31.
            let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
            let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;

            match PciExpressCapabilityId::from(cap_id) {
                PciExpressCapabilityId::AlternativeRoutingIdentificationIntepretation
                | PciExpressCapabilityId::ResizeableBar
                | PciExpressCapabilityId::SingleRootIoVirtualization => {
                    // Patch is keyed by 32-bit register index.
                    let reg_idx = (current_offset / 4) as usize;
                    self.patches.insert(
                        reg_idx,
                        ConfigPatch {
                            mask: 0x0000_ffff,
                            patch: PciExpressCapabilityId::NullCapability as u32,
                        },
                    );
                }
                _ => {}
            }

            // NOTE(review): termination relies on the device reporting a
            // zero "next" pointer; a malformed chain containing a cycle
            // would loop forever — confirm against the device trust model.
            if cap_next == 0 {
                break;
            }

            current_offset = cap_next.into();
        }
    }
776 
777     pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
778         if let Some(intx) = &mut self.interrupt.intx {
779             if !intx.enabled {
780                 if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
781                     self.vfio_wrapper
782                         .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
783                         .map_err(VfioPciError::EnableIntx)?;
784 
785                     intx.enabled = true;
786                 } else {
787                     return Err(VfioPciError::MissingNotifier);
788                 }
789             }
790         }
791 
792         Ok(())
793     }
794 
795     pub(crate) fn disable_intx(&mut self) {
796         if let Some(intx) = &mut self.interrupt.intx {
797             if intx.enabled {
798                 if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
799                     error!("Could not disable INTx: {}", e);
800                 } else {
801                     intx.enabled = false;
802                 }
803             }
804         }
805     }
806 
807     pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
808         if let Some(msi) = &self.interrupt.msi {
809             let mut irq_fds: Vec<EventFd> = Vec::new();
810             for i in 0..msi.cfg.num_enabled_vectors() {
811                 if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
812                     irq_fds.push(eventfd);
813                 } else {
814                     return Err(VfioPciError::MissingNotifier);
815                 }
816             }
817 
818             self.vfio_wrapper
819                 .enable_msi(irq_fds.iter().collect())
820                 .map_err(VfioPciError::EnableMsi)?;
821         }
822 
823         Ok(())
824     }
825 
    /// Disables MSI at the VFIO level; failures are logged, not returned.
    pub(crate) fn disable_msi(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msi() {
            error!("Could not disable MSI: {}", e);
        }
    }
831 
832     pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
833         if let Some(msix) = &self.interrupt.msix {
834             let mut irq_fds: Vec<EventFd> = Vec::new();
835             for i in 0..msix.bar.table_entries.len() {
836                 if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
837                     irq_fds.push(eventfd);
838                 } else {
839                     return Err(VfioPciError::MissingNotifier);
840                 }
841             }
842 
843             self.vfio_wrapper
844                 .enable_msix(irq_fds.iter().collect())
845                 .map_err(VfioPciError::EnableMsix)?;
846         }
847 
848         Ok(())
849     }
850 
    /// Disables MSI-X at the VFIO level; failures are logged, not returned.
    pub(crate) fn disable_msix(&self) {
        if let Err(e) = self.vfio_wrapper.disable_msix() {
            error!("Could not disable MSI-X: {}", e);
        }
    }
856 
    /// Sets up and enables legacy INTx when both the VFIO device reports
    /// INTx support (non-zero IRQ count) and the VMM provided a legacy
    /// interrupt group.
    pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
        if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
            if irq_info.count == 0 {
                // A count of 0 means the INTx IRQ is not supported, therefore
                // it shouldn't be initialized.
                return Ok(());
            }
        }

        if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            self.enable_intx()?;
        }

        Ok(())
    }
877 
878     pub(crate) fn update_msi_capabilities(
879         &mut self,
880         offset: u64,
881         data: &[u8],
882     ) -> Result<(), VfioPciError> {
883         match self.interrupt.update_msi(offset, data) {
884             Some(InterruptUpdateAction::EnableMsi) => {
885                 // Disable INTx before we can enable MSI
886                 self.disable_intx();
887                 self.enable_msi()?;
888             }
889             Some(InterruptUpdateAction::DisableMsi) => {
890                 // Fallback onto INTx when disabling MSI
891                 self.disable_msi();
892                 self.enable_intx()?;
893             }
894             _ => {}
895         }
896 
897         Ok(())
898     }
899 
900     pub(crate) fn update_msix_capabilities(
901         &mut self,
902         offset: u64,
903         data: &[u8],
904     ) -> Result<(), VfioPciError> {
905         match self.interrupt.update_msix(offset, data) {
906             Some(InterruptUpdateAction::EnableMsix) => {
907                 // Disable INTx before we can enable MSI-X
908                 self.disable_intx();
909                 self.enable_msix()?;
910             }
911             Some(InterruptUpdateAction::DisableMsix) => {
912                 // Fallback onto INTx when disabling MSI-X
913                 self.disable_msix();
914                 self.enable_intx()?;
915             }
916             _ => {}
917         }
918 
919         Ok(())
920     }
921 
922     pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
923         for region in self.mmio_regions.iter() {
924             if addr >= region.start.raw_value()
925                 && addr < region.start.unchecked_add(region.length).raw_value()
926             {
927                 return Some(region.clone());
928             }
929         }
930         None
931     }
932 
933     pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
934         let addr = base + offset;
935         if let Some(region) = self.find_region(addr) {
936             let offset = addr - region.start.raw_value();
937 
938             if self.interrupt.msix_table_accessed(region.index, offset) {
939                 self.interrupt.msix_read_table(offset, data);
940             } else {
941                 self.vfio_wrapper.region_read(region.index, offset, data);
942             }
943         }
944 
945         // INTx EOI
946         // The guest reading from the BAR potentially means the interrupt has
947         // been received and can be acknowledged.
948         if self.interrupt.intx_in_use() {
949             if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
950                 error!("Failed unmasking INTx IRQ: {}", e);
951             }
952         }
953     }
954 
955     pub(crate) fn write_bar(
956         &mut self,
957         base: u64,
958         offset: u64,
959         data: &[u8],
960     ) -> Option<Arc<Barrier>> {
961         let addr = base + offset;
962         if let Some(region) = self.find_region(addr) {
963             let offset = addr - region.start.raw_value();
964 
965             // If the MSI-X table is written to, we need to update our cache.
966             if self.interrupt.msix_table_accessed(region.index, offset) {
967                 self.interrupt.msix_write_table(offset, data);
968             } else {
969                 self.vfio_wrapper.region_write(region.index, offset, data);
970             }
971         }
972 
973         // INTx EOI
974         // The guest writing to the BAR potentially means the interrupt has
975         // been received and can be acknowledged.
976         if self.interrupt.intx_in_use() {
977             if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
978                 error!("Failed unmasking INTx IRQ: {}", e);
979             }
980         }
981 
982         None
983     }
984 
    /// Traps a guest write to the PCI configuration space.
    ///
    /// BAR (and expansion ROM BAR) writes are only applied to the local
    /// configuration cache; everything else eventually reaches the device,
    /// possibly after toggling MSI/MSI-X through VFIO first. Returns `None`
    /// since the write completes synchronously (no barrier needed).
    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        // Byte offset of the register within the config space.
        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            // Offset of the write relative to the start of the capability.
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through VFIO_DEVICE_SET_IRQS ioctl, and only after we can write
        // to the device region to update the MSI Enable bit.
        self.vfio_wrapper.write_config((reg + offset) as u32, data);

        None
    }
1040 
1041     pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
1042         // When reading the BARs, we trap it and return what comes
1043         // from our local configuration space. We want the guest to
1044         // use that and not the VFIO device BARs as it does not map
1045         // with the guest address space.
1046         if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
1047             || reg_idx == PCI_ROM_EXP_BAR_INDEX
1048         {
1049             return self.configuration.read_reg(reg_idx);
1050         }
1051 
1052         // Since we don't support passing multi-functions devices, we should
1053         // mask the multi-function bit, bit 7 of the Header Type byte on the
1054         // register 3.
1055         let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
1056             0xff7f_ffff
1057         } else {
1058             0xffff_ffff
1059         };
1060 
1061         // The config register read comes from the VFIO device itself.
1062         let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;
1063 
1064         if let Some(config_patch) = self.patches.get(&reg_idx) {
1065             value = (value & !config_patch.mask) | config_patch.patch;
1066         }
1067 
1068         value
1069     }
1070 
1071     fn state(&self) -> VfioCommonState {
1072         let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
1073             enabled: intx.enabled,
1074         });
1075 
1076         let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
1077             cap: msi.cfg.cap,
1078             cap_offset: msi.cap_offset,
1079         });
1080 
1081         let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
1082             cap: msix.cap,
1083             cap_offset: msix.cap_offset,
1084             bdf: msix.bar.devid,
1085         });
1086 
1087         VfioCommonState {
1088             intx_state,
1089             msi_state,
1090             msix_state,
1091         }
1092     }
1093 
1094     fn set_state(&mut self, state: &VfioCommonState) -> Result<(), VfioPciError> {
1095         if let (Some(intx), Some(interrupt_source_group)) =
1096             (&state.intx_state, self.legacy_interrupt_group.clone())
1097         {
1098             self.interrupt.intx = Some(VfioIntx {
1099                 interrupt_source_group,
1100                 enabled: false,
1101             });
1102 
1103             if intx.enabled {
1104                 self.enable_intx()?;
1105             }
1106         }
1107 
1108         if let Some(msi) = &state.msi_state {
1109             self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset);
1110         }
1111 
1112         if let Some(msix) = &state.msix_state {
1113             self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into());
1114         }
1115 
1116         Ok(())
1117     }
1118 }
1119 
1120 impl Pausable for VfioCommon {}
1121 
1122 impl Snapshottable for VfioCommon {
1123     fn id(&self) -> String {
1124         String::from("vfio_common")
1125     }
1126 
1127     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
1128         let mut vfio_common_snapshot =
1129             Snapshot::new_from_versioned_state(&self.id(), &self.state())?;
1130 
1131         // Snapshot PciConfiguration
1132         vfio_common_snapshot.add_snapshot(self.configuration.snapshot()?);
1133 
1134         // Snapshot MSI
1135         if let Some(msi) = &mut self.interrupt.msi {
1136             vfio_common_snapshot.add_snapshot(msi.cfg.snapshot()?);
1137         }
1138 
1139         // Snapshot MSI-X
1140         if let Some(msix) = &mut self.interrupt.msix {
1141             vfio_common_snapshot.add_snapshot(msix.bar.snapshot()?);
1142         }
1143 
1144         Ok(vfio_common_snapshot)
1145     }
1146 
1147     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
1148         if let Some(vfio_common_section) = snapshot
1149             .snapshot_data
1150             .get(&format!("{}-section", self.id()))
1151         {
1152             // It has to be invoked first as we want Interrupt to be initialized
1153             // correctly before we try to restore MSI and MSI-X configurations.
1154             self.set_state(&vfio_common_section.to_versioned_state()?)
1155                 .map_err(|e| {
1156                     MigratableError::Restore(anyhow!("Could not restore VFIO_COMMON state {:?}", e))
1157                 })?;
1158 
1159             // Restore PciConfiguration
1160             if let Some(pci_config_snapshot) = snapshot.snapshots.get(&self.configuration.id()) {
1161                 self.configuration.restore(*pci_config_snapshot.clone())?;
1162             }
1163 
1164             // Restore MSI
1165             if let Some(msi) = &mut self.interrupt.msi {
1166                 if let Some(msi_snapshot) = snapshot.snapshots.get(&msi.cfg.id()) {
1167                     msi.cfg.restore(*msi_snapshot.clone())?;
1168                 }
1169                 if msi.cfg.enabled() {
1170                     self.enable_msi().unwrap();
1171                 }
1172             }
1173 
1174             // Restore MSI-X
1175             if let Some(msix) = &mut self.interrupt.msix {
1176                 if let Some(msix_snapshot) = snapshot.snapshots.get(&msix.bar.id()) {
1177                     msix.bar.restore(*msix_snapshot.clone())?;
1178                 }
1179                 if msix.bar.enabled() {
1180                     self.enable_msix().unwrap();
1181                 }
1182             }
1183 
1184             return Ok(());
1185         }
1186 
1187         Err(MigratableError::Restore(anyhow!(
1188             "Could not find VFIO_COMMON snapshot section"
1189         )))
1190     }
1191 }
1192 
/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
pub struct VfioPciDevice {
    // Unique device identifier within the VMM.
    id: String,
    // Hypervisor VM handle, used to create/remove user memory regions.
    vm: Arc<dyn hypervisor::Vm>,
    // Underlying VFIO device handle.
    device: Arc<VfioDevice>,
    // VFIO container, used for DMA (un)mapping.
    container: Arc<VfioContainer>,
    // Logic shared by VFIO devices: config space, BARs and interrupts.
    common: VfioCommon,
    // When true, DMA (un)mapping through the container is skipped
    // (handled by the virtual IOMMU instead).
    iommu_attached: bool,
    // Closure returning the next free hypervisor memory slot number.
    memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
}
1208 
1209 impl VfioPciDevice {
    /// Constructs a new Vfio Pci device for the given Vfio device
    ///
    /// The VFIO device is reset, a zeroed local `PciConfiguration` cache is
    /// created (non-BAR reads are served by the physical device, see
    /// `read_config_register()`), and — unless `restoring` is set — the
    /// device capabilities are parsed and INTx is initialized. When
    /// `restoring` is true, that initialization is deferred to the
    /// restore() path.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        id: String,
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
        iommu_attached: bool,
        bdf: PciBdf,
        restoring: bool,
        memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
    ) -> Result<Self, VfioPciError> {
        let device = Arc::new(device);
        device.reset();

        // Local config space cache; all identification fields start at zero
        // since most reads are forwarded to the physical device.
        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            &PciVfioSubclass::VfioSubclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
        );

        let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));

        let mut common = VfioCommon {
            mmio_regions: Vec::new(),
            configuration,
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
            msi_interrupt_manager,
            legacy_interrupt_group,
            vfio_wrapper: Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
            patches: HashMap::new(),
        };

        // No need to parse capabilities from the device if on the restore path.
        // The initialization will be performed later when restore() will be
        // called.
        if !restoring {
            common.parse_capabilities(bdf);
            common.initialize_legacy_interrupt()?;
        }

        let vfio_pci_device = VfioPciDevice {
            id,
            vm: vm.clone(),
            device,
            container,
            common,
            iommu_attached,
            memory_slot,
        };

        Ok(vfio_pci_device)
    }
1276 
    /// Returns true when the device is attached to a virtual IOMMU.
    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }
1280 
1281     fn align_4k(address: u64) -> u64 {
1282         (address + 0xfff) & 0xffff_ffff_ffff_f000
1283     }
1284 
1285     fn is_4k_aligned(address: u64) -> bool {
1286         (address & 0xfff) == 0
1287     }
1288 
1289     fn is_4k_multiple(size: u64) -> bool {
1290         (size & 0xfff) == 0
1291     }
1292 
    /// Computes the list of sparse mmap areas for a VFIO region, based on
    /// the capabilities VFIO reported for that region.
    ///
    /// - If VFIO provides an explicit sparse mmap layout, it is returned
    ///   as is.
    /// - If the region is flagged MSIX_MAPPABLE, the region is split into
    ///   areas around the MSI-X table/PBA ranges so that MMIO accesses to
    ///   those ranges keep being trapped.
    /// - Otherwise, a single area covering the whole region is returned.
    ///
    /// # Errors
    ///
    /// For an MSIX_MAPPABLE region, returns `RegionAlignment` when the
    /// region start is not 4 KiB aligned and `RegionSize` when the region
    /// size is not a 4 KiB multiple.
    fn generate_sparse_areas(
        caps: &[VfioRegionInfoCap],
        region_index: u32,
        region_start: u64,
        region_size: u64,
        vfio_msix: Option<&VfioMsix>,
    ) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
        for cap in caps {
            match cap {
                VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
                VfioRegionInfoCap::MsixMappable => {
                    if !Self::is_4k_aligned(region_start) {
                        error!(
                            "Region start address 0x{:x} must be at least aligned on 4KiB",
                            region_start
                        );
                        return Err(VfioPciError::RegionAlignment);
                    }
                    if !Self::is_4k_multiple(region_size) {
                        error!(
                            "Region size 0x{:x} must be at least a multiple of 4KiB",
                            region_size
                        );
                        return Err(VfioPciError::RegionSize);
                    }

                    // In case the region contains the MSI-X vectors table or
                    // the MSI-X PBA table, we must calculate the subregions
                    // around them, leading to a list of sparse areas.
                    // We want to make sure we will still trap MMIO accesses
                    // to these MSI-X specific ranges.
                    //
                    // Using a BtreeMap as the list provided through the iterator is sorted
                    // by key. This ensures proper split of the whole region.
                    let mut inter_ranges = BTreeMap::new();
                    if let Some(msix) = vfio_msix {
                        if region_index == msix.cap.table_bir() {
                            let (offset, size) = msix.cap.table_range();
                            inter_ranges.insert(offset, size);
                        }
                        if region_index == msix.cap.pba_bir() {
                            let (offset, size) = msix.cap.pba_range();
                            inter_ranges.insert(offset, size);
                        }
                    }

                    // Walk the sorted ranges, emitting the gaps between them
                    // as mmap-able areas; the excluded ranges themselves are
                    // rounded up to a 4 KiB boundary.
                    let mut sparse_areas = Vec::new();
                    let mut current_offset = 0;
                    for (range_offset, range_size) in inter_ranges {
                        if range_offset > current_offset {
                            sparse_areas.push(VfioRegionSparseMmapArea {
                                offset: current_offset,
                                size: range_offset - current_offset,
                            });
                        }

                        current_offset = Self::align_4k(range_offset + range_size);
                    }

                    // Tail of the region after the last excluded range.
                    if region_size > current_offset {
                        sparse_areas.push(VfioRegionSparseMmapArea {
                            offset: current_offset,
                            size: region_size - current_offset,
                        });
                    }

                    return Ok(sparse_areas);
                }
                _ => {}
            }
        }

        // In case no relevant capabilities have been found, create a single
        // sparse area corresponding to the entire MMIO region.
        Ok(vec![VfioRegionSparseMmapArea {
            offset: 0,
            size: region_size,
        }])
    }
1372 
    /// Map MMIO regions into the guest, and avoid VM exits when the guest
    /// tries to reach those regions.
    ///
    /// Each region (or sparse sub-area of it) that VFIO flags as mmap-able
    /// is mmap()ed from the device fd into the VMM address space and then
    /// registered with the hypervisor as a user memory region, so guest
    /// accesses hit the device directly. The created regions are recorded
    /// so `unmap_mmio_regions()` can undo them.
    pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
        let fd = self.device.as_raw_fd();

        for region in self.common.mmio_regions.iter_mut() {
            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                // Mirror the VFIO read/write flags in the mmap protection.
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }

                // Retrieve the list of capabilities found on the region
                let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
                    self.device.get_region_caps(region.index)
                } else {
                    Vec::new()
                };

                // Don't try to mmap the region if it contains MSI-X table or
                // MSI-X PBA subregion, and if we couldn't find MSIX_MAPPABLE
                // in the list of supported capabilities.
                if let Some(msix) = self.common.interrupt.msix.as_ref() {
                    if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
                        && !caps.contains(&VfioRegionInfoCap::MsixMappable)
                    {
                        continue;
                    }
                }

                let mmap_size = self.device.get_region_size(region.index);
                let mmap_offset = self.device.get_region_offset(region.index);

                let sparse_areas = Self::generate_sparse_areas(
                    &caps,
                    region.index,
                    region.start.0,
                    mmap_size,
                    self.common.interrupt.msix.as_ref(),
                )?;

                for area in sparse_areas.iter() {
                    // Map the sparse area from the VFIO device fd at the
                    // region offset reported by the kernel. The result is
                    // checked against MAP_FAILED right below.
                    let host_addr = unsafe {
                        libc::mmap(
                            null_mut(),
                            area.size as usize,
                            prot,
                            libc::MAP_SHARED,
                            fd,
                            mmap_offset as libc::off_t + area.offset as libc::off_t,
                        )
                    };

                    if host_addr == libc::MAP_FAILED {
                        error!(
                            "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
                            area.offset,
                            area.size,
                            std::io::Error::last_os_error()
                        );
                        return Err(VfioPciError::MmapArea);
                    }

                    // Record the mapping so it can be removed/unmapped later.
                    let user_memory_region = UserMemoryRegion {
                        slot: (self.memory_slot)(),
                        start: region.start.0 + area.offset,
                        size: area.size,
                        host_addr: host_addr as u64,
                    };

                    region.user_memory_regions.push(user_memory_region);

                    let mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(mem_region)
                        .map_err(VfioPciError::CreateUserMemoryRegion)?;
                }
            }
        }

        Ok(())
    }
1473 
    /// Undoes `map_mmio_regions()`: unregisters every recorded user memory
    /// region from the hypervisor and munmap()s the host mapping.
    ///
    /// Errors are logged and not propagated, as this runs on teardown paths
    /// (including `Drop`).
    pub fn unmap_mmio_regions(&mut self) {
        for region in self.common.mmio_regions.iter() {
            for user_memory_region in region.user_memory_regions.iter() {
                // Remove region
                let r = self.vm.make_user_memory_region(
                    user_memory_region.slot,
                    user_memory_region.start,
                    user_memory_region.size,
                    user_memory_region.host_addr,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                // Release the host mapping established by map_mmio_regions().
                let ret = unsafe {
                    libc::munmap(
                        user_memory_region.host_addr as *mut libc::c_void,
                        user_memory_region.size as usize,
                    )
                };
                if ret != 0 {
                    error!(
                        "Could not unmap region {}, error:{}",
                        region.index,
                        io::Error::last_os_error()
                    );
                }
            }
        }
    }
1507 
1508     pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
1509         if !self.iommu_attached {
1510             self.container
1511                 .vfio_dma_map(iova, size, user_addr)
1512                 .map_err(VfioPciError::DmaMap)?;
1513         }
1514 
1515         Ok(())
1516     }
1517 
1518     pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
1519         if !self.iommu_attached {
1520             self.container
1521                 .vfio_dma_unmap(iova, size)
1522                 .map_err(VfioPciError::DmaUnmap)?;
1523         }
1524 
1525         Ok(())
1526     }
1527 
    /// Returns a copy of the device's MMIO region descriptors.
    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.common.mmio_regions.clone()
    }
1531 }
1532 
impl Drop for VfioPciDevice {
    fn drop(&mut self) {
        // Tear down the guest mappings of the device regions first.
        self.unmap_mmio_regions();

        // Then disable whichever interrupt mode is still active, so the
        // VFIO device is left with no interrupts enabled.
        if let Some(msix) = &self.common.interrupt.msix {
            if msix.bar.enabled() {
                self.common.disable_msix();
            }
        }

        if let Some(msi) = &self.common.interrupt.msi {
            if msi.cfg.enabled() {
                self.common.disable_msi()
            }
        }

        if self.common.interrupt.intx_in_use() {
            self.common.disable_intx();
        }
    }
}
1554 
impl BusDevice for VfioPciDevice {
    // Bus accesses are routed to the PciDevice BAR handlers below.
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}
1564 
// Constants describing the standard PCI configuration space layout.
// First BAR offset in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// Extended capabilities register offset in the PCI config space.
const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
// IO BAR when first BAR bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory bar flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// Prefetchable BAR bit.
const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device.
const BAR_NUMS: usize = 6;
// PCI Header Type register index (register 3, containing the Header Type byte).
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index.
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index.
const PCI_ROM_EXP_BAR_INDEX: usize = 12;
1587 
impl PciDevice for VfioPciDevice {
    // BAR allocation/freeing and config space accesses are delegated to the
    // shared VfioCommon logic.
    fn allocate_bars(
        &mut self,
        allocator: &Arc<Mutex<SystemAllocator>>,
        mmio_allocator: &mut AddressAllocator,
        resources: Option<Vec<Resource>>,
    ) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
        self.common
            .allocate_bars(allocator, mmio_allocator, resources)
    }

    fn free_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        mmio_allocator: &mut AddressAllocator,
    ) -> Result<(), PciDeviceError> {
        self.common.free_bars(allocator, mmio_allocator)
    }

    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        self.common.write_config_register(reg_idx, offset, data)
    }

    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        self.common.read_config_register(reg_idx)
    }

    // BAR reprogramming is detected against the local configuration cache.
    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.common
            .configuration
            .detect_bar_reprogramming(reg_idx, data)
    }

    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.common.read_bar(base, offset, data)
    }

    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.common.write_bar(base, offset, data)
    }

    /// Relocates a BAR to a new guest address: the matching MMIO region is
    /// updated and each of its user memory regions is removed from the
    /// hypervisor, shifted by the same delta, and re-created.
    fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
        for region in self.common.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                for user_memory_region in region.user_memory_regions.iter_mut() {
                    // Remove old region
                    let old_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .remove_user_memory_region(old_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

                    // Update the user memory region with the correct start address.
                    if new_base > old_base {
                        user_memory_region.start += new_base - old_base;
                    } else {
                        user_memory_region.start -= old_base - new_base;
                    }

                    // Insert new region
                    let new_mem_region = self.vm.make_user_memory_region(
                        user_memory_region.slot,
                        user_memory_region.start,
                        user_memory_region.size,
                        user_memory_region.host_addr,
                        false,
                        false,
                    );

                    self.vm
                        .create_user_memory_region(new_mem_region)
                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
                }
            }
        }

        Ok(())
    }

    fn as_any(&mut self) -> &mut dyn Any {
        self
    }

    fn id(&self) -> Option<String> {
        Some(self.id.clone())
    }
}
1693 
1694 impl Pausable for VfioPciDevice {}
1695 
impl Snapshottable for VfioPciDevice {
    fn id(&self) -> String {
        self.id.clone()
    }

    /// The device snapshot is just a wrapper around the VfioCommon snapshot.
    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vfio_pci_dev_snapshot = Snapshot::new(&self.id);

        // Snapshot VfioCommon
        vfio_pci_dev_snapshot.add_snapshot(self.common.snapshot()?);

        Ok(vfio_pci_dev_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        // Restore VfioCommon
        if let Some(vfio_common_snapshot) = snapshot.snapshots.get(&self.common.id()) {
            self.common.restore(*vfio_common_snapshot.clone())?;
            // Re-establish the direct MMIO mappings that were skipped during
            // construction on the restore path.
            self.map_mmio_regions().map_err(|e| {
                MigratableError::Restore(anyhow!(
                    "Could not map MMIO regions for VfioPciDevice on restore {:?}",
                    e
                ))
            })?;
        }

        Ok(())
    }
}
// Snapshot transport and migration rely entirely on the traits' default
// method implementations.
impl Transportable for VfioPciDevice {}
impl Migratable for VfioPciDevice {}
1727