xref: /cloud-hypervisor/pci/src/vfio.rs (revision 9af2968a7dc47b89bf07ea9dc5e735084efcfa3a)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 
6 use crate::{
7     msi_num_enabled_vectors, BarReprogrammingParams, MsiConfig, MsixCap, MsixConfig,
8     PciBarConfiguration, PciBarRegionType, PciCapabilityId, PciClassCode, PciConfiguration,
9     PciDevice, PciDeviceError, PciHeaderType, PciSubclass, MSIX_TABLE_ENTRY_SIZE,
10 };
11 use byteorder::{ByteOrder, LittleEndian};
12 use std::any::Any;
13 use std::os::unix::io::AsRawFd;
14 use std::ptr::null_mut;
15 use std::sync::{Arc, Barrier};
16 use std::{fmt, io, result};
17 use vfio_bindings::bindings::vfio::*;
18 use vfio_ioctls::{VfioContainer, VfioDevice, VfioError};
19 use vm_allocator::SystemAllocator;
20 use vm_device::interrupt::{
21     InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
22 };
23 use vm_device::BusDevice;
24 use vm_memory::{Address, GuestAddress, GuestUsize};
25 use vmm_sys_util::eventfd::EventFd;
26 
/// Errors that can occur while setting up or operating a VFIO PCI device.
#[derive(Debug)]
pub enum VfioPciError {
    /// Failed to allocate a GSI for an interrupt route.
    AllocateGsi,
    /// VFIO DMA map ioctl failed.
    DmaMap(VfioError),
    /// VFIO DMA unmap ioctl failed.
    DmaUnmap(VfioError),
    /// Enabling the legacy INTx interrupt failed.
    EnableIntx(VfioError),
    /// Enabling MSI vectors failed.
    EnableMsi(VfioError),
    /// Enabling MSI-X vectors failed.
    EnableMsix(VfioError),
    /// Creating an eventfd failed.
    EventFd(io::Error),
    /// Creating an interrupt source group failed.
    InterruptSourceGroupCreate(io::Error),
    /// Registering an irqfd with the hypervisor failed.
    IrqFd(hypervisor::HypervisorVmError),
    /// Mapping a VFIO region into the guest address space failed.
    MapRegionGuest(anyhow::Error),
    /// An interrupt source group had no notifier eventfd for a vector.
    MissingNotifier,
    /// MSI was accessed before being configured.
    MsiNotConfigured,
    /// MSI-X was accessed before being configured.
    MsixNotConfigured,
    /// Generic VFIO PCI device creation failure.
    NewVfioPciDevice,
    /// Updating the hypervisor GSI routing table failed.
    SetGsiRouting(hypervisor::HypervisorVmError),
}
/// Convenience alias used throughout this module.
pub type Result<T> = std::result::Result<T, VfioPciError>;
46 
47 impl fmt::Display for VfioPciError {
48     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
49         match self {
50             VfioPciError::AllocateGsi => write!(f, "failed to allocate GSI"),
51             VfioPciError::DmaMap(e) => write!(f, "failed to DMA map: {}", e),
52             VfioPciError::DmaUnmap(e) => write!(f, "failed to DMA unmap: {}", e),
53             VfioPciError::EnableIntx(e) => write!(f, "failed to enable INTx: {}", e),
54             VfioPciError::EnableMsi(e) => write!(f, "failed to enable MSI: {}", e),
55             VfioPciError::EnableMsix(e) => write!(f, "failed to enable MSI-X: {}", e),
56             VfioPciError::EventFd(e) => write!(f, "failed to create eventfd: {}", e),
57             VfioPciError::InterruptSourceGroupCreate(e) => {
58                 write!(f, "failed to create interrupt source group: {}", e)
59             }
60             VfioPciError::IrqFd(e) => write!(f, "failed to register irqfd: {}", e),
61             VfioPciError::MapRegionGuest(e) => {
62                 write!(f, "failed to map VFIO PCI region into guest: {}", e)
63             }
64             VfioPciError::MissingNotifier => write!(f, "failed to notifier's eventfd"),
65             VfioPciError::MsiNotConfigured => write!(f, "MSI interrupt not yet configured"),
66             VfioPciError::MsixNotConfigured => write!(f, "MSI-X interrupt not yet configured"),
67             VfioPciError::NewVfioPciDevice => write!(f, "failed to create VFIO PCI device"),
68             VfioPciError::SetGsiRouting(e) => write!(f, "failed to set GSI routes: {}", e),
69         }
70     }
71 }
72 
/// PCI subclass value advertised for passed-through VFIO devices.
/// 0xff is the "other / vendor specific" subclass code.
#[derive(Copy, Clone)]
enum PciVfioSubclass {
    VfioSubclass = 0xff,
}

impl PciSubclass for PciVfioSubclass {
    /// Return the raw subclass byte written into the config space header.
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}
83 
/// Transition detected after a guest write to an MSI/MSI-X capability:
/// the enable bit was flipped on or off, telling the device code which
/// interrupt mode to (de)activate.
enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}
90 
/// Legacy INTx interrupt state for a VFIO PCI device.
struct VfioIntx {
    // Interrupt source group delivering the single INTx vector.
    interrupt_source_group: Arc<Box<dyn InterruptSourceGroup>>,
    // Whether INTx is currently enabled on the VFIO device.
    enabled: bool,
}
95 
/// MSI interrupt state for a VFIO PCI device.
struct VfioMsi {
    // Emulated MSI capability register state.
    cfg: MsiConfig,
    // Offset of the MSI capability in the PCI config space.
    cap_offset: u32,
    // Interrupt source group backing the MSI vectors.
    interrupt_source_group: Arc<Box<dyn InterruptSourceGroup>>,
}
101 
102 impl VfioMsi {
103     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
104         let old_enabled = self.cfg.enabled();
105 
106         self.cfg.update(offset, data);
107 
108         let new_enabled = self.cfg.enabled();
109 
110         if !old_enabled && new_enabled {
111             return Some(InterruptUpdateAction::EnableMsi);
112         }
113 
114         if old_enabled && !new_enabled {
115             return Some(InterruptUpdateAction::DisableMsi);
116         }
117 
118         None
119     }
120 }
121 
/// MSI-X interrupt state for a VFIO PCI device.
struct VfioMsix {
    // Emulated MSI-X table/PBA (lives in one of the BARs).
    bar: MsixConfig,
    // Cached MSI-X capability structure read from the device.
    cap: MsixCap,
    // Offset of the MSI-X capability in the PCI config space.
    cap_offset: u32,
    // Interrupt source group backing the MSI-X vectors.
    interrupt_source_group: Arc<Box<dyn InterruptSourceGroup>>,
}
128 
129 impl VfioMsix {
130     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
131         let old_enabled = self.bar.enabled();
132 
133         // Update "Message Control" word
134         if offset == 2 && data.len() == 2 {
135             self.bar.set_msg_ctl(LittleEndian::read_u16(data));
136         }
137 
138         let new_enabled = self.bar.enabled();
139 
140         if !old_enabled && new_enabled {
141             return Some(InterruptUpdateAction::EnableMsix);
142         }
143 
144         if old_enabled && !new_enabled {
145             return Some(InterruptUpdateAction::DisableMsix);
146         }
147 
148         None
149     }
150 
151     fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
152         let table_offset: u64 = u64::from(self.cap.table_offset());
153         let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
154         let table_bir: u32 = self.cap.table_bir();
155 
156         bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
157     }
158 }
159 
/// Aggregated interrupt state for a VFIO PCI device. At most one of the
/// three modes is active at a time; a mode is `None` when the device
/// doesn't support it (or, for INTx, when no legacy group was supplied).
struct Interrupt {
    intx: Option<VfioIntx>,
    msi: Option<VfioMsi>,
    msix: Option<VfioMsix>,
}
165 
166 impl Interrupt {
167     fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
168         if let Some(ref mut msi) = &mut self.msi {
169             let action = msi.update(offset, data);
170             return action;
171         }
172 
173         None
174     }
175 
176     fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
177         if let Some(ref mut msix) = &mut self.msix {
178             let action = msix.update(offset, data);
179             return action;
180         }
181 
182         None
183     }
184 
185     fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
186         if let Some(msi) = &self.msi {
187             if offset >= u64::from(msi.cap_offset)
188                 && offset < u64::from(msi.cap_offset) + msi.cfg.size()
189             {
190                 return Some((
191                     PciCapabilityId::MessageSignalledInterrupts,
192                     u64::from(msi.cap_offset),
193                 ));
194             }
195         }
196 
197         if let Some(msix) = &self.msix {
198             if offset == u64::from(msix.cap_offset) {
199                 return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
200             }
201         }
202 
203         None
204     }
205 
206     fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
207         if let Some(msix) = &self.msix {
208             return msix.table_accessed(bar_index, offset);
209         }
210 
211         false
212     }
213 
214     fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
215         if let Some(ref mut msix) = &mut self.msix {
216             let offset = offset - u64::from(msix.cap.table_offset());
217             msix.bar.write_table(offset, data)
218         }
219     }
220 
221     fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
222         if let Some(msix) = &self.msix {
223             let offset = offset - u64::from(msix.cap.table_offset());
224             msix.bar.read_table(offset, data)
225         }
226     }
227 
228     fn intx_in_use(&self) -> bool {
229         if let Some(intx) = &self.intx {
230             return intx.enabled;
231         }
232 
233         false
234     }
235 }
236 
/// One MMIO (or I/O) BAR region of the device as seen by the guest.
#[derive(Copy, Clone)]
pub struct MmioRegion {
    /// Guest physical address where the BAR is mapped.
    pub start: GuestAddress,
    /// Size of the BAR region in bytes.
    pub length: GuestUsize,
    // BAR type (IO / 32-bit memory / 64-bit memory).
    type_: PciBarRegionType,
    // VFIO region index of this BAR.
    index: u32,
    // The three fields below are set only once the region has been
    // mmap'ed into the VMM and registered as a user memory region.
    mem_slot: Option<u32>,
    host_addr: Option<u64>,
    mmap_size: Option<usize>,
}
247 
/// Thin wrapper around a `VfioDevice` providing typed accessors into the
/// device's PCI configuration space region.
struct VfioPciConfig {
    device: Arc<VfioDevice>,
}
251 
252 impl VfioPciConfig {
253     fn new(device: Arc<VfioDevice>) -> Self {
254         VfioPciConfig { device }
255     }
256 
257     fn read_config_byte(&self, offset: u32) -> u8 {
258         let mut data: [u8; 1] = [0];
259         self.device
260             .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into());
261 
262         data[0]
263     }
264 
265     fn read_config_word(&self, offset: u32) -> u16 {
266         let mut data: [u8; 2] = [0, 0];
267         self.device
268             .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into());
269 
270         u16::from_le_bytes(data)
271     }
272 
273     fn read_config_dword(&self, offset: u32) -> u32 {
274         let mut data: [u8; 4] = [0, 0, 0, 0];
275         self.device
276             .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into());
277 
278         u32::from_le_bytes(data)
279     }
280 
281     fn write_config_dword(&self, buf: u32, offset: u32) {
282         let data: [u8; 4] = buf.to_le_bytes();
283         self.device
284             .region_write(VFIO_PCI_CONFIG_REGION_INDEX, &data, offset.into())
285     }
286 }
287 
/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
pub struct VfioPciDevice {
    // Hypervisor VM handle used to (un)register guest memory regions.
    vm: Arc<dyn hypervisor::Vm>,
    // The underlying VFIO device.
    device: Arc<VfioDevice>,
    // VFIO container, used for DMA (un)mapping when not behind a vIOMMU.
    container: Arc<VfioContainer>,
    // Accessor for the device's real PCI config space.
    vfio_pci_configuration: VfioPciConfig,
    // Emulated PCI config space presented to the guest.
    configuration: PciConfiguration,
    // BAR regions allocated for this device.
    mmio_regions: Vec<MmioRegion>,
    // INTx/MSI/MSI-X interrupt state.
    interrupt: Interrupt,
    // True when the device sits behind a virtual IOMMU; in that case DMA
    // mappings are handled elsewhere (see dma_map/dma_unmap).
    iommu_attached: bool,
}
304 
impl VfioPciDevice {
    /// Constructs a new Vfio Pci device for the given Vfio device.
    ///
    /// Resets the device, builds the emulated PCI configuration space,
    /// parses the MSI/MSI-X capabilities from the real config space, and
    /// initializes the legacy INTx interrupt when a group is provided.
    pub fn new(
        vm: &Arc<dyn hypervisor::Vm>,
        device: VfioDevice,
        container: Arc<VfioContainer>,
        msi_interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        legacy_interrupt_group: Option<Arc<Box<dyn InterruptSourceGroup>>>,
        iommu_attached: bool,
    ) -> Result<Self> {
        let device = Arc::new(device);
        device.reset();

        // Vendor/device IDs and BARs are left at zero here; the real values
        // are read from / written to the device's config space on demand.
        let configuration = PciConfiguration::new(
            0,
            0,
            0,
            PciClassCode::Other,
            &PciVfioSubclass::VfioSubclass,
            None,
            PciHeaderType::Device,
            0,
            0,
            None,
        );

        let vfio_pci_configuration = VfioPciConfig::new(Arc::clone(&device));

        let mut vfio_pci_device = VfioPciDevice {
            vm: vm.clone(),
            device,
            container,
            configuration,
            vfio_pci_configuration,
            mmio_regions: Vec::new(),
            interrupt: Interrupt {
                intx: None,
                msi: None,
                msix: None,
            },
            iommu_attached,
        };

        vfio_pci_device.parse_capabilities(msi_interrupt_manager);

        vfio_pci_device.initialize_legacy_interrupt(legacy_interrupt_group)?;

        Ok(vfio_pci_device)
    }

    /// Whether this device is attached to a virtual IOMMU.
    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }

    /// Enable legacy INTx on the VFIO device (no-op if already enabled or
    /// if INTx was never initialized).
    ///
    /// Returns `MissingNotifier` when the interrupt source group has no
    /// eventfd for vector 0.
    fn enable_intx(&mut self) -> Result<()> {
        if let Some(intx) = &mut self.interrupt.intx {
            if !intx.enabled {
                if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
                    self.device
                        .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
                        .map_err(VfioPciError::EnableIntx)?;

                    intx.enabled = true;
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }
        }

        Ok(())
    }

    /// Disable legacy INTx; failures are logged, not propagated, since this
    /// is used on best-effort paths (mode switches, teardown).
    fn disable_intx(&mut self) {
        if let Some(intx) = &mut self.interrupt.intx {
            if intx.enabled {
                if let Err(e) = self.device.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
                    error!("Could not disable INTx: {}", e);
                } else {
                    intx.enabled = false;
                }
            }
        }
    }

    /// Enable MSI on the VFIO device, wiring one eventfd per enabled vector.
    fn enable_msi(&self) -> Result<()> {
        if let Some(msi) = &self.interrupt.msi {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msi.cfg.num_enabled_vectors() {
                if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.device
                .enable_msi(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsi)?;
        }

        Ok(())
    }

    /// Disable MSI; failures are logged, not propagated.
    fn disable_msi(&self) {
        if let Err(e) = self.device.disable_msi() {
            error!("Could not disable MSI: {}", e);
        }
    }

    /// Enable MSI-X on the VFIO device, wiring one eventfd per table entry.
    fn enable_msix(&self) -> Result<()> {
        if let Some(msix) = &self.interrupt.msix {
            let mut irq_fds: Vec<EventFd> = Vec::new();
            for i in 0..msix.bar.table_entries.len() {
                if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
                    irq_fds.push(eventfd);
                } else {
                    return Err(VfioPciError::MissingNotifier);
                }
            }

            self.device
                .enable_msix(irq_fds.iter().collect())
                .map_err(VfioPciError::EnableMsix)?;
        }

        Ok(())
    }

    /// Disable MSI-X; failures are logged, not propagated.
    fn disable_msix(&self) {
        if let Err(e) = self.device.disable_msix() {
            error!("Could not disable MSI-X: {}", e);
        }
    }

    /// Hook up the legacy INTx interrupt if both the device supports it
    /// (IRQ info count > 0) and the caller supplied an interrupt group.
    fn initialize_legacy_interrupt(
        &mut self,
        legacy_interrupt_group: Option<Arc<Box<dyn InterruptSourceGroup>>>,
    ) -> Result<()> {
        if let Some(irq_info) = self.device.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
            if irq_info.count == 0 {
                // A count of 0 means the INTx IRQ is not supported, therefore
                // it shouldn't be initialized.
                return Ok(());
            }
        }

        if let Some(interrupt_source_group) = legacy_interrupt_group {
            self.interrupt.intx = Some(VfioIntx {
                interrupt_source_group,
                enabled: false,
            });

            self.enable_intx()?;
        }

        Ok(())
    }

    /// Read the MSI-X capability structure at config-space offset `cap`,
    /// create the matching interrupt source group and emulated MSI-X BAR
    /// state, and record it in `self.interrupt.msix`.
    fn parse_msix_capabilities(
        &mut self,
        cap: u8,
        interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    ) {
        // Offsets +2/+4/+8 are the Message Control, Table and PBA registers
        // of the MSI-X capability structure.
        let msg_ctl = self
            .vfio_pci_configuration
            .read_config_word((cap + 2).into());

        let table = self
            .vfio_pci_configuration
            .read_config_dword((cap + 4).into());

        let pba = self
            .vfio_pci_configuration
            .read_config_dword((cap + 8).into());

        let msix_cap = MsixCap {
            msg_ctl,
            table,
            pba,
        };

        // NOTE(review): .unwrap() panics if group creation fails — presumably
        // acceptable at device-creation time, but worth confirming.
        let interrupt_source_group = interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msix_cap.table_size() as InterruptIndex,
            })
            .unwrap();

        let msix_config = MsixConfig::new(msix_cap.table_size(), interrupt_source_group.clone(), 0);

        self.interrupt.msix = Some(VfioMsix {
            bar: msix_config,
            cap: msix_cap,
            cap_offset: cap.into(),
            interrupt_source_group,
        });
    }

    /// Read the MSI capability at config-space offset `cap`, create the
    /// matching interrupt source group, and record it in
    /// `self.interrupt.msi`.
    fn parse_msi_capabilities(
        &mut self,
        cap: u8,
        interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    ) {
        let msg_ctl = self
            .vfio_pci_configuration
            .read_config_word((cap + 2).into());

        // NOTE(review): .unwrap() panics if group creation fails — presumably
        // acceptable at device-creation time, but worth confirming.
        let interrupt_source_group = interrupt_manager
            .create_group(MsiIrqGroupConfig {
                base: 0,
                count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
            })
            .unwrap();

        let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone());

        self.interrupt.msi = Some(VfioMsi {
            cfg: msi_config,
            cap_offset: cap.into(),
            interrupt_source_group,
        });
    }

    /// Walk the PCI capability linked list in the device's config space and
    /// parse the MSI and MSI-X capabilities (only when the device actually
    /// exposes the corresponding VFIO IRQ with a non-zero count).
    fn parse_capabilities(
        &mut self,
        interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
    ) {
        // The capabilities pointer register (0x34) holds the offset of the
        // first capability; each capability's byte 1 points to the next.
        let mut cap_next = self
            .vfio_pci_configuration
            .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);

        while cap_next != 0 {
            let cap_id = self
                .vfio_pci_configuration
                .read_config_byte(cap_next.into());

            match PciCapabilityId::from(cap_id) {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Some(irq_info) = self.device.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI.
                            self.parse_msi_capabilities(cap_next, interrupt_manager);
                        }
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Some(irq_info) = self.device.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX) {
                        if irq_info.count > 0 {
                            // Parse capability only if the VFIO device
                            // supports MSI-X.
                            self.parse_msix_capabilities(cap_next, interrupt_manager);
                        }
                    }
                }
                _ => {}
            };

            // Follow the "next capability" pointer (byte 1 of the header).
            cap_next = self
                .vfio_pci_configuration
                .read_config_byte((cap_next + 1).into());
        }
    }

    /// React to a guest write against the MSI capability: switch between
    /// INTx and MSI when the enable bit flips.
    fn update_msi_capabilities(&mut self, offset: u64, data: &[u8]) -> Result<()> {
        match self.interrupt.update_msi(offset, data) {
            Some(InterruptUpdateAction::EnableMsi) => {
                // Disable INTx before we can enable MSI
                self.disable_intx();
                self.enable_msi()?;
            }
            Some(InterruptUpdateAction::DisableMsi) => {
                // Fallback onto INTx when disabling MSI
                self.disable_msi();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    /// React to a guest write against the MSI-X capability: switch between
    /// INTx and MSI-X when the enable bit flips.
    fn update_msix_capabilities(&mut self, offset: u64, data: &[u8]) -> Result<()> {
        match self.interrupt.update_msix(offset, data) {
            Some(InterruptUpdateAction::EnableMsix) => {
                // Disable INTx before we can enable MSI-X
                self.disable_intx();
                self.enable_msix()?;
            }
            Some(InterruptUpdateAction::DisableMsix) => {
                // Fallback onto INTx when disabling MSI-X
                self.disable_msix();
                self.enable_intx()?;
            }
            _ => {}
        }

        Ok(())
    }

    /// Find the MMIO region containing guest address `addr`, if any.
    fn find_region(&self, addr: u64) -> Option<MmioRegion> {
        for region in self.mmio_regions.iter() {
            if addr >= region.start.raw_value()
                && addr < region.start.unchecked_add(region.length).raw_value()
            {
                return Some(*region);
            }
        }
        None
    }

    /// Map MMIO regions into the guest, and avoid VM exits when the guest tries
    /// to reach those regions.
    ///
    /// # Arguments
    ///
    /// * `vm` - The VM object. It is used to set the VFIO MMIO regions
    ///          as user memory regions.
    /// * `mem_slot` - The closure to return a memory slot.
    pub fn map_mmio_regions<F>(&mut self, vm: &Arc<dyn hypervisor::Vm>, mem_slot: F) -> Result<()>
    where
        F: Fn() -> u32,
    {
        let fd = self.device.as_raw_fd();

        for region in self.mmio_regions.iter_mut() {
            // We want to skip the mapping of the BAR containing the MSI-X
            // table even if it is mappable. The reason is we need to trap
            // any access to the MSI-X table and update the GSI routing
            // accordingly.
            if let Some(msix) = &self.interrupt.msix {
                if region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir() {
                    continue;
                }
            }

            // Only regions VFIO advertises as mmap-able can be mapped
            // directly; others keep being trapped through the bus.
            let region_flags = self.device.get_region_flags(region.index);
            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                let mut prot = 0;
                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
                    prot |= libc::PROT_READ;
                }
                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
                    prot |= libc::PROT_WRITE;
                }
                let (mmap_offset, mmap_size) = self.device.get_region_mmap(region.index);
                let offset = self.device.get_region_offset(region.index) + mmap_offset;

                // SAFETY-style note: mmap on the VFIO device fd with the
                // region's own offset/size; failure is handled right below.
                let host_addr = unsafe {
                    libc::mmap(
                        null_mut(),
                        mmap_size as usize,
                        prot,
                        libc::MAP_SHARED,
                        fd,
                        offset as libc::off_t,
                    )
                };

                if host_addr == libc::MAP_FAILED {
                    error!(
                        "Could not mmap regions, error:{}",
                        io::Error::last_os_error()
                    );
                    continue;
                }

                let slot = mem_slot();
                let mem_region = vm.make_user_memory_region(
                    slot,
                    region.start.raw_value() + mmap_offset,
                    mmap_size as u64,
                    host_addr as u64,
                    false,
                    false,
                );

                vm.create_user_memory_region(mem_region)
                    .map_err(|e| VfioPciError::MapRegionGuest(e.into()))?;

                // Update the region with memory mapped info.
                region.mem_slot = Some(slot);
                region.host_addr = Some(host_addr as u64);
                region.mmap_size = Some(mmap_size as usize);
            }
        }

        Ok(())
    }

    /// Undo `map_mmio_regions`: deregister each mapped region from the VM
    /// and munmap it from the VMM address space. Errors are logged only.
    pub fn unmap_mmio_regions(&mut self) {
        for region in self.mmio_regions.iter() {
            // Only regions that were actually mapped carry all three fields.
            if let (Some(host_addr), Some(mmap_size), Some(mem_slot)) =
                (region.host_addr, region.mmap_size, region.mem_slot)
            {
                let (mmap_offset, _) = self.device.get_region_mmap(region.index);

                // Remove region
                let r = self.vm.make_user_memory_region(
                    mem_slot,
                    region.start.raw_value() + mmap_offset,
                    mmap_size as u64,
                    host_addr as u64,
                    false,
                    false,
                );

                if let Err(e) = self.vm.remove_user_memory_region(r) {
                    error!("Could not remove the userspace memory region: {}", e);
                }

                // SAFETY-style note: unmaps exactly the range mapped in
                // map_mmio_regions above.
                let ret = unsafe { libc::munmap(host_addr as *mut libc::c_void, mmap_size) };
                if ret != 0 {
                    error!(
                        "Could not unmap region {}, error:{}",
                        region.index,
                        io::Error::last_os_error()
                    );
                }
            }
        }
    }

    /// Map a guest IOVA range into the VFIO container, unless the device
    /// is behind a vIOMMU (in which case mappings are driven by the guest).
    pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<()> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_map(iova, size, user_addr)
                .map_err(VfioPciError::DmaMap)?;
        }

        Ok(())
    }

    /// Remove a guest IOVA mapping from the VFIO container, unless the
    /// device is behind a vIOMMU.
    pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
        if !self.iommu_attached {
            self.container
                .vfio_dma_unmap(iova, size)
                .map_err(VfioPciError::DmaUnmap)?;
        }

        Ok(())
    }

    /// Return a copy of the device's BAR regions.
    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.mmio_regions.clone()
    }
}
752 
753 impl Drop for VfioPciDevice {
754     fn drop(&mut self) {
755         self.unmap_mmio_regions();
756 
757         if let Some(msix) = &self.interrupt.msix {
758             if msix.bar.enabled() {
759                 self.disable_msix();
760             }
761         }
762 
763         if let Some(msi) = &self.interrupt.msi {
764             if msi.cfg.enabled() {
765                 self.disable_msi();
766             }
767         }
768 
769         if self.interrupt.intx_in_use() {
770             self.disable_intx();
771         }
772     }
773 }
774 
impl BusDevice for VfioPciDevice {
    /// Delegate bus reads to the PCI BAR read path.
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    /// Delegate bus writes to the PCI BAR write path.
    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}
784 
// First BAR offset in the PCI config space (BAR0 register).
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capabilities pointer register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// IO BAR when first BAR bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory bar flag (bit 2 of a memory BAR's type field).
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device header type 0.
const BAR_NUMS: usize = 6;
// PCI Header Type register index (dword index into config space).
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index (dword index into config space).
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index (dword index into config space).
const PCI_ROM_EXP_BAR_INDEX: usize = 12;
803 
804 impl PciDevice for VfioPciDevice {
805     fn allocate_bars(
806         &mut self,
807         allocator: &mut SystemAllocator,
808     ) -> std::result::Result<Vec<(GuestAddress, GuestUsize, PciBarRegionType)>, PciDeviceError>
809     {
810         let mut ranges = Vec::new();
811         let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX as u32;
812 
813         // Going through all regular regions to compute the BAR size.
814         // We're not saving the BAR address to restore it, because we
815         // are going to allocate a guest address for each BAR and write
816         // that new address back.
817         while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
818             let region_size: u64;
819             let bar_addr: GuestAddress;
820 
821             let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
822                 (PCI_ROM_EXP_BAR_INDEX * 4) as u32
823             } else {
824                 PCI_CONFIG_BAR_OFFSET + bar_id * 4
825             };
826 
827             // First read flags
828             let flags = self.vfio_pci_configuration.read_config_dword(bar_offset);
829 
830             // Is this an IO BAR?
831             let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
832                 matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
833             } else {
834                 false
835             };
836 
837             // Is this a 64-bit BAR?
838             let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
839                 matches!(
840                     flags & PCI_CONFIG_MEMORY_BAR_64BIT,
841                     PCI_CONFIG_MEMORY_BAR_64BIT
842                 )
843             } else {
844                 false
845             };
846 
847             // By default, the region type is 32 bits memory BAR.
848             let mut region_type = PciBarRegionType::Memory32BitRegion;
849 
850             // To get size write all 1s
851             self.vfio_pci_configuration
852                 .write_config_dword(0xffff_ffff, bar_offset);
853 
854             // And read back BAR value. The device will write zeros for bits it doesn't care about
855             let mut lower = self.vfio_pci_configuration.read_config_dword(bar_offset);
856 
857             if io_bar {
858                 #[cfg(target_arch = "x86_64")]
859                 {
860                     // IO BAR
861                     region_type = PciBarRegionType::IoRegion;
862 
863                     // Mask flag bits (lowest 2 for I/O bars)
864                     lower &= !0b11;
865 
866                     // BAR is not enabled
867                     if lower == 0 {
868                         bar_id += 1;
869                         continue;
870                     }
871 
872                     // Invert bits and add 1 to calculate size
873                     region_size = (!lower + 1) as u64;
874 
875                     // The address needs to be 4 bytes aligned.
876                     bar_addr = allocator
877                         .allocate_io_addresses(None, region_size, Some(0x4))
878                         .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
879                 }
880                 #[cfg(target_arch = "aarch64")]
881                 unimplemented!()
882             } else if is_64bit_bar {
883                 // 64 bits Memory BAR
884                 region_type = PciBarRegionType::Memory64BitRegion;
885 
886                 // Query size of upper BAR of 64-bit BAR
887                 let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
888                 self.vfio_pci_configuration
889                     .write_config_dword(0xffff_ffff, upper_offset);
890                 let upper = self.vfio_pci_configuration.read_config_dword(upper_offset);
891 
892                 let mut combined_size = u64::from(upper) << 32 | u64::from(lower);
893 
894                 // Mask out flag bits (lowest 4 for memory bars)
895                 combined_size &= !0b1111;
896 
897                 // BAR is not enabled
898                 if combined_size == 0 {
899                     bar_id += 1;
900                     continue;
901                 }
902 
903                 // Invert and add 1 to to find size
904                 region_size = (!combined_size + 1) as u64;
905 
906                 // BAR allocation must be naturally aligned
907                 bar_addr = allocator
908                     .allocate_mmio_addresses(None, region_size, Some(region_size))
909                     .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
910             } else {
911                 // Mask out flag bits (lowest 4 for memory bars)
912                 lower &= !0b1111;
913 
914                 if lower == 0 {
915                     bar_id += 1;
916                     continue;
917                 }
918 
919                 // Invert and add 1 to to find size
920                 region_size = (!lower + 1) as u64;
921 
922                 // BAR allocation must be naturally aligned
923                 bar_addr = allocator
924                     .allocate_mmio_hole_addresses(None, region_size, Some(region_size))
925                     .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
926             }
927 
928             let reg_idx = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
929                 PCI_ROM_EXP_BAR_INDEX
930             } else {
931                 bar_id as usize
932             };
933 
934             // We can now build our BAR configuration block.
935             let config = PciBarConfiguration::default()
936                 .set_register_index(reg_idx)
937                 .set_address(bar_addr.raw_value())
938                 .set_size(region_size)
939                 .set_region_type(region_type);
940 
941             if bar_id == VFIO_PCI_ROM_REGION_INDEX {
942                 self.configuration
943                     .add_pci_rom_bar(&config, flags & 0x1)
944                     .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
945             } else {
946                 self.configuration
947                     .add_pci_bar(&config)
948                     .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
949             }
950 
951             ranges.push((bar_addr, region_size, region_type));
952             self.mmio_regions.push(MmioRegion {
953                 start: bar_addr,
954                 length: region_size,
955                 type_: region_type,
956                 index: bar_id as u32,
957                 mem_slot: None,
958                 host_addr: None,
959                 mmap_size: None,
960             });
961 
962             bar_id += 1;
963             if is_64bit_bar {
964                 bar_id += 1;
965             }
966         }
967 
968         Ok(ranges)
969     }
970 
971     fn free_bars(
972         &mut self,
973         allocator: &mut SystemAllocator,
974     ) -> std::result::Result<(), PciDeviceError> {
975         for region in self.mmio_regions.iter() {
976             match region.type_ {
977                 PciBarRegionType::IoRegion => {
978                     #[cfg(target_arch = "x86_64")]
979                     allocator.free_io_addresses(region.start, region.length);
980                     #[cfg(target_arch = "aarch64")]
981                     error!("I/O region is not supported");
982                 }
983                 PciBarRegionType::Memory32BitRegion => {
984                     allocator.free_mmio_hole_addresses(region.start, region.length);
985                 }
986                 PciBarRegionType::Memory64BitRegion => {
987                     allocator.free_mmio_addresses(region.start, region.length);
988                 }
989             }
990         }
991         Ok(())
992     }
993 
994     fn write_config_register(
995         &mut self,
996         reg_idx: usize,
997         offset: u64,
998         data: &[u8],
999     ) -> Option<Arc<Barrier>> {
1000         // When the guest wants to write to a BAR, we trap it into
1001         // our local configuration space. We're not reprogramming
1002         // VFIO device.
1003         if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
1004             || reg_idx == PCI_ROM_EXP_BAR_INDEX
1005         {
1006             // We keep our local cache updated with the BARs.
1007             // We'll read it back from there when the guest is asking
1008             // for BARs (see read_config_register()).
1009             self.configuration
1010                 .write_config_register(reg_idx, offset, data);
1011             return None;
1012         }
1013 
1014         let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;
1015 
1016         // If the MSI or MSI-X capabilities are accessed, we need to
1017         // update our local cache accordingly.
1018         // Depending on how the capabilities are modified, this could
1019         // trigger a VFIO MSI or MSI-X toggle.
1020         if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
1021             let cap_offset: u64 = reg - cap_base + offset;
1022             match cap_id {
1023                 PciCapabilityId::MessageSignalledInterrupts => {
1024                     if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
1025                         error!("Could not update MSI capabilities: {}", e);
1026                     }
1027                 }
1028                 PciCapabilityId::MsiX => {
1029                     if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
1030                         error!("Could not update MSI-X capabilities: {}", e);
1031                     }
1032                 }
1033                 _ => {}
1034             }
1035         }
1036 
1037         // Make sure to write to the device's PCI config space after MSI/MSI-X
1038         // interrupts have been enabled/disabled. In case of MSI, when the
1039         // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
1040         // the MSI Enable bit in the MSI capability structure found in the PCI
1041         // config space is disabled by default. That's why when the guest is
1042         // enabling this bit, we first need to enable the MSI interrupts with
1043         // VFIO through VFIO_DEVICE_SET_IRQS ioctl, and only after we can write
1044         // to the device region to update the MSI Enable bit.
1045         self.device
1046             .region_write(VFIO_PCI_CONFIG_REGION_INDEX, data, reg + offset);
1047 
1048         None
1049     }
1050 
1051     fn read_config_register(&mut self, reg_idx: usize) -> u32 {
1052         // When reading the BARs, we trap it and return what comes
1053         // from our local configuration space. We want the guest to
1054         // use that and not the VFIO device BARs as it does not map
1055         // with the guest address space.
1056         if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
1057             || reg_idx == PCI_ROM_EXP_BAR_INDEX
1058         {
1059             return self.configuration.read_reg(reg_idx);
1060         }
1061 
1062         // Since we don't support passing multi-functions devices, we should
1063         // mask the multi-function bit, bit 7 of the Header Type byte on the
1064         // register 3.
1065         let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
1066             0xff7f_ffff
1067         } else {
1068             0xffff_ffff
1069         };
1070 
1071         // The config register read comes from the VFIO device itself.
1072         self.vfio_pci_configuration
1073             .read_config_dword((reg_idx * 4) as u32)
1074             & mask
1075     }
1076 
    // Detect whether a guest write to a config register is moving a BAR.
    // Delegated to the local PCI configuration cache, which holds the
    // guest-visible BAR values.
    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.configuration.detect_bar_reprogramming(reg_idx, data)
    }
1084 
1085     fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1086         let addr = base + offset;
1087         if let Some(region) = self.find_region(addr) {
1088             let offset = addr - region.start.raw_value();
1089 
1090             if self.interrupt.msix_table_accessed(region.index, offset) {
1091                 self.interrupt.msix_read_table(offset, data);
1092             } else {
1093                 self.device.region_read(region.index, data, offset);
1094             }
1095         }
1096 
1097         // INTx EOI
1098         // The guest reading from the BAR potentially means the interrupt has
1099         // been received and can be acknowledged.
1100         if self.interrupt.intx_in_use() {
1101             if let Err(e) = self.device.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
1102                 error!("Failed unmasking INTx IRQ: {}", e);
1103             }
1104         }
1105     }
1106 
1107     fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
1108         let addr = base + offset;
1109         if let Some(region) = self.find_region(addr) {
1110             let offset = addr - region.start.raw_value();
1111 
1112             // If the MSI-X table is written to, we need to update our cache.
1113             if self.interrupt.msix_table_accessed(region.index, offset) {
1114                 self.interrupt.msix_write_table(offset, data);
1115             } else {
1116                 self.device.region_write(region.index, data, offset);
1117             }
1118         }
1119 
1120         // INTx EOI
1121         // The guest writing to the BAR potentially means the interrupt has
1122         // been received and can be acknowledged.
1123         if self.interrupt.intx_in_use() {
1124             if let Err(e) = self.device.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
1125                 error!("Failed unmasking INTx IRQ: {}", e);
1126             }
1127         }
1128 
1129         None
1130     }
1131 
1132     fn move_bar(&mut self, old_base: u64, new_base: u64) -> result::Result<(), io::Error> {
1133         for region in self.mmio_regions.iter_mut() {
1134             if region.start.raw_value() == old_base {
1135                 region.start = GuestAddress(new_base);
1136 
1137                 if let Some(mem_slot) = region.mem_slot {
1138                     if let Some(host_addr) = region.host_addr {
1139                         let (mmap_offset, mmap_size) = self.device.get_region_mmap(region.index);
1140 
1141                         // Remove old region
1142                         let old_mem_region = self.vm.make_user_memory_region(
1143                             mem_slot,
1144                             old_base + mmap_offset,
1145                             mmap_size as u64,
1146                             host_addr as u64,
1147                             false,
1148                             false,
1149                         );
1150 
1151                         self.vm
1152                             .remove_user_memory_region(old_mem_region)
1153                             .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
1154 
1155                         // Insert new region
1156                         let new_mem_region = self.vm.make_user_memory_region(
1157                             mem_slot,
1158                             new_base + mmap_offset,
1159                             mmap_size as u64,
1160                             host_addr as u64,
1161                             false,
1162                             false,
1163                         );
1164 
1165                         self.vm
1166                             .create_user_memory_region(new_mem_region)
1167                             .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
1168                     }
1169                 }
1170             }
1171         }
1172 
1173         Ok(())
1174     }
1175 
    // Expose `self` as `&mut dyn Any` so callers holding a trait object can
    // downcast back to the concrete device type.
    fn as_any(&mut self) -> &mut dyn Any {
        self
    }
1179 }
1180