xref: /cloud-hypervisor/pci/src/vfio.rs (revision f7f2f25a574b1b2dba22c094fc8226d404157d15)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 
6 use crate::{
7     msi_num_enabled_vectors, BarReprogrammingParams, MsiConfig, MsixCap, MsixConfig,
8     PciBarConfiguration, PciBarRegionType, PciCapabilityId, PciClassCode, PciConfiguration,
9     PciDevice, PciDeviceError, PciHeaderType, PciSubclass, MSIX_TABLE_ENTRY_SIZE,
10 };
11 use byteorder::{ByteOrder, LittleEndian};
12 use hypervisor::HypervisorVmError;
13 use std::any::Any;
14 use std::io;
15 use std::os::unix::io::AsRawFd;
16 use std::ptr::null_mut;
17 use std::sync::{Arc, Barrier};
18 use thiserror::Error;
19 use vfio_bindings::bindings::vfio::*;
20 use vfio_ioctls::{VfioContainer, VfioDevice, VfioIrq};
21 use vm_allocator::SystemAllocator;
22 use vm_device::interrupt::{
23     InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
24 };
25 use vm_device::BusDevice;
26 use vm_memory::{Address, GuestAddress, GuestUsize};
27 use vmm_sys_util::eventfd::EventFd;
28 
29 #[derive(Debug, Error)]
30 pub enum VfioPciError {
31     #[error("Failed to DMA map: {0}")]
32     DmaMap(#[source] vfio_ioctls::VfioError),
33     #[error("Failed to DMA unmap: {0}")]
34     DmaUnmap(#[source] vfio_ioctls::VfioError),
35     #[error("Failed to enable INTx: {0}")]
36     EnableIntx(#[source] VfioError),
37     #[error("Failed to enable MSI: {0}")]
38     EnableMsi(#[source] VfioError),
39     #[error("Failed to enable MSI-x: {0}")]
40     EnableMsix(#[source] VfioError),
41     #[error("Failed to map VFIO PCI region into guest: {0}")]
42     MapRegionGuest(#[source] HypervisorVmError),
43     #[error("Failed to notifier's eventfd")]
44     MissingNotifier,
45 }
46 
/// PCI subclass value advertised for VFIO passthrough devices
/// (0xff, i.e. a vendor-specific subclass).
#[derive(Copy, Clone)]
enum PciVfioSubclass {
    VfioSubclass = 0xff,
}
51 
52 impl PciSubclass for PciVfioSubclass {
53     fn get_register_value(&self) -> u8 {
54         *self as u8
55     }
56 }
57 
/// Action to take after the guest touched an MSI or MSI-X capability:
/// when the enable bit transitions, the corresponding VFIO interrupt
/// delivery must be turned on or off.
enum InterruptUpdateAction {
    EnableMsi,
    DisableMsi,
    EnableMsix,
    DisableMsix,
}
64 
/// Legacy INTx interrupt state for the device.
pub(crate) struct VfioIntx {
    // Source group used to deliver the legacy interrupt to the guest.
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    // Whether INTx is currently enabled on the VFIO device.
    enabled: bool,
}
69 
/// MSI capability state for the device.
pub(crate) struct VfioMsi {
    // Local cache of the MSI capability registers.
    pub(crate) cfg: MsiConfig,
    // Offset of the MSI capability within PCI configuration space.
    cap_offset: u32,
    // Source group backing the MSI vectors.
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}
75 
76 impl VfioMsi {
77     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
78         let old_enabled = self.cfg.enabled();
79 
80         self.cfg.update(offset, data);
81 
82         let new_enabled = self.cfg.enabled();
83 
84         if !old_enabled && new_enabled {
85             return Some(InterruptUpdateAction::EnableMsi);
86         }
87 
88         if old_enabled && !new_enabled {
89             return Some(InterruptUpdateAction::DisableMsi);
90         }
91 
92         None
93     }
94 }
95 
/// MSI-X capability state for the device.
pub(crate) struct VfioMsix {
    // Local cache of the MSI-X table (lives in one of the BARs).
    pub(crate) bar: MsixConfig,
    // Cached MSI-X capability registers (msg_ctl, table, pba).
    cap: MsixCap,
    // Offset of the MSI-X capability within PCI configuration space.
    cap_offset: u32,
    // Source group backing the MSI-X vectors.
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}
102 
103 impl VfioMsix {
104     fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
105         let old_enabled = self.bar.enabled();
106 
107         // Update "Message Control" word
108         if offset == 2 && data.len() == 2 {
109             self.bar.set_msg_ctl(LittleEndian::read_u16(data));
110         }
111 
112         let new_enabled = self.bar.enabled();
113 
114         if !old_enabled && new_enabled {
115             return Some(InterruptUpdateAction::EnableMsix);
116         }
117 
118         if old_enabled && !new_enabled {
119             return Some(InterruptUpdateAction::DisableMsix);
120         }
121 
122         None
123     }
124 
125     fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
126         let table_offset: u64 = u64::from(self.cap.table_offset());
127         let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
128         let table_bir: u32 = self.cap.table_bir();
129 
130         bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
131     }
132 }
133 
/// Aggregated interrupt state for a VFIO PCI device: legacy INTx plus
/// the optional MSI and MSI-X capabilities parsed from config space.
pub(crate) struct Interrupt {
    pub(crate) intx: Option<VfioIntx>,
    pub(crate) msi: Option<VfioMsi>,
    pub(crate) msix: Option<VfioMsix>,
}
139 
140 impl Interrupt {
141     fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
142         if let Some(ref mut msi) = &mut self.msi {
143             let action = msi.update(offset, data);
144             return action;
145         }
146 
147         None
148     }
149 
150     fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
151         if let Some(ref mut msix) = &mut self.msix {
152             let action = msix.update(offset, data);
153             return action;
154         }
155 
156         None
157     }
158 
159     fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
160         if let Some(msi) = &self.msi {
161             if offset >= u64::from(msi.cap_offset)
162                 && offset < u64::from(msi.cap_offset) + msi.cfg.size()
163             {
164                 return Some((
165                     PciCapabilityId::MessageSignalledInterrupts,
166                     u64::from(msi.cap_offset),
167                 ));
168             }
169         }
170 
171         if let Some(msix) = &self.msix {
172             if offset == u64::from(msix.cap_offset) {
173                 return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
174             }
175         }
176 
177         None
178     }
179 
180     fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
181         if let Some(msix) = &self.msix {
182             return msix.table_accessed(bar_index, offset);
183         }
184 
185         false
186     }
187 
188     fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
189         if let Some(ref mut msix) = &mut self.msix {
190             let offset = offset - u64::from(msix.cap.table_offset());
191             msix.bar.write_table(offset, data)
192         }
193     }
194 
195     fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
196         if let Some(msix) = &self.msix {
197             let offset = offset - u64::from(msix.cap.table_offset());
198             msix.bar.read_table(offset, data)
199         }
200     }
201 
202     pub(crate) fn intx_in_use(&self) -> bool {
203         if let Some(intx) = &self.intx {
204             return intx.enabled;
205         }
206 
207         false
208     }
209 }
210 
/// Guest address range backing one BAR of the VFIO device.
#[derive(Copy, Clone)]
pub struct MmioRegion {
    /// Guest address the BAR was allocated at.
    pub start: GuestAddress,
    /// Size of the BAR in bytes.
    pub length: GuestUsize,
    // I/O, 32-bit memory or 64-bit memory region.
    pub(crate) type_: PciBarRegionType,
    // VFIO region index (BAR number) this range corresponds to.
    pub(crate) index: u32,
    // NOTE(review): the three fields below are only initialized to None in
    // this chunk; presumably filled in when the region gets mmap'ed into
    // the guest — confirm against the rest of the file.
    pub(crate) mem_slot: Option<u32>,
    pub(crate) host_addr: Option<u64>,
    pub(crate) mmap_size: Option<usize>,
}
/// Error returned by the low-level VFIO operations exposed through the
/// `Vfio` trait.
#[derive(Debug, Error)]
pub enum VfioError {
    /// The kernel VFIO driver reported an error.
    #[error("Kernel VFIO error: {0}")]
    KernelVfio(#[source] vfio_ioctls::VfioError),
}
226 
227 pub(crate) trait Vfio {
228     fn read_config_byte(&self, offset: u32) -> u8 {
229         let mut data: [u8; 1] = [0];
230         self.read_config(offset, &mut data);
231         data[0]
232     }
233 
234     fn read_config_word(&self, offset: u32) -> u16 {
235         let mut data: [u8; 2] = [0, 0];
236         self.read_config(offset, &mut data);
237         u16::from_le_bytes(data)
238     }
239 
240     fn read_config_dword(&self, offset: u32) -> u32 {
241         let mut data: [u8; 4] = [0, 0, 0, 0];
242         self.read_config(offset, &mut data);
243         u32::from_le_bytes(data)
244     }
245 
246     fn write_config_dword(&self, offset: u32, buf: u32) {
247         let data: [u8; 4] = buf.to_le_bytes();
248         self.write_config(offset, &data)
249     }
250 
251     fn read_config(&self, offset: u32, data: &mut [u8]) {
252         self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
253     }
254 
255     fn write_config(&self, offset: u32, data: &[u8]) {
256         self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
257     }
258 
259     fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
260         self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
261     }
262 
263     fn disable_msi(&self) -> Result<(), VfioError> {
264         self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
265     }
266 
267     fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
268         self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
269     }
270 
271     fn disable_msix(&self) -> Result<(), VfioError> {
272         self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
273     }
274 
275     fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
276         unimplemented!()
277     }
278 
279     fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
280         unimplemented!()
281     }
282 
283     fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
284         unimplemented!()
285     }
286 
287     fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
288         unimplemented!()
289     }
290 
291     fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
292         unimplemented!()
293     }
294 
295     fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
296         unimplemented!()
297     }
298 }
299 
/// Adapter implementing the `Vfio` trait on top of a real `VfioDevice`.
struct VfioDeviceWrapper {
    device: Arc<VfioDevice>,
}
303 
impl VfioDeviceWrapper {
    /// Wrap an existing VFIO device handle.
    fn new(device: Arc<VfioDevice>) -> Self {
        Self { device }
    }
}
309 
310 impl Vfio for VfioDeviceWrapper {
311     fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
312         self.device.region_read(index, data, offset)
313     }
314 
315     fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
316         self.device.region_write(index, data, offset)
317     }
318 
319     fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
320         self.device.get_irq_info(irq_index).copied()
321     }
322 
323     fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
324         self.device
325             .enable_irq(irq_index, event_fds)
326             .map_err(VfioError::KernelVfio)
327     }
328 
329     fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
330         self.device
331             .disable_irq(irq_index)
332             .map_err(VfioError::KernelVfio)
333     }
334 
335     fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
336         self.device
337             .unmask_irq(irq_index)
338             .map_err(VfioError::KernelVfio)
339     }
340 }
341 
/// State shared by the VFIO PCI device implementations: the locally
/// emulated configuration space, the guest BAR mappings, and the
/// interrupt bookkeeping.
pub(crate) struct VfioCommon {
    // Local PCI configuration space cache (BAR accesses are trapped here).
    pub(crate) configuration: PciConfiguration,
    // Guest ranges allocated for the device BARs.
    pub(crate) mmio_regions: Vec<MmioRegion>,
    // INTx/MSI/MSI-X state.
    pub(crate) interrupt: Interrupt,
}
347 
348 impl VfioCommon {
    /// Size every BAR using the standard PCI probing protocol (write all
    /// ones, read back the size mask), allocate a guest range of that size
    /// from `allocator`, and record the result both in the local config
    /// space cache and in `self.mmio_regions`.
    ///
    /// Returns the list of allocated `(guest address, size, type)` ranges.
    /// 64-bit BARs consume two BAR slots; the expansion ROM BAR is probed
    /// through its dedicated register offset.
    pub(crate) fn allocate_bars(
        &mut self,
        allocator: &mut SystemAllocator,
        vfio_wrapper: &dyn Vfio,
    ) -> Result<Vec<(GuestAddress, GuestUsize, PciBarRegionType)>, PciDeviceError> {
        let mut ranges = Vec::new();
        let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX as u32;

        // Going through all regular regions to compute the BAR size.
        // We're not saving the BAR address to restore it, because we
        // are going to allocate a guest address for each BAR and write
        // that new address back.
        while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
            let region_size: u64;
            let bar_addr: GuestAddress;

            // The ROM BAR lives at its own config-space offset; regular BARs
            // are 4 bytes each starting at PCI_CONFIG_BAR_OFFSET.
            let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                (PCI_ROM_EXP_BAR_INDEX * 4) as u32
            } else {
                PCI_CONFIG_BAR_OFFSET + bar_id * 4
            };

            // First read flags
            let flags = vfio_wrapper.read_config_dword(bar_offset);

            // Is this an IO BAR?
            let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
            } else {
                false
            };

            // Is this a 64-bit BAR?
            let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
                matches!(
                    flags & PCI_CONFIG_MEMORY_BAR_64BIT,
                    PCI_CONFIG_MEMORY_BAR_64BIT
                )
            } else {
                false
            };

            // By default, the region type is 32 bits memory BAR.
            let mut region_type = PciBarRegionType::Memory32BitRegion;

            // To get size write all 1s
            vfio_wrapper.write_config_dword(bar_offset, 0xffff_ffff);

            // And read back BAR value. The device will write zeros for bits it doesn't care about
            let mut lower = vfio_wrapper.read_config_dword(bar_offset);

            if io_bar {
                #[cfg(target_arch = "x86_64")]
                {
                    // IO BAR
                    region_type = PciBarRegionType::IoRegion;

                    // Mask flag bits (lowest 2 for I/O bars)
                    lower &= !0b11;

                    // BAR is not enabled
                    if lower == 0 {
                        bar_id += 1;
                        continue;
                    }

                    // Invert bits and add 1 to calculate size
                    region_size = (!lower + 1) as u64;

                    // The address needs to be 4 bytes aligned.
                    bar_addr = allocator
                        .allocate_io_addresses(None, region_size, Some(0x4))
                        .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
                }
                #[cfg(target_arch = "aarch64")]
                unimplemented!()
            } else if is_64bit_bar {
                // 64 bits Memory BAR
                region_type = PciBarRegionType::Memory64BitRegion;

                // Query size of upper BAR of 64-bit BAR
                let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
                vfio_wrapper.write_config_dword(upper_offset, 0xffff_ffff);
                let upper = vfio_wrapper.read_config_dword(upper_offset);

                let mut combined_size = u64::from(upper) << 32 | u64::from(lower);

                // Mask out flag bits (lowest 4 for memory bars)
                combined_size &= !0b1111;

                // BAR is not enabled
                if combined_size == 0 {
                    bar_id += 1;
                    continue;
                }

                // Invert and add 1 to to find size
                region_size = (!combined_size + 1) as u64;

                // BAR allocation must be naturally aligned
                bar_addr = allocator
                    .allocate_mmio_addresses(None, region_size, Some(region_size))
                    .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
            } else {
                // Mask out flag bits (lowest 4 for memory bars)
                lower &= !0b1111;

                if lower == 0 {
                    bar_id += 1;
                    continue;
                }

                // Invert and add 1 to to find size
                region_size = (!lower + 1) as u64;

                // BAR allocation must be naturally aligned
                bar_addr = allocator
                    .allocate_mmio_hole_addresses(None, region_size, Some(region_size))
                    .ok_or(PciDeviceError::IoAllocationFailed(region_size))?;
            }

            let reg_idx = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                PCI_ROM_EXP_BAR_INDEX
            } else {
                bar_id as usize
            };

            // We can now build our BAR configuration block.
            let config = PciBarConfiguration::default()
                .set_register_index(reg_idx)
                .set_address(bar_addr.raw_value())
                .set_size(region_size)
                .set_region_type(region_type);

            if bar_id == VFIO_PCI_ROM_REGION_INDEX {
                self.configuration
                    .add_pci_rom_bar(&config, flags & 0x1)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            } else {
                self.configuration
                    .add_pci_bar(&config)
                    .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
            }

            ranges.push((bar_addr, region_size, region_type));
            self.mmio_regions.push(MmioRegion {
                start: bar_addr,
                length: region_size,
                type_: region_type,
                index: bar_id as u32,
                mem_slot: None,
                host_addr: None,
                mmap_size: None,
            });

            // A 64-bit BAR occupies two consecutive BAR registers.
            bar_id += 1;
            if is_64bit_bar {
                bar_id += 1;
            }
        }

        Ok(ranges)
    }
512 
513     pub(crate) fn free_bars(
514         &mut self,
515         allocator: &mut SystemAllocator,
516     ) -> Result<(), PciDeviceError> {
517         for region in self.mmio_regions.iter() {
518             match region.type_ {
519                 PciBarRegionType::IoRegion => {
520                     #[cfg(target_arch = "x86_64")]
521                     allocator.free_io_addresses(region.start, region.length);
522                     #[cfg(target_arch = "aarch64")]
523                     error!("I/O region is not supported");
524                 }
525                 PciBarRegionType::Memory32BitRegion => {
526                     allocator.free_mmio_hole_addresses(region.start, region.length);
527                 }
528                 PciBarRegionType::Memory64BitRegion => {
529                     allocator.free_mmio_addresses(region.start, region.length);
530                 }
531             }
532         }
533         Ok(())
534     }
535 
536     pub(crate) fn parse_msix_capabilities(
537         &mut self,
538         cap: u8,
539         interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
540         vfio_wrapper: &dyn Vfio,
541     ) {
542         let msg_ctl = vfio_wrapper.read_config_word((cap + 2).into());
543 
544         let table = vfio_wrapper.read_config_dword((cap + 4).into());
545 
546         let pba = vfio_wrapper.read_config_dword((cap + 8).into());
547 
548         let msix_cap = MsixCap {
549             msg_ctl,
550             table,
551             pba,
552         };
553 
554         let interrupt_source_group = interrupt_manager
555             .create_group(MsiIrqGroupConfig {
556                 base: 0,
557                 count: msix_cap.table_size() as InterruptIndex,
558             })
559             .unwrap();
560 
561         let msix_config = MsixConfig::new(msix_cap.table_size(), interrupt_source_group.clone(), 0);
562 
563         self.interrupt.msix = Some(VfioMsix {
564             bar: msix_config,
565             cap: msix_cap,
566             cap_offset: cap.into(),
567             interrupt_source_group,
568         });
569     }
570 
571     pub(crate) fn parse_msi_capabilities(
572         &mut self,
573         cap: u8,
574         interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
575         vfio_wrapper: &dyn Vfio,
576     ) {
577         let msg_ctl = vfio_wrapper.read_config_word((cap + 2).into());
578 
579         let interrupt_source_group = interrupt_manager
580             .create_group(MsiIrqGroupConfig {
581                 base: 0,
582                 count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
583             })
584             .unwrap();
585 
586         let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone());
587 
588         self.interrupt.msi = Some(VfioMsi {
589             cfg: msi_config,
590             cap_offset: cap.into(),
591             interrupt_source_group,
592         });
593     }
594 
595     pub(crate) fn parse_capabilities(
596         &mut self,
597         interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
598         vfio_wrapper: &dyn Vfio,
599     ) {
600         let mut cap_next = vfio_wrapper.read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);
601 
602         while cap_next != 0 {
603             let cap_id = vfio_wrapper.read_config_byte(cap_next.into());
604 
605             match PciCapabilityId::from(cap_id) {
606                 PciCapabilityId::MessageSignalledInterrupts => {
607                     if let Some(irq_info) = vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
608                         if irq_info.count > 0 {
609                             // Parse capability only if the VFIO device
610                             // supports MSI.
611                             self.parse_msi_capabilities(cap_next, interrupt_manager, vfio_wrapper);
612                         }
613                     }
614                 }
615                 PciCapabilityId::MsiX => {
616                     if let Some(irq_info) = vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX) {
617                         if irq_info.count > 0 {
618                             // Parse capability only if the VFIO device
619                             // supports MSI-X.
620                             self.parse_msix_capabilities(cap_next, interrupt_manager, vfio_wrapper);
621                         }
622                     }
623                 }
624                 _ => {}
625             };
626 
627             cap_next = vfio_wrapper.read_config_byte((cap_next + 1).into());
628         }
629     }
630 
631     pub(crate) fn enable_intx(&mut self, wrapper: &dyn Vfio) -> Result<(), VfioPciError> {
632         if let Some(intx) = &mut self.interrupt.intx {
633             if !intx.enabled {
634                 if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
635                     wrapper
636                         .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
637                         .map_err(VfioPciError::EnableIntx)?;
638 
639                     intx.enabled = true;
640                 } else {
641                     return Err(VfioPciError::MissingNotifier);
642                 }
643             }
644         }
645 
646         Ok(())
647     }
648 
649     pub(crate) fn disable_intx(&mut self, wrapper: &dyn Vfio) {
650         if let Some(intx) = &mut self.interrupt.intx {
651             if intx.enabled {
652                 if let Err(e) = wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
653                     error!("Could not disable INTx: {}", e);
654                 } else {
655                     intx.enabled = false;
656                 }
657             }
658         }
659     }
660 
661     pub(crate) fn enable_msi(&self, wrapper: &dyn Vfio) -> Result<(), VfioPciError> {
662         if let Some(msi) = &self.interrupt.msi {
663             let mut irq_fds: Vec<EventFd> = Vec::new();
664             for i in 0..msi.cfg.num_enabled_vectors() {
665                 if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
666                     irq_fds.push(eventfd);
667                 } else {
668                     return Err(VfioPciError::MissingNotifier);
669                 }
670             }
671 
672             wrapper
673                 .enable_msi(irq_fds.iter().collect())
674                 .map_err(VfioPciError::EnableMsi)?;
675         }
676 
677         Ok(())
678     }
679 
680     pub(crate) fn disable_msi(&self, wrapper: &dyn Vfio) {
681         if let Err(e) = wrapper.disable_msi() {
682             error!("Could not disable MSI: {}", e);
683         }
684     }
685 
686     pub(crate) fn enable_msix(&self, wrapper: &dyn Vfio) -> Result<(), VfioPciError> {
687         if let Some(msix) = &self.interrupt.msix {
688             let mut irq_fds: Vec<EventFd> = Vec::new();
689             for i in 0..msix.bar.table_entries.len() {
690                 if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
691                     irq_fds.push(eventfd);
692                 } else {
693                     return Err(VfioPciError::MissingNotifier);
694                 }
695             }
696 
697             wrapper
698                 .enable_msix(irq_fds.iter().collect())
699                 .map_err(VfioPciError::EnableMsix)?;
700         }
701 
702         Ok(())
703     }
704 
705     pub(crate) fn disable_msix(&self, wrapper: &dyn Vfio) {
706         if let Err(e) = wrapper.disable_msix() {
707             error!("Could not disable MSI-X: {}", e);
708         }
709     }
710 
711     pub(crate) fn initialize_legacy_interrupt(
712         &mut self,
713         legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
714         wrapper: &dyn Vfio,
715     ) -> Result<(), VfioPciError> {
716         if let Some(irq_info) = wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
717             if irq_info.count == 0 {
718                 // A count of 0 means the INTx IRQ is not supported, therefore
719                 // it shouldn't be initialized.
720                 return Ok(());
721             }
722         }
723 
724         if let Some(interrupt_source_group) = legacy_interrupt_group {
725             self.interrupt.intx = Some(VfioIntx {
726                 interrupt_source_group,
727                 enabled: false,
728             });
729 
730             self.enable_intx(wrapper)?;
731         }
732 
733         Ok(())
734     }
735 
736     pub(crate) fn update_msi_capabilities(
737         &mut self,
738         offset: u64,
739         data: &[u8],
740         wrapper: &dyn Vfio,
741     ) -> Result<(), VfioPciError> {
742         match self.interrupt.update_msi(offset, data) {
743             Some(InterruptUpdateAction::EnableMsi) => {
744                 // Disable INTx before we can enable MSI
745                 self.disable_intx(wrapper);
746                 self.enable_msi(wrapper)?;
747             }
748             Some(InterruptUpdateAction::DisableMsi) => {
749                 // Fallback onto INTx when disabling MSI
750                 self.disable_msi(wrapper);
751                 self.enable_intx(wrapper)?;
752             }
753             _ => {}
754         }
755 
756         Ok(())
757     }
758 
759     pub(crate) fn update_msix_capabilities(
760         &mut self,
761         offset: u64,
762         data: &[u8],
763         wrapper: &dyn Vfio,
764     ) -> Result<(), VfioPciError> {
765         match self.interrupt.update_msix(offset, data) {
766             Some(InterruptUpdateAction::EnableMsix) => {
767                 // Disable INTx before we can enable MSI-X
768                 self.disable_intx(wrapper);
769                 self.enable_msix(wrapper)?;
770             }
771             Some(InterruptUpdateAction::DisableMsix) => {
772                 // Fallback onto INTx when disabling MSI-X
773                 self.disable_msix(wrapper);
774                 self.enable_intx(wrapper)?;
775             }
776             _ => {}
777         }
778 
779         Ok(())
780     }
781 
782     pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
783         for region in self.mmio_regions.iter() {
784             if addr >= region.start.raw_value()
785                 && addr < region.start.unchecked_add(region.length).raw_value()
786             {
787                 return Some(*region);
788             }
789         }
790         None
791     }
792 
793     pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8], wrapper: &dyn Vfio) {
794         let addr = base + offset;
795         if let Some(region) = self.find_region(addr) {
796             let offset = addr - region.start.raw_value();
797 
798             if self.interrupt.msix_table_accessed(region.index, offset) {
799                 self.interrupt.msix_read_table(offset, data);
800             } else {
801                 wrapper.region_read(region.index, offset, data);
802             }
803         }
804 
805         // INTx EOI
806         // The guest reading from the BAR potentially means the interrupt has
807         // been received and can be acknowledged.
808         if self.interrupt.intx_in_use() {
809             if let Err(e) = wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
810                 error!("Failed unmasking INTx IRQ: {}", e);
811             }
812         }
813     }
814 
815     pub(crate) fn write_bar(
816         &mut self,
817         base: u64,
818         offset: u64,
819         data: &[u8],
820         wrapper: &dyn Vfio,
821     ) -> Option<Arc<Barrier>> {
822         let addr = base + offset;
823         if let Some(region) = self.find_region(addr) {
824             let offset = addr - region.start.raw_value();
825 
826             // If the MSI-X table is written to, we need to update our cache.
827             if self.interrupt.msix_table_accessed(region.index, offset) {
828                 self.interrupt.msix_write_table(offset, data);
829             } else {
830                 wrapper.region_write(region.index, offset, data);
831             }
832         }
833 
834         // INTx EOI
835         // The guest writing to the BAR potentially means the interrupt has
836         // been received and can be acknowledged.
837         if self.interrupt.intx_in_use() {
838             if let Err(e) = wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
839                 error!("Failed unmasking INTx IRQ: {}", e);
840             }
841         }
842 
843         None
844     }
845 
    /// Handle a guest write to a PCI configuration register.
    ///
    /// BAR (and expansion ROM BAR) writes are trapped into the local
    /// configuration cache; MSI/MSI-X capability writes update the local
    /// interrupt state (possibly toggling VFIO interrupt delivery) before
    /// the write is forwarded to the device — the ordering matters, see
    /// the comment before the final `write_config()` call.
    pub(crate) fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
        wrapper: &dyn Vfio,
    ) -> Option<Arc<Barrier>> {
        // When the guest wants to write to a BAR, we trap it into
        // our local configuration space. We're not reprogramming
        // VFIO device.
        if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
            || reg_idx == PCI_ROM_EXP_BAR_INDEX
        {
            // We keep our local cache updated with the BARs.
            // We'll read it back from there when the guest is asking
            // for BARs (see read_config_register()).
            self.configuration
                .write_config_register(reg_idx, offset, data);
            return None;
        }

        // Byte offset of the register within configuration space.
        let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;

        // If the MSI or MSI-X capabilities are accessed, we need to
        // update our local cache accordingly.
        // Depending on how the capabilities are modified, this could
        // trigger a VFIO MSI or MSI-X toggle.
        if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
            let cap_offset: u64 = reg - cap_base + offset;
            match cap_id {
                PciCapabilityId::MessageSignalledInterrupts => {
                    if let Err(e) = self.update_msi_capabilities(cap_offset, data, wrapper) {
                        error!("Could not update MSI capabilities: {}", e);
                    }
                }
                PciCapabilityId::MsiX => {
                    if let Err(e) = self.update_msix_capabilities(cap_offset, data, wrapper) {
                        error!("Could not update MSI-X capabilities: {}", e);
                    }
                }
                _ => {}
            }
        }

        // Make sure to write to the device's PCI config space after MSI/MSI-X
        // interrupts have been enabled/disabled. In case of MSI, when the
        // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
        // the MSI Enable bit in the MSI capability structure found in the PCI
        // config space is disabled by default. That's why when the guest is
        // enabling this bit, we first need to enable the MSI interrupts with
        // VFIO through VFIO_DEVICE_SET_IRQS ioctl, and only after we can write
        // to the device region to update the MSI Enable bit.
        wrapper.write_config((reg + offset) as u32, data);

        None
    }
902 
903     pub(crate) fn read_config_register(&mut self, reg_idx: usize, wrapper: &dyn Vfio) -> u32 {
904         // When reading the BARs, we trap it and return what comes
905         // from our local configuration space. We want the guest to
906         // use that and not the VFIO device BARs as it does not map
907         // with the guest address space.
908         if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
909             || reg_idx == PCI_ROM_EXP_BAR_INDEX
910         {
911             return self.configuration.read_reg(reg_idx);
912         }
913 
914         // Since we don't support passing multi-functions devices, we should
915         // mask the multi-function bit, bit 7 of the Header Type byte on the
916         // register 3.
917         let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
918             0xff7f_ffff
919         } else {
920             0xffff_ffff
921         };
922 
923         // The config register read comes from the VFIO device itself.
924         wrapper.read_config_dword((reg_idx * 4) as u32) & mask
925     }
926 }
927 
/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
pub struct VfioPciDevice {
    // Hypervisor VM handle, used to create/remove user memory regions when
    // MMIO regions are (un)mapped into the guest.
    vm: Arc<dyn hypervisor::Vm>,
    // Underlying VFIO device exposing the host PCI device.
    device: Arc<VfioDevice>,
    // VFIO container the device belongs to; used for DMA (un)mapping.
    container: Arc<VfioContainer>,
    // Vfio trait implementation wrapping `device`, handed to the shared
    // VfioCommon code for config space and BAR accesses.
    vfio_wrapper: VfioDeviceWrapper,
    // Shared VFIO PCI state: MMIO regions, local PCI configuration space
    // cache and interrupt (INTx/MSI/MSI-X) handling.
    common: VfioCommon,
    // When true, dma_map()/dma_unmap() skip the container-level mapping
    // (the mappings are handled through the IOMMU path instead).
    iommu_attached: bool,
}
942 
943 impl VfioPciDevice {
944     /// Constructs a new Vfio Pci device for the given Vfio device
945     pub fn new(
946         vm: &Arc<dyn hypervisor::Vm>,
947         device: VfioDevice,
948         container: Arc<VfioContainer>,
949         msi_interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
950         legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
951         iommu_attached: bool,
952     ) -> Result<Self, VfioPciError> {
953         let device = Arc::new(device);
954         device.reset();
955 
956         let configuration = PciConfiguration::new(
957             0,
958             0,
959             0,
960             PciClassCode::Other,
961             &PciVfioSubclass::VfioSubclass,
962             None,
963             PciHeaderType::Device,
964             0,
965             0,
966             None,
967         );
968 
969         let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));
970 
971         let mut common = VfioCommon {
972             mmio_regions: Vec::new(),
973             configuration,
974             interrupt: Interrupt {
975                 intx: None,
976                 msi: None,
977                 msix: None,
978             },
979         };
980 
981         common.parse_capabilities(msi_interrupt_manager, &vfio_wrapper);
982         common.initialize_legacy_interrupt(legacy_interrupt_group, &vfio_wrapper)?;
983 
984         let vfio_pci_device = VfioPciDevice {
985             vm: vm.clone(),
986             device,
987             container,
988             vfio_wrapper,
989             common,
990             iommu_attached,
991         };
992 
993         Ok(vfio_pci_device)
994     }
995 
    /// Returns true when the device is attached to an IOMMU, in which case
    /// DMA mappings are not performed through the VFIO container (see
    /// dma_map()/dma_unmap()).
    pub fn iommu_attached(&self) -> bool {
        self.iommu_attached
    }
999 
1000     /// Map MMIO regions into the guest, and avoid VM exits when the guest tries
1001     /// to reach those regions.
1002     ///
1003     /// # Arguments
1004     ///
1005     /// * `vm` - The VM object. It is used to set the VFIO MMIO regions
1006     ///          as user memory regions.
1007     /// * `mem_slot` - The closure to return a memory slot.
1008     pub fn map_mmio_regions<F>(
1009         &mut self,
1010         vm: &Arc<dyn hypervisor::Vm>,
1011         mem_slot: F,
1012     ) -> Result<(), VfioPciError>
1013     where
1014         F: Fn() -> u32,
1015     {
1016         let fd = self.device.as_raw_fd();
1017 
1018         for region in self.common.mmio_regions.iter_mut() {
1019             // We want to skip the mapping of the BAR containing the MSI-X
1020             // table even if it is mappable. The reason is we need to trap
1021             // any access to the MSI-X table and update the GSI routing
1022             // accordingly.
1023             if let Some(msix) = &self.common.interrupt.msix {
1024                 if region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir() {
1025                     continue;
1026                 }
1027             }
1028 
1029             let region_flags = self.device.get_region_flags(region.index);
1030             if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
1031                 let mut prot = 0;
1032                 if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
1033                     prot |= libc::PROT_READ;
1034                 }
1035                 if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
1036                     prot |= libc::PROT_WRITE;
1037                 }
1038                 let (mmap_offset, mmap_size) = self.device.get_region_mmap(region.index);
1039                 let offset = self.device.get_region_offset(region.index) + mmap_offset;
1040 
1041                 let host_addr = unsafe {
1042                     libc::mmap(
1043                         null_mut(),
1044                         mmap_size as usize,
1045                         prot,
1046                         libc::MAP_SHARED,
1047                         fd,
1048                         offset as libc::off_t,
1049                     )
1050                 };
1051 
1052                 if host_addr == libc::MAP_FAILED {
1053                     error!(
1054                         "Could not mmap regions, error:{}",
1055                         io::Error::last_os_error()
1056                     );
1057                     continue;
1058                 }
1059 
1060                 let slot = mem_slot();
1061                 let mem_region = vm.make_user_memory_region(
1062                     slot,
1063                     region.start.raw_value() + mmap_offset,
1064                     mmap_size as u64,
1065                     host_addr as u64,
1066                     false,
1067                     false,
1068                 );
1069 
1070                 vm.create_user_memory_region(mem_region)
1071                     .map_err(VfioPciError::MapRegionGuest)?;
1072 
1073                 // Update the region with memory mapped info.
1074                 region.mem_slot = Some(slot);
1075                 region.host_addr = Some(host_addr as u64);
1076                 region.mmap_size = Some(mmap_size as usize);
1077             }
1078         }
1079 
1080         Ok(())
1081     }
1082 
1083     pub fn unmap_mmio_regions(&mut self) {
1084         for region in self.common.mmio_regions.iter() {
1085             if let (Some(host_addr), Some(mmap_size), Some(mem_slot)) =
1086                 (region.host_addr, region.mmap_size, region.mem_slot)
1087             {
1088                 let (mmap_offset, _) = self.device.get_region_mmap(region.index);
1089 
1090                 // Remove region
1091                 let r = self.vm.make_user_memory_region(
1092                     mem_slot,
1093                     region.start.raw_value() + mmap_offset,
1094                     mmap_size as u64,
1095                     host_addr as u64,
1096                     false,
1097                     false,
1098                 );
1099 
1100                 if let Err(e) = self.vm.remove_user_memory_region(r) {
1101                     error!("Could not remove the userspace memory region: {}", e);
1102                 }
1103 
1104                 let ret = unsafe { libc::munmap(host_addr as *mut libc::c_void, mmap_size) };
1105                 if ret != 0 {
1106                     error!(
1107                         "Could not unmap region {}, error:{}",
1108                         region.index,
1109                         io::Error::last_os_error()
1110                     );
1111                 }
1112             }
1113         }
1114     }
1115 
1116     pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
1117         if !self.iommu_attached {
1118             self.container
1119                 .vfio_dma_map(iova, size, user_addr)
1120                 .map_err(VfioPciError::DmaMap)?;
1121         }
1122 
1123         Ok(())
1124     }
1125 
1126     pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
1127         if !self.iommu_attached {
1128             self.container
1129                 .vfio_dma_unmap(iova, size)
1130                 .map_err(VfioPciError::DmaUnmap)?;
1131         }
1132 
1133         Ok(())
1134     }
1135 
    /// Returns a copy of the device's MMIO regions, including any mapping
    /// state (slot, host address, size) filled in by map_mmio_regions().
    pub fn mmio_regions(&self) -> Vec<MmioRegion> {
        self.common.mmio_regions.clone()
    }
1139 }
1140 
impl Drop for VfioPciDevice {
    fn drop(&mut self) {
        // Remove the guest memory mappings first, then disable whichever
        // interrupt mechanisms are still active so the VFIO device is left
        // quiescent when released.
        self.unmap_mmio_regions();

        if let Some(msix) = &self.common.interrupt.msix {
            if msix.bar.enabled() {
                self.common.disable_msix(&self.vfio_wrapper);
            }
        }

        if let Some(msi) = &self.common.interrupt.msi {
            if msi.cfg.enabled() {
                self.common.disable_msi(&self.vfio_wrapper)
            }
        }

        if self.common.interrupt.intx_in_use() {
            self.common.disable_intx(&self.vfio_wrapper);
        }
    }
}
1162 
impl BusDevice for VfioPciDevice {
    // BAR accesses coming from the bus are forwarded to the PciDevice
    // read_bar()/write_bar() implementations below.
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}
1172 
// First BAR offset in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// IO BAR when first BAR bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory bar flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device
const BAR_NUMS: usize = 6;
// PCI Header Type register index (register 3 holds the Header Type byte,
// whose multi-function bit gets masked in read_config_register()).
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index (BAR0 lives at config space register 4).
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index
const PCI_ROM_EXP_BAR_INDEX: usize = 12;
1191 
impl PciDevice for VfioPciDevice {
    // BAR allocation and config space accesses are delegated to the shared
    // VfioCommon implementation, passing the device-specific Vfio wrapper.
    fn allocate_bars(
        &mut self,
        allocator: &mut SystemAllocator,
    ) -> Result<Vec<(GuestAddress, GuestUsize, PciBarRegionType)>, PciDeviceError> {
        self.common.allocate_bars(allocator, &self.vfio_wrapper)
    }

    fn free_bars(&mut self, allocator: &mut SystemAllocator) -> Result<(), PciDeviceError> {
        self.common.free_bars(allocator)
    }

    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        self.common
            .write_config_register(reg_idx, offset, data, &self.vfio_wrapper)
    }

    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        self.common
            .read_config_register(reg_idx, &self.vfio_wrapper)
    }

    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.common
            .configuration
            .detect_bar_reprogramming(reg_idx, data)
    }

    fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.common.read_bar(base, offset, data, &self.vfio_wrapper)
    }

    fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.common
            .write_bar(base, offset, data, &self.vfio_wrapper)
    }

    // Relocates the MMIO region whose current address is `old_base`. If the
    // region was mapped into the guest, the corresponding user memory region
    // is removed at the old address and recreated at the new one (same host
    // address, size and slot).
    fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
        for region in self.common.mmio_regions.iter_mut() {
            if region.start.raw_value() == old_base {
                region.start = GuestAddress(new_base);

                // Only mapped regions (mem_slot and host_addr set by
                // map_mmio_regions()) need the hypervisor mapping moved.
                if let Some(mem_slot) = region.mem_slot {
                    if let Some(host_addr) = region.host_addr {
                        let (mmap_offset, mmap_size) = self.device.get_region_mmap(region.index);

                        // Remove old region
                        let old_mem_region = self.vm.make_user_memory_region(
                            mem_slot,
                            old_base + mmap_offset,
                            mmap_size as u64,
                            host_addr as u64,
                            false,
                            false,
                        );

                        self.vm
                            .remove_user_memory_region(old_mem_region)
                            .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

                        // Insert new region
                        let new_mem_region = self.vm.make_user_memory_region(
                            mem_slot,
                            new_base + mmap_offset,
                            mmap_size as u64,
                            host_addr as u64,
                            false,
                            false,
                        );

                        self.vm
                            .create_user_memory_region(new_mem_region)
                            .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
                    }
                }
            }
        }

        Ok(())
    }

    fn as_any(&mut self) -> &mut dyn Any {
        self
    }
}
1286