xref: /cloud-hypervisor/virtio-devices/src/transport/pci_device.rs (revision eea9bcea38e0c5649f444c829f3a4f9c22aa486c)
1 // Copyright 2018 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE-BSD-3-Clause file.
4 //
5 // Copyright © 2019 Intel Corporation
6 //
7 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
8 
9 use super::VirtioPciCommonConfig;
10 use crate::transport::VirtioTransport;
11 use crate::GuestMemoryMmap;
12 use crate::{
13     ActivateResult, VirtioDevice, VirtioDeviceType, VirtioInterrupt, VirtioInterruptType,
14     DEVICE_ACKNOWLEDGE, DEVICE_DRIVER, DEVICE_DRIVER_OK, DEVICE_FAILED, DEVICE_FEATURES_OK,
15     DEVICE_INIT,
16 };
17 use anyhow::anyhow;
18 use libc::EFD_NONBLOCK;
19 use pci::{
20     BarReprogrammingParams, MsixCap, MsixConfig, PciBarConfiguration, PciBarRegionType,
21     PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, PciDevice, PciDeviceError,
22     PciHeaderType, PciMassStorageSubclass, PciNetworkControllerSubclass, PciSubclass,
23 };
24 use std::any::Any;
25 use std::cmp;
26 use std::io::Write;
27 use std::ops::Deref;
28 use std::result;
29 use std::sync::atomic::{AtomicBool, AtomicU16, AtomicUsize, Ordering};
30 use std::sync::{Arc, Barrier, Mutex};
31 use versionize::{VersionMap, Versionize, VersionizeResult};
32 use versionize_derive::Versionize;
33 use virtio_queue::{Error as QueueError, Queue, QueueT};
34 use vm_allocator::{AddressAllocator, SystemAllocator};
35 use vm_device::dma_mapping::ExternalDmaMapping;
36 use vm_device::interrupt::{
37     InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
38 };
39 use vm_device::{BusDevice, Resource};
40 use vm_memory::{Address, ByteValued, GuestAddress, GuestAddressSpace, GuestMemoryAtomic, Le32};
41 use vm_migration::{
42     Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
43 };
44 use vm_virtio::AccessPlatform;
45 use vmm_sys_util::{errno::Result, eventfd::EventFd};
46 
/// Vector value used to disable MSI for a queue.
/// This mirrors the virtio spec's VIRTIO_MSI_NO_VECTOR (0xffff) sentinel.
const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff;
49 
/// Internal error type used while restoring virtqueue state in `set_state()`.
#[derive(Debug)]
enum Error {
    /// Failed to retrieve queue ring's index.
    QueueRingIndex(QueueError),
}
55 
/// cfg_type values identifying each virtio structure exposed through a
/// vendor-specific PCI capability, as defined by the virtio specification.
#[allow(clippy::enum_variant_names)]
enum PciCapabilityType {
    CommonConfig = 1,
    NotifyConfig = 2,
    IsrConfig = 3,
    DeviceConfig = 4,
    PciConfig = 5,
    // Values 6 and 7 are not used by this transport; 8 is the shared
    // memory region capability type.
    SharedMemoryConfig = 8,
}
65 
// This offset represents the 2 bytes omitted from the VirtioPciCap structure
// as they are already handled through add_capability(). These 2 bytes are the
// fields cap_vndr (1 byte) and cap_next (1 byte) defined in the virtio spec.
const VIRTIO_PCI_CAP_OFFSET: usize = 2;

/// Generic virtio PCI capability layout (the spec's `struct virtio_pci_cap`),
/// minus the cap_vndr/cap_next bytes that add_capability() emits itself.
/// The packed repr is required: these bytes are exposed verbatim to the guest.
#[allow(dead_code)]
#[repr(packed)]
#[derive(Clone, Copy, Default)]
struct VirtioPciCap {
    cap_len: u8,      // Generic PCI field: capability length
    cfg_type: u8,     // Identifies the structure.
    pci_bar: u8,      // Where to find it.
    id: u8,           // Multiple capabilities of the same type
    padding: [u8; 2], // Pad to full dword.
    offset: Le32,     // Offset within bar.
    length: Le32,     // Length of the structure, in bytes.
}
// SAFETY: All members are simple numbers and any value is valid.
unsafe impl ByteValued for VirtioPciCap {}
85 
86 impl PciCapability for VirtioPciCap {
87     fn bytes(&self) -> &[u8] {
88         self.as_slice()
89     }
90 
91     fn id(&self) -> PciCapabilityId {
92         PciCapabilityId::VendorSpecific
93     }
94 }
95 
96 const VIRTIO_PCI_CAP_LEN_OFFSET: u8 = 2;
97 
98 impl VirtioPciCap {
99     pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, offset: u32, length: u32) -> Self {
100         VirtioPciCap {
101             cap_len: (std::mem::size_of::<VirtioPciCap>() as u8) + VIRTIO_PCI_CAP_LEN_OFFSET,
102             cfg_type: cfg_type as u8,
103             pci_bar,
104             id: 0,
105             padding: [0; 2],
106             offset: Le32::from(offset),
107             length: Le32::from(length),
108         }
109     }
110 }
111 
/// The spec's `struct virtio_pci_notify_cap`: a generic capability followed
/// by the notify-off multiplier.
#[allow(dead_code)]
#[repr(packed)]
#[derive(Clone, Copy, Default)]
struct VirtioPciNotifyCap {
    cap: VirtioPciCap,
    notify_off_multiplier: Le32, // Multiplier applied to queue_notify_off.
}
// SAFETY: All members are simple numbers and any value is valid.
unsafe impl ByteValued for VirtioPciNotifyCap {}
121 
122 impl PciCapability for VirtioPciNotifyCap {
123     fn bytes(&self) -> &[u8] {
124         self.as_slice()
125     }
126 
127     fn id(&self) -> PciCapabilityId {
128         PciCapabilityId::VendorSpecific
129     }
130 }
131 
132 impl VirtioPciNotifyCap {
133     pub fn new(
134         cfg_type: PciCapabilityType,
135         pci_bar: u8,
136         offset: u32,
137         length: u32,
138         multiplier: Le32,
139     ) -> Self {
140         VirtioPciNotifyCap {
141             cap: VirtioPciCap {
142                 cap_len: (std::mem::size_of::<VirtioPciNotifyCap>() as u8)
143                     + VIRTIO_PCI_CAP_LEN_OFFSET,
144                 cfg_type: cfg_type as u8,
145                 pci_bar,
146                 id: 0,
147                 padding: [0; 2],
148                 offset: Le32::from(offset),
149                 length: Le32::from(length),
150             },
151             notify_off_multiplier: multiplier,
152         }
153     }
154 }
155 
/// The spec's `struct virtio_pci_cap64`: a generic capability extended with
/// the high dwords of a 64-bit offset and length.
#[allow(dead_code)]
#[repr(packed)]
#[derive(Clone, Copy, Default)]
struct VirtioPciCap64 {
    cap: VirtioPciCap, // Low 32 bits of offset/length live in here.
    offset_hi: Le32,   // High 32 bits of the offset.
    length_hi: Le32,   // High 32 bits of the length.
}
// SAFETY: All members are simple numbers and any value is valid.
unsafe impl ByteValued for VirtioPciCap64 {}
166 
167 impl PciCapability for VirtioPciCap64 {
168     fn bytes(&self) -> &[u8] {
169         self.as_slice()
170     }
171 
172     fn id(&self) -> PciCapabilityId {
173         PciCapabilityId::VendorSpecific
174     }
175 }
176 
177 impl VirtioPciCap64 {
178     pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, id: u8, offset: u64, length: u64) -> Self {
179         VirtioPciCap64 {
180             cap: VirtioPciCap {
181                 cap_len: (std::mem::size_of::<VirtioPciCap64>() as u8) + VIRTIO_PCI_CAP_LEN_OFFSET,
182                 cfg_type: cfg_type as u8,
183                 pci_bar,
184                 id,
185                 padding: [0; 2],
186                 offset: Le32::from(offset as u32),
187                 length: Le32::from(length as u32),
188             },
189             offset_hi: Le32::from((offset >> 32) as u32),
190             length_hi: Le32::from((length >> 32) as u32),
191         }
192     }
193 }
194 
/// The spec's VIRTIO_PCI_CAP_PCI_CFG capability: a generic capability plus a
/// 4-byte data window through which the guest can access other structures
/// without mapping the BARs.
#[allow(dead_code)]
#[repr(packed)]
#[derive(Clone, Copy, Default)]
struct VirtioPciCfgCap {
    cap: VirtioPciCap,
    pci_cfg_data: [u8; 4], // Data window for the BAR access.
}
// SAFETY: All members are simple numbers and any value is valid.
unsafe impl ByteValued for VirtioPciCfgCap {}
204 
205 impl PciCapability for VirtioPciCfgCap {
206     fn bytes(&self) -> &[u8] {
207         self.as_slice()
208     }
209 
210     fn id(&self) -> PciCapabilityId {
211         PciCapabilityId::VendorSpecific
212     }
213 }
214 
215 impl VirtioPciCfgCap {
216     fn new() -> Self {
217         VirtioPciCfgCap {
218             cap: VirtioPciCap::new(PciCapabilityType::PciConfig, 0, 0, 0),
219             ..Default::default()
220         }
221     }
222 }
223 
/// Bookkeeping for the VIRTIO_PCI_CAP_PCI_CFG capability: where it sits in
/// PCI config space and its current (guest-programmable) contents.
#[derive(Clone, Copy, Default)]
struct VirtioPciCfgCapInfo {
    offset: usize,        // Offset of the capability body in config space.
    cap: VirtioPciCfgCap, // Live copy of the capability contents.
}
229 
/// PCI subclass used for virtio devices that have no dedicated PCI class
/// mapping (anything that is not net or block here).
#[allow(dead_code)]
#[derive(Copy, Clone)]
pub enum PciVirtioSubclass {
    NonTransitionalBase = 0xff,
}

impl PciSubclass for PciVirtioSubclass {
    /// Value written into the PCI config space subclass register.
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}
241 
// Allocate one bar for the structs pointed to by the capability structures.
// As per the PCI specification, because the same BAR shares MSI-X and non
// MSI-X structures, it is recommended to use 8KiB alignment for all those
// structures.
//
// Layout of the capability BAR:
//   0x00000 common config | 0x02000 ISR          | 0x04000 device config
//   0x06000 notifications | 0x08000 MSI-X table  | 0x48000 MSI-X PBA
const COMMON_CONFIG_BAR_OFFSET: u64 = 0x0000;
const COMMON_CONFIG_SIZE: u64 = 56;
const ISR_CONFIG_BAR_OFFSET: u64 = 0x2000;
const ISR_CONFIG_SIZE: u64 = 1;
const DEVICE_CONFIG_BAR_OFFSET: u64 = 0x4000;
const DEVICE_CONFIG_SIZE: u64 = 0x1000;
const NOTIFICATION_BAR_OFFSET: u64 = 0x6000;
const NOTIFICATION_SIZE: u64 = 0x1000;
const MSIX_TABLE_BAR_OFFSET: u64 = 0x8000;
// The size is 256KiB because the table can hold up to 2048 entries, with each
// entry being 128 bits (4 DWORDS).
const MSIX_TABLE_SIZE: u64 = 0x40000;
const MSIX_PBA_BAR_OFFSET: u64 = 0x48000;
// The size is 2KiB because the Pending Bit Array has one bit per vector and it
// can support up to 2048 vectors.
const MSIX_PBA_SIZE: u64 = 0x800;
// The BAR size must be a power of 2.
const CAPABILITY_BAR_SIZE: u64 = 0x80000;
// BAR indices: BAR 0 holds all the structures above; BAR 2 is reserved for
// shared memory regions (e.g. virtio-fs DAX window).
const VIRTIO_COMMON_BAR_INDEX: usize = 0;
const VIRTIO_SHM_BAR_INDEX: usize = 2;

const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address.

// Red Hat / virtio PCI vendor ID and the base of the modern device ID space.
const VIRTIO_PCI_VENDOR_ID: u16 = 0x1af4;
const VIRTIO_PCI_DEVICE_ID_BASE: u16 = 0x1040; // Add to device type to get device ID.
271 
/// Snapshottable state of a single virtqueue (Versionize-serialized).
/// Field set and order are part of the snapshot format — do not reorder.
#[derive(Versionize)]
struct QueueState {
    max_size: u16,
    size: u16,
    ready: bool,
    desc_table: u64, // Guest physical address of the descriptor table.
    avail_ring: u64, // Guest physical address of the available ring.
    used_ring: u64,  // Guest physical address of the used ring.
}

/// Snapshottable state of the whole virtio-pci device.
#[derive(Versionize)]
struct VirtioPciDeviceState {
    device_activated: bool,
    queues: Vec<QueueState>,
    interrupt_status: usize,
}

impl VersionMapped for VirtioPciDeviceState {}
290 
/// One-shot helper carrying everything needed to activate a virtio device,
/// either inline or deferred onto another thread via `pending_activations`.
/// The `Option` fields are consumed (taken) by `activate()`.
pub struct VirtioPciDeviceActivator {
    interrupt: Option<Arc<dyn VirtioInterrupt>>,
    memory: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
    device: Arc<Mutex<dyn VirtioDevice>>,
    device_activated: Arc<AtomicBool>,
    queues: Option<Vec<(usize, Queue, EventFd)>>,
    barrier: Option<Arc<Barrier>>,
    id: String,
}
300 
impl VirtioPciDeviceActivator {
    /// Activate the wrapped device, then mark it activated and release the
    /// optional barrier so the thread that requested the activation can
    /// proceed.
    ///
    /// This is one-shot: the `Option` fields are `take()`n, so calling it a
    /// second time on the same activator would panic on the `unwrap()`s.
    pub fn activate(&mut self) -> ActivateResult {
        self.device.lock().unwrap().activate(
            self.memory.take().unwrap(),
            self.interrupt.take().unwrap(),
            self.queues.take().unwrap(),
        )?;
        // Only flag the device as activated once activate() has succeeded.
        self.device_activated.store(true, Ordering::SeqCst);

        if let Some(barrier) = self.barrier.take() {
            info!("{}: Waiting for barrier", self.id);
            barrier.wait();
            info!("{}: Barrier released", self.id);
        }

        Ok(())
    }
}
319 
/// PCI transport wrapping a virtio device: owns the PCI config space, the
/// virtio common config, MSI-X state, the virtqueues and their eventfds.
pub struct VirtioPciDevice {
    // Device identifier, used in logs and as the snapshot id.
    id: String,

    // PCI configuration registers.
    configuration: PciConfiguration,

    // virtio PCI common configuration
    common_config: VirtioPciCommonConfig,

    // MSI-X config
    msix_config: Option<Arc<Mutex<MsixConfig>>>,

    // Number of MSI-X vectors
    msix_num: u16,

    // Virtio device reference and status
    device: Arc<Mutex<dyn VirtioDevice>>,
    device_activated: Arc<AtomicBool>,

    // PCI interrupts.
    interrupt_status: Arc<AtomicUsize>,
    virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,

    // virtio queues
    queues: Vec<Queue>,
    queue_evts: Vec<EventFd>,

    // Guest memory
    memory: GuestMemoryAtomic<GuestMemoryMmap>,

    // Settings PCI BAR (index of the BAR holding the virtio structures)
    settings_bar: u8,

    // Whether to use 64-bit bar location or 32-bit
    use_64bit_bar: bool,

    // Add a dedicated structure to hold information about the very specific
    // virtio-pci capability VIRTIO_PCI_CAP_PCI_CFG. This is needed to support
    // the legacy/backward compatible mechanism of letting the guest access the
    // other virtio capabilities without mapping the PCI BARs. This can be
    // needed when the guest tries to early access the virtio configuration of
    // a device.
    cap_pci_cfg_info: VirtioPciCfgCapInfo,

    // Details of bar regions to free
    bar_regions: Vec<PciBarConfiguration>,

    // EventFd to signal on to request activation
    activate_evt: EventFd,

    // Optional DMA handler
    dma_handler: Option<Arc<dyn ExternalDmaMapping>>,

    // Pending activations (shared with the VMM thread that performs them)
    pending_activations: Arc<Mutex<Vec<VirtioPciDeviceActivator>>>,
}
377 
impl VirtioPciDevice {
    /// Constructs a new PCI transport for the given virtio device.
    ///
    /// Creates one notification eventfd per queue, the interrupt source
    /// group, the MSI-X config (when `msix_num > 0`), the PCI configuration
    /// space, and — when MSI-X is present — the `VirtioInterruptMsix` used to
    /// signal the guest.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        id: String,
        memory: GuestMemoryAtomic<GuestMemoryMmap>,
        device: Arc<Mutex<dyn VirtioDevice>>,
        msix_num: u16,
        access_platform: Option<Arc<dyn AccessPlatform>>,
        interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        pci_device_bdf: u32,
        activate_evt: EventFd,
        use_64bit_bar: bool,
        dma_handler: Option<Arc<dyn ExternalDmaMapping>>,
        pending_activations: Arc<Mutex<Vec<VirtioPciDeviceActivator>>>,
    ) -> Result<Self> {
        // Clone the Arc so `device` can still be moved into the struct below
        // while the lock guard borrows the clone.
        let device_clone = device.clone();
        let mut locked_device = device_clone.lock().unwrap();
        // One non-blocking eventfd per virtqueue for guest notifications.
        let mut queue_evts = Vec::new();
        for _ in locked_device.queue_max_sizes().iter() {
            queue_evts.push(EventFd::new(EFD_NONBLOCK)?)
        }
        let num_queues = locked_device.queue_max_sizes().len();

        if let Some(access_platform) = &access_platform {
            locked_device.set_access_platform(access_platform.clone());
        }

        let queues = locked_device
            .queue_max_sizes()
            .iter()
            .map(|&s| Queue::new(s).unwrap())
            .collect();

        // Modern virtio device ID = 0x1040 + device type.
        let pci_device_id = VIRTIO_PCI_DEVICE_ID_BASE + locked_device.device_type() as u16;

        let interrupt_source_group = interrupt_manager.create_group(MsiIrqGroupConfig {
            base: 0,
            count: msix_num as InterruptIndex,
        })?;

        let (msix_config, msix_config_clone) = if msix_num > 0 {
            let msix_config = Arc::new(Mutex::new(MsixConfig::new(
                msix_num,
                interrupt_source_group.clone(),
                pci_device_bdf,
            )));
            let msix_config_clone = msix_config.clone();
            (Some(msix_config), Some(msix_config_clone))
        } else {
            (None, None)
        };

        // Map the virtio device type onto a PCI class/subclass pair; anything
        // that is not net or block falls back to Other/NonTransitionalBase.
        let (class, subclass) = match VirtioDeviceType::from(locked_device.device_type()) {
            VirtioDeviceType::Net => (
                PciClassCode::NetworkController,
                &PciNetworkControllerSubclass::EthernetController as &dyn PciSubclass,
            ),
            VirtioDeviceType::Block => (
                PciClassCode::MassStorage,
                &PciMassStorageSubclass::MassStorage as &dyn PciSubclass,
            ),
            _ => (
                PciClassCode::Other,
                &PciVirtioSubclass::NonTransitionalBase as &dyn PciSubclass,
            ),
        };

        let configuration = PciConfiguration::new(
            VIRTIO_PCI_VENDOR_ID,
            pci_device_id,
            0x1, // For modern virtio-PCI devices
            class,
            subclass,
            None,
            PciHeaderType::Device,
            VIRTIO_PCI_VENDOR_ID,
            pci_device_id,
            msix_config_clone,
        );

        let mut virtio_pci_device = VirtioPciDevice {
            id,
            configuration,
            common_config: VirtioPciCommonConfig {
                access_platform,
                driver_status: 0,
                config_generation: 0,
                device_feature_select: 0,
                driver_feature_select: 0,
                queue_select: 0,
                msix_config: Arc::new(AtomicU16::new(VIRTQ_MSI_NO_VECTOR)),
                msix_queues: Arc::new(Mutex::new(vec![VIRTQ_MSI_NO_VECTOR; num_queues])),
            },
            msix_config,
            msix_num,
            device,
            device_activated: Arc::new(AtomicBool::new(false)),
            interrupt_status: Arc::new(AtomicUsize::new(0)),
            virtio_interrupt: None,
            queues,
            queue_evts,
            memory,
            settings_bar: 0,
            use_64bit_bar,
            interrupt_source_group,
            cap_pci_cfg_info: VirtioPciCfgCapInfo::default(),
            bar_regions: vec![],
            activate_evt,
            dma_handler,
            pending_activations,
        };

        if let Some(msix_config) = &virtio_pci_device.msix_config {
            virtio_pci_device.virtio_interrupt = Some(Arc::new(VirtioInterruptMsix::new(
                msix_config.clone(),
                virtio_pci_device.common_config.msix_config.clone(),
                virtio_pci_device.common_config.msix_queues.clone(),
                virtio_pci_device.interrupt_source_group.clone(),
            )));
        }

        Ok(virtio_pci_device)
    }

    /// Capture a snapshot of the device/queue state for migration.
    fn state(&self) -> VirtioPciDeviceState {
        VirtioPciDeviceState {
            device_activated: self.device_activated.load(Ordering::Acquire),
            interrupt_status: self.interrupt_status.load(Ordering::Acquire),
            queues: self
                .queues
                .iter()
                .map(|q| QueueState {
                    max_size: q.max_size(),
                    size: q.size(),
                    ready: q.ready(),
                    desc_table: q.desc_table(),
                    avail_ring: q.avail_ring(),
                    used_ring: q.used_ring(),
                })
                .collect(),
        }
    }

    /// Restore device/queue state from a snapshot.
    ///
    /// NOTE(review): both next_avail and next_used are seeded from the used
    /// ring index read out of guest memory, which assumes no descriptors were
    /// in flight when the snapshot was taken — confirm against the snapshot
    /// protocol.
    fn set_state(&mut self, state: &VirtioPciDeviceState) -> std::result::Result<(), Error> {
        self.device_activated
            .store(state.device_activated, Ordering::Release);
        self.interrupt_status
            .store(state.interrupt_status, Ordering::Release);

        // Update virtqueues indexes for both available and used rings.
        for (i, queue) in self.queues.iter_mut().enumerate() {
            queue.set_size(state.queues[i].size);
            queue.set_ready(state.queues[i].ready);
            queue
                .try_set_desc_table_address(GuestAddress(state.queues[i].desc_table))
                .unwrap();
            queue
                .try_set_avail_ring_address(GuestAddress(state.queues[i].avail_ring))
                .unwrap();
            queue
                .try_set_used_ring_address(GuestAddress(state.queues[i].used_ring))
                .unwrap();
            queue.set_next_avail(
                queue
                    .used_idx(self.memory.memory().deref(), Ordering::Acquire)
                    .map_err(Error::QueueRingIndex)?
                    .0,
            );
            queue.set_next_used(
                queue
                    .used_idx(self.memory.memory().deref(), Ordering::Acquire)
                    .map_err(Error::QueueRingIndex)?
                    .0,
            );
        }

        Ok(())
    }

    /// Gets the list of queue events that must be triggered whenever the VM writes to
    /// `virtio::NOTIFY_REG_OFFSET` past the MMIO base. Each event must be triggered when the
    /// value being written equals the index of the event in this list.
    fn queue_evts(&self) -> &[EventFd] {
        self.queue_evts.as_slice()
    }

    /// True when the driver status equals exactly
    /// ACKNOWLEDGE | DRIVER | DRIVER_OK | FEATURES_OK.
    ///
    /// NOTE(review): the `& DEVICE_FAILED == 0` check is redundant given the
    /// exact-equality test on the line above — kept for clarity/safety.
    fn is_driver_ready(&self) -> bool {
        let ready_bits =
            (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | DEVICE_FEATURES_OK) as u8;
        self.common_config.driver_status == ready_bits
            && self.common_config.driver_status & DEVICE_FAILED as u8 == 0
    }

    /// Determines if the driver has requested the device (re)init / reset itself
    fn is_driver_init(&self) -> bool {
        self.common_config.driver_status == DEVICE_INIT as u8
    }

    /// Guest address of the BAR holding the virtio configuration structures.
    pub fn config_bar_addr(&self) -> u64 {
        self.configuration.get_bar_addr(self.settings_bar as usize)
    }

    /// Register every virtio (and MSI-X) capability in PCI config space,
    /// all pointing into BAR `settings_bar`.
    fn add_pci_capabilities(
        &mut self,
        settings_bar: u8,
    ) -> std::result::Result<(), PciDeviceError> {
        // Add pointers to the different configuration structures from the PCI capabilities.
        let common_cap = VirtioPciCap::new(
            PciCapabilityType::CommonConfig,
            settings_bar,
            COMMON_CONFIG_BAR_OFFSET as u32,
            COMMON_CONFIG_SIZE as u32,
        );
        self.configuration
            .add_capability(&common_cap)
            .map_err(PciDeviceError::CapabilitiesSetup)?;

        let isr_cap = VirtioPciCap::new(
            PciCapabilityType::IsrConfig,
            settings_bar,
            ISR_CONFIG_BAR_OFFSET as u32,
            ISR_CONFIG_SIZE as u32,
        );
        self.configuration
            .add_capability(&isr_cap)
            .map_err(PciDeviceError::CapabilitiesSetup)?;

        // TODO(dgreid) - set based on device's configuration size?
        let device_cap = VirtioPciCap::new(
            PciCapabilityType::DeviceConfig,
            settings_bar,
            DEVICE_CONFIG_BAR_OFFSET as u32,
            DEVICE_CONFIG_SIZE as u32,
        );
        self.configuration
            .add_capability(&device_cap)
            .map_err(PciDeviceError::CapabilitiesSetup)?;

        let notify_cap = VirtioPciNotifyCap::new(
            PciCapabilityType::NotifyConfig,
            settings_bar,
            NOTIFICATION_BAR_OFFSET as u32,
            NOTIFICATION_SIZE as u32,
            Le32::from(NOTIFY_OFF_MULTIPLIER),
        );
        self.configuration
            .add_capability(&notify_cap)
            .map_err(PciDeviceError::CapabilitiesSetup)?;

        // Remember where the PCI_CFG capability body lands in config space so
        // guest accesses to it can be intercepted later; the offset skips the
        // cap_vndr/cap_next bytes (VIRTIO_PCI_CAP_OFFSET).
        let configuration_cap = VirtioPciCfgCap::new();
        self.cap_pci_cfg_info.offset = self
            .configuration
            .add_capability(&configuration_cap)
            .map_err(PciDeviceError::CapabilitiesSetup)?
            + VIRTIO_PCI_CAP_OFFSET;
        self.cap_pci_cfg_info.cap = configuration_cap;

        if self.msix_config.is_some() {
            let msix_cap = MsixCap::new(
                settings_bar,
                self.msix_num,
                MSIX_TABLE_BAR_OFFSET as u32,
                settings_bar,
                MSIX_PBA_BAR_OFFSET as u32,
            );
            self.configuration
                .add_capability(&msix_cap)
                .map_err(PciDeviceError::CapabilitiesSetup)?;
        }

        self.settings_bar = settings_bar;
        Ok(())
    }

    /// Service a guest read of the VIRTIO_PCI_CAP_PCI_CFG capability.
    ///
    /// Reads within the capability header return the cached header bytes;
    /// reads of the data window are forwarded to the BAR at the offset the
    /// guest previously programmed into the capability.
    fn read_cap_pci_cfg(&mut self, offset: usize, mut data: &mut [u8]) {
        let cap_slice = self.cap_pci_cfg_info.cap.as_slice();
        let data_len = data.len();
        let cap_len = cap_slice.len();
        if offset + data_len > cap_len {
            error!("Failed to read cap_pci_cfg from config space");
            return;
        }

        if offset < std::mem::size_of::<VirtioPciCap>() {
            if let Some(end) = offset.checked_add(data_len) {
                // This write can't fail, offset and end are checked against config_len.
                data.write_all(&cap_slice[offset..cmp::min(end, cap_len)])
                    .unwrap();
            }
        } else {
            // Safe since we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long.
            // NOTE(review): transmuting Le32 reinterprets its little-endian
            // bytes as a native u32 (correct on LE hosts); Le32::to_native()
            // would be the portable spelling — confirm.
            let bar_offset: u32 =
                unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) };
            self.read_bar(0, bar_offset as u64, data)
        }
    }

    /// Service a guest write to the VIRTIO_PCI_CAP_PCI_CFG capability.
    ///
    /// Writes within the capability header update the cached header (bar,
    /// offset, length); writes to the data window are forwarded to the BAR.
    fn write_cap_pci_cfg(&mut self, offset: usize, data: &[u8]) -> Option<Arc<Barrier>> {
        let cap_slice = self.cap_pci_cfg_info.cap.as_mut_slice();
        let data_len = data.len();
        let cap_len = cap_slice.len();
        if offset + data_len > cap_len {
            error!("Failed to write cap_pci_cfg to config space");
            return None;
        }

        if offset < std::mem::size_of::<VirtioPciCap>() {
            let (_, right) = cap_slice.split_at_mut(offset);
            right[..data_len].copy_from_slice(data);
            None
        } else {
            // Safe since we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long.
            // NOTE(review): see read_cap_pci_cfg() about the Le32 transmute.
            let bar_offset: u32 =
                unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) };
            self.write_bar(0, bar_offset as u64, data)
        }
    }

    /// Shared handle to the wrapped virtio device.
    pub fn virtio_device(&self) -> Arc<Mutex<dyn VirtioDevice>> {
        self.device.clone()
    }

    /// Build an activator holding clones of every ready queue, its eventfd,
    /// the guest memory and the interrupt object. Queues that fail
    /// `is_valid()` are logged but still included.
    ///
    /// Note this `take()`s `self.virtio_interrupt`, so the device holds no
    /// interrupt object until the activator has run (it moves it into the
    /// device's `activate()`).
    fn prepare_activator(&mut self, barrier: Option<Arc<Barrier>>) -> VirtioPciDeviceActivator {
        let mut queues = Vec::new();

        for (queue_index, queue) in self.queues.iter().enumerate() {
            if !queue.ready() {
                continue;
            }

            if !queue.is_valid(self.memory.memory().deref()) {
                error!("Queue {} is not valid", queue_index);
            }

            queues.push((
                queue_index,
                vm_virtio::clone_queue(queue),
                self.queue_evts[queue_index].try_clone().unwrap(),
            ));
        }

        VirtioPciDeviceActivator {
            interrupt: self.virtio_interrupt.take(),
            memory: Some(self.memory.clone()),
            device: self.device.clone(),
            queues: Some(queues),
            device_activated: self.device_activated.clone(),
            barrier,
            id: self.id.clone(),
        }
    }

    /// Activate the device synchronously (no barrier to wait on).
    fn activate(&mut self) -> ActivateResult {
        self.prepare_activator(None).activate()
    }

    /// True when the driver is ready but the device has not been activated yet.
    fn needs_activation(&self) -> bool {
        !self.device_activated.load(Ordering::SeqCst) && self.is_driver_ready()
    }

    /// Optional external DMA mapping handler, if one was provided at creation.
    pub fn dma_handler(&self) -> Option<&Arc<dyn ExternalDmaMapping>> {
        self.dma_handler.as_ref()
    }
}
743 
744 impl VirtioTransport for VirtioPciDevice {
745     fn ioeventfds(&self, base_addr: u64) -> Vec<(&EventFd, u64)> {
746         let notify_base = base_addr + NOTIFICATION_BAR_OFFSET;
747         self.queue_evts()
748             .iter()
749             .enumerate()
750             .map(|(i, event)| {
751                 (
752                     event,
753                     notify_base + i as u64 * u64::from(NOTIFY_OFF_MULTIPLIER),
754                 )
755             })
756             .collect()
757     }
758 }
759 
/// MSI-X backed implementation of `VirtioInterrupt`: resolves a virtio
/// interrupt (config or queue) to an MSI-X vector and triggers it.
pub struct VirtioInterruptMsix {
    msix_config: Arc<Mutex<MsixConfig>>,        // MSI-X table/PBA state.
    config_vector: Arc<AtomicU16>,              // Vector for config changes.
    queues_vectors: Arc<Mutex<Vec<u16>>>,       // Per-queue vectors.
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}
766 
767 impl VirtioInterruptMsix {
768     pub fn new(
769         msix_config: Arc<Mutex<MsixConfig>>,
770         config_vector: Arc<AtomicU16>,
771         queues_vectors: Arc<Mutex<Vec<u16>>>,
772         interrupt_source_group: Arc<dyn InterruptSourceGroup>,
773     ) -> Self {
774         VirtioInterruptMsix {
775             msix_config,
776             config_vector,
777             queues_vectors,
778             interrupt_source_group,
779         }
780     }
781 }
782 
impl VirtioInterrupt for VirtioInterruptMsix {
    /// Inject the MSI-X interrupt mapped to `int_type`.
    ///
    /// A vector of VIRTQ_MSI_NO_VECTOR means the guest disabled interrupts
    /// for this source, so nothing is injected.
    fn trigger(&self, int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> {
        let vector = match int_type {
            VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire),
            VirtioInterruptType::Queue(queue_index) => {
                self.queues_vectors.lock().unwrap()[queue_index as usize]
            }
        };

        if vector == VIRTQ_MSI_NO_VECTOR {
            return Ok(());
        }

        let config = &mut self.msix_config.lock().unwrap();
        let entry = &config.table_entries[vector as usize];
        // In case the vector control register associated with the entry
        // has its first bit set, this means the vector is masked and the
        // device should not inject the interrupt.
        // Instead, the Pending Bit Array table is updated to reflect there
        // is a pending interrupt for this specific vector.
        if config.masked() || entry.masked() {
            config.set_pba_bit(vector, false);
            return Ok(());
        }

        self.interrupt_source_group
            .trigger(vector as InterruptIndex)
    }

    /// Return the eventfd backing the vector mapped to `int_type`, if any.
    ///
    /// NOTE(review): unlike trigger(), this does not special-case
    /// VIRTQ_MSI_NO_VECTOR before indexing the interrupt group — confirm the
    /// group returns None for out-of-range vectors.
    fn notifier(&self, int_type: VirtioInterruptType) -> Option<EventFd> {
        let vector = match int_type {
            VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire),
            VirtioInterruptType::Queue(queue_index) => {
                self.queues_vectors.lock().unwrap()[queue_index as usize]
            }
        };

        self.interrupt_source_group
            .notifier(vector as InterruptIndex)
    }
}
824 
825 impl PciDevice for VirtioPciDevice {
    /// Handle a guest write to PCI config space.
    ///
    /// Writes landing inside the VIRTIO_PCI_CAP_PCI_CFG capability body are
    /// routed to `write_cap_pci_cfg()`; everything else goes to the regular
    /// PCI configuration registers.
    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG
        // is accessed. This capability has a special meaning as it allows the
        // guest to access other capabilities without mapping the PCI BAR.
        let base = reg_idx * 4;
        if base + offset as usize >= self.cap_pci_cfg_info.offset
            && base + offset as usize + data.len()
                <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len()
        {
            // Rebase the access so it is relative to the capability body.
            let offset = base + offset as usize - self.cap_pci_cfg_info.offset;
            self.write_cap_pci_cfg(offset, data)
        } else {
            self.configuration
                .write_config_register(reg_idx, offset, data);
            None
        }
    }
848 
    /// Handle a guest read of PCI config space (always a full 32-bit register).
    ///
    /// Reads landing inside the VIRTIO_PCI_CAP_PCI_CFG capability body are
    /// routed to `read_cap_pci_cfg()`; everything else reads the regular
    /// PCI configuration registers.
    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG
        // is accessed. This capability has a special meaning as it allows the
        // guest to access other capabilities without mapping the PCI BAR.
        let base = reg_idx * 4;
        if base >= self.cap_pci_cfg_info.offset
            && base + 4 <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len()
        {
            let offset = base - self.cap_pci_cfg_info.offset;
            let mut data = [0u8; 4];
            self.read_cap_pci_cfg(offset, &mut data);
            u32::from_le_bytes(data)
        } else {
            self.configuration.read_reg(reg_idx)
        }
    }
865 
866     fn detect_bar_reprogramming(
867         &mut self,
868         reg_idx: usize,
869         data: &[u8],
870     ) -> Option<BarReprogrammingParams> {
871         self.configuration.detect_bar_reprogramming(reg_idx, data)
872     }
873 
874     fn allocate_bars(
875         &mut self,
876         allocator: &Arc<Mutex<SystemAllocator>>,
877         mmio_allocator: &mut AddressAllocator,
878         resources: Option<Vec<Resource>>,
879     ) -> std::result::Result<Vec<PciBarConfiguration>, PciDeviceError> {
880         let mut bars = Vec::new();
881         let device_clone = self.device.clone();
882         let device = device_clone.lock().unwrap();
883 
884         let mut settings_bar_addr = None;
885         if let Some(resources) = &resources {
886             for resource in resources {
887                 if let Resource::PciBar { index, base, .. } = resource {
888                     if *index == VIRTIO_COMMON_BAR_INDEX {
889                         settings_bar_addr = Some(GuestAddress(*base));
890                         break;
891                     }
892                 }
893             }
894             // Error out if no resource was matching the BAR id.
895             if settings_bar_addr.is_none() {
896                 return Err(PciDeviceError::MissingResource);
897             }
898         }
899 
900         // Allocate the virtio-pci capability BAR.
901         // See http://docs.oasis-open.org/virtio/virtio/v1.0/cs04/virtio-v1.0-cs04.html#x1-740004
902         let (virtio_pci_bar_addr, region_type) = if self.use_64bit_bar {
903             let region_type = PciBarRegionType::Memory64BitRegion;
904             let addr = mmio_allocator
905                 .allocate(
906                     settings_bar_addr,
907                     CAPABILITY_BAR_SIZE,
908                     Some(CAPABILITY_BAR_SIZE),
909                 )
910                 .ok_or(PciDeviceError::IoAllocationFailed(CAPABILITY_BAR_SIZE))?;
911             (addr, region_type)
912         } else {
913             let region_type = PciBarRegionType::Memory32BitRegion;
914             let addr = allocator
915                 .lock()
916                 .unwrap()
917                 .allocate_mmio_hole_addresses(
918                     settings_bar_addr,
919                     CAPABILITY_BAR_SIZE,
920                     Some(CAPABILITY_BAR_SIZE),
921                 )
922                 .ok_or(PciDeviceError::IoAllocationFailed(CAPABILITY_BAR_SIZE))?;
923             (addr, region_type)
924         };
925 
926         let bar = PciBarConfiguration::default()
927             .set_index(VIRTIO_COMMON_BAR_INDEX)
928             .set_address(virtio_pci_bar_addr.raw_value())
929             .set_size(CAPABILITY_BAR_SIZE)
930             .set_region_type(region_type);
931         self.configuration.add_pci_bar(&bar).map_err(|e| {
932             PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr.raw_value(), e)
933         })?;
934 
935         bars.push(bar);
936 
937         // Once the BARs are allocated, the capabilities can be added to the PCI configuration.
938         self.add_pci_capabilities(VIRTIO_COMMON_BAR_INDEX as u8)?;
939 
940         // Allocate a dedicated BAR if there are some shared memory regions.
941         if let Some(shm_list) = device.get_shm_regions() {
942             let bar = PciBarConfiguration::default()
943                 .set_index(VIRTIO_SHM_BAR_INDEX)
944                 .set_address(shm_list.addr.raw_value())
945                 .set_size(shm_list.len);
946             self.configuration
947                 .add_pci_bar(&bar)
948                 .map_err(|e| PciDeviceError::IoRegistrationFailed(shm_list.addr.raw_value(), e))?;
949 
950             bars.push(bar);
951 
952             for (idx, shm) in shm_list.region_list.iter().enumerate() {
953                 let shm_cap = VirtioPciCap64::new(
954                     PciCapabilityType::SharedMemoryConfig,
955                     VIRTIO_SHM_BAR_INDEX as u8,
956                     idx as u8,
957                     shm.offset,
958                     shm.len,
959                 );
960                 self.configuration
961                     .add_capability(&shm_cap)
962                     .map_err(PciDeviceError::CapabilitiesSetup)?;
963             }
964         }
965 
966         self.bar_regions = bars.clone();
967 
968         Ok(bars)
969     }
970 
971     fn free_bars(
972         &mut self,
973         allocator: &mut SystemAllocator,
974         mmio_allocator: &mut AddressAllocator,
975     ) -> std::result::Result<(), PciDeviceError> {
976         for bar in self.bar_regions.drain(..) {
977             match bar.region_type() {
978                 PciBarRegionType::Memory32BitRegion => {
979                     allocator.free_mmio_hole_addresses(GuestAddress(bar.addr()), bar.size());
980                 }
981                 PciBarRegionType::Memory64BitRegion => {
982                     mmio_allocator.free(GuestAddress(bar.addr()), bar.size());
983                 }
984                 _ => error!("Unexpected PCI bar type"),
985             }
986         }
987         Ok(())
988     }
989 
990     fn move_bar(&mut self, old_base: u64, new_base: u64) -> result::Result<(), std::io::Error> {
991         // We only update our idea of the bar in order to support free_bars() above.
992         // The majority of the reallocation is done inside DeviceManager.
993         for bar in self.bar_regions.iter_mut() {
994             if bar.addr() == old_base {
995                 *bar = bar.set_address(new_base);
996             }
997         }
998 
999         Ok(())
1000     }
1001 
1002     fn read_bar(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
1003         match offset {
1004             o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.read(
1005                 o - COMMON_CONFIG_BAR_OFFSET,
1006                 data,
1007                 &mut self.queues,
1008                 self.device.clone(),
1009             ),
1010             o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => {
1011                 if let Some(v) = data.get_mut(0) {
1012                     // Reading this register resets it to 0.
1013                     *v = self.interrupt_status.swap(0, Ordering::AcqRel) as u8;
1014                 }
1015             }
1016             o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE)
1017                 .contains(&o) =>
1018             {
1019                 let device = self.device.lock().unwrap();
1020                 device.read_config(o - DEVICE_CONFIG_BAR_OFFSET, data);
1021             }
1022             o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE)
1023                 .contains(&o) =>
1024             {
1025                 // Handled with ioeventfds.
1026             }
1027             o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => {
1028                 if let Some(msix_config) = &self.msix_config {
1029                     msix_config
1030                         .lock()
1031                         .unwrap()
1032                         .read_table(o - MSIX_TABLE_BAR_OFFSET, data);
1033                 }
1034             }
1035             o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => {
1036                 if let Some(msix_config) = &self.msix_config {
1037                     msix_config
1038                         .lock()
1039                         .unwrap()
1040                         .read_pba(o - MSIX_PBA_BAR_OFFSET, data);
1041                 }
1042             }
1043             _ => (),
1044         }
1045     }
1046 
1047     fn write_bar(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
1048         match offset {
1049             o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.write(
1050                 o - COMMON_CONFIG_BAR_OFFSET,
1051                 data,
1052                 &mut self.queues,
1053                 self.device.clone(),
1054             ),
1055             o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => {
1056                 if let Some(v) = data.first() {
1057                     self.interrupt_status
1058                         .fetch_and(!(*v as usize), Ordering::AcqRel);
1059                 }
1060             }
1061             o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE)
1062                 .contains(&o) =>
1063             {
1064                 let mut device = self.device.lock().unwrap();
1065                 device.write_config(o - DEVICE_CONFIG_BAR_OFFSET, data);
1066             }
1067             o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE)
1068                 .contains(&o) =>
1069             {
1070                 // Handled with ioeventfds.
1071                 error!("Unexpected write to notification BAR: offset = 0x{:x}", o);
1072             }
1073             o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => {
1074                 if let Some(msix_config) = &self.msix_config {
1075                     msix_config
1076                         .lock()
1077                         .unwrap()
1078                         .write_table(o - MSIX_TABLE_BAR_OFFSET, data);
1079                 }
1080             }
1081             o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => {
1082                 if let Some(msix_config) = &self.msix_config {
1083                     msix_config
1084                         .lock()
1085                         .unwrap()
1086                         .write_pba(o - MSIX_PBA_BAR_OFFSET, data);
1087                 }
1088             }
1089             _ => (),
1090         };
1091 
1092         // Try and activate the device if the driver status has changed
1093         if self.needs_activation() {
1094             let barrier = Arc::new(Barrier::new(2));
1095             let activator = self.prepare_activator(Some(barrier.clone()));
1096             self.pending_activations.lock().unwrap().push(activator);
1097             info!(
1098                 "{}: Needs activation; writing to activate event fd",
1099                 self.id
1100             );
1101             self.activate_evt.write(1).ok();
1102             info!("{}: Needs activation; returning barrier", self.id);
1103             return Some(barrier);
1104         }
1105 
1106         // Device has been reset by the driver
1107         if self.device_activated.load(Ordering::SeqCst) && self.is_driver_init() {
1108             let mut device = self.device.lock().unwrap();
1109             if let Some(virtio_interrupt) = device.reset() {
1110                 // Upon reset the device returns its interrupt EventFD
1111                 self.virtio_interrupt = Some(virtio_interrupt);
1112                 self.device_activated.store(false, Ordering::SeqCst);
1113 
1114                 // Reset queue readiness (changes queue_enable), queue sizes
1115                 // and selected_queue as per spec for reset
1116                 self.queues.iter_mut().for_each(Queue::reset);
1117                 self.common_config.queue_select = 0;
1118             } else {
1119                 error!("Attempt to reset device when not implemented in underlying device");
1120                 self.common_config.driver_status = crate::DEVICE_FAILED as u8;
1121             }
1122         }
1123 
1124         None
1125     }
1126 
1127     fn as_any(&mut self) -> &mut dyn Any {
1128         self
1129     }
1130 
1131     fn id(&self) -> Option<String> {
1132         Some(self.id.clone())
1133     }
1134 }
1135 
1136 impl BusDevice for VirtioPciDevice {
1137     fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
1138         self.read_bar(base, offset, data)
1139     }
1140 
1141     fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
1142         self.write_bar(base, offset, data)
1143     }
1144 }
1145 
1146 impl Pausable for VirtioPciDevice {
1147     fn pause(&mut self) -> result::Result<(), MigratableError> {
1148         Ok(())
1149     }
1150 
1151     fn resume(&mut self) -> result::Result<(), MigratableError> {
1152         Ok(())
1153     }
1154 }
1155 
impl Snapshottable for VirtioPciDevice {
    // Identifier under which this device's snapshot section is stored.
    fn id(&self) -> String {
        self.id.clone()
    }

    // Capture the transport state for migration: the versioned device state
    // plus nested snapshots for the PCI configuration, the virtio common
    // configuration and (when present) the MSI-X configuration.
    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut virtio_pci_dev_snapshot =
            Snapshot::new_from_versioned_state(&self.id, &self.state())?;

        // Snapshot PciConfiguration
        virtio_pci_dev_snapshot.add_snapshot(self.configuration.snapshot()?);

        // Snapshot VirtioPciCommonConfig
        virtio_pci_dev_snapshot.add_snapshot(self.common_config.snapshot()?);

        // Snapshot MSI-X
        if let Some(msix_config) = &self.msix_config {
            virtio_pci_dev_snapshot.add_snapshot(msix_config.lock().unwrap().snapshot()?);
        }

        Ok(virtio_pci_dev_snapshot)
    }

    // Restore from a snapshot. The sub-components (MSI-X, common config, PCI
    // config) are restored first, then the device state — which rebuilds the
    // virtqueues — and only then is the device activated, since activation
    // spawns worker threads that rely on the queues being correct.
    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        if let Some(virtio_pci_dev_section) =
            snapshot.snapshot_data.get(&format!("{}-section", self.id))
        {
            // Restore MSI-X
            if let Some(msix_config) = &self.msix_config {
                let id = msix_config.lock().unwrap().id();
                if let Some(msix_snapshot) = snapshot.snapshots.get(&id) {
                    msix_config
                        .lock()
                        .unwrap()
                        .restore(*msix_snapshot.clone())?;
                }
            }

            // Restore VirtioPciCommonConfig
            if let Some(virtio_config_snapshot) = snapshot.snapshots.get(&self.common_config.id()) {
                self.common_config
                    .restore(*virtio_config_snapshot.clone())?;
            }

            // Restore PciConfiguration
            if let Some(pci_config_snapshot) = snapshot.snapshots.get(&self.configuration.id()) {
                self.configuration.restore(*pci_config_snapshot.clone())?;
            }

            // First restore the status of the virtqueues.
            self.set_state(&virtio_pci_dev_section.to_versioned_state()?)
                .map_err(|e| {
                    MigratableError::Restore(anyhow!(
                        "Could not restore VIRTIO_PCI_DEVICE state {:?}",
                        e
                    ))
                })?;

            // Then we can activate the device, as we know at this point that
            // the virtqueues are in the right state and the device is ready
            // to be activated, which will spawn each virtio worker thread.
            if self.device_activated.load(Ordering::SeqCst) && self.is_driver_ready() {
                self.activate().map_err(|e| {
                    MigratableError::Restore(anyhow!("Failed activating the device: {:?}", e))
                })?;
            }

            return Ok(());
        }

        // The snapshot did not contain this device's section.
        Err(MigratableError::Restore(anyhow!(
            "Could not find VIRTIO_PCI_DEVICE snapshot section"
        )))
    }
}
// Marker implementations: the default trait methods are sufficient, since the
// Pausable and Snapshottable implementations above provide all the state
// handling needed for migration.
impl Transportable for VirtioPciDevice {}
impl Migratable for VirtioPciDevice {}
1233