xref: /cloud-hypervisor/virtio-devices/src/transport/pci_device.rs (revision f7f2f25a574b1b2dba22c094fc8226d404157d15)
1 // Copyright 2018 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE-BSD-3-Clause file.
4 //
5 // Copyright © 2019 Intel Corporation
6 //
7 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
8 
9 use super::VirtioPciCommonConfig;
10 use crate::transport::VirtioTransport;
11 use crate::GuestMemoryMmap;
12 use crate::{
13     ActivateResult, Queue, VirtioDevice, VirtioDeviceType, VirtioInterrupt, VirtioInterruptType,
14     DEVICE_ACKNOWLEDGE, DEVICE_DRIVER, DEVICE_DRIVER_OK, DEVICE_FAILED, DEVICE_FEATURES_OK,
15     DEVICE_INIT,
16 };
17 use anyhow::anyhow;
18 use libc::EFD_NONBLOCK;
19 use pci::{
20     BarReprogrammingParams, MsixCap, MsixConfig, PciBarConfiguration, PciBarRegionType,
21     PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, PciDevice, PciDeviceError,
22     PciHeaderType, PciMassStorageSubclass, PciNetworkControllerSubclass, PciSubclass,
23 };
24 use std::any::Any;
25 use std::cmp;
26 use std::io::Write;
27 use std::num::Wrapping;
28 use std::result;
29 use std::sync::atomic::{AtomicBool, AtomicU16, AtomicUsize, Ordering};
30 use std::sync::{Arc, Barrier, Mutex};
31 use versionize::{VersionMap, Versionize, VersionizeResult};
32 use versionize_derive::Versionize;
33 use vm_allocator::SystemAllocator;
34 use vm_device::interrupt::{
35     InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
36 };
37 use vm_device::BusDevice;
38 use vm_memory::{
39     Address, ByteValued, GuestAddress, GuestAddressSpace, GuestMemoryAtomic, GuestUsize, Le32,
40 };
41 use vm_migration::{
42     Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
43 };
44 use vm_virtio::{queue, VirtioIommuRemapping, VIRTIO_MSI_NO_VECTOR};
45 use vmm_sys_util::{errno::Result, eventfd::EventFd};
46 
/// Errors local to the virtio-pci transport layer.
#[derive(Debug)]
enum Error {
    /// Failed to retrieve queue ring's index.
    QueueRingIndex(queue::Error),
}
52 
// Values for the cfg_type field of the virtio vendor-specific PCI
// capabilities, as defined in section 4.1.4 of the virtio specification.
// The discriminants are guest-visible and must not be changed.
#[allow(clippy::enum_variant_names)]
enum PciCapabilityType {
    CommonConfig = 1,
    NotifyConfig = 2,
    IsrConfig = 3,
    DeviceConfig = 4,
    PciConfig = 5,
    SharedMemoryConfig = 8,
}
62 
// This offset represents the 2 bytes omitted from the VirtioPciCap structure
// as they are already handled through add_capability(). These 2 bytes are the
// fields cap_vndr (1 byte) and cap_next (1 byte) defined in the virtio spec.
const VIRTIO_PCI_CAP_OFFSET: usize = 2;

// Guest-visible layout of a virtio vendor-specific capability (struct
// virtio_pci_cap in the virtio spec), minus the leading cap_vndr/cap_next
// bytes which PciConfiguration::add_capability() provides itself.
// #[repr(packed)] keeps the byte layout exactly as the spec mandates.
#[allow(dead_code)]
#[repr(packed)]
#[derive(Clone, Copy, Default)]
struct VirtioPciCap {
    cap_len: u8,      // Generic PCI field: capability length
    cfg_type: u8,     // Identifies the structure.
    pci_bar: u8,      // Where to find it.
    id: u8,           // Multiple capabilities of the same type
    padding: [u8; 2], // Pad to full dword.
    offset: Le32,     // Offset within bar.
    length: Le32,     // Length of the structure, in bytes.
}
// It is safe to implement ByteValued. All members are simple numbers and any value is valid.
unsafe impl ByteValued for VirtioPciCap {}
82 
impl PciCapability for VirtioPciCap {
    // Raw bytes exposed to the guest through the PCI configuration space.
    fn bytes(&self) -> &[u8] {
        self.as_slice()
    }

    // Virtio capabilities use the PCI vendor-specific capability ID.
    fn id(&self) -> PciCapabilityId {
        PciCapabilityId::VendorSpecific
    }
}
92 
93 const VIRTIO_PCI_CAP_LEN_OFFSET: u8 = 2;
94 
95 impl VirtioPciCap {
96     pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, offset: u32, length: u32) -> Self {
97         VirtioPciCap {
98             cap_len: (std::mem::size_of::<VirtioPciCap>() as u8) + VIRTIO_PCI_CAP_LEN_OFFSET,
99             cfg_type: cfg_type as u8,
100             pci_bar,
101             id: 0,
102             padding: [0; 2],
103             offset: Le32::from(offset),
104             length: Le32::from(length),
105         }
106     }
107 }
108 
// Guest-visible layout of the virtio notification capability (struct
// virtio_pci_notify_cap in the virtio spec): the generic capability header
// followed by the per-queue notify offset multiplier.
#[allow(dead_code)]
#[repr(packed)]
#[derive(Clone, Copy, Default)]
struct VirtioPciNotifyCap {
    cap: VirtioPciCap,
    notify_off_multiplier: Le32,
}
// It is safe to implement ByteValued. All members are simple numbers and any value is valid.
unsafe impl ByteValued for VirtioPciNotifyCap {}
118 
impl PciCapability for VirtioPciNotifyCap {
    // Raw bytes exposed to the guest through the PCI configuration space.
    fn bytes(&self) -> &[u8] {
        self.as_slice()
    }

    // Virtio capabilities use the PCI vendor-specific capability ID.
    fn id(&self) -> PciCapabilityId {
        PciCapabilityId::VendorSpecific
    }
}
128 
129 impl VirtioPciNotifyCap {
130     pub fn new(
131         cfg_type: PciCapabilityType,
132         pci_bar: u8,
133         offset: u32,
134         length: u32,
135         multiplier: Le32,
136     ) -> Self {
137         VirtioPciNotifyCap {
138             cap: VirtioPciCap {
139                 cap_len: (std::mem::size_of::<VirtioPciNotifyCap>() as u8)
140                     + VIRTIO_PCI_CAP_LEN_OFFSET,
141                 cfg_type: cfg_type as u8,
142                 pci_bar,
143                 id: 0,
144                 padding: [0; 2],
145                 offset: Le32::from(offset),
146                 length: Le32::from(length),
147             },
148             notify_off_multiplier: multiplier,
149         }
150     }
151 }
152 
// Guest-visible layout of the 64-bit virtio capability (struct
// virtio_pci_cap64 in the virtio spec): the generic header carries the low
// 32 bits of offset/length, the two trailing dwords carry the high 32 bits.
#[allow(dead_code)]
#[repr(packed)]
#[derive(Clone, Copy, Default)]
struct VirtioPciCap64 {
    cap: VirtioPciCap,
    offset_hi: Le32,
    length_hi: Le32,
}
// It is safe to implement ByteValued. All members are simple numbers and any value is valid.
unsafe impl ByteValued for VirtioPciCap64 {}
163 
impl PciCapability for VirtioPciCap64 {
    // Raw bytes exposed to the guest through the PCI configuration space.
    fn bytes(&self) -> &[u8] {
        self.as_slice()
    }

    // Virtio capabilities use the PCI vendor-specific capability ID.
    fn id(&self) -> PciCapabilityId {
        PciCapabilityId::VendorSpecific
    }
}
173 
174 impl VirtioPciCap64 {
175     pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, id: u8, offset: u64, length: u64) -> Self {
176         VirtioPciCap64 {
177             cap: VirtioPciCap {
178                 cap_len: (std::mem::size_of::<VirtioPciCap64>() as u8) + VIRTIO_PCI_CAP_LEN_OFFSET,
179                 cfg_type: cfg_type as u8,
180                 pci_bar,
181                 id,
182                 padding: [0; 2],
183                 offset: Le32::from(offset as u32),
184                 length: Le32::from(length as u32),
185             },
186             offset_hi: Le32::from((offset >> 32) as u32),
187             length_hi: Le32::from((length >> 32) as u32),
188         }
189     }
190 }
191 
// Guest-visible layout of the VIRTIO_PCI_CAP_PCI_CFG capability (struct
// virtio_pci_cfg_cap in the virtio spec): the generic header plus a 4-byte
// data window through which the guest can access the other structures
// without mapping the BARs.
#[allow(dead_code)]
#[repr(packed)]
#[derive(Clone, Copy, Default)]
struct VirtioPciCfgCap {
    cap: VirtioPciCap,
    pci_cfg_data: [u8; 4],
}
// It is safe to implement ByteValued. All members are simple numbers and any value is valid.
unsafe impl ByteValued for VirtioPciCfgCap {}
201 
impl PciCapability for VirtioPciCfgCap {
    // Raw bytes exposed to the guest through the PCI configuration space.
    fn bytes(&self) -> &[u8] {
        self.as_slice()
    }

    // Virtio capabilities use the PCI vendor-specific capability ID.
    fn id(&self) -> PciCapabilityId {
        PciCapabilityId::VendorSpecific
    }
}
211 
212 impl VirtioPciCfgCap {
213     fn new() -> Self {
214         VirtioPciCfgCap {
215             cap: VirtioPciCap::new(PciCapabilityType::PciConfig, 0, 0, 0),
216             ..Default::default()
217         }
218     }
219 }
220 
// Cached copy of the VIRTIO_PCI_CAP_PCI_CFG capability together with its
// byte offset within the PCI configuration space, so config accesses
// targeting it can be intercepted.
#[derive(Clone, Copy, Default)]
struct VirtioPciCfgCapInfo {
    offset: usize,       // Offset of the capability body in config space.
    cap: VirtioPciCfgCap, // Cached capability contents.
}
226 
// PCI subclass used for virtio devices that have no better-matching PCI
// class (i.e. neither network nor block).
#[allow(dead_code)]
#[derive(Copy, Clone)]
pub enum PciVirtioSubclass {
    NonTransitionalBase = 0xff,
}

impl PciSubclass for PciVirtioSubclass {
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}
238 
// Allocate one bar for the structs pointed to by the capability structures.
// As per the PCI specification, because the same BAR shares MSI-X and non
// MSI-X structures, it is recommended to use 8KiB alignment for all those
// structures.
const COMMON_CONFIG_BAR_OFFSET: u64 = 0x0000;
const COMMON_CONFIG_SIZE: u64 = 56;
const ISR_CONFIG_BAR_OFFSET: u64 = 0x2000;
const ISR_CONFIG_SIZE: u64 = 1;
const DEVICE_CONFIG_BAR_OFFSET: u64 = 0x4000;
const DEVICE_CONFIG_SIZE: u64 = 0x1000;
const NOTIFICATION_BAR_OFFSET: u64 = 0x6000;
const NOTIFICATION_SIZE: u64 = 0x1000;
const MSIX_TABLE_BAR_OFFSET: u64 = 0x8000;
// The size is 256KiB because the table can hold up to 2048 entries, with each
// entry being 128 bits (4 DWORDS).
const MSIX_TABLE_SIZE: u64 = 0x40000;
const MSIX_PBA_BAR_OFFSET: u64 = 0x48000;
// The size is 2KiB because the Pending Bit Array has one bit per vector and it
// can support up to 2048 vectors.
const MSIX_PBA_SIZE: u64 = 0x800;
// The BAR size must be a power of 2.
const CAPABILITY_BAR_SIZE: u64 = 0x80000;

// Spacing between per-queue notification addresses within the
// notification region.
const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address.

// Vendor and device IDs mandated by the virtio specification for
// non-transitional (modern) virtio-pci devices.
const VIRTIO_PCI_VENDOR_ID: u16 = 0x1af4;
const VIRTIO_PCI_DEVICE_ID_BASE: u16 = 0x1040; // Add to device type to get device ID.
266 
// Serializable snapshot of a single virtqueue's guest-visible state.
#[derive(Versionize)]
struct QueueState {
    max_size: u16,
    size: u16,
    ready: bool,
    vector: u16,
    desc_table: u64,
    avail_ring: u64,
    used_ring: u64,
}

// Serializable snapshot of the virtio-pci transport state, used for
// pause/resume and migration.
#[derive(Versionize)]
struct VirtioPciDeviceState {
    device_activated: bool,
    queues: Vec<QueueState>,
    interrupt_status: usize,
}

impl VersionMapped for VirtioPciDeviceState {}
286 
/// PCI transport for a virtio device.
///
/// Exposes the virtio common/ISR/device/notification configuration
/// structures through a PCI BAR, delivers interrupts through MSI-X, and
/// forwards guest accesses to the wrapped `VirtioDevice` implementation.
pub struct VirtioPciDevice {
    // Identifier used for logging and snapshotting.
    id: String,

    // PCI configuration registers.
    configuration: PciConfiguration,

    // virtio PCI common configuration
    common_config: VirtioPciCommonConfig,

    // MSI-X config
    msix_config: Option<Arc<Mutex<MsixConfig>>>,

    // Number of MSI-X vectors
    msix_num: u16,

    // Virtio device reference and status
    device: Arc<Mutex<dyn VirtioDevice>>,
    device_activated: Arc<AtomicBool>,

    // PCI interrupts.
    interrupt_status: Arc<AtomicUsize>,
    virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,

    // virtio queues
    queues: Vec<Queue>,
    queue_evts: Vec<EventFd>,

    // Guest memory
    memory: Option<GuestMemoryAtomic<GuestMemoryMmap>>,

    // Settings PCI BAR
    settings_bar: u8,
    settings_bar_addr: Option<GuestAddress>,

    // Whether to use 64-bit bar location or 32-bit
    use_64bit_bar: bool,

    // Add a dedicated structure to hold information about the very specific
    // virtio-pci capability VIRTIO_PCI_CAP_PCI_CFG. This is needed to support
    // the legacy/backward compatible mechanism of letting the guest access the
    // other virtio capabilities without mapping the PCI BARs. This can be
    // needed when the guest tries to early access the virtio configuration of
    // a device.
    cap_pci_cfg_info: VirtioPciCfgCapInfo,

    // Details of bar regions to free
    bar_regions: Vec<(GuestAddress, GuestUsize, PciBarRegionType)>,

    // EventFd to signal on to request activation
    activate_evt: EventFd,

    // Barrier that is used to wait on for activation
    activate_barrier: Arc<Barrier>,
}
342 
343 impl VirtioPciDevice {
    /// Constructs a new PCI transport for the given virtio device.
    ///
    /// Allocates one notification eventfd and one virtqueue per queue the
    /// device reports, creates the MSI-X configuration (when `msix_num` > 0)
    /// with a matching interrupt source group, and builds the PCI
    /// configuration space header from the device type.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        id: String,
        memory: GuestMemoryAtomic<GuestMemoryMmap>,
        device: Arc<Mutex<dyn VirtioDevice>>,
        msix_num: u16,
        iommu_mapping_cb: Option<Arc<VirtioIommuRemapping>>,
        interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        pci_device_bdf: u32,
        activate_evt: EventFd,
    ) -> Result<Self> {
        // Hold the device lock for the whole setup so the queue count and
        // device type cannot change underneath us.
        let device_clone = device.clone();
        let locked_device = device_clone.lock().unwrap();
        // One notification eventfd per virtqueue.
        let mut queue_evts = Vec::new();
        for _ in locked_device.queue_max_sizes().iter() {
            queue_evts.push(EventFd::new(EFD_NONBLOCK)?)
        }
        // Build the queues, wiring the optional IOMMU address remapping
        // callback into each of them.
        let queues = locked_device
            .queue_max_sizes()
            .iter()
            .map(|&s| {
                let mut queue = Queue::new(s);
                queue.iommu_mapping_cb = iommu_mapping_cb.clone();
                queue
            })
            .collect();

        // Modern virtio-pci device IDs are 0x1040 + device type (virtio spec).
        let pci_device_id = VIRTIO_PCI_DEVICE_ID_BASE + locked_device.device_type() as u16;

        let interrupt_source_group = interrupt_manager.create_group(MsiIrqGroupConfig {
            base: 0,
            count: msix_num as InterruptIndex,
        })?;

        // The MSI-X config is shared with PciConfiguration (hence the clone),
        // and is only created when vectors were requested.
        let (msix_config, msix_config_clone) = if msix_num > 0 {
            let msix_config = Arc::new(Mutex::new(MsixConfig::new(
                msix_num,
                interrupt_source_group.clone(),
                pci_device_bdf,
            )));
            let msix_config_clone = msix_config.clone();
            (Some(msix_config), Some(msix_config_clone))
        } else {
            (None, None)
        };

        // All device types *except* virtio block devices should be allocated a 64-bit bar
        // The block devices should be given a 32-bit BAR so that they are easily accessible
        // to firmware without requiring excessive identity mapping.
        let mut use_64bit_bar = true;
        let (class, subclass) = match VirtioDeviceType::from(locked_device.device_type()) {
            VirtioDeviceType::Net => (
                PciClassCode::NetworkController,
                &PciNetworkControllerSubclass::EthernetController as &dyn PciSubclass,
            ),
            VirtioDeviceType::Block => {
                use_64bit_bar = false;
                (
                    PciClassCode::MassStorage,
                    &PciMassStorageSubclass::MassStorage as &dyn PciSubclass,
                )
            }
            _ => (
                PciClassCode::Other,
                &PciVirtioSubclass::NonTransitionalBase as &dyn PciSubclass,
            ),
        };

        let configuration = PciConfiguration::new(
            VIRTIO_PCI_VENDOR_ID,
            pci_device_id,
            0x1, // For modern virtio-PCI devices
            class,
            subclass,
            None,
            PciHeaderType::Device,
            VIRTIO_PCI_VENDOR_ID,
            pci_device_id,
            msix_config_clone,
        );

        let mut virtio_pci_device = VirtioPciDevice {
            id,
            configuration,
            common_config: VirtioPciCommonConfig {
                driver_status: 0,
                config_generation: 0,
                device_feature_select: 0,
                driver_feature_select: 0,
                queue_select: 0,
                msix_config: Arc::new(AtomicU16::new(VIRTIO_MSI_NO_VECTOR)),
            },
            msix_config,
            msix_num,
            device,
            device_activated: Arc::new(AtomicBool::new(false)),
            interrupt_status: Arc::new(AtomicUsize::new(0)),
            virtio_interrupt: None,
            queues,
            queue_evts,
            memory: Some(memory),
            settings_bar: 0,
            settings_bar_addr: None,
            use_64bit_bar,
            interrupt_source_group,
            cap_pci_cfg_info: VirtioPciCfgCapInfo::default(),
            bar_regions: vec![],
            activate_evt,
            // Two parties synchronize on activation: see maybe_activate().
            activate_barrier: Arc::new(Barrier::new(2)),
        };

        // With MSI-X available, interrupts are delivered through the MSI-X
        // table; without it, virtio_interrupt stays None.
        if let Some(msix_config) = &virtio_pci_device.msix_config {
            virtio_pci_device.virtio_interrupt = Some(Arc::new(VirtioInterruptMsix::new(
                msix_config.clone(),
                virtio_pci_device.common_config.msix_config.clone(),
                virtio_pci_device.interrupt_source_group.clone(),
            )));
        }

        Ok(virtio_pci_device)
    }
466 
467     fn state(&self) -> VirtioPciDeviceState {
468         VirtioPciDeviceState {
469             device_activated: self.device_activated.load(Ordering::Acquire),
470             interrupt_status: self.interrupt_status.load(Ordering::Acquire),
471             queues: self
472                 .queues
473                 .iter()
474                 .map(|q| QueueState {
475                     max_size: q.max_size,
476                     size: q.size,
477                     ready: q.ready,
478                     vector: q.vector,
479                     desc_table: q.desc_table.0,
480                     avail_ring: q.avail_ring.0,
481                     used_ring: q.used_ring.0,
482                 })
483                 .collect(),
484         }
485     }
486 
    // Restores the transport state captured by state(). Queue ring indexes
    // are re-derived from guest memory rather than taken from the snapshot.
    fn set_state(&mut self, state: &VirtioPciDeviceState) -> std::result::Result<(), Error> {
        self.device_activated
            .store(state.device_activated, Ordering::Release);
        self.interrupt_status
            .store(state.interrupt_status, Ordering::Release);

        // Update virtqueues indexes for both available and used rings.
        if let Some(mem) = self.memory.as_ref() {
            let mem = mem.memory();
            for (i, queue) in self.queues.iter_mut().enumerate() {
                queue.max_size = state.queues[i].max_size;
                queue.size = state.queues[i].size;
                queue.ready = state.queues[i].ready;
                queue.vector = state.queues[i].vector;
                queue.desc_table = GuestAddress(state.queues[i].desc_table);
                queue.avail_ring = GuestAddress(state.queues[i].avail_ring);
                queue.used_ring = GuestAddress(state.queues[i].used_ring);
                // NOTE(review): both next_avail and next_used are seeded from
                // the used ring index in guest memory — this assumes every
                // descriptor made available before the snapshot was also
                // consumed (used). TODO confirm this invariant holds for all
                // devices at snapshot time.
                queue.next_avail = Wrapping(
                    queue
                        .used_index_from_memory(&mem)
                        .map_err(Error::QueueRingIndex)?,
                );
                queue.next_used = Wrapping(
                    queue
                        .used_index_from_memory(&mem)
                        .map_err(Error::QueueRingIndex)?,
                );
            }
        }

        Ok(())
    }
519 
520     /// Gets the list of queue events that must be triggered whenever the VM writes to
521     /// `virtio::NOTIFY_REG_OFFSET` past the MMIO base. Each event must be triggered when the
522     /// value being written equals the index of the event in this list.
523     fn queue_evts(&self) -> &[EventFd] {
524         self.queue_evts.as_slice()
525     }
526 
527     fn is_driver_ready(&self) -> bool {
528         let ready_bits =
529             (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | DEVICE_FEATURES_OK) as u8;
530         self.common_config.driver_status == ready_bits
531             && self.common_config.driver_status & DEVICE_FAILED as u8 == 0
532     }
533 
534     /// Determines if the driver has requested the device (re)init / reset itself
535     fn is_driver_init(&self) -> bool {
536         self.common_config.driver_status == DEVICE_INIT as u8
537     }
538 
    // This function is used by the caller to provide the expected base address
    // for the virtio-pci configuration BAR.
    pub fn set_config_bar_addr(&mut self, bar_addr: u64) {
        self.settings_bar_addr = Some(GuestAddress(bar_addr));
    }

    // Returns the current guest address of the settings BAR, as programmed
    // into the PCI configuration space.
    pub fn config_bar_addr(&self) -> u64 {
        self.configuration.get_bar_addr(self.settings_bar as usize)
    }
548 
    // Registers the virtio vendor-specific capabilities (common, ISR, device
    // and notification config), the VIRTIO_PCI_CAP_PCI_CFG capability and,
    // when MSI-X is in use, the MSI-X capability. All the referenced
    // structures live in the single settings BAR at fixed *_BAR_OFFSET
    // offsets.
    fn add_pci_capabilities(
        &mut self,
        settings_bar: u8,
    ) -> std::result::Result<(), PciDeviceError> {
        // Add pointers to the different configuration structures from the PCI capabilities.
        let common_cap = VirtioPciCap::new(
            PciCapabilityType::CommonConfig,
            settings_bar,
            COMMON_CONFIG_BAR_OFFSET as u32,
            COMMON_CONFIG_SIZE as u32,
        );
        self.configuration
            .add_capability(&common_cap)
            .map_err(PciDeviceError::CapabilitiesSetup)?;

        let isr_cap = VirtioPciCap::new(
            PciCapabilityType::IsrConfig,
            settings_bar,
            ISR_CONFIG_BAR_OFFSET as u32,
            ISR_CONFIG_SIZE as u32,
        );
        self.configuration
            .add_capability(&isr_cap)
            .map_err(PciDeviceError::CapabilitiesSetup)?;

        // TODO(dgreid) - set based on device's configuration size?
        let device_cap = VirtioPciCap::new(
            PciCapabilityType::DeviceConfig,
            settings_bar,
            DEVICE_CONFIG_BAR_OFFSET as u32,
            DEVICE_CONFIG_SIZE as u32,
        );
        self.configuration
            .add_capability(&device_cap)
            .map_err(PciDeviceError::CapabilitiesSetup)?;

        let notify_cap = VirtioPciNotifyCap::new(
            PciCapabilityType::NotifyConfig,
            settings_bar,
            NOTIFICATION_BAR_OFFSET as u32,
            NOTIFICATION_SIZE as u32,
            Le32::from(NOTIFY_OFF_MULTIPLIER),
        );
        self.configuration
            .add_capability(&notify_cap)
            .map_err(PciDeviceError::CapabilitiesSetup)?;

        // Remember where the VIRTIO_PCI_CAP_PCI_CFG capability lands in the
        // config space so accesses to it can be intercepted later.
        // VIRTIO_PCI_CAP_OFFSET accounts for the cap_vndr/cap_next bytes
        // prepended by add_capability().
        let configuration_cap = VirtioPciCfgCap::new();
        self.cap_pci_cfg_info.offset = self
            .configuration
            .add_capability(&configuration_cap)
            .map_err(PciDeviceError::CapabilitiesSetup)?
            + VIRTIO_PCI_CAP_OFFSET;
        self.cap_pci_cfg_info.cap = configuration_cap;

        // MSI-X table and PBA share the same BAR as the virtio structures.
        if self.msix_config.is_some() {
            let msix_cap = MsixCap::new(
                settings_bar,
                self.msix_num,
                MSIX_TABLE_BAR_OFFSET as u32,
                settings_bar,
                MSIX_PBA_BAR_OFFSET as u32,
            );
            self.configuration
                .add_capability(&msix_cap)
                .map_err(PciDeviceError::CapabilitiesSetup)?;
        }

        self.settings_bar = settings_bar;
        Ok(())
    }
620 
621     fn read_cap_pci_cfg(&mut self, offset: usize, mut data: &mut [u8]) {
622         let cap_slice = self.cap_pci_cfg_info.cap.as_slice();
623         let data_len = data.len();
624         let cap_len = cap_slice.len();
625         if offset + data_len > cap_len {
626             error!("Failed to read cap_pci_cfg from config space");
627             return;
628         }
629 
630         if offset < std::mem::size_of::<VirtioPciCap>() {
631             if let Some(end) = offset.checked_add(data_len) {
632                 // This write can't fail, offset and end are checked against config_len.
633                 data.write_all(&cap_slice[offset..cmp::min(end, cap_len)])
634                     .unwrap();
635             }
636         } else {
637             // Safe since we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long.
638             let bar_offset: u32 =
639                 unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) };
640             self.read_bar(0, bar_offset as u64, data)
641         }
642     }
643 
644     fn write_cap_pci_cfg(&mut self, offset: usize, data: &[u8]) -> Option<Arc<Barrier>> {
645         let cap_slice = self.cap_pci_cfg_info.cap.as_mut_slice();
646         let data_len = data.len();
647         let cap_len = cap_slice.len();
648         if offset + data_len > cap_len {
649             error!("Failed to write cap_pci_cfg to config space");
650             return None;
651         }
652 
653         if offset < std::mem::size_of::<VirtioPciCap>() {
654             let (_, right) = cap_slice.split_at_mut(offset);
655             right[..data_len].copy_from_slice(data);
656             None
657         } else {
658             // Safe since we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long.
659             let bar_offset: u32 =
660                 unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) };
661             self.write_bar(0, bar_offset as u64, data)
662         }
663     }
664 
665     pub fn virtio_device(&self) -> Arc<Mutex<dyn VirtioDevice>> {
666         self.device.clone()
667     }
668 
669     fn activate(&mut self) -> ActivateResult {
670         if let Some(virtio_interrupt) = self.virtio_interrupt.take() {
671             if self.memory.is_some() {
672                 let mem = self.memory.as_ref().unwrap().clone();
673                 let mut device = self.device.lock().unwrap();
674                 let mut queue_evts = Vec::new();
675                 let mut queues = self.queues.clone();
676                 queues.retain(|q| q.ready);
677                 for (i, queue) in queues.iter().enumerate() {
678                     queue_evts.push(self.queue_evts[i].try_clone().unwrap());
679                     if !queue.is_valid(&mem.memory()) {
680                         error!("Queue {} is not valid", i);
681                     }
682                 }
683                 return device.activate(mem, virtio_interrupt, queues, queue_evts);
684             }
685         }
686         Ok(())
687     }
688 
689     pub fn maybe_activate(&mut self) {
690         if self.needs_activation() {
691             self.activate().expect("Failed to activate device");
692             self.device_activated.store(true, Ordering::SeqCst);
693             info!("{}: Waiting for barrier", self.id);
694             self.activate_barrier.wait();
695             info!("{}: Barrier released", self.id);
696         } else {
697             info!("{}: Device does not need activation", self.id)
698         }
699     }
700 
701     fn needs_activation(&self) -> bool {
702         !self.device_activated.load(Ordering::SeqCst) && self.is_driver_ready()
703     }
704 }
705 
706 impl VirtioTransport for VirtioPciDevice {
707     fn ioeventfds(&self, base_addr: u64) -> Vec<(&EventFd, u64)> {
708         let notify_base = base_addr + NOTIFICATION_BAR_OFFSET;
709         self.queue_evts()
710             .iter()
711             .enumerate()
712             .map(|(i, event)| {
713                 (
714                     event,
715                     notify_base + i as u64 * u64::from(NOTIFY_OFF_MULTIPLIER),
716                 )
717             })
718             .collect()
719     }
720 }
721 
/// MSI-X backed implementation of the virtio interrupt interface.
pub struct VirtioInterruptMsix {
    // Shared MSI-X table/PBA state (masking, pending bits).
    msix_config: Arc<Mutex<MsixConfig>>,
    // Vector the driver assigned to configuration-change interrupts.
    config_vector: Arc<AtomicU16>,
    // Underlying interrupt delivery mechanism, one interrupt per vector.
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}
727 
728 impl VirtioInterruptMsix {
729     pub fn new(
730         msix_config: Arc<Mutex<MsixConfig>>,
731         config_vector: Arc<AtomicU16>,
732         interrupt_source_group: Arc<dyn InterruptSourceGroup>,
733     ) -> Self {
734         VirtioInterruptMsix {
735             msix_config,
736             config_vector,
737             interrupt_source_group,
738         }
739     }
740 }
741 
742 impl VirtioInterrupt for VirtioInterruptMsix {
743     fn trigger(
744         &self,
745         int_type: &VirtioInterruptType,
746         queue: Option<&Queue>,
747     ) -> std::result::Result<(), std::io::Error> {
748         let vector = match int_type {
749             VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire),
750             VirtioInterruptType::Queue => {
751                 if let Some(q) = queue {
752                     q.vector
753                 } else {
754                     0
755                 }
756             }
757         };
758 
759         if vector == VIRTIO_MSI_NO_VECTOR {
760             return Ok(());
761         }
762 
763         let config = &mut self.msix_config.lock().unwrap();
764         let entry = &config.table_entries[vector as usize];
765         // In case the vector control register associated with the entry
766         // has its first bit set, this means the vector is masked and the
767         // device should not inject the interrupt.
768         // Instead, the Pending Bit Array table is updated to reflect there
769         // is a pending interrupt for this specific vector.
770         if config.masked() || entry.masked() {
771             config.set_pba_bit(vector, false);
772             return Ok(());
773         }
774 
775         self.interrupt_source_group
776             .trigger(vector as InterruptIndex)
777     }
778 
779     fn notifier(&self, int_type: &VirtioInterruptType, queue: Option<&Queue>) -> Option<EventFd> {
780         let vector = match int_type {
781             VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire),
782             VirtioInterruptType::Queue => {
783                 if let Some(q) = queue {
784                     q.vector
785                 } else {
786                     0
787                 }
788             }
789         };
790 
791         self.interrupt_source_group
792             .notifier(vector as InterruptIndex)
793     }
794 }
795 
796 impl PciDevice for VirtioPciDevice {
797     fn write_config_register(
798         &mut self,
799         reg_idx: usize,
800         offset: u64,
801         data: &[u8],
802     ) -> Option<Arc<Barrier>> {
803         // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG
804         // is accessed. This capability has a special meaning as it allows the
805         // guest to access other capabilities without mapping the PCI BAR.
806         let base = reg_idx * 4;
807         if base + offset as usize >= self.cap_pci_cfg_info.offset
808             && base + offset as usize + data.len()
809                 <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len()
810         {
811             let offset = base + offset as usize - self.cap_pci_cfg_info.offset;
812             self.write_cap_pci_cfg(offset, data)
813         } else {
814             self.configuration
815                 .write_config_register(reg_idx, offset, data);
816             None
817         }
818     }
819 
820     fn read_config_register(&mut self, reg_idx: usize) -> u32 {
821         // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG
822         // is accessed. This capability has a special meaning as it allows the
823         // guest to access other capabilities without mapping the PCI BAR.
824         let base = reg_idx * 4;
825         if base >= self.cap_pci_cfg_info.offset
826             && base + 4 <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len()
827         {
828             let offset = base - self.cap_pci_cfg_info.offset;
829             let mut data = [0u8; 4];
830             self.read_cap_pci_cfg(offset, &mut data);
831             u32::from_le_bytes(data)
832         } else {
833             self.configuration.read_reg(reg_idx)
834         }
835     }
836 
    // Delegates BAR reprogramming detection to the generic PCI configuration,
    // which compares the written value against the current BAR registers.
    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.configuration.detect_bar_reprogramming(reg_idx, data)
    }
844 
845     fn allocate_bars(
846         &mut self,
847         allocator: &mut SystemAllocator,
848     ) -> std::result::Result<Vec<(GuestAddress, GuestUsize, PciBarRegionType)>, PciDeviceError>
849     {
850         let mut ranges = Vec::new();
851         let device_clone = self.device.clone();
852         let device = device_clone.lock().unwrap();
853 
854         // Allocate the virtio-pci capability BAR.
855         // See http://docs.oasis-open.org/virtio/virtio/v1.0/cs04/virtio-v1.0-cs04.html#x1-740004
856         let (virtio_pci_bar_addr, region_type) = if self.use_64bit_bar {
857             let region_type = PciBarRegionType::Memory64BitRegion;
858             let addr = allocator
859                 .allocate_mmio_addresses(
860                     self.settings_bar_addr,
861                     CAPABILITY_BAR_SIZE,
862                     Some(CAPABILITY_BAR_SIZE),
863                 )
864                 .ok_or(PciDeviceError::IoAllocationFailed(CAPABILITY_BAR_SIZE))?;
865             ranges.push((addr, CAPABILITY_BAR_SIZE, region_type));
866             (addr, region_type)
867         } else {
868             let region_type = PciBarRegionType::Memory32BitRegion;
869             let addr = allocator
870                 .allocate_mmio_hole_addresses(
871                     self.settings_bar_addr,
872                     CAPABILITY_BAR_SIZE,
873                     Some(CAPABILITY_BAR_SIZE),
874                 )
875                 .ok_or(PciDeviceError::IoAllocationFailed(CAPABILITY_BAR_SIZE))?;
876             ranges.push((addr, CAPABILITY_BAR_SIZE, region_type));
877             (addr, region_type)
878         };
879         self.bar_regions
880             .push((virtio_pci_bar_addr, CAPABILITY_BAR_SIZE, region_type));
881 
882         let config = PciBarConfiguration::default()
883             .set_register_index(0)
884             .set_address(virtio_pci_bar_addr.raw_value())
885             .set_size(CAPABILITY_BAR_SIZE)
886             .set_region_type(region_type);
887         let virtio_pci_bar =
888             self.configuration.add_pci_bar(&config).map_err(|e| {
889                 PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr.raw_value(), e)
890             })? as u8;
891 
892         // Once the BARs are allocated, the capabilities can be added to the PCI configuration.
893         self.add_pci_capabilities(virtio_pci_bar)?;
894 
895         // Allocate a dedicated BAR if there are some shared memory regions.
896         if let Some(shm_list) = device.get_shm_regions() {
897             let config = PciBarConfiguration::default()
898                 .set_register_index(2)
899                 .set_address(shm_list.addr.raw_value())
900                 .set_size(shm_list.len);
901             let virtio_pci_shm_bar =
902                 self.configuration.add_pci_bar(&config).map_err(|e| {
903                     PciDeviceError::IoRegistrationFailed(shm_list.addr.raw_value(), e)
904                 })? as u8;
905 
906             let region_type = PciBarRegionType::Memory64BitRegion;
907             ranges.push((shm_list.addr, shm_list.len, region_type));
908             self.bar_regions
909                 .push((shm_list.addr, shm_list.len, region_type));
910 
911             for (idx, shm) in shm_list.region_list.iter().enumerate() {
912                 let shm_cap = VirtioPciCap64::new(
913                     PciCapabilityType::SharedMemoryConfig,
914                     virtio_pci_shm_bar,
915                     idx as u8,
916                     shm.offset,
917                     shm.len,
918                 );
919                 self.configuration
920                     .add_capability(&shm_cap)
921                     .map_err(PciDeviceError::CapabilitiesSetup)?;
922             }
923         }
924 
925         Ok(ranges)
926     }
927 
928     fn free_bars(
929         &mut self,
930         allocator: &mut SystemAllocator,
931     ) -> std::result::Result<(), PciDeviceError> {
932         for (addr, length, type_) in self.bar_regions.drain(..) {
933             match type_ {
934                 PciBarRegionType::Memory32BitRegion => {
935                     allocator.free_mmio_hole_addresses(addr, length);
936                 }
937                 PciBarRegionType::Memory64BitRegion => {
938                     allocator.free_mmio_addresses(addr, length);
939                 }
940                 _ => error!("Unexpected PCI bar type"),
941             }
942         }
943         Ok(())
944     }
945 
946     fn move_bar(&mut self, old_base: u64, new_base: u64) -> result::Result<(), std::io::Error> {
947         // We only update our idea of the bar in order to support free_bars() above.
948         // The majority of the reallocation is done inside DeviceManager.
949         for (addr, _, _) in self.bar_regions.iter_mut() {
950             if (*addr).0 == old_base {
951                 *addr = GuestAddress(new_base);
952             }
953         }
954 
955         Ok(())
956     }
957 
958     fn read_bar(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
959         match offset {
960             o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.read(
961                 o - COMMON_CONFIG_BAR_OFFSET,
962                 data,
963                 &mut self.queues,
964                 self.device.clone(),
965             ),
966             o if ISR_CONFIG_BAR_OFFSET <= o && o < ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE => {
967                 if let Some(v) = data.get_mut(0) {
968                     // Reading this register resets it to 0.
969                     *v = self.interrupt_status.swap(0, Ordering::AcqRel) as u8;
970                 }
971             }
972             o if DEVICE_CONFIG_BAR_OFFSET <= o
973                 && o < DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE =>
974             {
975                 let device = self.device.lock().unwrap();
976                 device.read_config(o - DEVICE_CONFIG_BAR_OFFSET, data);
977             }
978             o if NOTIFICATION_BAR_OFFSET <= o
979                 && o < NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE =>
980             {
981                 // Handled with ioeventfds.
982             }
983             o if MSIX_TABLE_BAR_OFFSET <= o && o < MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE => {
984                 if let Some(msix_config) = &self.msix_config {
985                     msix_config
986                         .lock()
987                         .unwrap()
988                         .read_table(o - MSIX_TABLE_BAR_OFFSET, data);
989                 }
990             }
991             o if MSIX_PBA_BAR_OFFSET <= o && o < MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE => {
992                 if let Some(msix_config) = &self.msix_config {
993                     msix_config
994                         .lock()
995                         .unwrap()
996                         .read_pba(o - MSIX_PBA_BAR_OFFSET, data);
997                 }
998             }
999             _ => (),
1000         }
1001     }
1002 
    /// Dispatches a guest write to the capability BAR, then reacts to the
    /// side effects a write may have triggered: returns an activation
    /// barrier when the driver just made the device activatable, and
    /// performs a device reset when the driver cleared the status.
    fn write_bar(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            // Virtio common configuration structure (may change driver
            // status, queue setup, feature negotiation, ...).
            o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.write(
                o - COMMON_CONFIG_BAR_OFFSET,
                data,
                &mut self.queues,
                self.device.clone(),
            ),
            // ISR status: the written bits are cleared from the interrupt
            // status (write-1-to-clear via fetch_and with the complement).
            o if ISR_CONFIG_BAR_OFFSET <= o && o < ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE => {
                if let Some(v) = data.get(0) {
                    self.interrupt_status
                        .fetch_and(!(*v as usize), Ordering::AcqRel);
                }
            }
            // Device-specific configuration space.
            o if DEVICE_CONFIG_BAR_OFFSET <= o
                && o < DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE =>
            {
                let mut device = self.device.lock().unwrap();
                device.write_config(o - DEVICE_CONFIG_BAR_OFFSET, data);
            }
            // Queue notification area.
            o if NOTIFICATION_BAR_OFFSET <= o
                && o < NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE =>
            {
                // Handled with ioeventfds.
            }
            // MSI-X table.
            o if MSIX_TABLE_BAR_OFFSET <= o && o < MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE => {
                if let Some(msix_config) = &self.msix_config {
                    msix_config
                        .lock()
                        .unwrap()
                        .write_table(o - MSIX_TABLE_BAR_OFFSET, data);
                }
            }
            // MSI-X pending bit array.
            o if MSIX_PBA_BAR_OFFSET <= o && o < MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE => {
                if let Some(msix_config) = &self.msix_config {
                    msix_config
                        .lock()
                        .unwrap()
                        .write_pba(o - MSIX_PBA_BAR_OFFSET, data);
                }
            }
            // Writes outside any known structure are ignored.
            _ => (),
        };

        // Try and activate the device if the driver status has changed
        if self.needs_activation() {
            info!(
                "{}: Needs activation; writing to activate event fd",
                self.id
            );
            self.activate_evt.write(1).ok();
            info!("{}: Needs activation; returning barrier", self.id);
            // The caller waits on this barrier until the activation
            // (triggered via activate_evt above) has completed.
            return Some(self.activate_barrier.clone());
        }

        // Device has been reset by the driver
        if self.device_activated.load(Ordering::SeqCst) && self.is_driver_init() {
            let mut device = self.device.lock().unwrap();
            if let Some(virtio_interrupt) = device.reset() {
                // Upon reset the device returns its interrupt EventFD
                self.virtio_interrupt = Some(virtio_interrupt);
                self.device_activated.store(false, Ordering::SeqCst);

                // Reset queue readiness (changes queue_enable), queue sizes
                // and selected_queue as per spec for reset
                self.queues.iter_mut().for_each(Queue::reset);
                self.common_config.queue_select = 0;
            } else {
                // A device that cannot reset is marked as failed so the
                // driver can observe the broken state.
                error!("Attempt to reset device when not implemented in underlying device");
                self.common_config.driver_status = crate::DEVICE_FAILED as u8;
            }
        }

        None
    }
1078 
    /// Exposes the device as `&mut dyn Any` so callers holding a trait
    /// object can downcast back to the concrete type.
    fn as_any(&mut self) -> &mut dyn Any {
        self
    }
1082 }
1083 
impl BusDevice for VirtioPciDevice {
    // Bus accesses for this device are plain BAR accesses, so both
    // operations delegate directly to the BAR handlers.
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}
1093 
impl Pausable for VirtioPciDevice {
    // NOTE(review): both operations are deliberate no-ops; the transport
    // itself appears to have nothing to quiesce here, presumably because
    // pausing the underlying virtio device is handled elsewhere — confirm
    // against the callers of Pausable.
    fn pause(&mut self) -> result::Result<(), MigratableError> {
        Ok(())
    }

    fn resume(&mut self) -> result::Result<(), MigratableError> {
        Ok(())
    }
}
1103 
impl Snapshottable for VirtioPciDevice {
    fn id(&self) -> String {
        self.id.clone()
    }

    /// Captures the transport state: the versioned `VirtioPciDevice` state,
    /// plus nested snapshots for the PCI configuration, the virtio common
    /// configuration and (when present) the MSI-X configuration.
    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut virtio_pci_dev_snapshot =
            Snapshot::new_from_versioned_state(&self.id, &self.state())?;

        // Snapshot PciConfiguration
        virtio_pci_dev_snapshot.add_snapshot(self.configuration.snapshot()?);

        // Snapshot VirtioPciCommonConfig
        virtio_pci_dev_snapshot.add_snapshot(self.common_config.snapshot()?);

        // Snapshot MSI-X
        if let Some(msix_config) = &self.msix_config {
            virtio_pci_dev_snapshot.add_snapshot(msix_config.lock().unwrap().snapshot()?);
        }

        Ok(virtio_pci_dev_snapshot)
    }

    /// Restores the transport from `snapshot`. Ordering matters: the nested
    /// MSI-X / common-config / PCI-config snapshots and the versioned
    /// device state (including virtqueue status) are restored before the
    /// device is activated, so the worker threads start from a consistent
    /// state. Fails if the expected snapshot section is missing.
    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        if let Some(virtio_pci_dev_section) =
            snapshot.snapshot_data.get(&format!("{}-section", self.id))
        {
            // Restore MSI-X
            if let Some(msix_config) = &self.msix_config {
                let id = msix_config.lock().unwrap().id();
                if let Some(msix_snapshot) = snapshot.snapshots.get(&id) {
                    msix_config
                        .lock()
                        .unwrap()
                        .restore(*msix_snapshot.clone())?;
                }
            }

            // Restore VirtioPciCommonConfig
            if let Some(virtio_config_snapshot) = snapshot.snapshots.get(&self.common_config.id()) {
                self.common_config
                    .restore(*virtio_config_snapshot.clone())?;
            }

            // Restore PciConfiguration
            if let Some(pci_config_snapshot) = snapshot.snapshots.get(&self.configuration.id()) {
                self.configuration.restore(*pci_config_snapshot.clone())?;
            }

            // First restore the status of the virtqueues.
            self.set_state(&virtio_pci_dev_section.to_versioned_state()?)
                .map_err(|e| {
                    MigratableError::Restore(anyhow!(
                        "Could not restore VIRTIO_PCI_DEVICE state {:?}",
                        e
                    ))
                })?;

            // Then we can activate the device, as we know at this point that
            // the virtqueues are in the right state and the device is ready
            // to be activated, which will spawn each virtio worker thread.
            if self.device_activated.load(Ordering::SeqCst) && self.is_driver_ready() {
                self.activate().map_err(|e| {
                    MigratableError::Restore(anyhow!("Failed activating the device: {:?}", e))
                })?;
            }

            return Ok(());
        }

        Err(MigratableError::Restore(anyhow!(
            "Could not find VIRTIO_PCI_DEVICE snapshot section"
        )))
    }
}
// Marker-style impls: the traits' default method implementations are
// sufficient for this device.
impl Transportable for VirtioPciDevice {}
impl Migratable for VirtioPciDevice {}
1181