xref: /cloud-hypervisor/virtio-devices/src/transport/pci_device.rs (revision 7d7bfb2034001d4cb15df2ddc56d2d350c8da30f)
1 // Copyright 2018 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE-BSD-3-Clause file.
4 //
5 // Copyright © 2019 Intel Corporation
6 //
7 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
8 
9 use super::VirtioPciCommonConfig;
10 use crate::transport::VirtioTransport;
11 use crate::GuestMemoryMmap;
12 use crate::{
13     ActivateResult, VirtioDevice, VirtioDeviceType, VirtioInterrupt, VirtioInterruptType,
14     DEVICE_ACKNOWLEDGE, DEVICE_DRIVER, DEVICE_DRIVER_OK, DEVICE_FAILED, DEVICE_FEATURES_OK,
15     DEVICE_INIT,
16 };
17 use anyhow::anyhow;
18 use libc::EFD_NONBLOCK;
19 use pci::{
20     BarReprogrammingParams, MsixCap, MsixConfig, PciBarConfiguration, PciBarRegionType,
21     PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, PciDevice, PciDeviceError,
22     PciHeaderType, PciMassStorageSubclass, PciNetworkControllerSubclass, PciSubclass,
23 };
24 use std::any::Any;
25 use std::cmp;
26 use std::io::Write;
27 use std::result;
28 use std::sync::atomic::{AtomicBool, AtomicU16, AtomicUsize, Ordering};
29 use std::sync::{Arc, Barrier, Mutex};
30 use versionize::{VersionMap, Versionize, VersionizeResult};
31 use versionize_derive::Versionize;
32 use virtio_queue::{Error as QueueError, Queue};
33 use vm_allocator::{AddressAllocator, SystemAllocator};
34 use vm_device::dma_mapping::ExternalDmaMapping;
35 use vm_device::interrupt::{
36     InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
37 };
38 use vm_device::BusDevice;
39 use vm_memory::{Address, ByteValued, GuestAddress, GuestMemoryAtomic, GuestUsize, Le32};
40 use vm_migration::{
41     Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
42 };
43 use vm_virtio::AccessPlatform;
44 use vmm_sys_util::{errno::Result, eventfd::EventFd};
45 
46 /// Vector value used to disable MSI for a queue.
47 const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff;
48 
/// Internal errors for the virtio-pci transport.
#[derive(Debug)]
enum Error {
    /// Failed to retrieve queue ring's index.
    QueueRingIndex(QueueError),
}

/// Structure types exposed through vendor-specific virtio PCI capabilities.
/// The discriminants are the cfg_type values defined by the virtio spec
/// (values 6 and 7 are not used here; 8 is the shared memory region).
#[allow(clippy::enum_variant_names)]
enum PciCapabilityType {
    CommonConfig = 1,
    NotifyConfig = 2,
    IsrConfig = 3,
    DeviceConfig = 4,
    PciConfig = 5,
    SharedMemoryConfig = 8,
}
64 
// This offset represents the 2 bytes omitted from the VirtioPciCap structure
// as they are already handled through add_capability(). These 2 bytes are the
// fields cap_vndr (1 byte) and cap_next (1 byte) defined in the virtio spec.
const VIRTIO_PCI_CAP_OFFSET: usize = 2;

/// On-wire layout of a generic virtio PCI capability (struct virtio_pci_cap),
/// minus the cap_vndr/cap_next header bytes emitted by add_capability().
#[allow(dead_code)]
#[repr(packed)]
#[derive(Clone, Copy, Default)]
struct VirtioPciCap {
    cap_len: u8,      // Generic PCI field: capability length
    cfg_type: u8,     // Identifies the structure.
    pci_bar: u8,      // Where to find it.
    id: u8,           // Multiple capabilities of the same type
    padding: [u8; 2], // Pad to full dword.
    offset: Le32,     // Offset within bar.
    length: Le32,     // Length of the structure, in bytes.
}
// SAFETY: All members are simple numbers and any value is valid.
unsafe impl ByteValued for VirtioPciCap {}

impl PciCapability for VirtioPciCap {
    fn bytes(&self) -> &[u8] {
        self.as_slice()
    }

    fn id(&self) -> PciCapabilityId {
        PciCapabilityId::VendorSpecific
    }
}
94 
// The 2 header bytes (cap_vndr, cap_next) prepended by add_capability() must
// still be counted in the advertised capability length.
const VIRTIO_PCI_CAP_LEN_OFFSET: u8 = 2;

impl VirtioPciCap {
    /// Builds a capability describing a structure of `length` bytes located
    /// at `offset` within BAR `pci_bar`.
    pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, offset: u32, length: u32) -> Self {
        VirtioPciCap {
            // cap_len covers the full on-wire capability, including the two
            // header bytes not present in this struct.
            cap_len: (std::mem::size_of::<VirtioPciCap>() as u8) + VIRTIO_PCI_CAP_LEN_OFFSET,
            cfg_type: cfg_type as u8,
            pci_bar,
            id: 0,
            padding: [0; 2],
            offset: Le32::from(offset),
            length: Le32::from(length),
        }
    }
}
110 
/// Notification capability (struct virtio_pci_notify_cap): a VirtioPciCap
/// followed by the multiplier used to compute per-queue notify addresses.
#[allow(dead_code)]
#[repr(packed)]
#[derive(Clone, Copy, Default)]
struct VirtioPciNotifyCap {
    cap: VirtioPciCap,
    notify_off_multiplier: Le32,
}
// SAFETY: All members are simple numbers and any value is valid.
unsafe impl ByteValued for VirtioPciNotifyCap {}

impl PciCapability for VirtioPciNotifyCap {
    fn bytes(&self) -> &[u8] {
        self.as_slice()
    }

    fn id(&self) -> PciCapabilityId {
        PciCapabilityId::VendorSpecific
    }
}
130 
131 impl VirtioPciNotifyCap {
132     pub fn new(
133         cfg_type: PciCapabilityType,
134         pci_bar: u8,
135         offset: u32,
136         length: u32,
137         multiplier: Le32,
138     ) -> Self {
139         VirtioPciNotifyCap {
140             cap: VirtioPciCap {
141                 cap_len: (std::mem::size_of::<VirtioPciNotifyCap>() as u8)
142                     + VIRTIO_PCI_CAP_LEN_OFFSET,
143                 cfg_type: cfg_type as u8,
144                 pci_bar,
145                 id: 0,
146                 padding: [0; 2],
147                 offset: Le32::from(offset),
148                 length: Le32::from(length),
149             },
150             notify_off_multiplier: multiplier,
151         }
152     }
153 }
154 
/// 64-bit capability variant (struct virtio_pci_cap64): carries the high
/// dwords of the offset and length on top of an embedded VirtioPciCap.
#[allow(dead_code)]
#[repr(packed)]
#[derive(Clone, Copy, Default)]
struct VirtioPciCap64 {
    cap: VirtioPciCap,
    offset_hi: Le32,
    length_hi: Le32,
}
// SAFETY: All members are simple numbers and any value is valid.
unsafe impl ByteValued for VirtioPciCap64 {}

impl PciCapability for VirtioPciCap64 {
    fn bytes(&self) -> &[u8] {
        self.as_slice()
    }

    fn id(&self) -> PciCapabilityId {
        PciCapabilityId::VendorSpecific
    }
}
175 
176 impl VirtioPciCap64 {
177     pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, id: u8, offset: u64, length: u64) -> Self {
178         VirtioPciCap64 {
179             cap: VirtioPciCap {
180                 cap_len: (std::mem::size_of::<VirtioPciCap64>() as u8) + VIRTIO_PCI_CAP_LEN_OFFSET,
181                 cfg_type: cfg_type as u8,
182                 pci_bar,
183                 id,
184                 padding: [0; 2],
185                 offset: Le32::from(offset as u32),
186                 length: Le32::from(length as u32),
187             },
188             offset_hi: Le32::from((offset >> 32) as u32),
189             length_hi: Le32::from((length >> 32) as u32),
190         }
191     }
192 }
193 
/// Capability VIRTIO_PCI_CAP_PCI_CFG (struct virtio_pci_cfg_cap): exposes a
/// 4-byte window (`pci_cfg_data`) into a BAR through PCI config space.
#[allow(dead_code)]
#[repr(packed)]
#[derive(Clone, Copy, Default)]
struct VirtioPciCfgCap {
    cap: VirtioPciCap,
    pci_cfg_data: [u8; 4],
}
// SAFETY: All members are simple numbers and any value is valid.
unsafe impl ByteValued for VirtioPciCfgCap {}

impl PciCapability for VirtioPciCfgCap {
    fn bytes(&self) -> &[u8] {
        self.as_slice()
    }

    fn id(&self) -> PciCapabilityId {
        PciCapabilityId::VendorSpecific
    }
}

impl VirtioPciCfgCap {
    /// Builds an empty PciConfig capability; the guest programs the
    /// bar/offset/length fields before using the pci_cfg_data window.
    fn new() -> Self {
        VirtioPciCfgCap {
            cap: VirtioPciCap::new(PciCapabilityType::PciConfig, 0, 0, 0),
            ..Default::default()
        }
    }
}

/// Bookkeeping for the VIRTIO_PCI_CAP_PCI_CFG capability: its offset within
/// PCI configuration space and the current capability contents.
#[derive(Clone, Copy, Default)]
struct VirtioPciCfgCapInfo {
    offset: usize,
    cap: VirtioPciCfgCap,
}
228 
/// PCI subclass used for virtio devices that don't map to one of the
/// standard class codes handled in `VirtioPciDevice::new`.
#[allow(dead_code)]
#[derive(Copy, Clone)]
pub enum PciVirtioSubclass {
    NonTransitionalBase = 0xff,
}

impl PciSubclass for PciVirtioSubclass {
    fn get_register_value(&self) -> u8 {
        *self as u8
    }
}
240 
// Allocate one bar for the structs pointed to by the capability structures.
// As per the PCI specification, because the same BAR shares MSI-X and non
// MSI-X structures, it is recommended to use 8KiB alignment for all those
// structures (hence the 0x2000 spacing between the offsets below).
const COMMON_CONFIG_BAR_OFFSET: u64 = 0x0000;
const COMMON_CONFIG_SIZE: u64 = 56;
const ISR_CONFIG_BAR_OFFSET: u64 = 0x2000;
const ISR_CONFIG_SIZE: u64 = 1;
const DEVICE_CONFIG_BAR_OFFSET: u64 = 0x4000;
const DEVICE_CONFIG_SIZE: u64 = 0x1000;
const NOTIFICATION_BAR_OFFSET: u64 = 0x6000;
const NOTIFICATION_SIZE: u64 = 0x1000;
const MSIX_TABLE_BAR_OFFSET: u64 = 0x8000;
// The size is 256KiB because the table can hold up to 2048 entries, with each
// entry being 128 bits (4 DWORDS).
const MSIX_TABLE_SIZE: u64 = 0x40000;
// The PBA starts right after the table: 0x8000 + 0x40000.
const MSIX_PBA_BAR_OFFSET: u64 = 0x48000;
// The size is 2KiB because the Pending Bit Array has one bit per vector and it
// can support up to 2048 vectors.
const MSIX_PBA_SIZE: u64 = 0x800;
// The BAR size must be a power of 2.
const CAPABILITY_BAR_SIZE: u64 = 0x80000;

const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address.

// Vendor ID shared by all virtio PCI devices.
const VIRTIO_PCI_VENDOR_ID: u16 = 0x1af4;
const VIRTIO_PCI_DEVICE_ID_BASE: u16 = 0x1040; // Add to device type to get device ID.
268 
/// Serializable per-queue state: sizes, ready flag and guest addresses of
/// the three virtqueue rings.
#[derive(Versionize)]
struct QueueState {
    max_size: u16,
    size: u16,
    ready: bool,
    desc_table: u64,
    avail_ring: u64,
    used_ring: u64,
}

/// Serializable transport state used for snapshot/restore (see `state()` /
/// `set_state()` on VirtioPciDevice).
#[derive(Versionize)]
struct VirtioPciDeviceState {
    device_activated: bool,
    queues: Vec<QueueState>,
    interrupt_status: usize,
}

impl VersionMapped for VirtioPciDeviceState {}
287 
/// PCI transport for a virtio device: owns the PCI configuration space, the
/// virtio common configuration, MSI-X state, the virtqueues and their
/// notification eventfds.
pub struct VirtioPciDevice {
    // Identifier of this device instance (used in log messages).
    id: String,

    // PCI configuration registers.
    configuration: PciConfiguration,

    // virtio PCI common configuration
    common_config: VirtioPciCommonConfig,

    // MSI-X config (None when msix_num == 0)
    msix_config: Option<Arc<Mutex<MsixConfig>>>,

    // Number of MSI-X vectors
    msix_num: u16,

    // Virtio device reference and status
    device: Arc<Mutex<dyn VirtioDevice>>,
    device_activated: Arc<AtomicBool>,

    // PCI interrupts.
    interrupt_status: Arc<AtomicUsize>,
    virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,

    // virtio queues
    queues: Vec<Queue<GuestMemoryAtomic<GuestMemoryMmap>>>,
    queue_evts: Vec<EventFd>,

    // Guest memory (cloned into the device on activation)
    memory: Option<GuestMemoryAtomic<GuestMemoryMmap>>,

    // Settings PCI BAR: index and (optional) caller-provided base address
    settings_bar: u8,
    settings_bar_addr: Option<GuestAddress>,

    // Whether to use 64-bit bar location or 32-bit
    use_64bit_bar: bool,

    // Add a dedicated structure to hold information about the very specific
    // virtio-pci capability VIRTIO_PCI_CAP_PCI_CFG. This is needed to support
    // the legacy/backward compatible mechanism of letting the guest access the
    // other virtio capabilities without mapping the PCI BARs. This can be
    // needed when the guest tries to early access the virtio configuration of
    // a device.
    cap_pci_cfg_info: VirtioPciCfgCapInfo,

    // Details of bar regions to free
    bar_regions: Vec<(GuestAddress, GuestUsize, PciBarRegionType)>,

    // EventFd to signal on to request activation
    activate_evt: EventFd,

    // Barrier that is used to wait on for activation
    activate_barrier: Arc<Barrier>,

    // Optional DMA handler
    dma_handler: Option<Arc<dyn ExternalDmaMapping>>,
}
346 
347 impl VirtioPciDevice {
    /// Constructs a new PCI transport for the given virtio device.
    ///
    /// Creates one notification eventfd per virtqueue, the MSI-X state (when
    /// `msix_num` > 0), the PCI configuration space, and the virtio common
    /// configuration, then wires up MSI-X based interrupt delivery.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        id: String,
        memory: GuestMemoryAtomic<GuestMemoryMmap>,
        device: Arc<Mutex<dyn VirtioDevice>>,
        msix_num: u16,
        access_platform: Option<Arc<dyn AccessPlatform>>,
        interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
        pci_device_bdf: u32,
        activate_evt: EventFd,
        use_64bit_bar: bool,
        dma_handler: Option<Arc<dyn ExternalDmaMapping>>,
    ) -> Result<Self> {
        let device_clone = device.clone();
        let mut locked_device = device_clone.lock().unwrap();
        // One notification eventfd per virtqueue.
        let mut queue_evts = Vec::new();
        for _ in locked_device.queue_max_sizes().iter() {
            queue_evts.push(EventFd::new(EFD_NONBLOCK)?)
        }
        let num_queues = locked_device.queue_max_sizes().len();

        if let Some(access_platform) = &access_platform {
            locked_device.set_access_platform(access_platform.clone());
        }

        // One queue per advertised max size, each bound to guest memory.
        let queues = locked_device
            .queue_max_sizes()
            .iter()
            .map(|&s| {
                Queue::<GuestMemoryAtomic<GuestMemoryMmap>, virtio_queue::QueueState>::new(
                    memory.clone(),
                    s,
                )
            })
            .collect();

        // Device ID is the virtio device type offset into the modern ID space.
        let pci_device_id = VIRTIO_PCI_DEVICE_ID_BASE + locked_device.device_type() as u16;

        let interrupt_source_group = interrupt_manager.create_group(MsiIrqGroupConfig {
            base: 0,
            count: msix_num as InterruptIndex,
        })?;

        let (msix_config, msix_config_clone) = if msix_num > 0 {
            let msix_config = Arc::new(Mutex::new(MsixConfig::new(
                msix_num,
                interrupt_source_group.clone(),
                pci_device_bdf,
            )));
            let msix_config_clone = msix_config.clone();
            (Some(msix_config), Some(msix_config_clone))
        } else {
            (None, None)
        };

        // Advertise a meaningful PCI class for the device types that have
        // one; everything else falls back to the generic virtio subclass.
        let (class, subclass) = match VirtioDeviceType::from(locked_device.device_type()) {
            VirtioDeviceType::Net => (
                PciClassCode::NetworkController,
                &PciNetworkControllerSubclass::EthernetController as &dyn PciSubclass,
            ),
            VirtioDeviceType::Block => (
                PciClassCode::MassStorage,
                &PciMassStorageSubclass::MassStorage as &dyn PciSubclass,
            ),
            _ => (
                PciClassCode::Other,
                &PciVirtioSubclass::NonTransitionalBase as &dyn PciSubclass,
            ),
        };

        let configuration = PciConfiguration::new(
            VIRTIO_PCI_VENDOR_ID,
            pci_device_id,
            0x1, // For modern virtio-PCI devices
            class,
            subclass,
            None,
            PciHeaderType::Device,
            VIRTIO_PCI_VENDOR_ID,
            pci_device_id,
            msix_config_clone,
        );

        let mut virtio_pci_device = VirtioPciDevice {
            id,
            configuration,
            common_config: VirtioPciCommonConfig {
                access_platform,
                driver_status: 0,
                config_generation: 0,
                device_feature_select: 0,
                driver_feature_select: 0,
                queue_select: 0,
                // All vectors start unassigned until the driver programs them.
                msix_config: Arc::new(AtomicU16::new(VIRTQ_MSI_NO_VECTOR)),
                msix_queues: Arc::new(Mutex::new(vec![VIRTQ_MSI_NO_VECTOR; num_queues])),
            },
            msix_config,
            msix_num,
            device,
            device_activated: Arc::new(AtomicBool::new(false)),
            interrupt_status: Arc::new(AtomicUsize::new(0)),
            virtio_interrupt: None,
            queues,
            queue_evts,
            memory: Some(memory),
            settings_bar: 0,
            settings_bar_addr: None,
            use_64bit_bar,
            interrupt_source_group,
            cap_pci_cfg_info: VirtioPciCfgCapInfo::default(),
            bar_regions: vec![],
            activate_evt,
            // Barrier of 2: maybe_activate() waits on it after activation
            // until released by a second party (outside this file).
            activate_barrier: Arc::new(Barrier::new(2)),
            dma_handler,
        };

        if let Some(msix_config) = &virtio_pci_device.msix_config {
            virtio_pci_device.virtio_interrupt = Some(Arc::new(VirtioInterruptMsix::new(
                msix_config.clone(),
                virtio_pci_device.common_config.msix_config.clone(),
                virtio_pci_device.common_config.msix_queues.clone(),
                virtio_pci_device.interrupt_source_group.clone(),
            )));
        }

        Ok(virtio_pci_device)
    }
476 
477     fn state(&self) -> VirtioPciDeviceState {
478         VirtioPciDeviceState {
479             device_activated: self.device_activated.load(Ordering::Acquire),
480             interrupt_status: self.interrupt_status.load(Ordering::Acquire),
481             queues: self
482                 .queues
483                 .iter()
484                 .map(|q| QueueState {
485                     max_size: q.max_size(),
486                     size: q.state.size,
487                     ready: q.state.ready,
488                     desc_table: q.state.desc_table.0,
489                     avail_ring: q.state.avail_ring.0,
490                     used_ring: q.state.used_ring.0,
491                 })
492                 .collect(),
493         }
494     }
495 
    /// Restores transport state previously captured by `state()`.
    ///
    /// Panics if `state` contains fewer queue entries than this device has
    /// queues (direct indexing below).
    fn set_state(&mut self, state: &VirtioPciDeviceState) -> std::result::Result<(), Error> {
        self.device_activated
            .store(state.device_activated, Ordering::Release);
        self.interrupt_status
            .store(state.interrupt_status, Ordering::Release);

        // Update virtqueues indexes for both available and used rings.
        for (i, queue) in self.queues.iter_mut().enumerate() {
            queue.state.size = state.queues[i].size;
            queue.state.ready = state.queues[i].ready;
            queue.state.desc_table = GuestAddress(state.queues[i].desc_table);
            queue.state.avail_ring = GuestAddress(state.queues[i].avail_ring);
            queue.state.used_ring = GuestAddress(state.queues[i].used_ring);
            // NOTE(review): next_avail is restored from the *used* ring index,
            // which assumes every available descriptor had been consumed before
            // the snapshot was taken — confirm against the snapshot/pause path.
            queue.set_next_avail(
                queue
                    .used_idx(Ordering::Acquire)
                    .map_err(Error::QueueRingIndex)?
                    .0,
            );
            queue.set_next_used(
                queue
                    .used_idx(Ordering::Acquire)
                    .map_err(Error::QueueRingIndex)?
                    .0,
            );
        }

        Ok(())
    }
525 
    /// Gets the list of queue events that must be triggered whenever the VM writes to
    /// `virtio::NOTIFY_REG_OFFSET` past the MMIO base. Each event must be triggered when the
    /// value being written equals the index of the event in this list.
    fn queue_evts(&self) -> &[EventFd] {
        self.queue_evts.as_slice()
    }

    /// Returns true once the driver has set exactly ACKNOWLEDGE, DRIVER,
    /// FEATURES_OK and DRIVER_OK (FAILED therefore cannot be set).
    fn is_driver_ready(&self) -> bool {
        let ready_bits =
            (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | DEVICE_FEATURES_OK) as u8;
        self.common_config.driver_status == ready_bits
            && self.common_config.driver_status & DEVICE_FAILED as u8 == 0
    }

    /// Determines if the driver has requested the device (re)init / reset itself
    fn is_driver_init(&self) -> bool {
        self.common_config.driver_status == DEVICE_INIT as u8
    }

    // This function is used by the caller to provide the expected base address
    // for the virtio-pci configuration BAR.
    pub fn set_config_bar_addr(&mut self, bar_addr: u64) {
        self.settings_bar_addr = Some(GuestAddress(bar_addr));
    }

    /// Returns the guest address currently programmed for the settings
    /// (capability) BAR.
    pub fn config_bar_addr(&self) -> u64 {
        self.configuration.get_bar_addr(self.settings_bar as usize)
    }
554 
    /// Registers every virtio PCI capability (common/ISR/device/notify and
    /// VIRTIO_PCI_CAP_PCI_CFG, plus MSI-X when enabled), all pointing into
    /// `settings_bar`.
    fn add_pci_capabilities(
        &mut self,
        settings_bar: u8,
    ) -> std::result::Result<(), PciDeviceError> {
        // Add pointers to the different configuration structures from the PCI capabilities.
        let common_cap = VirtioPciCap::new(
            PciCapabilityType::CommonConfig,
            settings_bar,
            COMMON_CONFIG_BAR_OFFSET as u32,
            COMMON_CONFIG_SIZE as u32,
        );
        self.configuration
            .add_capability(&common_cap)
            .map_err(PciDeviceError::CapabilitiesSetup)?;

        let isr_cap = VirtioPciCap::new(
            PciCapabilityType::IsrConfig,
            settings_bar,
            ISR_CONFIG_BAR_OFFSET as u32,
            ISR_CONFIG_SIZE as u32,
        );
        self.configuration
            .add_capability(&isr_cap)
            .map_err(PciDeviceError::CapabilitiesSetup)?;

        // TODO(dgreid) - set based on device's configuration size?
        let device_cap = VirtioPciCap::new(
            PciCapabilityType::DeviceConfig,
            settings_bar,
            DEVICE_CONFIG_BAR_OFFSET as u32,
            DEVICE_CONFIG_SIZE as u32,
        );
        self.configuration
            .add_capability(&device_cap)
            .map_err(PciDeviceError::CapabilitiesSetup)?;

        let notify_cap = VirtioPciNotifyCap::new(
            PciCapabilityType::NotifyConfig,
            settings_bar,
            NOTIFICATION_BAR_OFFSET as u32,
            NOTIFICATION_SIZE as u32,
            Le32::from(NOTIFY_OFF_MULTIPLIER),
        );
        self.configuration
            .add_capability(&notify_cap)
            .map_err(PciDeviceError::CapabilitiesSetup)?;

        let configuration_cap = VirtioPciCfgCap::new();
        // Remember where this capability lives in config space so accesses
        // can be intercepted; the 2-byte adjustment skips the cap_vndr and
        // cap_next header bytes emitted by add_capability().
        self.cap_pci_cfg_info.offset = self
            .configuration
            .add_capability(&configuration_cap)
            .map_err(PciDeviceError::CapabilitiesSetup)?
            + VIRTIO_PCI_CAP_OFFSET;
        self.cap_pci_cfg_info.cap = configuration_cap;

        if self.msix_config.is_some() {
            // MSI-X table and PBA share the settings BAR at fixed offsets.
            let msix_cap = MsixCap::new(
                settings_bar,
                self.msix_num,
                MSIX_TABLE_BAR_OFFSET as u32,
                settings_bar,
                MSIX_PBA_BAR_OFFSET as u32,
            );
            self.configuration
                .add_capability(&msix_cap)
                .map_err(PciDeviceError::CapabilitiesSetup)?;
        }

        self.settings_bar = settings_bar;
        Ok(())
    }
626 
627     fn read_cap_pci_cfg(&mut self, offset: usize, mut data: &mut [u8]) {
628         let cap_slice = self.cap_pci_cfg_info.cap.as_slice();
629         let data_len = data.len();
630         let cap_len = cap_slice.len();
631         if offset + data_len > cap_len {
632             error!("Failed to read cap_pci_cfg from config space");
633             return;
634         }
635 
636         if offset < std::mem::size_of::<VirtioPciCap>() {
637             if let Some(end) = offset.checked_add(data_len) {
638                 // This write can't fail, offset and end are checked against config_len.
639                 data.write_all(&cap_slice[offset..cmp::min(end, cap_len)])
640                     .unwrap();
641             }
642         } else {
643             // Safe since we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long.
644             let bar_offset: u32 =
645                 unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) };
646             self.read_bar(0, bar_offset as u64, data)
647         }
648     }
649 
650     fn write_cap_pci_cfg(&mut self, offset: usize, data: &[u8]) -> Option<Arc<Barrier>> {
651         let cap_slice = self.cap_pci_cfg_info.cap.as_mut_slice();
652         let data_len = data.len();
653         let cap_len = cap_slice.len();
654         if offset + data_len > cap_len {
655             error!("Failed to write cap_pci_cfg to config space");
656             return None;
657         }
658 
659         if offset < std::mem::size_of::<VirtioPciCap>() {
660             let (_, right) = cap_slice.split_at_mut(offset);
661             right[..data_len].copy_from_slice(data);
662             None
663         } else {
664             // Safe since we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long.
665             let bar_offset: u32 =
666                 unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) };
667             self.write_bar(0, bar_offset as u64, data)
668         }
669     }
670 
    /// Returns a shared handle to the underlying virtio device.
    pub fn virtio_device(&self) -> Arc<Mutex<dyn VirtioDevice>> {
        self.device.clone()
    }

    /// Activates the underlying device with the ready queues and their
    /// eventfds. Takes `virtio_interrupt`, so a second call is a no-op.
    fn activate(&mut self) -> ActivateResult {
        if let Some(virtio_interrupt) = self.virtio_interrupt.take() {
            if self.memory.is_some() {
                let mem = self.memory.as_ref().unwrap().clone();
                let mut device = self.device.lock().unwrap();
                let mut queue_evts = Vec::new();
                let mut queues = self.queues.clone();
                // Only hand over the queues the driver marked ready.
                queues.retain(|q| q.state.ready);
                for (i, queue) in queues.iter().enumerate() {
                    queue_evts.push(self.queue_evts[i].try_clone().unwrap());
                    // NOTE(review): an invalid queue is only logged here and
                    // activation still proceeds — confirm this is intended.
                    if !queue.is_valid() {
                        error!("Queue {} is not valid", i);
                    }
                }
                return device.activate(mem, virtio_interrupt, queues, queue_evts);
            }
        }
        Ok(())
    }

    /// Activates the device if the driver is ready and it is not already
    /// active, then waits on the activation barrier.
    pub fn maybe_activate(&mut self) {
        if self.needs_activation() {
            self.activate().expect("Failed to activate device");
            self.device_activated.store(true, Ordering::SeqCst);
            info!("{}: Waiting for barrier", self.id);
            self.activate_barrier.wait();
            info!("{}: Barrier released", self.id);
        } else {
            info!("{}: Device does not need activation", self.id)
        }
    }

    /// True when the driver has completed initialization (DRIVER_OK) but the
    /// device has not yet been activated.
    fn needs_activation(&self) -> bool {
        !self.device_activated.load(Ordering::SeqCst) && self.is_driver_ready()
    }

    /// Returns the optional external DMA mapping handler.
    pub fn dma_handler(&self) -> Option<&Arc<dyn ExternalDmaMapping>> {
        self.dma_handler.as_ref()
    }
714 }
715 
716 impl VirtioTransport for VirtioPciDevice {
717     fn ioeventfds(&self, base_addr: u64) -> Vec<(&EventFd, u64)> {
718         let notify_base = base_addr + NOTIFICATION_BAR_OFFSET;
719         self.queue_evts()
720             .iter()
721             .enumerate()
722             .map(|(i, event)| {
723                 (
724                     event,
725                     notify_base + i as u64 * u64::from(NOTIFY_OFF_MULTIPLIER),
726                 )
727             })
728             .collect()
729     }
730 }
731 
/// MSI-X based implementation of `VirtioInterrupt`: maps config/queue
/// interrupts to MSI-X vectors and honors masking via the Pending Bit Array.
pub struct VirtioInterruptMsix {
    // Shared MSI-X table/PBA state.
    msix_config: Arc<Mutex<MsixConfig>>,
    // Vector the driver assigned to configuration-change interrupts.
    config_vector: Arc<AtomicU16>,
    // Per-queue vector assignments.
    queues_vectors: Arc<Mutex<Vec<u16>>>,
    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}

impl VirtioInterruptMsix {
    /// Creates a new MSI-X interrupt delivery object from the shared state
    /// owned by the transport.
    pub fn new(
        msix_config: Arc<Mutex<MsixConfig>>,
        config_vector: Arc<AtomicU16>,
        queues_vectors: Arc<Mutex<Vec<u16>>>,
        interrupt_source_group: Arc<dyn InterruptSourceGroup>,
    ) -> Self {
        VirtioInterruptMsix {
            msix_config,
            config_vector,
            queues_vectors,
            interrupt_source_group,
        }
    }
}
754 
impl VirtioInterrupt for VirtioInterruptMsix {
    /// Injects the MSI-X interrupt associated with `int_type`, unless the
    /// vector is unassigned or masked (in which case the PBA is updated).
    fn trigger(&self, int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> {
        // Look up the vector the driver programmed for this interrupt source.
        let vector = match int_type {
            VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire),
            VirtioInterruptType::Queue(queue_index) => {
                self.queues_vectors.lock().unwrap()[queue_index as usize]
            }
        };

        // No vector assigned: silently drop the interrupt.
        if vector == VIRTQ_MSI_NO_VECTOR {
            return Ok(());
        }

        let config = &mut self.msix_config.lock().unwrap();
        let entry = &config.table_entries[vector as usize];
        // In case the vector control register associated with the entry
        // has its first bit set, this means the vector is masked and the
        // device should not inject the interrupt.
        // Instead, the Pending Bit Array table is updated to reflect there
        // is a pending interrupt for this specific vector.
        if config.masked() || entry.masked() {
            config.set_pba_bit(vector, false);
            return Ok(());
        }

        self.interrupt_source_group
            .trigger(vector as InterruptIndex)
    }

    /// Returns the eventfd backing the interrupt for `int_type`, if any.
    fn notifier(&self, int_type: VirtioInterruptType) -> Option<EventFd> {
        let vector = match int_type {
            VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire),
            VirtioInterruptType::Queue(queue_index) => {
                self.queues_vectors.lock().unwrap()[queue_index as usize]
            }
        };

        self.interrupt_source_group
            .notifier(vector as InterruptIndex)
    }
}
796 
797 impl PciDevice for VirtioPciDevice {
    /// Handles a guest write to PCI config space, intercepting writes that
    /// fall entirely within the VIRTIO_PCI_CAP_PCI_CFG capability.
    fn write_config_register(
        &mut self,
        reg_idx: usize,
        offset: u64,
        data: &[u8],
    ) -> Option<Arc<Barrier>> {
        // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG
        // is accessed. This capability has a special meaning as it allows the
        // guest to access other capabilities without mapping the PCI BAR.
        let base = reg_idx * 4;
        if base + offset as usize >= self.cap_pci_cfg_info.offset
            && base + offset as usize + data.len()
                <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len()
        {
            // The access is fully contained in the capability: intercept it.
            let offset = base + offset as usize - self.cap_pci_cfg_info.offset;
            self.write_cap_pci_cfg(offset, data)
        } else {
            self.configuration
                .write_config_register(reg_idx, offset, data);
            None
        }
    }
820 
    /// Handles a guest read of a PCI config register, intercepting reads
    /// that fall entirely within the VIRTIO_PCI_CAP_PCI_CFG capability.
    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
        // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG
        // is accessed. This capability has a special meaning as it allows the
        // guest to access other capabilities without mapping the PCI BAR.
        let base = reg_idx * 4;
        if base >= self.cap_pci_cfg_info.offset
            && base + 4 <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len()
        {
            let offset = base - self.cap_pci_cfg_info.offset;
            let mut data = [0u8; 4];
            self.read_cap_pci_cfg(offset, &mut data);
            // Config space registers are little-endian.
            u32::from_le_bytes(data)
        } else {
            self.configuration.read_reg(reg_idx)
        }
    }

    /// Delegates BAR reprogramming detection to the generic PCI
    /// configuration handling.
    fn detect_bar_reprogramming(
        &mut self,
        reg_idx: usize,
        data: &[u8],
    ) -> Option<BarReprogrammingParams> {
        self.configuration.detect_bar_reprogramming(reg_idx, data)
    }
845 
846     fn allocate_bars(
847         &mut self,
848         allocator: &Arc<Mutex<SystemAllocator>>,
849         mmio_allocator: &mut AddressAllocator,
850     ) -> std::result::Result<Vec<(GuestAddress, GuestUsize, PciBarRegionType)>, PciDeviceError>
851     {
852         let mut ranges = Vec::new();
853         let device_clone = self.device.clone();
854         let device = device_clone.lock().unwrap();
855 
856         // Allocate the virtio-pci capability BAR.
857         // See http://docs.oasis-open.org/virtio/virtio/v1.0/cs04/virtio-v1.0-cs04.html#x1-740004
858         let (virtio_pci_bar_addr, region_type) = if self.use_64bit_bar {
859             let region_type = PciBarRegionType::Memory64BitRegion;
860             let addr = mmio_allocator
861                 .allocate(
862                     self.settings_bar_addr,
863                     CAPABILITY_BAR_SIZE,
864                     Some(CAPABILITY_BAR_SIZE),
865                 )
866                 .ok_or(PciDeviceError::IoAllocationFailed(CAPABILITY_BAR_SIZE))?;
867             ranges.push((addr, CAPABILITY_BAR_SIZE, region_type));
868             (addr, region_type)
869         } else {
870             let region_type = PciBarRegionType::Memory32BitRegion;
871             let addr = allocator
872                 .lock()
873                 .unwrap()
874                 .allocate_mmio_hole_addresses(
875                     self.settings_bar_addr,
876                     CAPABILITY_BAR_SIZE,
877                     Some(CAPABILITY_BAR_SIZE),
878                 )
879                 .ok_or(PciDeviceError::IoAllocationFailed(CAPABILITY_BAR_SIZE))?;
880             ranges.push((addr, CAPABILITY_BAR_SIZE, region_type));
881             (addr, region_type)
882         };
883         self.bar_regions
884             .push((virtio_pci_bar_addr, CAPABILITY_BAR_SIZE, region_type));
885 
886         let config = PciBarConfiguration::default()
887             .set_register_index(0)
888             .set_address(virtio_pci_bar_addr.raw_value())
889             .set_size(CAPABILITY_BAR_SIZE)
890             .set_region_type(region_type);
891         let virtio_pci_bar =
892             self.configuration.add_pci_bar(&config).map_err(|e| {
893                 PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr.raw_value(), e)
894             })? as u8;
895 
896         // Once the BARs are allocated, the capabilities can be added to the PCI configuration.
897         self.add_pci_capabilities(virtio_pci_bar)?;
898 
899         // Allocate a dedicated BAR if there are some shared memory regions.
900         if let Some(shm_list) = device.get_shm_regions() {
901             let config = PciBarConfiguration::default()
902                 .set_register_index(2)
903                 .set_address(shm_list.addr.raw_value())
904                 .set_size(shm_list.len);
905             let virtio_pci_shm_bar =
906                 self.configuration.add_pci_bar(&config).map_err(|e| {
907                     PciDeviceError::IoRegistrationFailed(shm_list.addr.raw_value(), e)
908                 })? as u8;
909 
910             let region_type = PciBarRegionType::Memory64BitRegion;
911             ranges.push((shm_list.addr, shm_list.len, region_type));
912             self.bar_regions
913                 .push((shm_list.addr, shm_list.len, region_type));
914 
915             for (idx, shm) in shm_list.region_list.iter().enumerate() {
916                 let shm_cap = VirtioPciCap64::new(
917                     PciCapabilityType::SharedMemoryConfig,
918                     virtio_pci_shm_bar,
919                     idx as u8,
920                     shm.offset,
921                     shm.len,
922                 );
923                 self.configuration
924                     .add_capability(&shm_cap)
925                     .map_err(PciDeviceError::CapabilitiesSetup)?;
926             }
927         }
928 
929         Ok(ranges)
930     }
931 
932     fn free_bars(
933         &mut self,
934         allocator: &mut SystemAllocator,
935         mmio_allocator: &mut AddressAllocator,
936     ) -> std::result::Result<(), PciDeviceError> {
937         for (addr, length, type_) in self.bar_regions.drain(..) {
938             match type_ {
939                 PciBarRegionType::Memory32BitRegion => {
940                     allocator.free_mmio_hole_addresses(addr, length);
941                 }
942                 PciBarRegionType::Memory64BitRegion => {
943                     mmio_allocator.free(addr, length);
944                 }
945                 _ => error!("Unexpected PCI bar type"),
946             }
947         }
948         Ok(())
949     }
950 
951     fn move_bar(&mut self, old_base: u64, new_base: u64) -> result::Result<(), std::io::Error> {
952         // We only update our idea of the bar in order to support free_bars() above.
953         // The majority of the reallocation is done inside DeviceManager.
954         for (addr, _, _) in self.bar_regions.iter_mut() {
955             if (*addr).0 == old_base {
956                 *addr = GuestAddress(new_base);
957             }
958         }
959 
960         Ok(())
961     }
962 
963     fn read_bar(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
964         match offset {
965             o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.read(
966                 o - COMMON_CONFIG_BAR_OFFSET,
967                 data,
968                 &mut self.queues,
969                 self.device.clone(),
970             ),
971             o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => {
972                 if let Some(v) = data.get_mut(0) {
973                     // Reading this register resets it to 0.
974                     *v = self.interrupt_status.swap(0, Ordering::AcqRel) as u8;
975                 }
976             }
977             o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE)
978                 .contains(&o) =>
979             {
980                 let device = self.device.lock().unwrap();
981                 device.read_config(o - DEVICE_CONFIG_BAR_OFFSET, data);
982             }
983             o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE)
984                 .contains(&o) =>
985             {
986                 // Handled with ioeventfds.
987             }
988             o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => {
989                 if let Some(msix_config) = &self.msix_config {
990                     msix_config
991                         .lock()
992                         .unwrap()
993                         .read_table(o - MSIX_TABLE_BAR_OFFSET, data);
994                 }
995             }
996             o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => {
997                 if let Some(msix_config) = &self.msix_config {
998                     msix_config
999                         .lock()
1000                         .unwrap()
1001                         .read_pba(o - MSIX_PBA_BAR_OFFSET, data);
1002                 }
1003             }
1004             _ => (),
1005         }
1006     }
1007 
    // Handle a guest write to the device BAR: dispatch on `offset` to the
    // virtio structure mapped there, then deal with the activation/reset
    // side effects the write may have triggered via the driver status.
    fn write_bar(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            // Virtio common configuration (feature negotiation, driver
            // status, queue setup).
            o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.write(
                o - COMMON_CONFIG_BAR_OFFSET,
                data,
                &mut self.queues,
                self.device.clone(),
            ),
            // ISR status register: clear the interrupt status bits that are
            // set in the written value.
            o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => {
                if let Some(v) = data.get(0) {
                    self.interrupt_status
                        .fetch_and(!(*v as usize), Ordering::AcqRel);
                }
            }
            // Device-specific configuration space.
            o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE)
                .contains(&o) =>
            {
                let mut device = self.device.lock().unwrap();
                device.write_config(o - DEVICE_CONFIG_BAR_OFFSET, data);
            }
            o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE)
                .contains(&o) =>
            {
                // Handled with ioeventfds.
                error!("Unexpected write to notification BAR: offset = 0x{:x}", o);
            }
            // MSI-X table entries.
            o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => {
                if let Some(msix_config) = &self.msix_config {
                    msix_config
                        .lock()
                        .unwrap()
                        .write_table(o - MSIX_TABLE_BAR_OFFSET, data);
                }
            }
            // MSI-X Pending Bit Array.
            o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => {
                if let Some(msix_config) = &self.msix_config {
                    msix_config
                        .lock()
                        .unwrap()
                        .write_pba(o - MSIX_PBA_BAR_OFFSET, data);
                }
            }
            _ => (),
        };

        // Try and activate the device if the driver status has changed
        if self.needs_activation() {
            info!(
                "{}: Needs activation; writing to activate event fd",
                self.id
            );
            self.activate_evt.write(1).ok();
            info!("{}: Needs activation; returning barrier", self.id);
            // Hand the caller a barrier so this access is serialized with
            // the activation being performed on the activate_evt listener.
            return Some(self.activate_barrier.clone());
        }

        // Device has been reset by the driver
        if self.device_activated.load(Ordering::SeqCst) && self.is_driver_init() {
            let mut device = self.device.lock().unwrap();
            if let Some(virtio_interrupt) = device.reset() {
                // Upon reset the device returns its interrupt EventFD
                self.virtio_interrupt = Some(virtio_interrupt);
                self.device_activated.store(false, Ordering::SeqCst);

                // Reset queue readiness (changes queue_enable), queue sizes
                // and selected_queue as per spec for reset
                self.queues.iter_mut().for_each(Queue::reset);
                self.common_config.queue_select = 0;
            } else {
                error!("Attempt to reset device when not implemented in underlying device");
                self.common_config.driver_status = crate::DEVICE_FAILED as u8;
            }
        }

        None
    }
1084 
    // Allow callers holding a PciDevice trait object to downcast it back to
    // the concrete VirtioPciDevice type.
    fn as_any(&mut self) -> &mut dyn Any {
        self
    }
1088 }
1089 
// Bus accesses land on the same BAR handlers used by the PciDevice
// implementation: reads and writes are forwarded untouched.
impl BusDevice for VirtioPciDevice {
    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
        self.read_bar(base, offset, data)
    }

    fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        self.write_bar(base, offset, data)
    }
}
1099 
// No transport-level state needs freezing or thawing here; both operations
// succeed unconditionally. NOTE(review): presumably the wrapped virtio
// device is paused/resumed elsewhere — confirm against the device manager.
impl Pausable for VirtioPciDevice {
    fn pause(&mut self) -> result::Result<(), MigratableError> {
        Ok(())
    }

    fn resume(&mut self) -> result::Result<(), MigratableError> {
        Ok(())
    }
}
1109 
impl Snapshottable for VirtioPciDevice {
    fn id(&self) -> String {
        self.id.clone()
    }

    // Capture the transport state: the versioned VirtioPciDevice state plus
    // nested snapshots for the PCI configuration space, the virtio common
    // configuration and, when present, the MSI-X configuration.
    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut virtio_pci_dev_snapshot =
            Snapshot::new_from_versioned_state(&self.id, &self.state())?;

        // Snapshot PciConfiguration
        virtio_pci_dev_snapshot.add_snapshot(self.configuration.snapshot()?);

        // Snapshot VirtioPciCommonConfig
        virtio_pci_dev_snapshot.add_snapshot(self.common_config.snapshot()?);

        // Snapshot MSI-X
        if let Some(msix_config) = &self.msix_config {
            virtio_pci_dev_snapshot.add_snapshot(msix_config.lock().unwrap().snapshot()?);
        }

        Ok(virtio_pci_dev_snapshot)
    }

    // Restore the nested snapshots first (MSI-X, common config, PCI config),
    // then this device's own versioned state, and finally re-activate the
    // device if it was active when the snapshot was taken.
    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        // The versioned state is stored under the "<id>-section" key by
        // Snapshot::new_from_versioned_state(); its absence means this
        // snapshot does not belong to this device.
        if let Some(virtio_pci_dev_section) =
            snapshot.snapshot_data.get(&format!("{}-section", self.id))
        {
            // Restore MSI-X
            if let Some(msix_config) = &self.msix_config {
                let id = msix_config.lock().unwrap().id();
                if let Some(msix_snapshot) = snapshot.snapshots.get(&id) {
                    msix_config
                        .lock()
                        .unwrap()
                        .restore(*msix_snapshot.clone())?;
                }
            }

            // Restore VirtioPciCommonConfig
            if let Some(virtio_config_snapshot) = snapshot.snapshots.get(&self.common_config.id()) {
                self.common_config
                    .restore(*virtio_config_snapshot.clone())?;
            }

            // Restore PciConfiguration
            if let Some(pci_config_snapshot) = snapshot.snapshots.get(&self.configuration.id()) {
                self.configuration.restore(*pci_config_snapshot.clone())?;
            }

            // First restore the status of the virtqueues.
            self.set_state(&virtio_pci_dev_section.to_versioned_state()?)
                .map_err(|e| {
                    MigratableError::Restore(anyhow!(
                        "Could not restore VIRTIO_PCI_DEVICE state {:?}",
                        e
                    ))
                })?;

            // Then we can activate the device, as we know at this point that
            // the virtqueues are in the right state and the device is ready
            // to be activated, which will spawn each virtio worker thread.
            if self.device_activated.load(Ordering::SeqCst) && self.is_driver_ready() {
                self.activate().map_err(|e| {
                    MigratableError::Restore(anyhow!("Failed activating the device: {:?}", e))
                })?;
            }

            return Ok(());
        }

        Err(MigratableError::Restore(anyhow!(
            "Could not find VIRTIO_PCI_DEVICE snapshot section"
        )))
    }
}
// Rely on the traits' default implementations; the device's migration data
// is produced and consumed through the Snapshottable implementation above.
impl Transportable for VirtioPciDevice {}
impl Migratable for VirtioPciDevice {}
1187