xref: /cloud-hypervisor/vmm/src/device_manager.rs (revision ed63b352d1ebf70f36c7d36a0d6b52fc96186581)
1 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 //
3 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
4 // Use of this source code is governed by a BSD-style license that can be
5 // found in the LICENSE-BSD-3-Clause file.
6 //
7 // Copyright © 2019 Intel Corporation
8 //
9 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
10 //
11 
12 use std::collections::{BTreeMap, BTreeSet, HashMap};
13 use std::fs::{File, OpenOptions};
14 use std::io::{self, stdout, IsTerminal, Seek, SeekFrom};
15 use std::num::Wrapping;
16 use std::os::unix::fs::OpenOptionsExt;
17 use std::os::unix::io::{AsRawFd, FromRawFd};
18 use std::path::PathBuf;
19 use std::result;
20 use std::sync::{Arc, Mutex};
21 #[cfg(not(target_arch = "riscv64"))]
22 use std::time::Instant;
23 
24 use acpi_tables::sdt::GenericAddress;
25 #[cfg(not(target_arch = "riscv64"))]
26 use acpi_tables::{aml, Aml};
27 #[cfg(not(target_arch = "riscv64"))]
28 use anyhow::anyhow;
29 #[cfg(target_arch = "x86_64")]
30 use arch::layout::{APIC_START, IOAPIC_SIZE, IOAPIC_START};
31 use arch::{layout, NumaNodes};
32 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
33 use arch::{DeviceType, MmioDeviceInfo};
34 use block::async_io::DiskFile;
35 use block::fixed_vhd_sync::FixedVhdDiskSync;
36 use block::qcow_sync::QcowDiskSync;
37 use block::raw_async_aio::RawFileDiskAio;
38 use block::raw_sync::RawFileDiskSync;
39 use block::vhdx_sync::VhdxDiskSync;
40 use block::{
41     block_aio_is_supported, block_io_uring_is_supported, detect_image_type, qcow, vhdx, ImageType,
42 };
43 #[cfg(feature = "io_uring")]
44 use block::{fixed_vhd_async::FixedVhdDiskAsync, raw_async::RawFileDisk};
45 #[cfg(target_arch = "riscv64")]
46 use devices::aia;
47 #[cfg(target_arch = "x86_64")]
48 use devices::debug_console;
49 #[cfg(target_arch = "x86_64")]
50 use devices::debug_console::DebugConsole;
51 #[cfg(target_arch = "aarch64")]
52 use devices::gic;
53 use devices::interrupt_controller::InterruptController;
54 #[cfg(target_arch = "x86_64")]
55 use devices::ioapic;
56 #[cfg(target_arch = "aarch64")]
57 use devices::legacy::Pl011;
58 #[cfg(any(target_arch = "x86_64", target_arch = "riscv64"))]
59 use devices::legacy::Serial;
60 #[cfg(feature = "pvmemcontrol")]
61 use devices::pvmemcontrol::{PvmemcontrolBusDevice, PvmemcontrolPciDevice};
62 use devices::{interrupt_controller, AcpiNotificationFlags};
63 #[cfg(target_arch = "aarch64")]
64 use hypervisor::arch::aarch64::regs::AARCH64_PMU_IRQ;
65 use hypervisor::IoEventAddress;
66 use libc::{
67     tcsetattr, termios, MAP_NORESERVE, MAP_PRIVATE, MAP_SHARED, O_TMPFILE, PROT_READ, PROT_WRITE,
68     TCSANOW,
69 };
70 use pci::{
71     DeviceRelocation, MmioRegion, PciBarRegionType, PciBdf, PciDevice, VfioDmaMapping,
72     VfioPciDevice, VfioUserDmaMapping, VfioUserPciDevice, VfioUserPciDeviceError,
73 };
74 use rate_limiter::group::RateLimiterGroup;
75 use seccompiler::SeccompAction;
76 use serde::{Deserialize, Serialize};
77 use thiserror::Error;
78 use tracer::trace_scoped;
79 use vfio_ioctls::{VfioContainer, VfioDevice, VfioDeviceFd};
80 use virtio_devices::transport::{VirtioPciDevice, VirtioPciDeviceActivator, VirtioTransport};
81 use virtio_devices::vhost_user::VhostUserConfig;
82 use virtio_devices::{
83     AccessPlatformMapping, ActivateError, Block, Endpoint, IommuMapping, VdpaDmaMapping,
84     VirtioMemMappingSource,
85 };
86 use vm_allocator::{AddressAllocator, SystemAllocator};
87 use vm_device::dma_mapping::ExternalDmaMapping;
88 use vm_device::interrupt::{
89     InterruptIndex, InterruptManager, LegacyIrqGroupConfig, MsiIrqGroupConfig,
90 };
91 use vm_device::{Bus, BusDevice, BusDeviceSync, Resource};
92 use vm_memory::guest_memory::FileOffset;
93 use vm_memory::{Address, GuestAddress, GuestMemoryRegion, GuestUsize, MmapRegion};
94 #[cfg(target_arch = "x86_64")]
95 use vm_memory::{GuestAddressSpace, GuestMemory};
96 use vm_migration::protocol::MemoryRangeTable;
97 use vm_migration::{
98     snapshot_from_id, state_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData,
99     Snapshottable, Transportable,
100 };
101 use vm_virtio::{AccessPlatform, VirtioDeviceType};
102 use vmm_sys_util::eventfd::EventFd;
103 
104 use crate::console_devices::{ConsoleDeviceError, ConsoleInfo, ConsoleOutput};
105 use crate::cpu::{CpuManager, CPU_MANAGER_ACPI_SIZE};
106 use crate::device_tree::{DeviceNode, DeviceTree};
107 use crate::interrupt::{LegacyUserspaceInterruptManager, MsiInterruptManager};
108 use crate::memory_manager::{Error as MemoryManagerError, MemoryManager, MEMORY_MANAGER_ACPI_SIZE};
109 use crate::pci_segment::PciSegment;
110 use crate::serial_manager::{Error as SerialManagerError, SerialManager};
111 use crate::vm_config::{
112     ConsoleOutputMode, DeviceConfig, DiskConfig, FsConfig, NetConfig, PmemConfig, UserDeviceConfig,
113     VdpaConfig, VhostMode, VmConfig, VsockConfig, DEFAULT_IOMMU_ADDRESS_WIDTH_BITS,
114     DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT,
115 };
116 use crate::{device_node, GuestRegionMmap, PciDeviceInfo, DEVICE_MANAGER_SNAPSHOT_ID};
117 
// Length constant (0x1000 bytes) that is only compiled for aarch64 and riscv64.
// NOTE(review): presumably the size of the MMIO window reserved per device on
// those architectures - confirm against the users of this constant.
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
const MMIO_LEN: u64 = 0x1000;
120 
// Singleton devices / devices the user cannot name.
// Every identifier in this group is a complete, fixed name carrying a
// double-underscore prefix.
#[cfg(target_arch = "x86_64")]
const IOAPIC_DEVICE_NAME: &str = "__ioapic";
const SERIAL_DEVICE_NAME: &str = "__serial";
#[cfg(target_arch = "x86_64")]
const DEBUGCON_DEVICE_NAME: &str = "__debug_console";
#[cfg(target_arch = "aarch64")]
const GPIO_DEVICE_NAME: &str = "__gpio";
const RNG_DEVICE_NAME: &str = "__rng";
const IOMMU_DEVICE_NAME: &str = "__iommu";
#[cfg(feature = "pvmemcontrol")]
const PVMEMCONTROL_DEVICE_NAME: &str = "__pvmemcontrol";
const BALLOON_DEVICE_NAME: &str = "__balloon";
const CONSOLE_DEVICE_NAME: &str = "__console";
const PVPANIC_DEVICE_NAME: &str = "__pvpanic";
// The watchdog uses a fixed double-underscore name like the other singletons,
// so it belongs to this group rather than to the user-nameable prefixes below.
const WATCHDOG_DEVICE_NAME: &str = "__watchdog";

// Devices that the user may name and for which we generate
// identifiers if the user doesn't give one
const DISK_DEVICE_NAME_PREFIX: &str = "_disk";
const FS_DEVICE_NAME_PREFIX: &str = "_fs";
const NET_DEVICE_NAME_PREFIX: &str = "_net";
const PMEM_DEVICE_NAME_PREFIX: &str = "_pmem";
const VDPA_DEVICE_NAME_PREFIX: &str = "_vdpa";
const VSOCK_DEVICE_NAME_PREFIX: &str = "_vsock";
const VFIO_DEVICE_NAME_PREFIX: &str = "_vfio";
const VFIO_USER_DEVICE_NAME_PREFIX: &str = "_vfio_user";
const VIRTIO_PCI_DEVICE_NAME_PREFIX: &str = "_virtio-pci";
149 
150 /// Errors associated with device manager
151 #[derive(Error, Debug)]
152 pub enum DeviceManagerError {
153     /// Cannot create EventFd.
154     #[error("Cannot create EventFd")]
155     EventFd(#[source] io::Error),
156 
157     /// Cannot open disk path
158     #[error("Cannot open disk path")]
159     Disk(#[source] io::Error),
160 
161     /// Cannot create vhost-user-net device
162     #[error("Cannot create vhost-user-net device")]
163     CreateVhostUserNet(#[source] virtio_devices::vhost_user::Error),
164 
165     /// Cannot create virtio-blk device
166     #[error("Cannot create virtio-blk device")]
167     CreateVirtioBlock(#[source] io::Error),
168 
169     /// Cannot create virtio-net device
170     #[error("Cannot create virtio-net device")]
171     CreateVirtioNet(#[source] virtio_devices::net::Error),
172 
173     /// Cannot create virtio-console device
174     #[error("Cannot create virtio-console device")]
175     CreateVirtioConsole(#[source] io::Error),
176 
177     /// Cannot create virtio-rng device
178     #[error("Cannot create virtio-rng device")]
179     CreateVirtioRng(#[source] io::Error),
180 
181     /// Cannot create virtio-fs device
182     #[error("Cannot create virtio-fs device")]
183     CreateVirtioFs(#[source] virtio_devices::vhost_user::Error),
184 
185     /// Virtio-fs device was created without a socket.
186     #[error("Virtio-fs device was created without a socket")]
187     NoVirtioFsSock,
188 
189     /// Cannot create vhost-user-blk device
190     #[error("Cannot create vhost-user-blk device")]
191     CreateVhostUserBlk(#[source] virtio_devices::vhost_user::Error),
192 
193     /// Cannot create virtio-pmem device
194     #[error("Cannot create virtio-pmem device")]
195     CreateVirtioPmem(#[source] io::Error),
196 
197     /// Cannot create vDPA device
198     #[error("Cannot create vdpa device")]
199     CreateVdpa(#[source] virtio_devices::vdpa::Error),
200 
201     /// Cannot create virtio-vsock device
202     #[error("Cannot create virtio-vsock device")]
203     CreateVirtioVsock(#[source] io::Error),
204 
205     /// Cannot create tpm device
206     #[error("Cannot create tmp device")]
207     CreateTpmDevice(#[source] anyhow::Error),
208 
209     /// Failed to convert Path to &str for the vDPA device.
210     #[error("Failed to convert Path to &str for the vDPA device")]
211     CreateVdpaConvertPath,
212 
213     /// Failed to convert Path to &str for the virtio-vsock device.
214     #[error("Failed to convert Path to &str for the virtio-vsock device")]
215     CreateVsockConvertPath,
216 
217     /// Cannot create virtio-vsock backend
218     #[error("Cannot create virtio-vsock backend")]
219     CreateVsockBackend(#[source] virtio_devices::vsock::VsockUnixError),
220 
221     /// Cannot create virtio-iommu device
222     #[error("Cannot create virtio-iommu device")]
223     CreateVirtioIommu(#[source] io::Error),
224 
225     /// Cannot create virtio-balloon device
226     #[error("Cannot create virtio-balloon device")]
227     CreateVirtioBalloon(#[source] io::Error),
228 
229     /// Cannot create pvmemcontrol device
230     #[cfg(feature = "pvmemcontrol")]
231     #[error("Cannot create pvmemcontrol device")]
232     CreatePvmemcontrol(#[source] io::Error),
233 
234     /// Cannot create virtio-watchdog device
235     #[error("Cannot create virtio-watchdog device")]
236     CreateVirtioWatchdog(#[source] io::Error),
237 
238     /// Failed to parse disk image format
239     #[error("Failed to parse disk image format")]
240     DetectImageType(#[source] io::Error),
241 
242     /// Cannot open qcow disk path
243     #[error("Cannot open qcow disk path")]
244     QcowDeviceCreate(#[source] qcow::Error),
245 
246     /// Cannot create serial manager
247     #[error("Cannot create serial manager")]
248     CreateSerialManager(#[source] SerialManagerError),
249 
250     /// Cannot spawn the serial manager thread
251     #[error("Cannot spawn serial manager thread")]
252     SpawnSerialManager(#[source] SerialManagerError),
253 
254     /// Cannot open tap interface
255     #[error("Cannot open tap interface")]
256     OpenTap(#[source] net_util::TapError),
257 
258     /// Cannot allocate IRQ.
259     #[error("Cannot allocate IRQ")]
260     AllocateIrq,
261 
262     /// Cannot configure the IRQ.
263     #[error("Cannot configure the IRQ")]
264     Irq(#[source] vmm_sys_util::errno::Error),
265 
266     /// Cannot allocate PCI BARs
267     #[error("Cannot allocate PCI BARs")]
268     AllocateBars(#[source] pci::PciDeviceError),
269 
270     /// Could not free the BARs associated with a PCI device.
271     #[error("Could not free the BARs associated with a PCI device")]
272     FreePciBars(#[source] pci::PciDeviceError),
273 
274     /// Cannot register ioevent.
275     #[error("Cannot register ioevent")]
276     RegisterIoevent(#[source] anyhow::Error),
277 
278     /// Cannot unregister ioevent.
279     #[error("Cannot unregister ioevent")]
280     UnRegisterIoevent(#[source] anyhow::Error),
281 
282     /// Cannot create virtio device
283     #[error("Cannot create virtio device")]
284     VirtioDevice(#[source] virtio_devices::transport::VirtioPciDeviceError),
285 
286     /// Cannot add PCI device
287     #[error("Cannot add PCI device")]
288     AddPciDevice(#[source] pci::PciRootError),
289 
290     /// Cannot open persistent memory file
291     #[error("Cannot open persistent memory file")]
292     PmemFileOpen(#[source] io::Error),
293 
294     /// Cannot set persistent memory file size
295     #[error("Cannot set persistent memory file size")]
296     PmemFileSetLen(#[source] io::Error),
297 
298     /// Cannot find a memory range for persistent memory
299     #[error("Cannot find a memory range for persistent memory")]
300     PmemRangeAllocation,
301 
302     /// Cannot find a memory range for virtio-fs
303     #[error("Cannot find a memory range for virtio-fs")]
304     FsRangeAllocation,
305 
306     /// Error creating serial output file
307     #[error("Error creating serial output file")]
308     SerialOutputFileOpen(#[source] io::Error),
309 
310     /// Error creating debug-console output file
311     #[cfg(target_arch = "x86_64")]
312     #[error("Error creating debug-console output file")]
313     DebugconOutputFileOpen(#[source] io::Error),
314 
315     /// Error creating console output file
316     #[error("Error creating console output file")]
317     ConsoleOutputFileOpen(#[source] io::Error),
318 
319     /// Error creating serial pty
320     #[error("Error creating serial pty")]
321     SerialPtyOpen(#[source] io::Error),
322 
323     /// Error creating console pty
324     #[error("Error creating console pty")]
325     ConsolePtyOpen(#[source] io::Error),
326 
327     /// Error creating debugcon pty
328     #[error("Error creating console pty")]
329     DebugconPtyOpen(#[source] io::Error),
330 
331     /// Error setting pty raw mode
332     #[error("Error setting pty raw mode")]
333     SetPtyRaw(#[source] ConsoleDeviceError),
334 
335     /// Error getting pty peer
336     #[error("Error getting pty peer")]
337     GetPtyPeer(#[source] vmm_sys_util::errno::Error),
338 
339     /// Cannot create a VFIO device
340     #[error("Cannot create a VFIO device")]
341     VfioCreate(#[source] vfio_ioctls::VfioError),
342 
343     /// Cannot create a VFIO PCI device
344     #[error("Cannot create a VFIO PCI device")]
345     VfioPciCreate(#[source] pci::VfioPciError),
346 
347     /// Failed to map VFIO MMIO region.
348     #[error("Failed to map VFIO MMIO region")]
349     VfioMapRegion(#[source] pci::VfioPciError),
350 
351     /// Failed to DMA map VFIO device.
352     #[error("Failed to DMA map VFIO device")]
353     VfioDmaMap(#[source] vfio_ioctls::VfioError),
354 
355     /// Failed to DMA unmap VFIO device.
356     #[error("Failed to DMA unmap VFIO device")]
357     VfioDmaUnmap(#[source] pci::VfioPciError),
358 
359     /// Failed to create the passthrough device.
360     #[error("Failed to create the passthrough device")]
361     CreatePassthroughDevice(#[source] anyhow::Error),
362 
363     /// Failed to memory map.
364     #[error("Failed to memory map")]
365     Mmap(#[source] io::Error),
366 
367     /// Cannot add legacy device to Bus.
368     #[error("Cannot add legacy device to Bus")]
369     BusError(#[source] vm_device::BusError),
370 
371     /// Failed to allocate IO port
372     #[error("Failed to allocate IO port")]
373     AllocateIoPort,
374 
375     /// Failed to allocate MMIO address
376     #[error("Failed to allocate MMIO address")]
377     AllocateMmioAddress,
378 
379     /// Failed to make hotplug notification
380     #[error("Failed to make hotplug notification")]
381     HotPlugNotification(#[source] io::Error),
382 
383     /// Error from a memory manager operation
384     #[error("Error from a memory manager operation")]
385     MemoryManager(#[source] MemoryManagerError),
386 
387     /// Failed to create new interrupt source group.
388     #[error("Failed to create new interrupt source group")]
389     CreateInterruptGroup(#[source] io::Error),
390 
391     /// Failed to update interrupt source group.
392     #[error("Failed to update interrupt source group")]
393     UpdateInterruptGroup(#[source] io::Error),
394 
395     /// Failed to create interrupt controller.
396     #[error("Failed to create interrupt controller")]
397     CreateInterruptController(#[source] interrupt_controller::Error),
398 
399     /// Failed to create a new MmapRegion instance.
400     #[error("Failed to create a new MmapRegion instance")]
401     NewMmapRegion(#[source] vm_memory::mmap::MmapRegionError),
402 
403     /// Failed to clone a File.
404     #[error("Failed to clone a File")]
405     CloneFile(#[source] io::Error),
406 
407     /// Failed to create socket file
408     #[error("Failed to create socket file")]
409     CreateSocketFile(#[source] io::Error),
410 
411     /// Failed to spawn the network backend
412     #[error("Failed to spawn the network backend")]
413     SpawnNetBackend(#[source] io::Error),
414 
415     /// Failed to spawn the block backend
416     #[error("Failed to spawn the block backend")]
417     SpawnBlockBackend(#[source] io::Error),
418 
419     /// Missing PCI bus.
420     #[error("Missing PCI bus")]
421     NoPciBus,
422 
423     /// Could not find an available device name.
424     #[error("Could not find an available device name")]
425     NoAvailableDeviceName,
426 
427     /// Missing PCI device.
428     #[error("Missing PCI device")]
429     MissingPciDevice,
430 
431     /// Failed to remove a PCI device from the PCI bus.
432     #[error("Failed to remove a PCI device from the PCI bus")]
433     RemoveDeviceFromPciBus(#[source] pci::PciRootError),
434 
435     /// Failed to remove a bus device from the IO bus.
436     #[error("Failed to remove a bus device from the IO bus")]
437     RemoveDeviceFromIoBus(#[source] vm_device::BusError),
438 
439     /// Failed to remove a bus device from the MMIO bus.
440     #[error("Failed to remove a bus device from the MMIO bus")]
441     RemoveDeviceFromMmioBus(#[source] vm_device::BusError),
442 
443     /// Failed to find the device corresponding to a specific PCI b/d/f.
444     #[error("Failed to find the device corresponding to a specific PCI b/d/f")]
445     UnknownPciBdf(u32),
446 
447     /// Not allowed to remove this type of device from the VM.
448     #[error("Not allowed to remove this type of device from the VM: {0}")]
449     RemovalNotAllowed(vm_virtio::VirtioDeviceType),
450 
451     /// Failed to find device corresponding to the given identifier.
452     #[error("Failed to find device corresponding to the given identifier")]
453     UnknownDeviceId(String),
454 
455     /// Failed to find an available PCI device ID.
456     #[error("Failed to find an available PCI device ID")]
457     NextPciDeviceId(#[source] pci::PciRootError),
458 
459     /// Could not reserve the PCI device ID.
460     #[error("Could not reserve the PCI device ID")]
461     GetPciDeviceId(#[source] pci::PciRootError),
462 
463     /// Could not give the PCI device ID back.
464     #[error("Could not give the PCI device ID back")]
465     PutPciDeviceId(#[source] pci::PciRootError),
466 
467     /// No disk path was specified when one was expected
468     #[error("No disk path was specified when one was expected")]
469     NoDiskPath,
470 
471     /// Failed to update guest memory for virtio device.
472     #[error("Failed to update guest memory for virtio device")]
473     UpdateMemoryForVirtioDevice(#[source] virtio_devices::Error),
474 
475     /// Cannot create virtio-mem device
476     #[error("Cannot create virtio-mem device")]
477     CreateVirtioMem(#[source] io::Error),
478 
479     /// Cannot find a memory range for virtio-mem memory
480     #[error("Cannot find a memory range for virtio-mem memory")]
481     VirtioMemRangeAllocation,
482 
483     /// Failed to update guest memory for VFIO PCI device.
484     #[error("Failed to update guest memory for VFIO PCI device")]
485     UpdateMemoryForVfioPciDevice(#[source] vfio_ioctls::VfioError),
486 
487     /// Trying to use a directory for pmem but no size specified
488     #[error("Trying to use a directory for pmem but no size specified")]
489     PmemWithDirectorySizeMissing,
490 
491     /// Trying to use a size that is not multiple of 2MiB
492     #[error("Trying to use a size that is not multiple of 2MiB")]
493     PmemSizeNotAligned,
494 
495     /// Could not find the node in the device tree.
496     #[error("Could not find the node in the device tree")]
497     MissingNode,
498 
499     /// Resource was already found.
500     #[error("Resource was already found")]
501     ResourceAlreadyExists,
502 
503     /// Expected resources for virtio-pmem could not be found.
504     #[error("Expected resources for virtio-pmem could not be found")]
505     MissingVirtioPmemResources,
506 
507     /// Missing PCI b/d/f from the DeviceNode.
508     #[error("Missing PCI b/d/f from the DeviceNode")]
509     MissingDeviceNodePciBdf,
510 
511     /// No support for device passthrough
512     #[error("No support for device passthrough")]
513     NoDevicePassthroughSupport,
514 
515     /// No socket option support for console device
516     #[error("No socket option support for console device")]
517     NoSocketOptionSupportForConsoleDevice,
518 
519     /// Failed to resize virtio-balloon
520     #[error("Failed to resize virtio-balloon")]
521     VirtioBalloonResize(#[source] virtio_devices::balloon::Error),
522 
523     /// Missing virtio-balloon, can't proceed as expected.
524     #[error("Missing virtio-balloon, can't proceed as expected")]
525     MissingVirtioBalloon,
526 
527     /// Missing virtual IOMMU device
528     #[error("Missing virtual IOMMU device")]
529     MissingVirtualIommu,
530 
531     /// Failed to do power button notification
532     #[error("Failed to do power button notification")]
533     PowerButtonNotification(#[source] io::Error),
534 
535     /// Failed to do AArch64 GPIO power button notification
536     #[cfg(target_arch = "aarch64")]
537     #[error("Failed to do AArch64 GPIO power button notification")]
538     AArch64PowerButtonNotification(#[source] devices::legacy::GpioDeviceError),
539 
540     /// Failed to set O_DIRECT flag to file descriptor
541     #[error("Failed to set O_DIRECT flag to file descriptor")]
542     SetDirectIo,
543 
544     /// Failed to create FixedVhdDiskAsync
545     #[error("Failed to create FixedVhdDiskAsync")]
546     CreateFixedVhdDiskAsync(#[source] io::Error),
547 
548     /// Failed to create FixedVhdDiskSync
549     #[error("Failed to create FixedVhdDiskSync")]
550     CreateFixedVhdDiskSync(#[source] io::Error),
551 
552     /// Failed to create QcowDiskSync
553     #[error("Failed to create QcowDiskSync")]
554     CreateQcowDiskSync(#[source] qcow::Error),
555 
556     /// Failed to create FixedVhdxDiskSync
557     #[error("Failed to create FixedVhdxDiskSync")]
558     CreateFixedVhdxDiskSync(#[source] vhdx::VhdxError),
559 
560     /// Failed to add DMA mapping handler to virtio-mem device.
561     #[error("Failed to add DMA mapping handler to virtio-mem device")]
562     AddDmaMappingHandlerVirtioMem(#[source] virtio_devices::mem::Error),
563 
564     /// Failed to remove DMA mapping handler from virtio-mem device.
565     #[error("Failed to remove DMA mapping handler from virtio-mem device")]
566     RemoveDmaMappingHandlerVirtioMem(#[source] virtio_devices::mem::Error),
567 
568     /// Failed to create vfio-user client
569     #[error("Failed to create vfio-user client")]
570     VfioUserCreateClient(#[source] vfio_user::Error),
571 
572     /// Failed to create VFIO user device
573     #[error("Failed to create VFIO user device")]
574     VfioUserCreate(#[source] VfioUserPciDeviceError),
575 
576     /// Failed to map region from VFIO user device into guest
577     #[error("Failed to map region from VFIO user device into guest")]
578     VfioUserMapRegion(#[source] VfioUserPciDeviceError),
579 
580     /// Failed to DMA map VFIO user device.
581     #[error("Failed to DMA map VFIO user device")]
582     VfioUserDmaMap(#[source] VfioUserPciDeviceError),
583 
584     /// Failed to DMA unmap VFIO user device.
585     #[error("Failed to DMA unmap VFIO user device")]
586     VfioUserDmaUnmap(#[source] VfioUserPciDeviceError),
587 
588     /// Failed to update memory mappings for VFIO user device
589     #[error("Failed to update memory mappings for VFIO user device")]
590     UpdateMemoryForVfioUserPciDevice(#[source] VfioUserPciDeviceError),
591 
592     /// Cannot duplicate file descriptor
593     #[error("Cannot duplicate file descriptor")]
594     DupFd(#[source] vmm_sys_util::errno::Error),
595 
596     /// Failed to DMA map virtio device.
597     #[error("Failed to DMA map virtio device")]
598     VirtioDmaMap(#[source] std::io::Error),
599 
600     /// Failed to DMA unmap virtio device.
601     #[error("Failed to DMA unmap virtio device")]
602     VirtioDmaUnmap(#[source] std::io::Error),
603 
604     /// Cannot hotplug device behind vIOMMU
605     #[error("Cannot hotplug device behind vIOMMU")]
606     InvalidIommuHotplug,
607 
608     /// Invalid identifier as it is not unique.
609     #[error("Invalid identifier as it is not unique: {0}")]
610     IdentifierNotUnique(String),
611 
612     /// Invalid identifier
613     #[error("Invalid identifier: {0}")]
614     InvalidIdentifier(String),
615 
616     /// Error activating virtio device
617     #[error("Error activating virtio device")]
618     VirtioActivate(#[source] ActivateError),
619 
620     /// Failed retrieving device state from snapshot
621     #[error("Failed retrieving device state from snapshot")]
622     RestoreGetState(#[source] MigratableError),
623 
624     /// Cannot create a PvPanic device
625     #[error("Cannot create a PvPanic device")]
626     PvPanicCreate(#[source] devices::pvpanic::PvPanicError),
627 
628     /// Cannot create a RateLimiterGroup
629     #[error("Cannot create a RateLimiterGroup")]
630     RateLimiterGroupCreate(#[source] rate_limiter::group::Error),
631 
632     /// Cannot start sigwinch listener
633     #[error("Cannot start sigwinch listener")]
634     StartSigwinchListener(#[source] std::io::Error),
635 
636     // Invalid console info
637     #[error("Invalid console info")]
638     InvalidConsoleInfo,
639 
640     // Invalid console fd
641     #[error("Invalid console fd")]
642     InvalidConsoleFd,
643 
644     /// Cannot lock images of all block devices.
645     #[error("Cannot lock images of all block devices")]
646     DiskLockError(#[source] virtio_devices::block::Error),
647 }
648 
/// Result alias whose error type is [`DeviceManagerError`].
pub type DeviceManagerResult<T> = result::Result<T, DeviceManagerError>;
650 
// NOTE(review): presumably the size in bytes (0x10) of the device manager's
// ACPI register region - confirm against the users of this constant.
const DEVICE_MANAGER_ACPI_SIZE: usize = 0x10;
652 
/// Holder for an optional virtio console resizer.
///
/// The derived `Default` yields a `Console` with no resizer attached.
#[derive(Default)]
pub struct Console {
    // Shared handle to the resizer; `None` when no resizer is attached.
    console_resizer: Option<Arc<virtio_devices::ConsoleResizer>>,
}
657 
658 impl Console {
659     pub fn need_resize(&self) -> bool {
660         if let Some(_resizer) = self.console_resizer.as_ref() {
661             return true;
662         }
663 
664         false
665     }
666 
667     pub fn update_console_size(&self) {
668         if let Some(resizer) = self.console_resizer.as_ref() {
669             resizer.update_console_size()
670         }
671     }
672 }
673 
/// Shared address-space state consulted when a PCI BAR is relocated
/// (see the `DeviceRelocation` implementation for this type).
pub(crate) struct AddressManager {
    // System-wide allocator; `move_bar` uses it to free and re-allocate IO port ranges.
    pub(crate) allocator: Arc<Mutex<SystemAllocator>>,
    // Port IO bus; its range is updated when an IO BAR moves.
    pub(crate) io_bus: Arc<Bus>,
    // MMIO bus; its range is updated when a 32-bit or 64-bit memory BAR moves.
    pub(crate) mmio_bus: Arc<Bus>,
    // Hypervisor VM handle, used to (un)register ioevents and user memory regions.
    pub(crate) vm: Arc<dyn hypervisor::Vm>,
    // Device tree whose `PciBar` resources are rewritten with the new base address.
    device_tree: Arc<Mutex<DeviceTree>>,
    // Allocators for 32-bit PCI MMIO ranges; `move_bar` searches them for the
    // one whose range contains the BAR's old base.
    pci_mmio32_allocators: Vec<Arc<Mutex<AddressAllocator>>>,
    // Allocators for 64-bit PCI MMIO ranges, searched the same way.
    pci_mmio64_allocators: Vec<Arc<Mutex<AddressAllocator>>>,
}
683 
684 impl DeviceRelocation for AddressManager {
685     fn move_bar(
686         &self,
687         old_base: u64,
688         new_base: u64,
689         len: u64,
690         pci_dev: &mut dyn PciDevice,
691         region_type: PciBarRegionType,
692     ) -> std::result::Result<(), std::io::Error> {
693         match region_type {
694             PciBarRegionType::IoRegion => {
695                 // Update system allocator
696                 self.allocator
697                     .lock()
698                     .unwrap()
699                     .free_io_addresses(GuestAddress(old_base), len as GuestUsize);
700 
701                 self.allocator
702                     .lock()
703                     .unwrap()
704                     .allocate_io_addresses(Some(GuestAddress(new_base)), len as GuestUsize, None)
705                     .ok_or_else(|| io::Error::other("failed allocating new IO range"))?;
706 
707                 // Update PIO bus
708                 self.io_bus
709                     .update_range(old_base, len, new_base, len)
710                     .map_err(io::Error::other)?;
711             }
712             PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => {
713                 let allocators = if region_type == PciBarRegionType::Memory32BitRegion {
714                     &self.pci_mmio32_allocators
715                 } else {
716                     &self.pci_mmio64_allocators
717                 };
718 
719                 // Find the specific allocator that this BAR was allocated from and use it for new one
720                 for allocator in allocators {
721                     let allocator_base = allocator.lock().unwrap().base();
722                     let allocator_end = allocator.lock().unwrap().end();
723 
724                     if old_base >= allocator_base.0 && old_base <= allocator_end.0 {
725                         allocator
726                             .lock()
727                             .unwrap()
728                             .free(GuestAddress(old_base), len as GuestUsize);
729 
730                         allocator
731                             .lock()
732                             .unwrap()
733                             .allocate(Some(GuestAddress(new_base)), len as GuestUsize, Some(len))
734                             .ok_or_else(|| io::Error::other("failed allocating new MMIO range"))?;
735 
736                         break;
737                     }
738                 }
739 
740                 // Update MMIO bus
741                 self.mmio_bus
742                     .update_range(old_base, len, new_base, len)
743                     .map_err(io::Error::other)?;
744             }
745         }
746 
747         // Update the device_tree resources associated with the device
748         if let Some(id) = pci_dev.id() {
749             if let Some(node) = self.device_tree.lock().unwrap().get_mut(&id) {
750                 let mut resource_updated = false;
751                 for resource in node.resources.iter_mut() {
752                     if let Resource::PciBar { base, type_, .. } = resource {
753                         if PciBarRegionType::from(*type_) == region_type && *base == old_base {
754                             *base = new_base;
755                             resource_updated = true;
756                             break;
757                         }
758                     }
759                 }
760 
761                 if !resource_updated {
762                     return Err(io::Error::other(format!(
763                         "Couldn't find a resource with base 0x{old_base:x} for device {id}"
764                     )));
765                 }
766             } else {
767                 return Err(io::Error::other(format!(
768                     "Couldn't find device {id} from device tree"
769                 )));
770             }
771         }
772 
773         let any_dev = pci_dev.as_any_mut();
774         if let Some(virtio_pci_dev) = any_dev.downcast_ref::<VirtioPciDevice>() {
775             let bar_addr = virtio_pci_dev.config_bar_addr();
776             if bar_addr == new_base {
777                 for (event, addr) in virtio_pci_dev.ioeventfds(old_base) {
778                     let io_addr = IoEventAddress::Mmio(addr);
779                     self.vm.unregister_ioevent(event, &io_addr).map_err(|e| {
780                         io::Error::other(format!("failed to unregister ioevent: {e:?}"))
781                     })?;
782                 }
783                 for (event, addr) in virtio_pci_dev.ioeventfds(new_base) {
784                     let io_addr = IoEventAddress::Mmio(addr);
785                     self.vm
786                         .register_ioevent(event, &io_addr, None)
787                         .map_err(|e| {
788                             io::Error::other(format!("failed to register ioevent: {e:?}"))
789                         })?;
790                 }
791             } else {
792                 let virtio_dev = virtio_pci_dev.virtio_device();
793                 let mut virtio_dev = virtio_dev.lock().unwrap();
794                 if let Some(mut shm_regions) = virtio_dev.get_shm_regions() {
795                     if shm_regions.addr.raw_value() == old_base {
796                         let mem_region = self.vm.make_user_memory_region(
797                             shm_regions.mem_slot,
798                             old_base,
799                             shm_regions.len,
800                             shm_regions.host_addr,
801                             false,
802                             false,
803                         );
804 
805                         self.vm.remove_user_memory_region(mem_region).map_err(|e| {
806                             io::Error::other(format!("failed to remove user memory region: {e:?}"))
807                         })?;
808 
809                         // Create new mapping by inserting new region to KVM.
810                         let mem_region = self.vm.make_user_memory_region(
811                             shm_regions.mem_slot,
812                             new_base,
813                             shm_regions.len,
814                             shm_regions.host_addr,
815                             false,
816                             false,
817                         );
818 
819                         self.vm.create_user_memory_region(mem_region).map_err(|e| {
820                             io::Error::other(format!("failed to create user memory regions: {e:?}"))
821                         })?;
822 
823                         // Update shared memory regions to reflect the new mapping.
824                         shm_regions.addr = GuestAddress(new_base);
825                         virtio_dev.set_shm_regions(shm_regions).map_err(|e| {
826                             io::Error::other(format!(
827                                 "failed to update shared memory regions: {e:?}"
828                             ))
829                         })?;
830                     }
831                 }
832             }
833         }
834 
835         pci_dev.move_bar(old_base, new_base)
836     }
837 }
838 
/// Serializable state of the `DeviceManager`.
///
/// It is built by `DeviceManager::state()` and read back from a snapshot in
/// `DeviceManager::new()`.
#[derive(Serialize, Deserialize)]
struct DeviceManagerState {
    // Copy of the device tree at the time the state was captured.
    device_tree: DeviceTree,
    // Counter keeping track of the consumed device IDs.
    device_id_cnt: Wrapping<usize>,
}
844 
/// An open file handle paired with a filesystem path.
///
/// NOTE(review): going by the names, `main` is presumably the main side of a
/// pseudo-terminal and `path` the path of its other end -- confirm against
/// the code that creates `PtyPair` values.
#[derive(Debug)]
pub struct PtyPair {
    // Open file handle; duplicated with `File::try_clone()` when the pair is cloned.
    pub main: File,
    // Path associated with the handle.
    pub path: PathBuf,
}
850 
851 impl Clone for PtyPair {
852     fn clone(&self) -> Self {
853         PtyPair {
854             main: self.main.try_clone().unwrap(),
855             path: self.path.clone(),
856         }
857     }
858 }
859 
/// Handle to a PCI device, with one variant per kind of PCI device wrapped.
///
/// Every variant holds an `Arc<Mutex<_>>`, so cloning a handle yields another
/// reference to the same underlying device rather than a copy of it.
#[derive(Clone)]
pub enum PciDeviceHandle {
    // Handle to a `VfioPciDevice`.
    Vfio(Arc<Mutex<VfioPciDevice>>),
    // Handle to a `VirtioPciDevice`.
    Virtio(Arc<Mutex<VirtioPciDevice>>),
    // Handle to a `VfioUserPciDevice`.
    VfioUser(Arc<Mutex<VfioUserPciDevice>>),
}
866 
/// A virtio device together with the information needed to expose it as a
/// virtio-pci device (see `add_pci_devices()`).
#[derive(Clone)]
struct MetaVirtioDevice {
    // The virtio device itself.
    virtio_device: Arc<Mutex<dyn virtio_devices::VirtioDevice>>,
    // Whether the device is attached to the virtual IOMMU: when set, the IOMMU
    // mapping is handed to the virtio-pci device and its BDF is recorded in the
    // list of IOMMU attached devices.
    iommu: bool,
    // Identifier of the device.
    id: String,
    // PCI segment the device is placed on.
    pci_segment: u16,
    // Optional external DMA mapping handler passed along with the device.
    dma_handler: Option<Arc<dyn ExternalDmaMapping>>,
}
875 
/// Addresses of ACPI platform registers (PM timer, reset and sleep
/// registers).
///
/// Every address is optional; the derived `Default` leaves them all unset.
#[derive(Default)]
pub struct AcpiPlatformAddresses {
    // Address of the ACPI PM timer, if any.
    pub pm_timer_address: Option<GenericAddress>,
    // Address of the reset register, if any.
    pub reset_reg_address: Option<GenericAddress>,
    // Address of the sleep control register, if any.
    pub sleep_control_reg_address: Option<GenericAddress>,
    // Address of the sleep status register, if any.
    pub sleep_status_reg_address: Option<GenericAddress>,
}
883 
/// `AccessPlatform` implementation that requests page access from the
/// hypervisor VM object (`gain_page_access`) whenever a guest virtual address
/// range is translated.
///
/// Only built when both the `mshv` and `sev_snp` features are enabled.
#[cfg(all(feature = "mshv", feature = "sev_snp"))]
struct SevSnpPageAccessProxy {
    // Hypervisor VM object the page access requests are forwarded to.
    vm: Arc<dyn hypervisor::Vm>,
}
888 
#[cfg(all(feature = "mshv", feature = "sev_snp"))]
impl std::fmt::Debug for SevSnpPageAccessProxy {
    /// Writes a fixed description; the wrapped VM object is not printed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("SNP Page access proxy")
    }
}
895 
#[cfg(all(feature = "mshv", feature = "sev_snp"))]
impl SevSnpPageAccessProxy {
    /// Creates a proxy forwarding page access requests to `vm`.
    fn new(vm: Arc<dyn hypervisor::Vm>) -> Self {
        Self { vm }
    }
}
902 
#[cfg(all(feature = "mshv", feature = "sev_snp"))]
impl AccessPlatform for SevSnpPageAccessProxy {
    /// Returns `base` unchanged; `_size` is ignored.
    fn translate_gpa(&self, base: u64, _size: u64) -> std::result::Result<u64, std::io::Error> {
        Ok(base)
    }

    /// Requests access to the `size` bytes starting at `base` from the
    /// hypervisor, then returns `base` unchanged.
    ///
    /// # Errors
    ///
    /// Returns an error if `size` does not fit in 32 bits or if the
    /// hypervisor refuses the page access request.
    fn translate_gva(&self, base: u64, size: u64) -> std::result::Result<u64, std::io::Error> {
        // `gain_page_access` takes a 32-bit size. A plain `as` cast would
        // silently truncate larger values and request access to a smaller
        // range than the caller asked for, so reject them explicitly.
        let size = u32::try_from(size)
            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;

        self.vm
            .gain_page_access(base, size)
            .map_err(io::Error::other)?;
        Ok(base)
    }
}
916 
/// Holds the state needed to create and manage the devices of a VM: address
/// allocation, interrupt managers, PCI segments, the device tree and the
/// handles to the individual devices.
pub struct DeviceManager {
    // Manage address space related to devices
    address_manager: Arc<AddressManager>,

    // Console abstraction
    console: Arc<Console>,

    // Serial Manager
    serial_manager: Option<Arc<SerialManager>>,

    // Console resize pipe, handed out to callers by `console_resize_pipe()`.
    console_resize_pipe: Option<Arc<File>>,

    // To restore on exit.
    original_termios_opt: Arc<Mutex<Option<termios>>>,

    // Interrupt controller
    #[cfg(target_arch = "x86_64")]
    interrupt_controller: Option<Arc<Mutex<ioapic::Ioapic>>>,
    #[cfg(target_arch = "aarch64")]
    interrupt_controller: Option<Arc<Mutex<gic::Gic>>>,
    #[cfg(target_arch = "riscv64")]
    interrupt_controller: Option<Arc<Mutex<aia::Aia>>>,

    // Things to be added to the commandline (e.g. aarch64 or riscv64 early console)
    #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
    cmdline_additions: Vec<String>,

    // ACPI GED notification device
    ged_notification_device: Option<Arc<Mutex<devices::AcpiGedDevice>>>,

    // VM configuration
    config: Arc<Mutex<VmConfig>>,

    // Memory Manager
    memory_manager: Arc<Mutex<MemoryManager>>,

    // CPU Manager
    cpu_manager: Arc<Mutex<CpuManager>>,

    // The virtio devices on the system
    virtio_devices: Vec<MetaVirtioDevice>,

    /// All disks. Needed for locking and unlocking the images.
    block_devices: Vec<Arc<Mutex<Block>>>,

    // List of bus devices
    // Let the DeviceManager keep strong references to the BusDevice devices.
    // This allows the IO and MMIO buses to be provided with Weak references,
    // which prevents cyclic dependencies.
    bus_devices: Vec<Arc<dyn BusDeviceSync>>,

    // Counter to keep track of the consumed device IDs.
    device_id_cnt: Wrapping<usize>,

    // PCI segments; index 0 is the default segment.
    pci_segments: Vec<PciSegment>,

    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    // MSI Interrupt Manager
    msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,

    #[cfg_attr(feature = "mshv", allow(dead_code))]
    // Legacy Interrupt Manager
    legacy_interrupt_manager: Option<Arc<dyn InterruptManager<GroupConfig = LegacyIrqGroupConfig>>>,

    // Passthrough device handle
    passthrough_device: Option<VfioDeviceFd>,

    // VFIO container
    // Only one container can be created, therefore it is stored as part of the
    // DeviceManager to be reused.
    vfio_container: Option<Arc<VfioContainer>>,

    // Paravirtualized IOMMU
    iommu_device: Option<Arc<Mutex<virtio_devices::Iommu>>>,
    iommu_mapping: Option<Arc<IommuMapping>>,

    // PCI information about devices attached to the paravirtualized IOMMU
    // It contains the virtual IOMMU PCI BDF along with the list of PCI BDF
    // representing the devices attached to the virtual IOMMU. This is useful
    // information for filling the ACPI VIOT table.
    iommu_attached_devices: Option<(PciBdf, Vec<PciBdf>)>,

    // Tree of devices, representing the dependencies between devices.
    // Useful for introspection, snapshot and restore.
    device_tree: Arc<Mutex<DeviceTree>>,

    // Exit and reset events
    exit_evt: EventFd,
    reset_evt: EventFd,

    #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
    // Information about the registered MMIO devices, keyed by device type and
    // identifier (see `get_device_info()`).
    id_to_dev_info: HashMap<(DeviceType, String), MmioDeviceInfo>,

    // seccomp action
    seccomp_action: SeccompAction,

    // List of guest NUMA nodes.
    numa_nodes: NumaNodes,

    // Possible handle to the virtio-balloon device
    balloon: Option<Arc<Mutex<virtio_devices::Balloon>>>,

    // Virtio Device activation EventFd to allow the VMM thread to trigger device
    // activation and thus start the threads from the VMM thread
    activate_evt: EventFd,

    #[cfg(not(target_arch = "riscv64"))]
    // Base of the MMIO range allocated for the DeviceManager ACPI device.
    acpi_address: GuestAddress,

    // NOTE(review): presumably the index of the currently selected PCI
    // segment -- confirm against the code that reads and writes it.
    selected_segment: usize,

    // Possible handle to the virtio-mem device
    virtio_mem_devices: Vec<Arc<Mutex<virtio_devices::Mem>>>,

    #[cfg(target_arch = "aarch64")]
    // GPIO device for AArch64
    gpio_device: Option<Arc<Mutex<devices::legacy::Gpio>>>,

    #[cfg(feature = "pvmemcontrol")]
    // Bus and PCI halves of the pvmemcontrol device, when configured.
    pvmemcontrol_devices: Option<(
        Arc<PvmemcontrolBusDevice>,
        Arc<Mutex<PvmemcontrolPciDevice>>,
    )>,

    // pvpanic device
    pvpanic_device: Option<Arc<Mutex<devices::PvPanicDevice>>>,

    // Flag to force setting the iommu on virtio devices
    force_iommu: bool,

    // io_uring availability if detected
    io_uring_supported: Option<bool>,

    // aio availability if detected
    aio_supported: Option<bool>,

    // List of unique identifiers provided at boot through the configuration.
    boot_id_list: BTreeSet<String>,

    #[cfg(not(target_arch = "riscv64"))]
    // Start time of the VM
    timestamp: Instant,

    // Pending activations
    pending_activations: Arc<Mutex<Vec<VirtioPciDeviceActivator>>>,

    #[cfg(not(target_arch = "riscv64"))]
    // Addresses for ACPI platform devices e.g. ACPI PM timer, sleep/reset registers
    acpi_platform_addresses: AcpiPlatformAddresses,

    // Snapshot to restore from, if any.
    snapshot: Option<Snapshot>,

    // Rate limiter groups from the configuration, keyed by group identifier.
    rate_limit_groups: HashMap<String, Arc<RateLimiterGroup>>,

    // Shared list of MMIO regions.
    mmio_regions: Arc<Mutex<Vec<MmioRegion>>>,
}
1074 
1075 fn create_mmio_allocators(
1076     start: u64,
1077     end: u64,
1078     num_pci_segments: u16,
1079     weights: Vec<u32>,
1080     alignment: u64,
1081 ) -> Vec<Arc<Mutex<AddressAllocator>>> {
1082     let total_weight: u32 = weights.iter().sum();
1083 
1084     // Start each PCI segment mmio range on an aligned boundary
1085     let pci_segment_mmio_size = (end - start + 1) / (alignment * total_weight as u64) * alignment;
1086 
1087     let mut mmio_allocators = vec![];
1088     let mut i = 0;
1089     for segment_id in 0..num_pci_segments as u64 {
1090         let weight = weights[segment_id as usize] as u64;
1091         let mmio_start = start + i * pci_segment_mmio_size;
1092         let mmio_size = pci_segment_mmio_size * weight;
1093         let allocator = Arc::new(Mutex::new(
1094             AddressAllocator::new(GuestAddress(mmio_start), mmio_size).unwrap(),
1095         ));
1096         mmio_allocators.push(allocator);
1097         i += weight;
1098     }
1099 
1100     mmio_allocators
1101 }
1102 
1103 impl DeviceManager {
1104     #[allow(clippy::too_many_arguments)]
1105     pub fn new(
1106         io_bus: Arc<Bus>,
1107         mmio_bus: Arc<Bus>,
1108         vm: Arc<dyn hypervisor::Vm>,
1109         config: Arc<Mutex<VmConfig>>,
1110         memory_manager: Arc<Mutex<MemoryManager>>,
1111         cpu_manager: Arc<Mutex<CpuManager>>,
1112         exit_evt: EventFd,
1113         reset_evt: EventFd,
1114         seccomp_action: SeccompAction,
1115         numa_nodes: NumaNodes,
1116         activate_evt: &EventFd,
1117         force_iommu: bool,
1118         boot_id_list: BTreeSet<String>,
1119         #[cfg(not(target_arch = "riscv64"))] timestamp: Instant,
1120         snapshot: Option<Snapshot>,
1121         dynamic: bool,
1122     ) -> DeviceManagerResult<Arc<Mutex<Self>>> {
1123         trace_scoped!("DeviceManager::new");
1124 
1125         let (device_tree, device_id_cnt) = if let Some(snapshot) = snapshot.as_ref() {
1126             let state: DeviceManagerState = snapshot.to_state().unwrap();
1127             (
1128                 Arc::new(Mutex::new(state.device_tree.clone())),
1129                 state.device_id_cnt,
1130             )
1131         } else {
1132             (Arc::new(Mutex::new(DeviceTree::new())), Wrapping(0))
1133         };
1134 
1135         let num_pci_segments =
1136             if let Some(platform_config) = config.lock().unwrap().platform.as_ref() {
1137                 platform_config.num_pci_segments
1138             } else {
1139                 1
1140             };
1141 
1142         let mut mmio32_aperture_weights: Vec<u32> =
1143             std::iter::repeat_n(DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT, num_pci_segments.into())
1144                 .collect();
1145         if let Some(pci_segments) = &config.lock().unwrap().pci_segments {
1146             for pci_segment in pci_segments.iter() {
1147                 mmio32_aperture_weights[pci_segment.pci_segment as usize] =
1148                     pci_segment.mmio32_aperture_weight
1149             }
1150         }
1151 
1152         let start_of_mmio32_area = layout::MEM_32BIT_DEVICES_START.0;
1153         let end_of_mmio32_area = layout::MEM_32BIT_DEVICES_START.0 + layout::MEM_32BIT_DEVICES_SIZE;
1154         let pci_mmio32_allocators = create_mmio_allocators(
1155             start_of_mmio32_area,
1156             end_of_mmio32_area,
1157             num_pci_segments,
1158             mmio32_aperture_weights,
1159             4 << 10,
1160         );
1161 
1162         let mut mmio64_aperture_weights: Vec<u32> =
1163             std::iter::repeat_n(DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT, num_pci_segments.into())
1164                 .collect();
1165         if let Some(pci_segments) = &config.lock().unwrap().pci_segments {
1166             for pci_segment in pci_segments.iter() {
1167                 mmio64_aperture_weights[pci_segment.pci_segment as usize] =
1168                     pci_segment.mmio64_aperture_weight
1169             }
1170         }
1171 
1172         let start_of_mmio64_area = memory_manager.lock().unwrap().start_of_device_area().0;
1173         let end_of_mmio64_area = memory_manager.lock().unwrap().end_of_device_area().0;
1174         let pci_mmio64_allocators = create_mmio_allocators(
1175             start_of_mmio64_area,
1176             end_of_mmio64_area,
1177             num_pci_segments,
1178             mmio64_aperture_weights,
1179             4 << 30,
1180         );
1181 
1182         let address_manager = Arc::new(AddressManager {
1183             allocator: memory_manager.lock().unwrap().allocator(),
1184             io_bus,
1185             mmio_bus,
1186             vm: vm.clone(),
1187             device_tree: Arc::clone(&device_tree),
1188             pci_mmio32_allocators,
1189             pci_mmio64_allocators,
1190         });
1191 
1192         // First we create the MSI interrupt manager, the legacy one is created
1193         // later, after the IOAPIC device creation.
1194         // The reason we create the MSI one first is because the IOAPIC needs it,
1195         // and then the legacy interrupt manager needs an IOAPIC. So we're
1196         // handling a linear dependency chain:
1197         // msi_interrupt_manager <- IOAPIC <- legacy_interrupt_manager.
1198         let msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>> =
1199             Arc::new(MsiInterruptManager::new(
1200                 Arc::clone(&address_manager.allocator),
1201                 vm,
1202             ));
1203 
1204         let acpi_address = address_manager
1205             .allocator
1206             .lock()
1207             .unwrap()
1208             .allocate_platform_mmio_addresses(None, DEVICE_MANAGER_ACPI_SIZE as u64, None)
1209             .ok_or(DeviceManagerError::AllocateIoPort)?;
1210 
1211         let mut pci_irq_slots = [0; 32];
1212         PciSegment::reserve_legacy_interrupts_for_pci_devices(
1213             &address_manager,
1214             &mut pci_irq_slots,
1215         )?;
1216 
1217         let mut pci_segments = vec![PciSegment::new_default_segment(
1218             &address_manager,
1219             Arc::clone(&address_manager.pci_mmio32_allocators[0]),
1220             Arc::clone(&address_manager.pci_mmio64_allocators[0]),
1221             &pci_irq_slots,
1222         )?];
1223 
1224         for i in 1..num_pci_segments as usize {
1225             pci_segments.push(PciSegment::new(
1226                 i as u16,
1227                 numa_node_id_from_pci_segment_id(&numa_nodes, i as u16),
1228                 &address_manager,
1229                 Arc::clone(&address_manager.pci_mmio32_allocators[i]),
1230                 Arc::clone(&address_manager.pci_mmio64_allocators[i]),
1231                 &pci_irq_slots,
1232             )?);
1233         }
1234 
1235         if dynamic {
1236             let acpi_address = address_manager
1237                 .allocator
1238                 .lock()
1239                 .unwrap()
1240                 .allocate_platform_mmio_addresses(None, CPU_MANAGER_ACPI_SIZE as u64, None)
1241                 .ok_or(DeviceManagerError::AllocateMmioAddress)?;
1242 
1243             address_manager
1244                 .mmio_bus
1245                 .insert(
1246                     cpu_manager.clone(),
1247                     acpi_address.0,
1248                     CPU_MANAGER_ACPI_SIZE as u64,
1249                 )
1250                 .map_err(DeviceManagerError::BusError)?;
1251 
1252             cpu_manager.lock().unwrap().set_acpi_address(acpi_address);
1253         }
1254 
1255         let mut rate_limit_groups = HashMap::<String, Arc<RateLimiterGroup>>::new();
1256         if let Some(rate_limit_groups_cfg) = config.lock().unwrap().rate_limit_groups.as_ref() {
1257             for rate_limit_group_cfg in rate_limit_groups_cfg {
1258                 let rate_limit_cfg = rate_limit_group_cfg.rate_limiter_config;
1259                 let bw = rate_limit_cfg.bandwidth.unwrap_or_default();
1260                 let ops = rate_limit_cfg.ops.unwrap_or_default();
1261                 let mut rate_limit_group = RateLimiterGroup::new(
1262                     &rate_limit_group_cfg.id,
1263                     bw.size,
1264                     bw.one_time_burst.unwrap_or(0),
1265                     bw.refill_time,
1266                     ops.size,
1267                     ops.one_time_burst.unwrap_or(0),
1268                     ops.refill_time,
1269                 )
1270                 .map_err(DeviceManagerError::RateLimiterGroupCreate)?;
1271 
1272                 let exit_evt = exit_evt.try_clone().map_err(DeviceManagerError::EventFd)?;
1273 
1274                 rate_limit_group.start_thread(exit_evt).unwrap();
1275                 rate_limit_groups
1276                     .insert(rate_limit_group_cfg.id.clone(), Arc::new(rate_limit_group));
1277             }
1278         }
1279 
1280         let device_manager = DeviceManager {
1281             address_manager: Arc::clone(&address_manager),
1282             console: Arc::new(Console::default()),
1283             interrupt_controller: None,
1284             #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
1285             cmdline_additions: Vec::new(),
1286             ged_notification_device: None,
1287             config,
1288             memory_manager,
1289             cpu_manager,
1290             virtio_devices: Vec::new(),
1291             block_devices: vec![],
1292             bus_devices: Vec::new(),
1293             device_id_cnt,
1294             msi_interrupt_manager,
1295             legacy_interrupt_manager: None,
1296             passthrough_device: None,
1297             vfio_container: None,
1298             iommu_device: None,
1299             iommu_mapping: None,
1300             iommu_attached_devices: None,
1301             pci_segments,
1302             device_tree,
1303             exit_evt,
1304             reset_evt,
1305             #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
1306             id_to_dev_info: HashMap::new(),
1307             seccomp_action,
1308             numa_nodes,
1309             balloon: None,
1310             activate_evt: activate_evt
1311                 .try_clone()
1312                 .map_err(DeviceManagerError::EventFd)?,
1313             #[cfg(not(target_arch = "riscv64"))]
1314             acpi_address,
1315             selected_segment: 0,
1316             serial_manager: None,
1317             console_resize_pipe: None,
1318             original_termios_opt: Arc::new(Mutex::new(None)),
1319             virtio_mem_devices: Vec::new(),
1320             #[cfg(target_arch = "aarch64")]
1321             gpio_device: None,
1322             #[cfg(feature = "pvmemcontrol")]
1323             pvmemcontrol_devices: None,
1324             pvpanic_device: None,
1325             force_iommu,
1326             io_uring_supported: None,
1327             aio_supported: None,
1328             boot_id_list,
1329             #[cfg(not(target_arch = "riscv64"))]
1330             timestamp,
1331             pending_activations: Arc::new(Mutex::new(Vec::default())),
1332             #[cfg(not(target_arch = "riscv64"))]
1333             acpi_platform_addresses: AcpiPlatformAddresses::default(),
1334             snapshot,
1335             rate_limit_groups,
1336             mmio_regions: Arc::new(Mutex::new(Vec::new())),
1337         };
1338 
1339         let device_manager = Arc::new(Mutex::new(device_manager));
1340 
1341         address_manager
1342             .mmio_bus
1343             .insert(
1344                 Arc::clone(&device_manager) as Arc<dyn BusDeviceSync>,
1345                 acpi_address.0,
1346                 DEVICE_MANAGER_ACPI_SIZE as u64,
1347             )
1348             .map_err(DeviceManagerError::BusError)?;
1349 
1350         Ok(device_manager)
1351     }
1352 
1353     pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
1354         self.console_resize_pipe.clone()
1355     }
1356 
1357     pub fn create_interrupt_controller(
1358         &mut self,
1359     ) -> DeviceManagerResult<Arc<Mutex<dyn InterruptController>>> {
1360         self.add_interrupt_controller()
1361     }
1362 
1363     pub fn create_devices(
1364         &mut self,
1365         console_info: Option<ConsoleInfo>,
1366         console_resize_pipe: Option<Arc<File>>,
1367         original_termios_opt: Arc<Mutex<Option<termios>>>,
1368         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1369     ) -> DeviceManagerResult<()> {
1370         trace_scoped!("create_devices");
1371 
1372         let mut virtio_devices: Vec<MetaVirtioDevice> = Vec::new();
1373 
1374         self.cpu_manager
1375             .lock()
1376             .unwrap()
1377             .set_interrupt_controller(interrupt_controller.clone());
1378 
1379         // Now we can create the legacy interrupt manager, which needs the freshly
1380         // formed IOAPIC device.
1381         let legacy_interrupt_manager: Arc<
1382             dyn InterruptManager<GroupConfig = LegacyIrqGroupConfig>,
1383         > = Arc::new(LegacyUserspaceInterruptManager::new(Arc::clone(
1384             &interrupt_controller,
1385         )));
1386 
1387         {
1388             if let Some(acpi_address) = self.memory_manager.lock().unwrap().acpi_address() {
1389                 self.address_manager
1390                     .mmio_bus
1391                     .insert(
1392                         Arc::clone(&self.memory_manager) as Arc<dyn BusDeviceSync>,
1393                         acpi_address.0,
1394                         MEMORY_MANAGER_ACPI_SIZE as u64,
1395                     )
1396                     .map_err(DeviceManagerError::BusError)?;
1397             }
1398         }
1399 
1400         #[cfg(target_arch = "x86_64")]
1401         self.add_legacy_devices(
1402             self.reset_evt
1403                 .try_clone()
1404                 .map_err(DeviceManagerError::EventFd)?,
1405         )?;
1406 
1407         #[cfg(target_arch = "aarch64")]
1408         self.add_legacy_devices(&legacy_interrupt_manager)?;
1409 
1410         {
1411             self.ged_notification_device = self.add_acpi_devices(
1412                 &legacy_interrupt_manager,
1413                 self.reset_evt
1414                     .try_clone()
1415                     .map_err(DeviceManagerError::EventFd)?,
1416                 self.exit_evt
1417                     .try_clone()
1418                     .map_err(DeviceManagerError::EventFd)?,
1419             )?;
1420         }
1421 
1422         self.original_termios_opt = original_termios_opt;
1423 
1424         self.console = self.add_console_devices(
1425             &legacy_interrupt_manager,
1426             &mut virtio_devices,
1427             console_info,
1428             console_resize_pipe,
1429         )?;
1430 
1431         #[cfg(not(target_arch = "riscv64"))]
1432         if let Some(tpm) = self.config.clone().lock().unwrap().tpm.as_ref() {
1433             let tpm_dev = self.add_tpm_device(tpm.socket.clone())?;
1434             self.bus_devices
1435                 .push(Arc::clone(&tpm_dev) as Arc<dyn BusDeviceSync>)
1436         }
1437         self.legacy_interrupt_manager = Some(legacy_interrupt_manager);
1438 
1439         virtio_devices.append(&mut self.make_virtio_devices()?);
1440 
1441         self.add_pci_devices(virtio_devices.clone())?;
1442 
1443         self.virtio_devices = virtio_devices;
1444 
1445         // Add pvmemcontrol if required
1446         #[cfg(feature = "pvmemcontrol")]
1447         {
1448             if self.config.lock().unwrap().pvmemcontrol.is_some() {
1449                 let (pvmemcontrol_bus_device, pvmemcontrol_pci_device) =
1450                     self.make_pvmemcontrol_device()?;
1451                 self.pvmemcontrol_devices =
1452                     Some((pvmemcontrol_bus_device, pvmemcontrol_pci_device));
1453             }
1454         }
1455 
1456         if self.config.clone().lock().unwrap().pvpanic {
1457             self.pvpanic_device = self.add_pvpanic_device()?;
1458         }
1459 
1460         Ok(())
1461     }
1462 
1463     fn state(&self) -> DeviceManagerState {
1464         DeviceManagerState {
1465             device_tree: self.device_tree.lock().unwrap().clone(),
1466             device_id_cnt: self.device_id_cnt,
1467         }
1468     }
1469 
1470     fn get_msi_iova_space(&mut self) -> (u64, u64) {
1471         #[cfg(target_arch = "aarch64")]
1472         {
1473             let vcpus = self.config.lock().unwrap().cpus.boot_vcpus;
1474             let vgic_config = gic::Gic::create_default_config(vcpus.into());
1475             (
1476                 vgic_config.msi_addr,
1477                 vgic_config.msi_addr + vgic_config.msi_size - 1,
1478             )
1479         }
1480         #[cfg(target_arch = "riscv64")]
1481         {
1482             let vcpus = self.config.lock().unwrap().cpus.boot_vcpus;
1483             let vaia_config = aia::Aia::create_default_config(vcpus.into());
1484             (
1485                 vaia_config.imsic_addr,
1486                 vaia_config.imsic_addr + vaia_config.vcpu_count as u64 * arch::layout::IMSIC_SIZE
1487                     - 1,
1488             )
1489         }
1490         #[cfg(target_arch = "x86_64")]
1491         (0xfee0_0000, 0xfeef_ffff)
1492     }
1493 
1494     #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
1495     /// Gets the information of the devices registered up to some point in time.
1496     pub fn get_device_info(&self) -> &HashMap<(DeviceType, String), MmioDeviceInfo> {
1497         &self.id_to_dev_info
1498     }
1499 
1500     #[allow(unused_variables)]
1501     fn add_pci_devices(
1502         &mut self,
1503         virtio_devices: Vec<MetaVirtioDevice>,
1504     ) -> DeviceManagerResult<()> {
1505         let iommu_id = String::from(IOMMU_DEVICE_NAME);
1506 
1507         let iommu_address_width_bits =
1508             if let Some(ref platform) = self.config.lock().unwrap().platform {
1509                 platform.iommu_address_width_bits
1510             } else {
1511                 DEFAULT_IOMMU_ADDRESS_WIDTH_BITS
1512             };
1513 
1514         let iommu_device = if self.config.lock().unwrap().iommu {
1515             let (device, mapping) = virtio_devices::Iommu::new(
1516                 iommu_id.clone(),
1517                 self.seccomp_action.clone(),
1518                 self.exit_evt
1519                     .try_clone()
1520                     .map_err(DeviceManagerError::EventFd)?,
1521                 self.get_msi_iova_space(),
1522                 iommu_address_width_bits,
1523                 state_from_id(self.snapshot.as_ref(), iommu_id.as_str())
1524                     .map_err(DeviceManagerError::RestoreGetState)?,
1525             )
1526             .map_err(DeviceManagerError::CreateVirtioIommu)?;
1527             let device = Arc::new(Mutex::new(device));
1528             self.iommu_device = Some(Arc::clone(&device));
1529             self.iommu_mapping = Some(mapping);
1530 
1531             // Fill the device tree with a new node. In case of restore, we
1532             // know there is nothing to do, so we can simply override the
1533             // existing entry.
1534             self.device_tree
1535                 .lock()
1536                 .unwrap()
1537                 .insert(iommu_id.clone(), device_node!(iommu_id, device));
1538 
1539             Some(device)
1540         } else {
1541             None
1542         };
1543 
1544         let mut iommu_attached_devices = Vec::new();
1545         {
1546             for handle in virtio_devices {
1547                 let mapping: Option<Arc<IommuMapping>> = if handle.iommu {
1548                     self.iommu_mapping.clone()
1549                 } else {
1550                     None
1551                 };
1552 
1553                 let dev_id = self.add_virtio_pci_device(
1554                     handle.virtio_device,
1555                     &mapping,
1556                     handle.id,
1557                     handle.pci_segment,
1558                     handle.dma_handler,
1559                 )?;
1560 
1561                 if handle.iommu {
1562                     iommu_attached_devices.push(dev_id);
1563                 }
1564             }
1565 
1566             let mut vfio_iommu_device_ids = self.add_vfio_devices()?;
1567             iommu_attached_devices.append(&mut vfio_iommu_device_ids);
1568 
1569             let mut vfio_user_iommu_device_ids = self.add_user_devices()?;
1570             iommu_attached_devices.append(&mut vfio_user_iommu_device_ids);
1571 
1572             // Add all devices from forced iommu segments
1573             if let Some(platform_config) = self.config.lock().unwrap().platform.as_ref() {
1574                 if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() {
1575                     for segment in iommu_segments {
1576                         for device in 0..32 {
1577                             let bdf = PciBdf::new(*segment, 0, device, 0);
1578                             if !iommu_attached_devices.contains(&bdf) {
1579                                 iommu_attached_devices.push(bdf);
1580                             }
1581                         }
1582                     }
1583                 }
1584             }
1585 
1586             if let Some(iommu_device) = iommu_device {
1587                 let dev_id = self.add_virtio_pci_device(iommu_device, &None, iommu_id, 0, None)?;
1588                 self.iommu_attached_devices = Some((dev_id, iommu_attached_devices));
1589             }
1590         }
1591 
1592         for segment in &self.pci_segments {
1593             #[cfg(target_arch = "x86_64")]
1594             if let Some(pci_config_io) = segment.pci_config_io.as_ref() {
1595                 self.bus_devices
1596                     .push(Arc::clone(pci_config_io) as Arc<dyn BusDeviceSync>);
1597             }
1598 
1599             self.bus_devices
1600                 .push(Arc::clone(&segment.pci_config_mmio) as Arc<dyn BusDeviceSync>);
1601         }
1602 
1603         Ok(())
1604     }
1605 
1606     #[cfg(target_arch = "aarch64")]
1607     fn add_interrupt_controller(
1608         &mut self,
1609     ) -> DeviceManagerResult<Arc<Mutex<dyn InterruptController>>> {
1610         let interrupt_controller: Arc<Mutex<gic::Gic>> = Arc::new(Mutex::new(
1611             gic::Gic::new(
1612                 self.config.lock().unwrap().cpus.boot_vcpus,
1613                 Arc::clone(&self.msi_interrupt_manager),
1614                 self.address_manager.vm.clone(),
1615             )
1616             .map_err(DeviceManagerError::CreateInterruptController)?,
1617         ));
1618 
1619         self.interrupt_controller = Some(interrupt_controller.clone());
1620 
1621         // Restore the vGic if this is in the process of restoration
1622         let id = String::from(gic::GIC_SNAPSHOT_ID);
1623         if let Some(vgic_snapshot) = snapshot_from_id(self.snapshot.as_ref(), &id) {
1624             // PMU support is optional. Nothing should be impacted if the PMU initialization failed.
1625             if self
1626                 .cpu_manager
1627                 .lock()
1628                 .unwrap()
1629                 .init_pmu(AARCH64_PMU_IRQ + 16)
1630                 .is_err()
1631             {
1632                 info!("Failed to initialize PMU");
1633             }
1634 
1635             let vgic_state = vgic_snapshot
1636                 .to_state()
1637                 .map_err(DeviceManagerError::RestoreGetState)?;
1638             let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
1639             interrupt_controller
1640                 .lock()
1641                 .unwrap()
1642                 .restore_vgic(vgic_state, &saved_vcpu_states)
1643                 .unwrap();
1644         }
1645 
1646         self.device_tree
1647             .lock()
1648             .unwrap()
1649             .insert(id.clone(), device_node!(id, interrupt_controller));
1650 
1651         Ok(interrupt_controller)
1652     }
1653 
1654     #[cfg(target_arch = "aarch64")]
1655     pub fn get_interrupt_controller(&mut self) -> Option<&Arc<Mutex<gic::Gic>>> {
1656         self.interrupt_controller.as_ref()
1657     }
1658 
1659     #[cfg(target_arch = "riscv64")]
1660     fn add_interrupt_controller(
1661         &mut self,
1662     ) -> DeviceManagerResult<Arc<Mutex<dyn InterruptController>>> {
1663         let interrupt_controller: Arc<Mutex<aia::Aia>> = Arc::new(Mutex::new(
1664             aia::Aia::new(
1665                 self.config.lock().unwrap().cpus.boot_vcpus,
1666                 Arc::clone(&self.msi_interrupt_manager),
1667                 self.address_manager.vm.clone(),
1668             )
1669             .map_err(DeviceManagerError::CreateInterruptController)?,
1670         ));
1671 
1672         self.interrupt_controller = Some(interrupt_controller.clone());
1673 
1674         // Restore the vAia if this is in the process of restoration
1675         let id = String::from(aia::_AIA_SNAPSHOT_ID);
1676         if let Some(_vaia_snapshot) = snapshot_from_id(self.snapshot.as_ref(), &id) {
1677             // TODO: vAia snapshotting and restoration is scheduled to next stage of riscv64 support.
1678             // TODO: PMU support is scheduled to next stage of riscv64 support.
1679             // PMU support is optional. Nothing should be impacted if the PMU initialization failed.
1680             unimplemented!()
1681         }
1682 
1683         self.device_tree
1684             .lock()
1685             .unwrap()
1686             .insert(id.clone(), device_node!(id, interrupt_controller));
1687 
1688         Ok(interrupt_controller)
1689     }
1690 
1691     #[cfg(target_arch = "riscv64")]
1692     pub fn get_interrupt_controller(&mut self) -> Option<&Arc<Mutex<aia::Aia>>> {
1693         self.interrupt_controller.as_ref()
1694     }
1695 
1696     #[cfg(target_arch = "x86_64")]
1697     fn add_interrupt_controller(
1698         &mut self,
1699     ) -> DeviceManagerResult<Arc<Mutex<dyn InterruptController>>> {
1700         let id = String::from(IOAPIC_DEVICE_NAME);
1701 
1702         // Create IOAPIC
1703         let interrupt_controller = Arc::new(Mutex::new(
1704             ioapic::Ioapic::new(
1705                 id.clone(),
1706                 APIC_START,
1707                 Arc::clone(&self.msi_interrupt_manager),
1708                 state_from_id(self.snapshot.as_ref(), id.as_str())
1709                     .map_err(DeviceManagerError::RestoreGetState)?,
1710             )
1711             .map_err(DeviceManagerError::CreateInterruptController)?,
1712         ));
1713 
1714         self.interrupt_controller = Some(interrupt_controller.clone());
1715 
1716         self.address_manager
1717             .mmio_bus
1718             .insert(interrupt_controller.clone(), IOAPIC_START.0, IOAPIC_SIZE)
1719             .map_err(DeviceManagerError::BusError)?;
1720 
1721         self.bus_devices
1722             .push(Arc::clone(&interrupt_controller) as Arc<dyn BusDeviceSync>);
1723 
1724         // Fill the device tree with a new node. In case of restore, we
1725         // know there is nothing to do, so we can simply override the
1726         // existing entry.
1727         self.device_tree
1728             .lock()
1729             .unwrap()
1730             .insert(id.clone(), device_node!(id, interrupt_controller));
1731 
1732         Ok(interrupt_controller)
1733     }
1734 
1735     fn add_acpi_devices(
1736         &mut self,
1737         interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = LegacyIrqGroupConfig>>,
1738         reset_evt: EventFd,
1739         exit_evt: EventFd,
1740     ) -> DeviceManagerResult<Option<Arc<Mutex<devices::AcpiGedDevice>>>> {
1741         let vcpus_kill_signalled = self
1742             .cpu_manager
1743             .lock()
1744             .unwrap()
1745             .vcpus_kill_signalled()
1746             .clone();
1747         let shutdown_device = Arc::new(Mutex::new(devices::AcpiShutdownDevice::new(
1748             exit_evt,
1749             reset_evt,
1750             vcpus_kill_signalled,
1751         )));
1752 
1753         self.bus_devices
1754             .push(Arc::clone(&shutdown_device) as Arc<dyn BusDeviceSync>);
1755 
1756         #[cfg(target_arch = "x86_64")]
1757         {
1758             let shutdown_pio_address: u16 = 0x600;
1759 
1760             self.address_manager
1761                 .allocator
1762                 .lock()
1763                 .unwrap()
1764                 .allocate_io_addresses(Some(GuestAddress(shutdown_pio_address.into())), 0x8, None)
1765                 .ok_or(DeviceManagerError::AllocateIoPort)?;
1766 
1767             self.address_manager
1768                 .io_bus
1769                 .insert(shutdown_device, shutdown_pio_address.into(), 0x4)
1770                 .map_err(DeviceManagerError::BusError)?;
1771 
1772             self.acpi_platform_addresses.sleep_control_reg_address =
1773                 Some(GenericAddress::io_port_address::<u8>(shutdown_pio_address));
1774             self.acpi_platform_addresses.sleep_status_reg_address =
1775                 Some(GenericAddress::io_port_address::<u8>(shutdown_pio_address));
1776             self.acpi_platform_addresses.reset_reg_address =
1777                 Some(GenericAddress::io_port_address::<u8>(shutdown_pio_address));
1778         }
1779 
1780         let ged_irq = self
1781             .address_manager
1782             .allocator
1783             .lock()
1784             .unwrap()
1785             .allocate_irq()
1786             .unwrap();
1787         let interrupt_group = interrupt_manager
1788             .create_group(LegacyIrqGroupConfig {
1789                 irq: ged_irq as InterruptIndex,
1790             })
1791             .map_err(DeviceManagerError::CreateInterruptGroup)?;
1792         let ged_address = self
1793             .address_manager
1794             .allocator
1795             .lock()
1796             .unwrap()
1797             .allocate_platform_mmio_addresses(
1798                 None,
1799                 devices::acpi::GED_DEVICE_ACPI_SIZE as u64,
1800                 None,
1801             )
1802             .ok_or(DeviceManagerError::AllocateMmioAddress)?;
1803         let ged_device = Arc::new(Mutex::new(devices::AcpiGedDevice::new(
1804             interrupt_group,
1805             ged_irq,
1806             ged_address,
1807         )));
1808         self.address_manager
1809             .mmio_bus
1810             .insert(
1811                 ged_device.clone(),
1812                 ged_address.0,
1813                 devices::acpi::GED_DEVICE_ACPI_SIZE as u64,
1814             )
1815             .map_err(DeviceManagerError::BusError)?;
1816         self.bus_devices
1817             .push(Arc::clone(&ged_device) as Arc<dyn BusDeviceSync>);
1818 
1819         let pm_timer_device = Arc::new(Mutex::new(devices::AcpiPmTimerDevice::new()));
1820 
1821         self.bus_devices
1822             .push(Arc::clone(&pm_timer_device) as Arc<dyn BusDeviceSync>);
1823 
1824         #[cfg(target_arch = "x86_64")]
1825         {
1826             let pm_timer_pio_address: u16 = 0x608;
1827 
1828             self.address_manager
1829                 .allocator
1830                 .lock()
1831                 .unwrap()
1832                 .allocate_io_addresses(Some(GuestAddress(pm_timer_pio_address.into())), 0x4, None)
1833                 .ok_or(DeviceManagerError::AllocateIoPort)?;
1834 
1835             self.address_manager
1836                 .io_bus
1837                 .insert(pm_timer_device, pm_timer_pio_address.into(), 0x4)
1838                 .map_err(DeviceManagerError::BusError)?;
1839 
1840             self.acpi_platform_addresses.pm_timer_address =
1841                 Some(GenericAddress::io_port_address::<u32>(pm_timer_pio_address));
1842         }
1843 
1844         Ok(Some(ged_device))
1845     }
1846 
1847     #[cfg(target_arch = "x86_64")]
1848     fn add_legacy_devices(&mut self, reset_evt: EventFd) -> DeviceManagerResult<()> {
1849         let vcpus_kill_signalled = self
1850             .cpu_manager
1851             .lock()
1852             .unwrap()
1853             .vcpus_kill_signalled()
1854             .clone();
1855         // Add a shutdown device (i8042)
1856         let i8042 = Arc::new(Mutex::new(devices::legacy::I8042Device::new(
1857             reset_evt.try_clone().unwrap(),
1858             vcpus_kill_signalled.clone(),
1859         )));
1860 
1861         self.bus_devices
1862             .push(Arc::clone(&i8042) as Arc<dyn BusDeviceSync>);
1863 
1864         self.address_manager
1865             .io_bus
1866             .insert(i8042, 0x61, 0x4)
1867             .map_err(DeviceManagerError::BusError)?;
1868         {
1869             // Add a CMOS emulated device
1870             let mem_size = self
1871                 .memory_manager
1872                 .lock()
1873                 .unwrap()
1874                 .guest_memory()
1875                 .memory()
1876                 .last_addr()
1877                 .0
1878                 + 1;
1879             let mem_below_4g = std::cmp::min(arch::layout::MEM_32BIT_RESERVED_START.0, mem_size);
1880             let mem_above_4g = mem_size.saturating_sub(arch::layout::RAM_64BIT_START.0);
1881 
1882             let cmos = Arc::new(Mutex::new(devices::legacy::Cmos::new(
1883                 mem_below_4g,
1884                 mem_above_4g,
1885                 reset_evt,
1886                 Some(vcpus_kill_signalled),
1887             )));
1888 
1889             self.bus_devices
1890                 .push(Arc::clone(&cmos) as Arc<dyn BusDeviceSync>);
1891 
1892             self.address_manager
1893                 .io_bus
1894                 .insert(cmos, 0x70, 0x2)
1895                 .map_err(DeviceManagerError::BusError)?;
1896 
1897             let fwdebug = Arc::new(Mutex::new(devices::legacy::FwDebugDevice::new()));
1898 
1899             self.bus_devices
1900                 .push(Arc::clone(&fwdebug) as Arc<dyn BusDeviceSync>);
1901 
1902             self.address_manager
1903                 .io_bus
1904                 .insert(fwdebug, 0x402, 0x1)
1905                 .map_err(DeviceManagerError::BusError)?;
1906         }
1907 
1908         // 0x80 debug port
1909         let debug_port = Arc::new(Mutex::new(devices::legacy::DebugPort::new(self.timestamp)));
1910         self.bus_devices
1911             .push(Arc::clone(&debug_port) as Arc<dyn BusDeviceSync>);
1912         self.address_manager
1913             .io_bus
1914             .insert(debug_port, 0x80, 0x1)
1915             .map_err(DeviceManagerError::BusError)?;
1916 
1917         Ok(())
1918     }
1919 
1920     #[cfg(target_arch = "aarch64")]
1921     fn add_legacy_devices(
1922         &mut self,
1923         interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = LegacyIrqGroupConfig>>,
1924     ) -> DeviceManagerResult<()> {
1925         // Add a RTC device
1926         let rtc_irq = self
1927             .address_manager
1928             .allocator
1929             .lock()
1930             .unwrap()
1931             .allocate_irq()
1932             .unwrap();
1933 
1934         let interrupt_group = interrupt_manager
1935             .create_group(LegacyIrqGroupConfig {
1936                 irq: rtc_irq as InterruptIndex,
1937             })
1938             .map_err(DeviceManagerError::CreateInterruptGroup)?;
1939 
1940         let rtc_device = Arc::new(Mutex::new(devices::legacy::Rtc::new(interrupt_group)));
1941 
1942         self.bus_devices
1943             .push(Arc::clone(&rtc_device) as Arc<dyn BusDeviceSync>);
1944 
1945         let addr = arch::layout::LEGACY_RTC_MAPPED_IO_START;
1946 
1947         self.address_manager
1948             .mmio_bus
1949             .insert(rtc_device, addr.0, MMIO_LEN)
1950             .map_err(DeviceManagerError::BusError)?;
1951 
1952         self.id_to_dev_info.insert(
1953             (DeviceType::Rtc, "rtc".to_string()),
1954             MmioDeviceInfo {
1955                 addr: addr.0,
1956                 len: MMIO_LEN,
1957                 irq: rtc_irq,
1958             },
1959         );
1960 
1961         // Add a GPIO device
1962         let id = String::from(GPIO_DEVICE_NAME);
1963         let gpio_irq = self
1964             .address_manager
1965             .allocator
1966             .lock()
1967             .unwrap()
1968             .allocate_irq()
1969             .unwrap();
1970 
1971         let interrupt_group = interrupt_manager
1972             .create_group(LegacyIrqGroupConfig {
1973                 irq: gpio_irq as InterruptIndex,
1974             })
1975             .map_err(DeviceManagerError::CreateInterruptGroup)?;
1976 
1977         let gpio_device = Arc::new(Mutex::new(devices::legacy::Gpio::new(
1978             id.clone(),
1979             interrupt_group,
1980             state_from_id(self.snapshot.as_ref(), id.as_str())
1981                 .map_err(DeviceManagerError::RestoreGetState)?,
1982         )));
1983 
1984         self.bus_devices
1985             .push(Arc::clone(&gpio_device) as Arc<dyn BusDeviceSync>);
1986 
1987         let addr = arch::layout::LEGACY_GPIO_MAPPED_IO_START;
1988 
1989         self.address_manager
1990             .mmio_bus
1991             .insert(gpio_device.clone(), addr.0, MMIO_LEN)
1992             .map_err(DeviceManagerError::BusError)?;
1993 
1994         self.gpio_device = Some(gpio_device.clone());
1995 
1996         self.id_to_dev_info.insert(
1997             (DeviceType::Gpio, "gpio".to_string()),
1998             MmioDeviceInfo {
1999                 addr: addr.0,
2000                 len: MMIO_LEN,
2001                 irq: gpio_irq,
2002             },
2003         );
2004 
2005         self.device_tree
2006             .lock()
2007             .unwrap()
2008             .insert(id.clone(), device_node!(id, gpio_device));
2009 
2010         Ok(())
2011     }
2012 
2013     #[cfg(target_arch = "x86_64")]
2014     fn add_debug_console_device(
2015         &mut self,
2016         debug_console_writer: Box<dyn io::Write + Send>,
2017     ) -> DeviceManagerResult<Arc<Mutex<DebugConsole>>> {
2018         let id = String::from(DEBUGCON_DEVICE_NAME);
2019         let debug_console = Arc::new(Mutex::new(DebugConsole::new(
2020             id.clone(),
2021             debug_console_writer,
2022         )));
2023 
2024         let port = self
2025             .config
2026             .lock()
2027             .unwrap()
2028             .debug_console
2029             .clone()
2030             .iobase
2031             .map(|port| port as u64)
2032             .unwrap_or(debug_console::DEFAULT_PORT);
2033 
2034         self.bus_devices
2035             .push(Arc::clone(&debug_console) as Arc<dyn BusDeviceSync>);
2036 
2037         self.address_manager
2038             .allocator
2039             .lock()
2040             .unwrap()
2041             .allocate_io_addresses(Some(GuestAddress(port)), 0x1, None)
2042             .ok_or(DeviceManagerError::AllocateIoPort)?;
2043 
2044         self.address_manager
2045             .io_bus
2046             .insert(debug_console.clone(), port, 0x1)
2047             .map_err(DeviceManagerError::BusError)?;
2048 
2049         // Fill the device tree with a new node. In case of restore, we
2050         // know there is nothing to do, so we can simply override the
2051         // existing entry.
2052         self.device_tree
2053             .lock()
2054             .unwrap()
2055             .insert(id.clone(), device_node!(id, debug_console));
2056 
2057         Ok(debug_console)
2058     }
2059 
2060     #[cfg(target_arch = "x86_64")]
2061     fn add_serial_device(
2062         &mut self,
2063         interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = LegacyIrqGroupConfig>>,
2064         serial_writer: Option<Box<dyn io::Write + Send>>,
2065     ) -> DeviceManagerResult<Arc<Mutex<Serial>>> {
2066         // Serial is tied to IRQ #4
2067         let serial_irq = 4;
2068 
2069         let id = String::from(SERIAL_DEVICE_NAME);
2070 
2071         let interrupt_group = interrupt_manager
2072             .create_group(LegacyIrqGroupConfig {
2073                 irq: serial_irq as InterruptIndex,
2074             })
2075             .map_err(DeviceManagerError::CreateInterruptGroup)?;
2076 
2077         let serial = Arc::new(Mutex::new(Serial::new(
2078             id.clone(),
2079             interrupt_group,
2080             serial_writer,
2081             state_from_id(self.snapshot.as_ref(), id.as_str())
2082                 .map_err(DeviceManagerError::RestoreGetState)?,
2083         )));
2084 
2085         self.bus_devices
2086             .push(Arc::clone(&serial) as Arc<dyn BusDeviceSync>);
2087 
2088         self.address_manager
2089             .allocator
2090             .lock()
2091             .unwrap()
2092             .allocate_io_addresses(Some(GuestAddress(0x3f8)), 0x8, None)
2093             .ok_or(DeviceManagerError::AllocateIoPort)?;
2094 
2095         self.address_manager
2096             .io_bus
2097             .insert(serial.clone(), 0x3f8, 0x8)
2098             .map_err(DeviceManagerError::BusError)?;
2099 
2100         // Fill the device tree with a new node. In case of restore, we
2101         // know there is nothing to do, so we can simply override the
2102         // existing entry.
2103         self.device_tree
2104             .lock()
2105             .unwrap()
2106             .insert(id.clone(), device_node!(id, serial));
2107 
2108         Ok(serial)
2109     }
2110 
2111     #[cfg(target_arch = "aarch64")]
2112     fn add_serial_device(
2113         &mut self,
2114         interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = LegacyIrqGroupConfig>>,
2115         serial_writer: Option<Box<dyn io::Write + Send>>,
2116     ) -> DeviceManagerResult<Arc<Mutex<Pl011>>> {
2117         let id = String::from(SERIAL_DEVICE_NAME);
2118 
2119         let serial_irq = self
2120             .address_manager
2121             .allocator
2122             .lock()
2123             .unwrap()
2124             .allocate_irq()
2125             .unwrap();
2126 
2127         let interrupt_group = interrupt_manager
2128             .create_group(LegacyIrqGroupConfig {
2129                 irq: serial_irq as InterruptIndex,
2130             })
2131             .map_err(DeviceManagerError::CreateInterruptGroup)?;
2132 
2133         let serial = Arc::new(Mutex::new(devices::legacy::Pl011::new(
2134             id.clone(),
2135             interrupt_group,
2136             serial_writer,
2137             self.timestamp,
2138             state_from_id(self.snapshot.as_ref(), id.as_str())
2139                 .map_err(DeviceManagerError::RestoreGetState)?,
2140         )));
2141 
2142         self.bus_devices
2143             .push(Arc::clone(&serial) as Arc<dyn BusDeviceSync>);
2144 
2145         let addr = arch::layout::LEGACY_SERIAL_MAPPED_IO_START;
2146 
2147         self.address_manager
2148             .mmio_bus
2149             .insert(serial.clone(), addr.0, MMIO_LEN)
2150             .map_err(DeviceManagerError::BusError)?;
2151 
2152         self.id_to_dev_info.insert(
2153             (DeviceType::Serial, DeviceType::Serial.to_string()),
2154             MmioDeviceInfo {
2155                 addr: addr.0,
2156                 len: MMIO_LEN,
2157                 irq: serial_irq,
2158             },
2159         );
2160 
2161         self.cmdline_additions
2162             .push(format!("earlycon=pl011,mmio,0x{:08x}", addr.0));
2163 
2164         // Fill the device tree with a new node. In case of restore, we
2165         // know there is nothing to do, so we can simply override the
2166         // existing entry.
2167         self.device_tree
2168             .lock()
2169             .unwrap()
2170             .insert(id.clone(), device_node!(id, serial));
2171 
2172         Ok(serial)
2173     }
2174 
2175     #[cfg(target_arch = "riscv64")]
2176     fn add_serial_device(
2177         &mut self,
2178         interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = LegacyIrqGroupConfig>>,
2179         serial_writer: Option<Box<dyn io::Write + Send>>,
2180     ) -> DeviceManagerResult<Arc<Mutex<Serial>>> {
2181         let id = String::from(SERIAL_DEVICE_NAME);
2182 
2183         let serial_irq = self
2184             .address_manager
2185             .allocator
2186             .lock()
2187             .unwrap()
2188             .allocate_irq()
2189             .unwrap();
2190 
2191         let interrupt_group = interrupt_manager
2192             .create_group(LegacyIrqGroupConfig {
2193                 irq: serial_irq as InterruptIndex,
2194             })
2195             .map_err(DeviceManagerError::CreateInterruptGroup)?;
2196 
2197         let serial = Arc::new(Mutex::new(Serial::new(
2198             id.clone(),
2199             interrupt_group,
2200             serial_writer,
2201             state_from_id(self.snapshot.as_ref(), id.as_str())
2202                 .map_err(DeviceManagerError::RestoreGetState)?,
2203         )));
2204 
2205         self.bus_devices
2206             .push(Arc::clone(&serial) as Arc<dyn BusDeviceSync>);
2207 
2208         let addr = arch::layout::LEGACY_SERIAL_MAPPED_IO_START;
2209 
2210         self.address_manager
2211             .mmio_bus
2212             .insert(serial.clone(), addr.0, MMIO_LEN)
2213             .map_err(DeviceManagerError::BusError)?;
2214 
2215         self.id_to_dev_info.insert(
2216             (DeviceType::Serial, DeviceType::Serial.to_string()),
2217             MmioDeviceInfo {
2218                 addr: addr.0,
2219                 len: MMIO_LEN,
2220                 irq: serial_irq,
2221             },
2222         );
2223 
2224         self.cmdline_additions
2225             .push(format!("earlycon=uart,mmio,0x{:08x}", addr.0));
2226 
2227         // Fill the device tree with a new node. In case of restore, we
2228         // know there is nothing to do, so we can simply override the
2229         // existing entry.
2230         self.device_tree
2231             .lock()
2232             .unwrap()
2233             .insert(id.clone(), device_node!(id, serial));
2234 
2235         Ok(serial)
2236     }
2237 
2238     fn add_virtio_console_device(
2239         &mut self,
2240         virtio_devices: &mut Vec<MetaVirtioDevice>,
2241         console_fd: ConsoleOutput,
2242         resize_pipe: Option<Arc<File>>,
2243     ) -> DeviceManagerResult<Option<Arc<virtio_devices::ConsoleResizer>>> {
2244         let console_config = self.config.lock().unwrap().console.clone();
2245         let endpoint = match console_fd {
2246             ConsoleOutput::File(file) => Endpoint::File(file),
2247             ConsoleOutput::Pty(file) => {
2248                 self.console_resize_pipe = resize_pipe;
2249                 Endpoint::PtyPair(Arc::new(file.try_clone().unwrap()), file)
2250             }
2251             ConsoleOutput::Tty(stdout) => {
2252                 if stdout.is_terminal() {
2253                     self.console_resize_pipe = resize_pipe;
2254                 }
2255 
2256                 // If an interactive TTY then we can accept input
2257                 // SAFETY: FFI call. Trivially safe.
2258                 if unsafe { libc::isatty(libc::STDIN_FILENO) == 1 } {
2259                     // SAFETY: FFI call to dup. Trivially safe.
2260                     let stdin = unsafe { libc::dup(libc::STDIN_FILENO) };
2261                     if stdin == -1 {
2262                         return vmm_sys_util::errno::errno_result()
2263                             .map_err(DeviceManagerError::DupFd);
2264                     }
2265                     // SAFETY: stdin is valid and owned solely by us.
2266                     let stdin = unsafe { File::from_raw_fd(stdin) };
2267                     Endpoint::FilePair(stdout, Arc::new(stdin))
2268                 } else {
2269                     Endpoint::File(stdout)
2270                 }
2271             }
2272             ConsoleOutput::Socket(_) => {
2273                 return Err(DeviceManagerError::NoSocketOptionSupportForConsoleDevice);
2274             }
2275             ConsoleOutput::Null => Endpoint::Null,
2276             ConsoleOutput::Off => return Ok(None),
2277         };
2278         let id = String::from(CONSOLE_DEVICE_NAME);
2279 
2280         let (virtio_console_device, console_resizer) = virtio_devices::Console::new(
2281             id.clone(),
2282             endpoint,
2283             self.console_resize_pipe
2284                 .as_ref()
2285                 .map(|p| p.try_clone().unwrap()),
2286             self.force_iommu | console_config.iommu,
2287             self.seccomp_action.clone(),
2288             self.exit_evt
2289                 .try_clone()
2290                 .map_err(DeviceManagerError::EventFd)?,
2291             state_from_id(self.snapshot.as_ref(), id.as_str())
2292                 .map_err(DeviceManagerError::RestoreGetState)?,
2293         )
2294         .map_err(DeviceManagerError::CreateVirtioConsole)?;
2295         let virtio_console_device = Arc::new(Mutex::new(virtio_console_device));
2296         virtio_devices.push(MetaVirtioDevice {
2297             virtio_device: Arc::clone(&virtio_console_device)
2298                 as Arc<Mutex<dyn virtio_devices::VirtioDevice>>,
2299             iommu: console_config.iommu,
2300             id: id.clone(),
2301             pci_segment: 0,
2302             dma_handler: None,
2303         });
2304 
2305         // Fill the device tree with a new node. In case of restore, we
2306         // know there is nothing to do, so we can simply override the
2307         // existing entry.
2308         self.device_tree
2309             .lock()
2310             .unwrap()
2311             .insert(id.clone(), device_node!(id, virtio_console_device));
2312 
2313         // Only provide a resizer (for SIGWINCH handling) if the console is attached to the TTY
2314         Ok(if matches!(console_config.mode, ConsoleOutputMode::Tty) {
2315             Some(console_resizer)
2316         } else {
2317             None
2318         })
2319     }
2320 
2321     /// Adds all devices that behave like a console with respect to the VM
2322     /// configuration. This includes:
2323     /// - debug-console
2324     /// - serial-console
2325     /// - virtio-console
2326     fn add_console_devices(
2327         &mut self,
2328         interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = LegacyIrqGroupConfig>>,
2329         virtio_devices: &mut Vec<MetaVirtioDevice>,
2330         console_info: Option<ConsoleInfo>,
2331         console_resize_pipe: Option<Arc<File>>,
2332     ) -> DeviceManagerResult<Arc<Console>> {
2333         let serial_config = self.config.lock().unwrap().serial.clone();
2334         if console_info.is_none() {
2335             return Err(DeviceManagerError::InvalidConsoleInfo);
2336         }
2337 
2338         // SAFETY: console_info is Some, so it's safe to unwrap.
2339         let console_info = console_info.unwrap();
2340 
2341         let serial_writer: Option<Box<dyn io::Write + Send>> = match console_info.serial_main_fd {
2342             ConsoleOutput::File(ref file) | ConsoleOutput::Tty(ref file) => {
2343                 Some(Box::new(Arc::clone(file)))
2344             }
2345             ConsoleOutput::Off
2346             | ConsoleOutput::Null
2347             | ConsoleOutput::Pty(_)
2348             | ConsoleOutput::Socket(_) => None,
2349         };
2350 
2351         if !matches!(console_info.serial_main_fd, ConsoleOutput::Off) {
2352             let serial = self.add_serial_device(interrupt_manager, serial_writer)?;
2353             self.serial_manager = match console_info.serial_main_fd {
2354                 ConsoleOutput::Pty(_) | ConsoleOutput::Tty(_) | ConsoleOutput::Socket(_) => {
2355                     let serial_manager = SerialManager::new(
2356                         serial,
2357                         console_info.serial_main_fd,
2358                         serial_config.socket,
2359                     )
2360                     .map_err(DeviceManagerError::CreateSerialManager)?;
2361                     if let Some(mut serial_manager) = serial_manager {
2362                         serial_manager
2363                             .start_thread(
2364                                 self.exit_evt
2365                                     .try_clone()
2366                                     .map_err(DeviceManagerError::EventFd)?,
2367                             )
2368                             .map_err(DeviceManagerError::SpawnSerialManager)?;
2369                         Some(Arc::new(serial_manager))
2370                     } else {
2371                         None
2372                     }
2373                 }
2374                 _ => None,
2375             };
2376         }
2377 
2378         #[cfg(target_arch = "x86_64")]
2379         {
2380             let debug_console_writer: Option<Box<dyn io::Write + Send>> =
2381                 match console_info.debug_main_fd {
2382                     ConsoleOutput::File(file) | ConsoleOutput::Tty(file) => Some(Box::new(file)),
2383                     ConsoleOutput::Off
2384                     | ConsoleOutput::Null
2385                     | ConsoleOutput::Pty(_)
2386                     | ConsoleOutput::Socket(_) => None,
2387                 };
2388             if let Some(writer) = debug_console_writer {
2389                 let _ = self.add_debug_console_device(writer)?;
2390             }
2391         }
2392 
2393         let console_resizer = self.add_virtio_console_device(
2394             virtio_devices,
2395             console_info.console_main_fd,
2396             console_resize_pipe,
2397         )?;
2398 
2399         Ok(Arc::new(Console { console_resizer }))
2400     }
2401 
2402     #[cfg(not(target_arch = "riscv64"))]
2403     fn add_tpm_device(
2404         &mut self,
2405         tpm_path: PathBuf,
2406     ) -> DeviceManagerResult<Arc<Mutex<devices::tpm::Tpm>>> {
2407         // Create TPM Device
2408         let tpm = devices::tpm::Tpm::new(tpm_path.to_str().unwrap().to_string()).map_err(|e| {
2409             DeviceManagerError::CreateTpmDevice(anyhow!("Failed to create TPM Device : {:?}", e))
2410         })?;
2411         let tpm = Arc::new(Mutex::new(tpm));
2412 
2413         // Add TPM Device to mmio
2414         self.address_manager
2415             .mmio_bus
2416             .insert(
2417                 tpm.clone(),
2418                 arch::layout::TPM_START.0,
2419                 arch::layout::TPM_SIZE,
2420             )
2421             .map_err(DeviceManagerError::BusError)?;
2422 
2423         Ok(tpm)
2424     }
2425 
2426     /// Tries to acquire advisory locks for all disk images.
2427     ///
2428     /// This should only be called when a VM boots or VM state is restored.
2429     /// For live-migration, the locks must be released on the destination side
2430     /// before they are acquired again by the receiving side.
2431     pub fn try_lock_disks(&self) -> DeviceManagerResult<()> {
2432         for dev in &self.block_devices {
2433             let mut dev = dev.lock().unwrap();
2434             dev.try_lock_image()
2435                 .map_err(DeviceManagerError::DiskLockError)?;
2436         }
2437         Ok(())
2438     }
2439 
2440     /// Release all advisory locks held for the disk images.
2441     ///
2442     /// This should only be called when the VM is stopped and the VMM supposed
2443     /// to shut down. A new VMM, either after a live migration or a
2444     /// state save/resume cycle, should then acquire all locks before the VM
2445     /// starts to run.
2446     pub fn release_disk_locks(&self) -> DeviceManagerResult<()> {
2447         for dev in &self.block_devices {
2448             let mut dev = dev.lock().unwrap();
2449             dev.unlock_image()
2450                 .map_err(DeviceManagerError::DiskLockError)?;
2451         }
2452         Ok(())
2453     }
2454 
2455     fn make_virtio_devices(&mut self) -> DeviceManagerResult<Vec<MetaVirtioDevice>> {
2456         let mut devices: Vec<MetaVirtioDevice> = Vec::new();
2457 
2458         // Create "standard" virtio devices (net/block/rng)
2459         devices.append(&mut self.make_virtio_block_devices()?);
2460         devices.append(&mut self.make_virtio_net_devices()?);
2461         devices.append(&mut self.make_virtio_rng_devices()?);
2462 
2463         // Add virtio-fs if required
2464         devices.append(&mut self.make_virtio_fs_devices()?);
2465 
2466         // Add virtio-pmem if required
2467         devices.append(&mut self.make_virtio_pmem_devices()?);
2468 
2469         // Add virtio-vsock if required
2470         devices.append(&mut self.make_virtio_vsock_devices()?);
2471 
2472         devices.append(&mut self.make_virtio_mem_devices()?);
2473 
2474         // Add virtio-balloon if required
2475         devices.append(&mut self.make_virtio_balloon_devices()?);
2476 
2477         // Add virtio-watchdog device
2478         devices.append(&mut self.make_virtio_watchdog_devices()?);
2479 
2480         // Add vDPA devices if required
2481         devices.append(&mut self.make_vdpa_devices()?);
2482 
2483         Ok(devices)
2484     }
2485 
2486     // Cache whether aio is supported to avoid checking for very block device
2487     fn aio_is_supported(&mut self) -> bool {
2488         if let Some(supported) = self.aio_supported {
2489             return supported;
2490         }
2491 
2492         let supported = block_aio_is_supported();
2493         self.aio_supported = Some(supported);
2494         supported
2495     }
2496 
2497     // Cache whether io_uring is supported to avoid probing for very block device
2498     fn io_uring_is_supported(&mut self) -> bool {
2499         if let Some(supported) = self.io_uring_supported {
2500             return supported;
2501         }
2502 
2503         let supported = block_io_uring_is_supported();
2504         self.io_uring_supported = Some(supported);
2505         supported
2506     }
2507 
2508     /// Creates a [`MetaVirtioDevice`] from the provided [`DiskConfig`].
2509     ///
2510     /// Depending on the config, this is a [`vhost_user::Blk`] device or a [`virtio_devices::Block`]
2511     /// device.
2512     ///
2513     /// # Arguments
2514     /// - `disk_cfg`: The [`DiskConfig`] used to create the block device.
2515     /// - `is_hotplug`: Whether the device is being hotplugged and the lock for the disk image
2516     ///   should be acquired right away. Locking will only happen for normal block devices, and not
2517     ///   vhost-user devices.
2518     fn make_virtio_block_device(
2519         &mut self,
2520         disk_cfg: &mut DiskConfig,
2521         is_hotplug: bool,
2522     ) -> DeviceManagerResult<MetaVirtioDevice> {
2523         let id = if let Some(id) = &disk_cfg.id {
2524             id.clone()
2525         } else {
2526             let id = self.next_device_name(DISK_DEVICE_NAME_PREFIX)?;
2527             disk_cfg.id = Some(id.clone());
2528             id
2529         };
2530 
2531         info!("Creating virtio-block device: {:?}", disk_cfg);
2532 
2533         let (virtio_device, migratable_device) = if disk_cfg.vhost_user {
2534             if is_hotplug {
2535                 log::debug!("Acquiring image lock for vhost-user block device not supported");
2536             }
2537             let socket = disk_cfg.vhost_socket.as_ref().unwrap().clone();
2538             let vu_cfg = VhostUserConfig {
2539                 socket,
2540                 num_queues: disk_cfg.num_queues,
2541                 queue_size: disk_cfg.queue_size,
2542             };
2543             let vhost_user_block = Arc::new(Mutex::new(
2544                 match virtio_devices::vhost_user::Blk::new(
2545                     id.clone(),
2546                     vu_cfg,
2547                     self.seccomp_action.clone(),
2548                     self.exit_evt
2549                         .try_clone()
2550                         .map_err(DeviceManagerError::EventFd)?,
2551                     self.force_iommu,
2552                     state_from_id(self.snapshot.as_ref(), id.as_str())
2553                         .map_err(DeviceManagerError::RestoreGetState)?,
2554                 ) {
2555                     Ok(vub_device) => vub_device,
2556                     Err(e) => {
2557                         return Err(DeviceManagerError::CreateVhostUserBlk(e));
2558                     }
2559                 },
2560             ));
2561 
2562             (
2563                 Arc::clone(&vhost_user_block) as Arc<Mutex<dyn virtio_devices::VirtioDevice>>,
2564                 vhost_user_block as Arc<Mutex<dyn Migratable>>,
2565             )
2566         } else {
2567             let mut options = OpenOptions::new();
2568             options.read(true);
2569             options.write(!disk_cfg.readonly);
2570             if disk_cfg.direct {
2571                 options.custom_flags(libc::O_DIRECT);
2572             }
2573             // Open block device path
2574             let mut file: File = options
2575                 .open(
2576                     disk_cfg
2577                         .path
2578                         .as_ref()
2579                         .ok_or(DeviceManagerError::NoDiskPath)?
2580                         .clone(),
2581                 )
2582                 .map_err(DeviceManagerError::Disk)?;
2583             let image_type =
2584                 detect_image_type(&mut file).map_err(DeviceManagerError::DetectImageType)?;
2585 
2586             let image = match image_type {
2587                 ImageType::FixedVhd => {
2588                     // Use asynchronous backend relying on io_uring if the
2589                     // syscalls are supported.
2590                     if cfg!(feature = "io_uring")
2591                         && !disk_cfg.disable_io_uring
2592                         && self.io_uring_is_supported()
2593                     {
2594                         info!("Using asynchronous fixed VHD disk file (io_uring)");
2595 
2596                         #[cfg(not(feature = "io_uring"))]
2597                         unreachable!("Checked in if statement above");
2598                         #[cfg(feature = "io_uring")]
2599                         {
2600                             Box::new(
2601                                 FixedVhdDiskAsync::new(file)
2602                                     .map_err(DeviceManagerError::CreateFixedVhdDiskAsync)?,
2603                             ) as Box<dyn DiskFile>
2604                         }
2605                     } else {
2606                         info!("Using synchronous fixed VHD disk file");
2607                         Box::new(
2608                             FixedVhdDiskSync::new(file)
2609                                 .map_err(DeviceManagerError::CreateFixedVhdDiskSync)?,
2610                         ) as Box<dyn DiskFile>
2611                     }
2612                 }
2613                 ImageType::Raw => {
2614                     // Use asynchronous backend relying on io_uring if the
2615                     // syscalls are supported.
2616                     if cfg!(feature = "io_uring")
2617                         && !disk_cfg.disable_io_uring
2618                         && self.io_uring_is_supported()
2619                     {
2620                         info!("Using asynchronous RAW disk file (io_uring)");
2621 
2622                         #[cfg(not(feature = "io_uring"))]
2623                         unreachable!("Checked in if statement above");
2624                         #[cfg(feature = "io_uring")]
2625                         {
2626                             Box::new(RawFileDisk::new(file)) as Box<dyn DiskFile>
2627                         }
2628                     } else if !disk_cfg.disable_aio && self.aio_is_supported() {
2629                         info!("Using asynchronous RAW disk file (aio)");
2630                         Box::new(RawFileDiskAio::new(file)) as Box<dyn DiskFile>
2631                     } else {
2632                         info!("Using synchronous RAW disk file");
2633                         Box::new(RawFileDiskSync::new(file)) as Box<dyn DiskFile>
2634                     }
2635                 }
2636                 ImageType::Qcow2 => {
2637                     info!("Using synchronous QCOW2 disk file");
2638                     Box::new(
2639                         QcowDiskSync::new(file, disk_cfg.direct)
2640                             .map_err(DeviceManagerError::CreateQcowDiskSync)?,
2641                     ) as Box<dyn DiskFile>
2642                 }
2643                 ImageType::Vhdx => {
2644                     info!("Using synchronous VHDX disk file");
2645                     Box::new(
2646                         VhdxDiskSync::new(file)
2647                             .map_err(DeviceManagerError::CreateFixedVhdxDiskSync)?,
2648                     ) as Box<dyn DiskFile>
2649                 }
2650             };
2651 
2652             let rate_limit_group =
2653                 if let Some(rate_limiter_cfg) = disk_cfg.rate_limiter_config.as_ref() {
2654                     // Create an anonymous RateLimiterGroup that is dropped when the Disk
2655                     // is dropped.
2656                     let bw = rate_limiter_cfg.bandwidth.unwrap_or_default();
2657                     let ops = rate_limiter_cfg.ops.unwrap_or_default();
2658                     let mut rate_limit_group = RateLimiterGroup::new(
2659                         disk_cfg.id.as_ref().unwrap(),
2660                         bw.size,
2661                         bw.one_time_burst.unwrap_or(0),
2662                         bw.refill_time,
2663                         ops.size,
2664                         ops.one_time_burst.unwrap_or(0),
2665                         ops.refill_time,
2666                     )
2667                     .map_err(DeviceManagerError::RateLimiterGroupCreate)?;
2668 
2669                     rate_limit_group
2670                         .start_thread(
2671                             self.exit_evt
2672                                 .try_clone()
2673                                 .map_err(DeviceManagerError::EventFd)?,
2674                         )
2675                         .unwrap();
2676 
2677                     Some(Arc::new(rate_limit_group))
2678                 } else if let Some(rate_limit_group) = disk_cfg.rate_limit_group.as_ref() {
2679                     self.rate_limit_groups.get(rate_limit_group).cloned()
2680                 } else {
2681                     None
2682                 };
2683 
2684             let queue_affinity = if let Some(queue_affinity) = disk_cfg.queue_affinity.as_ref() {
2685                 queue_affinity
2686                     .iter()
2687                     .map(|a| (a.queue_index, a.host_cpus.clone()))
2688                     .collect()
2689             } else {
2690                 BTreeMap::new()
2691             };
2692 
2693             let mut virtio_block = virtio_devices::Block::new(
2694                 id.clone(),
2695                 image,
2696                 disk_cfg
2697                     .path
2698                     .as_ref()
2699                     .ok_or(DeviceManagerError::NoDiskPath)?
2700                     .clone(),
2701                 disk_cfg.readonly,
2702                 self.force_iommu | disk_cfg.iommu,
2703                 disk_cfg.num_queues,
2704                 disk_cfg.queue_size,
2705                 disk_cfg.serial.clone(),
2706                 self.seccomp_action.clone(),
2707                 rate_limit_group,
2708                 self.exit_evt
2709                     .try_clone()
2710                     .map_err(DeviceManagerError::EventFd)?,
2711                 state_from_id(self.snapshot.as_ref(), id.as_str())
2712                     .map_err(DeviceManagerError::RestoreGetState)?,
2713                 queue_affinity,
2714             )
2715             .map_err(DeviceManagerError::CreateVirtioBlock)?;
2716 
2717             // We lock the file here only for hotplugging. In normal operation,
2718             // state save/resume, and live-migration, locking is part of the outer control flow
2719             // to ensure proper order of (un)locking.
2720             if is_hotplug {
2721                 log::debug!("Acquiring lock for hotplugged image");
2722                 virtio_block
2723                     .try_lock_image()
2724                     .map_err(DeviceManagerError::DiskLockError)?;
2725             }
2726 
2727             let virtio_block = Arc::new(Mutex::new(virtio_block));
2728 
2729             self.block_devices.push(virtio_block.clone());
2730 
2731             (
2732                 Arc::clone(&virtio_block) as Arc<Mutex<dyn virtio_devices::VirtioDevice>>,
2733                 virtio_block as Arc<Mutex<dyn Migratable>>,
2734             )
2735         };
2736 
2737         // Fill the device tree with a new node. In case of restore, we
2738         // know there is nothing to do, so we can simply override the
2739         // existing entry.
2740         self.device_tree
2741             .lock()
2742             .unwrap()
2743             .insert(id.clone(), device_node!(id, migratable_device));
2744 
2745         Ok(MetaVirtioDevice {
2746             virtio_device,
2747             iommu: disk_cfg.iommu,
2748             id,
2749             pci_segment: disk_cfg.pci_segment,
2750             dma_handler: None,
2751         })
2752     }
2753 
2754     fn make_virtio_block_devices(&mut self) -> DeviceManagerResult<Vec<MetaVirtioDevice>> {
2755         let mut devices = Vec::new();
2756 
2757         let mut block_devices = self.config.lock().unwrap().disks.clone();
2758         if let Some(disk_list_cfg) = &mut block_devices {
2759             for disk_cfg in disk_list_cfg.iter_mut() {
2760                 devices.push(self.make_virtio_block_device(disk_cfg, false)?);
2761             }
2762         }
2763         self.config.lock().unwrap().disks = block_devices;
2764 
2765         Ok(devices)
2766     }
2767 
2768     fn make_virtio_net_device(
2769         &mut self,
2770         net_cfg: &mut NetConfig,
2771     ) -> DeviceManagerResult<MetaVirtioDevice> {
2772         let id = if let Some(id) = &net_cfg.id {
2773             id.clone()
2774         } else {
2775             let id = self.next_device_name(NET_DEVICE_NAME_PREFIX)?;
2776             net_cfg.id = Some(id.clone());
2777             id
2778         };
2779         info!("Creating virtio-net device: {:?}", net_cfg);
2780 
2781         let (virtio_device, migratable_device) = if net_cfg.vhost_user {
2782             let socket = net_cfg.vhost_socket.as_ref().unwrap().clone();
2783             let vu_cfg = VhostUserConfig {
2784                 socket,
2785                 num_queues: net_cfg.num_queues,
2786                 queue_size: net_cfg.queue_size,
2787             };
2788             let server = match net_cfg.vhost_mode {
2789                 VhostMode::Client => false,
2790                 VhostMode::Server => true,
2791             };
2792             let vhost_user_net = Arc::new(Mutex::new(
2793                 match virtio_devices::vhost_user::Net::new(
2794                     id.clone(),
2795                     net_cfg.mac,
2796                     net_cfg.mtu,
2797                     vu_cfg,
2798                     server,
2799                     self.seccomp_action.clone(),
2800                     self.exit_evt
2801                         .try_clone()
2802                         .map_err(DeviceManagerError::EventFd)?,
2803                     self.force_iommu,
2804                     state_from_id(self.snapshot.as_ref(), id.as_str())
2805                         .map_err(DeviceManagerError::RestoreGetState)?,
2806                     net_cfg.offload_tso,
2807                     net_cfg.offload_ufo,
2808                     net_cfg.offload_csum,
2809                 ) {
2810                     Ok(vun_device) => vun_device,
2811                     Err(e) => {
2812                         return Err(DeviceManagerError::CreateVhostUserNet(e));
2813                     }
2814                 },
2815             ));
2816 
2817             (
2818                 Arc::clone(&vhost_user_net) as Arc<Mutex<dyn virtio_devices::VirtioDevice>>,
2819                 vhost_user_net as Arc<Mutex<dyn Migratable>>,
2820             )
2821         } else {
2822             let state = state_from_id(self.snapshot.as_ref(), id.as_str())
2823                 .map_err(DeviceManagerError::RestoreGetState)?;
2824             let virtio_net = if let Some(ref tap_if_name) = net_cfg.tap {
2825                 Arc::new(Mutex::new(
2826                     virtio_devices::Net::new(
2827                         id.clone(),
2828                         Some(tap_if_name),
2829                         Some(net_cfg.ip),
2830                         Some(net_cfg.mask),
2831                         Some(net_cfg.mac),
2832                         &mut net_cfg.host_mac,
2833                         net_cfg.mtu,
2834                         self.force_iommu | net_cfg.iommu,
2835                         net_cfg.num_queues,
2836                         net_cfg.queue_size,
2837                         self.seccomp_action.clone(),
2838                         net_cfg.rate_limiter_config,
2839                         self.exit_evt
2840                             .try_clone()
2841                             .map_err(DeviceManagerError::EventFd)?,
2842                         state,
2843                         net_cfg.offload_tso,
2844                         net_cfg.offload_ufo,
2845                         net_cfg.offload_csum,
2846                     )
2847                     .map_err(DeviceManagerError::CreateVirtioNet)?,
2848                 ))
2849             } else if let Some(fds) = &net_cfg.fds {
2850                 let net = virtio_devices::Net::from_tap_fds(
2851                     id.clone(),
2852                     fds,
2853                     Some(net_cfg.mac),
2854                     net_cfg.mtu,
2855                     self.force_iommu | net_cfg.iommu,
2856                     net_cfg.queue_size,
2857                     self.seccomp_action.clone(),
2858                     net_cfg.rate_limiter_config,
2859                     self.exit_evt
2860                         .try_clone()
2861                         .map_err(DeviceManagerError::EventFd)?,
2862                     state,
2863                     net_cfg.offload_tso,
2864                     net_cfg.offload_ufo,
2865                     net_cfg.offload_csum,
2866                 )
2867                 .map_err(DeviceManagerError::CreateVirtioNet)?;
2868 
2869                 // SAFETY: 'fds' are valid because TAP devices are created successfully
2870                 unsafe {
2871                     self.config.lock().unwrap().add_preserved_fds(fds.clone());
2872                 }
2873 
2874                 Arc::new(Mutex::new(net))
2875             } else {
2876                 Arc::new(Mutex::new(
2877                     virtio_devices::Net::new(
2878                         id.clone(),
2879                         None,
2880                         Some(net_cfg.ip),
2881                         Some(net_cfg.mask),
2882                         Some(net_cfg.mac),
2883                         &mut net_cfg.host_mac,
2884                         net_cfg.mtu,
2885                         self.force_iommu | net_cfg.iommu,
2886                         net_cfg.num_queues,
2887                         net_cfg.queue_size,
2888                         self.seccomp_action.clone(),
2889                         net_cfg.rate_limiter_config,
2890                         self.exit_evt
2891                             .try_clone()
2892                             .map_err(DeviceManagerError::EventFd)?,
2893                         state,
2894                         net_cfg.offload_tso,
2895                         net_cfg.offload_ufo,
2896                         net_cfg.offload_csum,
2897                     )
2898                     .map_err(DeviceManagerError::CreateVirtioNet)?,
2899                 ))
2900             };
2901 
2902             (
2903                 Arc::clone(&virtio_net) as Arc<Mutex<dyn virtio_devices::VirtioDevice>>,
2904                 virtio_net as Arc<Mutex<dyn Migratable>>,
2905             )
2906         };
2907 
2908         // Fill the device tree with a new node. In case of restore, we
2909         // know there is nothing to do, so we can simply override the
2910         // existing entry.
2911         self.device_tree
2912             .lock()
2913             .unwrap()
2914             .insert(id.clone(), device_node!(id, migratable_device));
2915 
2916         Ok(MetaVirtioDevice {
2917             virtio_device,
2918             iommu: net_cfg.iommu,
2919             id,
2920             pci_segment: net_cfg.pci_segment,
2921             dma_handler: None,
2922         })
2923     }
2924 
2925     /// Add virto-net and vhost-user-net devices
2926     fn make_virtio_net_devices(&mut self) -> DeviceManagerResult<Vec<MetaVirtioDevice>> {
2927         let mut devices = Vec::new();
2928         let mut net_devices = self.config.lock().unwrap().net.clone();
2929         if let Some(net_list_cfg) = &mut net_devices {
2930             for net_cfg in net_list_cfg.iter_mut() {
2931                 devices.push(self.make_virtio_net_device(net_cfg)?);
2932             }
2933         }
2934         self.config.lock().unwrap().net = net_devices;
2935 
2936         Ok(devices)
2937     }
2938 
2939     fn make_virtio_rng_devices(&mut self) -> DeviceManagerResult<Vec<MetaVirtioDevice>> {
2940         let mut devices = Vec::new();
2941 
2942         // Add virtio-rng if required
2943         let rng_config = self.config.lock().unwrap().rng.clone();
2944         if let Some(rng_path) = rng_config.src.to_str() {
2945             info!("Creating virtio-rng device: {:?}", rng_config);
2946             let id = String::from(RNG_DEVICE_NAME);
2947 
2948             let virtio_rng_device = Arc::new(Mutex::new(
2949                 virtio_devices::Rng::new(
2950                     id.clone(),
2951                     rng_path,
2952                     self.force_iommu | rng_config.iommu,
2953                     self.seccomp_action.clone(),
2954                     self.exit_evt
2955                         .try_clone()
2956                         .map_err(DeviceManagerError::EventFd)?,
2957                     state_from_id(self.snapshot.as_ref(), id.as_str())
2958                         .map_err(DeviceManagerError::RestoreGetState)?,
2959                 )
2960                 .map_err(DeviceManagerError::CreateVirtioRng)?,
2961             ));
2962             devices.push(MetaVirtioDevice {
2963                 virtio_device: Arc::clone(&virtio_rng_device)
2964                     as Arc<Mutex<dyn virtio_devices::VirtioDevice>>,
2965                 iommu: rng_config.iommu,
2966                 id: id.clone(),
2967                 pci_segment: 0,
2968                 dma_handler: None,
2969             });
2970 
2971             // Fill the device tree with a new node. In case of restore, we
2972             // know there is nothing to do, so we can simply override the
2973             // existing entry.
2974             self.device_tree
2975                 .lock()
2976                 .unwrap()
2977                 .insert(id.clone(), device_node!(id, virtio_rng_device));
2978         }
2979 
2980         Ok(devices)
2981     }
2982 
2983     fn make_virtio_fs_device(
2984         &mut self,
2985         fs_cfg: &mut FsConfig,
2986     ) -> DeviceManagerResult<MetaVirtioDevice> {
2987         let id = if let Some(id) = &fs_cfg.id {
2988             id.clone()
2989         } else {
2990             let id = self.next_device_name(FS_DEVICE_NAME_PREFIX)?;
2991             fs_cfg.id = Some(id.clone());
2992             id
2993         };
2994 
2995         info!("Creating virtio-fs device: {:?}", fs_cfg);
2996 
2997         let mut node = device_node!(id);
2998 
2999         if let Some(fs_socket) = fs_cfg.socket.to_str() {
3000             let virtio_fs_device = Arc::new(Mutex::new(
3001                 virtio_devices::vhost_user::Fs::new(
3002                     id.clone(),
3003                     fs_socket,
3004                     &fs_cfg.tag,
3005                     fs_cfg.num_queues,
3006                     fs_cfg.queue_size,
3007                     None,
3008                     self.seccomp_action.clone(),
3009                     self.exit_evt
3010                         .try_clone()
3011                         .map_err(DeviceManagerError::EventFd)?,
3012                     self.force_iommu,
3013                     state_from_id(self.snapshot.as_ref(), id.as_str())
3014                         .map_err(DeviceManagerError::RestoreGetState)?,
3015                 )
3016                 .map_err(DeviceManagerError::CreateVirtioFs)?,
3017             ));
3018 
3019             // Update the device tree with the migratable device.
3020             node.migratable = Some(Arc::clone(&virtio_fs_device) as Arc<Mutex<dyn Migratable>>);
3021             self.device_tree.lock().unwrap().insert(id.clone(), node);
3022 
3023             Ok(MetaVirtioDevice {
3024                 virtio_device: Arc::clone(&virtio_fs_device)
3025                     as Arc<Mutex<dyn virtio_devices::VirtioDevice>>,
3026                 iommu: false,
3027                 id,
3028                 pci_segment: fs_cfg.pci_segment,
3029                 dma_handler: None,
3030             })
3031         } else {
3032             Err(DeviceManagerError::NoVirtioFsSock)
3033         }
3034     }
3035 
3036     fn make_virtio_fs_devices(&mut self) -> DeviceManagerResult<Vec<MetaVirtioDevice>> {
3037         let mut devices = Vec::new();
3038 
3039         let mut fs_devices = self.config.lock().unwrap().fs.clone();
3040         if let Some(fs_list_cfg) = &mut fs_devices {
3041             for fs_cfg in fs_list_cfg.iter_mut() {
3042                 devices.push(self.make_virtio_fs_device(fs_cfg)?);
3043             }
3044         }
3045         self.config.lock().unwrap().fs = fs_devices;
3046 
3047         Ok(devices)
3048     }
3049 
3050     fn make_virtio_pmem_device(
3051         &mut self,
3052         pmem_cfg: &mut PmemConfig,
3053     ) -> DeviceManagerResult<MetaVirtioDevice> {
3054         let id = if let Some(id) = &pmem_cfg.id {
3055             id.clone()
3056         } else {
3057             let id = self.next_device_name(PMEM_DEVICE_NAME_PREFIX)?;
3058             pmem_cfg.id = Some(id.clone());
3059             id
3060         };
3061 
3062         info!("Creating virtio-pmem device: {:?}", pmem_cfg);
3063 
3064         let mut node = device_node!(id);
3065 
3066         // Look for the id in the device tree. If it can be found, that means
3067         // the device is being restored, otherwise it's created from scratch.
3068         let region_range = if let Some(node) = self.device_tree.lock().unwrap().get(&id) {
3069             info!("Restoring virtio-pmem {} resources", id);
3070 
3071             let mut region_range: Option<(u64, u64)> = None;
3072             for resource in node.resources.iter() {
3073                 match resource {
3074                     Resource::MmioAddressRange { base, size } => {
3075                         if region_range.is_some() {
3076                             return Err(DeviceManagerError::ResourceAlreadyExists);
3077                         }
3078 
3079                         region_range = Some((*base, *size));
3080                     }
3081                     _ => {
3082                         error!("Unexpected resource {:?} for {}", resource, id);
3083                     }
3084                 }
3085             }
3086 
3087             if region_range.is_none() {
3088                 return Err(DeviceManagerError::MissingVirtioPmemResources);
3089             }
3090 
3091             region_range
3092         } else {
3093             None
3094         };
3095 
3096         let (custom_flags, set_len) = if pmem_cfg.file.is_dir() {
3097             if pmem_cfg.size.is_none() {
3098                 return Err(DeviceManagerError::PmemWithDirectorySizeMissing);
3099             }
3100             (O_TMPFILE, true)
3101         } else {
3102             (0, false)
3103         };
3104 
3105         let mut file = OpenOptions::new()
3106             .read(true)
3107             .write(!pmem_cfg.discard_writes)
3108             .custom_flags(custom_flags)
3109             .open(&pmem_cfg.file)
3110             .map_err(DeviceManagerError::PmemFileOpen)?;
3111 
3112         let size = if let Some(size) = pmem_cfg.size {
3113             if set_len {
3114                 file.set_len(size)
3115                     .map_err(DeviceManagerError::PmemFileSetLen)?;
3116             }
3117             size
3118         } else {
3119             file.seek(SeekFrom::End(0))
3120                 .map_err(DeviceManagerError::PmemFileSetLen)?
3121         };
3122 
3123         if size % 0x20_0000 != 0 {
3124             return Err(DeviceManagerError::PmemSizeNotAligned);
3125         }
3126 
3127         let (region_base, region_size) = if let Some((base, size)) = region_range {
3128             // The memory needs to be 2MiB aligned in order to support
3129             // hugepages.
3130             self.pci_segments[pmem_cfg.pci_segment as usize]
3131                 .mem64_allocator
3132                 .lock()
3133                 .unwrap()
3134                 .allocate(
3135                     Some(GuestAddress(base)),
3136                     size as GuestUsize,
3137                     Some(0x0020_0000),
3138                 )
3139                 .ok_or(DeviceManagerError::PmemRangeAllocation)?;
3140 
3141             (base, size)
3142         } else {
3143             // The memory needs to be 2MiB aligned in order to support
3144             // hugepages.
3145             let base = self.pci_segments[pmem_cfg.pci_segment as usize]
3146                 .mem64_allocator
3147                 .lock()
3148                 .unwrap()
3149                 .allocate(None, size as GuestUsize, Some(0x0020_0000))
3150                 .ok_or(DeviceManagerError::PmemRangeAllocation)?;
3151 
3152             (base.raw_value(), size)
3153         };
3154 
3155         let cloned_file = file.try_clone().map_err(DeviceManagerError::CloneFile)?;
3156         let mmap_region = MmapRegion::build(
3157             Some(FileOffset::new(cloned_file, 0)),
3158             region_size as usize,
3159             PROT_READ | PROT_WRITE,
3160             MAP_NORESERVE
3161                 | if pmem_cfg.discard_writes {
3162                     MAP_PRIVATE
3163                 } else {
3164                     MAP_SHARED
3165                 },
3166         )
3167         .map_err(DeviceManagerError::NewMmapRegion)?;
3168         let host_addr: u64 = mmap_region.as_ptr() as u64;
3169 
3170         let mem_slot = self
3171             .memory_manager
3172             .lock()
3173             .unwrap()
3174             .create_userspace_mapping(region_base, region_size, host_addr, false, false, false)
3175             .map_err(DeviceManagerError::MemoryManager)?;
3176 
3177         let mapping = virtio_devices::UserspaceMapping {
3178             host_addr,
3179             mem_slot,
3180             addr: GuestAddress(region_base),
3181             len: region_size,
3182             mergeable: false,
3183         };
3184 
3185         let virtio_pmem_device = Arc::new(Mutex::new(
3186             virtio_devices::Pmem::new(
3187                 id.clone(),
3188                 file,
3189                 GuestAddress(region_base),
3190                 mapping,
3191                 mmap_region,
3192                 self.force_iommu | pmem_cfg.iommu,
3193                 self.seccomp_action.clone(),
3194                 self.exit_evt
3195                     .try_clone()
3196                     .map_err(DeviceManagerError::EventFd)?,
3197                 state_from_id(self.snapshot.as_ref(), id.as_str())
3198                     .map_err(DeviceManagerError::RestoreGetState)?,
3199             )
3200             .map_err(DeviceManagerError::CreateVirtioPmem)?,
3201         ));
3202 
3203         // Update the device tree with correct resource information and with
3204         // the migratable device.
3205         node.resources.push(Resource::MmioAddressRange {
3206             base: region_base,
3207             size: region_size,
3208         });
3209         node.migratable = Some(Arc::clone(&virtio_pmem_device) as Arc<Mutex<dyn Migratable>>);
3210         self.device_tree.lock().unwrap().insert(id.clone(), node);
3211 
3212         Ok(MetaVirtioDevice {
3213             virtio_device: Arc::clone(&virtio_pmem_device)
3214                 as Arc<Mutex<dyn virtio_devices::VirtioDevice>>,
3215             iommu: pmem_cfg.iommu,
3216             id,
3217             pci_segment: pmem_cfg.pci_segment,
3218             dma_handler: None,
3219         })
3220     }
3221 
3222     fn make_virtio_pmem_devices(&mut self) -> DeviceManagerResult<Vec<MetaVirtioDevice>> {
3223         let mut devices = Vec::new();
3224         // Add virtio-pmem if required
3225         let mut pmem_devices = self.config.lock().unwrap().pmem.clone();
3226         if let Some(pmem_list_cfg) = &mut pmem_devices {
3227             for pmem_cfg in pmem_list_cfg.iter_mut() {
3228                 devices.push(self.make_virtio_pmem_device(pmem_cfg)?);
3229             }
3230         }
3231         self.config.lock().unwrap().pmem = pmem_devices;
3232 
3233         Ok(devices)
3234     }
3235 
3236     fn make_virtio_vsock_device(
3237         &mut self,
3238         vsock_cfg: &mut VsockConfig,
3239     ) -> DeviceManagerResult<MetaVirtioDevice> {
3240         let id = if let Some(id) = &vsock_cfg.id {
3241             id.clone()
3242         } else {
3243             let id = self.next_device_name(VSOCK_DEVICE_NAME_PREFIX)?;
3244             vsock_cfg.id = Some(id.clone());
3245             id
3246         };
3247 
3248         info!("Creating virtio-vsock device: {:?}", vsock_cfg);
3249 
3250         let socket_path = vsock_cfg
3251             .socket
3252             .to_str()
3253             .ok_or(DeviceManagerError::CreateVsockConvertPath)?;
3254         let backend =
3255             virtio_devices::vsock::VsockUnixBackend::new(vsock_cfg.cid, socket_path.to_string())
3256                 .map_err(DeviceManagerError::CreateVsockBackend)?;
3257 
3258         let vsock_device = Arc::new(Mutex::new(
3259             virtio_devices::Vsock::new(
3260                 id.clone(),
3261                 vsock_cfg.cid,
3262                 vsock_cfg.socket.clone(),
3263                 backend,
3264                 self.force_iommu | vsock_cfg.iommu,
3265                 self.seccomp_action.clone(),
3266                 self.exit_evt
3267                     .try_clone()
3268                     .map_err(DeviceManagerError::EventFd)?,
3269                 state_from_id(self.snapshot.as_ref(), id.as_str())
3270                     .map_err(DeviceManagerError::RestoreGetState)?,
3271             )
3272             .map_err(DeviceManagerError::CreateVirtioVsock)?,
3273         ));
3274 
3275         // Fill the device tree with a new node. In case of restore, we
3276         // know there is nothing to do, so we can simply override the
3277         // existing entry.
3278         self.device_tree
3279             .lock()
3280             .unwrap()
3281             .insert(id.clone(), device_node!(id, vsock_device));
3282 
3283         Ok(MetaVirtioDevice {
3284             virtio_device: Arc::clone(&vsock_device)
3285                 as Arc<Mutex<dyn virtio_devices::VirtioDevice>>,
3286             iommu: vsock_cfg.iommu,
3287             id,
3288             pci_segment: vsock_cfg.pci_segment,
3289             dma_handler: None,
3290         })
3291     }
3292 
3293     fn make_virtio_vsock_devices(&mut self) -> DeviceManagerResult<Vec<MetaVirtioDevice>> {
3294         let mut devices = Vec::new();
3295 
3296         let mut vsock = self.config.lock().unwrap().vsock.clone();
3297         if let Some(ref mut vsock_cfg) = &mut vsock {
3298             devices.push(self.make_virtio_vsock_device(vsock_cfg)?);
3299         }
3300         self.config.lock().unwrap().vsock = vsock;
3301 
3302         Ok(devices)
3303     }
3304 
3305     fn make_virtio_mem_devices(&mut self) -> DeviceManagerResult<Vec<MetaVirtioDevice>> {
3306         let mut devices = Vec::new();
3307 
3308         let mm = self.memory_manager.clone();
3309         let mut mm = mm.lock().unwrap();
3310         for (memory_zone_id, memory_zone) in mm.memory_zones_mut().iter_mut() {
3311             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone_mut() {
3312                 info!("Creating virtio-mem device: id = {}", memory_zone_id);
3313 
3314                 let node_id = numa_node_id_from_memory_zone_id(&self.numa_nodes, memory_zone_id)
3315                     .map(|i| i as u16);
3316 
3317                 let virtio_mem_device = Arc::new(Mutex::new(
3318                     virtio_devices::Mem::new(
3319                         memory_zone_id.clone(),
3320                         virtio_mem_zone.region(),
3321                         self.seccomp_action.clone(),
3322                         node_id,
3323                         virtio_mem_zone.hotplugged_size(),
3324                         virtio_mem_zone.hugepages(),
3325                         self.exit_evt
3326                             .try_clone()
3327                             .map_err(DeviceManagerError::EventFd)?,
3328                         virtio_mem_zone.blocks_state().clone(),
3329                         state_from_id(self.snapshot.as_ref(), memory_zone_id.as_str())
3330                             .map_err(DeviceManagerError::RestoreGetState)?,
3331                     )
3332                     .map_err(DeviceManagerError::CreateVirtioMem)?,
3333                 ));
3334 
3335                 // Update the virtio-mem zone so that it has a handle onto the
3336                 // virtio-mem device, which will be used for triggering a resize
3337                 // if needed.
3338                 virtio_mem_zone.set_virtio_device(Arc::clone(&virtio_mem_device));
3339 
3340                 self.virtio_mem_devices.push(Arc::clone(&virtio_mem_device));
3341 
3342                 devices.push(MetaVirtioDevice {
3343                     virtio_device: Arc::clone(&virtio_mem_device)
3344                         as Arc<Mutex<dyn virtio_devices::VirtioDevice>>,
3345                     iommu: false,
3346                     id: memory_zone_id.clone(),
3347                     pci_segment: 0,
3348                     dma_handler: None,
3349                 });
3350 
3351                 // Fill the device tree with a new node. In case of restore, we
3352                 // know there is nothing to do, so we can simply override the
3353                 // existing entry.
3354                 self.device_tree.lock().unwrap().insert(
3355                     memory_zone_id.clone(),
3356                     device_node!(memory_zone_id, virtio_mem_device),
3357                 );
3358             }
3359         }
3360 
3361         Ok(devices)
3362     }
3363 
3364     #[cfg(feature = "pvmemcontrol")]
3365     fn make_pvmemcontrol_device(
3366         &mut self,
3367     ) -> DeviceManagerResult<(
3368         Arc<PvmemcontrolBusDevice>,
3369         Arc<Mutex<PvmemcontrolPciDevice>>,
3370     )> {
3371         let id = String::from(PVMEMCONTROL_DEVICE_NAME);
3372         let pci_segment_id = 0x0_u16;
3373 
3374         let (pci_segment_id, pci_device_bdf, resources) =
3375             self.pci_resources(&id, pci_segment_id)?;
3376 
3377         info!("Creating pvmemcontrol device: id = {}", id);
3378         let (pvmemcontrol_pci_device, pvmemcontrol_bus_device) =
3379             devices::pvmemcontrol::PvmemcontrolDevice::make_device(
3380                 id.clone(),
3381                 self.memory_manager.lock().unwrap().guest_memory(),
3382             );
3383 
3384         let pvmemcontrol_pci_device = Arc::new(Mutex::new(pvmemcontrol_pci_device));
3385         let pvmemcontrol_bus_device = Arc::new(pvmemcontrol_bus_device);
3386 
3387         let new_resources = self.add_pci_device(
3388             pvmemcontrol_bus_device.clone(),
3389             pvmemcontrol_pci_device.clone(),
3390             pci_segment_id,
3391             pci_device_bdf,
3392             resources,
3393         )?;
3394 
3395         let mut node = device_node!(id, pvmemcontrol_pci_device);
3396 
3397         node.resources = new_resources;
3398         node.pci_bdf = Some(pci_device_bdf);
3399         node.pci_device_handle = None;
3400 
3401         self.device_tree.lock().unwrap().insert(id, node);
3402 
3403         Ok((pvmemcontrol_bus_device, pvmemcontrol_pci_device))
3404     }
3405 
3406     fn make_virtio_balloon_devices(&mut self) -> DeviceManagerResult<Vec<MetaVirtioDevice>> {
3407         let mut devices = Vec::new();
3408 
3409         if let Some(balloon_config) = &self.config.lock().unwrap().balloon {
3410             let id = String::from(BALLOON_DEVICE_NAME);
3411             info!("Creating virtio-balloon device: id = {}", id);
3412 
3413             let virtio_balloon_device = Arc::new(Mutex::new(
3414                 virtio_devices::Balloon::new(
3415                     id.clone(),
3416                     balloon_config.size,
3417                     balloon_config.deflate_on_oom,
3418                     balloon_config.free_page_reporting,
3419                     self.seccomp_action.clone(),
3420                     self.exit_evt
3421                         .try_clone()
3422                         .map_err(DeviceManagerError::EventFd)?,
3423                     state_from_id(self.snapshot.as_ref(), id.as_str())
3424                         .map_err(DeviceManagerError::RestoreGetState)?,
3425                 )
3426                 .map_err(DeviceManagerError::CreateVirtioBalloon)?,
3427             ));
3428 
3429             self.balloon = Some(virtio_balloon_device.clone());
3430 
3431             devices.push(MetaVirtioDevice {
3432                 virtio_device: Arc::clone(&virtio_balloon_device)
3433                     as Arc<Mutex<dyn virtio_devices::VirtioDevice>>,
3434                 iommu: false,
3435                 id: id.clone(),
3436                 pci_segment: 0,
3437                 dma_handler: None,
3438             });
3439 
3440             self.device_tree
3441                 .lock()
3442                 .unwrap()
3443                 .insert(id.clone(), device_node!(id, virtio_balloon_device));
3444         }
3445 
3446         Ok(devices)
3447     }
3448 
3449     fn make_virtio_watchdog_devices(&mut self) -> DeviceManagerResult<Vec<MetaVirtioDevice>> {
3450         let mut devices = Vec::new();
3451 
3452         if !self.config.lock().unwrap().watchdog {
3453             return Ok(devices);
3454         }
3455 
3456         let id = String::from(WATCHDOG_DEVICE_NAME);
3457         info!("Creating virtio-watchdog device: id = {}", id);
3458 
3459         let virtio_watchdog_device = Arc::new(Mutex::new(
3460             virtio_devices::Watchdog::new(
3461                 id.clone(),
3462                 self.reset_evt.try_clone().unwrap(),
3463                 self.seccomp_action.clone(),
3464                 self.exit_evt
3465                     .try_clone()
3466                     .map_err(DeviceManagerError::EventFd)?,
3467                 state_from_id(self.snapshot.as_ref(), id.as_str())
3468                     .map_err(DeviceManagerError::RestoreGetState)?,
3469             )
3470             .map_err(DeviceManagerError::CreateVirtioWatchdog)?,
3471         ));
3472         devices.push(MetaVirtioDevice {
3473             virtio_device: Arc::clone(&virtio_watchdog_device)
3474                 as Arc<Mutex<dyn virtio_devices::VirtioDevice>>,
3475             iommu: false,
3476             id: id.clone(),
3477             pci_segment: 0,
3478             dma_handler: None,
3479         });
3480 
3481         self.device_tree
3482             .lock()
3483             .unwrap()
3484             .insert(id.clone(), device_node!(id, virtio_watchdog_device));
3485 
3486         Ok(devices)
3487     }
3488 
3489     fn make_vdpa_device(
3490         &mut self,
3491         vdpa_cfg: &mut VdpaConfig,
3492     ) -> DeviceManagerResult<MetaVirtioDevice> {
3493         let id = if let Some(id) = &vdpa_cfg.id {
3494             id.clone()
3495         } else {
3496             let id = self.next_device_name(VDPA_DEVICE_NAME_PREFIX)?;
3497             vdpa_cfg.id = Some(id.clone());
3498             id
3499         };
3500 
3501         info!("Creating vDPA device: {:?}", vdpa_cfg);
3502 
3503         let device_path = vdpa_cfg
3504             .path
3505             .to_str()
3506             .ok_or(DeviceManagerError::CreateVdpaConvertPath)?;
3507 
3508         let vdpa_device = Arc::new(Mutex::new(
3509             virtio_devices::Vdpa::new(
3510                 id.clone(),
3511                 device_path,
3512                 self.memory_manager.lock().unwrap().guest_memory(),
3513                 vdpa_cfg.num_queues as u16,
3514                 state_from_id(self.snapshot.as_ref(), id.as_str())
3515                     .map_err(DeviceManagerError::RestoreGetState)?,
3516             )
3517             .map_err(DeviceManagerError::CreateVdpa)?,
3518         ));
3519 
3520         // Create the DMA handler that is required by the vDPA device
3521         let vdpa_mapping = Arc::new(VdpaDmaMapping::new(
3522             Arc::clone(&vdpa_device),
3523             Arc::new(self.memory_manager.lock().unwrap().guest_memory()),
3524         ));
3525 
3526         self.device_tree
3527             .lock()
3528             .unwrap()
3529             .insert(id.clone(), device_node!(id, vdpa_device));
3530 
3531         Ok(MetaVirtioDevice {
3532             virtio_device: vdpa_device as Arc<Mutex<dyn virtio_devices::VirtioDevice>>,
3533             iommu: vdpa_cfg.iommu,
3534             id,
3535             pci_segment: vdpa_cfg.pci_segment,
3536             dma_handler: Some(vdpa_mapping),
3537         })
3538     }
3539 
3540     fn make_vdpa_devices(&mut self) -> DeviceManagerResult<Vec<MetaVirtioDevice>> {
3541         let mut devices = Vec::new();
3542         // Add vdpa if required
3543         let mut vdpa_devices = self.config.lock().unwrap().vdpa.clone();
3544         if let Some(vdpa_list_cfg) = &mut vdpa_devices {
3545             for vdpa_cfg in vdpa_list_cfg.iter_mut() {
3546                 devices.push(self.make_vdpa_device(vdpa_cfg)?);
3547             }
3548         }
3549         self.config.lock().unwrap().vdpa = vdpa_devices;
3550 
3551         Ok(devices)
3552     }
3553 
3554     fn next_device_name(&mut self, prefix: &str) -> DeviceManagerResult<String> {
3555         let start_id = self.device_id_cnt;
3556         loop {
3557             // Generate the temporary name.
3558             let name = format!("{}{}", prefix, self.device_id_cnt);
3559             // Increment the counter.
3560             self.device_id_cnt += Wrapping(1);
3561             // Check if the name is already in use.
3562             if !self.boot_id_list.contains(&name)
3563                 && !self.device_tree.lock().unwrap().contains_key(&name)
3564             {
3565                 return Ok(name);
3566             }
3567 
3568             if self.device_id_cnt == start_id {
3569                 // We went through a full loop and there's nothing else we can
3570                 // do.
3571                 break;
3572             }
3573         }
3574         Err(DeviceManagerError::NoAvailableDeviceName)
3575     }
3576 
3577     fn add_passthrough_device(
3578         &mut self,
3579         device_cfg: &mut DeviceConfig,
3580     ) -> DeviceManagerResult<(PciBdf, String)> {
3581         // If the passthrough device has not been created yet, it is created
3582         // here and stored in the DeviceManager structure for future needs.
3583         if self.passthrough_device.is_none() {
3584             self.passthrough_device = Some(
3585                 self.address_manager
3586                     .vm
3587                     .create_passthrough_device()
3588                     .map_err(|e| DeviceManagerError::CreatePassthroughDevice(e.into()))?,
3589             );
3590         }
3591 
3592         self.add_vfio_device(device_cfg)
3593     }
3594 
3595     fn create_vfio_container(&self) -> DeviceManagerResult<Arc<VfioContainer>> {
3596         let passthrough_device = self
3597             .passthrough_device
3598             .as_ref()
3599             .ok_or(DeviceManagerError::NoDevicePassthroughSupport)?;
3600 
3601         let dup = passthrough_device
3602             .try_clone()
3603             .map_err(DeviceManagerError::VfioCreate)?;
3604 
3605         Ok(Arc::new(
3606             VfioContainer::new(Some(Arc::new(dup))).map_err(DeviceManagerError::VfioCreate)?,
3607         ))
3608     }
3609 
3610     fn add_vfio_device(
3611         &mut self,
3612         device_cfg: &mut DeviceConfig,
3613     ) -> DeviceManagerResult<(PciBdf, String)> {
3614         let vfio_name = if let Some(id) = &device_cfg.id {
3615             id.clone()
3616         } else {
3617             let id = self.next_device_name(VFIO_DEVICE_NAME_PREFIX)?;
3618             device_cfg.id = Some(id.clone());
3619             id
3620         };
3621 
3622         let (pci_segment_id, pci_device_bdf, resources) =
3623             self.pci_resources(&vfio_name, device_cfg.pci_segment)?;
3624 
3625         let mut needs_dma_mapping = false;
3626 
3627         // Here we create a new VFIO container for two reasons. Either this is
3628         // the first VFIO device, meaning we need a new VFIO container, which
3629         // will be shared with other VFIO devices. Or the new VFIO device is
3630         // attached to a vIOMMU, meaning we must create a dedicated VFIO
3631         // container. In the vIOMMU use case, we can't let all devices under
3632         // the same VFIO container since we couldn't map/unmap memory for each
3633         // device. That's simply because the map/unmap operations happen at the
3634         // VFIO container level.
3635         let vfio_container = if device_cfg.iommu {
3636             let vfio_container = self.create_vfio_container()?;
3637 
3638             let vfio_mapping = Arc::new(VfioDmaMapping::new(
3639                 Arc::clone(&vfio_container),
3640                 Arc::new(self.memory_manager.lock().unwrap().guest_memory()),
3641                 Arc::clone(&self.mmio_regions),
3642             ));
3643 
3644             if let Some(iommu) = &self.iommu_device {
3645                 iommu
3646                     .lock()
3647                     .unwrap()
3648                     .add_external_mapping(pci_device_bdf.into(), vfio_mapping);
3649             } else {
3650                 return Err(DeviceManagerError::MissingVirtualIommu);
3651             }
3652 
3653             vfio_container
3654         } else if let Some(vfio_container) = &self.vfio_container {
3655             Arc::clone(vfio_container)
3656         } else {
3657             let vfio_container = self.create_vfio_container()?;
3658             needs_dma_mapping = true;
3659             self.vfio_container = Some(Arc::clone(&vfio_container));
3660 
3661             vfio_container
3662         };
3663 
3664         let vfio_device = VfioDevice::new(&device_cfg.path, Arc::clone(&vfio_container))
3665             .map_err(DeviceManagerError::VfioCreate)?;
3666 
3667         if needs_dma_mapping {
3668             // Register DMA mapping in IOMMU.
3669             // Do not register virtio-mem regions, as they are handled directly by
3670             // virtio-mem device itself.
3671             for (_, zone) in self.memory_manager.lock().unwrap().memory_zones().iter() {
3672                 for region in zone.regions() {
3673                     vfio_container
3674                         .vfio_dma_map(
3675                             region.start_addr().raw_value(),
3676                             region.len(),
3677                             region.as_ptr() as u64,
3678                         )
3679                         .map_err(DeviceManagerError::VfioDmaMap)?;
3680                 }
3681             }
3682 
3683             let vfio_mapping = Arc::new(VfioDmaMapping::new(
3684                 Arc::clone(&vfio_container),
3685                 Arc::new(self.memory_manager.lock().unwrap().guest_memory()),
3686                 Arc::clone(&self.mmio_regions),
3687             ));
3688 
3689             for virtio_mem_device in self.virtio_mem_devices.iter() {
3690                 virtio_mem_device
3691                     .lock()
3692                     .unwrap()
3693                     .add_dma_mapping_handler(
3694                         VirtioMemMappingSource::Container,
3695                         vfio_mapping.clone(),
3696                     )
3697                     .map_err(DeviceManagerError::AddDmaMappingHandlerVirtioMem)?;
3698             }
3699         }
3700 
3701         let legacy_interrupt_group =
3702             if let Some(legacy_interrupt_manager) = &self.legacy_interrupt_manager {
3703                 Some(
3704                     legacy_interrupt_manager
3705                         .create_group(LegacyIrqGroupConfig {
3706                             irq: self.pci_segments[pci_segment_id as usize].pci_irq_slots
3707                                 [pci_device_bdf.device() as usize]
3708                                 as InterruptIndex,
3709                         })
3710                         .map_err(DeviceManagerError::CreateInterruptGroup)?,
3711                 )
3712             } else {
3713                 None
3714             };
3715 
3716         let memory_manager = self.memory_manager.clone();
3717 
3718         let vfio_pci_device = VfioPciDevice::new(
3719             vfio_name.clone(),
3720             &self.address_manager.vm,
3721             vfio_device,
3722             vfio_container,
3723             self.msi_interrupt_manager.clone(),
3724             legacy_interrupt_group,
3725             device_cfg.iommu,
3726             pci_device_bdf,
3727             memory_manager.lock().unwrap().memory_slot_allocator(),
3728             vm_migration::snapshot_from_id(self.snapshot.as_ref(), vfio_name.as_str()),
3729             device_cfg.x_nv_gpudirect_clique,
3730             device_cfg.path.clone(),
3731         )
3732         .map_err(DeviceManagerError::VfioPciCreate)?;
3733 
3734         let vfio_pci_device = Arc::new(Mutex::new(vfio_pci_device));
3735 
3736         let new_resources = self.add_pci_device(
3737             vfio_pci_device.clone(),
3738             vfio_pci_device.clone(),
3739             pci_segment_id,
3740             pci_device_bdf,
3741             resources,
3742         )?;
3743 
3744         vfio_pci_device
3745             .lock()
3746             .unwrap()
3747             .map_mmio_regions()
3748             .map_err(DeviceManagerError::VfioMapRegion)?;
3749 
3750         for mmio_region in vfio_pci_device.lock().unwrap().mmio_regions() {
3751             self.mmio_regions.lock().unwrap().push(mmio_region);
3752         }
3753 
3754         let mut node = device_node!(vfio_name, vfio_pci_device);
3755 
3756         // Update the device tree with correct resource information.
3757         node.resources = new_resources;
3758         node.pci_bdf = Some(pci_device_bdf);
3759         node.pci_device_handle = Some(PciDeviceHandle::Vfio(vfio_pci_device));
3760 
3761         self.device_tree
3762             .lock()
3763             .unwrap()
3764             .insert(vfio_name.clone(), node);
3765 
3766         Ok((pci_device_bdf, vfio_name))
3767     }
3768 
3769     fn add_pci_device(
3770         &mut self,
3771         bus_device: Arc<dyn BusDeviceSync>,
3772         pci_device: Arc<Mutex<dyn PciDevice>>,
3773         segment_id: u16,
3774         bdf: PciBdf,
3775         resources: Option<Vec<Resource>>,
3776     ) -> DeviceManagerResult<Vec<Resource>> {
3777         let bars = pci_device
3778             .lock()
3779             .unwrap()
3780             .allocate_bars(
3781                 &self.address_manager.allocator,
3782                 &mut self.pci_segments[segment_id as usize]
3783                     .mem32_allocator
3784                     .lock()
3785                     .unwrap(),
3786                 &mut self.pci_segments[segment_id as usize]
3787                     .mem64_allocator
3788                     .lock()
3789                     .unwrap(),
3790                 resources,
3791             )
3792             .map_err(DeviceManagerError::AllocateBars)?;
3793 
3794         let mut pci_bus = self.pci_segments[segment_id as usize]
3795             .pci_bus
3796             .lock()
3797             .unwrap();
3798 
3799         pci_bus
3800             .add_device(bdf.device() as u32, pci_device)
3801             .map_err(DeviceManagerError::AddPciDevice)?;
3802 
3803         self.bus_devices.push(Arc::clone(&bus_device));
3804 
3805         pci_bus
3806             .register_mapping(
3807                 bus_device,
3808                 self.address_manager.io_bus.as_ref(),
3809                 self.address_manager.mmio_bus.as_ref(),
3810                 bars.clone(),
3811             )
3812             .map_err(DeviceManagerError::AddPciDevice)?;
3813 
3814         let mut new_resources = Vec::new();
3815         for bar in bars {
3816             new_resources.push(Resource::PciBar {
3817                 index: bar.idx(),
3818                 base: bar.addr(),
3819                 size: bar.size(),
3820                 type_: bar.region_type().into(),
3821                 prefetchable: bar.prefetchable().into(),
3822             });
3823         }
3824 
3825         Ok(new_resources)
3826     }
3827 
3828     fn add_vfio_devices(&mut self) -> DeviceManagerResult<Vec<PciBdf>> {
3829         let mut iommu_attached_device_ids = Vec::new();
3830         let mut devices = self.config.lock().unwrap().devices.clone();
3831 
3832         if let Some(device_list_cfg) = &mut devices {
3833             for device_cfg in device_list_cfg.iter_mut() {
3834                 let (device_id, _) = self.add_passthrough_device(device_cfg)?;
3835                 if device_cfg.iommu && self.iommu_device.is_some() {
3836                     iommu_attached_device_ids.push(device_id);
3837                 }
3838             }
3839         }
3840 
3841         // Update the list of devices
3842         self.config.lock().unwrap().devices = devices;
3843 
3844         Ok(iommu_attached_device_ids)
3845     }
3846 
3847     fn add_vfio_user_device(
3848         &mut self,
3849         device_cfg: &mut UserDeviceConfig,
3850     ) -> DeviceManagerResult<(PciBdf, String)> {
3851         let vfio_user_name = if let Some(id) = &device_cfg.id {
3852             id.clone()
3853         } else {
3854             let id = self.next_device_name(VFIO_USER_DEVICE_NAME_PREFIX)?;
3855             device_cfg.id = Some(id.clone());
3856             id
3857         };
3858 
3859         let (pci_segment_id, pci_device_bdf, resources) =
3860             self.pci_resources(&vfio_user_name, device_cfg.pci_segment)?;
3861 
3862         let legacy_interrupt_group =
3863             if let Some(legacy_interrupt_manager) = &self.legacy_interrupt_manager {
3864                 Some(
3865                     legacy_interrupt_manager
3866                         .create_group(LegacyIrqGroupConfig {
3867                             irq: self.pci_segments[pci_segment_id as usize].pci_irq_slots
3868                                 [pci_device_bdf.device() as usize]
3869                                 as InterruptIndex,
3870                         })
3871                         .map_err(DeviceManagerError::CreateInterruptGroup)?,
3872                 )
3873             } else {
3874                 None
3875             };
3876 
3877         let client = Arc::new(Mutex::new(
3878             vfio_user::Client::new(&device_cfg.socket)
3879                 .map_err(DeviceManagerError::VfioUserCreateClient)?,
3880         ));
3881 
3882         let memory_manager = self.memory_manager.clone();
3883 
3884         let mut vfio_user_pci_device = VfioUserPciDevice::new(
3885             vfio_user_name.clone(),
3886             &self.address_manager.vm,
3887             client.clone(),
3888             self.msi_interrupt_manager.clone(),
3889             legacy_interrupt_group,
3890             pci_device_bdf,
3891             memory_manager.lock().unwrap().memory_slot_allocator(),
3892             vm_migration::snapshot_from_id(self.snapshot.as_ref(), vfio_user_name.as_str()),
3893         )
3894         .map_err(DeviceManagerError::VfioUserCreate)?;
3895 
3896         let memory = self.memory_manager.lock().unwrap().guest_memory();
3897         let vfio_user_mapping = Arc::new(VfioUserDmaMapping::new(client, Arc::new(memory)));
3898         for virtio_mem_device in self.virtio_mem_devices.iter() {
3899             virtio_mem_device
3900                 .lock()
3901                 .unwrap()
3902                 .add_dma_mapping_handler(
3903                     VirtioMemMappingSource::Device(pci_device_bdf.into()),
3904                     vfio_user_mapping.clone(),
3905                 )
3906                 .map_err(DeviceManagerError::AddDmaMappingHandlerVirtioMem)?;
3907         }
3908 
3909         for (_, zone) in self.memory_manager.lock().unwrap().memory_zones().iter() {
3910             for region in zone.regions() {
3911                 vfio_user_pci_device
3912                     .dma_map(region)
3913                     .map_err(DeviceManagerError::VfioUserDmaMap)?;
3914             }
3915         }
3916 
3917         let vfio_user_pci_device = Arc::new(Mutex::new(vfio_user_pci_device));
3918 
3919         let new_resources = self.add_pci_device(
3920             vfio_user_pci_device.clone(),
3921             vfio_user_pci_device.clone(),
3922             pci_segment_id,
3923             pci_device_bdf,
3924             resources,
3925         )?;
3926 
3927         // Note it is required to call 'add_pci_device()' in advance to have the list of
3928         // mmio regions provisioned correctly
3929         vfio_user_pci_device
3930             .lock()
3931             .unwrap()
3932             .map_mmio_regions()
3933             .map_err(DeviceManagerError::VfioUserMapRegion)?;
3934 
3935         let mut node = device_node!(vfio_user_name, vfio_user_pci_device);
3936 
3937         // Update the device tree with correct resource information.
3938         node.resources = new_resources;
3939         node.pci_bdf = Some(pci_device_bdf);
3940         node.pci_device_handle = Some(PciDeviceHandle::VfioUser(vfio_user_pci_device));
3941 
3942         self.device_tree
3943             .lock()
3944             .unwrap()
3945             .insert(vfio_user_name.clone(), node);
3946 
3947         Ok((pci_device_bdf, vfio_user_name))
3948     }
3949 
3950     fn add_user_devices(&mut self) -> DeviceManagerResult<Vec<PciBdf>> {
3951         let mut user_devices = self.config.lock().unwrap().user_devices.clone();
3952 
3953         if let Some(device_list_cfg) = &mut user_devices {
3954             for device_cfg in device_list_cfg.iter_mut() {
3955                 let (_device_id, _id) = self.add_vfio_user_device(device_cfg)?;
3956             }
3957         }
3958 
3959         // Update the list of devices
3960         self.config.lock().unwrap().user_devices = user_devices;
3961 
3962         Ok(vec![])
3963     }
3964 
3965     fn add_virtio_pci_device(
3966         &mut self,
3967         virtio_device: Arc<Mutex<dyn virtio_devices::VirtioDevice>>,
3968         iommu_mapping: &Option<Arc<IommuMapping>>,
3969         virtio_device_id: String,
3970         pci_segment_id: u16,
3971         dma_handler: Option<Arc<dyn ExternalDmaMapping>>,
3972     ) -> DeviceManagerResult<PciBdf> {
3973         let id = format!("{VIRTIO_PCI_DEVICE_NAME_PREFIX}-{virtio_device_id}");
3974 
3975         // Add the new virtio-pci node to the device tree.
3976         let mut node = device_node!(id);
3977         node.children = vec![virtio_device_id.clone()];
3978 
3979         let (pci_segment_id, pci_device_bdf, resources) =
3980             self.pci_resources(&id, pci_segment_id)?;
3981 
3982         // Update the existing virtio node by setting the parent.
3983         if let Some(node) = self.device_tree.lock().unwrap().get_mut(&virtio_device_id) {
3984             node.parent = Some(id.clone());
3985         } else {
3986             return Err(DeviceManagerError::MissingNode);
3987         }
3988 
3989         // Allows support for one MSI-X vector per queue. It also adds 1
3990         // as we need to take into account the dedicated vector to notify
3991         // about a virtio config change.
3992         let msix_num = (virtio_device.lock().unwrap().queue_max_sizes().len() + 1) as u16;
3993 
3994         // Create the AccessPlatform trait from the implementation IommuMapping.
3995         // This will provide address translation for any virtio device sitting
3996         // behind a vIOMMU.
3997         let mut access_platform: Option<Arc<dyn AccessPlatform>> = None;
3998 
3999         if let Some(mapping) = iommu_mapping {
4000             access_platform = Some(Arc::new(AccessPlatformMapping::new(
4001                 pci_device_bdf.into(),
4002                 mapping.clone(),
4003             )));
4004         }
4005 
4006         // If SEV-SNP is enabled create the AccessPlatform from SevSnpPageAccessProxy
4007         #[cfg(feature = "sev_snp")]
4008         if self.config.lock().unwrap().is_sev_snp_enabled() {
4009             access_platform = Some(Arc::new(SevSnpPageAccessProxy::new(
4010                 self.address_manager.vm.clone(),
4011             )));
4012         }
4013 
4014         let memory = self.memory_manager.lock().unwrap().guest_memory();
4015 
4016         // Map DMA ranges if a DMA handler is available and if the device is
4017         // not attached to a virtual IOMMU.
4018         if let Some(dma_handler) = &dma_handler {
4019             if iommu_mapping.is_some() {
4020                 if let Some(iommu) = &self.iommu_device {
4021                     iommu
4022                         .lock()
4023                         .unwrap()
4024                         .add_external_mapping(pci_device_bdf.into(), dma_handler.clone());
4025                 } else {
4026                     return Err(DeviceManagerError::MissingVirtualIommu);
4027                 }
4028             } else {
4029                 // Let every virtio-mem device handle the DMA map/unmap through the
4030                 // DMA handler provided.
4031                 for virtio_mem_device in self.virtio_mem_devices.iter() {
4032                     virtio_mem_device
4033                         .lock()
4034                         .unwrap()
4035                         .add_dma_mapping_handler(
4036                             VirtioMemMappingSource::Device(pci_device_bdf.into()),
4037                             dma_handler.clone(),
4038                         )
4039                         .map_err(DeviceManagerError::AddDmaMappingHandlerVirtioMem)?;
4040                 }
4041 
4042                 // Do not register virtio-mem regions, as they are handled directly by
4043                 // virtio-mem devices.
4044                 for (_, zone) in self.memory_manager.lock().unwrap().memory_zones().iter() {
4045                     for region in zone.regions() {
4046                         let gpa = region.start_addr().0;
4047                         let size = region.len();
4048                         dma_handler
4049                             .map(gpa, gpa, size)
4050                             .map_err(DeviceManagerError::VirtioDmaMap)?;
4051                     }
4052                 }
4053             }
4054         }
4055 
4056         let device_type = virtio_device.lock().unwrap().device_type();
4057         let virtio_pci_device = Arc::new(Mutex::new(
4058             VirtioPciDevice::new(
4059                 id.clone(),
4060                 memory,
4061                 virtio_device,
4062                 msix_num,
4063                 access_platform,
4064                 &self.msi_interrupt_manager,
4065                 pci_device_bdf.into(),
4066                 self.activate_evt
4067                     .try_clone()
4068                     .map_err(DeviceManagerError::EventFd)?,
4069                 // All device types *except* virtio block devices should be allocated a 64-bit bar
4070                 // The block devices should be given a 32-bit BAR so that they are easily accessible
4071                 // to firmware without requiring excessive identity mapping.
4072                 // The exception being if not on the default PCI segment.
4073                 pci_segment_id > 0 || device_type != VirtioDeviceType::Block as u32,
4074                 dma_handler,
4075                 self.pending_activations.clone(),
4076                 vm_migration::snapshot_from_id(self.snapshot.as_ref(), id.as_str()),
4077             )
4078             .map_err(DeviceManagerError::VirtioDevice)?,
4079         ));
4080 
4081         let new_resources = self.add_pci_device(
4082             virtio_pci_device.clone(),
4083             virtio_pci_device.clone(),
4084             pci_segment_id,
4085             pci_device_bdf,
4086             resources,
4087         )?;
4088 
4089         let bar_addr = virtio_pci_device.lock().unwrap().config_bar_addr();
4090         for (event, addr) in virtio_pci_device.lock().unwrap().ioeventfds(bar_addr) {
4091             let io_addr = IoEventAddress::Mmio(addr);
4092             self.address_manager
4093                 .vm
4094                 .register_ioevent(event, &io_addr, None)
4095                 .map_err(|e| DeviceManagerError::RegisterIoevent(e.into()))?;
4096         }
4097 
4098         // Update the device tree with correct resource information.
4099         node.resources = new_resources;
4100         node.migratable = Some(Arc::clone(&virtio_pci_device) as Arc<Mutex<dyn Migratable>>);
4101         node.pci_bdf = Some(pci_device_bdf);
4102         node.pci_device_handle = Some(PciDeviceHandle::Virtio(virtio_pci_device));
4103         self.device_tree.lock().unwrap().insert(id, node);
4104 
4105         Ok(pci_device_bdf)
4106     }
4107 
4108     fn add_pvpanic_device(
4109         &mut self,
4110     ) -> DeviceManagerResult<Option<Arc<Mutex<devices::PvPanicDevice>>>> {
4111         let id = String::from(PVPANIC_DEVICE_NAME);
4112         let pci_segment_id = 0x0_u16;
4113 
4114         info!("Creating pvpanic device {}", id);
4115 
4116         let (pci_segment_id, pci_device_bdf, resources) =
4117             self.pci_resources(&id, pci_segment_id)?;
4118 
4119         let snapshot = snapshot_from_id(self.snapshot.as_ref(), id.as_str());
4120 
4121         let pvpanic_device = devices::PvPanicDevice::new(id.clone(), snapshot)
4122             .map_err(DeviceManagerError::PvPanicCreate)?;
4123 
4124         let pvpanic_device = Arc::new(Mutex::new(pvpanic_device));
4125 
4126         let new_resources = self.add_pci_device(
4127             pvpanic_device.clone(),
4128             pvpanic_device.clone(),
4129             pci_segment_id,
4130             pci_device_bdf,
4131             resources,
4132         )?;
4133 
4134         let mut node = device_node!(id, pvpanic_device);
4135 
4136         node.resources = new_resources;
4137         node.pci_bdf = Some(pci_device_bdf);
4138         node.pci_device_handle = None;
4139 
4140         self.device_tree.lock().unwrap().insert(id, node);
4141 
4142         Ok(Some(pvpanic_device))
4143     }
4144 
4145     fn pci_resources(
4146         &self,
4147         id: &str,
4148         pci_segment_id: u16,
4149     ) -> DeviceManagerResult<(u16, PciBdf, Option<Vec<Resource>>)> {
4150         // Look for the id in the device tree. If it can be found, that means
4151         // the device is being restored, otherwise it's created from scratch.
4152         let (pci_device_bdf, resources) =
4153             if let Some(node) = self.device_tree.lock().unwrap().get(id) {
4154                 info!("Restoring virtio-pci {} resources", id);
4155                 let pci_device_bdf: PciBdf = node
4156                     .pci_bdf
4157                     .ok_or(DeviceManagerError::MissingDeviceNodePciBdf)?;
4158                 (Some(pci_device_bdf), Some(node.resources.clone()))
4159             } else {
4160                 (None, None)
4161             };
4162 
4163         Ok(if let Some(pci_device_bdf) = pci_device_bdf {
4164             let pci_segment_id = pci_device_bdf.segment();
4165 
4166             self.pci_segments[pci_segment_id as usize]
4167                 .pci_bus
4168                 .lock()
4169                 .unwrap()
4170                 .get_device_id(pci_device_bdf.device() as usize)
4171                 .map_err(DeviceManagerError::GetPciDeviceId)?;
4172 
4173             (pci_segment_id, pci_device_bdf, resources)
4174         } else {
4175             let pci_device_bdf = self.pci_segments[pci_segment_id as usize].next_device_bdf()?;
4176 
4177             (pci_segment_id, pci_device_bdf, None)
4178         })
4179     }
4180 
4181     #[cfg(target_arch = "x86_64")]
4182     pub fn io_bus(&self) -> &Arc<Bus> {
4183         &self.address_manager.io_bus
4184     }
4185 
4186     pub fn mmio_bus(&self) -> &Arc<Bus> {
4187         &self.address_manager.mmio_bus
4188     }
4189 
4190     pub fn allocator(&self) -> &Arc<Mutex<SystemAllocator>> {
4191         &self.address_manager.allocator
4192     }
4193 
4194     pub fn interrupt_controller(&self) -> Option<Arc<Mutex<dyn InterruptController>>> {
4195         self.interrupt_controller
4196             .as_ref()
4197             .map(|ic| ic.clone() as Arc<Mutex<dyn InterruptController>>)
4198     }
4199 
4200     pub(crate) fn pci_segments(&self) -> &Vec<PciSegment> {
4201         &self.pci_segments
4202     }
4203 
4204     #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
4205     pub fn cmdline_additions(&self) -> &[String] {
4206         self.cmdline_additions.as_slice()
4207     }
4208 
4209     pub fn update_memory(&self, new_region: &Arc<GuestRegionMmap>) -> DeviceManagerResult<()> {
4210         for handle in self.virtio_devices.iter() {
4211             handle
4212                 .virtio_device
4213                 .lock()
4214                 .unwrap()
4215                 .add_memory_region(new_region)
4216                 .map_err(DeviceManagerError::UpdateMemoryForVirtioDevice)?;
4217 
4218             if let Some(dma_handler) = &handle.dma_handler {
4219                 if !handle.iommu {
4220                     let gpa = new_region.start_addr().0;
4221                     let size = new_region.len();
4222                     dma_handler
4223                         .map(gpa, gpa, size)
4224                         .map_err(DeviceManagerError::VirtioDmaMap)?;
4225                 }
4226             }
4227         }
4228 
4229         // Take care of updating the memory for VFIO PCI devices.
4230         if let Some(vfio_container) = &self.vfio_container {
4231             vfio_container
4232                 .vfio_dma_map(
4233                     new_region.start_addr().raw_value(),
4234                     new_region.len(),
4235                     new_region.as_ptr() as u64,
4236                 )
4237                 .map_err(DeviceManagerError::UpdateMemoryForVfioPciDevice)?;
4238         }
4239 
4240         // Take care of updating the memory for vfio-user devices.
4241         {
4242             let device_tree = self.device_tree.lock().unwrap();
4243             for pci_device_node in device_tree.pci_devices() {
4244                 if let PciDeviceHandle::VfioUser(vfio_user_pci_device) = pci_device_node
4245                     .pci_device_handle
4246                     .as_ref()
4247                     .ok_or(DeviceManagerError::MissingPciDevice)?
4248                 {
4249                     vfio_user_pci_device
4250                         .lock()
4251                         .unwrap()
4252                         .dma_map(new_region)
4253                         .map_err(DeviceManagerError::UpdateMemoryForVfioUserPciDevice)?;
4254                 }
4255             }
4256         }
4257 
4258         Ok(())
4259     }
4260 
4261     pub fn activate_virtio_devices(&self) -> DeviceManagerResult<()> {
4262         for mut activator in self.pending_activations.lock().unwrap().drain(..) {
4263             activator
4264                 .activate()
4265                 .map_err(DeviceManagerError::VirtioActivate)?;
4266         }
4267         Ok(())
4268     }
4269 
4270     pub fn notify_hotplug(
4271         &self,
4272         _notification_type: AcpiNotificationFlags,
4273     ) -> DeviceManagerResult<()> {
4274         return self
4275             .ged_notification_device
4276             .as_ref()
4277             .unwrap()
4278             .lock()
4279             .unwrap()
4280             .notify(_notification_type)
4281             .map_err(DeviceManagerError::HotPlugNotification);
4282     }
4283 
4284     pub fn add_device(
4285         &mut self,
4286         device_cfg: &mut DeviceConfig,
4287     ) -> DeviceManagerResult<PciDeviceInfo> {
4288         self.validate_identifier(&device_cfg.id)?;
4289 
4290         if device_cfg.iommu && !self.is_iommu_segment(device_cfg.pci_segment) {
4291             return Err(DeviceManagerError::InvalidIommuHotplug);
4292         }
4293 
4294         let (bdf, device_name) = self.add_passthrough_device(device_cfg)?;
4295 
4296         // Update the PCIU bitmap
4297         self.pci_segments[device_cfg.pci_segment as usize].pci_devices_up |= 1 << bdf.device();
4298 
4299         Ok(PciDeviceInfo {
4300             id: device_name,
4301             bdf,
4302         })
4303     }
4304 
4305     pub fn add_user_device(
4306         &mut self,
4307         device_cfg: &mut UserDeviceConfig,
4308     ) -> DeviceManagerResult<PciDeviceInfo> {
4309         self.validate_identifier(&device_cfg.id)?;
4310 
4311         let (bdf, device_name) = self.add_vfio_user_device(device_cfg)?;
4312 
4313         // Update the PCIU bitmap
4314         self.pci_segments[device_cfg.pci_segment as usize].pci_devices_up |= 1 << bdf.device();
4315 
4316         Ok(PciDeviceInfo {
4317             id: device_name,
4318             bdf,
4319         })
4320     }
4321 
4322     pub fn remove_device(&mut self, id: String) -> DeviceManagerResult<()> {
4323         // The node can be directly a PCI node in case the 'id' refers to a
4324         // VFIO device or a virtio-pci one.
4325         // In case the 'id' refers to a virtio device, we must find the PCI
4326         // node by looking at the parent.
4327         let device_tree = self.device_tree.lock().unwrap();
4328         let node = device_tree
4329             .get(&id)
4330             .ok_or(DeviceManagerError::UnknownDeviceId(id.clone()))?;
4331 
4332         // Release advisory locks by dropping all references.
4333         // Linux automatically releases all locks of that file if the last open FD is closed.
4334         {
4335             let maybe_block_device_index = self
4336                 .block_devices
4337                 .iter()
4338                 .enumerate()
4339                 .find(|(_, dev)| {
4340                     let dev = dev.lock().unwrap();
4341                     dev.id() == id
4342                 })
4343                 .map(|(i, _)| i);
4344             if let Some(index) = maybe_block_device_index {
4345                 let _ = self.block_devices.swap_remove(index);
4346             }
4347         }
4348 
4349         let pci_device_node = if node.pci_bdf.is_some() && node.pci_device_handle.is_some() {
4350             node
4351         } else {
4352             let parent = node
4353                 .parent
4354                 .as_ref()
4355                 .ok_or(DeviceManagerError::MissingNode)?;
4356             device_tree
4357                 .get(parent)
4358                 .ok_or(DeviceManagerError::MissingNode)?
4359         };
4360 
4361         let pci_device_bdf: PciBdf = pci_device_node
4362             .pci_bdf
4363             .ok_or(DeviceManagerError::MissingDeviceNodePciBdf)?;
4364         let pci_segment_id = pci_device_bdf.segment();
4365 
4366         let pci_device_handle = pci_device_node
4367             .pci_device_handle
4368             .as_ref()
4369             .ok_or(DeviceManagerError::MissingPciDevice)?;
4370         #[allow(irrefutable_let_patterns)]
4371         if let PciDeviceHandle::Virtio(virtio_pci_device) = pci_device_handle {
4372             let device_type = VirtioDeviceType::from(
4373                 virtio_pci_device
4374                     .lock()
4375                     .unwrap()
4376                     .virtio_device()
4377                     .lock()
4378                     .unwrap()
4379                     .device_type(),
4380             );
4381             match device_type {
4382                 VirtioDeviceType::Net
4383                 | VirtioDeviceType::Block
4384                 | VirtioDeviceType::Pmem
4385                 | VirtioDeviceType::Fs
4386                 | VirtioDeviceType::Vsock => {}
4387                 _ => return Err(DeviceManagerError::RemovalNotAllowed(device_type)),
4388             }
4389         }
4390 
4391         // Update the PCID bitmap
4392         self.pci_segments[pci_segment_id as usize].pci_devices_down |= 1 << pci_device_bdf.device();
4393 
4394         Ok(())
4395     }
4396 
4397     pub fn eject_device(&mut self, pci_segment_id: u16, device_id: u8) -> DeviceManagerResult<()> {
4398         info!(
4399             "Ejecting device_id = {} on segment_id={}",
4400             device_id, pci_segment_id
4401         );
4402 
4403         // Convert the device ID into the corresponding b/d/f.
4404         let pci_device_bdf = PciBdf::new(pci_segment_id, 0, device_id, 0);
4405 
4406         // Give the PCI device ID back to the PCI bus.
4407         self.pci_segments[pci_segment_id as usize]
4408             .pci_bus
4409             .lock()
4410             .unwrap()
4411             .put_device_id(device_id as usize)
4412             .map_err(DeviceManagerError::PutPciDeviceId)?;
4413 
4414         let (pci_device_handle, id) = {
4415             // Remove the device from the device tree along with its children.
4416             let mut device_tree = self.device_tree.lock().unwrap();
4417             let pci_device_node = device_tree
4418                 .remove_node_by_pci_bdf(pci_device_bdf)
4419                 .ok_or(DeviceManagerError::MissingPciDevice)?;
4420 
4421             // For VFIO and vfio-user the PCI device id is the id.
4422             // For virtio we overwrite it later as we want the id of the
4423             // underlying device.
4424             let mut id = pci_device_node.id;
4425             let pci_device_handle = pci_device_node
4426                 .pci_device_handle
4427                 .ok_or(DeviceManagerError::MissingPciDevice)?;
4428             if matches!(pci_device_handle, PciDeviceHandle::Virtio(_)) {
4429                 // The virtio-pci device has a single child
4430                 if !pci_device_node.children.is_empty() {
4431                     assert_eq!(pci_device_node.children.len(), 1);
4432                     let child_id = &pci_device_node.children[0];
4433                     id.clone_from(child_id);
4434                 }
4435             }
4436             for child in pci_device_node.children.iter() {
4437                 device_tree.remove(child);
4438             }
4439 
4440             (pci_device_handle, id)
4441         };
4442 
4443         let mut iommu_attached = false;
4444         if let Some((_, iommu_attached_devices)) = &self.iommu_attached_devices {
4445             if iommu_attached_devices.contains(&pci_device_bdf) {
4446                 iommu_attached = true;
4447             }
4448         }
4449 
4450         let (pci_device, bus_device, virtio_device, remove_dma_handler) = match pci_device_handle {
4451             // No need to remove any virtio-mem mapping here as the container outlives all devices
4452             PciDeviceHandle::Vfio(vfio_pci_device) => {
4453                 for mmio_region in vfio_pci_device.lock().unwrap().mmio_regions() {
4454                     self.mmio_regions
4455                         .lock()
4456                         .unwrap()
4457                         .retain(|x| x.start != mmio_region.start)
4458                 }
4459 
4460                 (
4461                     Arc::clone(&vfio_pci_device) as Arc<Mutex<dyn PciDevice>>,
4462                     Arc::clone(&vfio_pci_device) as Arc<dyn BusDeviceSync>,
4463                     None as Option<Arc<Mutex<dyn virtio_devices::VirtioDevice>>>,
4464                     false,
4465                 )
4466             }
4467             PciDeviceHandle::Virtio(virtio_pci_device) => {
4468                 let dev = virtio_pci_device.lock().unwrap();
4469                 let bar_addr = dev.config_bar_addr();
4470                 for (event, addr) in dev.ioeventfds(bar_addr) {
4471                     let io_addr = IoEventAddress::Mmio(addr);
4472                     self.address_manager
4473                         .vm
4474                         .unregister_ioevent(event, &io_addr)
4475                         .map_err(|e| DeviceManagerError::UnRegisterIoevent(e.into()))?;
4476                 }
4477 
4478                 if let Some(dma_handler) = dev.dma_handler() {
4479                     if !iommu_attached {
4480                         for (_, zone) in self.memory_manager.lock().unwrap().memory_zones().iter() {
4481                             for region in zone.regions() {
4482                                 let iova = region.start_addr().0;
4483                                 let size = region.len();
4484                                 dma_handler
4485                                     .unmap(iova, size)
4486                                     .map_err(DeviceManagerError::VirtioDmaUnmap)?;
4487                             }
4488                         }
4489                     }
4490                 }
4491 
4492                 (
4493                     Arc::clone(&virtio_pci_device) as Arc<Mutex<dyn PciDevice>>,
4494                     Arc::clone(&virtio_pci_device) as Arc<dyn BusDeviceSync>,
4495                     Some(dev.virtio_device()),
4496                     dev.dma_handler().is_some() && !iommu_attached,
4497                 )
4498             }
4499             PciDeviceHandle::VfioUser(vfio_user_pci_device) => {
4500                 let mut dev = vfio_user_pci_device.lock().unwrap();
4501                 for (_, zone) in self.memory_manager.lock().unwrap().memory_zones().iter() {
4502                     for region in zone.regions() {
4503                         dev.dma_unmap(region)
4504                             .map_err(DeviceManagerError::VfioUserDmaUnmap)?;
4505                     }
4506                 }
4507 
4508                 (
4509                     Arc::clone(&vfio_user_pci_device) as Arc<Mutex<dyn PciDevice>>,
4510                     Arc::clone(&vfio_user_pci_device) as Arc<dyn BusDeviceSync>,
4511                     None as Option<Arc<Mutex<dyn virtio_devices::VirtioDevice>>>,
4512                     true,
4513                 )
4514             }
4515         };
4516 
4517         if remove_dma_handler {
4518             for virtio_mem_device in self.virtio_mem_devices.iter() {
4519                 virtio_mem_device
4520                     .lock()
4521                     .unwrap()
4522                     .remove_dma_mapping_handler(VirtioMemMappingSource::Device(
4523                         pci_device_bdf.into(),
4524                     ))
4525                     .map_err(DeviceManagerError::RemoveDmaMappingHandlerVirtioMem)?;
4526             }
4527         }
4528 
4529         // Free the allocated BARs
4530         pci_device
4531             .lock()
4532             .unwrap()
4533             .free_bars(
4534                 &mut self.address_manager.allocator.lock().unwrap(),
4535                 &mut self.pci_segments[pci_segment_id as usize]
4536                     .mem32_allocator
4537                     .lock()
4538                     .unwrap(),
4539                 &mut self.pci_segments[pci_segment_id as usize]
4540                     .mem64_allocator
4541                     .lock()
4542                     .unwrap(),
4543             )
4544             .map_err(DeviceManagerError::FreePciBars)?;
4545 
4546         // Remove the device from the PCI bus
4547         self.pci_segments[pci_segment_id as usize]
4548             .pci_bus
4549             .lock()
4550             .unwrap()
4551             .remove_by_device(&pci_device)
4552             .map_err(DeviceManagerError::RemoveDeviceFromPciBus)?;
4553 
4554         #[cfg(target_arch = "x86_64")]
4555         // Remove the device from the IO bus
4556         self.io_bus()
4557             .remove_by_device(&bus_device)
4558             .map_err(DeviceManagerError::RemoveDeviceFromIoBus)?;
4559 
4560         // Remove the device from the MMIO bus
4561         self.mmio_bus()
4562             .remove_by_device(&bus_device)
4563             .map_err(DeviceManagerError::RemoveDeviceFromMmioBus)?;
4564 
4565         // Remove the device from the list of BusDevice held by the
4566         // DeviceManager.
4567         self.bus_devices
4568             .retain(|dev| !Arc::ptr_eq(dev, &bus_device));
4569 
4570         // Shutdown and remove the underlying virtio-device if present
4571         if let Some(virtio_device) = virtio_device {
4572             for mapping in virtio_device.lock().unwrap().userspace_mappings() {
4573                 self.memory_manager
4574                     .lock()
4575                     .unwrap()
4576                     .remove_userspace_mapping(
4577                         mapping.addr.raw_value(),
4578                         mapping.len,
4579                         mapping.host_addr,
4580                         mapping.mergeable,
4581                         mapping.mem_slot,
4582                     )
4583                     .map_err(DeviceManagerError::MemoryManager)?;
4584             }
4585 
4586             virtio_device.lock().unwrap().shutdown();
4587 
4588             self.virtio_devices
4589                 .retain(|handler| !Arc::ptr_eq(&handler.virtio_device, &virtio_device));
4590         }
4591 
4592         event!(
4593             "vm",
4594             "device-removed",
4595             "id",
4596             &id,
4597             "bdf",
4598             pci_device_bdf.to_string()
4599         );
4600 
4601         // At this point, the device has been removed from all the list and
4602         // buses where it was stored. At the end of this function, after
4603         // any_device, bus_device and pci_device are released, the actual
4604         // device will be dropped.
4605         Ok(())
4606     }
4607 
4608     fn hotplug_virtio_pci_device(
4609         &mut self,
4610         handle: MetaVirtioDevice,
4611     ) -> DeviceManagerResult<PciDeviceInfo> {
4612         // Add the virtio device to the device manager list. This is important
4613         // as the list is used to notify virtio devices about memory updates
4614         // for instance.
4615         self.virtio_devices.push(handle.clone());
4616 
4617         let mapping: Option<Arc<IommuMapping>> = if handle.iommu {
4618             self.iommu_mapping.clone()
4619         } else {
4620             None
4621         };
4622 
4623         let bdf = self.add_virtio_pci_device(
4624             handle.virtio_device,
4625             &mapping,
4626             handle.id.clone(),
4627             handle.pci_segment,
4628             handle.dma_handler,
4629         )?;
4630 
4631         // Update the PCIU bitmap
4632         self.pci_segments[handle.pci_segment as usize].pci_devices_up |= 1 << bdf.device();
4633 
4634         Ok(PciDeviceInfo { id: handle.id, bdf })
4635     }
4636 
4637     fn is_iommu_segment(&self, pci_segment_id: u16) -> bool {
4638         self.config
4639             .lock()
4640             .as_ref()
4641             .unwrap()
4642             .platform
4643             .as_ref()
4644             .map(|pc| {
4645                 pc.iommu_segments
4646                     .as_ref()
4647                     .map(|v| v.contains(&pci_segment_id))
4648                     .unwrap_or_default()
4649             })
4650             .unwrap_or_default()
4651     }
4652 
4653     pub fn add_disk(&mut self, disk_cfg: &mut DiskConfig) -> DeviceManagerResult<PciDeviceInfo> {
4654         self.validate_identifier(&disk_cfg.id)?;
4655 
4656         if disk_cfg.iommu && !self.is_iommu_segment(disk_cfg.pci_segment) {
4657             return Err(DeviceManagerError::InvalidIommuHotplug);
4658         }
4659 
4660         let device = self.make_virtio_block_device(disk_cfg, true)?;
4661         self.hotplug_virtio_pci_device(device)
4662     }
4663 
4664     pub fn add_fs(&mut self, fs_cfg: &mut FsConfig) -> DeviceManagerResult<PciDeviceInfo> {
4665         self.validate_identifier(&fs_cfg.id)?;
4666 
4667         let device = self.make_virtio_fs_device(fs_cfg)?;
4668         self.hotplug_virtio_pci_device(device)
4669     }
4670 
4671     pub fn add_pmem(&mut self, pmem_cfg: &mut PmemConfig) -> DeviceManagerResult<PciDeviceInfo> {
4672         self.validate_identifier(&pmem_cfg.id)?;
4673 
4674         if pmem_cfg.iommu && !self.is_iommu_segment(pmem_cfg.pci_segment) {
4675             return Err(DeviceManagerError::InvalidIommuHotplug);
4676         }
4677 
4678         let device = self.make_virtio_pmem_device(pmem_cfg)?;
4679         self.hotplug_virtio_pci_device(device)
4680     }
4681 
4682     pub fn add_net(&mut self, net_cfg: &mut NetConfig) -> DeviceManagerResult<PciDeviceInfo> {
4683         self.validate_identifier(&net_cfg.id)?;
4684 
4685         if net_cfg.iommu && !self.is_iommu_segment(net_cfg.pci_segment) {
4686             return Err(DeviceManagerError::InvalidIommuHotplug);
4687         }
4688 
4689         let device = self.make_virtio_net_device(net_cfg)?;
4690         self.hotplug_virtio_pci_device(device)
4691     }
4692 
4693     pub fn add_vdpa(&mut self, vdpa_cfg: &mut VdpaConfig) -> DeviceManagerResult<PciDeviceInfo> {
4694         self.validate_identifier(&vdpa_cfg.id)?;
4695 
4696         if vdpa_cfg.iommu && !self.is_iommu_segment(vdpa_cfg.pci_segment) {
4697             return Err(DeviceManagerError::InvalidIommuHotplug);
4698         }
4699 
4700         let device = self.make_vdpa_device(vdpa_cfg)?;
4701         self.hotplug_virtio_pci_device(device)
4702     }
4703 
4704     pub fn add_vsock(&mut self, vsock_cfg: &mut VsockConfig) -> DeviceManagerResult<PciDeviceInfo> {
4705         self.validate_identifier(&vsock_cfg.id)?;
4706 
4707         if vsock_cfg.iommu && !self.is_iommu_segment(vsock_cfg.pci_segment) {
4708             return Err(DeviceManagerError::InvalidIommuHotplug);
4709         }
4710 
4711         let device = self.make_virtio_vsock_device(vsock_cfg)?;
4712         self.hotplug_virtio_pci_device(device)
4713     }
4714 
4715     pub fn counters(&self) -> HashMap<String, HashMap<&'static str, Wrapping<u64>>> {
4716         let mut counters = HashMap::new();
4717 
4718         for handle in &self.virtio_devices {
4719             let virtio_device = handle.virtio_device.lock().unwrap();
4720             if let Some(device_counters) = virtio_device.counters() {
4721                 counters.insert(handle.id.clone(), device_counters.clone());
4722             }
4723         }
4724 
4725         counters
4726     }
4727 
4728     pub fn resize_balloon(&mut self, size: u64) -> DeviceManagerResult<()> {
4729         if let Some(balloon) = &self.balloon {
4730             return balloon
4731                 .lock()
4732                 .unwrap()
4733                 .resize(size)
4734                 .map_err(DeviceManagerError::VirtioBalloonResize);
4735         }
4736 
4737         warn!("No balloon setup: Can't resize the balloon");
4738         Err(DeviceManagerError::MissingVirtioBalloon)
4739     }
4740 
4741     pub fn balloon_size(&self) -> u64 {
4742         if let Some(balloon) = &self.balloon {
4743             return balloon.lock().unwrap().get_actual();
4744         }
4745 
4746         0
4747     }
4748 
4749     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
4750         self.device_tree.clone()
4751     }
4752 
4753     #[cfg(target_arch = "x86_64")]
4754     pub fn notify_power_button(&self) -> DeviceManagerResult<()> {
4755         self.ged_notification_device
4756             .as_ref()
4757             .unwrap()
4758             .lock()
4759             .unwrap()
4760             .notify(AcpiNotificationFlags::POWER_BUTTON_CHANGED)
4761             .map_err(DeviceManagerError::PowerButtonNotification)
4762     }
4763 
4764     #[cfg(target_arch = "aarch64")]
4765     pub fn notify_power_button(&self) -> DeviceManagerResult<()> {
4766         // There are two use cases:
4767         // 1. Users will use direct kernel boot with device tree.
4768         // 2. Users will use ACPI+UEFI boot.
4769 
4770         // Trigger a GPIO pin 3 event to satisfy use case 1.
4771         self.gpio_device
4772             .as_ref()
4773             .unwrap()
4774             .lock()
4775             .unwrap()
4776             .trigger_key(3)
4777             .map_err(DeviceManagerError::AArch64PowerButtonNotification)?;
4778         // Trigger a GED power button event to satisfy use case 2.
4779         return self
4780             .ged_notification_device
4781             .as_ref()
4782             .unwrap()
4783             .lock()
4784             .unwrap()
4785             .notify(AcpiNotificationFlags::POWER_BUTTON_CHANGED)
4786             .map_err(DeviceManagerError::PowerButtonNotification);
4787     }
4788 
    /// Borrows the stored `iommu_attached_devices` value: an optional pair of
    /// one `PciBdf` and a list of `PciBdf`s.
    // NOTE(review): presumably the first element is the IOMMU device itself
    // and the list holds the devices attached to it — confirm where the field
    // is populated.
    pub fn iommu_attached_devices(&self) -> &Option<(PciBdf, Vec<PciBdf>)> {
        &self.iommu_attached_devices
    }
4792 
4793     fn validate_identifier(&self, id: &Option<String>) -> DeviceManagerResult<()> {
4794         if let Some(id) = id {
4795             if id.starts_with("__") {
4796                 return Err(DeviceManagerError::InvalidIdentifier(id.clone()));
4797             }
4798 
4799             if self.device_tree.lock().unwrap().contains_key(id) {
4800                 return Err(DeviceManagerError::IdentifierNotUnique(id.clone()));
4801             }
4802         }
4803 
4804         Ok(())
4805     }
4806 
    /// Borrows the `AcpiPlatformAddresses` stored in the device manager.
    /// Not compiled on riscv64.
    #[cfg(not(target_arch = "riscv64"))]
    pub(crate) fn acpi_platform_addresses(&self) -> &AcpiPlatformAddresses {
        &self.acpi_platform_addresses
    }
4811 }
4812 
4813 fn numa_node_id_from_memory_zone_id(numa_nodes: &NumaNodes, memory_zone_id: &str) -> Option<u32> {
4814     for (numa_node_id, numa_node) in numa_nodes.iter() {
4815         if numa_node.memory_zones.contains(&memory_zone_id.to_owned()) {
4816             return Some(*numa_node_id);
4817         }
4818     }
4819 
4820     None
4821 }
4822 
4823 fn numa_node_id_from_pci_segment_id(numa_nodes: &NumaNodes, pci_segment_id: u16) -> u32 {
4824     for (numa_node_id, numa_node) in numa_nodes.iter() {
4825         if numa_node.pci_segments.contains(&pci_segment_id) {
4826             return *numa_node_id;
4827         }
4828     }
4829 
4830     0
4831 }
4832 
4833 #[cfg(not(target_arch = "riscv64"))]
4834 struct TpmDevice {}
4835 
4836 #[cfg(not(target_arch = "riscv64"))]
4837 impl Aml for TpmDevice {
4838     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
4839         aml::Device::new(
4840             "TPM2".into(),
4841             vec![
4842                 &aml::Name::new("_HID".into(), &"MSFT0101"),
4843                 &aml::Name::new("_STA".into(), &(0xF_usize)),
4844                 &aml::Name::new(
4845                     "_CRS".into(),
4846                     &aml::ResourceTemplate::new(vec![&aml::Memory32Fixed::new(
4847                         true,
4848                         layout::TPM_START.0 as u32,
4849                         layout::TPM_SIZE as u32,
4850                     )]),
4851                 ),
4852             ],
4853         )
4854         .to_aml_bytes(sink)
4855     }
4856 }
4857 
#[cfg(not(target_arch = "riscv64"))]
impl Aml for DeviceManager {
    /// Appends the ACPI (AML) description of the devices owned by the device
    /// manager to `sink`, in this order:
    ///
    /// 1. `_SB_.PHPR`, the PCI hotplug controller;
    /// 2. every PCI segment;
    /// 3. `_SB_.MBRD`, listing the MMIO config range of each segment;
    /// 4. `_SB_.COM1`, only when the serial console mode is not `Off`;
    /// 5. the `_S5_` name;
    /// 6. `_SB_.PWRB`, the power button;
    /// 7. the TPM device, only when a TPM is configured;
    /// 8. the GED notification device.
    ///
    /// # Panics
    ///
    /// Panics if there is no GED notification device, and on aarch64 if the
    /// serial console is enabled but no serial device info is registered.
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        #[cfg(target_arch = "aarch64")]
        use arch::aarch64::DeviceInfoForFdt;

        // One call to the PCNT method of each PCI segment (PC00, PC01, ...).
        let mut pci_scan_methods = Vec::new();
        for i in 0..self.pci_segments.len() {
            pci_scan_methods.push(aml::MethodCall::new(
                format!("\\_SB_.PC{i:02X}.PCNT").as_str().into(),
                vec![],
            ));
        }
        // Same calls as trait object references, used as the PSCN method body.
        let mut pci_scan_inner: Vec<&dyn Aml> = Vec::new();
        for method in &pci_scan_methods {
            pci_scan_inner.push(method)
        }

        // PCI hotplug controller
        aml::Device::new(
            "_SB_.PHPR".into(),
            vec![
                &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                &aml::Name::new("_STA".into(), &0x0bu8),
                &aml::Name::new("_UID".into(), &"PCI Hotplug Controller"),
                &aml::Mutex::new("BLCK".into(), 0),
                // Memory range of DEVICE_MANAGER_ACPI_SIZE bytes starting at
                // the device manager's ACPI address.
                &aml::Name::new(
                    "_CRS".into(),
                    &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                        aml::AddressSpaceCacheable::NotCacheable,
                        true,
                        self.acpi_address.0,
                        self.acpi_address.0 + DEVICE_MANAGER_ACPI_SIZE as u64 - 1,
                        None,
                    )]),
                ),
                // OpRegion and Fields map MMIO range into individual field values
                &aml::OpRegion::new(
                    "PCST".into(),
                    aml::OpRegionSpace::SystemMemory,
                    &(self.acpi_address.0 as usize),
                    &DEVICE_MANAGER_ACPI_SIZE,
                ),
                // Four consecutive 32-bit fields; their order must match the
                // *_FIELD_OFFSET constants used by the BusDevice handlers.
                &aml::Field::new(
                    "PCST".into(),
                    aml::FieldAccessType::DWord,
                    aml::FieldLockRule::NoLock,
                    aml::FieldUpdateRule::WriteAsZeroes,
                    vec![
                        aml::FieldEntry::Named(*b"PCIU", 32),
                        aml::FieldEntry::Named(*b"PCID", 32),
                        aml::FieldEntry::Named(*b"B0EJ", 32),
                        aml::FieldEntry::Named(*b"PSEG", 32),
                    ],
                ),
                // PCEJ takes two arguments and ejects a device.
                &aml::Method::new(
                    "PCEJ".into(),
                    2,
                    true,
                    vec![
                        // Take lock defined above
                        &aml::Acquire::new("BLCK".into(), 0xffff),
                        // Choose the current segment
                        &aml::Store::new(&aml::Path::new("PSEG"), &aml::Arg(1)),
                        // Write the bit selected by the first argument to the
                        // B0EJ field. NOTE(review): presumably B0EJ = 1 << Arg0
                        // with Arg0 being the slot number, since the B0EJ write
                        // handler treats the value as a slot bitmap — confirm
                        // the ShiftLeft argument order.
                        &aml::ShiftLeft::new(&aml::Path::new("B0EJ"), &aml::ONE, &aml::Arg(0)),
                        // Release lock
                        &aml::Release::new("BLCK".into()),
                        // Return 0
                        &aml::Return::new(&aml::ZERO),
                    ],
                ),
                // PSCN calls the PCNT method of every segment.
                &aml::Method::new("PSCN".into(), 0, true, pci_scan_inner),
            ],
        )
        .to_aml_bytes(sink);

        for segment in &self.pci_segments {
            segment.to_aml_bytes(sink);
        }

        // One fixed 32-bit memory range per segment for its MMIO config space.
        let mut mbrd_memory = Vec::new();

        for segment in &self.pci_segments {
            mbrd_memory.push(aml::Memory32Fixed::new(
                true,
                segment.mmio_config_address as u32,
                layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT as u32,
            ))
        }

        // The resource template borrows its children, so keep the owned
        // values above alive and pass references.
        let mut mbrd_memory_refs = Vec::new();
        for mbrd_memory_ref in &mbrd_memory {
            mbrd_memory_refs.push(mbrd_memory_ref as &dyn Aml);
        }

        aml::Device::new(
            "_SB_.MBRD".into(),
            vec![
                &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C02")),
                &aml::Name::new("_UID".into(), &aml::ZERO),
                &aml::Name::new("_CRS".into(), &aml::ResourceTemplate::new(mbrd_memory_refs)),
            ],
        )
        .to_aml_bytes(sink);

        // Serial device
        #[cfg(target_arch = "x86_64")]
        let serial_irq = 4;
        #[cfg(target_arch = "aarch64")]
        let serial_irq =
            if self.config.lock().unwrap().serial.clone().mode != ConsoleOutputMode::Off {
                self.get_device_info()
                    .clone()
                    .get(&(DeviceType::Serial, DeviceType::Serial.to_string()))
                    .unwrap()
                    .irq()
            } else {
                // If serial is turned off, add a fake device with invalid irq.
                // NOTE(review): the COM1 device below is only emitted when the
                // serial mode is not Off, so this value appears unused there.
                31
            };
        if self.config.lock().unwrap().serial.mode != ConsoleOutputMode::Off {
            aml::Device::new(
                "_SB_.COM1".into(),
                vec![
                    &aml::Name::new(
                        "_HID".into(),
                        #[cfg(target_arch = "x86_64")]
                        &aml::EISAName::new("PNP0501"),
                        #[cfg(target_arch = "aarch64")]
                        &"ARMH0011",
                    ),
                    &aml::Name::new("_UID".into(), &aml::ZERO),
                    &aml::Name::new("_DDN".into(), &"COM1"),
                    // x86_64: port I/O at 0x3f8; aarch64: an MMIO range at
                    // LEGACY_SERIAL_MAPPED_IO_START.
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![
                            &aml::Interrupt::new(true, true, false, false, serial_irq),
                            #[cfg(target_arch = "x86_64")]
                            &aml::IO::new(0x3f8, 0x3f8, 0, 0x8),
                            #[cfg(target_arch = "aarch64")]
                            &aml::Memory32Fixed::new(
                                true,
                                arch::layout::LEGACY_SERIAL_MAPPED_IO_START.raw_value() as u32,
                                MMIO_LEN as u32,
                            ),
                        ]),
                    ),
                ],
            )
            .to_aml_bytes(sink);
        }

        // `_S5_` name holding a one-element package containing 5.
        aml::Name::new("_S5_".into(), &aml::Package::new(vec![&5u8])).to_aml_bytes(sink);

        // Power button device.
        aml::Device::new(
            "_SB_.PWRB".into(),
            vec![
                &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C0C")),
                &aml::Name::new("_UID".into(), &aml::ZERO),
            ],
        )
        .to_aml_bytes(sink);

        if self.config.lock().unwrap().tpm.is_some() {
            // Add tpm device
            TpmDevice {}.to_aml_bytes(sink);
        }

        // The GED notification device is required here: a missing device
        // panics on the first `unwrap()`.
        self.ged_notification_device
            .as_ref()
            .unwrap()
            .lock()
            .unwrap()
            .to_aml_bytes(sink)
    }
}
5035 
5036 impl Pausable for DeviceManager {
5037     fn pause(&mut self) -> result::Result<(), MigratableError> {
5038         for (_, device_node) in self.device_tree.lock().unwrap().iter() {
5039             if let Some(migratable) = &device_node.migratable {
5040                 migratable.lock().unwrap().pause()?;
5041             }
5042         }
5043         // On AArch64, the pause of device manager needs to trigger
5044         // a "pause" of GIC, which will flush the GIC pending tables
5045         // and ITS tables to guest RAM.
5046         #[cfg(target_arch = "aarch64")]
5047         {
5048             self.get_interrupt_controller()
5049                 .unwrap()
5050                 .lock()
5051                 .unwrap()
5052                 .pause()?;
5053         };
5054 
5055         Ok(())
5056     }
5057 
5058     fn resume(&mut self) -> result::Result<(), MigratableError> {
5059         for (_, device_node) in self.device_tree.lock().unwrap().iter() {
5060             if let Some(migratable) = &device_node.migratable {
5061                 migratable.lock().unwrap().resume()?;
5062             }
5063         }
5064         Ok(())
5065     }
5066 }
5067 
5068 impl Snapshottable for DeviceManager {
5069     fn id(&self) -> String {
5070         DEVICE_MANAGER_SNAPSHOT_ID.to_string()
5071     }
5072 
5073     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
5074         let mut snapshot = Snapshot::from_data(SnapshotData::new_from_state(&self.state())?);
5075 
5076         // We aggregate all devices snapshots.
5077         for (_, device_node) in self.device_tree.lock().unwrap().iter() {
5078             if let Some(migratable) = &device_node.migratable {
5079                 let mut migratable = migratable.lock().unwrap();
5080                 snapshot.add_snapshot(migratable.id(), migratable.snapshot()?);
5081             }
5082         }
5083 
5084         Ok(snapshot)
5085     }
5086 }
5087 
// No method is overridden here: DeviceManager relies entirely on whatever
// default implementations the `Transportable` trait provides.
impl Transportable for DeviceManager {}
5089 
5090 impl Migratable for DeviceManager {
5091     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
5092         for (_, device_node) in self.device_tree.lock().unwrap().iter() {
5093             if let Some(migratable) = &device_node.migratable {
5094                 migratable.lock().unwrap().start_dirty_log()?;
5095             }
5096         }
5097         Ok(())
5098     }
5099 
5100     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
5101         for (_, device_node) in self.device_tree.lock().unwrap().iter() {
5102             if let Some(migratable) = &device_node.migratable {
5103                 migratable.lock().unwrap().stop_dirty_log()?;
5104             }
5105         }
5106         Ok(())
5107     }
5108 
5109     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
5110         let mut tables = Vec::new();
5111         for (_, device_node) in self.device_tree.lock().unwrap().iter() {
5112             if let Some(migratable) = &device_node.migratable {
5113                 tables.push(migratable.lock().unwrap().dirty_log()?);
5114             }
5115         }
5116         Ok(MemoryRangeTable::new_from_tables(tables))
5117     }
5118 
5119     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
5120         for (_, device_node) in self.device_tree.lock().unwrap().iter() {
5121             if let Some(migratable) = &device_node.migratable {
5122                 migratable.lock().unwrap().start_migration()?;
5123             }
5124         }
5125         Ok(())
5126     }
5127 
5128     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
5129         for (_, device_node) in self.device_tree.lock().unwrap().iter() {
5130             if let Some(migratable) = &device_node.migratable {
5131                 migratable.lock().unwrap().complete_migration()?;
5132             }
5133         }
5134         Ok(())
5135     }
5136 }
5137 
// Register layout of the PCI hotplug region handled by the `BusDevice`
// implementation of `DeviceManager`: four consecutive 4-byte registers, in
// the same order as the PCIU/PCID/B0EJ/PSEG fields declared in the AML.

// Byte offsets of the registers within the region.
const PCIU_FIELD_OFFSET: u64 = 0;
const PCID_FIELD_OFFSET: u64 = 4;
const B0EJ_FIELD_OFFSET: u64 = 8;
const PSEG_FIELD_OFFSET: u64 = 12;
// Access width, in bytes, that the read/write handlers assert for each
// register.
const PCIU_FIELD_SIZE: usize = 4;
const PCID_FIELD_SIZE: usize = 4;
const B0EJ_FIELD_SIZE: usize = 4;
const PSEG_FIELD_SIZE: usize = 4;
5146 
5147 impl BusDevice for DeviceManager {
5148     fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
5149         match offset {
5150             PCIU_FIELD_OFFSET => {
5151                 assert!(data.len() == PCIU_FIELD_SIZE);
5152                 data.copy_from_slice(
5153                     &self.pci_segments[self.selected_segment]
5154                         .pci_devices_up
5155                         .to_le_bytes(),
5156                 );
5157                 // Clear the PCIU bitmap
5158                 self.pci_segments[self.selected_segment].pci_devices_up = 0;
5159             }
5160             PCID_FIELD_OFFSET => {
5161                 assert!(data.len() == PCID_FIELD_SIZE);
5162                 data.copy_from_slice(
5163                     &self.pci_segments[self.selected_segment]
5164                         .pci_devices_down
5165                         .to_le_bytes(),
5166                 );
5167                 // Clear the PCID bitmap
5168                 self.pci_segments[self.selected_segment].pci_devices_down = 0;
5169             }
5170             B0EJ_FIELD_OFFSET => {
5171                 assert!(data.len() == B0EJ_FIELD_SIZE);
5172                 // Always return an empty bitmap since the eject is always
5173                 // taken care of right away during a write access.
5174                 data.fill(0);
5175             }
5176             PSEG_FIELD_OFFSET => {
5177                 assert_eq!(data.len(), PSEG_FIELD_SIZE);
5178                 data.copy_from_slice(&(self.selected_segment as u32).to_le_bytes());
5179             }
5180             _ => error!(
5181                 "Accessing unknown location at base 0x{:x}, offset 0x{:x}",
5182                 base, offset
5183             ),
5184         }
5185 
5186         debug!(
5187             "PCI_HP_REG_R: base 0x{:x}, offset 0x{:x}, data {:?}",
5188             base, offset, data
5189         )
5190     }
5191 
5192     fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<std::sync::Barrier>> {
5193         match offset {
5194             B0EJ_FIELD_OFFSET => {
5195                 assert!(data.len() == B0EJ_FIELD_SIZE);
5196                 let mut data_array: [u8; 4] = [0, 0, 0, 0];
5197                 data_array.copy_from_slice(data);
5198                 let mut slot_bitmap = u32::from_le_bytes(data_array);
5199 
5200                 while slot_bitmap > 0 {
5201                     let slot_id = slot_bitmap.trailing_zeros();
5202                     if let Err(e) = self.eject_device(self.selected_segment as u16, slot_id as u8) {
5203                         error!("Failed ejecting device {}: {:?}", slot_id, e);
5204                     }
5205                     slot_bitmap &= !(1 << slot_id);
5206                 }
5207             }
5208             PSEG_FIELD_OFFSET => {
5209                 assert_eq!(data.len(), PSEG_FIELD_SIZE);
5210                 let mut data_array: [u8; 4] = [0, 0, 0, 0];
5211                 data_array.copy_from_slice(data);
5212                 let selected_segment = u32::from_le_bytes(data_array) as usize;
5213                 if selected_segment >= self.pci_segments.len() {
5214                     error!(
5215                         "Segment selection out of range: {} >= {}",
5216                         selected_segment,
5217                         self.pci_segments.len()
5218                     );
5219                     return None;
5220                 }
5221                 self.selected_segment = selected_segment;
5222             }
5223             _ => error!(
5224                 "Accessing unknown location at base 0x{:x}, offset 0x{:x}",
5225                 base, offset
5226             ),
5227         }
5228 
5229         debug!(
5230             "PCI_HP_REG_W: base 0x{:x}, offset 0x{:x}, data {:?}",
5231             base, offset, data
5232         );
5233 
5234         None
5235     }
5236 }
5237 
5238 impl Drop for DeviceManager {
5239     fn drop(&mut self) {
5240         // Wake up the DeviceManager threads (mainly virtio device workers),
5241         // to avoid deadlock on waiting for paused/parked worker threads.
5242         if let Err(e) = self.resume() {
5243             error!("Error resuming DeviceManager: {:?}", e);
5244         }
5245 
5246         for handle in self.virtio_devices.drain(..) {
5247             handle.virtio_device.lock().unwrap().shutdown();
5248         }
5249 
5250         if let Some(termios) = *self.original_termios_opt.lock().unwrap() {
5251             // SAFETY: FFI call
5252             let _ = unsafe { tcsetattr(stdout().lock().as_raw_fd(), TCSANOW, &termios) };
5253         }
5254     }
5255 }
5256 
#[cfg(test)]
mod tests {
    use super::*;

    /// Splits the range 0x100000..0x400000 between PCI segments according to
    /// their weights and checks the bounds of every resulting allocator.
    #[test]
    fn test_create_mmio_allocators() {
        // (number of segments, weight per segment, expected (base, end) per allocator)
        let cases = [
            (1, vec![1], vec![(0x100000, 0x3fffff)]),
            (
                2,
                vec![1, 1],
                vec![(0x100000, 0x27ffff), (0x280000, 0x3fffff)],
            ),
            (
                2,
                vec![2, 1],
                vec![(0x100000, 0x2fffff), (0x300000, 0x3fffff)],
            ),
        ];

        for (num_segments, weights, expected) in cases {
            let res = create_mmio_allocators(0x100000, 0x400000, num_segments, weights, 4 << 10);
            assert_eq!(res.len(), expected.len());

            for (allocator, (base, end)) in res.iter().zip(expected) {
                let allocator = allocator.lock().unwrap();
                assert_eq!(allocator.base(), vm_memory::GuestAddress(base));
                assert_eq!(allocator.end(), vm_memory::GuestAddress(end));
            }
        }
    }
}
5313