xref: /cloud-hypervisor/vmm/src/memory_manager.rs (revision fa7a000dbe9637eb256af18ae8c3c4a8d5bf9c8f)
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
#[cfg(target_arch = "x86_64")]
use crate::config::SgxEpcConfig;
use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError,
};
use crate::migration::url_to_path;
use crate::MEMORY_MANAGER_SNAPSHOT_ID;
use crate::{GuestMemoryMmap, GuestRegionMmap};
use acpi_tables::{aml, Aml};
use anyhow::anyhow;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
use arch::RegionType;
#[cfg(target_arch = "x86_64")]
use devices::ioapic;
#[cfg(target_arch = "aarch64")]
use hypervisor::HypervisorVmError;
use libc::_SC_NPROCESSORS_ONLN;
#[cfg(target_arch = "x86_64")]
use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
use serde::{Deserialize, Serialize};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::fs::{File, OpenOptions};
use std::io::{self};
use std::ops::{BitAnd, Deref, Not, Sub};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::os::fd::AsFd;
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
use std::path::PathBuf;
use std::result;
use std::sync::{Arc, Barrier, Mutex};
use std::{ffi, thread};
use tracer::trace_scoped;
use versionize::{VersionMap, Versionize, VersionizeResult};
use versionize_derive::Versionize;
use virtio_devices::BlocksState;
#[cfg(target_arch = "x86_64")]
use vm_allocator::GsiApic;
use vm_allocator::{AddressAllocator, SystemAllocator};
use vm_device::BusDevice;
use vm_memory::bitmap::AtomicBitmap;
use vm_memory::guest_memory::FileOffset;
use vm_memory::{
    mmap::MmapRegionError, Address, Error as MmapError, GuestAddress, GuestAddressSpace,
    GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion,
    ReadVolatile,
};
use vm_migration::{
    protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable,
    Snapshot, SnapshotData, Snapshottable, Transportable, VersionMapped,
};

pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;

const DEFAULT_MEMORY_ZONE: &str = "mem0";

const SNAPSHOT_FILENAME: &str = "memory-ranges";

#[cfg(target_arch = "x86_64")]
const X86_64_IRQ_BASE: u32 = 5;

#[cfg(target_arch = "x86_64")]
const SGX_PAGE_SIZE: u64 = 1 << 12;

const HOTPLUG_COUNT: usize = 8;

// Memory policy constants
const MPOL_BIND: u32 = 2;
const MPOL_MF_STRICT: u32 = 1;
const MPOL_MF_MOVE: u32 = 1 << 1;

// Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;

const MAX_PREFAULT_THREAD_COUNT: usize = 16;

#[derive(Clone, Default, Serialize, Deserialize, Versionize)]
struct HotPlugState {
    base: u64,
    length: u64,
    active: bool,
    inserting: bool,
    removing: bool,
}

pub struct VirtioMemZone {
    region: Arc<GuestRegionMmap>,
    virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
    hotplugged_size: u64,
    hugepages: bool,
    blocks_state: Arc<Mutex<BlocksState>>,
}

impl VirtioMemZone {
    pub fn region(&self) -> &Arc<GuestRegionMmap> {
        &self.region
    }
    pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
        self.virtio_device = Some(virtio_device);
    }
    pub fn hotplugged_size(&self) -> u64 {
        self.hotplugged_size
    }
    pub fn hugepages(&self) -> bool {
        self.hugepages
    }
    pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
        &self.blocks_state
    }
    pub fn plugged_ranges(&self) -> MemoryRangeTable {
        self.blocks_state
            .lock()
            .unwrap()
            .memory_ranges(self.region.start_addr().raw_value(), true)
    }
}

#[derive(Default)]
pub struct MemoryZone {
    regions: Vec<Arc<GuestRegionMmap>>,
    virtio_mem_zone: Option<VirtioMemZone>,
}

impl MemoryZone {
    pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
        &self.regions
    }
    pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
        &self.virtio_mem_zone
    }
    pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
        self.virtio_mem_zone.as_mut()
    }
}

pub type MemoryZones = HashMap<String, MemoryZone>;

#[derive(Clone, Serialize, Deserialize, Versionize)]
struct GuestRamMapping {
    slot: u32,
    gpa: u64,
    size: u64,
    zone_id: String,
    virtio_mem: bool,
    file_offset: u64,
}

#[derive(Clone, Serialize, Deserialize, Versionize)]
struct ArchMemRegion {
    base: u64,
    size: usize,
    r_type: RegionType,
}

pub struct MemoryManager {
    boot_guest_memory: GuestMemoryMmap,
    guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
    next_memory_slot: u32,
    start_of_device_area: GuestAddress,
    end_of_device_area: GuestAddress,
    end_of_ram_area: GuestAddress,
    pub vm: Arc<dyn hypervisor::Vm>,
    hotplug_slots: Vec<HotPlugState>,
    selected_slot: usize,
    mergeable: bool,
    allocator: Arc<Mutex<SystemAllocator>>,
    hotplug_method: HotplugMethod,
    boot_ram: u64,
    current_ram: u64,
    next_hotplug_slot: usize,
    shared: bool,
    hugepages: bool,
    hugepage_size: Option<u64>,
    prefault: bool,
    thp: bool,
    #[cfg(target_arch = "x86_64")]
    sgx_epc_region: Option<SgxEpcRegion>,
    user_provided_zones: bool,
    snapshot_memory_ranges: MemoryRangeTable,
    memory_zones: MemoryZones,
    log_dirty: bool, // Enable dirty logging for created RAM regions
    arch_mem_regions: Vec<ArchMemRegion>,
    ram_allocator: AddressAllocator,
    dynamic: bool,

    // Keep track of calls to create_userspace_mapping() for guest RAM.
    // This is useful for getting the dirty pages as we need to know the
    // slots that the mappings are created in.
    guest_ram_mappings: Vec<GuestRamMapping>,

    pub acpi_address: Option<GuestAddress>,
    #[cfg(target_arch = "aarch64")]
    uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
}

#[derive(Debug)]
pub enum Error {
    /// Failed to create shared file.
    SharedFileCreate(io::Error),

    /// Failed to set shared file length.
    SharedFileSetLen(io::Error),

    /// Mmap backed guest memory error
    GuestMemory(MmapError),

    /// Failed to allocate a memory range.
    MemoryRangeAllocation,

    /// Error from region creation
    GuestMemoryRegion(MmapRegionError),

    /// No ACPI slot available
    NoSlotAvailable,

    /// Not enough space in the hotplug RAM region
    InsufficientHotplugRam,

    /// The requested hotplug memory addition is not a valid size
    InvalidSize,

    /// Failed to create the user memory region.
    CreateUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to remove the user memory region.
    RemoveUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to create an EventFd.
    EventFdFail(io::Error),

    /// Eventfd write error
    EventfdError(io::Error),

    /// Failed to resize virtio-mem
    VirtioMemResizeFail(virtio_devices::mem::Error),

    /// Cannot restore VM
    Restore(MigratableError),

    /// Cannot restore VM because source URL is missing
    RestoreMissingSourceUrl,

    /// Cannot create the system allocator
    CreateSystemAllocator,

    /// Invalid SGX EPC section size
    #[cfg(target_arch = "x86_64")]
    EpcSectionSizeInvalid,

    /// Failed allocating SGX EPC region
    #[cfg(target_arch = "x86_64")]
    SgxEpcRangeAllocation,

    /// Failed opening SGX virtual EPC device
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcOpen(io::Error),

    /// Failed setting the SGX virtual EPC section size
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcFileSetLen(io::Error),

    /// Failed opening SGX provisioning device
    #[cfg(target_arch = "x86_64")]
    SgxProvisionOpen(io::Error),

    /// Failed enabling SGX provisioning
    #[cfg(target_arch = "x86_64")]
    SgxEnableProvisioning(hypervisor::HypervisorVmError),

    /// Failed creating a new MmapRegion instance.
    #[cfg(target_arch = "x86_64")]
    NewMmapRegion(vm_memory::mmap::MmapRegionError),

    /// No memory zones found.
    MissingMemoryZones,

    /// Memory configuration is not valid.
    InvalidMemoryParameters,

    /// Forbidden operation. Impossible to resize guest memory if it is
    /// backed by user defined memory regions.
    InvalidResizeWithMemoryZones,

    /// It's invalid to try applying a NUMA policy to a memory zone that is
    /// memory mapped with MAP_SHARED.
    InvalidSharedMemoryZoneWithHostNuma,

    /// Failed applying NUMA memory policy.
    ApplyNumaPolicy(io::Error),

    /// Memory zone identifier is not unique.
    DuplicateZoneId,

    /// No virtio-mem resizing handler found.
    MissingVirtioMemHandler,

    /// Unknown memory zone.
    UnknownMemoryZone,

    /// Invalid size for resizing. The size can be any value except 0.
    InvalidHotplugSize,

    /// Invalid hotplug method associated with memory zones resizing capability.
    InvalidHotplugMethodWithMemoryZones,

    /// Could not find specified memory zone identifier from hash map.
    MissingZoneIdentifier,

    /// Resizing the memory zone failed.
    ResizeZone,

    /// Guest address overflow
    GuestAddressOverFlow,

    /// Error opening snapshot file
    SnapshotOpen(io::Error),

    /// Error copying snapshot into region
    SnapshotCopy(GuestMemoryError),

    /// Failed to allocate MMIO address
    AllocateMmioAddress,

    #[cfg(target_arch = "aarch64")]
    /// Failed to create UEFI flash
    CreateUefiFlash(HypervisorVmError),

    /// Using a directory as a backing file for memory is not supported
    DirectoryAsBackingFileForMemory,

    /// Failed to stat filesystem
    GetFileSystemBlockSize(io::Error),

    /// Memory size is not aligned to the default page size or the hugepage size
    MisalignedMemorySize,
}

const ENABLE_FLAG: usize = 0;
const INSERTING_FLAG: usize = 1;
const REMOVING_FLAG: usize = 2;
const EJECT_FLAG: usize = 3;

const BASE_OFFSET_LOW: u64 = 0;
const BASE_OFFSET_HIGH: u64 = 0x4;
const LENGTH_OFFSET_LOW: u64 = 0x8;
const LENGTH_OFFSET_HIGH: u64 = 0xC;
const STATUS_OFFSET: u64 = 0x14;
const SELECTION_OFFSET: u64 = 0;

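// Illustrative note (not in the original source): a STATUS_OFFSET read that
// returns 0b0000_0011 means the selected hotplug slot is enabled
// (ENABLE_FLAG) and has an insertion pending (INSERTING_FLAG); the guest
// acknowledges the insertion by writing 1 << INSERTING_FLAG back to
// STATUS_OFFSET, as handled by the BusDevice implementation below.
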
// 64 KiB is subtracted from the MMIO address space size. This is done for the
// following reasons:
//  - Reduce the addressable space size by at least 4k to workaround a Linux
//    bug when the VMM allocates devices at the end of the addressable space
//  - Windows requires the addressable space size to be 64k aligned
fn mmio_address_space_size(phys_bits: u8) -> u64 {
    (1 << phys_bits) - (1 << 16)
}

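// Worked example (illustrative, not in the original source): with
// phys_bits = 40, the result is (1 << 40) - (1 << 16) = 0xFF_FFFF_0000,
// which is 64 KiB aligned and stops 64 KiB short of the top of the
// addressable space.
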
// The `statfs` function can retrieve information about a hugetlbfs mount;
// the hugepage size is reported in the `f_bsize` field.
//
// See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169
fn statfs_get_bsize(path: &str) -> Result<u64, Error> {
    let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?;
    let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit();

    // SAFETY: FFI call with a valid path and buffer
    let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) };
    if ret != 0 {
        return Err(Error::GetFileSystemBlockSize(
            std::io::Error::last_os_error(),
        ));
    }

    // SAFETY: `buf` is valid at this point
    // Because this value is always positive, just convert it directly.
    // Note that `f_bsize` is `i64` in glibc and `u64` in musl, so using
    // `as u64` triggers a `clippy` warning on musl targets. To avoid the
    // warning, use `as _` instead of `as u64`.
    let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _;
    Ok(bsize)
}

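// For example (illustrative, not in the original source): on a hugetlbfs
// mount using the x86_64 default hugepage size, this function would be
// expected to return 2 MiB (0x20_0000), while a file system such as ext4
// typically reports an f_bsize of 4096.
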
fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> {
    // SAFETY: FFI call. Trivially safe.
    let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };

    // If there is no backing file and `hugepages` is disabled, just use the
    // system page size.
    if zone.file.is_none() && !zone.hugepages {
        return Ok(page_size);
    }

    // If `hugepages` is enabled and `hugepage_size` is specified, use it
    // directly.
    if zone.hugepages && zone.hugepage_size.is_some() {
        return Ok(zone.hugepage_size.unwrap());
    }

    // There are two scenarios here:
    //  - `hugepages` is enabled but `hugepage_size` is not specified:
    //     Call `statfs` for `/dev/hugepages` for getting the default size of hugepage
    //  - The backing file is specified:
    //     Call `statfs` for the file and get its `f_bsize`.  If the value is larger than the page
    //     size of normal page, just use the `f_bsize` because the file is in a hugetlbfs.  If the
    //     value is less than or equal to the page size, just use the page size.
    let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| {
        pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
    })?;

    let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);

    Ok(align_size)
}

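// Worked examples (illustrative, not in the original source):
//  - no backing file, hugepages disabled        -> system page size (e.g. 4 KiB)
//  - hugepages enabled, hugepage_size = 1 GiB   -> 1 GiB
//  - file on ext4 (f_bsize = 4096)              -> max(4 KiB, 4096) = 4 KiB
//  - file on hugetlbfs with 2 MiB hugepages     -> max(4 KiB, 2 MiB) = 2 MiB
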
#[inline]
fn align_down<T>(val: T, align: T) -> T
where
    T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>,
{
    val & !(align - 1u8.into())
}

#[inline]
fn is_aligned<T>(val: T, align: T) -> bool
where
    T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq,
{
    (val & (align - 1u8.into())) == 0u8.into()
}

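// A minimal sketch (not part of the original source) exercising the two
// helpers above with a 4 KiB alignment.
#[cfg(test)]
mod alignment_helper_examples {
    use super::{align_down, is_aligned};

    #[test]
    fn align_helpers() {
        // align_down rounds down to the nearest multiple of the alignment.
        assert_eq!(align_down(0x1234u64, 0x1000u64), 0x1000);
        assert_eq!(align_down(0x2000u64, 0x1000u64), 0x2000);
        // is_aligned checks for an exact multiple.
        assert!(is_aligned(0x2000u64, 0x1000u64));
        assert!(!is_aligned(0x1234u64, 0x1000u64));
    }
}
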
impl BusDevice for MemoryManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        if self.selected_slot < self.hotplug_slots.len() {
            let state = &self.hotplug_slots[self.selected_slot];
            match offset {
                BASE_OFFSET_LOW => {
                    data.copy_from_slice(&state.base.to_le_bytes()[..4]);
                }
                BASE_OFFSET_HIGH => {
                    data.copy_from_slice(&state.base.to_le_bytes()[4..]);
                }
                LENGTH_OFFSET_LOW => {
                    data.copy_from_slice(&state.length.to_le_bytes()[..4]);
                }
                LENGTH_OFFSET_HIGH => {
                    data.copy_from_slice(&state.length.to_le_bytes()[4..]);
                }
                STATUS_OFFSET => {
                    // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
                    data.fill(0);
                    if state.active {
                        data[0] |= 1 << ENABLE_FLAG;
                    }
                    if state.inserting {
                        data[0] |= 1 << INSERTING_FLAG;
                    }
                    if state.removing {
                        data[0] |= 1 << REMOVING_FLAG;
                    }
                }
                _ => {
                    warn!(
                        "Unexpected offset for accessing memory manager device: {:#x}",
                        offset
                    );
                }
            }
        } else {
            warn!("Out of range memory slot: {}", self.selected_slot);
        }
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            SELECTION_OFFSET => {
                self.selected_slot = usize::from(data[0]);
            }
            STATUS_OFFSET => {
                if self.selected_slot < self.hotplug_slots.len() {
                    let state = &mut self.hotplug_slots[self.selected_slot];
                    // The ACPI code writes back a 1 to acknowledge the insertion
                    if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
                        state.inserting = false;
                    }
                    // Ditto for removal
                    if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
                        state.removing = false;
                    }
                    // Trigger removal of "DIMM"
                    if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
                        warn!("Ejection of memory not currently supported");
                    }
                } else {
                    warn!("Out of range memory slot: {}", self.selected_slot);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing memory manager device: {:#x}",
                    offset
                );
            }
        };
        None
    }
}

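// Typical guest interaction with the hotplug device above (illustrative
// summary, not in the original source):
//  1. write a slot index to SELECTION_OFFSET,
//  2. read the STATUS and base/length registers for that slot,
//  3. write 1 << INSERTING_FLAG (or 1 << REMOVING_FLAG) back to STATUS_OFFSET
//     to acknowledge the event once the "DIMM" has been handled.
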
impl MemoryManager {
    /// Creates all memory regions based on the available RAM ranges defined
    /// by `ram_regions`, and based on the description of the memory zones.
    /// In practice, this function can perform multiple memory mappings of the
    /// same backing file if there's a hole in the address space between two
    /// RAM ranges.
    ///
    /// One example might be `ram_regions` containing 2 regions (0-3G and 4G-6G)
    /// and `zones` containing two zones (size 1G and size 4G).
    /// This function will create 3 resulting memory regions:
    /// - First one mapping entirely the first memory zone on 0-1G range
    /// - Second one mapping partially the second memory zone on 1G-3G range
    /// - Third one mapping partially the second memory zone on 4G-6G range
    ///
    /// Also, all memory regions are page-size aligned (i.e. their sizes must
    /// be a multiple of the page size), which may leave an additional hole in
    /// the address space when hugepages are used.
    fn create_memory_regions_from_zones(
        ram_regions: &[(GuestAddress, usize)],
        zones: &[MemoryZoneConfig],
        prefault: Option<bool>,
        thp: bool,
    ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
        let mut zone_iter = zones.iter();
        let mut mem_regions = Vec::new();
        let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?;
        let mut zone_align_size = memory_zone_get_align_size(zone)?;
        let mut zone_offset = 0u64;
        let mut memory_zones = HashMap::new();

        if !is_aligned(zone.size, zone_align_size) {
            return Err(Error::MisalignedMemorySize);
        }

        // Add zone id to the list of memory zones.
        memory_zones.insert(zone.id.clone(), MemoryZone::default());

        for ram_region in ram_regions.iter() {
            let mut ram_region_offset = 0;
            let mut exit = false;

            loop {
                let mut ram_region_consumed = false;
                let mut pull_next_zone = false;

                let ram_region_available_size =
                    align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size);
                if ram_region_available_size == 0 {
                    break;
                }
                let zone_sub_size = zone.size - zone_offset;

                let file_offset = zone_offset;
                let region_start = ram_region
                    .0
                    .checked_add(ram_region_offset)
                    .ok_or(Error::GuestAddressOverFlow)?;
                let region_size = if zone_sub_size <= ram_region_available_size {
                    if zone_sub_size == ram_region_available_size {
                        ram_region_consumed = true;
                    }

                    ram_region_offset += zone_sub_size;
                    pull_next_zone = true;

                    zone_sub_size
                } else {
                    zone_offset += ram_region_available_size;
                    ram_region_consumed = true;

                    ram_region_available_size
                };

                info!(
                    "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}",
                    zone.id,
                    region_start.raw_value(),
                    region_size
                );
                let region = MemoryManager::create_ram_region(
                    &zone.file,
                    file_offset,
                    region_start,
                    region_size as usize,
                    prefault.unwrap_or(zone.prefault),
                    zone.shared,
                    zone.hugepages,
                    zone.hugepage_size,
                    zone.host_numa_node,
                    None,
                    thp,
                )?;

                // Add region to the list of regions associated with the
                // current memory zone.
                if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
                    memory_zone.regions.push(region.clone());
                }

                mem_regions.push(region);

                if pull_next_zone {
                    // Get the next zone and reset the offset.
                    zone_offset = 0;
                    if let Some(z) = zone_iter.next() {
                        zone = z;
                    } else {
                        exit = true;
                        break;
                    }
                    zone_align_size = memory_zone_get_align_size(zone)?;
                    if !is_aligned(zone.size, zone_align_size) {
                        return Err(Error::MisalignedMemorySize);
                    }

                    // Check if the zone id already exists. In case it does,
                    // throw an error as we need unique identifiers. Otherwise,
                    // add the new zone id to the list of memory zones.
                    if memory_zones.contains_key(&zone.id) {
                        error!(
                            "Memory zone identifier '{}' found more than once. \
                            It must be unique",
                            zone.id,
                        );
                        return Err(Error::DuplicateZoneId);
                    }
                    memory_zones.insert(zone.id.clone(), MemoryZone::default());
                }

                if ram_region_consumed {
                    break;
                }
            }

            if exit {
                break;
            }
        }

        Ok((mem_regions, memory_zones))
    }

    // Restore both GuestMemory regions along with MemoryZone zones.
    fn restore_memory_regions_and_zones(
        guest_ram_mappings: &[GuestRamMapping],
        zones_config: &[MemoryZoneConfig],
        prefault: Option<bool>,
        mut existing_memory_files: HashMap<u32, File>,
        thp: bool,
    ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
        let mut memory_regions = Vec::new();
        let mut memory_zones = HashMap::new();

        for zone_config in zones_config {
            memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
        }

        for guest_ram_mapping in guest_ram_mappings {
            for zone_config in zones_config {
                if guest_ram_mapping.zone_id == zone_config.id {
                    let region = MemoryManager::create_ram_region(
                        if guest_ram_mapping.virtio_mem {
                            &None
                        } else {
                            &zone_config.file
                        },
                        guest_ram_mapping.file_offset,
                        GuestAddress(guest_ram_mapping.gpa),
                        guest_ram_mapping.size as usize,
                        prefault.unwrap_or(zone_config.prefault),
                        zone_config.shared,
                        zone_config.hugepages,
                        zone_config.hugepage_size,
                        zone_config.host_numa_node,
                        existing_memory_files.remove(&guest_ram_mapping.slot),
                        thp,
                    )?;
                    memory_regions.push(Arc::clone(&region));
                    if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
                        if guest_ram_mapping.virtio_mem {
                            let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
                            let region_size = region.len();
                            memory_zone.virtio_mem_zone = Some(VirtioMemZone {
                                region,
                                virtio_device: None,
                                hotplugged_size,
                                hugepages: zone_config.hugepages,
                                blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
                            });
                        } else {
                            memory_zone.regions.push(region);
                        }
                    }
                }
            }
        }

        memory_regions.sort_by_key(|x| x.start_addr());

        Ok((memory_regions, memory_zones))
    }

    fn fill_saved_regions(
        &mut self,
        file_path: PathBuf,
        saved_regions: MemoryRangeTable,
    ) -> Result<(), Error> {
        if saved_regions.is_empty() {
            return Ok(());
        }

        // Open (read only) the snapshot file.
        let mut memory_file = OpenOptions::new()
            .read(true)
            .open(file_path)
            .map_err(Error::SnapshotOpen)?;

        let guest_memory = self.guest_memory.memory();
        for range in saved_regions.regions() {
            let mut offset: u64 = 0;
            // Here we handle the retry manually, in case the whole region
            // can't be filled in one call: we can't use the read_exact_from()
            // implementation from vm-memory::GuestMemory as it does not
            // follow the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_read = guest_memory
                    .read_volatile_from(
                        GuestAddress(range.gpa + offset),
                        &mut memory_file,
                        (range.length - offset) as usize,
                    )
                    .map_err(Error::SnapshotCopy)?;
                offset += bytes_read as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        Ok(())
    }

    fn validate_memory_config(
        config: &MemoryConfig,
        user_provided_zones: bool,
    ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
        let mut allow_mem_hotplug = false;

        if !user_provided_zones {
            if config.zones.is_some() {
                error!(
                    "User defined memory regions can't be provided if the \
                    memory size is not 0"
                );
                return Err(Error::InvalidMemoryParameters);
            }

            if config.hotplug_size.is_some() {
                allow_mem_hotplug = true;
            }

            if let Some(hotplugged_size) = config.hotplugged_size {
                if let Some(hotplug_size) = config.hotplug_size {
                    if hotplugged_size > hotplug_size {
                        error!(
                            "'hotplugged_size' {} can't be bigger than \
                            'hotplug_size' {}",
                            hotplugged_size, hotplug_size,
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                } else {
                    error!(
                        "Invalid to define 'hotplugged_size' when there is \
                        no 'hotplug_size'"
                    );
                    return Err(Error::InvalidMemoryParameters);
                }
                if config.hotplug_method == HotplugMethod::Acpi {
                    error!(
                        "Invalid to define 'hotplugged_size' with hotplug \
                        method 'acpi'"
                    );
                    return Err(Error::InvalidMemoryParameters);
                }
            }

            // Create a single zone from the global memory config. This lets
            // us reuse the codepath for user defined memory zones.
            let zones = vec![MemoryZoneConfig {
                id: String::from(DEFAULT_MEMORY_ZONE),
                size: config.size,
                file: None,
                shared: config.shared,
                hugepages: config.hugepages,
                hugepage_size: config.hugepage_size,
                host_numa_node: None,
                hotplug_size: config.hotplug_size,
                hotplugged_size: config.hotplugged_size,
                prefault: config.prefault,
            }];

            Ok((config.size, zones, allow_mem_hotplug))
        } else {
            if config.zones.is_none() {
                error!(
                    "User defined memory regions must be provided if the \
                    memory size is 0"
                );
                return Err(Error::MissingMemoryZones);
            }

            // Safe to unwrap as we checked right above there were some
            // regions.
            let zones = config.zones.clone().unwrap();
            if zones.is_empty() {
                return Err(Error::MissingMemoryZones);
            }

            let mut total_ram_size: u64 = 0;
            for zone in zones.iter() {
                total_ram_size += zone.size;

                if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
                    error!(
                        "Invalid to set host NUMA policy for a memory zone \
                        backed by a regular file and mapped as 'shared'"
                    );
                    return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
                }

                if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
                    error!("Invalid to set ACPI hotplug method for memory zones");
                    return Err(Error::InvalidHotplugMethodWithMemoryZones);
                }

                if let Some(hotplugged_size) = zone.hotplugged_size {
                    if let Some(hotplug_size) = zone.hotplug_size {
                        if hotplugged_size > hotplug_size {
                            error!(
                                "'hotplugged_size' {} can't be bigger than \
                                'hotplug_size' {}",
                                hotplugged_size, hotplug_size,
                            );
                            return Err(Error::InvalidMemoryParameters);
                        }
                    } else {
                        error!(
                            "Invalid to define 'hotplugged_size' when there is \
                            no 'hotplug_size' for a memory zone"
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                    if config.hotplug_method == HotplugMethod::Acpi {
                        error!(
                            "Invalid to define 'hotplugged_size' with hotplug \
                            method 'acpi'"
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                }
            }

            Ok((total_ram_size, zones, allow_mem_hotplug))
        }
    }

    pub fn allocate_address_space(&mut self) -> Result<(), Error> {
        let mut list = Vec::new();

        for (zone_id, memory_zone) in self.memory_zones.iter() {
            let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
                memory_zone
                    .regions()
                    .iter()
                    .map(|r| (r.clone(), false))
                    .collect();

            if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
                regions.push((virtio_mem_zone.region().clone(), true));
            }

            list.push((zone_id.clone(), regions));
        }

        for (zone_id, regions) in list {
            for (region, virtio_mem) in regions {
                let slot = self.create_userspace_mapping(
                    region.start_addr().raw_value(),
                    region.len(),
                    region.as_ptr() as u64,
                    self.mergeable,
                    false,
                    self.log_dirty,
                )?;

                let file_offset = if let Some(file_offset) = region.file_offset() {
                    file_offset.start()
                } else {
                    0
                };

                self.guest_ram_mappings.push(GuestRamMapping {
                    gpa: region.start_addr().raw_value(),
                    size: region.len(),
                    slot,
                    zone_id: zone_id.clone(),
                    virtio_mem,
                    file_offset,
                });
                self.ram_allocator
                    .allocate(Some(region.start_addr()), region.len(), None)
                    .ok_or(Error::MemoryRangeAllocation)?;
            }
        }

        // Allocate SubRegion and Reserved address ranges.
        for region in self.arch_mem_regions.iter() {
            if region.r_type == RegionType::Ram {
                // Ignore the RAM type since ranges have already been allocated
                // based on the GuestMemory regions.
                continue;
            }
            self.ram_allocator
                .allocate(
                    Some(GuestAddress(region.base)),
                    region.size as GuestUsize,
                    None,
                )
                .ok_or(Error::MemoryRangeAllocation)?;
        }

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn add_uefi_flash(&mut self) -> Result<(), Error> {
        // On AArch64, the UEFI binary requires a flash device at address 0.
        // 4 MiB memory is mapped to simulate the flash.
        let uefi_mem_slot = self.allocate_memory_slot();
        let uefi_region = GuestRegionMmap::new(
            MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(),
            arch::layout::UEFI_START,
        )
        .unwrap();
        let uefi_mem_region = self.vm.make_user_memory_region(
            uefi_mem_slot,
            uefi_region.start_addr().raw_value(),
            uefi_region.len(),
            uefi_region.as_ptr() as u64,
            false,
            false,
        );
        self.vm
            .create_user_memory_region(uefi_mem_region)
            .map_err(Error::CreateUefiFlash)?;

        let uefi_flash =
            GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap());

        self.uefi_flash = Some(uefi_flash);

        Ok(())
    }

    #[allow(clippy::too_many_arguments)]
    pub fn new(
        vm: Arc<dyn hypervisor::Vm>,
        config: &MemoryConfig,
        prefault: Option<bool>,
        phys_bits: u8,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        restore_data: Option<&MemoryManagerSnapshotData>,
        existing_memory_files: Option<HashMap<u32, File>>,
        #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
    ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
        trace_scoped!("MemoryManager::new");

        let user_provided_zones = config.size == 0;

        let mmio_address_space_size = mmio_address_space_size(phys_bits);
        debug_assert_eq!(
            (((mmio_address_space_size) >> 16) << 16),
            mmio_address_space_size
        );
        let start_of_platform_device_area =
            GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
        let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);

        let (ram_size, zones, allow_mem_hotplug) =
            Self::validate_memory_config(config, user_provided_zones)?;

        let (
            start_of_device_area,
            boot_ram,
            current_ram,
            arch_mem_regions,
            memory_zones,
            guest_memory,
            boot_guest_memory,
            hotplug_slots,
            next_memory_slot,
            selected_slot,
            next_hotplug_slot,
        ) = if let Some(data) = restore_data {
            let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
                &data.guest_ram_mappings,
                &zones,
                prefault,
                existing_memory_files.unwrap_or_default(),
                config.thp,
            )?;
            let guest_memory =
                GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
            let boot_guest_memory = guest_memory.clone();
            (
                GuestAddress(data.start_of_device_area),
                data.boot_ram,
                data.current_ram,
                data.arch_mem_regions.clone(),
                memory_zones,
                guest_memory,
                boot_guest_memory,
                data.hotplug_slots.clone(),
                data.next_memory_slot,
                data.selected_slot,
                data.next_hotplug_slot,
            )
        } else {
            // Init guest memory
            let arch_mem_regions = arch::arch_memory_regions();

            let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
                .iter()
                .filter(|r| r.2 == RegionType::Ram)
                .map(|r| (r.0, r.1))
                .collect();

            let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
                .iter()
                .map(|(a, b, c)| ArchMemRegion {
                    base: a.0,
                    size: *b,
                    r_type: *c,
                })
                .collect();

            let (mem_regions, mut memory_zones) =
                Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?;

            let mut guest_memory =
                GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;

            let boot_guest_memory = guest_memory.clone();

            let mut start_of_device_area =
                MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;

            // Update list of memory zones for resize.
            for zone in zones.iter() {
                if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
                    if let Some(hotplug_size) = zone.hotplug_size {
                        if hotplug_size == 0 {
                            error!("'hotplug_size' can't be 0");
                            return Err(Error::InvalidHotplugSize);
                        }

                        if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
                            start_of_device_area = start_of_device_area
                                .checked_add(hotplug_size)
                                .ok_or(Error::GuestAddressOverFlow)?;
                        } else {
                            // Alignment must be "natural" i.e. same as size of block
                            let start_addr = GuestAddress(
                                (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE
                                    - 1)
                                    / virtio_devices::VIRTIO_MEM_ALIGN_SIZE
                                    * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
                            );

                            // When `prefault` is set by vm_restore, the memory
                            // manager will create the RAM region with the
                            // `prefault` option from the restore config rather
                            // than the zone's own option.
                            let region = MemoryManager::create_ram_region(
                                &None,
                                0,
                                start_addr,
                                hotplug_size as usize,
                                prefault.unwrap_or(zone.prefault),
                                zone.shared,
                                zone.hugepages,
                                zone.hugepage_size,
                                zone.host_numa_node,
                                None,
                                config.thp,
                            )?;

                            guest_memory = guest_memory
                                .insert_region(Arc::clone(&region))
                                .map_err(Error::GuestMemory)?;

                            let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
                            let region_size = region.len();
                            memory_zone.virtio_mem_zone = Some(VirtioMemZone {
                                region,
                                virtio_device: None,
                                hotplugged_size,
                                hugepages: zone.hugepages,
                                blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
                            });

                            start_of_device_area = start_addr
                                .checked_add(hotplug_size)
                                .ok_or(Error::GuestAddressOverFlow)?;
                        }
                    }
                } else {
                    return Err(Error::MissingZoneIdentifier);
                }
            }

            let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
            hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);

            (
                start_of_device_area,
                ram_size,
                ram_size,
                arch_mem_regions,
                memory_zones,
                guest_memory,
                boot_guest_memory,
                hotplug_slots,
                0,
                0,
                0,
            )
        };

        let guest_memory = GuestMemoryAtomic::new(guest_memory);

        // Both MMIO and PIO address spaces start at address 0.
        let allocator = Arc::new(Mutex::new(
            SystemAllocator::new(
                #[cfg(target_arch = "x86_64")]
                {
                    GuestAddress(0)
                },
                #[cfg(target_arch = "x86_64")]
                {
                    1 << 16
                },
                start_of_platform_device_area,
                PLATFORM_DEVICE_AREA_SIZE,
                #[cfg(target_arch = "x86_64")]
                vec![GsiApic::new(
                    X86_64_IRQ_BASE,
                    ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
                )],
            )
            .ok_or(Error::CreateSystemAllocator)?,
        ));

        #[cfg(not(feature = "tdx"))]
        let dynamic = true;
        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;

        let acpi_address = if dynamic
            && config.hotplug_method == HotplugMethod::Acpi
            && (config.hotplug_size.unwrap_or_default() > 0)
        {
            Some(
                allocator
                    .lock()
                    .unwrap()
                    .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
                    .ok_or(Error::AllocateMmioAddress)?,
            )
        } else {
            None
        };

        // When running with SGX, the start of the device area and the RAM
        // area may diverge, but at this point they are next to each other.
        let end_of_ram_area = start_of_device_area.unchecked_sub(1);
        let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();

        let mut memory_manager = MemoryManager {
            boot_guest_memory,
            guest_memory,
            next_memory_slot,
            start_of_device_area,
            end_of_device_area,
            end_of_ram_area,
            vm,
            hotplug_slots,
            selected_slot,
            mergeable: config.mergeable,
            allocator,
            hotplug_method: config.hotplug_method,
            boot_ram,
            current_ram,
            next_hotplug_slot,
            shared: config.shared,
            hugepages: config.hugepages,
            hugepage_size: config.hugepage_size,
            prefault: config.prefault,
            #[cfg(target_arch = "x86_64")]
            sgx_epc_region: None,
            user_provided_zones,
            snapshot_memory_ranges: MemoryRangeTable::default(),
            memory_zones,
            guest_ram_mappings: Vec::new(),
            acpi_address,
            log_dirty: dynamic, // Cannot log dirty pages on a TD
            arch_mem_regions,
            ram_allocator,
            dynamic,
            #[cfg(target_arch = "aarch64")]
            uefi_flash: None,
            thp: config.thp,
        };

        #[cfg(target_arch = "aarch64")]
        {
            // For Aarch64 we cannot lazily allocate the address space like we
            // do for x86, because while restoring a VM from snapshot we would
            // need the address space to be allocated to properly restore VGIC.
            // And the restore of VGIC happens before we attempt to run the vCPUs
            // for the first time, thus we need to allocate the address space
            // beforehand.
            memory_manager.allocate_address_space()?;
            memory_manager.add_uefi_flash()?;
        }

        #[cfg(target_arch = "x86_64")]
        if let Some(sgx_epc_config) = sgx_epc_config {
            memory_manager.setup_sgx(sgx_epc_config)?;
        }

        Ok(Arc::new(Mutex::new(memory_manager)))
    }

    pub fn new_from_snapshot(
        snapshot: &Snapshot,
        vm: Arc<dyn hypervisor::Vm>,
        config: &MemoryConfig,
        source_url: Option<&str>,
        prefault: bool,
        phys_bits: u8,
    ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
        if let Some(source_url) = source_url {
            let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
            memory_file_path.push(String::from(SNAPSHOT_FILENAME));

            let mem_snapshot: MemoryManagerSnapshotData =
                snapshot.to_versioned_state().map_err(Error::Restore)?;

            let mm = MemoryManager::new(
                vm,
                config,
                Some(prefault),
                phys_bits,
                #[cfg(feature = "tdx")]
                false,
                Some(&mem_snapshot),
                None,
                #[cfg(target_arch = "x86_64")]
                None,
            )?;

            mm.lock()
                .unwrap()
                .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;

            Ok(mm)
        } else {
            Err(Error::RestoreMissingSourceUrl)
        }
    }

    fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
        // SAFETY: FFI call with correct arguments
        let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };

        if res < 0 {
            Err(io::Error::last_os_error())
        } else {
            Ok(res as RawFd)
        }
    }

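    // Note (assumption, not in the original source): using the raw
    // libc::syscall(SYS_memfd_create, ...) form avoids depending on a
    // memfd_create() wrapper being exposed by the C library; glibc only
    // gained such a wrapper in version 2.27.
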
    fn mbind(
        addr: *mut u8,
        len: u64,
        mode: u32,
        nodemask: Vec<u64>,
        maxnode: u64,
        flags: u32,
    ) -> Result<(), io::Error> {
        // SAFETY: FFI call with correct arguments
        let res = unsafe {
            libc::syscall(
                libc::SYS_mbind,
                addr as *mut libc::c_void,
                len,
                mode,
                nodemask.as_ptr(),
                maxnode,
                flags,
            )
        };

        if res < 0 {
            Err(io::Error::last_os_error())
        } else {
            Ok(())
        }
    }

    fn create_anonymous_file(
        size: usize,
        hugepages: bool,
        hugepage_size: Option<u64>,
    ) -> Result<FileOffset, Error> {
        let fd = Self::memfd_create(
            &ffi::CString::new("ch_ram").unwrap(),
            libc::MFD_CLOEXEC
                | if hugepages {
                    libc::MFD_HUGETLB
                        | if let Some(hugepage_size) = hugepage_size {
                            /*
                             * From the Linux kernel:
                             * Several system calls take a flag to request "hugetlb" huge pages.
                             * Without further specification, these system calls will use the
                             * system's default huge page size.  If a system supports multiple
                             * huge page sizes, the desired huge page size can be specified in
                             * bits [26:31] of the flag arguments.  The value in these 6 bits
                             * will encode the log2 of the huge page size.
                             */

                            hugepage_size.trailing_zeros() << 26
                        } else {
                            // Use the system default huge page size
                            0
                        }
                } else {
                    0
                },
        )
        .map_err(Error::SharedFileCreate)?;

        // SAFETY: fd is valid
        let f = unsafe { File::from_raw_fd(fd) };
        f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;

        Ok(FileOffset::new(f, 0))
    }

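    // Worked example (illustrative, not in the original source): for a 2 MiB
    // hugepage, hugepage_size.trailing_zeros() is 21 (2 MiB == 1 << 21), so
    // the encoded flag is 21 << 26, which matches the kernel's MFD_HUGE_2MB;
    // for a 1 GiB hugepage the log2 is 30, matching MFD_HUGE_1GB.
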
    fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> {
        if backing_file.is_dir() {
            Err(Error::DirectoryAsBackingFileForMemory)
        } else {
            let f = OpenOptions::new()
                .read(true)
                .write(true)
                .open(backing_file)
                .map_err(Error::SharedFileCreate)?;

            Ok(FileOffset::new(f, file_offset))
        }
    }

1389     #[allow(clippy::too_many_arguments)]
1390     pub fn create_ram_region(
1391         backing_file: &Option<PathBuf>,
1392         file_offset: u64,
1393         start_addr: GuestAddress,
1394         size: usize,
1395         prefault: bool,
1396         shared: bool,
1397         hugepages: bool,
1398         hugepage_size: Option<u64>,
1399         host_numa_node: Option<u32>,
1400         existing_memory_file: Option<File>,
1401         thp: bool,
1402     ) -> Result<Arc<GuestRegionMmap>, Error> {
1403         let mut mmap_flags = libc::MAP_NORESERVE;
1404 
1405         // The duplication of mmap_flags ORing here is unfortunate but it also makes
1406         // the complexity of the handling clear.
1407         let fo = if let Some(f) = existing_memory_file {
1408             // It must be MAP_SHARED as we wouldn't already have an FD
1409             mmap_flags |= libc::MAP_SHARED;
1410             Some(FileOffset::new(f, file_offset))
1411         } else if let Some(backing_file) = backing_file {
1412             if shared {
1413                 mmap_flags |= libc::MAP_SHARED;
1414             } else {
1415                 mmap_flags |= libc::MAP_PRIVATE;
1416             }
1417             Some(Self::open_backing_file(backing_file, file_offset)?)
1418         } else if shared || hugepages {
1419             // For hugepages we must also use MAP_SHARED, otherwise we trigger #4805:
1420             // MAP_PRIVATE would cause CoW against the backing file when combined
1421             // with VFIO pinning
1422             mmap_flags |= libc::MAP_SHARED;
1423             Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?)
1424         } else {
1425             mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
1426             None
1427         };
1428 
1429         let region = GuestRegionMmap::new(
1430             MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
1431                 .map_err(Error::GuestMemoryRegion)?,
1432             start_addr,
1433         )
1434         .map_err(Error::GuestMemory)?;
1435 
1436         // Apply NUMA policy if needed.
1437         if let Some(node) = host_numa_node {
1438             let addr = region.deref().as_ptr();
1439             let len = region.deref().size() as u64;
1440             let mode = MPOL_BIND;
1441             let mut nodemask: Vec<u64> = Vec::new();
1442             let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
1443 
1444             // Linux is kind of buggy in the way it interprets maxnode as it
1445             // will cut off the last node. That's why we have to add 1 to what
1446             // we would consider as the proper maxnode value.
1447             let maxnode = node as u64 + 1 + 1;
1448 
1449             // Allocate the right size for the vector.
1450             nodemask.resize((node as usize / 64) + 1, 0);
1451 
1452             // Fill the global bitmask through the nodemask vector.
1453             let idx = (node / 64) as usize;
1454             let shift = node % 64;
1455             nodemask[idx] |= 1u64 << shift;
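              // Worked example (illustrative): for host node 9 this gives
              // idx = 0, shift = 9, nodemask = [0x200], and maxnode = 11
              // (9 + 1 for the node count, plus the extra 1 described above).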
1456 
1457             // Policies are enforced by using MPOL_MF_MOVE flag as it will
1458             // force the kernel to move all pages that might have been already
1459             // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
1460             // used to throw an error if MPOL_MF_MOVE didn't succeed.
1461             // MPOL_BIND is the selected mode as it specifies a strict policy
1462             // that restricts memory allocation to the nodes specified in the
1463             // nodemask.
1464             Self::mbind(addr, len, mode, nodemask, maxnode, flags)
1465                 .map_err(Error::ApplyNumaPolicy)?;
1466         }
1467 
1468         // Prefault the region if needed, in parallel.
1469         if prefault {
1470             let page_size =
1471                 Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize;
1472 
1473             if !is_aligned(size, page_size) {
1474                 warn!(
1475                     "Prefaulting memory size {} misaligned with page size {}",
1476                     size, page_size
1477                 );
1478             }
1479 
1480             let num_pages = size / page_size;
1481 
1482             let num_threads = Self::get_prefault_num_threads(page_size, num_pages);
1483 
1484             let pages_per_thread = num_pages / num_threads;
1485             let remainder = num_pages % num_threads;
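              // Worked example (illustrative): prefaulting 10 pages with 4
              // threads gives pages_per_thread = 2 and remainder = 2, so
              // threads 0 and 1 fault 3 pages each while threads 2 and 3
              // fault 2 pages each.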
1486 
1487             let barrier = Arc::new(Barrier::new(num_threads));
1488             thread::scope(|s| {
1489                 let r = &region;
1490                 for i in 0..num_threads {
1491                     let barrier = Arc::clone(&barrier);
1492                     s.spawn(move || {
1493                         // Wait until all threads have been spawned to avoid contention
1494                         // over mmap_sem between thread stack allocation and page faulting.
1495                         barrier.wait();
1496                         let pages = pages_per_thread + if i < remainder { 1 } else { 0 };
1497                         let offset =
1498                             page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder));
1499                         // SAFETY: FFI call with correct arguments
1500                         let ret = unsafe {
1501                             let addr = r.as_ptr().add(offset);
1502                             libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE)
1503                         };
1504                         if ret != 0 {
1505                             let e = io::Error::last_os_error();
1506                             warn!("Failed to prefault pages: {}", e);
1507                         }
1508                     });
1509                 }
1510             });
1511         }
1512 
1513         if region.file_offset().is_none() && thp {
1514             info!(
1515                 "Anonymous mapping at 0x{:x} (size = 0x{:x})",
1516                 region.as_ptr() as u64,
1517                 size
1518             );
1519             // SAFETY: FFI call with correct arguments
1520             let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
1521             if ret != 0 {
1522                 let e = io::Error::last_os_error();
1523                 warn!("Failed to mark pages as THP eligible: {}", e);
1524             }
1525         }
1526 
1527         Ok(Arc::new(region))
1528     }
1529 
1530     // Duplicate of `memory_zone_get_align_size` that does not require a `zone`
1531     fn get_prefault_align_size(
1532         backing_file: &Option<PathBuf>,
1533         hugepages: bool,
1534         hugepage_size: Option<u64>,
1535     ) -> Result<u64, Error> {
1536         // SAFETY: FFI call. Trivially safe.
1537         let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
1538         match (hugepages, hugepage_size, backing_file) {
1539             (false, _, _) => Ok(page_size),
1540             (true, Some(hugepage_size), _) => Ok(hugepage_size),
1541             (true, None, _) => {
1542                 // There are two scenarios here:
1543                 //  - `hugepages` is enabled but `hugepage_size` is not specified:
1544                 //     Call `statfs` for `/dev/hugepages` for getting the default size of hugepage
1545                 //  - The backing file is specified:
1546                 //     Call `statfs` for the file and get its `f_bsize`. If the value is larger than the
1547                 //     normal page size, use `f_bsize` because the file is on a hugetlbfs. If the value is
1548                 //     less than or equal to the normal page size, just use the page size.
1549                 let path = backing_file
1550                     .as_ref()
1551                     .map_or(Ok("/dev/hugepages"), |pathbuf| {
1552                         pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
1553                     })?;
1554                 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
1555                 Ok(align_size)
1556             }
1557         }
1558     }
1559 
1560     fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize {
1561         let mut n: usize = 1;
1562 
1563         // Do not create more threads than processors available.
1564         // SAFETY: FFI call. Trivially safe.
1565         let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) };
1566         if procs > 0 {
1567             n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT);
1568         }
1569 
1570         // Do not create more threads than pages being allocated.
1571         n = std::cmp::min(n, num_pages);
1572 
1573         // Do not create threads to allocate less than 64 MiB of memory.
1574         n = std::cmp::min(
1575             n,
1576             std::cmp::max(1, page_size * num_pages / (64 * (1 << 20))),
1577         );
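          // Worked example (illustrative): with 4 KiB pages and 1 GiB to
          // prefault, this cap is max(1, 1 GiB / 64 MiB) = 16 threads.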
1578 
1579         n
1580     }
1581 
1582     // Update the GuestMemoryMmap with the new range
1583     fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
1584         let guest_memory = self
1585             .guest_memory
1586             .memory()
1587             .insert_region(region)
1588             .map_err(Error::GuestMemory)?;
1589         self.guest_memory.lock().unwrap().replace(guest_memory);
1590 
1591         Ok(())
1592     }
1593 
1594     //
1595     // Calculate the start address of an area next to RAM.
1596     //
1597     // If memory hotplug is allowed, the start address needs to be aligned
1598     // (rounded up) to a 128 MiB boundary.
1599     // If memory hotplug is not allowed, there is no alignment required.
1600     // In either case, if RAM ends below the 32-bit reserved area, the area must start at the 64-bit RAM start.
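      //
      // Worked example (illustrative): with hotplug allowed and
      // mem_end == 0x1C00_0001, OR-ing with 0x7FF_FFFF and adding one
      // yields 0x2000_0000, the next 128 MiB boundary.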
1601     fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
1602         let mut start_addr = if allow_mem_hotplug {
1603             GuestAddress(mem_end.0 | ((128 << 20) - 1))
1604         } else {
1605             mem_end
1606         };
1607 
1608         start_addr = start_addr
1609             .checked_add(1)
1610             .ok_or(Error::GuestAddressOverFlow)?;
1611 
1612         if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
1613             return Ok(arch::layout::RAM_64BIT_START);
1614         }
1615 
1616         Ok(start_addr)
1617     }
1618 
1619     pub fn add_ram_region(
1620         &mut self,
1621         start_addr: GuestAddress,
1622         size: usize,
1623     ) -> Result<Arc<GuestRegionMmap>, Error> {
1624         // Allocate memory for the region
1625         let region = MemoryManager::create_ram_region(
1626             &None,
1627             0,
1628             start_addr,
1629             size,
1630             self.prefault,
1631             self.shared,
1632             self.hugepages,
1633             self.hugepage_size,
1634             None,
1635             None,
1636             self.thp,
1637         )?;
1638 
1639         // Map it into the guest
1640         let slot = self.create_userspace_mapping(
1641             region.start_addr().0,
1642             region.len(),
1643             region.as_ptr() as u64,
1644             self.mergeable,
1645             false,
1646             self.log_dirty,
1647         )?;
1648         self.guest_ram_mappings.push(GuestRamMapping {
1649             gpa: region.start_addr().raw_value(),
1650             size: region.len(),
1651             slot,
1652             zone_id: DEFAULT_MEMORY_ZONE.to_string(),
1653             virtio_mem: false,
1654             file_offset: 0,
1655         });
1656 
1657         self.add_region(Arc::clone(&region))?;
1658 
1659         Ok(region)
1660     }
1661 
1662     fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
1663         info!("Hotplugging new RAM: {}", size);
1664 
1665         // Check that there is a free slot
1666         if self.next_hotplug_slot >= HOTPLUG_COUNT {
1667             return Err(Error::NoSlotAvailable);
1668         }
1669 
1670         // "Inserted" DIMM must have a size that is a multiple of 128MiB
1671         if size % (128 << 20) != 0 {
1672             return Err(Error::InvalidSize);
1673         }
1674 
1675         let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;
1676 
1677         if start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area {
1678             return Err(Error::InsufficientHotplugRam);
1679         }
1680 
1681         let region = self.add_ram_region(start_addr, size)?;
1682 
1683         // Add region to the list of regions associated with the default
1684         // memory zone.
1685         if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
1686             memory_zone.regions.push(Arc::clone(&region));
1687         }
1688 
1689         // Tell the allocator
1690         self.ram_allocator
1691             .allocate(Some(start_addr), size as GuestUsize, None)
1692             .ok_or(Error::MemoryRangeAllocation)?;
1693 
1694         // Update the slot so that it can be queried via the I/O port
1695         let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
1696         slot.active = true;
1697         slot.inserting = true;
1698         slot.base = region.start_addr().0;
1699         slot.length = region.len();
1700 
1701         self.next_hotplug_slot += 1;
1702 
1703         Ok(region)
1704     }
1705 
1706     pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1707         self.guest_memory.clone()
1708     }
1709 
1710     pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
1711         self.boot_guest_memory.clone()
1712     }
1713 
1714     pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
1715         self.allocator.clone()
1716     }
1717 
1718     pub fn start_of_device_area(&self) -> GuestAddress {
1719         self.start_of_device_area
1720     }
1721 
1722     pub fn end_of_device_area(&self) -> GuestAddress {
1723         self.end_of_device_area
1724     }
1725 
1726     pub fn allocate_memory_slot(&mut self) -> u32 {
1727         let slot_id = self.next_memory_slot;
1728         self.next_memory_slot += 1;
1729         slot_id
1730     }
1731 
1732     pub fn create_userspace_mapping(
1733         &mut self,
1734         guest_phys_addr: u64,
1735         memory_size: u64,
1736         userspace_addr: u64,
1737         mergeable: bool,
1738         readonly: bool,
1739         log_dirty: bool,
1740     ) -> Result<u32, Error> {
1741         let slot = self.allocate_memory_slot();
1742         let mem_region = self.vm.make_user_memory_region(
1743             slot,
1744             guest_phys_addr,
1745             memory_size,
1746             userspace_addr,
1747             readonly,
1748             log_dirty,
1749         );
1750 
1751         info!(
1752             "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
1753             guest_phys_addr, userspace_addr, memory_size, slot
1754         );
1755 
1756         self.vm
1757             .create_user_memory_region(mem_region)
1758             .map_err(Error::CreateUserMemoryRegion)?;
1759 
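          // Exclude this mapping from core dumps of the VMM process, so that
          // a VMM crash dump does not include the entire guest RAM.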
1760         // SAFETY: the address and size are valid since the
1761         // mmap succeeded.
1762         let ret = unsafe {
1763             libc::madvise(
1764                 userspace_addr as *mut libc::c_void,
1765                 memory_size as libc::size_t,
1766                 libc::MADV_DONTDUMP,
1767             )
1768         };
1769         if ret != 0 {
1770             let e = io::Error::last_os_error();
1771             warn!("Failed to mark mappin as MADV_DONTDUMP: {}", e);
1772         }
1773 
1774         // Mark the pages as mergeable if explicitly asked for.
1775         if mergeable {
1776             // SAFETY: the address and size are valid since the
1777             // mmap succeeded.
1778             let ret = unsafe {
1779                 libc::madvise(
1780                     userspace_addr as *mut libc::c_void,
1781                     memory_size as libc::size_t,
1782                     libc::MADV_MERGEABLE,
1783                 )
1784             };
1785             if ret != 0 {
1786                 let err = io::Error::last_os_error();
1787                 // Safe to unwrap because the error is constructed with
1788                 // last_os_error(), which ensures the output will be Some().
1789                 let errno = err.raw_os_error().unwrap();
1790                 if errno == libc::EINVAL {
1791                     warn!("kernel not configured with CONFIG_KSM");
1792                 } else {
1793                     warn!("madvise error: {}", err);
1794                 }
1795                 warn!("failed to mark pages as mergeable");
1796             }
1797         }
1798 
1799         info!(
1800             "Created userspace mapping: {:x} -> {:x} {:x}",
1801             guest_phys_addr, userspace_addr, memory_size
1802         );
1803 
1804         Ok(slot)
1805     }
1806 
1807     pub fn remove_userspace_mapping(
1808         &mut self,
1809         guest_phys_addr: u64,
1810         memory_size: u64,
1811         userspace_addr: u64,
1812         mergeable: bool,
1813         slot: u32,
1814     ) -> Result<(), Error> {
1815         let mem_region = self.vm.make_user_memory_region(
1816             slot,
1817             guest_phys_addr,
1818             memory_size,
1819             userspace_addr,
1820             false, /* readonly -- don't care */
1821             false, /* log dirty */
1822         );
1823 
1824         self.vm
1825             .remove_user_memory_region(mem_region)
1826             .map_err(Error::RemoveUserMemoryRegion)?;
1827 
1828         // Mark the pages as unmergeable if they were previously marked as
1829         // mergeable.
1830         if mergeable {
1831             // SAFETY: the address and size are valid as the region was
1832             // previously advised.
1833             let ret = unsafe {
1834                 libc::madvise(
1835                     userspace_addr as *mut libc::c_void,
1836                     memory_size as libc::size_t,
1837                     libc::MADV_UNMERGEABLE,
1838                 )
1839             };
1840             if ret != 0 {
1841                 let err = io::Error::last_os_error();
1842                 // Safe to unwrap because the error is constructed with
1843                 // last_os_error(), which ensures the output will be Some().
1844                 let errno = err.raw_os_error().unwrap();
1845                 if errno == libc::EINVAL {
1846                     warn!("kernel not configured with CONFIG_KSM");
1847                 } else {
1848                     warn!("madvise error: {}", err);
1849                 }
1850                 warn!("failed to mark pages as unmergeable");
1851             }
1852         }
1853 
1854         info!(
1855             "Removed userspace mapping: {:x} -> {:x} {:x}",
1856             guest_phys_addr, userspace_addr, memory_size
1857         );
1858 
1859         Ok(())
1860     }
1861 
1862     pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
1863         if let Some(memory_zone) = self.memory_zones.get_mut(id) {
1864             if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
1865                 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
1866                     virtio_mem_device
1867                         .lock()
1868                         .unwrap()
1869                         .resize(size)
1870                         .map_err(Error::VirtioMemResizeFail)?;
1871                 }
1872 
1873                 // Keep the hotplugged_size up to date.
1874                 virtio_mem_zone.hotplugged_size = size;
1875             } else {
1876                 error!("Failed resizing virtio-mem region: No virtio-mem handler");
1877                 return Err(Error::MissingVirtioMemHandler);
1878             }
1879 
1880             return Ok(());
1881         }
1882 
1883         error!("Failed resizing virtio-mem region: Unknown memory zone");
1884         Err(Error::UnknownMemoryZone)
1885     }
1886 
1887     /// In case this function resulted in adding a new memory region to the
1888     /// guest memory, the new region is returned to the caller. The virtio-mem
1889     /// use case never adds a new region as the whole hotpluggable memory has
1890     /// already been allocated at boot time.
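      ///
      /// For example, growing a VM from 1 GiB to 2 GiB of RAM with
      /// `HotplugMethod::Acpi` hot-adds a new 1 GiB region, whereas with
      /// `HotplugMethod::VirtioMem` the existing virtio-mem device is simply
      /// asked to plug more blocks inside its pre-allocated region.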
1891     pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
1892         if self.user_provided_zones {
1893             error!(
1894                 "Not allowed to resize guest memory when backed with user \
1895                 defined memory zones."
1896             );
1897             return Err(Error::InvalidResizeWithMemoryZones);
1898         }
1899 
1900         let mut region: Option<Arc<GuestRegionMmap>> = None;
1901         match self.hotplug_method {
1902             HotplugMethod::VirtioMem => {
1903                 if desired_ram >= self.boot_ram {
1904                     if !self.dynamic {
1905                         return Ok(region);
1906                     }
1907 
1908                     self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
1909                     self.current_ram = desired_ram;
1910                 }
1911             }
1912             HotplugMethod::Acpi => {
1913                 if desired_ram > self.current_ram {
1914                     if !self.dynamic {
1915                         return Ok(region);
1916                     }
1917 
1918                     region =
1919                         Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
1920                     self.current_ram = desired_ram;
1921                 }
1922             }
1923         }
1924         Ok(region)
1925     }
1926 
1927     pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
1928         if !self.user_provided_zones {
1929             error!(
1930                 "Not allowed to resize guest memory zone when no zone is \
1931                 defined."
1932             );
1933             return Err(Error::ResizeZone);
1934         }
1935 
1936         self.virtio_mem_resize(id, virtio_mem_size)
1937     }
1938 
1939     #[cfg(target_arch = "x86_64")]
1940     pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
1941         let file = OpenOptions::new()
1942             .read(true)
1943             .open("/dev/sgx_provision")
1944             .map_err(Error::SgxProvisionOpen)?;
1945         self.vm
1946             .enable_sgx_attribute(file)
1947             .map_err(Error::SgxEnableProvisioning)?;
1948 
1949         // Go over each EPC section and verify its size is a 4k multiple. At
1950         // the same time, calculate the total size needed for the contiguous
1951         // EPC region.
1952         let mut epc_region_size = 0;
1953         for epc_section in sgx_epc_config.iter() {
1954             if epc_section.size == 0 {
1955                 return Err(Error::EpcSectionSizeInvalid);
1956             }
1957             if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
1958                 return Err(Error::EpcSectionSizeInvalid);
1959             }
1960 
1961             epc_region_size += epc_section.size;
1962         }
1963 
1964         // Place the SGX EPC region on a 4k boundary between the RAM and the device area
1965         let epc_region_start = GuestAddress(
1966             ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
1967         );
1968 
1969         self.start_of_device_area = epc_region_start
1970             .checked_add(epc_region_size)
1971             .ok_or(Error::GuestAddressOverFlow)?;
1972 
1973         let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
1974         info!(
1975             "SGX EPC region: 0x{:x} (0x{:x})",
1976             epc_region_start.0, epc_region_size
1977         );
1978 
1979         // Each section can be memory mapped into the allocated region.
1980         let mut epc_section_start = epc_region_start.raw_value();
1981         for epc_section in sgx_epc_config.iter() {
1982             let file = OpenOptions::new()
1983                 .read(true)
1984                 .write(true)
1985                 .open("/dev/sgx_vepc")
1986                 .map_err(Error::SgxVirtEpcOpen)?;
1987 
1988             let prot = PROT_READ | PROT_WRITE;
1989             let mut flags = MAP_NORESERVE | MAP_SHARED;
1990             if epc_section.prefault {
1991                 flags |= MAP_POPULATE;
1992             }
1993 
1994             // We can't use the vm-memory crate to perform the memory mapping
1995             // here as it would try to ensure the size of the backing file is
1996             // matching the size of the expected mapping. The /dev/sgx_vepc
1997             // device does not work that way, it provides a file descriptor
1998             // which is not matching the mapping size, as it's just a way to
1999             // let KVM know that an EPC section is being created for the guest.
2000             // SAFETY: FFI call with correct arguments
2001             let host_addr = unsafe {
2002                 libc::mmap(
2003                     std::ptr::null_mut(),
2004                     epc_section.size as usize,
2005                     prot,
2006                     flags,
2007                     file.as_raw_fd(),
2008                     0,
2009                 )
2010             } as u64;
2011 
2012             info!(
2013                 "Adding SGX EPC section: 0x{:x} (0x{:x})",
2014                 epc_section_start, epc_section.size
2015             );
2016 
2017             let _mem_slot = self.create_userspace_mapping(
2018                 epc_section_start,
2019                 epc_section.size,
2020                 host_addr,
2021                 false,
2022                 false,
2023                 false,
2024             )?;
2025 
2026             sgx_epc_region.insert(
2027                 epc_section.id.clone(),
2028                 SgxEpcSection::new(
2029                     GuestAddress(epc_section_start),
2030                     epc_section.size as GuestUsize,
2031                 ),
2032             );
2033 
2034             epc_section_start += epc_section.size;
2035         }
2036 
2037         self.sgx_epc_region = Some(sgx_epc_region);
2038 
2039         Ok(())
2040     }
2041 
2042     #[cfg(target_arch = "x86_64")]
2043     pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
2044         &self.sgx_epc_region
2045     }
2046 
2047     pub fn is_hardlink(f: &File) -> bool {
2048         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
2049         // SAFETY: FFI call with correct arguments
2050         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
2051         if ret != 0 {
2052             error!("Couldn't fstat the backing file");
2053             return false;
2054         }
2055 
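          // A file created with memfd_create() is never linked into a
          // filesystem, so its st_nlink is 0, while a file present on disk
          // reports st_nlink >= 1. This is what distinguishes a
          // user-accessible backing file from an anonymous one here.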
2056         // SAFETY: stat is valid
2057         unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
2058     }
2059 
2060     pub fn memory_zones(&self) -> &MemoryZones {
2061         &self.memory_zones
2062     }
2063 
2064     pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
2065         &mut self.memory_zones
2066     }
2067 
2068     pub fn memory_range_table(
2069         &self,
2070         snapshot: bool,
2071     ) -> std::result::Result<MemoryRangeTable, MigratableError> {
2072         let mut table = MemoryRangeTable::default();
2073 
2074         for memory_zone in self.memory_zones.values() {
2075             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
2076                 table.extend(virtio_mem_zone.plugged_ranges());
2077             }
2078 
2079             for region in memory_zone.regions() {
2080                 if snapshot {
2081                     if let Some(file_offset) = region.file_offset() {
2082                         if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
2083                             && Self::is_hardlink(file_offset.file())
2084                         {
2085                             // In this very specific case, we know the memory
2086                             // region is backed by a file on the host filesystem
2087                             // that can be accessed by the user, and additionally
2088                             // the mapping is shared, which means that modifications
2089                             // to the content are written to the actual file.
2090                             // When meeting these conditions, we can skip the
2091                             // copy of the memory content for this specific region,
2092                             // as we can assume the user will have it saved through
2093                             // the backing file already.
2094                             continue;
2095                         }
2096                     }
2097                 }
2098 
2099                 table.push(MemoryRange {
2100                     gpa: region.start_addr().raw_value(),
2101                     length: region.len(),
2102                 });
2103             }
2104         }
2105 
2106         Ok(table)
2107     }
2108 
2109     pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
2110         MemoryManagerSnapshotData {
2111             memory_ranges: self.snapshot_memory_ranges.clone(),
2112             guest_ram_mappings: self.guest_ram_mappings.clone(),
2113             start_of_device_area: self.start_of_device_area.0,
2114             boot_ram: self.boot_ram,
2115             current_ram: self.current_ram,
2116             arch_mem_regions: self.arch_mem_regions.clone(),
2117             hotplug_slots: self.hotplug_slots.clone(),
2118             next_memory_slot: self.next_memory_slot,
2119             selected_slot: self.selected_slot,
2120             next_hotplug_slot: self.next_hotplug_slot,
2121         }
2122     }
2123 
2124     pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
2125         let mut memory_slot_fds = HashMap::new();
2126         for guest_ram_mapping in &self.guest_ram_mappings {
2127             let slot = guest_ram_mapping.slot;
2128             let guest_memory = self.guest_memory.memory();
2129             let file = guest_memory
2130                 .find_region(GuestAddress(guest_ram_mapping.gpa))
2131                 .unwrap()
2132                 .file_offset()
2133                 .unwrap()
2134                 .file();
2135             memory_slot_fds.insert(slot, file.as_raw_fd());
2136         }
2137         memory_slot_fds
2138     }
2139 
2140     pub fn acpi_address(&self) -> Option<GuestAddress> {
2141         self.acpi_address
2142     }
2143 
2144     pub fn num_guest_ram_mappings(&self) -> u32 {
2145         self.guest_ram_mappings.len() as u32
2146     }
2147 
2148     #[cfg(target_arch = "aarch64")]
2149     pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
2150         self.uefi_flash.as_ref().unwrap().clone()
2151     }
2152 
2153     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2154     pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
2155         let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
2156         mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);
2157 
2158         let mut mem_offset_in_elf = mem_offset;
2159         let mut ram_maps = BTreeMap::new();
2160         for mapping in mapping_sorted_by_gpa.iter() {
2161             ram_maps.insert(
2162                 mapping.gpa,
2163                 CoredumpMemoryRegion {
2164                     mem_offset_in_elf,
2165                     mem_size: mapping.size,
2166                 },
2167             );
2168             mem_offset_in_elf += mapping.size;
2169         }
2170 
2171         CoredumpMemoryRegions { ram_maps }
2172     }
2173 
2174     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2175     pub fn coredump_iterate_save_mem(
2176         &mut self,
2177         dump_state: &DumpState,
2178     ) -> std::result::Result<(), GuestDebuggableError> {
2179         let snapshot_memory_ranges = self
2180             .memory_range_table(false)
2181             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2182 
2183         if snapshot_memory_ranges.is_empty() {
2184             return Ok(());
2185         }
2186 
2187         let coredump_file = dump_state.file.as_ref().unwrap();
2188 
2189         let guest_memory = self.guest_memory.memory();
2190         let mut total_bytes: u64 = 0;
2191 
2192         for range in snapshot_memory_ranges.regions() {
2193             let mut offset: u64 = 0;
2194             loop {
2195                 let bytes_written = guest_memory
2196                     .write_volatile_to(
2197                         GuestAddress(range.gpa + offset),
2198                         &mut coredump_file.as_fd(),
2199                         (range.length - offset) as usize,
2200                     )
2201                     .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2202                 offset += bytes_written as u64;
2203                 total_bytes += bytes_written as u64;
2204 
2205                 if offset == range.length {
2206                     break;
2207                 }
2208             }
2209         }
2210 
2211         debug!("coredump total bytes {}", total_bytes);
2212         Ok(())
2213     }
2214 
2215     pub fn receive_memory_regions<F>(
2216         &mut self,
2217         ranges: &MemoryRangeTable,
2218         fd: &mut F,
2219     ) -> std::result::Result<(), MigratableError>
2220     where
2221         F: ReadVolatile,
2222     {
2223         let guest_memory = self.guest_memory();
2224         let mem = guest_memory.memory();
2225 
2226         for range in ranges.regions() {
2227             let mut offset: u64 = 0;
2228             // Here we are manually handling the retry in case we can't read the
2229             // whole region at once because we can't use the implementation
2230             // from vm-memory::GuestMemory of read_exact_from() as it is not
2231             // following the correct behavior. For more info about this issue
2232             // see: https://github.com/rust-vmm/vm-memory/issues/174
2233             loop {
2234                 let bytes_read = mem
2235                     .read_volatile_from(
2236                         GuestAddress(range.gpa + offset),
2237                         fd,
2238                         (range.length - offset) as usize,
2239                     )
2240                     .map_err(|e| {
2241                         MigratableError::MigrateReceive(anyhow!(
2242                             "Error receiving memory from socket: {}",
2243                             e
2244                         ))
2245                     })?;
2246                 offset += bytes_read as u64;
2247 
2248                 if offset == range.length {
2249                     break;
2250                 }
2251             }
2252         }
2253 
2254         Ok(())
2255     }
2256 }
2257 
2258 struct MemoryNotify {
2259     slot_id: usize,
2260 }
2261 
2262 impl Aml for MemoryNotify {
2263     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2264         let object = aml::Path::new(&format!("M{:03}", self.slot_id));
2265         aml::If::new(
2266             &aml::Equal::new(&aml::Arg(0), &self.slot_id),
2267             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2268         )
2269         .to_aml_bytes(sink)
2270     }
2271 }
2272 
2273 struct MemorySlot {
2274     slot_id: usize,
2275 }
2276 
2277 impl Aml for MemorySlot {
2278     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2279         aml::Device::new(
2280             format!("M{:03}", self.slot_id).as_str().into(),
2281             vec![
2282                 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")),
2283                 &aml::Name::new("_UID".into(), &self.slot_id),
2284                 /*
2285                 _STA return value:
2286                 Bit [0] – Set if the device is present.
2287                 Bit [1] – Set if the device is enabled and decoding its resources.
2288                 Bit [2] – Set if the device should be shown in the UI.
2289                 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
2290                 Bit [4] – Set if the battery is present.
2291                 Bits [31:5] – Reserved (must be cleared).
2292                 */
2293                 &aml::Method::new(
2294                     "_STA".into(),
2295                     0,
2296                     false,
2297                     // Call into MSTA method which will interrogate device
2298                     vec![&aml::Return::new(&aml::MethodCall::new(
2299                         "MSTA".into(),
2300                         vec![&self.slot_id],
2301                     ))],
2302                 ),
2303                 // Get details of memory
2304                 &aml::Method::new(
2305                     "_CRS".into(),
2306                     0,
2307                     false,
2308                     // Call into MCRS which provides actual memory details
2309                     vec![&aml::Return::new(&aml::MethodCall::new(
2310                         "MCRS".into(),
2311                         vec![&self.slot_id],
2312                     ))],
2313                 ),
2314             ],
2315         )
2316         .to_aml_bytes(sink)
2317     }
2318 }
2319 
2320 struct MemorySlots {
2321     slots: usize,
2322 }
2323 
2324 impl Aml for MemorySlots {
2325     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2326         for slot_id in 0..self.slots {
2327             MemorySlot { slot_id }.to_aml_bytes(sink);
2328         }
2329     }
2330 }
2331 
2332 struct MemoryMethods {
2333     slots: usize,
2334 }
2335 
2336 impl Aml for MemoryMethods {
2337     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2338         // Add "MTFY" notification method
2339         let mut memory_notifies = Vec::new();
2340         for slot_id in 0..self.slots {
2341             memory_notifies.push(MemoryNotify { slot_id });
2342         }
2343 
2344         let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new();
2345         for memory_notifier in memory_notifies.iter() {
2346             memory_notifies_refs.push(memory_notifier);
2347         }
2348 
2349         aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink);
2350 
2351         // MSCN method
2352         aml::Method::new(
2353             "MSCN".into(),
2354             0,
2355             true,
2356             vec![
2357                 // Take lock defined above
2358                 &aml::Acquire::new("MLCK".into(), 0xffff),
2359                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2360                 &aml::While::new(
2361                     &aml::LessThan::new(&aml::Local(0), &self.slots),
2362                     vec![
2363                         // Write the slot number under examination to the I/O port via field
2364                         &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
2365                         // Check if MINS bit is set (inserting)
2366                         &aml::If::new(
2367                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2368                             // Notify device if it is
2369                             vec![
2370                                 &aml::MethodCall::new(
2371                                     "MTFY".into(),
2372                                     vec![&aml::Local(0), &aml::ONE],
2373                                 ),
2374                                 // Reset MINS bit
2375                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2376                             ],
2377                         ),
2378                         // Check if MRMV bit is set
2379                         &aml::If::new(
2380                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2381                             // Notify device if it is (with the eject constant 0x3)
2382                             vec![
2383                                 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
2384                                 // Reset MRMV bit
2385                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2386                             ],
2387                         ),
2388                         &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2389                     ],
2390                 ),
2391                 // Release lock
2392                 &aml::Release::new("MLCK".into()),
2393             ],
2394         )
2395         .to_aml_bytes(sink);
2396 
2397         // Memory status method
2398         aml::Method::new(
2399             "MSTA".into(),
2400             1,
2401             true,
2402             vec![
2403                 // Take lock defined above
2404                 &aml::Acquire::new("MLCK".into(), 0xffff),
2405                 // Write slot number (in first argument) to I/O port via field
2406                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2407                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2408                 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
2409                 &aml::If::new(
2410                     &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
2411                     vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2412                 ),
2413                 // Release lock
2414                 &aml::Release::new("MLCK".into()),
2415                 // Return 0 or 0xf
2416                 &aml::Return::new(&aml::Local(0)),
2417             ],
2418         )
2419         .to_aml_bytes(sink);
2420 
2421         // Memory range method
2422         aml::Method::new(
2423             "MCRS".into(),
2424             1,
2425             true,
2426             vec![
2427                 // Take lock defined above
2428                 &aml::Acquire::new("MLCK".into(), 0xffff),
2429                 // Write slot number (in first argument) to I/O port via field
2430                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2431                 &aml::Name::new(
2432                     "MR64".into(),
2433                     &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2434                         aml::AddressSpaceCacheable::Cacheable,
2435                         true,
2436                         0x0000_0000_0000_0000u64,
2437                         0xFFFF_FFFF_FFFF_FFFEu64,
2438                         None,
2439                     )]),
2440                 ),
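                  // The offsets below index into the raw QWORD address-space
                  // descriptor inside MR64: range minimum at byte 14, range
                  // maximum at byte 22 and range length at byte 38, each
                  // accessed here as a low/high 32-bit pair.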
2441                 &aml::CreateQWordField::new(
2442                     &aml::Path::new("MINL"),
2443                     &aml::Path::new("MR64"),
2444                     &14usize,
2445                 ),
2446                 &aml::CreateDWordField::new(
2447                     &aml::Path::new("MINH"),
2448                     &aml::Path::new("MR64"),
2449                     &18usize,
2450                 ),
2451                 &aml::CreateQWordField::new(
2452                     &aml::Path::new("MAXL"),
2453                     &aml::Path::new("MR64"),
2454                     &22usize,
2455                 ),
2456                 &aml::CreateDWordField::new(
2457                     &aml::Path::new("MAXH"),
2458                     &aml::Path::new("MR64"),
2459                     &26usize,
2460                 ),
2461                 &aml::CreateQWordField::new(
2462                     &aml::Path::new("LENL"),
2463                     &aml::Path::new("MR64"),
2464                     &38usize,
2465                 ),
2466                 &aml::CreateDWordField::new(
2467                     &aml::Path::new("LENH"),
2468                     &aml::Path::new("MR64"),
2469                     &42usize,
2470                 ),
2471                 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
2472                 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
2473                 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
2474                 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
2475                 &aml::Add::new(
2476                     &aml::Path::new("MAXL"),
2477                     &aml::Path::new("MINL"),
2478                     &aml::Path::new("LENL"),
2479                 ),
2480                 &aml::Add::new(
2481                     &aml::Path::new("MAXH"),
2482                     &aml::Path::new("MINH"),
2483                     &aml::Path::new("LENH"),
2484                 ),
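                  // MAX = MIN + LEN - 1, computed in 32-bit halves: if the
                  // low-dword addition wrapped (MAXL < MINL), carry one into
                  // the high dword, then subtract one from the low dword to
                  // get the inclusive range end.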
2485                 &aml::If::new(
2486                     &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
2487                     vec![&aml::Add::new(
2488                         &aml::Path::new("MAXH"),
2489                         &aml::ONE,
2490                         &aml::Path::new("MAXH"),
2491                     )],
2492                 ),
2493                 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
2494                 // Release lock
2495                 &aml::Release::new("MLCK".into()),
2496                 &aml::Return::new(&aml::Path::new("MR64")),
2497             ],
2498         )
2499         .to_aml_bytes(sink)
2500     }
2501 }
2502 
2503 impl Aml for MemoryManager {
2504     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2505         if let Some(acpi_address) = self.acpi_address {
2506             // Memory Hotplug Controller
2507             aml::Device::new(
2508                 "_SB_.MHPC".into(),
2509                 vec![
2510                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2511                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2512                     // Mutex to protect concurrent access as we write to choose slot and then read back status
2513                     &aml::Mutex::new("MLCK".into(), 0),
2514                     &aml::Name::new(
2515                         "_CRS".into(),
2516                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2517                             aml::AddressSpaceCacheable::NotCacheable,
2518                             true,
2519                             acpi_address.0,
2520                             acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
2521                             None,
2522                         )]),
2523                     ),
2524                     // OpRegion and Fields map MMIO range into individual field values
2525                     &aml::OpRegion::new(
2526                         "MHPR".into(),
2527                         aml::OpRegionSpace::SystemMemory,
2528                         &(acpi_address.0 as usize),
2529                         &MEMORY_MANAGER_ACPI_SIZE,
2530                     ),
2531                     &aml::Field::new(
2532                         "MHPR".into(),
2533                         aml::FieldAccessType::DWord,
2534                         aml::FieldLockRule::NoLock,
2535                         aml::FieldUpdateRule::Preserve,
2536                         vec![
2537                             aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
2538                             aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
2539                             aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
2540                             aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
2541                         ],
2542                     ),
2543                     &aml::Field::new(
2544                         "MHPR".into(),
2545                         aml::FieldAccessType::DWord,
2546                         aml::FieldLockRule::NoLock,
2547                         aml::FieldUpdateRule::Preserve,
2548                         vec![
2549                             aml::FieldEntry::Reserved(128),
2550                             aml::FieldEntry::Named(*b"MHPX", 32), // PXM
2551                         ],
2552                     ),
2553                     &aml::Field::new(
2554                         "MHPR".into(),
2555                         aml::FieldAccessType::Byte,
2556                         aml::FieldLockRule::NoLock,
2557                         aml::FieldUpdateRule::WriteAsZeroes,
2558                         vec![
2559                             aml::FieldEntry::Reserved(160),
2560                             aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
2561                             aml::FieldEntry::Named(*b"MINS", 1), // Inserting
2562                             aml::FieldEntry::Named(*b"MRMV", 1), // Removing
2563                             aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
2564                         ],
2565                     ),
2566                     &aml::Field::new(
2567                         "MHPR".into(),
2568                         aml::FieldAccessType::DWord,
2569                         aml::FieldLockRule::NoLock,
2570                         aml::FieldUpdateRule::Preserve,
2571                         vec![
2572                             aml::FieldEntry::Named(*b"MSEL", 32), // Selector
2573                             aml::FieldEntry::Named(*b"MOEV", 32), // Event
2574                             aml::FieldEntry::Named(*b"MOSC", 32), // OSC
2575                         ],
2576                     ),
2577                     &MemoryMethods {
2578                         slots: self.hotplug_slots.len(),
2579                     },
2580                     &MemorySlots {
2581                         slots: self.hotplug_slots.len(),
2582                     },
2583                 ],
2584             )
2585             .to_aml_bytes(sink);
2586         } else {
2587             aml::Device::new(
2588                 "_SB_.MHPC".into(),
2589                 vec![
2590                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2591                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2592                     // Empty MSCN for GED
2593                     &aml::Method::new("MSCN".into(), 0, true, vec![]),
2594                 ],
2595             )
2596             .to_aml_bytes(sink);
2597         }
2598 
2599         #[cfg(target_arch = "x86_64")]
2600         {
2601             if let Some(sgx_epc_region) = &self.sgx_epc_region {
2602                 let min = sgx_epc_region.start().raw_value();
2603                 let max = min + sgx_epc_region.size() - 1;
2604                 // SGX EPC region
2605                 aml::Device::new(
2606                     "_SB_.EPC_".into(),
2607                     vec![
2608                         &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")),
2609                         // QWORD describing the EPC region start and size
2610                         &aml::Name::new(
2611                             "_CRS".into(),
2612                             &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2613                                 aml::AddressSpaceCacheable::NotCacheable,
2614                                 true,
2615                                 min,
2616                                 max,
2617                                 None,
2618                             )]),
2619                         ),
2620                         &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
2621                     ],
2622                 )
2623                 .to_aml_bytes(sink);
2624             }
2625         }
2626     }
2627 }
2628 
2629 impl Pausable for MemoryManager {}
2630 
2631 #[derive(Clone, Serialize, Deserialize, Versionize)]
2632 pub struct MemoryManagerSnapshotData {
2633     memory_ranges: MemoryRangeTable,
2634     guest_ram_mappings: Vec<GuestRamMapping>,
2635     start_of_device_area: u64,
2636     boot_ram: u64,
2637     current_ram: u64,
2638     arch_mem_regions: Vec<ArchMemRegion>,
2639     hotplug_slots: Vec<HotPlugState>,
2640     next_memory_slot: u32,
2641     selected_slot: usize,
2642     next_hotplug_slot: usize,
2643 }
2644 
2645 impl VersionMapped for MemoryManagerSnapshotData {}
2646 
2647 impl Snapshottable for MemoryManager {
2648     fn id(&self) -> String {
2649         MEMORY_MANAGER_SNAPSHOT_ID.to_string()
2650     }
2651 
2652     fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
2653         let memory_ranges = self.memory_range_table(true)?;
2654 
2655         // Store locally this list of ranges as it will be used through the
2656         // Transportable::send() implementation. The point is to avoid the
2657         // duplication of code regarding the creation of the path for each
2658         // region. The 'snapshot' step creates the list of memory regions,
2659         // including information about the need to copy a memory region or
2660         // not. This saves the 'send' step having to go through the same
2661         // process, and instead it can directly proceed with storing the
2662         // memory range content for the ranges requiring it.
2663         self.snapshot_memory_ranges = memory_ranges;
2664 
2665         Ok(Snapshot::from_data(SnapshotData::new_from_versioned_state(
2666             &self.snapshot_data(),
2667         )?))
2668     }
2669 }
2670 
2671 impl Transportable for MemoryManager {
2672     fn send(
2673         &self,
2674         _snapshot: &Snapshot,
2675         destination_url: &str,
2676     ) -> result::Result<(), MigratableError> {
2677         if self.snapshot_memory_ranges.is_empty() {
2678             return Ok(());
2679         }
2680 
2681         let mut memory_file_path = url_to_path(destination_url)?;
2682         memory_file_path.push(String::from(SNAPSHOT_FILENAME));
2683 
2684         // Create the snapshot file for the entire memory
2685         let mut memory_file = OpenOptions::new()
2686             .read(true)
2687             .write(true)
2688             .create_new(true)
2689             .open(memory_file_path)
2690             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2691 
2692         let guest_memory = self.guest_memory.memory();
2693 
2694         for range in self.snapshot_memory_ranges.regions() {
2695             let mut offset: u64 = 0;
2696             // Here we are manually handling the retry in case we can't read
2697             // the whole region at once because we can't use the implementation
2698             // from vm-memory::GuestMemory of write_all_to() as it is not
2699             // following the correct behavior. For more info about this issue
2700             // see: https://github.com/rust-vmm/vm-memory/issues/174
2701             loop {
2702                 let bytes_written = guest_memory
2703                     .write_volatile_to(
2704                         GuestAddress(range.gpa + offset),
2705                         &mut memory_file,
2706                         (range.length - offset) as usize,
2707                     )
2708                     .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2709                 offset += bytes_written as u64;
2710 
2711                 if offset == range.length {
2712                     break;
2713                 }
2714             }
2715         }
2716         Ok(())
2717     }
2718 }
2719 
2720 impl Migratable for MemoryManager {
2721     // Start the dirty log in the hypervisor (kvm/mshv).
2722     // Also, reset the dirty bitmap logged by the vmm.
2723     // Just before we do a bulk copy we want to start/clear the dirty log so that
2724     // pages touched during our bulk copy are tracked.
2725     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2726         self.vm.start_dirty_log().map_err(|e| {
2727             MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
2728         })?;
2729 
2730         for r in self.guest_memory.memory().iter() {
2731             r.bitmap().reset();
2732         }
2733 
2734         Ok(())
2735     }
2736 
2737     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2738         self.vm.stop_dirty_log().map_err(|e| {
2739             MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
2740         })?;
2741 
2742         Ok(())
2743     }
2744 
2745     // Generate a table for the pages that are dirty. The dirty pages are collapsed
2746     // together in the table if they are contiguous.
2747     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2748         let mut table = MemoryRangeTable::default();
2749         for r in &self.guest_ram_mappings {
2750             let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
2751                 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
2752             })?;
2753             let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
2754             {
2755                 Some(region) => {
2756                     assert!(region.start_addr().raw_value() == r.gpa);
2757                     assert!(region.len() == r.size);
2758                     region.bitmap().get_and_reset()
2759                 }
2760                 None => {
2761                     return Err(MigratableError::MigrateSend(anyhow!(
2762                         "Error finding 'guest memory region' with address {:x}",
2763                         r.gpa
2764                     )))
2765                 }
2766             };
2767 
2768             let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
2769                 .iter()
2770                 .zip(vmm_dirty_bitmap.iter())
2771                 .map(|(x, y)| x | y)
2772                 .collect();
2773 
2774             let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);
2775 
2776             if sub_table.regions().is_empty() {
2777                 info!("Dirty Memory Range Table is empty");
2778             } else {
2779                 info!("Dirty Memory Range Table:");
2780                 for range in sub_table.regions() {
2781                     info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
2782                 }
2783             }
2784 
2785             table.extend(sub_table);
2786         }
2787         Ok(table)
2788     }
2789 }
2790