xref: /cloud-hypervisor/vmm/src/memory_manager.rs (revision 88a9f799449c04180c6b9a21d3b9c0c4b57e2bd6)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0
4 //
5 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
6 use std::collections::BTreeMap;
7 use std::collections::HashMap;
8 use std::fs::{File, OpenOptions};
9 use std::io::{self};
10 use std::ops::{BitAnd, Deref, Not, Sub};
11 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
12 use std::os::fd::AsFd;
13 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
14 use std::path::PathBuf;
15 use std::result;
16 use std::sync::{Arc, Barrier, Mutex};
17 use std::{ffi, thread};
18 
19 use acpi_tables::{aml, Aml};
20 use anyhow::anyhow;
21 #[cfg(target_arch = "x86_64")]
22 use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
23 use arch::RegionType;
24 #[cfg(target_arch = "x86_64")]
25 use devices::ioapic;
26 #[cfg(target_arch = "aarch64")]
27 use hypervisor::HypervisorVmError;
28 use libc::_SC_NPROCESSORS_ONLN;
29 #[cfg(target_arch = "x86_64")]
30 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
31 use serde::{Deserialize, Serialize};
32 use tracer::trace_scoped;
33 use virtio_devices::BlocksState;
34 #[cfg(target_arch = "x86_64")]
35 use vm_allocator::GsiApic;
36 use vm_allocator::{AddressAllocator, SystemAllocator};
37 use vm_device::BusDevice;
38 use vm_memory::bitmap::AtomicBitmap;
39 use vm_memory::guest_memory::FileOffset;
40 use vm_memory::{
41     mmap::MmapRegionError, Address, Error as MmapError, GuestAddress, GuestAddressSpace,
42     GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion,
43     ReadVolatile,
44 };
45 use vm_migration::{
46     protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable,
47     Snapshot, SnapshotData, Snapshottable, Transportable,
48 };
49 
50 #[cfg(target_arch = "x86_64")]
51 use crate::config::SgxEpcConfig;
52 use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
53 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
54 use crate::coredump::{
55     CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError,
56 };
57 use crate::migration::url_to_path;
58 use crate::MEMORY_MANAGER_SNAPSHOT_ID;
59 use crate::{GuestMemoryMmap, GuestRegionMmap};
60 
61 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;
62 
63 const DEFAULT_MEMORY_ZONE: &str = "mem0";
64 
65 const SNAPSHOT_FILENAME: &str = "memory-ranges";
66 
67 #[cfg(target_arch = "x86_64")]
68 const X86_64_IRQ_BASE: u32 = 5;
69 
70 #[cfg(target_arch = "x86_64")]
71 const SGX_PAGE_SIZE: u64 = 1 << 12;
72 
73 const HOTPLUG_COUNT: usize = 8;
74 
75 // Memory policy constants
76 const MPOL_BIND: u32 = 2;
77 const MPOL_MF_STRICT: u32 = 1;
78 const MPOL_MF_MOVE: u32 = 1 << 1;
79 
80 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
81 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;
82 
83 const MAX_PREFAULT_THREAD_COUNT: usize = 16;
84 
85 #[derive(Clone, Default, Serialize, Deserialize)]
86 struct HotPlugState {
87     base: u64,
88     length: u64,
89     active: bool,
90     inserting: bool,
91     removing: bool,
92 }
93 
94 pub struct VirtioMemZone {
95     region: Arc<GuestRegionMmap>,
96     virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
97     hotplugged_size: u64,
98     hugepages: bool,
99     blocks_state: Arc<Mutex<BlocksState>>,
100 }
101 
102 impl VirtioMemZone {
103     pub fn region(&self) -> &Arc<GuestRegionMmap> {
104         &self.region
105     }
106     pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
107         self.virtio_device = Some(virtio_device);
108     }
109     pub fn hotplugged_size(&self) -> u64 {
110         self.hotplugged_size
111     }
112     pub fn hugepages(&self) -> bool {
113         self.hugepages
114     }
115     pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
116         &self.blocks_state
117     }
118     pub fn plugged_ranges(&self) -> MemoryRangeTable {
119         self.blocks_state
120             .lock()
121             .unwrap()
122             .memory_ranges(self.region.start_addr().raw_value(), true)
123     }
124 }
125 
126 #[derive(Default)]
127 pub struct MemoryZone {
128     regions: Vec<Arc<GuestRegionMmap>>,
129     virtio_mem_zone: Option<VirtioMemZone>,
130 }
131 
132 impl MemoryZone {
133     pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
134         &self.regions
135     }
136     pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
137         &self.virtio_mem_zone
138     }
139     pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
140         self.virtio_mem_zone.as_mut()
141     }
142 }
143 
144 pub type MemoryZones = HashMap<String, MemoryZone>;
145 
146 #[derive(Clone, Serialize, Deserialize)]
147 struct GuestRamMapping {
148     slot: u32,
149     gpa: u64,
150     size: u64,
151     zone_id: String,
152     virtio_mem: bool,
153     file_offset: u64,
154 }
155 
156 #[derive(Clone, Serialize, Deserialize)]
157 struct ArchMemRegion {
158     base: u64,
159     size: usize,
160     r_type: RegionType,
161 }
162 
163 pub struct MemoryManager {
164     boot_guest_memory: GuestMemoryMmap,
165     guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
166     next_memory_slot: u32,
167     start_of_device_area: GuestAddress,
168     end_of_device_area: GuestAddress,
169     end_of_ram_area: GuestAddress,
170     pub vm: Arc<dyn hypervisor::Vm>,
171     hotplug_slots: Vec<HotPlugState>,
172     selected_slot: usize,
173     mergeable: bool,
174     allocator: Arc<Mutex<SystemAllocator>>,
175     hotplug_method: HotplugMethod,
176     boot_ram: u64,
177     current_ram: u64,
178     next_hotplug_slot: usize,
179     shared: bool,
180     hugepages: bool,
181     hugepage_size: Option<u64>,
182     prefault: bool,
183     thp: bool,
184     #[cfg(target_arch = "x86_64")]
185     sgx_epc_region: Option<SgxEpcRegion>,
186     user_provided_zones: bool,
187     snapshot_memory_ranges: MemoryRangeTable,
188     memory_zones: MemoryZones,
189     log_dirty: bool, // Enable dirty logging for created RAM regions
190     arch_mem_regions: Vec<ArchMemRegion>,
191     ram_allocator: AddressAllocator,
192     dynamic: bool,
193 
194     // Keep track of calls to create_userspace_mapping() for guest RAM.
195     // This is useful for getting the dirty pages as we need to know the
196     // slots that the mappings are created in.
197     guest_ram_mappings: Vec<GuestRamMapping>,
198 
199     pub acpi_address: Option<GuestAddress>,
200     #[cfg(target_arch = "aarch64")]
201     uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
202 }
203 
204 #[derive(Debug)]
205 pub enum Error {
206     /// Failed to create shared file.
207     SharedFileCreate(io::Error),
208 
209     /// Failed to set shared file length.
210     SharedFileSetLen(io::Error),
211 
212     /// Mmap backed guest memory error
213     GuestMemory(MmapError),
214 
215     /// Failed to allocate a memory range.
216     MemoryRangeAllocation,
217 
218     /// Error from region creation
219     GuestMemoryRegion(MmapRegionError),
220 
221     /// No ACPI slot available
222     NoSlotAvailable,
223 
224     /// Not enough space in the hotplug RAM region
225     InsufficientHotplugRam,
226 
227     /// The requested hotplug memory addition is not a valid size
228     InvalidSize,
229 
230     /// Failed to create the user memory region.
231     CreateUserMemoryRegion(hypervisor::HypervisorVmError),
232 
233     /// Failed to remove the user memory region.
234     RemoveUserMemoryRegion(hypervisor::HypervisorVmError),
235 
236     /// Failed to create EventFd.
237     EventFdFail(io::Error),
238 
239     /// Eventfd write error
240     EventfdError(io::Error),
241 
242     /// Failed to resize virtio-mem
243     VirtioMemResizeFail(virtio_devices::mem::Error),
244 
245     /// Cannot restore VM
246     Restore(MigratableError),
247 
248     /// Cannot restore VM because source URL is missing
249     RestoreMissingSourceUrl,
250 
251     /// Cannot create the system allocator
252     CreateSystemAllocator,
253 
254     /// Invalid SGX EPC section size
255     #[cfg(target_arch = "x86_64")]
256     EpcSectionSizeInvalid,
257 
258     /// Failed allocating SGX EPC region
259     #[cfg(target_arch = "x86_64")]
260     SgxEpcRangeAllocation,
261 
262     /// Failed opening SGX virtual EPC device
263     #[cfg(target_arch = "x86_64")]
264     SgxVirtEpcOpen(io::Error),
265 
266     /// Failed setting the SGX virtual EPC section size
267     #[cfg(target_arch = "x86_64")]
268     SgxVirtEpcFileSetLen(io::Error),
269 
270     /// Failed opening SGX provisioning device
271     #[cfg(target_arch = "x86_64")]
272     SgxProvisionOpen(io::Error),
273 
274     /// Failed enabling SGX provisioning
275     #[cfg(target_arch = "x86_64")]
276     SgxEnableProvisioning(hypervisor::HypervisorVmError),
277 
278     /// Failed creating a new MmapRegion instance.
279     #[cfg(target_arch = "x86_64")]
280     NewMmapRegion(vm_memory::mmap::MmapRegionError),
281 
282     /// No memory zones found.
283     MissingMemoryZones,
284 
285     /// Memory configuration is not valid.
286     InvalidMemoryParameters,
287 
288     /// Forbidden operation. Impossible to resize guest memory if it is
289     /// backed by user defined memory regions.
290     InvalidResizeWithMemoryZones,
291 
292     /// It's invalid to try applying a NUMA policy to a memory zone that is
293     /// memory mapped with MAP_SHARED.
294     InvalidSharedMemoryZoneWithHostNuma,
295 
296     /// Failed applying NUMA memory policy.
297     ApplyNumaPolicy(io::Error),
298 
299     /// Memory zone identifier is not unique.
300     DuplicateZoneId,
301 
302     /// No virtio-mem resizing handler found.
303     MissingVirtioMemHandler,
304 
305     /// Unknown memory zone.
306     UnknownMemoryZone,
307 
308     /// Invalid size for resizing. The size can be anything except 0.
309     InvalidHotplugSize,
310 
311     /// Invalid hotplug method associated with memory zones resizing capability.
312     InvalidHotplugMethodWithMemoryZones,
313 
314     /// Could not find specified memory zone identifier from hash map.
315     MissingZoneIdentifier,
316 
317     /// Resizing the memory zone failed.
318     ResizeZone,
319 
320     /// Guest address overflow
321     GuestAddressOverFlow,
322 
323     /// Error opening snapshot file
324     SnapshotOpen(io::Error),
325 
326     /// Error copying snapshot into region
327     SnapshotCopy(GuestMemoryError),
328 
329     /// Failed to allocate MMIO address
330     AllocateMmioAddress,
331 
332     #[cfg(target_arch = "aarch64")]
333     /// Failed to create UEFI flash
334     CreateUefiFlash(HypervisorVmError),
335 
336     /// Using a directory as a backing file for memory is not supported
337     DirectoryAsBackingFileForMemory,
338 
339     /// Failed to stat filesystem
340     GetFileSystemBlockSize(io::Error),
341 
342     /// Memory size is not aligned to the default page size or the configured hugepage size
343     MisalignedMemorySize,
344 }
345 
346 const ENABLE_FLAG: usize = 0;
347 const INSERTING_FLAG: usize = 1;
348 const REMOVING_FLAG: usize = 2;
349 const EJECT_FLAG: usize = 3;
350 
351 const BASE_OFFSET_LOW: u64 = 0;
352 const BASE_OFFSET_HIGH: u64 = 0x4;
353 const LENGTH_OFFSET_LOW: u64 = 0x8;
354 const LENGTH_OFFSET_HIGH: u64 = 0xC;
355 const STATUS_OFFSET: u64 = 0x14;
356 const SELECTION_OFFSET: u64 = 0;
357 
358 // The MMIO address space size is reduced by 64 KiB. This is done for the
359 // following reasons:
360 //  - Reduce the addressable space size by at least 4 KiB to work around a Linux
361 //    bug when the VMM allocates devices at the end of the addressable space
362 //  - Windows requires the addressable space size to be 64 KiB aligned
363 fn mmio_address_space_size(phys_bits: u8) -> u64 {
364     (1 << phys_bits) - (1 << 16)
365 }
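// Illustrative sketch (not part of the original source): for phys_bits = 40,
// the result is (1 << 40) - 64 KiB, and the remaining size stays 64 KiB
// aligned as required:
//
//     let size = mmio_address_space_size(40);
//     assert_eq!(size, (1u64 << 40) - (1u64 << 16));
//     assert_eq!(size & 0xffff, 0);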
366 
367 // The `statfs` function can retrieve information about a hugetlbfs mount, and the
368 // hugepage size is reported in the `f_bsize` field.
369 //
370 // See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169
371 fn statfs_get_bsize(path: &str) -> Result<u64, Error> {
372     let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?;
373     let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit();
374 
375     // SAFETY: FFI call with a valid path and buffer
376     let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) };
377     if ret != 0 {
378         return Err(Error::GetFileSystemBlockSize(
379             std::io::Error::last_os_error(),
380         ));
381     }
382 
383     // SAFETY: `buf` is valid at this point
384     // Because this value is always positive, just convert it directly.
385     // Note that `f_bsize` is `i64` in glibc and `u64` in musl, so using `as u64` would be
386     // flagged by `clippy` on musl targets.  To avoid the warning, `as _` is used instead of
387     // `as u64`.
388     let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _;
389     Ok(bsize)
390 }
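// Illustrative sketch (assumption, not part of the original source): on a host
// where /dev/hugepages is a hugetlbfs mount backed by 2 MiB pages, this would
// report the default hugepage size:
//
//     let bsize = statfs_get_bsize("/dev/hugepages")?;
//     assert_eq!(bsize, 2 << 20);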
391 
392 fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> {
393     // SAFETY: FFI call. Trivially safe.
394     let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
395 
396     // If there is no backing file and `hugepages` is disabled, just use the system page size.
397     if zone.file.is_none() && !zone.hugepages {
398         return Ok(page_size);
399     }
400 
401     // If `hugepages` is enabled and `hugepage_size` is specified, just use it directly.
402     if zone.hugepages && zone.hugepage_size.is_some() {
403         return Ok(zone.hugepage_size.unwrap());
404     }
405 
406     // There are two scenarios here:
407     //  - `hugepages` is enabled but `hugepage_size` is not specified:
408     //     Call `statfs` on `/dev/hugepages` to get the default hugepage size.
409     //  - A backing file is specified:
410     //     Call `statfs` on the file and get its `f_bsize`.  If the value is larger than the
411     //     normal page size, use `f_bsize` because the file is in a hugetlbfs.  If the value
412     //     is less than or equal to the page size, just use the page size.
413     let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| {
414         pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
415     })?;
416 
417     let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
418 
419     Ok(align_size)
420 }
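// Illustrative summary of the cases handled above (not part of the original
// source):
//
//     file: None, hugepages: false                 -> system page size (e.g. 4 KiB)
//     hugepages: true, hugepage_size: Some(2 MiB)  -> 2 MiB
//     hugepages: true, hugepage_size: None         -> f_bsize of /dev/hugepages
//     file: Some(path)                             -> max(page size, f_bsize of path)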
421 
422 #[inline]
423 fn align_down<T>(val: T, align: T) -> T
424 where
425     T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>,
426 {
427     val & !(align - 1u8.into())
428 }
429 
430 #[inline]
431 fn is_aligned<T>(val: T, align: T) -> bool
432 where
433     T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq,
434 {
435     (val & (align - 1u8.into())) == 0u8.into()
436 }
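// Illustrative sketch (not part of the original source), using a 2 MiB
// alignment:
//
//     assert_eq!(align_down(5u64 << 20, 2u64 << 20), 4u64 << 20);
//     assert!(is_aligned(4u64 << 20, 2u64 << 20));
//     assert!(!is_aligned(5u64 << 20, 2u64 << 20));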
437 
438 impl BusDevice for MemoryManager {
439     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
440         if self.selected_slot < self.hotplug_slots.len() {
441             let state = &self.hotplug_slots[self.selected_slot];
442             match offset {
443                 BASE_OFFSET_LOW => {
444                     data.copy_from_slice(&state.base.to_le_bytes()[..4]);
445                 }
446                 BASE_OFFSET_HIGH => {
447                     data.copy_from_slice(&state.base.to_le_bytes()[4..]);
448                 }
449                 LENGTH_OFFSET_LOW => {
450                     data.copy_from_slice(&state.length.to_le_bytes()[..4]);
451                 }
452                 LENGTH_OFFSET_HIGH => {
453                     data.copy_from_slice(&state.length.to_le_bytes()[4..]);
454                 }
455                 STATUS_OFFSET => {
456                     // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
457                     data.fill(0);
458                     if state.active {
459                         data[0] |= 1 << ENABLE_FLAG;
460                     }
461                     if state.inserting {
462                         data[0] |= 1 << INSERTING_FLAG;
463                     }
464                     if state.removing {
465                         data[0] |= 1 << REMOVING_FLAG;
466                     }
467                 }
468                 _ => {
469                     warn!(
470                         "Unexpected offset for accessing memory manager device: {:#}",
471                         offset
472                     );
473                 }
474             }
475         } else {
476             warn!("Out of range memory slot: {}", self.selected_slot);
477         }
478     }
479 
480     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
481         match offset {
482             SELECTION_OFFSET => {
483                 self.selected_slot = usize::from(data[0]);
484             }
485             STATUS_OFFSET => {
486                 if self.selected_slot < self.hotplug_slots.len() {
487                     let state = &mut self.hotplug_slots[self.selected_slot];
488                     // The ACPI code writes back a 1 to acknowledge the insertion
489                     if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
490                         state.inserting = false;
491                     }
492                     // Ditto for removal
493                     if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
494                         state.removing = false;
495                     }
496                     // Trigger removal of "DIMM"
497                     if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
498                         warn!("Ejection of memory not currently supported");
499                     }
500                 } else {
501                     warn!("Out of range memory slot: {}", self.selected_slot);
502                 }
503             }
504             _ => {
505                 warn!(
506                     "Unexpected offset for accessing memory manager device: {:#}",
507                     offset
508                 );
509             }
510         };
511         None
512     }
513 }
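// Illustrative sketch (assumption, not part of the original source) of how the
// guest ACPI code is expected to drive this device: it selects a slot by
// writing its index to SELECTION_OFFSET, then reads STATUS_OFFSET and checks
// the flag bits:
//
//     let mut status = [0u8; 4];
//     memory_manager.write(0, SELECTION_OFFSET, &[2]); // select slot 2
//     memory_manager.read(0, STATUS_OFFSET, &mut status);
//     let active = status[0] & (1 << ENABLE_FLAG) != 0;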
514 
515 impl MemoryManager {
516     /// Creates all memory regions based on the available RAM ranges defined
517     /// by `ram_regions`, and based on the description of the memory zones.
518     /// In practice, this function can perform multiple memory mappings of the
519     /// same backing file if there's a hole in the address space between two
520     /// RAM ranges.
521     ///
522     /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G)
523     /// and zones describing two zones (one of size 1G and one of size 4G).
524     ///
525     /// This function will create 3 resulting memory regions:
526     /// - First one mapping entirely the first memory zone on 0-1G range
527     /// - Second one mapping partially the second memory zone on 1G-3G range
528     /// - Third one mapping partially the second memory zone on 4G-6G range
529     ///
530     /// Also, all memory regions are page-size aligned (i.e. their sizes must
531     /// be multiples of the page size), which may leave an additional hole in the
532     /// address space when hugepages are used.
533     fn create_memory_regions_from_zones(
534         ram_regions: &[(GuestAddress, usize)],
535         zones: &[MemoryZoneConfig],
536         prefault: Option<bool>,
537         thp: bool,
538     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
539         let mut zone_iter = zones.iter();
540         let mut mem_regions = Vec::new();
541         let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?;
542         let mut zone_align_size = memory_zone_get_align_size(zone)?;
543         let mut zone_offset = 0u64;
544         let mut memory_zones = HashMap::new();
545 
546         if !is_aligned(zone.size, zone_align_size) {
547             return Err(Error::MisalignedMemorySize);
548         }
549 
550         // Add zone id to the list of memory zones.
551         memory_zones.insert(zone.id.clone(), MemoryZone::default());
552 
553         for ram_region in ram_regions.iter() {
554             let mut ram_region_offset = 0;
555             let mut exit = false;
556 
557             loop {
558                 let mut ram_region_consumed = false;
559                 let mut pull_next_zone = false;
560 
561                 let ram_region_available_size =
562                     align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size);
563                 if ram_region_available_size == 0 {
564                     break;
565                 }
566                 let zone_sub_size = zone.size - zone_offset;
567 
568                 let file_offset = zone_offset;
569                 let region_start = ram_region
570                     .0
571                     .checked_add(ram_region_offset)
572                     .ok_or(Error::GuestAddressOverFlow)?;
573                 let region_size = if zone_sub_size <= ram_region_available_size {
574                     if zone_sub_size == ram_region_available_size {
575                         ram_region_consumed = true;
576                     }
577 
578                     ram_region_offset += zone_sub_size;
579                     pull_next_zone = true;
580 
581                     zone_sub_size
582                 } else {
583                     zone_offset += ram_region_available_size;
584                     ram_region_consumed = true;
585 
586                     ram_region_available_size
587                 };
588 
589                 info!(
590                     "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}",
591                     zone.id,
592                     region_start.raw_value(),
593                     region_size
594                 );
595                 let region = MemoryManager::create_ram_region(
596                     &zone.file,
597                     file_offset,
598                     region_start,
599                     region_size as usize,
600                     prefault.unwrap_or(zone.prefault),
601                     zone.shared,
602                     zone.hugepages,
603                     zone.hugepage_size,
604                     zone.host_numa_node,
605                     None,
606                     thp,
607                 )?;
608 
609                 // Add region to the list of regions associated with the
610                 // current memory zone.
611                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
612                     memory_zone.regions.push(region.clone());
613                 }
614 
615                 mem_regions.push(region);
616 
617                 if pull_next_zone {
618                     // Get the next zone and reset the offset.
619                     zone_offset = 0;
620                     if let Some(z) = zone_iter.next() {
621                         zone = z;
622                     } else {
623                         exit = true;
624                         break;
625                     }
626                     zone_align_size = memory_zone_get_align_size(zone)?;
627                     if !is_aligned(zone.size, zone_align_size) {
628                         return Err(Error::MisalignedMemorySize);
629                     }
630 
631                     // Check if the zone id already exists. In case it does, throw
632                     // an error as we need unique identifiers. Otherwise, add
633                     // the new zone id to the list of memory zones.
634                     if memory_zones.contains_key(&zone.id) {
635                         error!(
636                             "Memory zone identifier '{}' found more than once. \
637                             It must be unique",
638                             zone.id,
639                         );
640                         return Err(Error::DuplicateZoneId);
641                     }
642                     memory_zones.insert(zone.id.clone(), MemoryZone::default());
643                 }
644 
645                 if ram_region_consumed {
646                     break;
647                 }
648             }
649 
650             if exit {
651                 break;
652             }
653         }
654 
655         Ok((mem_regions, memory_zones))
656     }
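    // Illustrative sketch (not part of the original source), following the
    // example from the doc comment above: two RAM ranges (0-3G and 4G-6G) and
    // two zones ("z0" of 1 GiB, "z1" of 4 GiB) would produce three regions:
    //
    //     0x0           .. 0x4000_0000    -> z0 (1 GiB, start of first RAM range)
    //     0x4000_0000   .. 0xC000_0000    -> z1 (2 GiB, rest of first RAM range)
    //     0x1_0000_0000 .. 0x1_8000_0000  -> z1 (remaining 2 GiB, second RAM range)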
657 
658     // Restore both the GuestMemory regions and the MemoryZone zones.
659     fn restore_memory_regions_and_zones(
660         guest_ram_mappings: &[GuestRamMapping],
661         zones_config: &[MemoryZoneConfig],
662         prefault: Option<bool>,
663         mut existing_memory_files: HashMap<u32, File>,
664         thp: bool,
665     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
666         let mut memory_regions = Vec::new();
667         let mut memory_zones = HashMap::new();
668 
669         for zone_config in zones_config {
670             memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
671         }
672 
673         for guest_ram_mapping in guest_ram_mappings {
674             for zone_config in zones_config {
675                 if guest_ram_mapping.zone_id == zone_config.id {
676                     let region = MemoryManager::create_ram_region(
677                         if guest_ram_mapping.virtio_mem {
678                             &None
679                         } else {
680                             &zone_config.file
681                         },
682                         guest_ram_mapping.file_offset,
683                         GuestAddress(guest_ram_mapping.gpa),
684                         guest_ram_mapping.size as usize,
685                         prefault.unwrap_or(zone_config.prefault),
686                         zone_config.shared,
687                         zone_config.hugepages,
688                         zone_config.hugepage_size,
689                         zone_config.host_numa_node,
690                         existing_memory_files.remove(&guest_ram_mapping.slot),
691                         thp,
692                     )?;
693                     memory_regions.push(Arc::clone(&region));
694                     if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
695                         if guest_ram_mapping.virtio_mem {
696                             let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
697                             let region_size = region.len();
698                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
699                                 region,
700                                 virtio_device: None,
701                                 hotplugged_size,
702                                 hugepages: zone_config.hugepages,
703                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
704                             });
705                         } else {
706                             memory_zone.regions.push(region);
707                         }
708                     }
709                 }
710             }
711         }
712 
713         memory_regions.sort_by_key(|x| x.start_addr());
714 
715         Ok((memory_regions, memory_zones))
716     }
717 
718     fn fill_saved_regions(
719         &mut self,
720         file_path: PathBuf,
721         saved_regions: MemoryRangeTable,
722     ) -> Result<(), Error> {
723         if saved_regions.is_empty() {
724             return Ok(());
725         }
726 
727         // Open (read only) the snapshot file.
728         let mut memory_file = OpenOptions::new()
729             .read(true)
730             .open(file_path)
731             .map_err(Error::SnapshotOpen)?;
732 
733         let guest_memory = self.guest_memory.memory();
734         for range in saved_regions.regions() {
735             let mut offset: u64 = 0;
736             // Here we are manually handling the retry in case we can't write
737             // the whole region at once, because we can't use vm-memory's
738             // GuestMemory::read_exact_from() implementation, as it does not
739             // follow the correct behavior. For more info about this issue
740             // see: https://github.com/rust-vmm/vm-memory/issues/174
741             loop {
742                 let bytes_read = guest_memory
743                     .read_volatile_from(
744                         GuestAddress(range.gpa + offset),
745                         &mut memory_file,
746                         (range.length - offset) as usize,
747                     )
748                     .map_err(Error::SnapshotCopy)?;
749                 offset += bytes_read as u64;
750 
751                 if offset == range.length {
752                     break;
753                 }
754             }
755         }
756 
757         Ok(())
758     }
759 
760     fn validate_memory_config(
761         config: &MemoryConfig,
762         user_provided_zones: bool,
763     ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
764         let mut allow_mem_hotplug = false;
765 
766         if !user_provided_zones {
767             if config.zones.is_some() {
768                 error!(
769                     "User defined memory regions can't be provided if the \
770                     memory size is not 0"
771                 );
772                 return Err(Error::InvalidMemoryParameters);
773             }
774 
775             if config.hotplug_size.is_some() {
776                 allow_mem_hotplug = true;
777             }
778 
779             if let Some(hotplugged_size) = config.hotplugged_size {
780                 if let Some(hotplug_size) = config.hotplug_size {
781                     if hotplugged_size > hotplug_size {
782                         error!(
783                             "'hotplugged_size' {} can't be bigger than \
784                             'hotplug_size' {}",
785                             hotplugged_size, hotplug_size,
786                         );
787                         return Err(Error::InvalidMemoryParameters);
788                     }
789                 } else {
790                     error!(
791                         "Invalid to define 'hotplugged_size' when there is \
792                         no 'hotplug_size'"
793                     );
794                     return Err(Error::InvalidMemoryParameters);
795                 }
796                 if config.hotplug_method == HotplugMethod::Acpi {
797                     error!(
798                         "Invalid to define 'hotplugged_size' with hotplug \
799                         method 'acpi'"
800                     );
801                     return Err(Error::InvalidMemoryParameters);
802                 }
803             }
804 
805             // Create a single zone from the global memory config. This lets
806             // us reuse the codepath for user defined memory zones.
807             let zones = vec![MemoryZoneConfig {
808                 id: String::from(DEFAULT_MEMORY_ZONE),
809                 size: config.size,
810                 file: None,
811                 shared: config.shared,
812                 hugepages: config.hugepages,
813                 hugepage_size: config.hugepage_size,
814                 host_numa_node: None,
815                 hotplug_size: config.hotplug_size,
816                 hotplugged_size: config.hotplugged_size,
817                 prefault: config.prefault,
818             }];
819 
820             Ok((config.size, zones, allow_mem_hotplug))
821         } else {
822             if config.zones.is_none() {
823                 error!(
824                     "User defined memory regions must be provided if the \
825                     memory size is 0"
826                 );
827                 return Err(Error::MissingMemoryZones);
828             }
829 
830             // Safe to unwrap as we checked right above there were some
831             // regions.
832             let zones = config.zones.clone().unwrap();
833             if zones.is_empty() {
834                 return Err(Error::MissingMemoryZones);
835             }
836 
837             let mut total_ram_size: u64 = 0;
838             for zone in zones.iter() {
839                 total_ram_size += zone.size;
840 
841                 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
842                     error!(
843                         "Invalid to set host NUMA policy for a memory zone \
844                         backed by a regular file and mapped as 'shared'"
845                     );
846                     return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
847                 }
848 
849                 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
850                     error!("Invalid to set ACPI hotplug method for memory zones");
851                     return Err(Error::InvalidHotplugMethodWithMemoryZones);
852                 }
853 
854                 if let Some(hotplugged_size) = zone.hotplugged_size {
855                     if let Some(hotplug_size) = zone.hotplug_size {
856                         if hotplugged_size > hotplug_size {
857                             error!(
858                                 "'hotplugged_size' {} can't be bigger than \
859                                 'hotplug_size' {}",
860                                 hotplugged_size, hotplug_size,
861                             );
862                             return Err(Error::InvalidMemoryParameters);
863                         }
864                     } else {
865                         error!(
866                             "Invalid to define 'hotplugged_size' when there is \
867                             no 'hotplug_size' for a memory zone"
868                         );
869                         return Err(Error::InvalidMemoryParameters);
870                     }
871                     if config.hotplug_method == HotplugMethod::Acpi {
872                         error!(
873                             "Invalid to define 'hotplugged_size' with hotplug \
874                             method 'acpi'"
875                         );
876                         return Err(Error::InvalidMemoryParameters);
877                     }
878                 }
879             }
880 
881             Ok((total_ram_size, zones, allow_mem_hotplug))
882         }
883     }
884 
885     pub fn allocate_address_space(&mut self) -> Result<(), Error> {
886         let mut list = Vec::new();
887 
888         for (zone_id, memory_zone) in self.memory_zones.iter() {
889             let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
890                 memory_zone
891                     .regions()
892                     .iter()
893                     .map(|r| (r.clone(), false))
894                     .collect();
895 
896             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
897                 regions.push((virtio_mem_zone.region().clone(), true));
898             }
899 
900             list.push((zone_id.clone(), regions));
901         }
902 
903         for (zone_id, regions) in list {
904             for (region, virtio_mem) in regions {
905                 let slot = self.create_userspace_mapping(
906                     region.start_addr().raw_value(),
907                     region.len(),
908                     region.as_ptr() as u64,
909                     self.mergeable,
910                     false,
911                     self.log_dirty,
912                 )?;
913 
914                 let file_offset = if let Some(file_offset) = region.file_offset() {
915                     file_offset.start()
916                 } else {
917                     0
918                 };
919 
920                 self.guest_ram_mappings.push(GuestRamMapping {
921                     gpa: region.start_addr().raw_value(),
922                     size: region.len(),
923                     slot,
924                     zone_id: zone_id.clone(),
925                     virtio_mem,
926                     file_offset,
927                 });
928                 self.ram_allocator
929                     .allocate(Some(region.start_addr()), region.len(), None)
930                     .ok_or(Error::MemoryRangeAllocation)?;
931             }
932         }
933 
934         // Allocate SubRegion and Reserved address ranges.
935         for region in self.arch_mem_regions.iter() {
936             if region.r_type == RegionType::Ram {
937                 // Ignore the RAM type since ranges have already been allocated
938                 // based on the GuestMemory regions.
939                 continue;
940             }
941             self.ram_allocator
942                 .allocate(
943                     Some(GuestAddress(region.base)),
944                     region.size as GuestUsize,
945                     None,
946                 )
947                 .ok_or(Error::MemoryRangeAllocation)?;
948         }
949 
950         Ok(())
951     }
952 
953     #[cfg(target_arch = "aarch64")]
954     fn add_uefi_flash(&mut self) -> Result<(), Error> {
955         // On AArch64, the UEFI binary requires a flash device at address 0.
956         // A 4 MiB memory region is mapped to simulate the flash.
957         let uefi_mem_slot = self.allocate_memory_slot();
958         let uefi_region = GuestRegionMmap::new(
959             MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(),
960             arch::layout::UEFI_START,
961         )
962         .unwrap();
963         let uefi_mem_region = self.vm.make_user_memory_region(
964             uefi_mem_slot,
965             uefi_region.start_addr().raw_value(),
966             uefi_region.len(),
967             uefi_region.as_ptr() as u64,
968             false,
969             false,
970         );
971         self.vm
972             .create_user_memory_region(uefi_mem_region)
973             .map_err(Error::CreateUefiFlash)?;
974 
975         let uefi_flash =
976             GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap());
977 
978         self.uefi_flash = Some(uefi_flash);
979 
980         Ok(())
981     }
982 
983     #[allow(clippy::too_many_arguments)]
984     pub fn new(
985         vm: Arc<dyn hypervisor::Vm>,
986         config: &MemoryConfig,
987         prefault: Option<bool>,
988         phys_bits: u8,
989         #[cfg(feature = "tdx")] tdx_enabled: bool,
990         restore_data: Option<&MemoryManagerSnapshotData>,
991         existing_memory_files: Option<HashMap<u32, File>>,
992         #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
993     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
994         trace_scoped!("MemoryManager::new");
995 
996         let user_provided_zones = config.size == 0;
997 
998         let mmio_address_space_size = mmio_address_space_size(phys_bits);
999         debug_assert_eq!(
1000             (((mmio_address_space_size) >> 16) << 16),
1001             mmio_address_space_size
1002         );
1003         let start_of_platform_device_area =
1004             GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
1005         let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);
1006 
1007         let (ram_size, zones, allow_mem_hotplug) =
1008             Self::validate_memory_config(config, user_provided_zones)?;
1009 
1010         let (
1011             start_of_device_area,
1012             boot_ram,
1013             current_ram,
1014             arch_mem_regions,
1015             memory_zones,
1016             guest_memory,
1017             boot_guest_memory,
1018             hotplug_slots,
1019             next_memory_slot,
1020             selected_slot,
1021             next_hotplug_slot,
1022         ) = if let Some(data) = restore_data {
1023             let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
1024                 &data.guest_ram_mappings,
1025                 &zones,
1026                 prefault,
1027                 existing_memory_files.unwrap_or_default(),
1028                 config.thp,
1029             )?;
1030             let guest_memory =
1031                 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
1032             let boot_guest_memory = guest_memory.clone();
1033             (
1034                 GuestAddress(data.start_of_device_area),
1035                 data.boot_ram,
1036                 data.current_ram,
1037                 data.arch_mem_regions.clone(),
1038                 memory_zones,
1039                 guest_memory,
1040                 boot_guest_memory,
1041                 data.hotplug_slots.clone(),
1042                 data.next_memory_slot,
1043                 data.selected_slot,
1044                 data.next_hotplug_slot,
1045             )
1046         } else {
1047             // Init guest memory
1048             let arch_mem_regions = arch::arch_memory_regions();
1049 
1050             let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
1051                 .iter()
1052                 .filter(|r| r.2 == RegionType::Ram)
1053                 .map(|r| (r.0, r.1))
1054                 .collect();
1055 
1056             let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
1057                 .iter()
1058                 .map(|(a, b, c)| ArchMemRegion {
1059                     base: a.0,
1060                     size: *b,
1061                     r_type: *c,
1062                 })
1063                 .collect();
1064 
1065             let (mem_regions, mut memory_zones) =
1066                 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?;
1067 
1068             let mut guest_memory =
1069                 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;
1070 
1071             let boot_guest_memory = guest_memory.clone();
1072 
1073             let mut start_of_device_area =
1074                 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;
1075 
1076             // Update list of memory zones for resize.
1077             for zone in zones.iter() {
1078                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
1079                     if let Some(hotplug_size) = zone.hotplug_size {
1080                         if hotplug_size == 0 {
1081                             error!("'hotplug_size' can't be 0");
1082                             return Err(Error::InvalidHotplugSize);
1083                         }
1084 
1085                         if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
1086                             start_of_device_area = start_of_device_area
1087                                 .checked_add(hotplug_size)
1088                                 .ok_or(Error::GuestAddressOverFlow)?;
1089                         } else {
1090                             // Alignment must be "natural" i.e. same as size of block
1091                             let start_addr = GuestAddress(
1092                                 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE
1093                                     - 1)
1094                                     / virtio_devices::VIRTIO_MEM_ALIGN_SIZE
1095                                     * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
1096                             );
1097 
1098                             // When `prefault` is set by vm_restore, the memory manager
1099                             // will create the RAM region with the `prefault` option from
1100                             // the restore config rather than the option from the zone.
1101                             let region = MemoryManager::create_ram_region(
1102                                 &None,
1103                                 0,
1104                                 start_addr,
1105                                 hotplug_size as usize,
1106                                 prefault.unwrap_or(zone.prefault),
1107                                 zone.shared,
1108                                 zone.hugepages,
1109                                 zone.hugepage_size,
1110                                 zone.host_numa_node,
1111                                 None,
1112                                 config.thp,
1113                             )?;
1114 
1115                             guest_memory = guest_memory
1116                                 .insert_region(Arc::clone(&region))
1117                                 .map_err(Error::GuestMemory)?;
1118 
1119                             let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
1120                             let region_size = region.len();
1121                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
1122                                 region,
1123                                 virtio_device: None,
1124                                 hotplugged_size,
1125                                 hugepages: zone.hugepages,
1126                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
1127                             });
1128 
1129                             start_of_device_area = start_addr
1130                                 .checked_add(hotplug_size)
1131                                 .ok_or(Error::GuestAddressOverFlow)?;
1132                         }
1133                     }
1134                 } else {
1135                     return Err(Error::MissingZoneIdentifier);
1136                 }
1137             }
1138 
1139             let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
1140             hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);
1141 
1142             (
1143                 start_of_device_area,
1144                 ram_size,
1145                 ram_size,
1146                 arch_mem_regions,
1147                 memory_zones,
1148                 guest_memory,
1149                 boot_guest_memory,
1150                 hotplug_slots,
1151                 0,
1152                 0,
1153                 0,
1154             )
1155         };
1156 
1157         let guest_memory = GuestMemoryAtomic::new(guest_memory);
1158 
1159         // Both MMIO and PIO address spaces start at address 0.
1160         let allocator = Arc::new(Mutex::new(
1161             SystemAllocator::new(
1162                 #[cfg(target_arch = "x86_64")]
1163                 {
1164                     GuestAddress(0)
1165                 },
1166                 #[cfg(target_arch = "x86_64")]
1167                 {
1168                     1 << 16
1169                 },
1170                 start_of_platform_device_area,
1171                 PLATFORM_DEVICE_AREA_SIZE,
1172                 #[cfg(target_arch = "x86_64")]
1173                 vec![GsiApic::new(
1174                     X86_64_IRQ_BASE,
1175                     ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
1176                 )],
1177             )
1178             .ok_or(Error::CreateSystemAllocator)?,
1179         ));
1180 
1181         #[cfg(not(feature = "tdx"))]
1182         let dynamic = true;
1183         #[cfg(feature = "tdx")]
1184         let dynamic = !tdx_enabled;
1185 
1186         let acpi_address = if dynamic
1187             && config.hotplug_method == HotplugMethod::Acpi
1188             && (config.hotplug_size.unwrap_or_default() > 0)
1189         {
1190             Some(
1191                 allocator
1192                     .lock()
1193                     .unwrap()
1194                     .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
1195                     .ok_or(Error::AllocateMmioAddress)?,
1196             )
1197         } else {
1198             None
1199         };
1200 
1201         // If running with SGX, the start of the device area and the RAM area may
1202         // diverge, but at this point they are next to each other.
1203         let end_of_ram_area = start_of_device_area.unchecked_sub(1);
1204         let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();
1205 
1206         let mut memory_manager = MemoryManager {
1207             boot_guest_memory,
1208             guest_memory,
1209             next_memory_slot,
1210             start_of_device_area,
1211             end_of_device_area,
1212             end_of_ram_area,
1213             vm,
1214             hotplug_slots,
1215             selected_slot,
1216             mergeable: config.mergeable,
1217             allocator,
1218             hotplug_method: config.hotplug_method,
1219             boot_ram,
1220             current_ram,
1221             next_hotplug_slot,
1222             shared: config.shared,
1223             hugepages: config.hugepages,
1224             hugepage_size: config.hugepage_size,
1225             prefault: config.prefault,
1226             #[cfg(target_arch = "x86_64")]
1227             sgx_epc_region: None,
1228             user_provided_zones,
1229             snapshot_memory_ranges: MemoryRangeTable::default(),
1230             memory_zones,
1231             guest_ram_mappings: Vec::new(),
1232             acpi_address,
1233             log_dirty: dynamic, // Cannot log dirty pages on a TD
1234             arch_mem_regions,
1235             ram_allocator,
1236             dynamic,
1237             #[cfg(target_arch = "aarch64")]
1238             uefi_flash: None,
1239             thp: config.thp,
1240         };
1241 
1242         #[cfg(target_arch = "aarch64")]
1243         {
1244             // For Aarch64 we cannot lazily allocate the address space like we
1245             // do for x86, because while restoring a VM from snapshot we would
1246             // need the address space to be allocated to properly restore VGIC.
1247             // And the restore of VGIC happens before we attempt to run the vCPUs
1248             // for the first time, thus we need to allocate the address space
1249             // beforehand.
1250             memory_manager.allocate_address_space()?;
1251             memory_manager.add_uefi_flash()?;
1252         }
1253 
1254         #[cfg(target_arch = "x86_64")]
1255         if let Some(sgx_epc_config) = sgx_epc_config {
1256             memory_manager.setup_sgx(sgx_epc_config)?;
1257         }
1258 
1259         Ok(Arc::new(Mutex::new(memory_manager)))
1260     }
1261 
1262     pub fn new_from_snapshot(
1263         snapshot: &Snapshot,
1264         vm: Arc<dyn hypervisor::Vm>,
1265         config: &MemoryConfig,
1266         source_url: Option<&str>,
1267         prefault: bool,
1268         phys_bits: u8,
1269     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
1270         if let Some(source_url) = source_url {
1271             let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
1272             memory_file_path.push(String::from(SNAPSHOT_FILENAME));
1273 
1274             let mem_snapshot: MemoryManagerSnapshotData =
1275                 snapshot.to_state().map_err(Error::Restore)?;
1276 
1277             let mm = MemoryManager::new(
1278                 vm,
1279                 config,
1280                 Some(prefault),
1281                 phys_bits,
1282                 #[cfg(feature = "tdx")]
1283                 false,
1284                 Some(&mem_snapshot),
1285                 None,
1286                 #[cfg(target_arch = "x86_64")]
1287                 None,
1288             )?;
1289 
1290             mm.lock()
1291                 .unwrap()
1292                 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;
1293 
1294             Ok(mm)
1295         } else {
1296             Err(Error::RestoreMissingSourceUrl)
1297         }
1298     }
1299 
1300     fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
1301         // SAFETY: FFI call with correct arguments
1302         let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };
1303 
1304         if res < 0 {
1305             Err(io::Error::last_os_error())
1306         } else {
1307             Ok(res as RawFd)
1308         }
1309     }
1310 
1311     fn mbind(
1312         addr: *mut u8,
1313         len: u64,
1314         mode: u32,
1315         nodemask: Vec<u64>,
1316         maxnode: u64,
1317         flags: u32,
1318     ) -> Result<(), io::Error> {
1319         // SAFETY: FFI call with correct arguments
1320         let res = unsafe {
1321             libc::syscall(
1322                 libc::SYS_mbind,
1323                 addr as *mut libc::c_void,
1324                 len,
1325                 mode,
1326                 nodemask.as_ptr(),
1327                 maxnode,
1328                 flags,
1329             )
1330         };
1331 
1332         if res < 0 {
1333             Err(io::Error::last_os_error())
1334         } else {
1335             Ok(())
1336         }
1337     }
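    // Illustrative sketch (not part of the original source): binding a region
    // to host NUMA node 1 builds a one-word nodemask with bit 1 set and passes
    // maxnode = node + 1 + 1 to account for the kernel quirk described in
    // create_ram_region():
    //
    //     let nodemask: Vec<u64> = vec![1 << 1];
    //     Self::mbind(addr, len, MPOL_BIND, nodemask, 1 + 1 + 1,
    //                 MPOL_MF_STRICT | MPOL_MF_MOVE)?;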
1338 
1339     fn create_anonymous_file(
1340         size: usize,
1341         hugepages: bool,
1342         hugepage_size: Option<u64>,
1343     ) -> Result<FileOffset, Error> {
1344         let fd = Self::memfd_create(
1345             &ffi::CString::new("ch_ram").unwrap(),
1346             libc::MFD_CLOEXEC
1347                 | if hugepages {
1348                     libc::MFD_HUGETLB
1349                         | if let Some(hugepage_size) = hugepage_size {
1350                             /*
1351                              * From the Linux kernel:
1352                              * Several system calls take a flag to request "hugetlb" huge pages.
1353                              * Without further specification, these system calls will use the
1354                              * system's default huge page size.  If a system supports multiple
1355                              * huge page sizes, the desired huge page size can be specified in
1356                              * bits [26:31] of the flag arguments.  The value in these 6 bits
1357                              * will encode the log2 of the huge page size.
1358                              */
1359 
1360                             hugepage_size.trailing_zeros() << 26
1361                         } else {
1362                             // Use the system default huge page size
1363                             0
1364                         }
1365                 } else {
1366                     0
1367                 },
1368         )
1369         .map_err(Error::SharedFileCreate)?;
1370 
1371         // SAFETY: fd is valid
1372         let f = unsafe { File::from_raw_fd(fd) };
1373         f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1374 
1375         Ok(FileOffset::new(f, 0))
1376     }
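    // Illustrative sketch (not part of the original source): requesting 2 MiB
    // hugepages encodes log2(2 MiB) = 21 into bits [26:31] of the memfd flags,
    // i.e. 21 << 26, which equals libc::MFD_HUGE_2MB:
    //
    //     let fo = Self::create_anonymous_file(1 << 30, true, Some(2 << 20))?;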
1377 
1378     fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> {
1379         if backing_file.is_dir() {
1380             Err(Error::DirectoryAsBackingFileForMemory)
1381         } else {
1382             let f = OpenOptions::new()
1383                 .read(true)
1384                 .write(true)
1385                 .open(backing_file)
1386                 .map_err(Error::SharedFileCreate)?;
1387 
1388             Ok(FileOffset::new(f, file_offset))
1389         }
1390     }
1391 
1392     #[allow(clippy::too_many_arguments)]
1393     pub fn create_ram_region(
1394         backing_file: &Option<PathBuf>,
1395         file_offset: u64,
1396         start_addr: GuestAddress,
1397         size: usize,
1398         prefault: bool,
1399         shared: bool,
1400         hugepages: bool,
1401         hugepage_size: Option<u64>,
1402         host_numa_node: Option<u32>,
1403         existing_memory_file: Option<File>,
1404         thp: bool,
1405     ) -> Result<Arc<GuestRegionMmap>, Error> {
1406         let mut mmap_flags = libc::MAP_NORESERVE;
1407 
1408         // The duplication of mmap_flags ORing here is unfortunate but it also makes
1409         // the complexity of the handling clear.
1410         let fo = if let Some(f) = existing_memory_file {
1411             // It must be MAP_SHARED, as otherwise we wouldn't already have an FD
1412             mmap_flags |= libc::MAP_SHARED;
1413             Some(FileOffset::new(f, file_offset))
1414         } else if let Some(backing_file) = backing_file {
1415             if shared {
1416                 mmap_flags |= libc::MAP_SHARED;
1417             } else {
1418                 mmap_flags |= libc::MAP_PRIVATE;
1419             }
1420             Some(Self::open_backing_file(backing_file, file_offset)?)
1421         } else if shared || hugepages {
1422             // For hugepages we must also use MAP_SHARED, otherwise we will trigger
1423             // #4805: MAP_PRIVATE would cause CoW against the backing file, which
1424             // conflicts with the VFIO pinning
1425             mmap_flags |= libc::MAP_SHARED;
1426             Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?)
1427         } else {
1428             mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
1429             None
1430         };
1431 
1432         let region = GuestRegionMmap::new(
1433             MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
1434                 .map_err(Error::GuestMemoryRegion)?,
1435             start_addr,
1436         )
1437         .map_err(Error::GuestMemory)?;
1438 
1439         // Apply NUMA policy if needed.
1440         if let Some(node) = host_numa_node {
1441             let addr = region.deref().as_ptr();
1442             let len = region.deref().size() as u64;
1443             let mode = MPOL_BIND;
1444             let mut nodemask: Vec<u64> = Vec::new();
1445             let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
1446 
1447             // Linux interprets maxnode in a quirky way: it cuts off the last
1448             // node, so we have to add 1 to what would otherwise be the correct
1449             // maxnode value.
1450             let maxnode = node as u64 + 1 + 1;
1451 
1452             // Allocate the right size for the vector.
1453             nodemask.resize((node as usize / 64) + 1, 0);
1454 
1455             // Fill the global bitmask through the nodemask vector.
1456             let idx = (node / 64) as usize;
1457             let shift = node % 64;
1458             nodemask[idx] |= 1u64 << shift;
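                 // Example: host_numa_node = 1 yields idx = 0, shift = 1, so
                 // nodemask = [0b10] and maxnode = 3 (node + 1, plus 1 for the
                 // quirk described above).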
1459 
1460             // Policies are enforced by using MPOL_MF_MOVE flag as it will
1461             // force the kernel to move all pages that might have been already
1462             // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
1463             // used to throw an error if MPOL_MF_MOVE didn't succeed.
1464             // MPOL_BIND is the selected mode as it specifies a strict policy
1465             // that restricts memory allocation to the nodes specified in the
1466             // nodemask.
1467             Self::mbind(addr, len, mode, nodemask, maxnode, flags)
1468                 .map_err(Error::ApplyNumaPolicy)?;
1469         }
1470 
1471         // Prefault the region if needed, in parallel.
1472         if prefault {
1473             let page_size =
1474                 Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize;
1475 
1476             if !is_aligned(size, page_size) {
1477                 warn!(
1478                     "Prefaulting memory size {} misaligned with page size {}",
1479                     size, page_size
1480                 );
1481             }
1482 
1483             let num_pages = size / page_size;
1484 
1485             let num_threads = Self::get_prefault_num_threads(page_size, num_pages);
1486 
1487             let pages_per_thread = num_pages / num_threads;
1488             let remainder = num_pages % num_threads;
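                 // The first `remainder` threads each take one extra page so that
                 // all num_pages pages are covered by the spawned threads below.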
1489 
1490             let barrier = Arc::new(Barrier::new(num_threads));
1491             thread::scope(|s| {
1492                 let r = &region;
1493                 for i in 0..num_threads {
1494                     let barrier = Arc::clone(&barrier);
1495                     s.spawn(move || {
1496                         // Wait until all threads have been spawned to avoid contention
1497                         // over mmap_sem between thread stack allocation and page faulting.
1498                         barrier.wait();
1499                         let pages = pages_per_thread + if i < remainder { 1 } else { 0 };
1500                         let offset =
1501                             page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder));
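                             // MADV_POPULATE_WRITE requires Linux 5.14 or newer; on older
                             // kernels the call below fails and only a warning is logged.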
1502                         // SAFETY: FFI call with correct arguments
1503                         let ret = unsafe {
1504                             let addr = r.as_ptr().add(offset);
1505                             libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE)
1506                         };
1507                         if ret != 0 {
1508                             let e = io::Error::last_os_error();
1509                             warn!("Failed to prefault pages: {}", e);
1510                         }
1511                     });
1512                 }
1513             });
1514         }
1515 
1516         if region.file_offset().is_none() && thp {
1517             info!(
1518                 "Anonymous mapping at 0x{:x} (size = 0x{:x})",
1519                 region.as_ptr() as u64,
1520                 size
1521             );
1522             // SAFETY: FFI call with correct arguments
1523             let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
1524             if ret != 0 {
1525                 let e = io::Error::last_os_error();
1526                 warn!("Failed to mark pages as THP eligible: {}", e);
1527             }
1528         }
1529 
1530         Ok(Arc::new(region))
1531     }
1532 
1533     // Duplicate of `memory_zone_get_align_size` that does not require a `zone`
1534     fn get_prefault_align_size(
1535         backing_file: &Option<PathBuf>,
1536         hugepages: bool,
1537         hugepage_size: Option<u64>,
1538     ) -> Result<u64, Error> {
1539         // SAFETY: FFI call. Trivially safe.
1540         let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
1541         match (hugepages, hugepage_size, backing_file) {
1542             (false, _, _) => Ok(page_size),
1543             (true, Some(hugepage_size), _) => Ok(hugepage_size),
1544             (true, None, _) => {
1545                 // There are two scenarios here:
1546                 //  - `hugepages` is enabled but `hugepage_size` is not specified:
1547                 //     Call `statfs` on `/dev/hugepages` to get the default huge page size
1548                 //  - The backing file is specified:
1549                 //     Call `statfs` on the file and get its `f_bsize`.  If the value is larger than the
1550                 //     normal page size, just use `f_bsize` because the file is on a hugetlbfs.  If the
1551                 //     value is less than or equal to the page size, just use the page size.
1552                 let path = backing_file
1553                     .as_ref()
1554                     .map_or(Ok("/dev/hugepages"), |pathbuf| {
1555                         pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
1556                     })?;
1557                 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
1558                 Ok(align_size)
1559             }
1560         }
1561     }
1562 
1563     fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize {
1564         let mut n: usize = 1;
1565 
1566         // Do not create more threads than processors available.
1567         // SAFETY: FFI call. Trivially safe.
1568         let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) };
1569         if procs > 0 {
1570             n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT);
1571         }
1572 
1573         // Do not create more threads than pages being allocated.
1574         n = std::cmp::min(n, num_pages);
1575 
1576         // Do not create threads to allocate less than 64 MiB of memory.
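             // (page_size * num_pages) / 64 MiB is the number of 64 MiB chunks to
             // populate; capping n at that value keeps the per-thread work at or
             // above roughly 64 MiB.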
1577         n = std::cmp::min(
1578             n,
1579             std::cmp::max(1, page_size * num_pages / (64 * (1 << 20))),
1580         );
1581 
1582         n
1583     }
1584 
1585     // Update the GuestMemoryMmap with the new range
1586     fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
1587         let guest_memory = self
1588             .guest_memory
1589             .memory()
1590             .insert_region(region)
1591             .map_err(Error::GuestMemory)?;
1592         self.guest_memory.lock().unwrap().replace(guest_memory);
1593 
1594         Ok(())
1595     }
1596 
1597     //
1598     // Calculate the start address of an area next to RAM.
1599     //
1600     // If memory hotplug is allowed, the start address needs to be aligned
1601     // (rounded up) to a 128 MiB boundary; otherwise no alignment is required.
1602     // In both cases, if RAM ends below the 32-bit reserved region, the area
1603     // starts at the 64-bit RAM start.
1604     fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
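             // ORing mem_end with (128 MiB - 1) and then adding 1 below rounds it up
             // to the next 128 MiB boundary, e.g. 0x9345_6789 -> 0x9800_0000.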
1605         let mut start_addr = if allow_mem_hotplug {
1606             GuestAddress(mem_end.0 | ((128 << 20) - 1))
1607         } else {
1608             mem_end
1609         };
1610 
1611         start_addr = start_addr
1612             .checked_add(1)
1613             .ok_or(Error::GuestAddressOverFlow)?;
1614 
1615         if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
1616             return Ok(arch::layout::RAM_64BIT_START);
1617         }
1618 
1619         Ok(start_addr)
1620     }
1621 
1622     pub fn add_ram_region(
1623         &mut self,
1624         start_addr: GuestAddress,
1625         size: usize,
1626     ) -> Result<Arc<GuestRegionMmap>, Error> {
1627         // Allocate memory for the region
1628         let region = MemoryManager::create_ram_region(
1629             &None,
1630             0,
1631             start_addr,
1632             size,
1633             self.prefault,
1634             self.shared,
1635             self.hugepages,
1636             self.hugepage_size,
1637             None,
1638             None,
1639             self.thp,
1640         )?;
1641 
1642         // Map it into the guest
1643         let slot = self.create_userspace_mapping(
1644             region.start_addr().0,
1645             region.len(),
1646             region.as_ptr() as u64,
1647             self.mergeable,
1648             false,
1649             self.log_dirty,
1650         )?;
1651         self.guest_ram_mappings.push(GuestRamMapping {
1652             gpa: region.start_addr().raw_value(),
1653             size: region.len(),
1654             slot,
1655             zone_id: DEFAULT_MEMORY_ZONE.to_string(),
1656             virtio_mem: false,
1657             file_offset: 0,
1658         });
1659 
1660         self.add_region(Arc::clone(&region))?;
1661 
1662         Ok(region)
1663     }
1664 
1665     fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
1666         info!("Hotplugging new RAM: {}", size);
1667 
1668         // Check that there is a free slot
1669         if self.next_hotplug_slot >= HOTPLUG_COUNT {
1670             return Err(Error::NoSlotAvailable);
1671         }
1672 
1673         // "Inserted" DIMM must have a size that is a multiple of 128MiB
1674         if size % (128 << 20) != 0 {
1675             return Err(Error::InvalidSize);
1676         }
1677 
1678         let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;
1679 
1680         if start_addr
1681             .checked_add((size - 1).try_into().unwrap())
1682             .unwrap()
1683             > self.end_of_ram_area
1684         {
1685             return Err(Error::InsufficientHotplugRam);
1686         }
1687 
1688         let region = self.add_ram_region(start_addr, size)?;
1689 
1690         // Add region to the list of regions associated with the default
1691         // memory zone.
1692         if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
1693             memory_zone.regions.push(Arc::clone(&region));
1694         }
1695 
1696         // Tell the allocator
1697         self.ram_allocator
1698             .allocate(Some(start_addr), size as GuestUsize, None)
1699             .ok_or(Error::MemoryRangeAllocation)?;
1700 
1701         // Update the slot so that it can be queried via the I/O port
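             // Once the guest is notified (via the ACPI GED device, handled elsewhere),
             // its MSCN method (defined below) reads the 'inserting' bit back through
             // the MHPC fields and notifies the matching memory slot object.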
1702         let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
1703         slot.active = true;
1704         slot.inserting = true;
1705         slot.base = region.start_addr().0;
1706         slot.length = region.len();
1707 
1708         self.next_hotplug_slot += 1;
1709 
1710         Ok(region)
1711     }
1712 
1713     pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1714         self.guest_memory.clone()
1715     }
1716 
1717     pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
1718         self.boot_guest_memory.clone()
1719     }
1720 
1721     pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
1722         self.allocator.clone()
1723     }
1724 
1725     pub fn start_of_device_area(&self) -> GuestAddress {
1726         self.start_of_device_area
1727     }
1728 
1729     pub fn end_of_device_area(&self) -> GuestAddress {
1730         self.end_of_device_area
1731     }
1732 
1733     pub fn allocate_memory_slot(&mut self) -> u32 {
1734         let slot_id = self.next_memory_slot;
1735         self.next_memory_slot += 1;
1736         slot_id
1737     }
1738 
1739     pub fn create_userspace_mapping(
1740         &mut self,
1741         guest_phys_addr: u64,
1742         memory_size: u64,
1743         userspace_addr: u64,
1744         mergeable: bool,
1745         readonly: bool,
1746         log_dirty: bool,
1747     ) -> Result<u32, Error> {
1748         let slot = self.allocate_memory_slot();
1749         let mem_region = self.vm.make_user_memory_region(
1750             slot,
1751             guest_phys_addr,
1752             memory_size,
1753             userspace_addr,
1754             readonly,
1755             log_dirty,
1756         );
1757 
1758         info!(
1759             "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
1760             guest_phys_addr, userspace_addr, memory_size, slot
1761         );
1762 
1763         self.vm
1764             .create_user_memory_region(mem_region)
1765             .map_err(Error::CreateUserMemoryRegion)?;
1766 
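             // Exclude the guest RAM mapping from core dumps of the VMM process.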
1767         // SAFETY: the address and size are valid since the
1768         // mmap succeeded.
1769         let ret = unsafe {
1770             libc::madvise(
1771                 userspace_addr as *mut libc::c_void,
1772                 memory_size as libc::size_t,
1773                 libc::MADV_DONTDUMP,
1774             )
1775         };
1776         if ret != 0 {
1777             let e = io::Error::last_os_error();
1778             warn!("Failed to mark mapping as MADV_DONTDUMP: {}", e);
1779         }
1780 
1781         // Mark the pages as mergeable if explicitly asked for.
1782         if mergeable {
1783             // SAFETY: the address and size are valid since the
1784             // mmap succeeded.
1785             let ret = unsafe {
1786                 libc::madvise(
1787                     userspace_addr as *mut libc::c_void,
1788                     memory_size as libc::size_t,
1789                     libc::MADV_MERGEABLE,
1790                 )
1791             };
1792             if ret != 0 {
1793                 let err = io::Error::last_os_error();
1794                 // Safe to unwrap because the error is constructed with
1795                 // last_os_error(), which ensures the output will be Some().
1796                 let errno = err.raw_os_error().unwrap();
1797                 if errno == libc::EINVAL {
1798                     warn!("kernel not configured with CONFIG_KSM");
1799                 } else {
1800                     warn!("madvise error: {}", err);
1801                 }
1802                 warn!("failed to mark pages as mergeable");
1803             }
1804         }
1805 
1806         info!(
1807             "Created userspace mapping: {:x} -> {:x} {:x}",
1808             guest_phys_addr, userspace_addr, memory_size
1809         );
1810 
1811         Ok(slot)
1812     }
1813 
1814     pub fn remove_userspace_mapping(
1815         &mut self,
1816         guest_phys_addr: u64,
1817         memory_size: u64,
1818         userspace_addr: u64,
1819         mergeable: bool,
1820         slot: u32,
1821     ) -> Result<(), Error> {
1822         let mem_region = self.vm.make_user_memory_region(
1823             slot,
1824             guest_phys_addr,
1825             memory_size,
1826             userspace_addr,
1827             false, /* readonly -- don't care */
1828             false, /* log dirty */
1829         );
1830 
1831         self.vm
1832             .remove_user_memory_region(mem_region)
1833             .map_err(Error::RemoveUserMemoryRegion)?;
1834 
1835         // Mark the pages as unmergeable if they were previously marked as
1836         // mergeable.
1837         if mergeable {
1838             // SAFETY: the address and size are valid as the region was
1839             // previously advised.
1840             let ret = unsafe {
1841                 libc::madvise(
1842                     userspace_addr as *mut libc::c_void,
1843                     memory_size as libc::size_t,
1844                     libc::MADV_UNMERGEABLE,
1845                 )
1846             };
1847             if ret != 0 {
1848                 let err = io::Error::last_os_error();
1849                 // Safe to unwrap because the error is constructed with
1850                 // last_os_error(), which ensures the output will be Some().
1851                 let errno = err.raw_os_error().unwrap();
1852                 if errno == libc::EINVAL {
1853                     warn!("kernel not configured with CONFIG_KSM");
1854                 } else {
1855                     warn!("madvise error: {}", err);
1856                 }
1857                 warn!("failed to mark pages as unmergeable");
1858             }
1859         }
1860 
1861         info!(
1862             "Removed userspace mapping: {:x} -> {:x} {:x}",
1863             guest_phys_addr, userspace_addr, memory_size
1864         );
1865 
1866         Ok(())
1867     }
1868 
1869     pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
1870         if let Some(memory_zone) = self.memory_zones.get_mut(id) {
1871             if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
1872                 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
1873                     virtio_mem_device
1874                         .lock()
1875                         .unwrap()
1876                         .resize(size)
1877                         .map_err(Error::VirtioMemResizeFail)?;
1878                 }
1879 
1880                 // Keep the hotplugged_size up to date.
1881                 virtio_mem_zone.hotplugged_size = size;
1882             } else {
1883                 error!("Failed resizing virtio-mem region: No virtio-mem handler");
1884                 return Err(Error::MissingVirtioMemHandler);
1885             }
1886 
1887             return Ok(());
1888         }
1889 
1890         error!("Failed resizing virtio-mem region: Unknown memory zone");
1891         Err(Error::UnknownMemoryZone)
1892     }
1893 
1894     /// In case this function resulted in adding a new memory region to the
1895     /// guest memory, the new region is returned to the caller. The virtio-mem
1896     /// use case never adds a new region as the whole hotpluggable memory has
1897     /// already been allocated at boot time.
1898     pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
1899         if self.user_provided_zones {
1900             error!(
1901                 "Not allowed to resize guest memory when backed with user \
1902                 defined memory zones."
1903             );
1904             return Err(Error::InvalidResizeWithMemoryZones);
1905         }
1906 
1907         let mut region: Option<Arc<GuestRegionMmap>> = None;
1908         match self.hotplug_method {
1909             HotplugMethod::VirtioMem => {
1910                 if desired_ram >= self.boot_ram {
1911                     if !self.dynamic {
1912                         return Ok(region);
1913                     }
1914 
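                         // With virtio-mem the resize amount is expressed as the delta
                         // above the boot RAM.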
1915                     self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
1916                     self.current_ram = desired_ram;
1917                 }
1918             }
1919             HotplugMethod::Acpi => {
1920                 if desired_ram > self.current_ram {
1921                     if !self.dynamic {
1922                         return Ok(region);
1923                     }
1924 
1925                     region =
1926                         Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
1927                     self.current_ram = desired_ram;
1928                 }
1929             }
1930         }
1931         Ok(region)
1932     }
1933 
1934     pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
1935         if !self.user_provided_zones {
1936             error!(
1937                 "Not allowed to resize guest memory zone when no zone is \
1938                 defined."
1939             );
1940             return Err(Error::ResizeZone);
1941         }
1942 
1943         self.virtio_mem_resize(id, virtio_mem_size)
1944     }
1945 
1946     #[cfg(target_arch = "x86_64")]
1947     pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
1948         let file = OpenOptions::new()
1949             .read(true)
1950             .open("/dev/sgx_provision")
1951             .map_err(Error::SgxProvisionOpen)?;
1952         self.vm
1953             .enable_sgx_attribute(file)
1954             .map_err(Error::SgxEnableProvisioning)?;
1955 
1956         // Go over each EPC section and verify its size is a 4k multiple. At
1957         // the same time, calculate the total size needed for the contiguous
1958         // EPC region.
1959         let mut epc_region_size = 0;
1960         for epc_section in sgx_epc_config.iter() {
1961             if epc_section.size == 0 {
1962                 return Err(Error::EpcSectionSizeInvalid);
1963             }
1964             if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
1965                 return Err(Error::EpcSectionSizeInvalid);
1966             }
1967 
1968             epc_region_size += epc_section.size;
1969         }
1970 
1971         // Place the SGX EPC region on a 4k boundary between the RAM and the device area
1972         let epc_region_start = GuestAddress(
1973             ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
1974         );
1975 
1976         self.start_of_device_area = epc_region_start
1977             .checked_add(epc_region_size)
1978             .ok_or(Error::GuestAddressOverFlow)?;
1979 
1980         let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
1981         info!(
1982             "SGX EPC region: 0x{:x} (0x{:x})",
1983             epc_region_start.0, epc_region_size
1984         );
1985 
1986         // Each section can be memory mapped into the allocated region.
1987         let mut epc_section_start = epc_region_start.raw_value();
1988         for epc_section in sgx_epc_config.iter() {
1989             let file = OpenOptions::new()
1990                 .read(true)
1991                 .write(true)
1992                 .open("/dev/sgx_vepc")
1993                 .map_err(Error::SgxVirtEpcOpen)?;
1994 
1995             let prot = PROT_READ | PROT_WRITE;
1996             let mut flags = MAP_NORESERVE | MAP_SHARED;
1997             if epc_section.prefault {
1998                 flags |= MAP_POPULATE;
1999             }
2000 
2001             // We can't use the vm-memory crate to perform the memory mapping
2002             // here as it would try to ensure the size of the backing file is
2003             // matching the size of the expected mapping. The /dev/sgx_vepc
2004             // device does not work that way: it provides a file descriptor
2005             // that does not match the mapping size, as it's just a way to
2006             // let KVM know that an EPC section is being created for the guest.
2007             // SAFETY: FFI call with correct arguments
2008             let host_addr = unsafe {
2009                 libc::mmap(
2010                     std::ptr::null_mut(),
2011                     epc_section.size as usize,
2012                     prot,
2013                     flags,
2014                     file.as_raw_fd(),
2015                     0,
2016                 )
2017             } as u64;
2018 
2019             info!(
2020                 "Adding SGX EPC section: 0x{:x} (0x{:x})",
2021                 epc_section_start, epc_section.size
2022             );
2023 
2024             let _mem_slot = self.create_userspace_mapping(
2025                 epc_section_start,
2026                 epc_section.size,
2027                 host_addr,
2028                 false,
2029                 false,
2030                 false,
2031             )?;
2032 
2033             sgx_epc_region.insert(
2034                 epc_section.id.clone(),
2035                 SgxEpcSection::new(
2036                     GuestAddress(epc_section_start),
2037                     epc_section.size as GuestUsize,
2038                 ),
2039             );
2040 
2041             epc_section_start += epc_section.size;
2042         }
2043 
2044         self.sgx_epc_region = Some(sgx_epc_region);
2045 
2046         Ok(())
2047     }
2048 
2049     #[cfg(target_arch = "x86_64")]
2050     pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
2051         &self.sgx_epc_region
2052     }
2053 
2054     pub fn is_hardlink(f: &File) -> bool {
2055         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
2056         // SAFETY: FFI call with correct arguments
2057         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
2058         if ret != 0 {
2059             error!("Couldn't fstat the backing file");
2060             return false;
2061         }
2062 
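             // Anonymous memfd-backed files report st_nlink == 0, while files reachable
             // through the filesystem have at least one link.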
2063         // SAFETY: stat is valid
2064         unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
2065     }
2066 
2067     pub fn memory_zones(&self) -> &MemoryZones {
2068         &self.memory_zones
2069     }
2070 
2071     pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
2072         &mut self.memory_zones
2073     }
2074 
2075     pub fn memory_range_table(
2076         &self,
2077         snapshot: bool,
2078     ) -> std::result::Result<MemoryRangeTable, MigratableError> {
2079         let mut table = MemoryRangeTable::default();
2080 
2081         for memory_zone in self.memory_zones.values() {
2082             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
2083                 table.extend(virtio_mem_zone.plugged_ranges());
2084             }
2085 
2086             for region in memory_zone.regions() {
2087                 if snapshot {
2088                     if let Some(file_offset) = region.file_offset() {
2089                         if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
2090                             && Self::is_hardlink(file_offset.file())
2091                         {
2092                             // In this very specific case, we know the memory
2093                             // region is backed by a file on the host filesystem
2094                             // that can be accessed by the user, and additionally
2095                             // the mapping is shared, which means that modifications
2096                             // to the content are written to the actual file.
2097                             // When meeting these conditions, we can skip the
2098                             // copy of the memory content for this specific region,
2099                             // as we can assume the user will have it saved through
2100                             // the backing file already.
2101                             continue;
2102                         }
2103                     }
2104                 }
2105 
2106                 table.push(MemoryRange {
2107                     gpa: region.start_addr().raw_value(),
2108                     length: region.len(),
2109                 });
2110             }
2111         }
2112 
2113         Ok(table)
2114     }
2115 
2116     pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
2117         MemoryManagerSnapshotData {
2118             memory_ranges: self.snapshot_memory_ranges.clone(),
2119             guest_ram_mappings: self.guest_ram_mappings.clone(),
2120             start_of_device_area: self.start_of_device_area.0,
2121             boot_ram: self.boot_ram,
2122             current_ram: self.current_ram,
2123             arch_mem_regions: self.arch_mem_regions.clone(),
2124             hotplug_slots: self.hotplug_slots.clone(),
2125             next_memory_slot: self.next_memory_slot,
2126             selected_slot: self.selected_slot,
2127             next_hotplug_slot: self.next_hotplug_slot,
2128         }
2129     }
2130 
2131     pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
2132         let mut memory_slot_fds = HashMap::new();
2133         for guest_ram_mapping in &self.guest_ram_mappings {
2134             let slot = guest_ram_mapping.slot;
2135             let guest_memory = self.guest_memory.memory();
2136             let file = guest_memory
2137                 .find_region(GuestAddress(guest_ram_mapping.gpa))
2138                 .unwrap()
2139                 .file_offset()
2140                 .unwrap()
2141                 .file();
2142             memory_slot_fds.insert(slot, file.as_raw_fd());
2143         }
2144         memory_slot_fds
2145     }
2146 
2147     pub fn acpi_address(&self) -> Option<GuestAddress> {
2148         self.acpi_address
2149     }
2150 
2151     pub fn num_guest_ram_mappings(&self) -> u32 {
2152         self.guest_ram_mappings.len() as u32
2153     }
2154 
2155     #[cfg(target_arch = "aarch64")]
2156     pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
2157         self.uefi_flash.as_ref().unwrap().clone()
2158     }
2159 
2160     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2161     pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
2162         let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
2163         mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);
2164 
2165         let mut mem_offset_in_elf = mem_offset;
2166         let mut ram_maps = BTreeMap::new();
2167         for mapping in mapping_sorted_by_gpa.iter() {
2168             ram_maps.insert(
2169                 mapping.gpa,
2170                 CoredumpMemoryRegion {
2171                     mem_offset_in_elf,
2172                     mem_size: mapping.size,
2173                 },
2174             );
2175             mem_offset_in_elf += mapping.size;
2176         }
2177 
2178         CoredumpMemoryRegions { ram_maps }
2179     }
2180 
2181     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2182     pub fn coredump_iterate_save_mem(
2183         &mut self,
2184         dump_state: &DumpState,
2185     ) -> std::result::Result<(), GuestDebuggableError> {
2186         let snapshot_memory_ranges = self
2187             .memory_range_table(false)
2188             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2189 
2190         if snapshot_memory_ranges.is_empty() {
2191             return Ok(());
2192         }
2193 
2194         let coredump_file = dump_state.file.as_ref().unwrap();
2195 
2196         let guest_memory = self.guest_memory.memory();
2197         let mut total_bytes: u64 = 0;
2198 
2199         for range in snapshot_memory_ranges.regions() {
2200             let mut offset: u64 = 0;
2201             loop {
2202                 let bytes_written = guest_memory
2203                     .write_volatile_to(
2204                         GuestAddress(range.gpa + offset),
2205                         &mut coredump_file.as_fd(),
2206                         (range.length - offset) as usize,
2207                     )
2208                     .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2209                 offset += bytes_written as u64;
2210                 total_bytes += bytes_written as u64;
2211 
2212                 if offset == range.length {
2213                     break;
2214                 }
2215             }
2216         }
2217 
2218         debug!("coredump total bytes {}", total_bytes);
2219         Ok(())
2220     }
2221 
2222     pub fn receive_memory_regions<F>(
2223         &mut self,
2224         ranges: &MemoryRangeTable,
2225         fd: &mut F,
2226     ) -> std::result::Result<(), MigratableError>
2227     where
2228         F: ReadVolatile,
2229     {
2230         let guest_memory = self.guest_memory();
2231         let mem = guest_memory.memory();
2232 
2233         for range in ranges.regions() {
2234             let mut offset: u64 = 0;
2235             // Here we are manually handling the retry in case we can't read the
2236             // whole region at once because we can't use the implementation
2237             // from vm-memory::GuestMemory of read_exact_from() as it is not
2238             // following the correct behavior. For more info about this issue
2239             // see: https://github.com/rust-vmm/vm-memory/issues/174
2240             loop {
2241                 let bytes_read = mem
2242                     .read_volatile_from(
2243                         GuestAddress(range.gpa + offset),
2244                         fd,
2245                         (range.length - offset) as usize,
2246                     )
2247                     .map_err(|e| {
2248                         MigratableError::MigrateReceive(anyhow!(
2249                             "Error receiving memory from socket: {}",
2250                             e
2251                         ))
2252                     })?;
2253                 offset += bytes_read as u64;
2254 
2255                 if offset == range.length {
2256                     break;
2257                 }
2258             }
2259         }
2260 
2261         Ok(())
2262     }
2263 }
2264 
2265 struct MemoryNotify {
2266     slot_id: usize,
2267 }
2268 
2269 impl Aml for MemoryNotify {
2270     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2271         let object = aml::Path::new(&format!("M{:03}", self.slot_id));
2272         aml::If::new(
2273             &aml::Equal::new(&aml::Arg(0), &self.slot_id),
2274             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2275         )
2276         .to_aml_bytes(sink)
2277     }
2278 }
2279 
2280 struct MemorySlot {
2281     slot_id: usize,
2282 }
2283 
2284 impl Aml for MemorySlot {
2285     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2286         aml::Device::new(
2287             format!("M{:03}", self.slot_id).as_str().into(),
2288             vec![
2289                 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")),
2290                 &aml::Name::new("_UID".into(), &self.slot_id),
2291                 /*
2292                 _STA return value:
2293                 Bit [0] – Set if the device is present.
2294                 Bit [1] – Set if the device is enabled and decoding its resources.
2295                 Bit [2] – Set if the device should be shown in the UI.
2296                 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
2297                 Bit [4] – Set if the battery is present.
2298                 Bits [31:5] – Reserved (must be cleared).
2299                 */
2300                 &aml::Method::new(
2301                     "_STA".into(),
2302                     0,
2303                     false,
2304                     // Call into MSTA method which will interrogate device
2305                     vec![&aml::Return::new(&aml::MethodCall::new(
2306                         "MSTA".into(),
2307                         vec![&self.slot_id],
2308                     ))],
2309                 ),
2310                 // Get details of memory
2311                 &aml::Method::new(
2312                     "_CRS".into(),
2313                     0,
2314                     false,
2315                     // Call into MCRS which provides actual memory details
2316                     vec![&aml::Return::new(&aml::MethodCall::new(
2317                         "MCRS".into(),
2318                         vec![&self.slot_id],
2319                     ))],
2320                 ),
2321             ],
2322         )
2323         .to_aml_bytes(sink)
2324     }
2325 }
2326 
2327 struct MemorySlots {
2328     slots: usize,
2329 }
2330 
2331 impl Aml for MemorySlots {
2332     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2333         for slot_id in 0..self.slots {
2334             MemorySlot { slot_id }.to_aml_bytes(sink);
2335         }
2336     }
2337 }
2338 
2339 struct MemoryMethods {
2340     slots: usize,
2341 }
2342 
2343 impl Aml for MemoryMethods {
2344     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2345         // Add "MTFY" notification method
2346         let mut memory_notifies = Vec::new();
2347         for slot_id in 0..self.slots {
2348             memory_notifies.push(MemoryNotify { slot_id });
2349         }
2350 
2351         let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new();
2352         for memory_notifier in memory_notifies.iter() {
2353             memory_notifies_refs.push(memory_notifier);
2354         }
2355 
2356         aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink);
2357 
2358         // MSCN method
2359         aml::Method::new(
2360             "MSCN".into(),
2361             0,
2362             true,
2363             vec![
2364                 // Take lock defined above
2365                 &aml::Acquire::new("MLCK".into(), 0xffff),
2366                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2367                 &aml::While::new(
2368                     &aml::LessThan::new(&aml::Local(0), &self.slots),
2369                     vec![
2370                         // Write slot number (the loop counter in Local0) to I/O port via field
2371                         &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
2372                         // Check if MINS bit is set (inserting)
2373                         &aml::If::new(
2374                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2375                             // Notify device if it is
2376                             vec![
2377                                 &aml::MethodCall::new(
2378                                     "MTFY".into(),
2379                                     vec![&aml::Local(0), &aml::ONE],
2380                                 ),
2381                                 // Reset MINS bit
2382                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2383                             ],
2384                         ),
2385                         // Check if MRMV bit is set
2386                         &aml::If::new(
2387                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2388                             // Notify device if it is (with the eject constant 0x3)
2389                             vec![
2390                                 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
2391                                 // Reset MRMV bit
2392                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2393                             ],
2394                         ),
2395                         &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2396                     ],
2397                 ),
2398                 // Release lock
2399                 &aml::Release::new("MLCK".into()),
2400             ],
2401         )
2402         .to_aml_bytes(sink);
2403 
2404         // Memory status method
2405         aml::Method::new(
2406             "MSTA".into(),
2407             1,
2408             true,
2409             vec![
2410                 // Take lock defined above
2411                 &aml::Acquire::new("MLCK".into(), 0xffff),
2412                 // Write slot number (in first argument) to I/O port via field
2413                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2414                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2415                 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
2416                 &aml::If::new(
2417                     &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
2418                     vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2419                 ),
2420                 // Release lock
2421                 &aml::Release::new("MLCK".into()),
2422                 // Return 0 or 0xf
2423                 &aml::Return::new(&aml::Local(0)),
2424             ],
2425         )
2426         .to_aml_bytes(sink);
2427 
2428         // Memory range method
2429         aml::Method::new(
2430             "MCRS".into(),
2431             1,
2432             true,
2433             vec![
2434                 // Take lock defined above
2435                 &aml::Acquire::new("MLCK".into(), 0xffff),
2436                 // Write slot number (in first argument) to I/O port via field
2437                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2438                 &aml::Name::new(
2439                     "MR64".into(),
2440                     &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2441                         aml::AddressSpaceCacheable::Cacheable,
2442                         true,
2443                         0x0000_0000_0000_0000u64,
2444                         0xFFFF_FFFF_FFFF_FFFEu64,
2445                         None,
2446                     )]),
2447                 ),
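                     // Offsets 14, 22 and 38 below are the byte offsets of the _MIN, _MAX
                     // and _LEN fields inside the QWORD Address Space Descriptor created
                     // above; the DWord fields 4 bytes further in cover the high halves.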
2448                 &aml::CreateQWordField::new(
2449                     &aml::Path::new("MINL"),
2450                     &aml::Path::new("MR64"),
2451                     &14usize,
2452                 ),
2453                 &aml::CreateDWordField::new(
2454                     &aml::Path::new("MINH"),
2455                     &aml::Path::new("MR64"),
2456                     &18usize,
2457                 ),
2458                 &aml::CreateQWordField::new(
2459                     &aml::Path::new("MAXL"),
2460                     &aml::Path::new("MR64"),
2461                     &22usize,
2462                 ),
2463                 &aml::CreateDWordField::new(
2464                     &aml::Path::new("MAXH"),
2465                     &aml::Path::new("MR64"),
2466                     &26usize,
2467                 ),
2468                 &aml::CreateQWordField::new(
2469                     &aml::Path::new("LENL"),
2470                     &aml::Path::new("MR64"),
2471                     &38usize,
2472                 ),
2473                 &aml::CreateDWordField::new(
2474                     &aml::Path::new("LENH"),
2475                     &aml::Path::new("MR64"),
2476                     &42usize,
2477                 ),
2478                 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
2479                 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
2480                 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
2481                 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
2482                 &aml::Add::new(
2483                     &aml::Path::new("MAXL"),
2484                     &aml::Path::new("MINL"),
2485                     &aml::Path::new("LENL"),
2486                 ),
2487                 &aml::Add::new(
2488                     &aml::Path::new("MAXH"),
2489                     &aml::Path::new("MINH"),
2490                     &aml::Path::new("LENH"),
2491                 ),
2492                 &aml::If::new(
2493                     &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
2494                     vec![&aml::Add::new(
2495                         &aml::Path::new("MAXH"),
2496                         &aml::ONE,
2497                         &aml::Path::new("MAXH"),
2498                     )],
2499                 ),
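                     // The descriptor's _MAX is inclusive, so end = base + length - 1
                     // (the carry into MAXH was handled by the preceding If).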
2500                 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
2501                 // Release lock
2502                 &aml::Release::new("MLCK".into()),
2503                 &aml::Return::new(&aml::Path::new("MR64")),
2504             ],
2505         )
2506         .to_aml_bytes(sink)
2507     }
2508 }
2509 
2510 impl Aml for MemoryManager {
2511     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2512         if let Some(acpi_address) = self.acpi_address {
2513             // Memory Hotplug Controller
2514             aml::Device::new(
2515                 "_SB_.MHPC".into(),
2516                 vec![
2517                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2518                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2519                     // Mutex to protect concurrent access as we write to choose slot and then read back status
2520                     &aml::Mutex::new("MLCK".into(), 0),
2521                     &aml::Name::new(
2522                         "_CRS".into(),
2523                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2524                             aml::AddressSpaceCacheable::NotCacheable,
2525                             true,
2526                             acpi_address.0,
2527                             acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
2528                             None,
2529                         )]),
2530                     ),
2531                     // OpRegion and Fields map MMIO range into individual field values
2532                     &aml::OpRegion::new(
2533                         "MHPR".into(),
2534                         aml::OpRegionSpace::SystemMemory,
2535                         &(acpi_address.0 as usize),
2536                         &MEMORY_MANAGER_ACPI_SIZE,
2537                     ),
2538                     &aml::Field::new(
2539                         "MHPR".into(),
2540                         aml::FieldAccessType::DWord,
2541                         aml::FieldLockRule::NoLock,
2542                         aml::FieldUpdateRule::Preserve,
2543                         vec![
2544                             aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
2545                             aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
2546                             aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
2547                             aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
2548                         ],
2549                     ),
2550                     &aml::Field::new(
2551                         "MHPR".into(),
2552                         aml::FieldAccessType::DWord,
2553                         aml::FieldLockRule::NoLock,
2554                         aml::FieldUpdateRule::Preserve,
2555                         vec![
2556                             aml::FieldEntry::Reserved(128),
2557                             aml::FieldEntry::Named(*b"MHPX", 32), // PXM
2558                         ],
2559                     ),
2560                     &aml::Field::new(
2561                         "MHPR".into(),
2562                         aml::FieldAccessType::Byte,
2563                         aml::FieldLockRule::NoLock,
2564                         aml::FieldUpdateRule::WriteAsZeroes,
2565                         vec![
2566                             aml::FieldEntry::Reserved(160),
2567                             aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
2568                             aml::FieldEntry::Named(*b"MINS", 1), // Inserting
2569                             aml::FieldEntry::Named(*b"MRMV", 1), // Removing
2570                             aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
2571                         ],
2572                     ),
2573                     &aml::Field::new(
2574                         "MHPR".into(),
2575                         aml::FieldAccessType::DWord,
2576                         aml::FieldLockRule::NoLock,
2577                         aml::FieldUpdateRule::Preserve,
2578                         vec![
2579                             aml::FieldEntry::Named(*b"MSEL", 32), // Selector
2580                             aml::FieldEntry::Named(*b"MOEV", 32), // Event
2581                             aml::FieldEntry::Named(*b"MOSC", 32), // OSC
2582                         ],
2583                     ),
2584                     &MemoryMethods {
2585                         slots: self.hotplug_slots.len(),
2586                     },
2587                     &MemorySlots {
2588                         slots: self.hotplug_slots.len(),
2589                     },
2590                 ],
2591             )
2592             .to_aml_bytes(sink);
2593         } else {
2594             aml::Device::new(
2595                 "_SB_.MHPC".into(),
2596                 vec![
2597                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2598                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2599                     // Empty MSCN for GED
2600                     &aml::Method::new("MSCN".into(), 0, true, vec![]),
2601                 ],
2602             )
2603             .to_aml_bytes(sink);
2604         }
2605 
2606         #[cfg(target_arch = "x86_64")]
2607         {
2608             if let Some(sgx_epc_region) = &self.sgx_epc_region {
2609                 let min = sgx_epc_region.start().raw_value();
2610                 let max = min + sgx_epc_region.size() - 1;
2611                 // SGX EPC region
2612                 aml::Device::new(
2613                     "_SB_.EPC_".into(),
2614                     vec![
2615                         &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")),
2616                         // QWORD describing the EPC region start and size
2617                         &aml::Name::new(
2618                             "_CRS".into(),
2619                             &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2620                                 aml::AddressSpaceCacheable::NotCacheable,
2621                                 true,
2622                                 min,
2623                                 max,
2624                                 None,
2625                             )]),
2626                         ),
2627                         &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
2628                     ],
2629                 )
2630                 .to_aml_bytes(sink);
2631             }
2632         }
2633     }
2634 }
2635 
2636 impl Pausable for MemoryManager {}
2637 
2638 #[derive(Clone, Serialize, Deserialize)]
2639 pub struct MemoryManagerSnapshotData {
2640     memory_ranges: MemoryRangeTable,
2641     guest_ram_mappings: Vec<GuestRamMapping>,
2642     start_of_device_area: u64,
2643     boot_ram: u64,
2644     current_ram: u64,
2645     arch_mem_regions: Vec<ArchMemRegion>,
2646     hotplug_slots: Vec<HotPlugState>,
2647     next_memory_slot: u32,
2648     selected_slot: usize,
2649     next_hotplug_slot: usize,
2650 }
2651 
2652 impl Snapshottable for MemoryManager {
2653     fn id(&self) -> String {
2654         MEMORY_MANAGER_SNAPSHOT_ID.to_string()
2655     }
2656 
2657     fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
2658         let memory_ranges = self.memory_range_table(true)?;
2659 
2660         // Store locally this list of ranges as it will be used through the
2661         // Transportable::send() implementation. The point is to avoid the
2662         // duplication of code regarding the creation of the path for each
2663         // region. The 'snapshot' step creates the list of memory regions,
2664         // including information about the need to copy a memory region or
2665         // not. This saves the 'send' step having to go through the same
2666         // process, and instead it can directly proceed with storing the
2667         // memory range content for the ranges requiring it.
2668         self.snapshot_memory_ranges = memory_ranges;
2669 
2670         Ok(Snapshot::from_data(SnapshotData::new_from_state(
2671             &self.snapshot_data(),
2672         )?))
2673     }
2674 }
2675 
2676 impl Transportable for MemoryManager {
2677     fn send(
2678         &self,
2679         _snapshot: &Snapshot,
2680         destination_url: &str,
2681     ) -> result::Result<(), MigratableError> {
2682         if self.snapshot_memory_ranges.is_empty() {
2683             return Ok(());
2684         }
2685 
2686         let mut memory_file_path = url_to_path(destination_url)?;
2687         memory_file_path.push(String::from(SNAPSHOT_FILENAME));
2688 
2689         // Create the snapshot file for the entire memory
2690         let mut memory_file = OpenOptions::new()
2691             .read(true)
2692             .write(true)
2693             .create_new(true)
2694             .open(memory_file_path)
2695             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2696 
2697         let guest_memory = self.guest_memory.memory();
2698 
2699         for range in self.snapshot_memory_ranges.regions() {
2700             let mut offset: u64 = 0;
2701             // Here we are manually handling the retry in case we can't write
2702             // the whole region at once because we can't use the implementation
2703             // from vm-memory::GuestMemory of write_all_to() as it is not
2704             // following the correct behavior. For more info about this issue
2705             // see: https://github.com/rust-vmm/vm-memory/issues/174
2706             loop {
2707                 let bytes_written = guest_memory
2708                     .write_volatile_to(
2709                         GuestAddress(range.gpa + offset),
2710                         &mut memory_file,
2711                         (range.length - offset) as usize,
2712                     )
2713                     .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2714                 offset += bytes_written as u64;
2715 
2716                 if offset == range.length {
2717                     break;
2718                 }
2719             }
2720         }
2721         Ok(())
2722     }
2723 }
2724 
2725 impl Migratable for MemoryManager {
2726     // Start the dirty log in the hypervisor (kvm/mshv).
2727     // Also, reset the dirty bitmap logged by the vmm.
2728     // Just before we do a bulk copy we want to start/clear the dirty log so that
2729     // pages touched during our bulk copy are tracked.
2730     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2731         self.vm.start_dirty_log().map_err(|e| {
2732             MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
2733         })?;
2734 
2735         for r in self.guest_memory.memory().iter() {
2736             r.bitmap().reset();
2737         }
2738 
2739         Ok(())
2740     }
2741 
2742     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2743         self.vm.stop_dirty_log().map_err(|e| {
2744             MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
2745         })?;
2746 
2747         Ok(())
2748     }
2749 
2750     // Generate a table for the pages that are dirty. The dirty pages are collapsed
2751     // together in the table if they are contiguous.
2752     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2753         let mut table = MemoryRangeTable::default();
2754         for r in &self.guest_ram_mappings {
2755             let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
2756                 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
2757             })?;
2758             let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
2759             {
2760                 Some(region) => {
2761                     assert!(region.start_addr().raw_value() == r.gpa);
2762                     assert!(region.len() == r.size);
2763                     region.bitmap().get_and_reset()
2764                 }
2765                 None => {
2766                     return Err(MigratableError::MigrateSend(anyhow!(
2767                         "Error finding 'guest memory region' with address {:x}",
2768                         r.gpa
2769                     )))
2770                 }
2771             };
2772 
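                 // A page is considered dirty if either the hypervisor log or the
                 // VMM's own bitmap flagged it since the last reset.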
2773             let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
2774                 .iter()
2775                 .zip(vmm_dirty_bitmap.iter())
2776                 .map(|(x, y)| x | y)
2777                 .collect();
2778 
2779             let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);
2780 
2781             if sub_table.regions().is_empty() {
2782                 info!("Dirty Memory Range Table is empty");
2783             } else {
2784                 info!("Dirty Memory Range Table:");
2785                 for range in sub_table.regions() {
2786                     info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
2787                 }
2788             }
2789 
2790             table.extend(sub_table);
2791         }
2792         Ok(table)
2793     }
2794 }
2795