xref: /cloud-hypervisor/vmm/src/memory_manager.rs (revision 5641e3a283db4149052b1e9278c640bcef8a000e)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0
4 //
5 #[cfg(target_arch = "x86_64")]
6 use crate::config::SgxEpcConfig;
7 use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
8 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
9 use crate::coredump::{
10     CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError,
11 };
12 use crate::migration::url_to_path;
13 use crate::MEMORY_MANAGER_SNAPSHOT_ID;
14 use crate::{GuestMemoryMmap, GuestRegionMmap};
15 use acpi_tables::{aml, Aml};
16 use anyhow::anyhow;
17 #[cfg(target_arch = "x86_64")]
18 use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
19 use arch::RegionType;
20 #[cfg(target_arch = "x86_64")]
21 use devices::ioapic;
22 #[cfg(target_arch = "aarch64")]
23 use hypervisor::HypervisorVmError;
24 use libc::_SC_NPROCESSORS_ONLN;
25 #[cfg(target_arch = "x86_64")]
26 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
27 use serde::{Deserialize, Serialize};
28 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
29 use std::collections::BTreeMap;
30 use std::collections::HashMap;
31 use std::convert::TryInto;
32 use std::fs::{File, OpenOptions};
33 use std::io::{self};
34 use std::ops::{BitAnd, Deref, Not, Sub};
35 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
36 use std::os::fd::AsFd;
37 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
38 use std::path::PathBuf;
39 use std::result;
40 use std::sync::{Arc, Barrier, Mutex};
41 use std::{ffi, thread};
42 use tracer::trace_scoped;
43 use versionize::{VersionMap, Versionize, VersionizeResult};
44 use versionize_derive::Versionize;
45 use virtio_devices::BlocksState;
46 #[cfg(target_arch = "x86_64")]
47 use vm_allocator::GsiApic;
48 use vm_allocator::{AddressAllocator, SystemAllocator};
49 use vm_device::BusDevice;
50 use vm_memory::bitmap::AtomicBitmap;
51 use vm_memory::guest_memory::FileOffset;
52 use vm_memory::{
53     mmap::MmapRegionError, Address, Error as MmapError, GuestAddress, GuestAddressSpace,
54     GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion,
55     ReadVolatile,
56 };
57 use vm_migration::{
58     protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable,
59     Snapshot, SnapshotData, Snapshottable, Transportable, VersionMapped,
60 };
61 
62 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;
63 
64 const DEFAULT_MEMORY_ZONE: &str = "mem0";
65 
66 const SNAPSHOT_FILENAME: &str = "memory-ranges";
67 
68 #[cfg(target_arch = "x86_64")]
69 const X86_64_IRQ_BASE: u32 = 5;
70 
71 #[cfg(target_arch = "x86_64")]
72 const SGX_PAGE_SIZE: u64 = 1 << 12;
73 
74 const HOTPLUG_COUNT: usize = 8;
75 
76 // Memory policy constants
77 const MPOL_BIND: u32 = 2;
78 const MPOL_MF_STRICT: u32 = 1;
79 const MPOL_MF_MOVE: u32 = 1 << 1;
80 
81 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
82 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;
83 
84 const MAX_PREFAULT_THREAD_COUNT: usize = 16;
85 
86 #[derive(Clone, Default, Serialize, Deserialize, Versionize)]
87 struct HotPlugState {
88     base: u64,
89     length: u64,
90     active: bool,
91     inserting: bool,
92     removing: bool,
93 }
94 
95 pub struct VirtioMemZone {
96     region: Arc<GuestRegionMmap>,
97     virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
98     hotplugged_size: u64,
99     hugepages: bool,
100     blocks_state: Arc<Mutex<BlocksState>>,
101 }
102 
103 impl VirtioMemZone {
104     pub fn region(&self) -> &Arc<GuestRegionMmap> {
105         &self.region
106     }
107     pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
108         self.virtio_device = Some(virtio_device);
109     }
110     pub fn hotplugged_size(&self) -> u64 {
111         self.hotplugged_size
112     }
113     pub fn hugepages(&self) -> bool {
114         self.hugepages
115     }
116     pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
117         &self.blocks_state
118     }
119     pub fn plugged_ranges(&self) -> MemoryRangeTable {
120         self.blocks_state
121             .lock()
122             .unwrap()
123             .memory_ranges(self.region.start_addr().raw_value(), true)
124     }
125 }
126 
127 #[derive(Default)]
128 pub struct MemoryZone {
129     regions: Vec<Arc<GuestRegionMmap>>,
130     virtio_mem_zone: Option<VirtioMemZone>,
131 }
132 
133 impl MemoryZone {
134     pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
135         &self.regions
136     }
137     pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
138         &self.virtio_mem_zone
139     }
140     pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
141         self.virtio_mem_zone.as_mut()
142     }
143 }
144 
145 pub type MemoryZones = HashMap<String, MemoryZone>;
146 
147 #[derive(Clone, Serialize, Deserialize, Versionize)]
148 struct GuestRamMapping {
149     slot: u32,
150     gpa: u64,
151     size: u64,
152     zone_id: String,
153     virtio_mem: bool,
154     file_offset: u64,
155 }
156 
157 #[derive(Clone, Serialize, Deserialize, Versionize)]
158 struct ArchMemRegion {
159     base: u64,
160     size: usize,
161     r_type: RegionType,
162 }
163 
164 pub struct MemoryManager {
165     boot_guest_memory: GuestMemoryMmap,
166     guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
167     next_memory_slot: u32,
168     start_of_device_area: GuestAddress,
169     end_of_device_area: GuestAddress,
170     end_of_ram_area: GuestAddress,
171     pub vm: Arc<dyn hypervisor::Vm>,
172     hotplug_slots: Vec<HotPlugState>,
173     selected_slot: usize,
174     mergeable: bool,
175     allocator: Arc<Mutex<SystemAllocator>>,
176     hotplug_method: HotplugMethod,
177     boot_ram: u64,
178     current_ram: u64,
179     next_hotplug_slot: usize,
180     shared: bool,
181     hugepages: bool,
182     hugepage_size: Option<u64>,
183     prefault: bool,
184     thp: bool,
185     #[cfg(target_arch = "x86_64")]
186     sgx_epc_region: Option<SgxEpcRegion>,
187     user_provided_zones: bool,
188     snapshot_memory_ranges: MemoryRangeTable,
189     memory_zones: MemoryZones,
190     log_dirty: bool, // Enable dirty logging for created RAM regions
191     arch_mem_regions: Vec<ArchMemRegion>,
192     ram_allocator: AddressAllocator,
193     dynamic: bool,
194 
195     // Keep track of calls to create_userspace_mapping() for guest RAM.
196     // This is useful for getting the dirty pages, as we need to know the
197     // slots that the mappings were created in.
198     guest_ram_mappings: Vec<GuestRamMapping>,
199 
200     pub acpi_address: Option<GuestAddress>,
201     #[cfg(target_arch = "aarch64")]
202     uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
203 }
204 
205 #[derive(Debug)]
206 pub enum Error {
207     /// Failed to create shared file.
208     SharedFileCreate(io::Error),
209 
210     /// Failed to set shared file length.
211     SharedFileSetLen(io::Error),
212 
213     /// Mmap backed guest memory error
214     GuestMemory(MmapError),
215 
216     /// Failed to allocate a memory range.
217     MemoryRangeAllocation,
218 
219     /// Error from region creation
220     GuestMemoryRegion(MmapRegionError),
221 
222     /// No ACPI slot available
223     NoSlotAvailable,
224 
225     /// Not enough space in the hotplug RAM region
226     InsufficientHotplugRam,
227 
228     /// The requested hotplug memory addition is not a valid size
229     InvalidSize,
230 
231     /// Failed to create the user memory region.
232     CreateUserMemoryRegion(hypervisor::HypervisorVmError),
233 
234     /// Failed to remove the user memory region.
235     RemoveUserMemoryRegion(hypervisor::HypervisorVmError),
236 
237     /// Failed to EventFd.
238     EventFdFail(io::Error),
239 
240     /// Eventfd write error
241     EventfdError(io::Error),
242 
243     /// Failed to resize virtio-mem
244     VirtioMemResizeFail(virtio_devices::mem::Error),
245 
246     /// Cannot restore VM
247     Restore(MigratableError),
248 
249     /// Cannot restore VM because source URL is missing
250     RestoreMissingSourceUrl,
251 
252     /// Cannot create the system allocator
253     CreateSystemAllocator,
254 
255     /// Invalid SGX EPC section size
256     #[cfg(target_arch = "x86_64")]
257     EpcSectionSizeInvalid,
258 
259     /// Failed allocating SGX EPC region
260     #[cfg(target_arch = "x86_64")]
261     SgxEpcRangeAllocation,
262 
263     /// Failed opening SGX virtual EPC device
264     #[cfg(target_arch = "x86_64")]
265     SgxVirtEpcOpen(io::Error),
266 
267     /// Failed setting the SGX virtual EPC section size
268     #[cfg(target_arch = "x86_64")]
269     SgxVirtEpcFileSetLen(io::Error),
270 
271     /// Failed opening SGX provisioning device
272     #[cfg(target_arch = "x86_64")]
273     SgxProvisionOpen(io::Error),
274 
275     /// Failed enabling SGX provisioning
276     #[cfg(target_arch = "x86_64")]
277     SgxEnableProvisioning(hypervisor::HypervisorVmError),
278 
279     /// Failed creating a new MmapRegion instance.
280     #[cfg(target_arch = "x86_64")]
281     NewMmapRegion(vm_memory::mmap::MmapRegionError),
282 
283     /// No memory zones found.
284     MissingMemoryZones,
285 
286     /// Memory configuration is not valid.
287     InvalidMemoryParameters,
288 
289     /// Forbidden operation. Impossible to resize guest memory if it is
290     /// backed by user defined memory regions.
291     InvalidResizeWithMemoryZones,
292 
293     /// It's invalid to try applying a NUMA policy to a memory zone that is
294     /// memory mapped with MAP_SHARED.
295     InvalidSharedMemoryZoneWithHostNuma,
296 
297     /// Failed applying NUMA memory policy.
298     ApplyNumaPolicy(io::Error),
299 
300     /// Memory zone identifier is not unique.
301     DuplicateZoneId,
302 
303     /// No virtio-mem resizing handler found.
304     MissingVirtioMemHandler,
305 
306     /// Unknown memory zone.
307     UnknownMemoryZone,
308 
309     /// Invalid size for resizing. Any size is valid except 0.
310     InvalidHotplugSize,
311 
312     /// Invalid hotplug method associated with memory zones resizing capability.
313     InvalidHotplugMethodWithMemoryZones,
314 
315     /// Could not find specified memory zone identifier from hash map.
316     MissingZoneIdentifier,
317 
318     /// Resizing the memory zone failed.
319     ResizeZone,
320 
321     /// Guest address overflow
322     GuestAddressOverFlow,
323 
324     /// Error opening snapshot file
325     SnapshotOpen(io::Error),
326 
327     /// Error copying snapshot into region
328     SnapshotCopy(GuestMemoryError),
329 
330     /// Failed to allocate MMIO address
331     AllocateMmioAddress,
332 
333     #[cfg(target_arch = "aarch64")]
334     /// Failed to create UEFI flash
335     CreateUefiFlash(HypervisorVmError),
336 
337     /// Using a directory as a backing file for memory is not supported
338     DirectoryAsBackingFileForMemory,
339 
340     /// Failed to stat filesystem
341     GetFileSystemBlockSize(io::Error),
342 
343     /// Memory size is misaligned with default page size or its hugepage size
344     MisalignedMemorySize,
345 }
346 
347 const ENABLE_FLAG: usize = 0;
348 const INSERTING_FLAG: usize = 1;
349 const REMOVING_FLAG: usize = 2;
350 const EJECT_FLAG: usize = 3;
351 
352 const BASE_OFFSET_LOW: u64 = 0;
353 const BASE_OFFSET_HIGH: u64 = 0x4;
354 const LENGTH_OFFSET_LOW: u64 = 0x8;
355 const LENGTH_OFFSET_HIGH: u64 = 0xC;
356 const STATUS_OFFSET: u64 = 0x14;
357 const SELECTION_OFFSET: u64 = 0;
358 
359 // 64k is subtracted from the MMIO address space size. This is done for the
360 // following reasons:
361 //  - Reduce the addressable space size by at least 4k to work around a Linux
362 //    bug when the VMM allocates devices at the end of the addressable space
363 //  - Windows requires the addressable space size to be 64k aligned
364 fn mmio_address_space_size(phys_bits: u8) -> u64 {
365     (1 << phys_bits) - (1 << 16)
366 }
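
// A minimal sketch of the function above (illustrative test added here, not in
// the original source): with 40 physical address bits the usable MMIO space is
// 1 TiB minus 64 KiB, which keeps the 64 KiB alignment Windows requires.
#[cfg(test)]
mod mmio_address_space_size_tests {
    use super::mmio_address_space_size;

    #[test]
    fn subtracts_64k_and_stays_64k_aligned() {
        let size = mmio_address_space_size(40);
        assert_eq!(size, (1u64 << 40) - (1 << 16));
        // The result remains 64 KiB-aligned.
        assert_eq!(size % (1 << 16), 0);
    }
}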
367 
368 // The `statfs` function can return information about a hugetlbfs, and the hugepage size is
369 // reported in the `f_bsize` field.
370 //
371 // See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169
372 fn statfs_get_bsize(path: &str) -> Result<u64, Error> {
373     let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?;
374     let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit();
375 
376     // SAFETY: FFI call with a valid path and buffer
377     let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) };
378     if ret != 0 {
379         return Err(Error::GetFileSystemBlockSize(
380             std::io::Error::last_os_error(),
381         ));
382     }
383 
384     // SAFETY: `buf` is valid at this point
385     // Because this value is always positive, just convert it directly.
386     // Note that `f_bsize` is `i64` in glibc but `u64` in musl, so `as u64` would trigger a
387     // `clippy` warning on musl targets.  To avoid the warning, `as _` is used instead of
388     // `as u64`.
389     let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _;
390     Ok(bsize)
391 }
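
// An illustrative check (a sketch, not in the original source): on any Linux
// filesystem `statfs_get_bsize` reports a positive block size; the exact value
// is environment-dependent (e.g. 2 MiB on a default hugetlbfs mount).
#[cfg(test)]
mod statfs_get_bsize_tests {
    use super::statfs_get_bsize;

    #[test]
    fn reports_positive_block_size() {
        // "/" always exists on Linux, which is the only supported target.
        assert!(statfs_get_bsize("/").unwrap() > 0);
    }
}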
392 
393 fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> {
394     // SAFETY: FFI call. Trivially safe.
395     let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
396 
397     // If there is no backing file and `hugepages` is disabled, just use the system page size.
398     if zone.file.is_none() && !zone.hugepages {
399         return Ok(page_size);
400     }
401 
402     // If `hugepages` is enabled and `hugepage_size` is specified, just use it directly.
403     if zone.hugepages && zone.hugepage_size.is_some() {
404         return Ok(zone.hugepage_size.unwrap());
405     }
406 
407     // There are two scenarios here:
408     //  - `hugepages` is enabled but `hugepage_size` is not specified:
409     //     Call `statfs` for `/dev/hugepages` for getting the default size of hugepage
410     //  - The backing file is specified:
411 //     Call `statfs` for the file and get its `f_bsize`.  If the value is larger than the
412 //     normal page size, use `f_bsize` because the file is on a hugetlbfs.  If the
413 //     value is less than or equal to the page size, just use the page size.
414     let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| {
415         pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
416     })?;
417 
418     let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
419 
420     Ok(align_size)
421 }
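
// A sketch of the rule above (illustrative test, not in the original source);
// it assumes `MemoryZoneConfig` has exactly the fields used by
// `validate_memory_config()` further down in this file.
#[cfg(test)]
mod memory_zone_align_tests {
    use super::memory_zone_get_align_size;
    use crate::config::MemoryZoneConfig;

    #[test]
    fn explicit_hugepage_size_is_used_directly() {
        let zone = MemoryZoneConfig {
            id: String::from("mem0"),
            size: 1 << 30,
            file: None,
            shared: false,
            hugepages: true,
            hugepage_size: Some(2 << 20), // 2 MiB
            host_numa_node: None,
            hotplug_size: None,
            hotplugged_size: None,
            prefault: false,
        };
        // With hugepages on and a size given, that size is the alignment.
        assert_eq!(memory_zone_get_align_size(&zone).unwrap(), 2 << 20);
    }
}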
422 
423 #[inline]
424 fn align_down<T>(val: T, align: T) -> T
425 where
426     T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>,
427 {
428     val & !(align - 1u8.into())
429 }
430 
431 #[inline]
432 fn is_aligned<T>(val: T, align: T) -> bool
433 where
434     T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq,
435 {
436     (val & (align - 1u8.into())) == 0u8.into()
437 }
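
// Worked examples for the two helpers above (illustrative test, not in the
// original source). Both helpers assume `align` is a power of two.
#[cfg(test)]
mod align_helper_tests {
    use super::{align_down, is_aligned};

    #[test]
    fn align_down_and_is_aligned() {
        assert_eq!(align_down(0x1234u64, 0x1000), 0x1000);
        assert!(is_aligned(0x2000u64, 0x1000));
        assert!(!is_aligned(0x2001u64, 0x1000));
    }
}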
438 
439 impl BusDevice for MemoryManager {
440     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
441         if self.selected_slot < self.hotplug_slots.len() {
442             let state = &self.hotplug_slots[self.selected_slot];
443             match offset {
444                 BASE_OFFSET_LOW => {
445                     data.copy_from_slice(&state.base.to_le_bytes()[..4]);
446                 }
447                 BASE_OFFSET_HIGH => {
448                     data.copy_from_slice(&state.base.to_le_bytes()[4..]);
449                 }
450                 LENGTH_OFFSET_LOW => {
451                     data.copy_from_slice(&state.length.to_le_bytes()[..4]);
452                 }
453                 LENGTH_OFFSET_HIGH => {
454                     data.copy_from_slice(&state.length.to_le_bytes()[4..]);
455                 }
456                 STATUS_OFFSET => {
457                     // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
458                     data.fill(0);
459                     if state.active {
460                         data[0] |= 1 << ENABLE_FLAG;
461                     }
462                     if state.inserting {
463                         data[0] |= 1 << INSERTING_FLAG;
464                     }
465                     if state.removing {
466                         data[0] |= 1 << REMOVING_FLAG;
467                     }
468                 }
469                 _ => {
470                     warn!(
471                         "Unexpected offset for accessing memory manager device: {:#}",
472                         offset
473                     );
474                 }
475             }
476         } else {
477             warn!("Out of range memory slot: {}", self.selected_slot);
478         }
479     }
480 
481     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
482         match offset {
483             SELECTION_OFFSET => {
484                 self.selected_slot = usize::from(data[0]);
485             }
486             STATUS_OFFSET => {
487                 if self.selected_slot < self.hotplug_slots.len() {
488                     let state = &mut self.hotplug_slots[self.selected_slot];
489                     // The ACPI code writes back a 1 to acknowledge the insertion
490                     if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
491                         state.inserting = false;
492                     }
493                     // Ditto for removal
494                     if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
495                         state.removing = false;
496                     }
497                     // Trigger removal of "DIMM"
498                     if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
499                         warn!("Ejection of memory not currently supported");
500                     }
501                 } else {
502                     warn!("Out of range memory slot: {}", self.selected_slot);
503                 }
504             }
505             _ => {
506                 warn!(
507                     "Unexpected offset for accessing memory manager device: {:#}",
508                     offset
509                 );
510             }
511         };
512         None
513     }
514 }
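
// Illustrative guest-side view of the register block implemented above (a
// hypothetical pseudo-driver sequence, not code from this file): the guest
// first writes a slot index to SELECTION_OFFSET, then reads STATUS_OFFSET and
// acknowledges pending events by writing the corresponding flag back:
//
//     write(SELECTION_OFFSET, slot);
//     let status = read(STATUS_OFFSET);
//     if status & (1 << INSERTING_FLAG) != 0 {
//         write(STATUS_OFFSET, 1 << INSERTING_FLAG); // ack the insertion
//     }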
515 
516 impl MemoryManager {
517     /// Creates all memory regions based on the available RAM ranges defined
518     /// by `ram_regions`, and based on the description of the memory zones.
519     /// In practice, this function can perform multiple memory mappings of the
520     /// same backing file if there's a hole in the address space between two
521     /// RAM ranges.
522     /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G)
523     /// and zones containing two zones (size 1G and size 4G).
524     /// This function will create 3 resulting memory regions:
525     /// - First one mapping entirely the first memory zone on 0-1G range
526     /// - Second one mapping partially the second memory zone on 1G-3G range
527     /// - Third one mapping partially the second memory zone on 4G-6G range
528     /// Also, all memory regions are page-size aligned (i.e. their sizes must
529     /// be multiples of the page size), which may leave an additional hole in
530     /// the address space when hugepages are used.
531     fn create_memory_regions_from_zones(
532         ram_regions: &[(GuestAddress, usize)],
533         zones: &[MemoryZoneConfig],
534         prefault: Option<bool>,
535         thp: bool,
536     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
537         let mut zone_iter = zones.iter();
538         let mut mem_regions = Vec::new();
539         let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?;
540         let mut zone_align_size = memory_zone_get_align_size(zone)?;
541         let mut zone_offset = 0u64;
542         let mut memory_zones = HashMap::new();
543 
544         if !is_aligned(zone.size, zone_align_size) {
545             return Err(Error::MisalignedMemorySize);
546         }
547 
548         // Add zone id to the list of memory zones.
549         memory_zones.insert(zone.id.clone(), MemoryZone::default());
550 
551         for ram_region in ram_regions.iter() {
552             let mut ram_region_offset = 0;
553             let mut exit = false;
554 
555             loop {
556                 let mut ram_region_consumed = false;
557                 let mut pull_next_zone = false;
558 
559                 let ram_region_available_size =
560                     align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size);
561                 if ram_region_available_size == 0 {
562                     break;
563                 }
564                 let zone_sub_size = zone.size - zone_offset;
565 
566                 let file_offset = zone_offset;
567                 let region_start = ram_region
568                     .0
569                     .checked_add(ram_region_offset)
570                     .ok_or(Error::GuestAddressOverFlow)?;
571                 let region_size = if zone_sub_size <= ram_region_available_size {
572                     if zone_sub_size == ram_region_available_size {
573                         ram_region_consumed = true;
574                     }
575 
576                     ram_region_offset += zone_sub_size;
577                     pull_next_zone = true;
578 
579                     zone_sub_size
580                 } else {
581                     zone_offset += ram_region_available_size;
582                     ram_region_consumed = true;
583 
584                     ram_region_available_size
585                 };
586 
587                 info!(
588                     "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}",
589                     zone.id,
590                     region_start.raw_value(),
591                     region_size
592                 );
593                 let region = MemoryManager::create_ram_region(
594                     &zone.file,
595                     file_offset,
596                     region_start,
597                     region_size as usize,
598                     prefault.unwrap_or(zone.prefault),
599                     zone.shared,
600                     zone.hugepages,
601                     zone.hugepage_size,
602                     zone.host_numa_node,
603                     None,
604                     thp,
605                 )?;
606 
607                 // Add region to the list of regions associated with the
608                 // current memory zone.
609                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
610                     memory_zone.regions.push(region.clone());
611                 }
612 
613                 mem_regions.push(region);
614 
615                 if pull_next_zone {
616                     // Get the next zone and reset the offset.
617                     zone_offset = 0;
618                     if let Some(z) = zone_iter.next() {
619                         zone = z;
620                     } else {
621                         exit = true;
622                         break;
623                     }
624                     zone_align_size = memory_zone_get_align_size(zone)?;
625                     if !is_aligned(zone.size, zone_align_size) {
626                         return Err(Error::MisalignedMemorySize);
627                     }
628 
629                     // Check if the zone id already exists. In case it does, throw
630                     // an error as we need unique identifiers. Otherwise, add
631                     // the new zone id to the list of memory zones.
632                     if memory_zones.contains_key(&zone.id) {
633                         error!(
634                             "Memory zone identifier '{}' found more than once. \
635                             It must be unique",
636                             zone.id,
637                         );
638                         return Err(Error::DuplicateZoneId);
639                     }
640                     memory_zones.insert(zone.id.clone(), MemoryZone::default());
641                 }
642 
643                 if ram_region_consumed {
644                     break;
645                 }
646             }
647 
648             if exit {
649                 break;
650             }
651         }
652 
653         Ok((mem_regions, memory_zones))
654     }
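
    // Trace of the doc-comment example above (added for illustration): with
    // ram_regions = [(0G, 3G), (4G, 2G)] and zones of sizes [1G, 4G], the
    // loop produces:
    //   iteration 1: region 0G..1G (first zone fully mapped, pull next zone)
    //   iteration 2: region 1G..3G (first RAM range consumed, zone offset 2G)
    //   iteration 3: region 4G..6G (second zone fully mapped, exit)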
655 
656     // Restore both the GuestMemory regions and the MemoryZone zones.
657     fn restore_memory_regions_and_zones(
658         guest_ram_mappings: &[GuestRamMapping],
659         zones_config: &[MemoryZoneConfig],
660         prefault: Option<bool>,
661         mut existing_memory_files: HashMap<u32, File>,
662         thp: bool,
663     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
664         let mut memory_regions = Vec::new();
665         let mut memory_zones = HashMap::new();
666 
667         for zone_config in zones_config {
668             memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
669         }
670 
671         for guest_ram_mapping in guest_ram_mappings {
672             for zone_config in zones_config {
673                 if guest_ram_mapping.zone_id == zone_config.id {
674                     let region = MemoryManager::create_ram_region(
675                         &zone_config.file,
676                         guest_ram_mapping.file_offset,
677                         GuestAddress(guest_ram_mapping.gpa),
678                         guest_ram_mapping.size as usize,
679                         prefault.unwrap_or(zone_config.prefault),
680                         zone_config.shared,
681                         zone_config.hugepages,
682                         zone_config.hugepage_size,
683                         zone_config.host_numa_node,
684                         existing_memory_files.remove(&guest_ram_mapping.slot),
685                         thp,
686                     )?;
687                     memory_regions.push(Arc::clone(&region));
688                     if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
689                         if guest_ram_mapping.virtio_mem {
690                             let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
691                             let region_size = region.len();
692                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
693                                 region,
694                                 virtio_device: None,
695                                 hotplugged_size,
696                                 hugepages: zone_config.hugepages,
697                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
698                             });
699                         } else {
700                             memory_zone.regions.push(region);
701                         }
702                     }
703                 }
704             }
705         }
706 
707         memory_regions.sort_by_key(|x| x.start_addr());
708 
709         Ok((memory_regions, memory_zones))
710     }
711 
712     fn fill_saved_regions(
713         &mut self,
714         file_path: PathBuf,
715         saved_regions: MemoryRangeTable,
716     ) -> Result<(), Error> {
717         if saved_regions.is_empty() {
718             return Ok(());
719         }
720 
721         // Open (read only) the snapshot file.
722         let mut memory_file = OpenOptions::new()
723             .read(true)
724             .open(file_path)
725             .map_err(Error::SnapshotOpen)?;
726 
727         let guest_memory = self.guest_memory.memory();
728         for range in saved_regions.regions() {
729             let mut offset: u64 = 0;
730             // Here we are manually handling the retry in case we can't fill
731             // the whole region at once, because we can't use the
732             // read_exact_from() implementation from vm-memory's GuestMemory:
733             // it does not follow the correct behavior. For more info about
734             // this issue see: https://github.com/rust-vmm/vm-memory/issues/174
735             loop {
736                 let bytes_read = guest_memory
737                     .read_volatile_from(
738                         GuestAddress(range.gpa + offset),
739                         &mut memory_file,
740                         (range.length - offset) as usize,
741                     )
742                     .map_err(Error::SnapshotCopy)?;
743                 offset += bytes_read as u64;
744 
745                 if offset == range.length {
746                     break;
747                 }
748             }
749         }
750 
751         Ok(())
752     }
753 
754     fn validate_memory_config(
755         config: &MemoryConfig,
756         user_provided_zones: bool,
757     ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
758         let mut allow_mem_hotplug = false;
759 
760         if !user_provided_zones {
761             if config.zones.is_some() {
762                 error!(
763                     "User defined memory regions can't be provided if the \
764                     memory size is not 0"
765                 );
766                 return Err(Error::InvalidMemoryParameters);
767             }
768 
769             if config.hotplug_size.is_some() {
770                 allow_mem_hotplug = true;
771             }
772 
773             if let Some(hotplugged_size) = config.hotplugged_size {
774                 if let Some(hotplug_size) = config.hotplug_size {
775                     if hotplugged_size > hotplug_size {
776                         error!(
777                             "'hotplugged_size' {} can't be bigger than \
778                             'hotplug_size' {}",
779                             hotplugged_size, hotplug_size,
780                         );
781                         return Err(Error::InvalidMemoryParameters);
782                     }
783                 } else {
784                     error!(
785                         "Invalid to define 'hotplugged_size' when there is\
786                         no 'hotplug_size'"
787                     );
788                     return Err(Error::InvalidMemoryParameters);
789                 }
790                 if config.hotplug_method == HotplugMethod::Acpi {
791                     error!(
792                         "Invalid to define 'hotplugged_size' with hotplug \
793                         method 'acpi'"
794                     );
795                     return Err(Error::InvalidMemoryParameters);
796                 }
797             }
798 
799             // Create a single zone from the global memory config. This lets
800             // us reuse the codepath for user defined memory zones.
801             let zones = vec![MemoryZoneConfig {
802                 id: String::from(DEFAULT_MEMORY_ZONE),
803                 size: config.size,
804                 file: None,
805                 shared: config.shared,
806                 hugepages: config.hugepages,
807                 hugepage_size: config.hugepage_size,
808                 host_numa_node: None,
809                 hotplug_size: config.hotplug_size,
810                 hotplugged_size: config.hotplugged_size,
811                 prefault: config.prefault,
812             }];
813 
814             Ok((config.size, zones, allow_mem_hotplug))
815         } else {
816             if config.zones.is_none() {
817                 error!(
818                     "User defined memory regions must be provided if the \
819                     memory size is 0"
820                 );
821                 return Err(Error::MissingMemoryZones);
822             }
823 
824             // Safe to unwrap as we checked right above that some zones
825             // were provided.
826             let zones = config.zones.clone().unwrap();
827             if zones.is_empty() {
828                 return Err(Error::MissingMemoryZones);
829             }
830 
831             let mut total_ram_size: u64 = 0;
832             for zone in zones.iter() {
833                 total_ram_size += zone.size;
834 
835                 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
836                     error!(
837                         "Invalid to set host NUMA policy for a memory zone \
838                         backed by a regular file and mapped as 'shared'"
839                     );
840                     return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
841                 }
842 
843                 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
844                     error!("Invalid to set ACPI hotplug method for memory zones");
845                     return Err(Error::InvalidHotplugMethodWithMemoryZones);
846                 }
847 
848                 if let Some(hotplugged_size) = zone.hotplugged_size {
849                     if let Some(hotplug_size) = zone.hotplug_size {
850                         if hotplugged_size > hotplug_size {
851                             error!(
852                                 "'hotplugged_size' {} can't be bigger than \
853                                 'hotplug_size' {}",
854                                 hotplugged_size, hotplug_size,
855                             );
856                             return Err(Error::InvalidMemoryParameters);
857                         }
858                     } else {
859                         error!(
860                             "Invalid to define 'hotplugged_size' when there is\
861                             no 'hotplug_size' for a memory zone"
862                         );
863                         return Err(Error::InvalidMemoryParameters);
864                     }
865                     if config.hotplug_method == HotplugMethod::Acpi {
866                         error!(
867                             "Invalid to define 'hotplugged_size' with hotplug \
868                             method 'acpi'"
869                         );
870                         return Err(Error::InvalidMemoryParameters);
871                     }
872                 }
873             }
874 
875             Ok((total_ram_size, zones, allow_mem_hotplug))
876         }
877     }
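
    // Illustrative mapping from the command line to this validation (a
    // sketch; the flag spellings come from the Cloud Hypervisor CLI and are
    // not defined in this file):
    //
    //     --memory size=1G
    //         -> user_provided_zones == false, one implicit zone "mem0"
    //     --memory size=0 --memory-zone id=fast,size=1G,shared=on
    //         -> user_provided_zones == true, zones taken from config.zones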
878 
879     pub fn allocate_address_space(&mut self) -> Result<(), Error> {
880         let mut list = Vec::new();
881 
882         for (zone_id, memory_zone) in self.memory_zones.iter() {
883             let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
884                 memory_zone
885                     .regions()
886                     .iter()
887                     .map(|r| (r.clone(), false))
888                     .collect();
889 
890             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
891                 regions.push((virtio_mem_zone.region().clone(), true));
892             }
893 
894             list.push((zone_id.clone(), regions));
895         }
896 
897         for (zone_id, regions) in list {
898             for (region, virtio_mem) in regions {
899                 let slot = self.create_userspace_mapping(
900                     region.start_addr().raw_value(),
901                     region.len(),
902                     region.as_ptr() as u64,
903                     self.mergeable,
904                     false,
905                     self.log_dirty,
906                 )?;
907 
908                 let file_offset = if let Some(file_offset) = region.file_offset() {
909                     file_offset.start()
910                 } else {
911                     0
912                 };
913 
914                 self.guest_ram_mappings.push(GuestRamMapping {
915                     gpa: region.start_addr().raw_value(),
916                     size: region.len(),
917                     slot,
918                     zone_id: zone_id.clone(),
919                     virtio_mem,
920                     file_offset,
921                 });
922                 self.ram_allocator
923                     .allocate(Some(region.start_addr()), region.len(), None)
924                     .ok_or(Error::MemoryRangeAllocation)?;
925             }
926         }
927 
928         // Allocate SubRegion and Reserved address ranges.
929         for region in self.arch_mem_regions.iter() {
930             if region.r_type == RegionType::Ram {
931                 // Ignore the RAM type since ranges have already been allocated
932                 // based on the GuestMemory regions.
933                 continue;
934             }
935             self.ram_allocator
936                 .allocate(
937                     Some(GuestAddress(region.base)),
938                     region.size as GuestUsize,
939                     None,
940                 )
941                 .ok_or(Error::MemoryRangeAllocation)?;
942         }
943 
944         Ok(())
945     }
946 
947     #[cfg(target_arch = "aarch64")]
948     fn add_uefi_flash(&mut self) -> Result<(), Error> {
949         // On AArch64, the UEFI binary requires a flash device at address 0.
950         // 4 MiB memory is mapped to simulate the flash.
951         let uefi_mem_slot = self.allocate_memory_slot();
952         let uefi_region = GuestRegionMmap::new(
953             MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(),
954             arch::layout::UEFI_START,
955         )
956         .unwrap();
957         let uefi_mem_region = self.vm.make_user_memory_region(
958             uefi_mem_slot,
959             uefi_region.start_addr().raw_value(),
960             uefi_region.len(),
961             uefi_region.as_ptr() as u64,
962             false,
963             false,
964         );
965         self.vm
966             .create_user_memory_region(uefi_mem_region)
967             .map_err(Error::CreateUefiFlash)?;
968 
969         let uefi_flash =
970             GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap());
971 
972         self.uefi_flash = Some(uefi_flash);
973 
974         Ok(())
975     }
976 
977     #[allow(clippy::too_many_arguments)]
978     pub fn new(
979         vm: Arc<dyn hypervisor::Vm>,
980         config: &MemoryConfig,
981         prefault: Option<bool>,
982         phys_bits: u8,
983         #[cfg(feature = "tdx")] tdx_enabled: bool,
984         restore_data: Option<&MemoryManagerSnapshotData>,
985         existing_memory_files: Option<HashMap<u32, File>>,
986         #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
987     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
988         trace_scoped!("MemoryManager::new");
989 
990         let user_provided_zones = config.size == 0;
991 
992         let mmio_address_space_size = mmio_address_space_size(phys_bits);
993         debug_assert_eq!(
994             (((mmio_address_space_size) >> 16) << 16),
995             mmio_address_space_size
996         );
997         let start_of_platform_device_area =
998             GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
999         let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);
1000 
1001         let (ram_size, zones, allow_mem_hotplug) =
1002             Self::validate_memory_config(config, user_provided_zones)?;
1003 
1004         let (
1005             start_of_device_area,
1006             boot_ram,
1007             current_ram,
1008             arch_mem_regions,
1009             memory_zones,
1010             guest_memory,
1011             boot_guest_memory,
1012             hotplug_slots,
1013             next_memory_slot,
1014             selected_slot,
1015             next_hotplug_slot,
1016         ) = if let Some(data) = restore_data {
1017             let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
1018                 &data.guest_ram_mappings,
1019                 &zones,
1020                 prefault,
1021                 existing_memory_files.unwrap_or_default(),
1022                 config.thp,
1023             )?;
1024             let guest_memory =
1025                 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
1026             let boot_guest_memory = guest_memory.clone();
1027             (
1028                 GuestAddress(data.start_of_device_area),
1029                 data.boot_ram,
1030                 data.current_ram,
1031                 data.arch_mem_regions.clone(),
1032                 memory_zones,
1033                 guest_memory,
1034                 boot_guest_memory,
1035                 data.hotplug_slots.clone(),
1036                 data.next_memory_slot,
1037                 data.selected_slot,
1038                 data.next_hotplug_slot,
1039             )
1040         } else {
1041             // Init guest memory
1042             let arch_mem_regions = arch::arch_memory_regions();
1043 
1044             let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
1045                 .iter()
1046                 .filter(|r| r.2 == RegionType::Ram)
1047                 .map(|r| (r.0, r.1))
1048                 .collect();
1049 
1050             let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
1051                 .iter()
1052                 .map(|(a, b, c)| ArchMemRegion {
1053                     base: a.0,
1054                     size: *b,
1055                     r_type: *c,
1056                 })
1057                 .collect();
1058 
1059             let (mem_regions, mut memory_zones) =
1060                 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?;
1061 
1062             let mut guest_memory =
1063                 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;
1064 
1065             let boot_guest_memory = guest_memory.clone();
1066 
1067             let mut start_of_device_area =
1068                 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;
1069 
1070             // Update list of memory zones for resize.
1071             for zone in zones.iter() {
1072                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
1073                     if let Some(hotplug_size) = zone.hotplug_size {
1074                         if hotplug_size == 0 {
1075                             error!("'hotplug_size' can't be 0");
1076                             return Err(Error::InvalidHotplugSize);
1077                         }
1078 
1079                         if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
1080                             start_of_device_area = start_of_device_area
1081                                 .checked_add(hotplug_size)
1082                                 .ok_or(Error::GuestAddressOverFlow)?;
1083                         } else {
1084                             // Alignment must be "natural" i.e. same as size of block
1085                             let start_addr = GuestAddress(
1086                                 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE
1087                                     - 1)
1088                                     / virtio_devices::VIRTIO_MEM_ALIGN_SIZE
1089                                     * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
1090                             );
1091 
1092                             // When `prefault` is set by vm_restore, the memory manager
1093                             // will create the RAM region with the `prefault` option from
1094                             // the restore config rather than the zone's own option.
1095                             let region = MemoryManager::create_ram_region(
1096                                 &None,
1097                                 0,
1098                                 start_addr,
1099                                 hotplug_size as usize,
1100                                 prefault.unwrap_or(zone.prefault),
1101                                 zone.shared,
1102                                 zone.hugepages,
1103                                 zone.hugepage_size,
1104                                 zone.host_numa_node,
1105                                 None,
1106                                 config.thp,
1107                             )?;
1108 
1109                             guest_memory = guest_memory
1110                                 .insert_region(Arc::clone(&region))
1111                                 .map_err(Error::GuestMemory)?;
1112 
1113                             let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
1114                             let region_size = region.len();
1115                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
1116                                 region,
1117                                 virtio_device: None,
1118                                 hotplugged_size,
1119                                 hugepages: zone.hugepages,
1120                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
1121                             });
1122 
1123                             start_of_device_area = start_addr
1124                                 .checked_add(hotplug_size)
1125                                 .ok_or(Error::GuestAddressOverFlow)?;
1126                         }
1127                     }
1128                 } else {
1129                     return Err(Error::MissingZoneIdentifier);
1130                 }
1131             }
1132 
1133             let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
1134             hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);
1135 
1136             (
1137                 start_of_device_area,
1138                 ram_size,
1139                 ram_size,
1140                 arch_mem_regions,
1141                 memory_zones,
1142                 guest_memory,
1143                 boot_guest_memory,
1144                 hotplug_slots,
1145                 0,
1146                 0,
1147                 0,
1148             )
1149         };
1150 
1151         let guest_memory = GuestMemoryAtomic::new(guest_memory);
1152 
1153         // Both MMIO and PIO address spaces start at address 0.
1154         let allocator = Arc::new(Mutex::new(
1155             SystemAllocator::new(
1156                 #[cfg(target_arch = "x86_64")]
1157                 {
1158                     GuestAddress(0)
1159                 },
1160                 #[cfg(target_arch = "x86_64")]
1161                 {
1162                     1 << 16
1163                 },
1164                 start_of_platform_device_area,
1165                 PLATFORM_DEVICE_AREA_SIZE,
1166                 #[cfg(target_arch = "x86_64")]
1167                 vec![GsiApic::new(
1168                     X86_64_IRQ_BASE,
1169                     ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
1170                 )],
1171             )
1172             .ok_or(Error::CreateSystemAllocator)?,
1173         ));
1174 
1175         #[cfg(not(feature = "tdx"))]
1176         let dynamic = true;
1177         #[cfg(feature = "tdx")]
1178         let dynamic = !tdx_enabled;
1179 
1180         let acpi_address = if dynamic
1181             && config.hotplug_method == HotplugMethod::Acpi
1182             && (config.hotplug_size.unwrap_or_default() > 0)
1183         {
1184             Some(
1185                 allocator
1186                     .lock()
1187                     .unwrap()
1188                     .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
1189                     .ok_or(Error::AllocateMmioAddress)?,
1190             )
1191         } else {
1192             None
1193         };
1194 
1195         // If running with SGX, the start of the device area and the RAM area may diverge, but
1196         // at this point they are next to each other.
1197         let end_of_ram_area = start_of_device_area.unchecked_sub(1);
1198         let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();
1199 
1200         let mut memory_manager = MemoryManager {
1201             boot_guest_memory,
1202             guest_memory,
1203             next_memory_slot,
1204             start_of_device_area,
1205             end_of_device_area,
1206             end_of_ram_area,
1207             vm,
1208             hotplug_slots,
1209             selected_slot,
1210             mergeable: config.mergeable,
1211             allocator,
1212             hotplug_method: config.hotplug_method,
1213             boot_ram,
1214             current_ram,
1215             next_hotplug_slot,
1216             shared: config.shared,
1217             hugepages: config.hugepages,
1218             hugepage_size: config.hugepage_size,
1219             prefault: config.prefault,
1220             #[cfg(target_arch = "x86_64")]
1221             sgx_epc_region: None,
1222             user_provided_zones,
1223             snapshot_memory_ranges: MemoryRangeTable::default(),
1224             memory_zones,
1225             guest_ram_mappings: Vec::new(),
1226             acpi_address,
1227             log_dirty: dynamic, // Cannot log dirty pages on a TD
1228             arch_mem_regions,
1229             ram_allocator,
1230             dynamic,
1231             #[cfg(target_arch = "aarch64")]
1232             uefi_flash: None,
1233             thp: config.thp,
1234         };
1235 
1236         #[cfg(target_arch = "aarch64")]
1237         {
1238             // For Aarch64 we cannot lazily allocate the address space like we
1239             // do for x86, because while restoring a VM from snapshot we would
1240             // need the address space to be allocated to properly restore VGIC.
1241             // And the restore of VGIC happens before we attempt to run the vCPUs
1242             // for the first time, thus we need to allocate the address space
1243             // beforehand.
1244             memory_manager.allocate_address_space()?;
1245             memory_manager.add_uefi_flash()?;
1246         }
1247 
1248         #[cfg(target_arch = "x86_64")]
1249         if let Some(sgx_epc_config) = sgx_epc_config {
1250             memory_manager.setup_sgx(sgx_epc_config)?;
1251         }
1252 
1253         Ok(Arc::new(Mutex::new(memory_manager)))
1254     }
1255 
1256     pub fn new_from_snapshot(
1257         snapshot: &Snapshot,
1258         vm: Arc<dyn hypervisor::Vm>,
1259         config: &MemoryConfig,
1260         source_url: Option<&str>,
1261         prefault: bool,
1262         phys_bits: u8,
1263     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
1264         if let Some(source_url) = source_url {
1265             let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
1266             memory_file_path.push(String::from(SNAPSHOT_FILENAME));
1267 
1268             let mem_snapshot: MemoryManagerSnapshotData =
1269                 snapshot.to_versioned_state().map_err(Error::Restore)?;
1270 
1271             let mm = MemoryManager::new(
1272                 vm,
1273                 config,
1274                 Some(prefault),
1275                 phys_bits,
1276                 #[cfg(feature = "tdx")]
1277                 false,
1278                 Some(&mem_snapshot),
1279                 None,
1280                 #[cfg(target_arch = "x86_64")]
1281                 None,
1282             )?;
1283 
1284             mm.lock()
1285                 .unwrap()
1286                 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;
1287 
1288             Ok(mm)
1289         } else {
1290             Err(Error::RestoreMissingSourceUrl)
1291         }
1292     }
1293 
1294     fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
1295         // SAFETY: FFI call with correct arguments
1296         let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };
1297 
1298         if res < 0 {
1299             Err(io::Error::last_os_error())
1300         } else {
1301             Ok(res as RawFd)
1302         }
1303     }
1304 
1305     fn mbind(
1306         addr: *mut u8,
1307         len: u64,
1308         mode: u32,
1309         nodemask: Vec<u64>,
1310         maxnode: u64,
1311         flags: u32,
1312     ) -> Result<(), io::Error> {
1313         // SAFETY: FFI call with correct arguments
1314         let res = unsafe {
1315             libc::syscall(
1316                 libc::SYS_mbind,
1317                 addr as *mut libc::c_void,
1318                 len,
1319                 mode,
1320                 nodemask.as_ptr(),
1321                 maxnode,
1322                 flags,
1323             )
1324         };
1325 
1326         if res < 0 {
1327             Err(io::Error::last_os_error())
1328         } else {
1329             Ok(())
1330         }
1331     }
1332 
1333     fn create_anonymous_file(
1334         size: usize,
1335         hugepages: bool,
1336         hugepage_size: Option<u64>,
1337     ) -> Result<FileOffset, Error> {
1338         let fd = Self::memfd_create(
1339             &ffi::CString::new("ch_ram").unwrap(),
1340             libc::MFD_CLOEXEC
1341                 | if hugepages {
1342                     libc::MFD_HUGETLB
1343                         | if let Some(hugepage_size) = hugepage_size {
1344                             /*
1345                              * From the Linux kernel:
1346                              * Several system calls take a flag to request "hugetlb" huge pages.
1347                              * Without further specification, these system calls will use the
1348                              * system's default huge page size.  If a system supports multiple
1349                              * huge page sizes, the desired huge page size can be specified in
1350                              * bits [26:31] of the flag arguments.  The value in these 6 bits
1351                              * will encode the log2 of the huge page size.
1352                              */
1353 
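                            // Worked example (illustration): for a 2 MiB huge
                            // page, trailing_zeros() is 21, so this evaluates
                            // to 21 << 26, i.e. the value of libc::MFD_HUGE_2MB.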
1354                             hugepage_size.trailing_zeros() << 26
1355                         } else {
1356                             // Use the system default huge page size
1357                             0
1358                         }
1359                 } else {
1360                     0
1361                 },
1362         )
1363         .map_err(Error::SharedFileCreate)?;
1364 
1365         // SAFETY: fd is valid
1366         let f = unsafe { File::from_raw_fd(fd) };
1367         f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1368 
1369         Ok(FileOffset::new(f, 0))
1370     }
1371 
1372     fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> {
1373         if backing_file.is_dir() {
1374             Err(Error::DirectoryAsBackingFileForMemory)
1375         } else {
1376             let f = OpenOptions::new()
1377                 .read(true)
1378                 .write(true)
1379                 .open(backing_file)
1380                 .map_err(Error::SharedFileCreate)?;
1381 
1382             Ok(FileOffset::new(f, file_offset))
1383         }
1384     }
1385 
1386     #[allow(clippy::too_many_arguments)]
1387     pub fn create_ram_region(
1388         backing_file: &Option<PathBuf>,
1389         file_offset: u64,
1390         start_addr: GuestAddress,
1391         size: usize,
1392         prefault: bool,
1393         shared: bool,
1394         hugepages: bool,
1395         hugepage_size: Option<u64>,
1396         host_numa_node: Option<u32>,
1397         existing_memory_file: Option<File>,
1398         thp: bool,
1399     ) -> Result<Arc<GuestRegionMmap>, Error> {
1400         let mut mmap_flags = libc::MAP_NORESERVE;
1401 
1402         // The duplication of mmap_flags ORing here is unfortunate but it also makes
1403         // the complexity of the handling clear.
1404         let fo = if let Some(f) = existing_memory_file {
1405             // It must be MAP_SHARED, as we wouldn't already have an FD otherwise
1406             mmap_flags |= libc::MAP_SHARED;
1407             Some(FileOffset::new(f, file_offset))
1408         } else if let Some(backing_file) = backing_file {
1409             if shared {
1410                 mmap_flags |= libc::MAP_SHARED;
1411             } else {
1412                 mmap_flags |= libc::MAP_PRIVATE;
1413             }
1414             Some(Self::open_backing_file(backing_file, file_offset)?)
1415         } else if shared || hugepages {
1416             // For hugepages we must also use MAP_SHARED, otherwise we trigger #4805:
1417             // MAP_PRIVATE would cause CoW against the backing file, which breaks
1418             // VFIO page pinning.
1419             mmap_flags |= libc::MAP_SHARED;
1420             Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?)
1421         } else {
1422             mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
1423             None
1424         };
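         // To summarize the four cases above:
         //   - existing FD:          MAP_SHARED
         //   - backing file:         MAP_SHARED or MAP_PRIVATE, per `shared`
         //   - shared or hugepages:  anonymous memfd, MAP_SHARED
         //   - otherwise:            MAP_PRIVATE | MAP_ANONYMOUS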
1425 
1426         let region = GuestRegionMmap::new(
1427             MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
1428                 .map_err(Error::GuestMemoryRegion)?,
1429             start_addr,
1430         )
1431         .map_err(Error::GuestMemory)?;
1432 
1433         // Apply NUMA policy if needed.
1434         if let Some(node) = host_numa_node {
1435             let addr = region.deref().as_ptr();
1436             let len = region.deref().size() as u64;
1437             let mode = MPOL_BIND;
1438             let mut nodemask: Vec<u64> = Vec::new();
1439             let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
1440 
1441             // Linux is kind of buggy in the way it interprets maxnode as it
1442             // will cut off the last node. That's why we have to add 1 to what
1443             // we would consider as the proper maxnode value.
1444             let maxnode = node as u64 + 1 + 1;
1445 
1446             // Allocate the right size for the vector.
1447             nodemask.resize((node as usize / 64) + 1, 0);
1448 
1449             // Fill the global bitmask through the nodemask vector.
1450             let idx = (node / 64) as usize;
1451             let shift = node % 64;
1452             nodemask[idx] |= 1u64 << shift;
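             // e.g. for host node 2: idx = 0, shift = 2, so nodemask[0] = 0b100,
             // and maxnode = 2 + 1 + 1 = 4.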
1453 
1454             // Policies are enforced by using MPOL_MF_MOVE flag as it will
1455             // force the kernel to move all pages that might have been already
1456             // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
1457             // used to throw an error if MPOL_MF_MOVE didn't succeed.
1458             // MPOL_BIND is the selected mode as it specifies a strict policy
1459             // that restricts memory allocation to the nodes specified in the
1460             // nodemask.
1461             Self::mbind(addr, len, mode, nodemask, maxnode, flags)
1462                 .map_err(Error::ApplyNumaPolicy)?;
1463         }
1464 
1465         // Prefault the region if needed, in parallel.
1466         if prefault {
1467             let page_size =
1468                 Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize;
1469 
1470             if !is_aligned(size, page_size) {
1471                 warn!(
1472                     "Prefaulting memory size {} is not page-size ({}) aligned; the unaligned tail will not be prefaulted",
1473                     size, page_size
1474                 );
1475             }
1476 
1477             let num_pages = size / page_size;
1478 
1479             let num_threads = Self::get_prefault_num_threads(page_size, num_pages);
1480 
1481             let pages_per_thread = num_pages / num_threads;
1482             let remainder = num_pages % num_threads;
1483 
1484             let barrier = Arc::new(Barrier::new(num_threads));
1485             thread::scope(|s| {
1486                 let r = &region;
1487                 for i in 0..num_threads {
1488                     let barrier = Arc::clone(&barrier);
1489                     s.spawn(move || {
1490                         // Wait until all threads have been spawned to avoid contention
1491                         // over mmap_sem between thread stack allocation and page faulting.
1492                         barrier.wait();
1493                         let pages = pages_per_thread + if i < remainder { 1 } else { 0 };
1494                         let offset =
1495                             page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder));
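                         // MADV_POPULATE_WRITE (Linux 5.14+) makes the kernel
                         // fault in the range for writing, so the pages are
                         // populated without being touched from userspace.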
1496                         // SAFETY: FFI call with correct arguments
1497                         let ret = unsafe {
1498                             let addr = r.as_ptr().add(offset);
1499                             libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE)
1500                         };
1501                         if ret != 0 {
1502                             let e = io::Error::last_os_error();
1503                             warn!("Failed to prefault pages: {}", e);
1504                         }
1505                     });
1506                 }
1507             });
1508         }
1509 
1510         if region.file_offset().is_none() && thp {
1511             info!(
1512                 "Anonymous mapping at 0x{:x} (size = 0x{:x})",
1513                 region.as_ptr() as u64,
1514                 size
1515             );
1516             // SAFETY: FFI call with correct arguments
1517             let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
1518             if ret != 0 {
1519                 let e = io::Error::last_os_error();
1520                 warn!("Failed to mark pages as THP eligible: {}", e);
1521             }
1522         }
1523 
1524         Ok(Arc::new(region))
1525     }
1526 
1527     // Duplicate of `memory_zone_get_align_size` that does not require a `zone`
1528     fn get_prefault_align_size(
1529         backing_file: &Option<PathBuf>,
1530         hugepages: bool,
1531         hugepage_size: Option<u64>,
1532     ) -> Result<u64, Error> {
1533         // SAFETY: FFI call. Trivially safe.
1534         let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
1535         match (hugepages, hugepage_size, backing_file) {
1536             (false, _, _) => Ok(page_size),
1537             (true, Some(hugepage_size), _) => Ok(hugepage_size),
1538             (true, None, _) => {
1539                 // There are two scenarios here:
1540                 //  - `hugepages` is enabled but `hugepage_size` is not specified:
1541                 //     Call `statfs` for `/dev/hugepages` for getting the default size of hugepage
1542                 //  - The backing file is specified:
1543                 //     Call `statfs` for the file and get its `f_bsize`.  If the value is larger than the page
1544                 //     size of normal page, just use the `f_bsize` because the file is in a hugetlbfs.  If the
1545                 //     value is less than or equal to the page size, just use the page size.
1546                 let path = backing_file
1547                     .as_ref()
1548                     .map_or(Ok("/dev/hugepages"), |pathbuf| {
1549                         pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
1550                     })?;
1551                 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
1552                 Ok(align_size)
1553             }
1554         }
1555     }
1556 
1557     fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize {
1558         let mut n: usize = 1;
1559 
1560         // Do not create more threads than processors available.
1561         // SAFETY: FFI call. Trivially safe.
1562         let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) };
1563         if procs > 0 {
1564             n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT);
1565         }
1566 
1567         // Do not create more threads than pages being allocated.
1568         n = std::cmp::min(n, num_pages);
1569 
1570         // Cap the thread count so each thread prefaults at least 64 MiB (1 << 26 bytes).
1571         n = std::cmp::min(
1572             n,
1573             std::cmp::max(1, page_size * num_pages / (1 << 26)),
1574         );
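         // e.g. prefaulting 8 GiB of 4 KiB pages on a 16-CPU host:
         // n = min(16, 2097152, max(1, 8 GiB / 64 MiB)) = min(16, 2097152, 128) = 16.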
1575 
1576         n
1577     }
1578 
1579     // Update the GuestMemoryMmap with the new range
1580     fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
1581         let guest_memory = self
1582             .guest_memory
1583             .memory()
1584             .insert_region(region)
1585             .map_err(Error::GuestMemory)?;
1586         self.guest_memory.lock().unwrap().replace(guest_memory);
1587 
1588         Ok(())
1589     }
1590 
1591     //
1592     // Calculate the start address of an area next to RAM.
1593     //
1594     // If memory hotplug is allowed, the start address needs to be aligned
1595     // (rounded up) to a 128 MiB boundary.
1596     // If memory hotplug is not allowed, there is no alignment required.
1597     // If RAM ends below the 32-bit reserved area, the start is RAM_64BIT_START instead.
1598     fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
1599         let mut start_addr = if allow_mem_hotplug {
1600             GuestAddress(mem_end.0 | ((128 << 20) - 1))
1601         } else {
1602             mem_end
1603         };
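         // e.g. with hotplug allowed and mem_end = 0x5fff_ffff (RAM ending at
         // 1.5 GiB), the OR gives 0x5fff_ffff and the +1 below yields
         // 0x6000_0000, the next 128 MiB boundary.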
1604 
1605         start_addr = start_addr
1606             .checked_add(1)
1607             .ok_or(Error::GuestAddressOverFlow)?;
1608 
1609         if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
1610             return Ok(arch::layout::RAM_64BIT_START);
1611         }
1612 
1613         Ok(start_addr)
1614     }
1615 
1616     pub fn add_ram_region(
1617         &mut self,
1618         start_addr: GuestAddress,
1619         size: usize,
1620     ) -> Result<Arc<GuestRegionMmap>, Error> {
1621         // Allocate memory for the region
1622         let region = MemoryManager::create_ram_region(
1623             &None,
1624             0,
1625             start_addr,
1626             size,
1627             self.prefault,
1628             self.shared,
1629             self.hugepages,
1630             self.hugepage_size,
1631             None,
1632             None,
1633             self.thp,
1634         )?;
1635 
1636         // Map it into the guest
1637         let slot = self.create_userspace_mapping(
1638             region.start_addr().0,
1639             region.len(),
1640             region.as_ptr() as u64,
1641             self.mergeable,
1642             false,
1643             self.log_dirty,
1644         )?;
1645         self.guest_ram_mappings.push(GuestRamMapping {
1646             gpa: region.start_addr().raw_value(),
1647             size: region.len(),
1648             slot,
1649             zone_id: DEFAULT_MEMORY_ZONE.to_string(),
1650             virtio_mem: false,
1651             file_offset: 0,
1652         });
1653 
1654         self.add_region(Arc::clone(&region))?;
1655 
1656         Ok(region)
1657     }
1658 
1659     fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
1660         info!("Hotplugging new RAM: {} bytes", size);
1661 
1662         // Check that there is a free slot
1663         if self.next_hotplug_slot >= HOTPLUG_COUNT {
1664             return Err(Error::NoSlotAvailable);
1665         }
1666 
1667         // "Inserted" DIMM must have a size that is a multiple of 128MiB
1668         if size % (128 << 20) != 0 {
1669             return Err(Error::InvalidSize);
1670         }
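         // e.g. an ACPI resize from 1 GiB to 1.5 GiB arrives here as a single
         // 512 MiB request, which satisfies the 128 MiB granularity.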
1671 
1672         let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;
1673 
1674         if start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area {
1675             return Err(Error::InsufficientHotplugRam);
1676         }
1677 
1678         let region = self.add_ram_region(start_addr, size)?;
1679 
1680         // Add region to the list of regions associated with the default
1681         // memory zone.
1682         if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
1683             memory_zone.regions.push(Arc::clone(&region));
1684         }
1685 
1686         // Tell the allocator
1687         self.ram_allocator
1688             .allocate(Some(start_addr), size as GuestUsize, None)
1689             .ok_or(Error::MemoryRangeAllocation)?;
1690 
1691         // Update the slot so that it can be queried via the I/O port
1692         let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
1693         slot.active = true;
1694         slot.inserting = true;
1695         slot.base = region.start_addr().0;
1696         slot.length = region.len();
1697 
1698         self.next_hotplug_slot += 1;
1699 
1700         Ok(region)
1701     }
1702 
1703     pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1704         self.guest_memory.clone()
1705     }
1706 
1707     pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
1708         self.boot_guest_memory.clone()
1709     }
1710 
1711     pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
1712         self.allocator.clone()
1713     }
1714 
1715     pub fn start_of_device_area(&self) -> GuestAddress {
1716         self.start_of_device_area
1717     }
1718 
1719     pub fn end_of_device_area(&self) -> GuestAddress {
1720         self.end_of_device_area
1721     }
1722 
1723     pub fn allocate_memory_slot(&mut self) -> u32 {
1724         let slot_id = self.next_memory_slot;
1725         self.next_memory_slot += 1;
1726         slot_id
1727     }
1728 
1729     pub fn create_userspace_mapping(
1730         &mut self,
1731         guest_phys_addr: u64,
1732         memory_size: u64,
1733         userspace_addr: u64,
1734         mergeable: bool,
1735         readonly: bool,
1736         log_dirty: bool,
1737     ) -> Result<u32, Error> {
1738         let slot = self.allocate_memory_slot();
1739         let mem_region = self.vm.make_user_memory_region(
1740             slot,
1741             guest_phys_addr,
1742             memory_size,
1743             userspace_addr,
1744             readonly,
1745             log_dirty,
1746         );
1747 
1748         info!(
1749             "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
1750             guest_phys_addr, userspace_addr, memory_size, slot
1751         );
1752 
1753         self.vm
1754             .create_user_memory_region(mem_region)
1755             .map_err(Error::CreateUserMemoryRegion)?;
1756 
1757         // SAFETY: the address and size are valid since the
1758         // mmap succeeded.
1759         let ret = unsafe {
1760             libc::madvise(
1761                 userspace_addr as *mut libc::c_void,
1762                 memory_size as libc::size_t,
1763                 libc::MADV_DONTDUMP,
1764             )
1765         };
1766         if ret != 0 {
1767             let e = io::Error::last_os_error();
1768             warn!("Failed to mark mapping as MADV_DONTDUMP: {}", e);
1769         }
1770 
1771         // Mark the pages as mergeable if explicitly asked for.
1772         if mergeable {
1773             // SAFETY: the address and size are valid since the
1774             // mmap succeeded.
1775             let ret = unsafe {
1776                 libc::madvise(
1777                     userspace_addr as *mut libc::c_void,
1778                     memory_size as libc::size_t,
1779                     libc::MADV_MERGEABLE,
1780                 )
1781             };
1782             if ret != 0 {
1783                 let err = io::Error::last_os_error();
1784                 // Safe to unwrap because the error is constructed with
1785                 // last_os_error(), which ensures the output will be Some().
1786                 let errno = err.raw_os_error().unwrap();
1787                 if errno == libc::EINVAL {
1788                     warn!("kernel not configured with CONFIG_KSM");
1789                 } else {
1790                     warn!("madvise error: {}", err);
1791                 }
1792                 warn!("failed to mark pages as mergeable");
1793             }
1794         }
1795 
1796         info!(
1797             "Created userspace mapping: {:x} -> {:x} {:x}",
1798             guest_phys_addr, userspace_addr, memory_size
1799         );
1800 
1801         Ok(slot)
1802     }
1803 
1804     pub fn remove_userspace_mapping(
1805         &mut self,
1806         guest_phys_addr: u64,
1807         memory_size: u64,
1808         userspace_addr: u64,
1809         mergeable: bool,
1810         slot: u32,
1811     ) -> Result<(), Error> {
1812         let mem_region = self.vm.make_user_memory_region(
1813             slot,
1814             guest_phys_addr,
1815             memory_size,
1816             userspace_addr,
1817             false, /* readonly -- don't care */
1818             false, /* log dirty */
1819         );
1820 
1821         self.vm
1822             .remove_user_memory_region(mem_region)
1823             .map_err(Error::RemoveUserMemoryRegion)?;
1824 
1825         // Mark the pages as unmergeable if they were previously marked as
1826         // mergeable.
1827         if mergeable {
1828             // SAFETY: the address and size are valid as the region was
1829             // previously advised.
1830             let ret = unsafe {
1831                 libc::madvise(
1832                     userspace_addr as *mut libc::c_void,
1833                     memory_size as libc::size_t,
1834                     libc::MADV_UNMERGEABLE,
1835                 )
1836             };
1837             if ret != 0 {
1838                 let err = io::Error::last_os_error();
1839                 // Safe to unwrap because the error is constructed with
1840                 // last_os_error(), which ensures the output will be Some().
1841                 let errno = err.raw_os_error().unwrap();
1842                 if errno == libc::EINVAL {
1843                     warn!("kernel not configured with CONFIG_KSM");
1844                 } else {
1845                     warn!("madvise error: {}", err);
1846                 }
1847                 warn!("failed to mark pages as unmergeable");
1848             }
1849         }
1850 
1851         info!(
1852             "Removed userspace mapping: {:x} -> {:x} {:x}",
1853             guest_phys_addr, userspace_addr, memory_size
1854         );
1855 
1856         Ok(())
1857     }
1858 
1859     pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
1860         if let Some(memory_zone) = self.memory_zones.get_mut(id) {
1861             if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
1862                 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
1863                     virtio_mem_device
1864                         .lock()
1865                         .unwrap()
1866                         .resize(size)
1867                         .map_err(Error::VirtioMemResizeFail)?;
1868                 }
1869 
1870                 // Keep the hotplugged_size up to date.
1871                 virtio_mem_zone.hotplugged_size = size;
1872             } else {
1873                 error!("Failed resizing virtio-mem region: No virtio-mem handler");
1874                 return Err(Error::MissingVirtioMemHandler);
1875             }
1876 
1877             return Ok(());
1878         }
1879 
1880         error!("Failed resizing virtio-mem region: Unknown memory zone");
1881         Err(Error::UnknownMemoryZone)
1882     }
1883 
1884     /// In case this function resulted in adding a new memory region to the
1885     /// guest memory, the new region is returned to the caller. The virtio-mem
1886     /// use case never adds a new region as the whole hotpluggable memory has
1887     /// already been allocated at boot time.
1888     pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
1889         if self.user_provided_zones {
1890             error!(
1891                 "Not allowed to resize guest memory when backed with user \
1892                 defined memory zones."
1893             );
1894             return Err(Error::InvalidResizeWithMemoryZones);
1895         }
1896 
1897         let mut region: Option<Arc<GuestRegionMmap>> = None;
1898         match self.hotplug_method {
1899             HotplugMethod::VirtioMem => {
1900                 if desired_ram >= self.boot_ram {
1901                     if !self.dynamic {
1902                         return Ok(region);
1903                     }
1904 
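                     // e.g. desired_ram = 3 GiB with boot_ram = 2 GiB asks the
                     // virtio-mem device to plug 1 GiB of its hotpluggable range.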
1905                     self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
1906                     self.current_ram = desired_ram;
1907                 }
1908             }
1909             HotplugMethod::Acpi => {
1910                 if desired_ram > self.current_ram {
1911                     if !self.dynamic {
1912                         return Ok(region);
1913                     }
1914 
1915                     region =
1916                         Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
1917                     self.current_ram = desired_ram;
1918                 }
1919             }
1920         }
1921         Ok(region)
1922     }
1923 
1924     pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
1925         if !self.user_provided_zones {
1926             error!(
1927                 "Not allowed to resize guest memory zone when no zone is \
1928                 defined."
1929             );
1930             return Err(Error::ResizeZone);
1931         }
1932 
1933         self.virtio_mem_resize(id, virtio_mem_size)
1934     }
1935 
1936     #[cfg(target_arch = "x86_64")]
1937     pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
1938         let file = OpenOptions::new()
1939             .read(true)
1940             .open("/dev/sgx_provision")
1941             .map_err(Error::SgxProvisionOpen)?;
1942         self.vm
1943             .enable_sgx_attribute(file)
1944             .map_err(Error::SgxEnableProvisioning)?;
1945 
1946         // Go over each EPC section and verify its size is a 4k multiple. At
1947         // the same time, calculate the total size needed for the contiguous
1948         // EPC region.
1949         let mut epc_region_size = 0;
1950         for epc_section in sgx_epc_config.iter() {
1951             if epc_section.size == 0 {
1952                 return Err(Error::EpcSectionSizeInvalid);
1953             }
1954             if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
1955                 return Err(Error::EpcSectionSizeInvalid);
1956             }
1957 
1958             epc_region_size += epc_section.size;
1959         }
1960 
1961         // Place the SGX EPC region on a 4k boundary between the RAM and the device area
1962         let epc_region_start = GuestAddress(
1963             ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
1964         );
1965 
1966         self.start_of_device_area = epc_region_start
1967             .checked_add(epc_region_size)
1968             .ok_or(Error::GuestAddressOverFlow)?;
1969 
1970         let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
1971         info!(
1972             "SGX EPC region: 0x{:x} (0x{:x})",
1973             epc_region_start.0, epc_region_size
1974         );
1975 
1976         // Each section can be memory mapped into the allocated region.
1977         let mut epc_section_start = epc_region_start.raw_value();
1978         for epc_section in sgx_epc_config.iter() {
1979             let file = OpenOptions::new()
1980                 .read(true)
1981                 .write(true)
1982                 .open("/dev/sgx_vepc")
1983                 .map_err(Error::SgxVirtEpcOpen)?;
1984 
1985             let prot = PROT_READ | PROT_WRITE;
1986             let mut flags = MAP_NORESERVE | MAP_SHARED;
1987             if epc_section.prefault {
1988                 flags |= MAP_POPULATE;
1989             }
1990 
1991             // We can't use the vm-memory crate to perform the memory mapping
1992             // here, as it would try to ensure that the size of the backing file
1993             // matches the size of the expected mapping. The /dev/sgx_vepc
1994             // device does not work that way: it provides a file descriptor
1995             // whose size does not match the mapping size, as it's just a way to
1996             // let KVM know that an EPC section is being created for the guest.
1997             // SAFETY: FFI call with correct arguments
1998             let host_addr = unsafe {
1999                 libc::mmap(
2000                     std::ptr::null_mut(),
2001                     epc_section.size as usize,
2002                     prot,
2003                     flags,
2004                     file.as_raw_fd(),
2005                     0,
2006                 )
2007             } as u64;
2008 
2009             info!(
2010                 "Adding SGX EPC section: 0x{:x} (0x{:x})",
2011                 epc_section_start, epc_section.size
2012             );
2013 
2014             let _mem_slot = self.create_userspace_mapping(
2015                 epc_section_start,
2016                 epc_section.size,
2017                 host_addr,
2018                 false,
2019                 false,
2020                 false,
2021             )?;
2022 
2023             sgx_epc_region.insert(
2024                 epc_section.id.clone(),
2025                 SgxEpcSection::new(
2026                     GuestAddress(epc_section_start),
2027                     epc_section.size as GuestUsize,
2028                 ),
2029             );
2030 
2031             epc_section_start += epc_section.size;
2032         }
2033 
2034         self.sgx_epc_region = Some(sgx_epc_region);
2035 
2036         Ok(())
2037     }
2038 
2039     #[cfg(target_arch = "x86_64")]
2040     pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
2041         &self.sgx_epc_region
2042     }
2043 
2044     pub fn is_hardlink(f: &File) -> bool {
2045         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
2046         // SAFETY: FFI call with correct arguments
2047         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
2048         if ret != 0 {
2049             error!("Couldn't fstat the backing file");
2050             return false;
2051         }
2052 
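         // st_nlink == 0 means the file has been unlinked (e.g. an anonymous
         // memfd), so a non-zero link count tells us the backing file is still
         // reachable through a path on the host filesystem.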
2053         // SAFETY: stat is valid
2054         unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
2055     }
2056 
2057     pub fn memory_zones(&self) -> &MemoryZones {
2058         &self.memory_zones
2059     }
2060 
2061     pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
2062         &mut self.memory_zones
2063     }
2064 
2065     pub fn memory_range_table(
2066         &self,
2067         snapshot: bool,
2068     ) -> std::result::Result<MemoryRangeTable, MigratableError> {
2069         let mut table = MemoryRangeTable::default();
2070 
2071         for memory_zone in self.memory_zones.values() {
2072             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
2073                 table.extend(virtio_mem_zone.plugged_ranges());
2074             }
2075 
2076             for region in memory_zone.regions() {
2077                 if snapshot {
2078                     if let Some(file_offset) = region.file_offset() {
2079                         if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
2080                             && Self::is_hardlink(file_offset.file())
2081                         {
2082                             // In this very specific case, we know the memory
2083                             // region is backed by a file on the host filesystem
2084                             // that can be accessed by the user, and additionally
2085                             // the mapping is shared, which means that modifications
2086                             // to the content are written to the actual file.
2087                             // When meeting these conditions, we can skip the
2088                             // copy of the memory content for this specific region,
2089                             // as we can assume the user will have it saved through
2090                             // the backing file already.
2091                             continue;
2092                         }
2093                     }
2094                 }
2095 
2096                 table.push(MemoryRange {
2097                     gpa: region.start_addr().raw_value(),
2098                     length: region.len(),
2099                 });
2100             }
2101         }
2102 
2103         Ok(table)
2104     }
2105 
2106     pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
2107         MemoryManagerSnapshotData {
2108             memory_ranges: self.snapshot_memory_ranges.clone(),
2109             guest_ram_mappings: self.guest_ram_mappings.clone(),
2110             start_of_device_area: self.start_of_device_area.0,
2111             boot_ram: self.boot_ram,
2112             current_ram: self.current_ram,
2113             arch_mem_regions: self.arch_mem_regions.clone(),
2114             hotplug_slots: self.hotplug_slots.clone(),
2115             next_memory_slot: self.next_memory_slot,
2116             selected_slot: self.selected_slot,
2117             next_hotplug_slot: self.next_hotplug_slot,
2118         }
2119     }
2120 
2121     pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
2122         let mut memory_slot_fds = HashMap::new();
2123         for guest_ram_mapping in &self.guest_ram_mappings {
2124             let slot = guest_ram_mapping.slot;
2125             let guest_memory = self.guest_memory.memory();
2126             let file = guest_memory
2127                 .find_region(GuestAddress(guest_ram_mapping.gpa))
2128                 .unwrap()
2129                 .file_offset()
2130                 .unwrap()
2131                 .file();
2132             memory_slot_fds.insert(slot, file.as_raw_fd());
2133         }
2134         memory_slot_fds
2135     }
2136 
2137     pub fn acpi_address(&self) -> Option<GuestAddress> {
2138         self.acpi_address
2139     }
2140 
2141     pub fn num_guest_ram_mappings(&self) -> u32 {
2142         self.guest_ram_mappings.len() as u32
2143     }
2144 
2145     #[cfg(target_arch = "aarch64")]
2146     pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
2147         self.uefi_flash.as_ref().unwrap().clone()
2148     }
2149 
2150     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2151     pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
2152         let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
2153         mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);
2154 
2155         let mut mem_offset_in_elf = mem_offset;
2156         let mut ram_maps = BTreeMap::new();
2157         for mapping in mapping_sorted_by_gpa.iter() {
2158             ram_maps.insert(
2159                 mapping.gpa,
2160                 CoredumpMemoryRegion {
2161                     mem_offset_in_elf,
2162                     mem_size: mapping.size,
2163                 },
2164             );
2165             mem_offset_in_elf += mapping.size;
2166         }
2167 
2168         CoredumpMemoryRegions { ram_maps }
2169     }
2170 
2171     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2172     pub fn coredump_iterate_save_mem(
2173         &mut self,
2174         dump_state: &DumpState,
2175     ) -> std::result::Result<(), GuestDebuggableError> {
2176         let snapshot_memory_ranges = self
2177             .memory_range_table(false)
2178             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2179 
2180         if snapshot_memory_ranges.is_empty() {
2181             return Ok(());
2182         }
2183 
2184         let coredump_file = dump_state.file.as_ref().unwrap();
2185 
2186         let guest_memory = self.guest_memory.memory();
2187         let mut total_bytes: u64 = 0;
2188 
2189         for range in snapshot_memory_ranges.regions() {
2190             let mut offset: u64 = 0;
2191             loop {
2192                 let bytes_written = guest_memory
2193                     .write_volatile_to(
2194                         GuestAddress(range.gpa + offset),
2195                         &mut coredump_file.as_fd(),
2196                         (range.length - offset) as usize,
2197                     )
2198                     .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2199                 offset += bytes_written as u64;
2200                 total_bytes += bytes_written as u64;
2201 
2202                 if offset == range.length {
2203                     break;
2204                 }
2205             }
2206         }
2207 
2208         debug!("coredump total bytes {}", total_bytes);
2209         Ok(())
2210     }
2211 
2212     pub fn receive_memory_regions<F>(
2213         &mut self,
2214         ranges: &MemoryRangeTable,
2215         fd: &mut F,
2216     ) -> std::result::Result<(), MigratableError>
2217     where
2218         F: ReadVolatile,
2219     {
2220         let guest_memory = self.guest_memory();
2221         let mem = guest_memory.memory();
2222 
2223         for range in ranges.regions() {
2224             let mut offset: u64 = 0;
2225             // Here we are manually handling the retry in case we can't read the
2226             // whole region at once, because we can't use the read_exact_from()
2227             // implementation from vm-memory::GuestMemory as it does not follow
2228             // the correct behavior. For more info about this issue
2229             // see: https://github.com/rust-vmm/vm-memory/issues/174
2230             loop {
2231                 let bytes_read = mem
2232                     .read_volatile_from(
2233                         GuestAddress(range.gpa + offset),
2234                         fd,
2235                         (range.length - offset) as usize,
2236                     )
2237                     .map_err(|e| {
2238                         MigratableError::MigrateReceive(anyhow!(
2239                             "Error receiving memory from socket: {}",
2240                             e
2241                         ))
2242                     })?;
2243                 offset += bytes_read as u64;
2244 
2245                 if offset == range.length {
2246                     break;
2247                 }
2248             }
2249         }
2250 
2251         Ok(())
2252     }
2253 }
2254 
2255 struct MemoryNotify {
2256     slot_id: usize,
2257 }
2258 
2259 impl Aml for MemoryNotify {
2260     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2261         let object = aml::Path::new(&format!("M{:03}", self.slot_id));
2262         aml::If::new(
2263             &aml::Equal::new(&aml::Arg(0), &self.slot_id),
2264             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2265         )
2266         .to_aml_bytes(sink)
2267     }
2268 }
2269 
2270 struct MemorySlot {
2271     slot_id: usize,
2272 }
2273 
2274 impl Aml for MemorySlot {
2275     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2276         aml::Device::new(
2277             format!("M{:03}", self.slot_id).as_str().into(),
2278             vec![
2279                 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")),
2280                 &aml::Name::new("_UID".into(), &self.slot_id),
2281                 /*
2282                 _STA return value:
2283                 Bit [0] – Set if the device is present.
2284                 Bit [1] – Set if the device is enabled and decoding its resources.
2285                 Bit [2] – Set if the device should be shown in the UI.
2286                 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
2287                 Bit [4] – Set if the battery is present.
2288                 Bits [31:5] – Reserved (must be cleared).
2289                 */
2290                 &aml::Method::new(
2291                     "_STA".into(),
2292                     0,
2293                     false,
2294                     // Call into MSTA method which will interrogate device
2295                     vec![&aml::Return::new(&aml::MethodCall::new(
2296                         "MSTA".into(),
2297                         vec![&self.slot_id],
2298                     ))],
2299                 ),
2300                 // Get details of memory
2301                 &aml::Method::new(
2302                     "_CRS".into(),
2303                     0,
2304                     false,
2305                     // Call into MCRS which provides actual memory details
2306                     vec![&aml::Return::new(&aml::MethodCall::new(
2307                         "MCRS".into(),
2308                         vec![&self.slot_id],
2309                     ))],
2310                 ),
2311             ],
2312         )
2313         .to_aml_bytes(sink)
2314     }
2315 }
2316 
2317 struct MemorySlots {
2318     slots: usize,
2319 }
2320 
2321 impl Aml for MemorySlots {
2322     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2323         for slot_id in 0..self.slots {
2324             MemorySlot { slot_id }.to_aml_bytes(sink);
2325         }
2326     }
2327 }
2328 
2329 struct MemoryMethods {
2330     slots: usize,
2331 }
2332 
2333 impl Aml for MemoryMethods {
2334     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2335         // Add "MTFY" notification method
2336         let mut memory_notifies = Vec::new();
2337         for slot_id in 0..self.slots {
2338             memory_notifies.push(MemoryNotify { slot_id });
2339         }
2340 
2341         let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new();
2342         for memory_notifier in memory_notifies.iter() {
2343             memory_notifies_refs.push(memory_notifier);
2344         }
2345 
2346         aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink);
2347 
2348         // MSCN (memory scan) method, invoked on hotplug events (e.g. by the GED handler)
2349         aml::Method::new(
2350             "MSCN".into(),
2351             0,
2352             true,
2353             vec![
2354                 // Take lock defined above
2355                 &aml::Acquire::new("MLCK".into(), 0xffff),
2356                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2357                 &aml::While::new(
2358                     &aml::LessThan::new(&aml::Local(0), &self.slots),
2359                     vec![
2360                         // Write slot number (in first argument) to I/O port via field
2361                         &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
2362                         // Check if MINS bit is set (inserting)
2363                         &aml::If::new(
2364                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2365                             // Notify device if it is
2366                             vec![
2367                                 &aml::MethodCall::new(
2368                                     "MTFY".into(),
2369                                     vec![&aml::Local(0), &aml::ONE],
2370                                 ),
2371                                 // Reset MINS bit
2372                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2373                             ],
2374                         ),
2375                         // Check if MRMV bit is set
2376                         &aml::If::new(
2377                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2378                             // Notify device if it is (with the eject constant 0x3)
2379                             vec![
2380                                 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
2381                                 // Reset MRMV bit
2382                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2383                             ],
2384                         ),
2385                         &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2386                     ],
2387                 ),
2388                 // Release lock
2389                 &aml::Release::new("MLCK".into()),
2390             ],
2391         )
2392         .to_aml_bytes(sink);
2393 
2394         // Memory status method
2395         aml::Method::new(
2396             "MSTA".into(),
2397             1,
2398             true,
2399             vec![
2400                 // Take lock defined above
2401                 &aml::Acquire::new("MLCK".into(), 0xffff),
2402                 // Write slot number (in first argument) to I/O port via field
2403                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2404                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2405                 // Check if the MEN_ bit is set; if so, set the local variable to 0xf (see _STA for the meaning)
2406                 &aml::If::new(
2407                     &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
2408                     vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2409                 ),
2410                 // Release lock
2411                 &aml::Release::new("MLCK".into()),
2412                 // Return 0 or 0xf
2413                 &aml::Return::new(&aml::Local(0)),
2414             ],
2415         )
2416         .to_aml_bytes(sink);
2417 
2418         // Memory range method
2419         aml::Method::new(
2420             "MCRS".into(),
2421             1,
2422             true,
2423             vec![
2424                 // Take lock defined above
2425                 &aml::Acquire::new("MLCK".into(), 0xffff),
2426                 // Write slot number (in first argument) to I/O port via field
2427                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2428                 &aml::Name::new(
2429                     "MR64".into(),
2430                     &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2431                         aml::AddressSpaceCacheable::Cacheable,
2432                         true,
2433                         0x0000_0000_0000_0000u64,
2434                         0xFFFF_FFFF_FFFF_FFFEu64,
2435                         None,
2436                     )]),
2437                 ),
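                 // The offsets below index into the raw QWORD Address Space
                 // Descriptor built above: bytes 14..22 hold the range minimum,
                 // bytes 22..30 the maximum and bytes 38..46 the length. Each
                 // QWordField aliases a full 64-bit value, while the DWordFields
                 // (offsets 18, 26 and 42) alias just its upper half.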
2438                 &aml::CreateQWordField::new(
2439                     &aml::Path::new("MINL"),
2440                     &aml::Path::new("MR64"),
2441                     &14usize,
2442                 ),
2443                 &aml::CreateDWordField::new(
2444                     &aml::Path::new("MINH"),
2445                     &aml::Path::new("MR64"),
2446                     &18usize,
2447                 ),
2448                 &aml::CreateQWordField::new(
2449                     &aml::Path::new("MAXL"),
2450                     &aml::Path::new("MR64"),
2451                     &22usize,
2452                 ),
2453                 &aml::CreateDWordField::new(
2454                     &aml::Path::new("MAXH"),
2455                     &aml::Path::new("MR64"),
2456                     &26usize,
2457                 ),
2458                 &aml::CreateQWordField::new(
2459                     &aml::Path::new("LENL"),
2460                     &aml::Path::new("MR64"),
2461                     &38usize,
2462                 ),
2463                 &aml::CreateDWordField::new(
2464                     &aml::Path::new("LENH"),
2465                     &aml::Path::new("MR64"),
2466                     &42usize,
2467                 ),
2468                 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
2469                 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
2470                 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
2471                 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
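                 // Compute MAX = MIN + LEN - 1: the two Adds sum the low and
                 // high halves separately, the If propagates the carry out of
                 // the low half, and the Subtract turns the exclusive end into
                 // an inclusive one.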
2472                 &aml::Add::new(
2473                     &aml::Path::new("MAXL"),
2474                     &aml::Path::new("MINL"),
2475                     &aml::Path::new("LENL"),
2476                 ),
2477                 &aml::Add::new(
2478                     &aml::Path::new("MAXH"),
2479                     &aml::Path::new("MINH"),
2480                     &aml::Path::new("LENH"),
2481                 ),
2482                 &aml::If::new(
2483                     &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
2484                     vec![&aml::Add::new(
2485                         &aml::Path::new("MAXH"),
2486                         &aml::ONE,
2487                         &aml::Path::new("MAXH"),
2488                     )],
2489                 ),
2490                 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
2491                 // Release lock
2492                 &aml::Release::new("MLCK".into()),
2493                 &aml::Return::new(&aml::Path::new("MR64")),
2494             ],
2495         )
2496         .to_aml_bytes(sink)
2497     }
2498 }
2499 
2500 impl Aml for MemoryManager {
2501     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2502         if let Some(acpi_address) = self.acpi_address {
2503             // Memory Hotplug Controller
2504             aml::Device::new(
2505                 "_SB_.MHPC".into(),
2506                 vec![
2507                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2508                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2509                     // Mutex to protect concurrent access as we write to choose slot and then read back status
2510                     &aml::Mutex::new("MLCK".into(), 0),
2511                     &aml::Name::new(
2512                         "_CRS".into(),
2513                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2514                             aml::AddressSpaceCacheable::NotCacheable,
2515                             true,
2516                             acpi_address.0,
2517                             acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
2518                             None,
2519                         )]),
2520                     ),
2521                     // OpRegion and Fields map MMIO range into individual field values
2522                     &aml::OpRegion::new(
2523                         "MHPR".into(),
2524                         aml::OpRegionSpace::SystemMemory,
2525                         &(acpi_address.0 as usize),
2526                         &MEMORY_MANAGER_ACPI_SIZE,
2527                     ),
2528                     &aml::Field::new(
2529                         "MHPR".into(),
2530                         aml::FieldAccessType::DWord,
2531                         aml::FieldLockRule::NoLock,
2532                         aml::FieldUpdateRule::Preserve,
2533                         vec![
2534                             aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
2535                             aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
2536                             aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
2537                             aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
2538                         ],
2539                     ),
2540                     &aml::Field::new(
2541                         "MHPR".into(),
2542                         aml::FieldAccessType::DWord,
2543                         aml::FieldLockRule::NoLock,
2544                         aml::FieldUpdateRule::Preserve,
2545                         vec![
2546                             aml::FieldEntry::Reserved(128),
2547                             aml::FieldEntry::Named(*b"MHPX", 32), // PXM
2548                         ],
2549                     ),
2550                     &aml::Field::new(
2551                         "MHPR".into(),
2552                         aml::FieldAccessType::Byte,
2553                         aml::FieldLockRule::NoLock,
2554                         aml::FieldUpdateRule::WriteAsZeroes,
2555                         vec![
2556                             aml::FieldEntry::Reserved(160),
2557                             aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
2558                             aml::FieldEntry::Named(*b"MINS", 1), // Inserting
2559                             aml::FieldEntry::Named(*b"MRMV", 1), // Removing
2560                             aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
2561                         ],
2562                     ),
2563                     &aml::Field::new(
2564                         "MHPR".into(),
2565                         aml::FieldAccessType::DWord,
2566                         aml::FieldLockRule::NoLock,
2567                         aml::FieldUpdateRule::Preserve,
2568                         vec![
2569                             aml::FieldEntry::Named(*b"MSEL", 32), // Selector
2570                             aml::FieldEntry::Named(*b"MOEV", 32), // Event
2571                             aml::FieldEntry::Named(*b"MOSC", 32), // OSC
2572                         ],
2573                     ),
2574                     &MemoryMethods {
2575                         slots: self.hotplug_slots.len(),
2576                     },
2577                     &MemorySlots {
2578                         slots: self.hotplug_slots.len(),
2579                     },
2580                 ],
2581             )
2582             .to_aml_bytes(sink);
2583         } else {
2584             aml::Device::new(
2585                 "_SB_.MHPC".into(),
2586                 vec![
2587                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2588                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2589                     // Empty MSCN for GED
2590                     &aml::Method::new("MSCN".into(), 0, true, vec![]),
2591                 ],
2592             )
2593             .to_aml_bytes(sink);
2594         }
2595 
2596         #[cfg(target_arch = "x86_64")]
2597         {
2598             if let Some(sgx_epc_region) = &self.sgx_epc_region {
2599                 let min = sgx_epc_region.start().raw_value();
2600                 let max = min + sgx_epc_region.size() - 1;
2601                 // SGX EPC region
2602                 aml::Device::new(
2603                     "_SB_.EPC_".into(),
2604                     vec![
2605                         &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")),
2606                         // QWORD describing the EPC region start and size
2607                         &aml::Name::new(
2608                             "_CRS".into(),
2609                             &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2610                                 aml::AddressSpaceCacheable::NotCacheable,
2611                                 true,
2612                                 min,
2613                                 max,
2614                                 None,
2615                             )]),
2616                         ),
2617                         &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
2618                     ],
2619                 )
2620                 .to_aml_bytes(sink);
2621             }
2622         }
2623     }
2624 }
2625 
2626 impl Pausable for MemoryManager {}
2627 
2628 #[derive(Clone, Serialize, Deserialize, Versionize)]
2629 pub struct MemoryManagerSnapshotData {
2630     memory_ranges: MemoryRangeTable,
2631     guest_ram_mappings: Vec<GuestRamMapping>,
2632     start_of_device_area: u64,
2633     boot_ram: u64,
2634     current_ram: u64,
2635     arch_mem_regions: Vec<ArchMemRegion>,
2636     hotplug_slots: Vec<HotPlugState>,
2637     next_memory_slot: u32,
2638     selected_slot: usize,
2639     next_hotplug_slot: usize,
2640 }
2641 
2642 impl VersionMapped for MemoryManagerSnapshotData {}
2643 
2644 impl Snapshottable for MemoryManager {
2645     fn id(&self) -> String {
2646         MEMORY_MANAGER_SNAPSHOT_ID.to_string()
2647     }
2648 
2649     fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
2650         let memory_ranges = self.memory_range_table(true)?;
2651 
2652         // Store locally this list of ranges as it will be used through the
2653         // Transportable::send() implementation. The point is to avoid the
2654         // duplication of code regarding the creation of the path for each
2655         // region. The 'snapshot' step creates the list of memory regions,
2656         // including information about the need to copy a memory region or
2657         // not. This saves the 'send' step having to go through the same
2658         // process, and instead it can directly proceed with storing the
2659         // memory range content for the ranges requiring it.
2660         self.snapshot_memory_ranges = memory_ranges;
2661 
2662         Ok(Snapshot::from_data(SnapshotData::new_from_versioned_state(
2663             &self.snapshot_data(),
2664         )?))
2665     }
2666 }
2667 
2668 impl Transportable for MemoryManager {
2669     fn send(
2670         &self,
2671         _snapshot: &Snapshot,
2672         destination_url: &str,
2673     ) -> result::Result<(), MigratableError> {
2674         if self.snapshot_memory_ranges.is_empty() {
2675             return Ok(());
2676         }
2677 
2678         let mut memory_file_path = url_to_path(destination_url)?;
2679         memory_file_path.push(String::from(SNAPSHOT_FILENAME));
2680 
2681         // Create the snapshot file for the entire memory
2682         let mut memory_file = OpenOptions::new()
2683             .read(true)
2684             .write(true)
2685             .create_new(true)
2686             .open(memory_file_path)
2687             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2688 
2689         let guest_memory = self.guest_memory.memory();
2690 
2691         for range in self.snapshot_memory_ranges.regions() {
2692             let mut offset: u64 = 0;
2693             // Here we are manually handling the retry in case we can't write the
2694             // whole region at once, because we can't use the write_all_to()
2695             // implementation from vm-memory::GuestMemory as it does not follow
2696             // the correct behavior. For more info about this issue
2697             // see: https://github.com/rust-vmm/vm-memory/issues/174
2698             loop {
2699                 let bytes_written = guest_memory
2700                     .write_volatile_to(
2701                         GuestAddress(range.gpa + offset),
2702                         &mut memory_file,
2703                         (range.length - offset) as usize,
2704                     )
2705                     .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2706                 offset += bytes_written as u64;
2707 
2708                 if offset == range.length {
2709                     break;
2710                 }
2711             }
2712         }
2713         Ok(())
2714     }
2715 }
2716 
2717 impl Migratable for MemoryManager {
2718     // Start the dirty log in the hypervisor (kvm/mshv).
2719     // Also, reset the dirty bitmap logged by the vmm.
2720     // Just before we do a bulk copy we want to start/clear the dirty log so that
2721     // pages touched during our bulk copy are tracked.
2722     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2723         self.vm.start_dirty_log().map_err(|e| {
2724             MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
2725         })?;
2726 
2727         for r in self.guest_memory.memory().iter() {
2728             r.bitmap().reset();
2729         }
2730 
2731         Ok(())
2732     }
2733 
2734     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2735         self.vm.stop_dirty_log().map_err(|e| {
2736             MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
2737         })?;
2738 
2739         Ok(())
2740     }
2741 
2742     // Generate a table for the pages that are dirty. The dirty pages are collapsed
2743     // together in the table if they are contiguous.
2744     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2745         let mut table = MemoryRangeTable::default();
2746         for r in &self.guest_ram_mappings {
2747             let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
2748                 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
2749             })?;
2750             let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
2751             {
2752                 Some(region) => {
2753                     assert!(region.start_addr().raw_value() == r.gpa);
2754                     assert!(region.len() == r.size);
2755                     region.bitmap().get_and_reset()
2756                 }
2757                 None => {
2758                     return Err(MigratableError::MigrateSend(anyhow!(
2759                         "Error finding 'guest memory region' with address {:x}",
2760                         r.gpa
2761                     )))
2762                 }
2763             };
2764 
2765             let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
2766                 .iter()
2767                 .zip(vmm_dirty_bitmap.iter())
2768                 .map(|(x, y)| x | y)
2769                 .collect();
2770 
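             // Each set bit marks one dirty 4 KiB page; from_bitmap() merges
             // runs of contiguous dirty pages into single ranges.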
2771             let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);
2772 
2773             if sub_table.regions().is_empty() {
2774                 info!("Dirty Memory Range Table is empty");
2775             } else {
2776                 info!("Dirty Memory Range Table:");
2777                 for range in sub_table.regions() {
2778                     info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
2779                 }
2780             }
2781 
2782             table.extend(sub_table);
2783         }
2784         Ok(table)
2785     }
2786 }
2787