xref: /cloud-hypervisor/vmm/src/memory_manager.rs (revision 3ce0fef7fd546467398c914dbc74d8542e45cf6f)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0
4 //
5 #[cfg(target_arch = "x86_64")]
6 use crate::config::SgxEpcConfig;
7 use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
8 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
9 use crate::coredump::{
10     CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError,
11 };
12 use crate::migration::url_to_path;
13 use crate::MEMORY_MANAGER_SNAPSHOT_ID;
14 use crate::{GuestMemoryMmap, GuestRegionMmap};
15 use acpi_tables::{aml, Aml};
16 use anyhow::anyhow;
17 #[cfg(target_arch = "x86_64")]
18 use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
19 use arch::RegionType;
20 #[cfg(target_arch = "x86_64")]
21 use devices::ioapic;
22 #[cfg(target_arch = "aarch64")]
23 use hypervisor::HypervisorVmError;
24 #[cfg(target_arch = "x86_64")]
25 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
26 use serde::{Deserialize, Serialize};
27 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
28 use std::collections::BTreeMap;
29 use std::collections::HashMap;
30 use std::convert::TryInto;
31 use std::ffi;
32 use std::fs::{File, OpenOptions};
33 use std::io::{self};
34 use std::ops::{BitAnd, Deref, Not, Sub};
35 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
36 use std::os::fd::AsFd;
37 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
38 use std::path::PathBuf;
39 use std::result;
40 use std::sync::{Arc, Barrier, Mutex};
41 use tracer::trace_scoped;
42 use versionize::{VersionMap, Versionize, VersionizeResult};
43 use versionize_derive::Versionize;
44 use virtio_devices::BlocksState;
45 #[cfg(target_arch = "x86_64")]
46 use vm_allocator::GsiApic;
47 use vm_allocator::{AddressAllocator, SystemAllocator};
48 use vm_device::BusDevice;
49 use vm_memory::bitmap::AtomicBitmap;
50 use vm_memory::guest_memory::FileOffset;
51 use vm_memory::{
52     mmap::MmapRegionError, Address, Error as MmapError, GuestAddress, GuestAddressSpace,
53     GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion,
54     ReadVolatile,
55 };
56 use vm_migration::{
57     protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable,
58     Snapshot, SnapshotData, Snapshottable, Transportable, VersionMapped,
59 };
60 
61 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;
62 
63 const DEFAULT_MEMORY_ZONE: &str = "mem0";
64 
65 const SNAPSHOT_FILENAME: &str = "memory-ranges";
66 
67 #[cfg(target_arch = "x86_64")]
68 const X86_64_IRQ_BASE: u32 = 5;
69 
70 #[cfg(target_arch = "x86_64")]
71 const SGX_PAGE_SIZE: u64 = 1 << 12;
72 
73 const HOTPLUG_COUNT: usize = 8;
74 
75 // Memory policy constants
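// (The values below match the definitions in include/uapi/linux/mempolicy.h.)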
76 const MPOL_BIND: u32 = 2;
77 const MPOL_MF_STRICT: u32 = 1;
78 const MPOL_MF_MOVE: u32 = 1 << 1;
79 
80 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
81 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;
82 
83 #[derive(Clone, Default, Serialize, Deserialize, Versionize)]
84 struct HotPlugState {
85     base: u64,
86     length: u64,
87     active: bool,
88     inserting: bool,
89     removing: bool,
90 }
91 
92 pub struct VirtioMemZone {
93     region: Arc<GuestRegionMmap>,
94     virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
95     hotplugged_size: u64,
96     hugepages: bool,
97     blocks_state: Arc<Mutex<BlocksState>>,
98 }
99 
100 impl VirtioMemZone {
101     pub fn region(&self) -> &Arc<GuestRegionMmap> {
102         &self.region
103     }
104     pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
105         self.virtio_device = Some(virtio_device);
106     }
107     pub fn hotplugged_size(&self) -> u64 {
108         self.hotplugged_size
109     }
110     pub fn hugepages(&self) -> bool {
111         self.hugepages
112     }
113     pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
114         &self.blocks_state
115     }
116     pub fn plugged_ranges(&self) -> MemoryRangeTable {
117         self.blocks_state
118             .lock()
119             .unwrap()
120             .memory_ranges(self.region.start_addr().raw_value(), true)
121     }
122 }
123 
124 #[derive(Default)]
125 pub struct MemoryZone {
126     regions: Vec<Arc<GuestRegionMmap>>,
127     virtio_mem_zone: Option<VirtioMemZone>,
128 }
129 
130 impl MemoryZone {
131     pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
132         &self.regions
133     }
134     pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
135         &self.virtio_mem_zone
136     }
137     pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
138         self.virtio_mem_zone.as_mut()
139     }
140 }
141 
142 pub type MemoryZones = HashMap<String, MemoryZone>;
143 
144 #[derive(Clone, Serialize, Deserialize, Versionize)]
145 struct GuestRamMapping {
146     slot: u32,
147     gpa: u64,
148     size: u64,
149     zone_id: String,
150     virtio_mem: bool,
151     file_offset: u64,
152 }
153 
154 #[derive(Clone, Serialize, Deserialize, Versionize)]
155 struct ArchMemRegion {
156     base: u64,
157     size: usize,
158     r_type: RegionType,
159 }
160 
161 pub struct MemoryManager {
162     boot_guest_memory: GuestMemoryMmap,
163     guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
164     next_memory_slot: u32,
165     start_of_device_area: GuestAddress,
166     end_of_device_area: GuestAddress,
167     end_of_ram_area: GuestAddress,
168     pub vm: Arc<dyn hypervisor::Vm>,
169     hotplug_slots: Vec<HotPlugState>,
170     selected_slot: usize,
171     mergeable: bool,
172     allocator: Arc<Mutex<SystemAllocator>>,
173     hotplug_method: HotplugMethod,
174     boot_ram: u64,
175     current_ram: u64,
176     next_hotplug_slot: usize,
177     shared: bool,
178     hugepages: bool,
179     hugepage_size: Option<u64>,
180     prefault: bool,
181     thp: bool,
182     #[cfg(target_arch = "x86_64")]
183     sgx_epc_region: Option<SgxEpcRegion>,
184     user_provided_zones: bool,
185     snapshot_memory_ranges: MemoryRangeTable,
186     memory_zones: MemoryZones,
187     log_dirty: bool, // Enable dirty logging for created RAM regions
188     arch_mem_regions: Vec<ArchMemRegion>,
189     ram_allocator: AddressAllocator,
190     dynamic: bool,
191 
192     // Keep track of calls to create_userspace_mapping() for guest RAM.
193     // This is useful for getting the dirty pages, as we need to know the
194     // slots that the mappings are created in.
195     guest_ram_mappings: Vec<GuestRamMapping>,
196 
197     pub acpi_address: Option<GuestAddress>,
198     #[cfg(target_arch = "aarch64")]
199     uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
200 }
201 
202 #[derive(Debug)]
203 pub enum Error {
204     /// Failed to create shared file.
205     SharedFileCreate(io::Error),
206 
207     /// Failed to set shared file length.
208     SharedFileSetLen(io::Error),
209 
210     /// Mmap backed guest memory error
211     GuestMemory(MmapError),
212 
213     /// Failed to allocate a memory range.
214     MemoryRangeAllocation,
215 
216     /// Error from region creation
217     GuestMemoryRegion(MmapRegionError),
218 
219     /// No ACPI slot available
220     NoSlotAvailable,
221 
222     /// Not enough space in the hotplug RAM region
223     InsufficientHotplugRam,
224 
225     /// The requested hotplug memory addition is not a valid size
226     InvalidSize,
227 
228     /// Failed to create the user memory region.
229     CreateUserMemoryRegion(hypervisor::HypervisorVmError),
230 
231     /// Failed to remove the user memory region.
232     RemoveUserMemoryRegion(hypervisor::HypervisorVmError),
233 
234     /// Failed to create EventFd.
235     EventFdFail(io::Error),
236 
237     /// Eventfd write error
238     EventfdError(io::Error),
239 
240     /// Failed to resize virtio-mem
241     VirtioMemResizeFail(virtio_devices::mem::Error),
242 
243     /// Cannot restore VM
244     Restore(MigratableError),
245 
246     /// Cannot restore VM because source URL is missing
247     RestoreMissingSourceUrl,
248 
249     /// Cannot create the system allocator
250     CreateSystemAllocator,
251 
252     /// Invalid SGX EPC section size
253     #[cfg(target_arch = "x86_64")]
254     EpcSectionSizeInvalid,
255 
256     /// Failed allocating SGX EPC region
257     #[cfg(target_arch = "x86_64")]
258     SgxEpcRangeAllocation,
259 
260     /// Failed opening SGX virtual EPC device
261     #[cfg(target_arch = "x86_64")]
262     SgxVirtEpcOpen(io::Error),
263 
264     /// Failed setting the SGX virtual EPC section size
265     #[cfg(target_arch = "x86_64")]
266     SgxVirtEpcFileSetLen(io::Error),
267 
268     /// Failed opening SGX provisioning device
269     #[cfg(target_arch = "x86_64")]
270     SgxProvisionOpen(io::Error),
271 
272     /// Failed enabling SGX provisioning
273     #[cfg(target_arch = "x86_64")]
274     SgxEnableProvisioning(hypervisor::HypervisorVmError),
275 
276     /// Failed creating a new MmapRegion instance.
277     #[cfg(target_arch = "x86_64")]
278     NewMmapRegion(vm_memory::mmap::MmapRegionError),
279 
280     /// No memory zones found.
281     MissingMemoryZones,
282 
283     /// Memory configuration is not valid.
284     InvalidMemoryParameters,
285 
286     /// Forbidden operation. Impossible to resize guest memory if it is
287     /// backed by user defined memory regions.
288     InvalidResizeWithMemoryZones,
289 
290     /// It's invalid to try applying a NUMA policy to a memory zone that is
291     /// memory mapped with MAP_SHARED.
292     InvalidSharedMemoryZoneWithHostNuma,
293 
294     /// Failed applying NUMA memory policy.
295     ApplyNumaPolicy(io::Error),
296 
297     /// Memory zone identifier is not unique.
298     DuplicateZoneId,
299 
300     /// No virtio-mem resizing handler found.
301     MissingVirtioMemHandler,
302 
303     /// Unknown memory zone.
304     UnknownMemoryZone,
305 
306     /// Invalid size for resizing. Any size except 0 is valid.
307     InvalidHotplugSize,
308 
309     /// Invalid hotplug method associated with memory zones resizing capability.
310     InvalidHotplugMethodWithMemoryZones,
311 
312     /// Could not find specified memory zone identifier from hash map.
313     MissingZoneIdentifier,
314 
315     /// Resizing the memory zone failed.
316     ResizeZone,
317 
318     /// Guest address overflow
319     GuestAddressOverFlow,
320 
321     /// Error opening snapshot file
322     SnapshotOpen(io::Error),
323 
324     /// Error copying snapshot into region
325     SnapshotCopy(GuestMemoryError),
326 
327     /// Failed to allocate MMIO address
328     AllocateMmioAddress,
329 
330     #[cfg(target_arch = "aarch64")]
331     /// Failed to create UEFI flash
332     CreateUefiFlash(HypervisorVmError),
333 
334     /// Using a directory as a backing file for memory is not supported
335     DirectoryAsBackingFileForMemory,
336 
337     /// Failed to stat filesystem
338     GetFileSystemBlockSize(io::Error),
339 
340     /// Memory size is not aligned with the default page size or its hugepage size
341     MisalignedMemorySize,
342 }
343 
344 const ENABLE_FLAG: usize = 0;
345 const INSERTING_FLAG: usize = 1;
346 const REMOVING_FLAG: usize = 2;
347 const EJECT_FLAG: usize = 3;
348 
349 const BASE_OFFSET_LOW: u64 = 0;
350 const BASE_OFFSET_HIGH: u64 = 0x4;
351 const LENGTH_OFFSET_LOW: u64 = 0x8;
352 const LENGTH_OFFSET_HIGH: u64 = 0xC;
353 const STATUS_OFFSET: u64 = 0x14;
354 const SELECTION_OFFSET: u64 = 0;
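
// A summary of the register layout implemented by the BusDevice impl below:
// writing to offset 0 selects a hotplug slot, and subsequent reads return that
// slot's base address, length and status.
//   0x00  slot selection (write) / base address bits [31:0] (read)
//   0x04  base address bits [63:32]
//   0x08  length bits [31:0]
//   0x0C  length bits [63:32]
//   0x14  status flags: enable, inserting, removing (read); ack/eject (write)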
355 
356 // The MMIO address space size is reduced by 64 KiB. This is done for the
357 // following reasons:
358 //  - Reduce the addressable space size by at least 4 KiB to work around a Linux
359 //    bug when the VMM allocates devices at the end of the addressable space
360 //  - Windows requires the addressable space size to be 64 KiB aligned
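// For example, with phys_bits = 40 this yields 1 TiB - 64 KiB of addressable space.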
361 fn mmio_address_space_size(phys_bits: u8) -> u64 {
362     (1 << phys_bits) - (1 << 16)
363 }
364 
365 // The `statfs` function can retrieve information about a hugetlbfs mount, with the hugepage
366 // size reported in the `f_bsize` field.
367 //
368 // See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169
369 fn statfs_get_bsize(path: &str) -> Result<u64, Error> {
370     let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?;
371     let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit();
372 
373     // SAFETY: FFI call with a valid path and buffer
374     let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) };
375     if ret != 0 {
376         return Err(Error::GetFileSystemBlockSize(
377             std::io::Error::last_os_error(),
378         ));
379     }
380 
381     // SAFETY: `buf` is valid at this point
382     // Because this value is always positive, just convert it directly.
383     // Note that `f_bsize` is `i64` in glibc and `u64` in musl, so using `as u64` would be
384     // flagged by `clippy` on musl targets. To avoid the warning, `as _` is used instead of
385     // `as u64`.
386     let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _;
387     Ok(bsize)
388 }
389 
390 fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> {
391     // SAFETY: FFI call. Trivially safe.
392     let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
393 
394     // If there is no backing file and `hugepages` is disabled, just use the system page size.
395     if zone.file.is_none() && !zone.hugepages {
396         return Ok(page_size);
397     }
398 
399     // If `hugepages` is enabled and `hugepage_size` is specified, just use it directly.
400     if zone.hugepages && zone.hugepage_size.is_some() {
401         return Ok(zone.hugepage_size.unwrap());
402     }
403 
404     // There are two scenarios here:
405     //  - `hugepages` is enabled but `hugepage_size` is not specified:
406     //     Call `statfs` on `/dev/hugepages` to get the default hugepage size
407     //  - The backing file is specified:
408     //     Call `statfs` on the file and get its `f_bsize`.  If the value is larger than the
409     //     normal page size, use `f_bsize` because the file is in a hugetlbfs.  If the value
410     //     is less than or equal to the page size, just use the page size.
411     let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| {
412         pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
413     })?;
414 
415     let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
416 
417     Ok(align_size)
418 }
419 
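// Alignment helpers used below; `align` is assumed to be a power of two.
// For example, align_down(0x1234u64, 0x1000) == 0x1000 and
// is_aligned(0x2000u64, 0x1000) == true.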
420 #[inline]
421 fn align_down<T>(val: T, align: T) -> T
422 where
423     T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>,
424 {
425     val & !(align - 1u8.into())
426 }
427 
428 #[inline]
429 fn is_aligned<T>(val: T, align: T) -> bool
430 where
431     T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq,
432 {
433     (val & (align - 1u8.into())) == 0u8.into()
434 }
435 
436 impl BusDevice for MemoryManager {
437     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
438         if self.selected_slot < self.hotplug_slots.len() {
439             let state = &self.hotplug_slots[self.selected_slot];
440             match offset {
441                 BASE_OFFSET_LOW => {
442                     data.copy_from_slice(&state.base.to_le_bytes()[..4]);
443                 }
444                 BASE_OFFSET_HIGH => {
445                     data.copy_from_slice(&state.base.to_le_bytes()[4..]);
446                 }
447                 LENGTH_OFFSET_LOW => {
448                     data.copy_from_slice(&state.length.to_le_bytes()[..4]);
449                 }
450                 LENGTH_OFFSET_HIGH => {
451                     data.copy_from_slice(&state.length.to_le_bytes()[4..]);
452                 }
453                 STATUS_OFFSET => {
454                     // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
455                     data.fill(0);
456                     if state.active {
457                         data[0] |= 1 << ENABLE_FLAG;
458                     }
459                     if state.inserting {
460                         data[0] |= 1 << INSERTING_FLAG;
461                     }
462                     if state.removing {
463                         data[0] |= 1 << REMOVING_FLAG;
464                     }
465                 }
466                 _ => {
467                     warn!(
468                         "Unexpected offset for accessing memory manager device: {:#}",
469                         offset
470                     );
471                 }
472             }
473         } else {
474             warn!("Out of range memory slot: {}", self.selected_slot);
475         }
476     }
477 
478     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
479         match offset {
480             SELECTION_OFFSET => {
481                 self.selected_slot = usize::from(data[0]);
482             }
483             STATUS_OFFSET => {
484                 if self.selected_slot < self.hotplug_slots.len() {
485                     let state = &mut self.hotplug_slots[self.selected_slot];
486                     // The ACPI code writes back a 1 to acknowledge the insertion
487                     if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
488                         state.inserting = false;
489                     }
490                     // Ditto for removal
491                     if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
492                         state.removing = false;
493                     }
494                     // Trigger removal of "DIMM"
495                     if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
496                         warn!("Ejection of memory not currently supported");
497                     }
498                 } else {
499                     warn!("Out of range memory slot: {}", self.selected_slot);
500                 }
501             }
502             _ => {
503                 warn!(
504                     "Unexpected offset for accessing memory manager device: {:#}",
505                     offset
506                 );
507             }
508         };
509         None
510     }
511 }
512 
513 impl MemoryManager {
514     /// Creates all memory regions based on the available RAM ranges defined
515     /// by `ram_regions`, and based on the description of the memory zones.
516     /// In practice, this function can perform multiple memory mappings of the
517     /// same backing file if there's a hole in the address space between two
518     /// RAM ranges.
519     /// One example might be `ram_regions` containing two regions (0-3G and 4G-6G)
520     /// and `zones` containing two zones (of size 1G and 4G).
521     /// This function will create 3 resulting memory regions:
522     /// - The first mapping the first memory zone entirely onto the 0-1G range
523     /// - The second mapping part of the second memory zone onto the 1G-3G range
524     /// - The third mapping the rest of the second memory zone onto the 4G-6G range
525     /// Also, all memory regions are page-size aligned (i.e. their sizes must
526     /// be multiples of the page size), which may leave an additional hole in the
527     /// address space when hugepages are used.
528     fn create_memory_regions_from_zones(
529         ram_regions: &[(GuestAddress, usize)],
530         zones: &[MemoryZoneConfig],
531         prefault: Option<bool>,
532         thp: bool,
533     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
534         let mut zone_iter = zones.iter();
535         let mut mem_regions = Vec::new();
536         let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?;
537         let mut zone_align_size = memory_zone_get_align_size(zone)?;
538         let mut zone_offset = 0u64;
539         let mut memory_zones = HashMap::new();
540 
541         if !is_aligned(zone.size, zone_align_size) {
542             return Err(Error::MisalignedMemorySize);
543         }
544 
545         // Add zone id to the list of memory zones.
546         memory_zones.insert(zone.id.clone(), MemoryZone::default());
547 
548         for ram_region in ram_regions.iter() {
549             let mut ram_region_offset = 0;
550             let mut exit = false;
551 
552             loop {
553                 let mut ram_region_consumed = false;
554                 let mut pull_next_zone = false;
555 
556                 let ram_region_available_size =
557                     align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size);
558                 if ram_region_available_size == 0 {
559                     break;
560                 }
561                 let zone_sub_size = zone.size - zone_offset;
562 
563                 let file_offset = zone_offset;
564                 let region_start = ram_region
565                     .0
566                     .checked_add(ram_region_offset)
567                     .ok_or(Error::GuestAddressOverFlow)?;
568                 let region_size = if zone_sub_size <= ram_region_available_size {
569                     if zone_sub_size == ram_region_available_size {
570                         ram_region_consumed = true;
571                     }
572 
573                     ram_region_offset += zone_sub_size;
574                     pull_next_zone = true;
575 
576                     zone_sub_size
577                 } else {
578                     zone_offset += ram_region_available_size;
579                     ram_region_consumed = true;
580 
581                     ram_region_available_size
582                 };
583 
584                 info!(
585                     "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}",
586                     zone.id,
587                     region_start.raw_value(),
588                     region_size
589                 );
590                 let region = MemoryManager::create_ram_region(
591                     &zone.file,
592                     file_offset,
593                     region_start,
594                     region_size as usize,
595                     prefault.unwrap_or(zone.prefault),
596                     zone.shared,
597                     zone.hugepages,
598                     zone.hugepage_size,
599                     zone.host_numa_node,
600                     None,
601                     thp,
602                 )?;
603 
604                 // Add region to the list of regions associated with the
605                 // current memory zone.
606                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
607                     memory_zone.regions.push(region.clone());
608                 }
609 
610                 mem_regions.push(region);
611 
612                 if pull_next_zone {
613                     // Get the next zone and reset the offset.
614                     zone_offset = 0;
615                     if let Some(z) = zone_iter.next() {
616                         zone = z;
617                     } else {
618                         exit = true;
619                         break;
620                     }
621                     zone_align_size = memory_zone_get_align_size(zone)?;
622                     if !is_aligned(zone.size, zone_align_size) {
623                         return Err(Error::MisalignedMemorySize);
624                     }
625 
626                     // Check if the zone id already exists. If it does, return
627                     // an error as we need unique identifiers. Otherwise, add
628                     // the new zone id to the list of memory zones.
629                     if memory_zones.contains_key(&zone.id) {
630                         error!(
631                             "Memory zone identifier '{}' found more than once. \
632                             It must be unique",
633                             zone.id,
634                         );
635                         return Err(Error::DuplicateZoneId);
636                     }
637                     memory_zones.insert(zone.id.clone(), MemoryZone::default());
638                 }
639 
640                 if ram_region_consumed {
641                     break;
642                 }
643             }
644 
645             if exit {
646                 break;
647             }
648         }
649 
650         Ok((mem_regions, memory_zones))
651     }
652 
653     // Restore both the GuestMemory regions and the MemoryZone zones.
654     fn restore_memory_regions_and_zones(
655         guest_ram_mappings: &[GuestRamMapping],
656         zones_config: &[MemoryZoneConfig],
657         prefault: Option<bool>,
658         mut existing_memory_files: HashMap<u32, File>,
659         thp: bool,
660     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
661         let mut memory_regions = Vec::new();
662         let mut memory_zones = HashMap::new();
663 
664         for zone_config in zones_config {
665             memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
666         }
667 
668         for guest_ram_mapping in guest_ram_mappings {
669             for zone_config in zones_config {
670                 if guest_ram_mapping.zone_id == zone_config.id {
671                     let region = MemoryManager::create_ram_region(
672                         &zone_config.file,
673                         guest_ram_mapping.file_offset,
674                         GuestAddress(guest_ram_mapping.gpa),
675                         guest_ram_mapping.size as usize,
676                         prefault.unwrap_or(zone_config.prefault),
677                         zone_config.shared,
678                         zone_config.hugepages,
679                         zone_config.hugepage_size,
680                         zone_config.host_numa_node,
681                         existing_memory_files.remove(&guest_ram_mapping.slot),
682                         thp,
683                     )?;
684                     memory_regions.push(Arc::clone(&region));
685                     if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
686                         if guest_ram_mapping.virtio_mem {
687                             let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
688                             let region_size = region.len();
689                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
690                                 region,
691                                 virtio_device: None,
692                                 hotplugged_size,
693                                 hugepages: zone_config.hugepages,
694                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
695                             });
696                         } else {
697                             memory_zone.regions.push(region);
698                         }
699                     }
700                 }
701             }
702         }
703 
704         memory_regions.sort_by_key(|x| x.start_addr());
705 
706         Ok((memory_regions, memory_zones))
707     }
708 
709     fn fill_saved_regions(
710         &mut self,
711         file_path: PathBuf,
712         saved_regions: MemoryRangeTable,
713     ) -> Result<(), Error> {
714         if saved_regions.is_empty() {
715             return Ok(());
716         }
717 
718         // Open (read only) the snapshot file.
719         let mut memory_file = OpenOptions::new()
720             .read(true)
721             .open(file_path)
722             .map_err(Error::SnapshotOpen)?;
723 
724         let guest_memory = self.guest_memory.memory();
725         for range in saved_regions.regions() {
726             let mut offset: u64 = 0;
727             // Here we manually handle the retry in case we can't fill
728             // the whole region at once, because we can't use the
729             // read_exact_from() implementation from vm-memory::GuestMemory as
730             // it does not behave correctly. For more info about this issue
731             // see: https://github.com/rust-vmm/vm-memory/issues/174
732             loop {
733                 let bytes_read = guest_memory
734                     .read_volatile_from(
735                         GuestAddress(range.gpa + offset),
736                         &mut memory_file,
737                         (range.length - offset) as usize,
738                     )
739                     .map_err(Error::SnapshotCopy)?;
740                 offset += bytes_read as u64;
741 
742                 if offset == range.length {
743                     break;
744                 }
745             }
746         }
747 
748         Ok(())
749     }
750 
751     fn validate_memory_config(
752         config: &MemoryConfig,
753         user_provided_zones: bool,
754     ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
755         let mut allow_mem_hotplug = false;
756 
757         if !user_provided_zones {
758             if config.zones.is_some() {
759                 error!(
760                     "User defined memory regions can't be provided if the \
761                     memory size is not 0"
762                 );
763                 return Err(Error::InvalidMemoryParameters);
764             }
765 
766             if config.hotplug_size.is_some() {
767                 allow_mem_hotplug = true;
768             }
769 
770             if let Some(hotplugged_size) = config.hotplugged_size {
771                 if let Some(hotplug_size) = config.hotplug_size {
772                     if hotplugged_size > hotplug_size {
773                         error!(
774                             "'hotplugged_size' {} can't be bigger than \
775                             'hotplug_size' {}",
776                             hotplugged_size, hotplug_size,
777                         );
778                         return Err(Error::InvalidMemoryParameters);
779                     }
780                 } else {
781                     error!(
782                         "Invalid to define 'hotplugged_size' when there is \
783                         no 'hotplug_size'"
784                     );
785                     return Err(Error::InvalidMemoryParameters);
786                 }
787                 if config.hotplug_method == HotplugMethod::Acpi {
788                     error!(
789                         "Invalid to define 'hotplugged_size' with hotplug \
790                         method 'acpi'"
791                     );
792                     return Err(Error::InvalidMemoryParameters);
793                 }
794             }
795 
796             // Create a single zone from the global memory config. This lets
797             // us reuse the codepath for user defined memory zones.
798             let zones = vec![MemoryZoneConfig {
799                 id: String::from(DEFAULT_MEMORY_ZONE),
800                 size: config.size,
801                 file: None,
802                 shared: config.shared,
803                 hugepages: config.hugepages,
804                 hugepage_size: config.hugepage_size,
805                 host_numa_node: None,
806                 hotplug_size: config.hotplug_size,
807                 hotplugged_size: config.hotplugged_size,
808                 prefault: config.prefault,
809             }];
810 
811             Ok((config.size, zones, allow_mem_hotplug))
812         } else {
813             if config.zones.is_none() {
814                 error!(
815                     "User defined memory regions must be provided if the \
816                     memory size is 0"
817                 );
818                 return Err(Error::MissingMemoryZones);
819             }
820 
821             // Safe to unwrap as we checked right above that some zones
822             // were provided.
823             let zones = config.zones.clone().unwrap();
824             if zones.is_empty() {
825                 return Err(Error::MissingMemoryZones);
826             }
827 
828             let mut total_ram_size: u64 = 0;
829             for zone in zones.iter() {
830                 total_ram_size += zone.size;
831 
832                 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
833                     error!(
834                         "Invalid to set host NUMA policy for a memory zone \
835                         backed by a regular file and mapped as 'shared'"
836                     );
837                     return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
838                 }
839 
840                 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
841                     error!("Invalid to set ACPI hotplug method for memory zones");
842                     return Err(Error::InvalidHotplugMethodWithMemoryZones);
843                 }
844 
845                 if let Some(hotplugged_size) = zone.hotplugged_size {
846                     if let Some(hotplug_size) = zone.hotplug_size {
847                         if hotplugged_size > hotplug_size {
848                             error!(
849                                 "'hotplugged_size' {} can't be bigger than \
850                                 'hotplug_size' {}",
851                                 hotplugged_size, hotplug_size,
852                             );
853                             return Err(Error::InvalidMemoryParameters);
854                         }
855                     } else {
856                         error!(
857                             "Invalid to define 'hotplugged_size' when there is \
858                             no 'hotplug_size' for a memory zone"
859                         );
860                         return Err(Error::InvalidMemoryParameters);
861                     }
862                     if config.hotplug_method == HotplugMethod::Acpi {
863                         error!(
864                             "Invalid to define 'hotplugged_size' with hotplug \
865                             method 'acpi'"
866                         );
867                         return Err(Error::InvalidMemoryParameters);
868                     }
869                 }
870             }
871 
872             Ok((total_ram_size, zones, allow_mem_hotplug))
873         }
874     }
875 
876     pub fn allocate_address_space(&mut self) -> Result<(), Error> {
877         let mut list = Vec::new();
878 
879         for (zone_id, memory_zone) in self.memory_zones.iter() {
880             let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
881                 memory_zone
882                     .regions()
883                     .iter()
884                     .map(|r| (r.clone(), false))
885                     .collect();
886 
887             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
888                 regions.push((virtio_mem_zone.region().clone(), true));
889             }
890 
891             list.push((zone_id.clone(), regions));
892         }
893 
894         for (zone_id, regions) in list {
895             for (region, virtio_mem) in regions {
896                 let slot = self.create_userspace_mapping(
897                     region.start_addr().raw_value(),
898                     region.len(),
899                     region.as_ptr() as u64,
900                     self.mergeable,
901                     false,
902                     self.log_dirty,
903                 )?;
904 
905                 let file_offset = if let Some(file_offset) = region.file_offset() {
906                     file_offset.start()
907                 } else {
908                     0
909                 };
910 
911                 self.guest_ram_mappings.push(GuestRamMapping {
912                     gpa: region.start_addr().raw_value(),
913                     size: region.len(),
914                     slot,
915                     zone_id: zone_id.clone(),
916                     virtio_mem,
917                     file_offset,
918                 });
919                 self.ram_allocator
920                     .allocate(Some(region.start_addr()), region.len(), None)
921                     .ok_or(Error::MemoryRangeAllocation)?;
922             }
923         }
924 
925         // Allocate SubRegion and Reserved address ranges.
926         for region in self.arch_mem_regions.iter() {
927             if region.r_type == RegionType::Ram {
928                 // Ignore the RAM type since ranges have already been allocated
929                 // based on the GuestMemory regions.
930                 continue;
931             }
932             self.ram_allocator
933                 .allocate(
934                     Some(GuestAddress(region.base)),
935                     region.size as GuestUsize,
936                     None,
937                 )
938                 .ok_or(Error::MemoryRangeAllocation)?;
939         }
940 
941         Ok(())
942     }
943 
944     #[cfg(target_arch = "aarch64")]
945     fn add_uefi_flash(&mut self) -> Result<(), Error> {
946         // On AArch64, the UEFI binary requires a flash device at address 0.
947         // A 4 MiB memory region is mapped to simulate the flash.
948         let uefi_mem_slot = self.allocate_memory_slot();
949         let uefi_region = GuestRegionMmap::new(
950             MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(),
951             arch::layout::UEFI_START,
952         )
953         .unwrap();
954         let uefi_mem_region = self.vm.make_user_memory_region(
955             uefi_mem_slot,
956             uefi_region.start_addr().raw_value(),
957             uefi_region.len(),
958             uefi_region.as_ptr() as u64,
959             false,
960             false,
961         );
962         self.vm
963             .create_user_memory_region(uefi_mem_region)
964             .map_err(Error::CreateUefiFlash)?;
965 
966         let uefi_flash =
967             GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap());
968 
969         self.uefi_flash = Some(uefi_flash);
970 
971         Ok(())
972     }
973 
974     #[allow(clippy::too_many_arguments)]
975     pub fn new(
976         vm: Arc<dyn hypervisor::Vm>,
977         config: &MemoryConfig,
978         prefault: Option<bool>,
979         phys_bits: u8,
980         #[cfg(feature = "tdx")] tdx_enabled: bool,
981         restore_data: Option<&MemoryManagerSnapshotData>,
982         existing_memory_files: Option<HashMap<u32, File>>,
983         #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
984     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
985         trace_scoped!("MemoryManager::new");
986 
987         let user_provided_zones = config.size == 0;
988 
989         let mmio_address_space_size = mmio_address_space_size(phys_bits);
990         debug_assert_eq!(
991             (((mmio_address_space_size) >> 16) << 16),
992             mmio_address_space_size
993         );
994         let start_of_platform_device_area =
995             GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
996         let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);
997 
998         let (ram_size, zones, allow_mem_hotplug) =
999             Self::validate_memory_config(config, user_provided_zones)?;
1000 
1001         let (
1002             start_of_device_area,
1003             boot_ram,
1004             current_ram,
1005             arch_mem_regions,
1006             memory_zones,
1007             guest_memory,
1008             boot_guest_memory,
1009             hotplug_slots,
1010             next_memory_slot,
1011             selected_slot,
1012             next_hotplug_slot,
1013         ) = if let Some(data) = restore_data {
1014             let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
1015                 &data.guest_ram_mappings,
1016                 &zones,
1017                 prefault,
1018                 existing_memory_files.unwrap_or_default(),
1019                 config.thp,
1020             )?;
1021             let guest_memory =
1022                 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
1023             let boot_guest_memory = guest_memory.clone();
1024             (
1025                 GuestAddress(data.start_of_device_area),
1026                 data.boot_ram,
1027                 data.current_ram,
1028                 data.arch_mem_regions.clone(),
1029                 memory_zones,
1030                 guest_memory,
1031                 boot_guest_memory,
1032                 data.hotplug_slots.clone(),
1033                 data.next_memory_slot,
1034                 data.selected_slot,
1035                 data.next_hotplug_slot,
1036             )
1037         } else {
1038             // Init guest memory
1039             let arch_mem_regions = arch::arch_memory_regions();
1040 
1041             let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
1042                 .iter()
1043                 .filter(|r| r.2 == RegionType::Ram)
1044                 .map(|r| (r.0, r.1))
1045                 .collect();
1046 
1047             let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
1048                 .iter()
1049                 .map(|(a, b, c)| ArchMemRegion {
1050                     base: a.0,
1051                     size: *b,
1052                     r_type: *c,
1053                 })
1054                 .collect();
1055 
1056             let (mem_regions, mut memory_zones) =
1057                 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?;
1058 
1059             let mut guest_memory =
1060                 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;
1061 
1062             let boot_guest_memory = guest_memory.clone();
1063 
1064             let mut start_of_device_area =
1065                 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;
1066 
1067             // Update list of memory zones for resize.
1068             for zone in zones.iter() {
1069                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
1070                     if let Some(hotplug_size) = zone.hotplug_size {
1071                         if hotplug_size == 0 {
1072                             error!("'hotplug_size' can't be 0");
1073                             return Err(Error::InvalidHotplugSize);
1074                         }
1075 
1076                         if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
1077                             start_of_device_area = start_of_device_area
1078                                 .checked_add(hotplug_size)
1079                                 .ok_or(Error::GuestAddressOverFlow)?;
1080                         } else {
1081                             // Alignment must be "natural" i.e. same as size of block
1082                             let start_addr = GuestAddress(
1083                                 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE
1084                                     - 1)
1085                                     / virtio_devices::VIRTIO_MEM_ALIGN_SIZE
1086                                     * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
1087                             );
1088 
1089                             // When `prefault` is set by vm_restore, the memory manager
1090                             // will create the RAM region with the `prefault` option from
1091                             // the restore config rather than the one from the zone.
1092                             let region = MemoryManager::create_ram_region(
1093                                 &None,
1094                                 0,
1095                                 start_addr,
1096                                 hotplug_size as usize,
1097                                 prefault.unwrap_or(zone.prefault),
1098                                 zone.shared,
1099                                 zone.hugepages,
1100                                 zone.hugepage_size,
1101                                 zone.host_numa_node,
1102                                 None,
1103                                 config.thp,
1104                             )?;
1105 
1106                             guest_memory = guest_memory
1107                                 .insert_region(Arc::clone(&region))
1108                                 .map_err(Error::GuestMemory)?;
1109 
1110                             let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
1111                             let region_size = region.len();
1112                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
1113                                 region,
1114                                 virtio_device: None,
1115                                 hotplugged_size,
1116                                 hugepages: zone.hugepages,
1117                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
1118                             });
1119 
1120                             start_of_device_area = start_addr
1121                                 .checked_add(hotplug_size)
1122                                 .ok_or(Error::GuestAddressOverFlow)?;
1123                         }
1124                     }
1125                 } else {
1126                     return Err(Error::MissingZoneIdentifier);
1127                 }
1128             }
1129 
1130             let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
1131             hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);
1132 
1133             (
1134                 start_of_device_area,
1135                 ram_size,
1136                 ram_size,
1137                 arch_mem_regions,
1138                 memory_zones,
1139                 guest_memory,
1140                 boot_guest_memory,
1141                 hotplug_slots,
1142                 0,
1143                 0,
1144                 0,
1145             )
1146         };
1147 
1148         let guest_memory = GuestMemoryAtomic::new(guest_memory);
1149 
1150         // Both MMIO and PIO address spaces start at address 0.
1151         let allocator = Arc::new(Mutex::new(
1152             SystemAllocator::new(
1153                 #[cfg(target_arch = "x86_64")]
1154                 {
1155                     GuestAddress(0)
1156                 },
1157                 #[cfg(target_arch = "x86_64")]
1158                 {
1159                     1 << 16
1160                 },
1161                 start_of_platform_device_area,
1162                 PLATFORM_DEVICE_AREA_SIZE,
1163                 #[cfg(target_arch = "x86_64")]
1164                 vec![GsiApic::new(
1165                     X86_64_IRQ_BASE,
1166                     ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
1167                 )],
1168             )
1169             .ok_or(Error::CreateSystemAllocator)?,
1170         ));
1171 
1172         #[cfg(not(feature = "tdx"))]
1173         let dynamic = true;
1174         #[cfg(feature = "tdx")]
1175         let dynamic = !tdx_enabled;
1176 
1177         let acpi_address = if dynamic
1178             && config.hotplug_method == HotplugMethod::Acpi
1179             && (config.hotplug_size.unwrap_or_default() > 0)
1180         {
1181             Some(
1182                 allocator
1183                     .lock()
1184                     .unwrap()
1185                     .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
1186                     .ok_or(Error::AllocateMmioAddress)?,
1187             )
1188         } else {
1189             None
1190         };
1191 
1192         // If running with SGX, the start of the device area and the RAM area may diverge, but
1193         // at this point they are next to each other.
1194         let end_of_ram_area = start_of_device_area.unchecked_sub(1);
1195         let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();
1196 
1197         let mut memory_manager = MemoryManager {
1198             boot_guest_memory,
1199             guest_memory,
1200             next_memory_slot,
1201             start_of_device_area,
1202             end_of_device_area,
1203             end_of_ram_area,
1204             vm,
1205             hotplug_slots,
1206             selected_slot,
1207             mergeable: config.mergeable,
1208             allocator,
1209             hotplug_method: config.hotplug_method,
1210             boot_ram,
1211             current_ram,
1212             next_hotplug_slot,
1213             shared: config.shared,
1214             hugepages: config.hugepages,
1215             hugepage_size: config.hugepage_size,
1216             prefault: config.prefault,
1217             #[cfg(target_arch = "x86_64")]
1218             sgx_epc_region: None,
1219             user_provided_zones,
1220             snapshot_memory_ranges: MemoryRangeTable::default(),
1221             memory_zones,
1222             guest_ram_mappings: Vec::new(),
1223             acpi_address,
1224             log_dirty: dynamic, // Cannot log dirty pages on a TD
1225             arch_mem_regions,
1226             ram_allocator,
1227             dynamic,
1228             #[cfg(target_arch = "aarch64")]
1229             uefi_flash: None,
1230             thp: config.thp,
1231         };
1232 
1233         #[cfg(target_arch = "aarch64")]
1234         {
1235             // For AArch64 we cannot lazily allocate the address space like we
1236             // do for x86, because while restoring a VM from snapshot we would
1237             // need the address space to be allocated to properly restore VGIC.
1238             // And the restore of VGIC happens before we attempt to run the vCPUs
1239             // for the first time, thus we need to allocate the address space
1240             // beforehand.
1241             memory_manager.allocate_address_space()?;
1242             memory_manager.add_uefi_flash()?;
1243         }
1244 
1245         #[cfg(target_arch = "x86_64")]
1246         if let Some(sgx_epc_config) = sgx_epc_config {
1247             memory_manager.setup_sgx(sgx_epc_config)?;
1248         }
1249 
1250         Ok(Arc::new(Mutex::new(memory_manager)))
1251     }
1252 
1253     pub fn new_from_snapshot(
1254         snapshot: &Snapshot,
1255         vm: Arc<dyn hypervisor::Vm>,
1256         config: &MemoryConfig,
1257         source_url: Option<&str>,
1258         prefault: bool,
1259         phys_bits: u8,
1260     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
1261         if let Some(source_url) = source_url {
1262             let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
1263             memory_file_path.push(String::from(SNAPSHOT_FILENAME));
1264 
1265             let mem_snapshot: MemoryManagerSnapshotData =
1266                 snapshot.to_versioned_state().map_err(Error::Restore)?;
1267 
1268             let mm = MemoryManager::new(
1269                 vm,
1270                 config,
1271                 Some(prefault),
1272                 phys_bits,
1273                 #[cfg(feature = "tdx")]
1274                 false,
1275                 Some(&mem_snapshot),
1276                 None,
1277                 #[cfg(target_arch = "x86_64")]
1278                 None,
1279             )?;
1280 
1281             mm.lock()
1282                 .unwrap()
1283                 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;
1284 
1285             Ok(mm)
1286         } else {
1287             Err(Error::RestoreMissingSourceUrl)
1288         }
1289     }
1290 
1291     fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
1292         // SAFETY: FFI call with correct arguments
1293         let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };
1294 
1295         if res < 0 {
1296             Err(io::Error::last_os_error())
1297         } else {
1298             Ok(res as RawFd)
1299         }
1300     }
1301 
1302     fn mbind(
1303         addr: *mut u8,
1304         len: u64,
1305         mode: u32,
1306         nodemask: Vec<u64>,
1307         maxnode: u64,
1308         flags: u32,
1309     ) -> Result<(), io::Error> {
1310         // SAFETY: FFI call with correct arguments
1311         let res = unsafe {
1312             libc::syscall(
1313                 libc::SYS_mbind,
1314                 addr as *mut libc::c_void,
1315                 len,
1316                 mode,
1317                 nodemask.as_ptr(),
1318                 maxnode,
1319                 flags,
1320             )
1321         };
1322 
1323         if res < 0 {
1324             Err(io::Error::last_os_error())
1325         } else {
1326             Ok(())
1327         }
1328     }
1329 
1330     fn create_anonymous_file(
1331         size: usize,
1332         hugepages: bool,
1333         hugepage_size: Option<u64>,
1334     ) -> Result<FileOffset, Error> {
1335         let fd = Self::memfd_create(
1336             &ffi::CString::new("ch_ram").unwrap(),
1337             libc::MFD_CLOEXEC
1338                 | if hugepages {
1339                     libc::MFD_HUGETLB
1340                         | if let Some(hugepage_size) = hugepage_size {
1341                             /*
1342                              * From the Linux kernel:
1343                              * Several system calls take a flag to request "hugetlb" huge pages.
1344                              * Without further specification, these system calls will use the
1345                              * system's default huge page size.  If a system supports multiple
1346                              * huge page sizes, the desired huge page size can be specified in
1347                              * bits [26:31] of the flag arguments.  The value in these 6 bits
1348                              * will encode the log2 of the huge page size.
1349                              */
1350 
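                            // For example, for a 2 MiB huge page size,
                            // trailing_zeros() is 21, so this expression
                            // evaluates to 21 << 26 (the MFD_HUGE_2MB encoding).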
1351                             hugepage_size.trailing_zeros() << 26
1352                         } else {
1353                             // Use the system default huge page size
1354                             0
1355                         }
1356                 } else {
1357                     0
1358                 },
1359         )
1360         .map_err(Error::SharedFileCreate)?;
1361 
1362         // SAFETY: fd is valid
1363         let f = unsafe { File::from_raw_fd(fd) };
1364         f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1365 
1366         Ok(FileOffset::new(f, 0))
1367     }
1368 
1369     fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> {
1370         if backing_file.is_dir() {
1371             Err(Error::DirectoryAsBackingFileForMemory)
1372         } else {
1373             let f = OpenOptions::new()
1374                 .read(true)
1375                 .write(true)
1376                 .open(backing_file)
1377                 .map_err(Error::SharedFileCreate)?;
1378 
1379             Ok(FileOffset::new(f, file_offset))
1380         }
1381     }
1382 
1383     #[allow(clippy::too_many_arguments)]
1384     pub fn create_ram_region(
1385         backing_file: &Option<PathBuf>,
1386         file_offset: u64,
1387         start_addr: GuestAddress,
1388         size: usize,
1389         prefault: bool,
1390         shared: bool,
1391         hugepages: bool,
1392         hugepage_size: Option<u64>,
1393         host_numa_node: Option<u32>,
1394         existing_memory_file: Option<File>,
1395         thp: bool,
1396     ) -> Result<Arc<GuestRegionMmap>, Error> {
1397         let mut mmap_flags = libc::MAP_NORESERVE;
1398 
1399         // The duplication of mmap_flags ORing here is unfortunate but it also makes
1400         // the complexity of the handling clear.
1401         let fo = if let Some(f) = existing_memory_file {
1402             // It must be MAP_SHARED, otherwise we wouldn't already have an FD
1403             mmap_flags |= libc::MAP_SHARED;
1404             Some(FileOffset::new(f, file_offset))
1405         } else if let Some(backing_file) = backing_file {
1406             if shared {
1407                 mmap_flags |= libc::MAP_SHARED;
1408             } else {
1409                 mmap_flags |= libc::MAP_PRIVATE;
1410             }
1411             Some(Self::open_backing_file(backing_file, file_offset)?)
1412         } else if shared || hugepages {
1413             // For hugepages we must also use MAP_SHARED, otherwise we will trigger
1414             // #4805: MAP_PRIVATE would trigger CoW against the backing file when
1415             // combined with VFIO pinning
1416             mmap_flags |= libc::MAP_SHARED;
1417             Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?)
1418         } else {
1419             mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
1420             None
1421         };
1422 
1423         if prefault {
1424             mmap_flags |= libc::MAP_POPULATE;
1425         }
1426 
1427         let region = GuestRegionMmap::new(
1428             MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
1429                 .map_err(Error::GuestMemoryRegion)?,
1430             start_addr,
1431         )
1432         .map_err(Error::GuestMemory)?;
1433 
1434         if region.file_offset().is_none() && thp {
1435             info!(
1436                 "Anonymous mapping at 0x{:x} (size = 0x{:x})",
1437                 region.as_ptr() as u64,
1438                 size
1439             );
1440             // SAFETY: FFI call with correct arguments
1441             let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
1442             if ret != 0 {
1443                 let e = io::Error::last_os_error();
1444                 warn!("Failed to mark pages as THP eligible: {}", e);
1445             }
1446         }
1447 
1448         // Apply NUMA policy if needed.
1449         if let Some(node) = host_numa_node {
1450             let addr = region.deref().as_ptr();
1451             let len = region.deref().size() as u64;
1452             let mode = MPOL_BIND;
1453             let mut nodemask: Vec<u64> = Vec::new();
1454             let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
1455 
1456             // Linux is kind of buggy in the way it interprets maxnode as it
1457             // will cut off the last node. That's why we have to add 1 to what
1458             // we would consider as the proper maxnode value.
1459             let maxnode = node as u64 + 1 + 1;
1460 
1461             // Allocate the right size for the vector.
1462             nodemask.resize((node as usize / 64) + 1, 0);
1463 
1464             // Fill the global bitmask through the nodemask vector.
1465             let idx = (node / 64) as usize;
1466             let shift = node % 64;
1467             nodemask[idx] |= 1u64 << shift;
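            // Worked example (note added for clarity): for node 1, maxnode is 3,
            // idx is 0 and shift is 1, so nodemask ends up as vec![0b10] with only
            // the bit for node 1 set.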
1468 
1469             // Policies are enforced by using MPOL_MF_MOVE flag as it will
1470             // force the kernel to move all pages that might have been already
1471             // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
1472             // used to throw an error if MPOL_MF_MOVE didn't succeed.
1473             // MPOL_BIND is the selected mode as it specifies a strict policy
1474             // that restricts memory allocation to the nodes specified in the
1475             // nodemask.
1476             Self::mbind(addr, len, mode, nodemask, maxnode, flags)
1477                 .map_err(Error::ApplyNumaPolicy)?;
1478         }
1479 
1480         Ok(Arc::new(region))
1481     }
1482 
1483     // Update the GuestMemoryMmap with the new range
1484     fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
1485         let guest_memory = self
1486             .guest_memory
1487             .memory()
1488             .insert_region(region)
1489             .map_err(Error::GuestMemory)?;
1490         self.guest_memory.lock().unwrap().replace(guest_memory);
1491 
1492         Ok(())
1493     }
1494 
1495     //
1496     // Calculate the start address of an area next to RAM.
1497     //
1498     // If memory hotplug is allowed, the start address needs to be aligned
1499     // (rounded up) to a 128MiB boundary.
1500     // If memory hotplug is not allowed, there is no alignment required.
1501     // In either case, if RAM ends below the 32-bit reserved area, the area starts at the 64-bit RAM start.
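    //
    // Worked example (note added for clarity): with hotplug allowed and mem_end at
    // 0x1234_5678, OR-ing with 128MiB - 1 (0x07FF_FFFF) gives 0x17FF_FFFF, and the
    // subsequent +1 yields 0x1800_0000, the next 128MiB boundary.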
1502     fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
1503         let mut start_addr = if allow_mem_hotplug {
1504             GuestAddress(mem_end.0 | ((128 << 20) - 1))
1505         } else {
1506             mem_end
1507         };
1508 
1509         start_addr = start_addr
1510             .checked_add(1)
1511             .ok_or(Error::GuestAddressOverFlow)?;
1512 
1513         if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
1514             return Ok(arch::layout::RAM_64BIT_START);
1515         }
1516 
1517         Ok(start_addr)
1518     }
1519 
1520     pub fn add_ram_region(
1521         &mut self,
1522         start_addr: GuestAddress,
1523         size: usize,
1524     ) -> Result<Arc<GuestRegionMmap>, Error> {
1525         // Allocate memory for the region
1526         let region = MemoryManager::create_ram_region(
1527             &None,
1528             0,
1529             start_addr,
1530             size,
1531             self.prefault,
1532             self.shared,
1533             self.hugepages,
1534             self.hugepage_size,
1535             None,
1536             None,
1537             self.thp,
1538         )?;
1539 
1540         // Map it into the guest
1541         let slot = self.create_userspace_mapping(
1542             region.start_addr().0,
1543             region.len(),
1544             region.as_ptr() as u64,
1545             self.mergeable,
1546             false,
1547             self.log_dirty,
1548         )?;
1549         self.guest_ram_mappings.push(GuestRamMapping {
1550             gpa: region.start_addr().raw_value(),
1551             size: region.len(),
1552             slot,
1553             zone_id: DEFAULT_MEMORY_ZONE.to_string(),
1554             virtio_mem: false,
1555             file_offset: 0,
1556         });
1557 
1558         self.add_region(Arc::clone(&region))?;
1559 
1560         Ok(region)
1561     }
1562 
1563     fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
1564         info!("Hotplugging new RAM: {}", size);
1565 
1566         // Check that there is a free slot
1567         if self.next_hotplug_slot >= HOTPLUG_COUNT {
1568             return Err(Error::NoSlotAvailable);
1569         }
1570 
1571         // "Inserted" DIMM must have a size that is a multiple of 128MiB
1572         if size % (128 << 20) != 0 {
1573             return Err(Error::InvalidSize);
1574         }
1575 
1576         let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;
1577 
1578         if start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area {
1579             return Err(Error::InsufficientHotplugRam);
1580         }
1581 
1582         let region = self.add_ram_region(start_addr, size)?;
1583 
1584         // Add region to the list of regions associated with the default
1585         // memory zone.
1586         if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
1587             memory_zone.regions.push(Arc::clone(&region));
1588         }
1589 
1590         // Tell the allocator
1591         self.ram_allocator
1592             .allocate(Some(start_addr), size as GuestUsize, None)
1593             .ok_or(Error::MemoryRangeAllocation)?;
1594 
1595         // Update the slot so that it can be queried via the I/O port
1596         let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
1597         slot.active = true;
1598         slot.inserting = true;
1599         slot.base = region.start_addr().0;
1600         slot.length = region.len();
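        // Note added for clarity: the `inserting` flag set above is what the guest
        // observes through the MINS field of the ACPI memory hotplug controller; the
        // MSCN method defined further down scans the slots and sends a Notify for any
        // slot with MINS set.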
1601 
1602         self.next_hotplug_slot += 1;
1603 
1604         Ok(region)
1605     }
1606 
1607     pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1608         self.guest_memory.clone()
1609     }
1610 
1611     pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
1612         self.boot_guest_memory.clone()
1613     }
1614 
1615     pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
1616         self.allocator.clone()
1617     }
1618 
1619     pub fn start_of_device_area(&self) -> GuestAddress {
1620         self.start_of_device_area
1621     }
1622 
1623     pub fn end_of_device_area(&self) -> GuestAddress {
1624         self.end_of_device_area
1625     }
1626 
1627     pub fn allocate_memory_slot(&mut self) -> u32 {
1628         let slot_id = self.next_memory_slot;
1629         self.next_memory_slot += 1;
1630         slot_id
1631     }
1632 
1633     pub fn create_userspace_mapping(
1634         &mut self,
1635         guest_phys_addr: u64,
1636         memory_size: u64,
1637         userspace_addr: u64,
1638         mergeable: bool,
1639         readonly: bool,
1640         log_dirty: bool,
1641     ) -> Result<u32, Error> {
1642         let slot = self.allocate_memory_slot();
1643         let mem_region = self.vm.make_user_memory_region(
1644             slot,
1645             guest_phys_addr,
1646             memory_size,
1647             userspace_addr,
1648             readonly,
1649             log_dirty,
1650         );
1651 
1652         info!(
1653             "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
1654             guest_phys_addr, userspace_addr, memory_size, slot
1655         );
1656 
1657         self.vm
1658             .create_user_memory_region(mem_region)
1659             .map_err(Error::CreateUserMemoryRegion)?;
1660 
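        // Note added for clarity: MADV_DONTDUMP excludes this range from core dumps,
        // so a crash of the VMM process does not drag the whole of guest RAM into the
        // core file.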
1661         // SAFETY: the address and size are valid since the
1662         // mmap succeeded.
1663         let ret = unsafe {
1664             libc::madvise(
1665                 userspace_addr as *mut libc::c_void,
1666                 memory_size as libc::size_t,
1667                 libc::MADV_DONTDUMP,
1668             )
1669         };
1670         if ret != 0 {
1671             let e = io::Error::last_os_error();
1672             warn!("Failed to mark mapping as MADV_DONTDUMP: {}", e);
1673         }
1674 
1675         // Mark the pages as mergeable if explicitly asked for.
1676         if mergeable {
1677             // SAFETY: the address and size are valid since the
1678             // mmap succeeded.
1679             let ret = unsafe {
1680                 libc::madvise(
1681                     userspace_addr as *mut libc::c_void,
1682                     memory_size as libc::size_t,
1683                     libc::MADV_MERGEABLE,
1684                 )
1685             };
1686             if ret != 0 {
1687                 let err = io::Error::last_os_error();
1688                 // Safe to unwrap because the error is constructed with
1689                 // last_os_error(), which ensures the output will be Some().
1690                 let errno = err.raw_os_error().unwrap();
1691                 if errno == libc::EINVAL {
1692                     warn!("kernel not configured with CONFIG_KSM");
1693                 } else {
1694                     warn!("madvise error: {}", err);
1695                 }
1696                 warn!("failed to mark pages as mergeable");
1697             }
1698         }
1699 
1700         info!(
1701             "Created userspace mapping: {:x} -> {:x} {:x}",
1702             guest_phys_addr, userspace_addr, memory_size
1703         );
1704 
1705         Ok(slot)
1706     }
1707 
1708     pub fn remove_userspace_mapping(
1709         &mut self,
1710         guest_phys_addr: u64,
1711         memory_size: u64,
1712         userspace_addr: u64,
1713         mergeable: bool,
1714         slot: u32,
1715     ) -> Result<(), Error> {
1716         let mem_region = self.vm.make_user_memory_region(
1717             slot,
1718             guest_phys_addr,
1719             memory_size,
1720             userspace_addr,
1721             false, /* readonly -- don't care */
1722             false, /* log dirty */
1723         );
1724 
1725         self.vm
1726             .remove_user_memory_region(mem_region)
1727             .map_err(Error::RemoveUserMemoryRegion)?;
1728 
1729         // Mark the pages as unmergeable if they were previously marked as
1730         // mergeable.
1731         if mergeable {
1732             // SAFETY: the address and size are valid as the region was
1733             // previously advised.
1734             let ret = unsafe {
1735                 libc::madvise(
1736                     userspace_addr as *mut libc::c_void,
1737                     memory_size as libc::size_t,
1738                     libc::MADV_UNMERGEABLE,
1739                 )
1740             };
1741             if ret != 0 {
1742                 let err = io::Error::last_os_error();
1743                 // Safe to unwrap because the error is constructed with
1744                 // last_os_error(), which ensures the output will be Some().
1745                 let errno = err.raw_os_error().unwrap();
1746                 if errno == libc::EINVAL {
1747                     warn!("kernel not configured with CONFIG_KSM");
1748                 } else {
1749                     warn!("madvise error: {}", err);
1750                 }
1751                 warn!("failed to mark pages as unmergeable");
1752             }
1753         }
1754 
1755         info!(
1756             "Removed userspace mapping: {:x} -> {:x} {:x}",
1757             guest_phys_addr, userspace_addr, memory_size
1758         );
1759 
1760         Ok(())
1761     }
1762 
1763     pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
1764         if let Some(memory_zone) = self.memory_zones.get_mut(id) {
1765             if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
1766                 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
1767                     virtio_mem_device
1768                         .lock()
1769                         .unwrap()
1770                         .resize(size)
1771                         .map_err(Error::VirtioMemResizeFail)?;
1772                 }
1773 
1774                 // Keep the hotplugged_size up to date.
1775                 virtio_mem_zone.hotplugged_size = size;
1776             } else {
1777                 error!("Failed resizing virtio-mem region: No virtio-mem handler");
1778                 return Err(Error::MissingVirtioMemHandler);
1779             }
1780 
1781             return Ok(());
1782         }
1783 
1784         error!("Failed resizing virtio-mem region: Unknown memory zone");
1785         Err(Error::UnknownMemoryZone)
1786     }
1787 
1788     /// If this function results in adding a new memory region to the
1789     /// guest memory, the new region is returned to the caller. The virtio-mem
1790     /// use case never adds a new region as the whole hotpluggable memory has
1791     /// already been allocated at boot time.
1792     pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
1793         if self.user_provided_zones {
1794             error!(
1795                 "Not allowed to resize guest memory when backed with user \
1796                 defined memory zones."
1797             );
1798             return Err(Error::InvalidResizeWithMemoryZones);
1799         }
1800 
1801         let mut region: Option<Arc<GuestRegionMmap>> = None;
1802         match self.hotplug_method {
1803             HotplugMethod::VirtioMem => {
1804                 if desired_ram >= self.boot_ram {
1805                     if !self.dynamic {
1806                         return Ok(region);
1807                     }
1808 
1809                     self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
1810                     self.current_ram = desired_ram;
1811                 }
1812             }
1813             HotplugMethod::Acpi => {
1814                 if desired_ram > self.current_ram {
1815                     if !self.dynamic {
1816                         return Ok(region);
1817                     }
1818 
1819                     region =
1820                         Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
1821                     self.current_ram = desired_ram;
1822                 }
1823             }
1824         }
1825         Ok(region)
1826     }
1827 
1828     pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
1829         if !self.user_provided_zones {
1830             error!(
1831                 "Not allowed to resize guest memory zone when no zone is \
1832                 defined."
1833             );
1834             return Err(Error::ResizeZone);
1835         }
1836 
1837         self.virtio_mem_resize(id, virtio_mem_size)
1838     }
1839 
1840     #[cfg(target_arch = "x86_64")]
1841     pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
1842         let file = OpenOptions::new()
1843             .read(true)
1844             .open("/dev/sgx_provision")
1845             .map_err(Error::SgxProvisionOpen)?;
1846         self.vm
1847             .enable_sgx_attribute(file)
1848             .map_err(Error::SgxEnableProvisioning)?;
1849 
1850         // Go over each EPC section and verify its size is a 4k multiple. At
1851         // the same time, calculate the total size needed for the contiguous
1852         // EPC region.
1853         let mut epc_region_size = 0;
1854         for epc_section in sgx_epc_config.iter() {
1855             if epc_section.size == 0 {
1856                 return Err(Error::EpcSectionSizeInvalid);
1857             }
1858             if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
1859                 return Err(Error::EpcSectionSizeInvalid);
1860             }
1861 
1862             epc_region_size += epc_section.size;
1863         }
1864 
1865         // Place the SGX EPC region on a 4k boundary between the RAM and the device area
1866         let epc_region_start = GuestAddress(
1867             ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
1868         );
1869 
1870         self.start_of_device_area = epc_region_start
1871             .checked_add(epc_region_size)
1872             .ok_or(Error::GuestAddressOverFlow)?;
1873 
1874         let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
1875         info!(
1876             "SGX EPC region: 0x{:x} (0x{:x})",
1877             epc_region_start.0, epc_region_size
1878         );
1879 
1880         // Each section can be memory mapped into the allocated region.
1881         let mut epc_section_start = epc_region_start.raw_value();
1882         for epc_section in sgx_epc_config.iter() {
1883             let file = OpenOptions::new()
1884                 .read(true)
1885                 .write(true)
1886                 .open("/dev/sgx_vepc")
1887                 .map_err(Error::SgxVirtEpcOpen)?;
1888 
1889             let prot = PROT_READ | PROT_WRITE;
1890             let mut flags = MAP_NORESERVE | MAP_SHARED;
1891             if epc_section.prefault {
1892                 flags |= MAP_POPULATE;
1893             }
1894 
1895             // We can't use the vm-memory crate to perform the memory mapping
1896             // here as it would try to ensure that the size of the backing file
1897             // matches the size of the expected mapping. The /dev/sgx_vepc
1898             // device does not work that way; it provides a file descriptor
1899             // that does not match the mapping size, as it's just a way to
1900             // let KVM know that an EPC section is being created for the guest.
1901             // SAFETY: FFI call with correct arguments
1902             let host_addr = unsafe {
1903                 libc::mmap(
1904                     std::ptr::null_mut(),
1905                     epc_section.size as usize,
1906                     prot,
1907                     flags,
1908                     file.as_raw_fd(),
1909                     0,
1910                 )
1911             } as u64;
1912 
1913             info!(
1914                 "Adding SGX EPC section: 0x{:x} (0x{:x})",
1915                 epc_section_start, epc_section.size
1916             );
1917 
1918             let _mem_slot = self.create_userspace_mapping(
1919                 epc_section_start,
1920                 epc_section.size,
1921                 host_addr,
1922                 false,
1923                 false,
1924                 false,
1925             )?;
1926 
1927             sgx_epc_region.insert(
1928                 epc_section.id.clone(),
1929                 SgxEpcSection::new(
1930                     GuestAddress(epc_section_start),
1931                     epc_section.size as GuestUsize,
1932                 ),
1933             );
1934 
1935             epc_section_start += epc_section.size;
1936         }
1937 
1938         self.sgx_epc_region = Some(sgx_epc_region);
1939 
1940         Ok(())
1941     }
1942 
1943     #[cfg(target_arch = "x86_64")]
1944     pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
1945         &self.sgx_epc_region
1946     }
1947 
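    // Note added for clarity: an anonymous memfd or an unlinked file reports
    // st_nlink == 0, so a positive link count means the backing file still exists
    // on the host filesystem where the user can access it.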
1948     pub fn is_hardlink(f: &File) -> bool {
1949         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
1950         // SAFETY: FFI call with correct arguments
1951         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
1952         if ret != 0 {
1953             error!("Couldn't fstat the backing file");
1954             return false;
1955         }
1956 
1957         // SAFETY: stat is valid
1958         unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
1959     }
1960 
1961     pub fn memory_zones(&self) -> &MemoryZones {
1962         &self.memory_zones
1963     }
1964 
1965     pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
1966         &mut self.memory_zones
1967     }
1968 
1969     pub fn memory_range_table(
1970         &self,
1971         snapshot: bool,
1972     ) -> std::result::Result<MemoryRangeTable, MigratableError> {
1973         let mut table = MemoryRangeTable::default();
1974 
1975         for memory_zone in self.memory_zones.values() {
1976             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
1977                 table.extend(virtio_mem_zone.plugged_ranges());
1978             }
1979 
1980             for region in memory_zone.regions() {
1981                 if snapshot {
1982                     if let Some(file_offset) = region.file_offset() {
1983                         if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
1984                             && Self::is_hardlink(file_offset.file())
1985                         {
1986                             // In this very specific case, we know the memory
1987                             // region is backed by a file on the host filesystem
1988                             // that can be accessed by the user, and additionally
1989                             // the mapping is shared, which means that modifications
1990                             // to the content are written to the actual file.
1991                             // When meeting these conditions, we can skip the
1992                             // copy of the memory content for this specific region,
1993                             // as we can assume the user will have it saved through
1994                             // the backing file already.
1995                             continue;
1996                         }
1997                     }
1998                 }
1999 
2000                 table.push(MemoryRange {
2001                     gpa: region.start_addr().raw_value(),
2002                     length: region.len(),
2003                 });
2004             }
2005         }
2006 
2007         Ok(table)
2008     }
2009 
2010     pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
2011         MemoryManagerSnapshotData {
2012             memory_ranges: self.snapshot_memory_ranges.clone(),
2013             guest_ram_mappings: self.guest_ram_mappings.clone(),
2014             start_of_device_area: self.start_of_device_area.0,
2015             boot_ram: self.boot_ram,
2016             current_ram: self.current_ram,
2017             arch_mem_regions: self.arch_mem_regions.clone(),
2018             hotplug_slots: self.hotplug_slots.clone(),
2019             next_memory_slot: self.next_memory_slot,
2020             selected_slot: self.selected_slot,
2021             next_hotplug_slot: self.next_hotplug_slot,
2022         }
2023     }
2024 
2025     pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
2026         let mut memory_slot_fds = HashMap::new();
2027         for guest_ram_mapping in &self.guest_ram_mappings {
2028             let slot = guest_ram_mapping.slot;
2029             let guest_memory = self.guest_memory.memory();
2030             let file = guest_memory
2031                 .find_region(GuestAddress(guest_ram_mapping.gpa))
2032                 .unwrap()
2033                 .file_offset()
2034                 .unwrap()
2035                 .file();
2036             memory_slot_fds.insert(slot, file.as_raw_fd());
2037         }
2038         memory_slot_fds
2039     }
2040 
2041     pub fn acpi_address(&self) -> Option<GuestAddress> {
2042         self.acpi_address
2043     }
2044 
2045     pub fn num_guest_ram_mappings(&self) -> u32 {
2046         self.guest_ram_mappings.len() as u32
2047     }
2048 
2049     #[cfg(target_arch = "aarch64")]
2050     pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
2051         self.uefi_flash.as_ref().unwrap().clone()
2052     }
2053 
2054     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2055     pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
2056         let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
2057         mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);
2058 
2059         let mut mem_offset_in_elf = mem_offset;
2060         let mut ram_maps = BTreeMap::new();
2061         for mapping in mapping_sorted_by_gpa.iter() {
2062             ram_maps.insert(
2063                 mapping.gpa,
2064                 CoredumpMemoryRegion {
2065                     mem_offset_in_elf,
2066                     mem_size: mapping.size,
2067                 },
2068             );
2069             mem_offset_in_elf += mapping.size;
2070         }
2071 
2072         CoredumpMemoryRegions { ram_maps }
2073     }
2074 
2075     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2076     pub fn coredump_iterate_save_mem(
2077         &mut self,
2078         dump_state: &DumpState,
2079     ) -> std::result::Result<(), GuestDebuggableError> {
2080         let snapshot_memory_ranges = self
2081             .memory_range_table(false)
2082             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2083 
2084         if snapshot_memory_ranges.is_empty() {
2085             return Ok(());
2086         }
2087 
2088         let coredump_file = dump_state.file.as_ref().unwrap();
2089 
2090         let guest_memory = self.guest_memory.memory();
2091         let mut total_bytes: u64 = 0;
2092 
2093         for range in snapshot_memory_ranges.regions() {
2094             let mut offset: u64 = 0;
2095             loop {
2096                 let bytes_written = guest_memory
2097                     .write_volatile_to(
2098                         GuestAddress(range.gpa + offset),
2099                         &mut coredump_file.as_fd(),
2100                         (range.length - offset) as usize,
2101                     )
2102                     .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2103                 offset += bytes_written as u64;
2104                 total_bytes += bytes_written as u64;
2105 
2106                 if offset == range.length {
2107                     break;
2108                 }
2109             }
2110         }
2111 
2112         debug!("coredump total bytes {}", total_bytes);
2113         Ok(())
2114     }
2115 
2116     pub fn receive_memory_regions<F>(
2117         &mut self,
2118         ranges: &MemoryRangeTable,
2119         fd: &mut F,
2120     ) -> std::result::Result<(), MigratableError>
2121     where
2122         F: ReadVolatile,
2123     {
2124         let guest_memory = self.guest_memory();
2125         let mem = guest_memory.memory();
2126 
2127         for range in ranges.regions() {
2128             let mut offset: u64 = 0;
2129             // Here we are manually handling the retry in case we can't read
2130             // the whole region at once, because the read_exact_from()
2131             // implementation from vm-memory's GuestMemory does not follow
2132             // the correct behavior. For more info about this issue
2133             // see: https://github.com/rust-vmm/vm-memory/issues/174
2134             loop {
2135                 let bytes_read = mem
2136                     .read_volatile_from(
2137                         GuestAddress(range.gpa + offset),
2138                         fd,
2139                         (range.length - offset) as usize,
2140                     )
2141                     .map_err(|e| {
2142                         MigratableError::MigrateReceive(anyhow!(
2143                             "Error receiving memory from socket: {}",
2144                             e
2145                         ))
2146                     })?;
2147                 offset += bytes_read as u64;
2148 
2149                 if offset == range.length {
2150                     break;
2151                 }
2152             }
2153         }
2154 
2155         Ok(())
2156     }
2157 }
2158 
2159 struct MemoryNotify {
2160     slot_id: usize,
2161 }
2162 
2163 impl Aml for MemoryNotify {
2164     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2165         let object = aml::Path::new(&format!("M{:03}", self.slot_id));
2166         aml::If::new(
2167             &aml::Equal::new(&aml::Arg(0), &self.slot_id),
2168             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2169         )
2170         .to_aml_bytes(sink)
2171     }
2172 }
2173 
2174 struct MemorySlot {
2175     slot_id: usize,
2176 }
2177 
2178 impl Aml for MemorySlot {
2179     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2180         aml::Device::new(
2181             format!("M{:03}", self.slot_id).as_str().into(),
2182             vec![
2183                 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")),
2184                 &aml::Name::new("_UID".into(), &self.slot_id),
2185                 /*
2186                 _STA return value:
2187                 Bit [0] – Set if the device is present.
2188                 Bit [1] – Set if the device is enabled and decoding its resources.
2189                 Bit [2] – Set if the device should be shown in the UI.
2190                 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
2191                 Bit [4] – Set if the battery is present.
2192                 Bits [31:5] – Reserved (must be cleared).
2193                 */
2194                 &aml::Method::new(
2195                     "_STA".into(),
2196                     0,
2197                     false,
2198                     // Call into MSTA method which will interrogate device
2199                     vec![&aml::Return::new(&aml::MethodCall::new(
2200                         "MSTA".into(),
2201                         vec![&self.slot_id],
2202                     ))],
2203                 ),
2204                 // Get details of memory
2205                 &aml::Method::new(
2206                     "_CRS".into(),
2207                     0,
2208                     false,
2209                     // Call into MCRS which provides actual memory details
2210                     vec![&aml::Return::new(&aml::MethodCall::new(
2211                         "MCRS".into(),
2212                         vec![&self.slot_id],
2213                     ))],
2214                 ),
2215             ],
2216         )
2217         .to_aml_bytes(sink)
2218     }
2219 }
2220 
2221 struct MemorySlots {
2222     slots: usize,
2223 }
2224 
2225 impl Aml for MemorySlots {
2226     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2227         for slot_id in 0..self.slots {
2228             MemorySlot { slot_id }.to_aml_bytes(sink);
2229         }
2230     }
2231 }
2232 
2233 struct MemoryMethods {
2234     slots: usize,
2235 }
2236 
2237 impl Aml for MemoryMethods {
2238     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2239         // Add "MTFY" notification method
2240         let mut memory_notifies = Vec::new();
2241         for slot_id in 0..self.slots {
2242             memory_notifies.push(MemoryNotify { slot_id });
2243         }
2244 
2245         let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new();
2246         for memory_notifier in memory_notifies.iter() {
2247             memory_notifies_refs.push(memory_notifier);
2248         }
2249 
2250         aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink);
2251 
2252         // MSCN method
2253         aml::Method::new(
2254             "MSCN".into(),
2255             0,
2256             true,
2257             vec![
2258                 // Take lock defined above
2259                 &aml::Acquire::new("MLCK".into(), 0xffff),
2260                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2261                 &aml::While::new(
2262                     &aml::LessThan::new(&aml::Local(0), &self.slots),
2263                     vec![
2264                         // Write the current slot number (loop counter in Local0) to the I/O port via the MSEL field
2265                         &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
2266                         // Check if MINS bit is set (inserting)
2267                         &aml::If::new(
2268                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2269                             // Notify device if it is
2270                             vec![
2271                                 &aml::MethodCall::new(
2272                                     "MTFY".into(),
2273                                     vec![&aml::Local(0), &aml::ONE],
2274                                 ),
2275                                 // Reset MINS bit
2276                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2277                             ],
2278                         ),
2279                         // Check if MRMV bit is set
2280                         &aml::If::new(
2281                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2282                             // Notify device if it is (with the eject constant 0x3)
2283                             vec![
2284                                 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
2285                                 // Reset MRMV bit
2286                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2287                             ],
2288                         ),
2289                         &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2290                     ],
2291                 ),
2292                 // Release lock
2293                 &aml::Release::new("MLCK".into()),
2294             ],
2295         )
2296         .to_aml_bytes(sink);
2297 
2298         // Memory status method
2299         aml::Method::new(
2300             "MSTA".into(),
2301             1,
2302             true,
2303             vec![
2304                 // Take lock defined above
2305                 &aml::Acquire::new("MLCK".into(), 0xffff),
2306                 // Write slot number (in first argument) to I/O port via field
2307                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2308                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2309                 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
2310                 &aml::If::new(
2311                     &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
2312                     vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2313                 ),
2314                 // Release lock
2315                 &aml::Release::new("MLCK".into()),
2316                 // Return 0 or 0xf
2317                 &aml::Return::new(&aml::Local(0)),
2318             ],
2319         )
2320         .to_aml_bytes(sink);
2321 
2322         // Memory range method
2323         aml::Method::new(
2324             "MCRS".into(),
2325             1,
2326             true,
2327             vec![
2328                 // Take lock defined above
2329                 &aml::Acquire::new("MLCK".into(), 0xffff),
2330                 // Write slot number (in first argument) to I/O port via field
2331                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2332                 &aml::Name::new(
2333                     "MR64".into(),
2334                     &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2335                         aml::AddressSpaceCacheable::Cacheable,
2336                         true,
2337                         0x0000_0000_0000_0000u64,
2338                         0xFFFF_FFFF_FFFF_FFFEu64,
2339                         None,
2340                     )]),
2341                 ),
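                // Note added for clarity: the offsets below (14, 18, 22, 26, 38, 42)
                // are byte offsets into the QWORD Address Space Descriptor named MR64
                // above: range minimum starts at byte 14, range maximum at byte 22 and
                // range length at byte 38, each accessed as a low field plus a high
                // 32-bit overlay.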
2342                 &aml::CreateQWordField::new(
2343                     &aml::Path::new("MINL"),
2344                     &aml::Path::new("MR64"),
2345                     &14usize,
2346                 ),
2347                 &aml::CreateDWordField::new(
2348                     &aml::Path::new("MINH"),
2349                     &aml::Path::new("MR64"),
2350                     &18usize,
2351                 ),
2352                 &aml::CreateQWordField::new(
2353                     &aml::Path::new("MAXL"),
2354                     &aml::Path::new("MR64"),
2355                     &22usize,
2356                 ),
2357                 &aml::CreateDWordField::new(
2358                     &aml::Path::new("MAXH"),
2359                     &aml::Path::new("MR64"),
2360                     &26usize,
2361                 ),
2362                 &aml::CreateQWordField::new(
2363                     &aml::Path::new("LENL"),
2364                     &aml::Path::new("MR64"),
2365                     &38usize,
2366                 ),
2367                 &aml::CreateDWordField::new(
2368                     &aml::Path::new("LENH"),
2369                     &aml::Path::new("MR64"),
2370                     &42usize,
2371                 ),
2372                 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
2373                 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
2374                 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
2375                 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
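                // Note added for clarity: compute MAX = MIN + LEN - 1 using 32-bit
                // halves; if the low-word addition wrapped (MAXL < MINL) a carry is
                // propagated into MAXH before subtracting one from MAXL.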
2376                 &aml::Add::new(
2377                     &aml::Path::new("MAXL"),
2378                     &aml::Path::new("MINL"),
2379                     &aml::Path::new("LENL"),
2380                 ),
2381                 &aml::Add::new(
2382                     &aml::Path::new("MAXH"),
2383                     &aml::Path::new("MINH"),
2384                     &aml::Path::new("LENH"),
2385                 ),
2386                 &aml::If::new(
2387                     &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
2388                     vec![&aml::Add::new(
2389                         &aml::Path::new("MAXH"),
2390                         &aml::ONE,
2391                         &aml::Path::new("MAXH"),
2392                     )],
2393                 ),
2394                 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
2395                 // Release lock
2396                 &aml::Release::new("MLCK".into()),
2397                 &aml::Return::new(&aml::Path::new("MR64")),
2398             ],
2399         )
2400         .to_aml_bytes(sink)
2401     }
2402 }
2403 
2404 impl Aml for MemoryManager {
2405     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2406         if let Some(acpi_address) = self.acpi_address {
2407             // Memory Hotplug Controller
2408             aml::Device::new(
2409                 "_SB_.MHPC".into(),
2410                 vec![
2411                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2412                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2413                     // Mutex to protect concurrent access, as we write to choose the slot and then read back its status
2414                     &aml::Mutex::new("MLCK".into(), 0),
2415                     &aml::Name::new(
2416                         "_CRS".into(),
2417                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2418                             aml::AddressSpaceCacheable::NotCacheable,
2419                             true,
2420                             acpi_address.0,
2421                             acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
2422                             None,
2423                         )]),
2424                     ),
2425                     // OpRegion and Fields map MMIO range into individual field values
2426                     &aml::OpRegion::new(
2427                         "MHPR".into(),
2428                         aml::OpRegionSpace::SystemMemory,
2429                         &(acpi_address.0 as usize),
2430                         &MEMORY_MANAGER_ACPI_SIZE,
2431                     ),
2432                     &aml::Field::new(
2433                         "MHPR".into(),
2434                         aml::FieldAccessType::DWord,
2435                         aml::FieldLockRule::NoLock,
2436                         aml::FieldUpdateRule::Preserve,
2437                         vec![
2438                             aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
2439                             aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
2440                             aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
2441                             aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
2442                         ],
2443                     ),
2444                     &aml::Field::new(
2445                         "MHPR".into(),
2446                         aml::FieldAccessType::DWord,
2447                         aml::FieldLockRule::NoLock,
2448                         aml::FieldUpdateRule::Preserve,
2449                         vec![
2450                             aml::FieldEntry::Reserved(128),
2451                             aml::FieldEntry::Named(*b"MHPX", 32), // PXM
2452                         ],
2453                     ),
2454                     &aml::Field::new(
2455                         "MHPR".into(),
2456                         aml::FieldAccessType::Byte,
2457                         aml::FieldLockRule::NoLock,
2458                         aml::FieldUpdateRule::WriteAsZeroes,
2459                         vec![
2460                             aml::FieldEntry::Reserved(160),
2461                             aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
2462                             aml::FieldEntry::Named(*b"MINS", 1), // Inserting
2463                             aml::FieldEntry::Named(*b"MRMV", 1), // Removing
2464                             aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
2465                         ],
2466                     ),
2467                     &aml::Field::new(
2468                         "MHPR".into(),
2469                         aml::FieldAccessType::DWord,
2470                         aml::FieldLockRule::NoLock,
2471                         aml::FieldUpdateRule::Preserve,
2472                         vec![
2473                             aml::FieldEntry::Named(*b"MSEL", 32), // Selector
2474                             aml::FieldEntry::Named(*b"MOEV", 32), // Event
2475                             aml::FieldEntry::Named(*b"MOSC", 32), // OSC
2476                         ],
2477                     ),
2478                     &MemoryMethods {
2479                         slots: self.hotplug_slots.len(),
2480                     },
2481                     &MemorySlots {
2482                         slots: self.hotplug_slots.len(),
2483                     },
2484                 ],
2485             )
2486             .to_aml_bytes(sink);
2487         } else {
2488             aml::Device::new(
2489                 "_SB_.MHPC".into(),
2490                 vec![
2491                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2492                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2493                     // Empty MSCN for GED
2494                     &aml::Method::new("MSCN".into(), 0, true, vec![]),
2495                 ],
2496             )
2497             .to_aml_bytes(sink);
2498         }
2499 
2500         #[cfg(target_arch = "x86_64")]
2501         {
2502             if let Some(sgx_epc_region) = &self.sgx_epc_region {
2503                 let min = sgx_epc_region.start().raw_value();
2504                 let max = min + sgx_epc_region.size() - 1;
2505                 // SGX EPC region
2506                 aml::Device::new(
2507                     "_SB_.EPC_".into(),
2508                     vec![
2509                         &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")),
2510                         // QWORD describing the EPC region start and size
2511                         &aml::Name::new(
2512                             "_CRS".into(),
2513                             &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2514                                 aml::AddressSpaceCacheable::NotCacheable,
2515                                 true,
2516                                 min,
2517                                 max,
2518                                 None,
2519                             )]),
2520                         ),
2521                         &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
2522                     ],
2523                 )
2524                 .to_aml_bytes(sink);
2525             }
2526         }
2527     }
2528 }
2529 
2530 impl Pausable for MemoryManager {}
2531 
2532 #[derive(Clone, Serialize, Deserialize, Versionize)]
2533 pub struct MemoryManagerSnapshotData {
2534     memory_ranges: MemoryRangeTable,
2535     guest_ram_mappings: Vec<GuestRamMapping>,
2536     start_of_device_area: u64,
2537     boot_ram: u64,
2538     current_ram: u64,
2539     arch_mem_regions: Vec<ArchMemRegion>,
2540     hotplug_slots: Vec<HotPlugState>,
2541     next_memory_slot: u32,
2542     selected_slot: usize,
2543     next_hotplug_slot: usize,
2544 }
2545 
2546 impl VersionMapped for MemoryManagerSnapshotData {}
2547 
2548 impl Snapshottable for MemoryManager {
2549     fn id(&self) -> String {
2550         MEMORY_MANAGER_SNAPSHOT_ID.to_string()
2551     }
2552 
2553     fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
2554         let memory_ranges = self.memory_range_table(true)?;
2555 
2556         // Store locally this list of ranges as it will be used through the
2557         // Transportable::send() implementation. The point is to avoid the
2558         // duplication of code regarding the creation of the path for each
2559         // region. The 'snapshot' step creates the list of memory regions,
2560         // including information about the need to copy a memory region or
2561         // not. This saves the 'send' step having to go through the same
2562         // process, and instead it can directly proceed with storing the
2563         // memory range content for the ranges requiring it.
2564         self.snapshot_memory_ranges = memory_ranges;
2565 
2566         Ok(Snapshot::from_data(SnapshotData::new_from_versioned_state(
2567             &self.snapshot_data(),
2568         )?))
2569     }
2570 }
2571 
2572 impl Transportable for MemoryManager {
2573     fn send(
2574         &self,
2575         _snapshot: &Snapshot,
2576         destination_url: &str,
2577     ) -> result::Result<(), MigratableError> {
2578         if self.snapshot_memory_ranges.is_empty() {
2579             return Ok(());
2580         }
2581 
2582         let mut memory_file_path = url_to_path(destination_url)?;
2583         memory_file_path.push(String::from(SNAPSHOT_FILENAME));
2584 
2585         // Create the snapshot file for the entire memory
2586         let mut memory_file = OpenOptions::new()
2587             .read(true)
2588             .write(true)
2589             .create_new(true)
2590             .open(memory_file_path)
2591             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2592 
2593         let guest_memory = self.guest_memory.memory();
2594 
2595         for range in self.snapshot_memory_ranges.regions() {
2596             let mut offset: u64 = 0;
2597             // Here we are manually handling the retry in case we can't read
2598             // the whole region at once, because the write_all_to()
2599             // implementation from vm-memory's GuestMemory does not follow
2600             // the correct behavior. For more info about this issue
2601             // see: https://github.com/rust-vmm/vm-memory/issues/174
2602             loop {
2603                 let bytes_written = guest_memory
2604                     .write_volatile_to(
2605                         GuestAddress(range.gpa + offset),
2606                         &mut memory_file,
2607                         (range.length - offset) as usize,
2608                     )
2609                     .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2610                 offset += bytes_written as u64;
2611 
2612                 if offset == range.length {
2613                     break;
2614                 }
2615             }
2616         }
2617         Ok(())
2618     }
2619 }
2620 
2621 impl Migratable for MemoryManager {
2622     // Start the dirty log in the hypervisor (kvm/mshv).
2623     // Also, reset the dirty bitmap logged by the vmm.
2624     // Just before we do a bulk copy we want to start/clear the dirty log so that
2625     // pages touched during our bulk copy are tracked.
2626     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2627         self.vm.start_dirty_log().map_err(|e| {
2628             MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
2629         })?;
2630 
2631         for r in self.guest_memory.memory().iter() {
2632             r.bitmap().reset();
2633         }
2634 
2635         Ok(())
2636     }
2637 
2638     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2639         self.vm.stop_dirty_log().map_err(|e| {
2640             MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
2641         })?;
2642 
2643         Ok(())
2644     }
2645 
2646     // Generate a table for the pages that are dirty. The dirty pages are collapsed
2647     // together in the table if they are contiguous.
2648     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2649         let mut table = MemoryRangeTable::default();
2650         for r in &self.guest_ram_mappings {
2651             let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
2652                 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
2653             })?;
2654             let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
2655             {
2656                 Some(region) => {
2657                     assert!(region.start_addr().raw_value() == r.gpa);
2658                     assert!(region.len() == r.size);
2659                     region.bitmap().get_and_reset()
2660                 }
2661                 None => {
2662                     return Err(MigratableError::MigrateSend(anyhow!(
2663                         "Error finding 'guest memory region' with address {:x}",
2664                         r.gpa
2665                     )))
2666                 }
2667             };
2668 
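            // Note added for clarity: merge the hypervisor and VMM dirty bitmaps word
            // by word, so a page is reported dirty if either source flagged it.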
2669             let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
2670                 .iter()
2671                 .zip(vmm_dirty_bitmap.iter())
2672                 .map(|(x, y)| x | y)
2673                 .collect();
2674 
2675             let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);
2676 
2677             if sub_table.regions().is_empty() {
2678                 info!("Dirty Memory Range Table is empty");
2679             } else {
2680                 info!("Dirty Memory Range Table:");
2681                 for range in sub_table.regions() {
2682                     info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
2683                 }
2684             }
2685 
2686             table.extend(sub_table);
2687         }
2688         Ok(table)
2689     }
2690 }
2691