xref: /cloud-hypervisor/vmm/src/memory_manager.rs (revision b440cb7d2330770cd415b63544a371d4caa2db3a)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0
4 //
5 #[cfg(target_arch = "x86_64")]
6 use crate::config::SgxEpcConfig;
7 use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
8 #[cfg(feature = "guest_debug")]
9 use crate::coredump::{CoredumpMemoryRegion, CoredumpMemoryRegions};
10 #[cfg(feature = "guest_debug")]
11 use crate::coredump::{DumpState, GuestDebuggableError};
12 use crate::migration::url_to_path;
13 use crate::MEMORY_MANAGER_SNAPSHOT_ID;
14 use crate::{GuestMemoryMmap, GuestRegionMmap};
15 use acpi_tables::{aml, aml::Aml};
16 use anyhow::anyhow;
17 #[cfg(target_arch = "x86_64")]
18 use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
19 use arch::{layout, RegionType};
20 #[cfg(target_arch = "x86_64")]
21 use devices::ioapic;
22 #[cfg(target_arch = "x86_64")]
23 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
24 use serde::{Deserialize, Serialize};
25 #[cfg(feature = "guest_debug")]
26 use std::collections::BTreeMap;
27 use std::collections::HashMap;
28 use std::convert::TryInto;
29 use std::ffi;
30 use std::fs::{File, OpenOptions};
31 use std::io;
32 use std::ops::Deref;
33 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
34 use std::path::PathBuf;
35 use std::result;
36 use std::sync::{Arc, Barrier, Mutex};
37 use versionize::{VersionMap, Versionize, VersionizeResult};
38 use versionize_derive::Versionize;
39 use virtio_devices::BlocksState;
40 #[cfg(target_arch = "x86_64")]
41 use vm_allocator::GsiApic;
42 use vm_allocator::{AddressAllocator, SystemAllocator};
43 use vm_device::BusDevice;
44 use vm_memory::bitmap::AtomicBitmap;
45 use vm_memory::guest_memory::FileOffset;
46 use vm_memory::{
47     mmap::MmapRegionError, Address, Bytes, Error as MmapError, GuestAddress, GuestAddressSpace,
48     GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion,
49 };
50 use vm_migration::{
51     protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable,
52     Snapshot, SnapshotDataSection, Snapshottable, Transportable, VersionMapped,
53 };
54 
55 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;
56 
57 const DEFAULT_MEMORY_ZONE: &str = "mem0";
58 
59 const SNAPSHOT_FILENAME: &str = "memory-ranges";
60 
61 #[cfg(target_arch = "x86_64")]
62 const X86_64_IRQ_BASE: u32 = 5;
63 
64 #[cfg(target_arch = "x86_64")]
65 const SGX_PAGE_SIZE: u64 = 1 << 12;
66 
67 const HOTPLUG_COUNT: usize = 8;
68 
69 // Memory policy constants
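// These mirror the kernel's MPOL_* definitions from <linux/mempolicy.h> and are
// consumed by the mbind() syscall wrapper defined further below.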
70 const MPOL_BIND: u32 = 2;
71 const MPOL_MF_STRICT: u32 = 1;
72 const MPOL_MF_MOVE: u32 = 1 << 1;
73 
74 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
75 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;
76 
77 #[derive(Clone, Default, Serialize, Deserialize, Versionize)]
78 struct HotPlugState {
79     base: u64,
80     length: u64,
81     active: bool,
82     inserting: bool,
83     removing: bool,
84 }
85 
86 pub struct VirtioMemZone {
87     region: Arc<GuestRegionMmap>,
88     resize_handler: virtio_devices::Resize,
89     hotplugged_size: u64,
90     hugepages: bool,
91     blocks_state: Arc<Mutex<BlocksState>>,
92 }
93 
94 impl VirtioMemZone {
95     pub fn region(&self) -> &Arc<GuestRegionMmap> {
96         &self.region
97     }
98     pub fn resize_handler(&self) -> &virtio_devices::Resize {
99         &self.resize_handler
100     }
101     pub fn hotplugged_size(&self) -> u64 {
102         self.hotplugged_size
103     }
104     pub fn hugepages(&self) -> bool {
105         self.hugepages
106     }
107     pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
108         &self.blocks_state
109     }
110     pub fn plugged_ranges(&self) -> MemoryRangeTable {
111         self.blocks_state
112             .lock()
113             .unwrap()
114             .memory_ranges(self.region.start_addr().raw_value(), true)
115     }
116 }
117 
118 #[derive(Default)]
119 pub struct MemoryZone {
120     regions: Vec<Arc<GuestRegionMmap>>,
121     virtio_mem_zone: Option<VirtioMemZone>,
122 }
123 
124 impl MemoryZone {
125     pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
126         &self.regions
127     }
128     pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
129         &self.virtio_mem_zone
130     }
131 }
132 
133 pub type MemoryZones = HashMap<String, MemoryZone>;
134 
135 #[derive(Clone, Serialize, Deserialize, Versionize)]
136 struct GuestRamMapping {
137     slot: u32,
138     gpa: u64,
139     size: u64,
140     zone_id: String,
141     virtio_mem: bool,
142     file_offset: u64,
143 }
144 
145 #[derive(Clone, Serialize, Deserialize, Versionize)]
146 struct ArchMemRegion {
147     base: u64,
148     size: usize,
149     r_type: RegionType,
150 }
151 
152 pub struct MemoryManager {
153     boot_guest_memory: GuestMemoryMmap,
154     guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
155     next_memory_slot: u32,
156     start_of_device_area: GuestAddress,
157     end_of_device_area: GuestAddress,
158     end_of_ram_area: GuestAddress,
159     pub vm: Arc<dyn hypervisor::Vm>,
160     hotplug_slots: Vec<HotPlugState>,
161     selected_slot: usize,
162     mergeable: bool,
163     allocator: Arc<Mutex<SystemAllocator>>,
164     hotplug_method: HotplugMethod,
165     boot_ram: u64,
166     current_ram: u64,
167     next_hotplug_slot: usize,
168     shared: bool,
169     hugepages: bool,
170     hugepage_size: Option<u64>,
171     prefault: bool,
172     #[cfg(target_arch = "x86_64")]
173     sgx_epc_region: Option<SgxEpcRegion>,
174     user_provided_zones: bool,
175     snapshot_memory_ranges: MemoryRangeTable,
176     memory_zones: MemoryZones,
177     log_dirty: bool, // Enable dirty logging for created RAM regions
178     arch_mem_regions: Vec<ArchMemRegion>,
179     ram_allocator: AddressAllocator,
180     dynamic: bool,
181 
182     // Keep track of calls to create_userspace_mapping() for guest RAM.
183     // This is useful for getting the dirty pages as we need to know the
184     // slots that the mappings were created in.
185     guest_ram_mappings: Vec<GuestRamMapping>,
186 
187     pub acpi_address: Option<GuestAddress>,
188 }
189 
190 #[derive(Debug)]
191 pub enum Error {
192     /// Failed to create shared file.
193     SharedFileCreate(io::Error),
194 
195     /// Failed to set shared file length.
196     SharedFileSetLen(io::Error),
197 
198     /// Mmap backed guest memory error
199     GuestMemory(MmapError),
200 
201     /// Failed to allocate a memory range.
202     MemoryRangeAllocation,
203 
204     /// Error from region creation
205     GuestMemoryRegion(MmapRegionError),
206 
207     /// No ACPI slot available
208     NoSlotAvailable,
209 
210     /// Not enough space in the hotplug RAM region
211     InsufficientHotplugRam,
212 
213     /// The requested hotplug memory addition is not a valid size
214     InvalidSize,
215 
216     /// Failed to create the user memory region.
217     CreateUserMemoryRegion(hypervisor::HypervisorVmError),
218 
219     /// Failed to remove the user memory region.
220     RemoveUserMemoryRegion(hypervisor::HypervisorVmError),
221 
222     /// Failed to create EventFd.
223     EventFdFail(io::Error),
224 
225     /// Eventfd write error
226     EventfdError(io::Error),
227 
228     /// Failed to resize virtio-mem
229     VirtioMemResizeFail(virtio_devices::mem::Error),
230 
231     /// Cannot restore VM
232     Restore(MigratableError),
233 
234     /// Cannot restore VM because source URL is missing
235     RestoreMissingSourceUrl,
236 
237     /// Cannot create the system allocator
238     CreateSystemAllocator,
239 
240     /// Invalid SGX EPC section size
241     #[cfg(target_arch = "x86_64")]
242     EpcSectionSizeInvalid,
243 
244     /// Failed allocating SGX EPC region
245     #[cfg(target_arch = "x86_64")]
246     SgxEpcRangeAllocation,
247 
248     /// Failed opening SGX virtual EPC device
249     #[cfg(target_arch = "x86_64")]
250     SgxVirtEpcOpen(io::Error),
251 
252     /// Failed setting the SGX virtual EPC section size
253     #[cfg(target_arch = "x86_64")]
254     SgxVirtEpcFileSetLen(io::Error),
255 
256     /// Failed opening SGX provisioning device
257     #[cfg(target_arch = "x86_64")]
258     SgxProvisionOpen(io::Error),
259 
260     /// Failed enabling SGX provisioning
261     #[cfg(target_arch = "x86_64")]
262     SgxEnableProvisioning(hypervisor::HypervisorVmError),
263 
264     /// Failed creating a new MmapRegion instance.
265     #[cfg(target_arch = "x86_64")]
266     NewMmapRegion(vm_memory::mmap::MmapRegionError),
267 
268     /// No memory zones found.
269     MissingMemoryZones,
270 
271     /// Memory configuration is not valid.
272     InvalidMemoryParameters,
273 
274     /// Forbidden operation. Impossible to resize guest memory if it is
275     /// backed by user defined memory regions.
276     InvalidResizeWithMemoryZones,
277 
278     /// It's invalid to try applying a NUMA policy to a memory zone that is
279     /// memory mapped with MAP_SHARED.
280     InvalidSharedMemoryZoneWithHostNuma,
281 
282     /// Failed applying NUMA memory policy.
283     ApplyNumaPolicy(io::Error),
284 
285     /// Memory zone identifier is not unique.
286     DuplicateZoneId,
287 
288     /// No virtio-mem resizing handler found.
289     MissingVirtioMemHandler,
290 
291     /// Unknown memory zone.
292     UnknownMemoryZone,
293 
294     /// Invalid size for resizing. Can be anything except 0.
295     InvalidHotplugSize,
296 
297     /// Invalid hotplug method associated with memory zones resizing capability.
298     InvalidHotplugMethodWithMemoryZones,
299 
300     /// Could not find specified memory zone identifier from hash map.
301     MissingZoneIdentifier,
302 
303     /// Resizing the memory zone failed.
304     ResizeZone,
305 
306     /// Guest address overflow
307     GuestAddressOverFlow,
308 
309     /// Error opening snapshot file
310     SnapshotOpen(io::Error),
311 
312     /// Error copying snapshot into region
313     SnapshotCopy(GuestMemoryError),
314 
315     /// Failed to allocate MMIO address
316     AllocateMmioAddress,
317 }
318 
319 const ENABLE_FLAG: usize = 0;
320 const INSERTING_FLAG: usize = 1;
321 const REMOVING_FLAG: usize = 2;
322 const EJECT_FLAG: usize = 3;
323 
324 const BASE_OFFSET_LOW: u64 = 0;
325 const BASE_OFFSET_HIGH: u64 = 0x4;
326 const LENGTH_OFFSET_LOW: u64 = 0x8;
327 const LENGTH_OFFSET_HIGH: u64 = 0xC;
328 const STATUS_OFFSET: u64 = 0x14;
329 const SELECTION_OFFSET: u64 = 0;
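// Taken together, these offsets describe the register layout of the memory
// hotplug device implemented by the BusDevice impl below (a 0x18-byte window,
// cf. MEMORY_MANAGER_ACPI_SIZE):
//   0x00  slot selection (write) / base address low (read)
//   0x04  base address high (read)
//   0x08  length low (read)
//   0x0C  length high (read)
//   0x14  status flags (read/write)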
330 
331 // The MMIO address space size is reduced by 64k. This is done for the
332 // following reasons:
333 //  - Reduce the addressable space size by at least 4k to work around a Linux
334 //    bug when the VMM allocates devices at the end of the addressable space
335 //  - Windows requires the addressable space size to be 64k aligned
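// For example, with phys_bits = 40 the function below returns
// (1 << 40) - (1 << 16) = 0xFF_FFFF_0000, which is 64k aligned.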
336 fn mmio_address_space_size(phys_bits: u8) -> u64 {
337     (1 << phys_bits) - (1 << 16)
338 }
339 
340 impl BusDevice for MemoryManager {
341     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
342         if self.selected_slot < self.hotplug_slots.len() {
343             let state = &self.hotplug_slots[self.selected_slot];
344             match offset {
345                 BASE_OFFSET_LOW => {
346                     data.copy_from_slice(&state.base.to_le_bytes()[..4]);
347                 }
348                 BASE_OFFSET_HIGH => {
349                     data.copy_from_slice(&state.base.to_le_bytes()[4..]);
350                 }
351                 LENGTH_OFFSET_LOW => {
352                     data.copy_from_slice(&state.length.to_le_bytes()[..4]);
353                 }
354                 LENGTH_OFFSET_HIGH => {
355                     data.copy_from_slice(&state.length.to_le_bytes()[4..]);
356                 }
357                 STATUS_OFFSET => {
358                     // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
359                     data.fill(0);
360                     if state.active {
361                         data[0] |= 1 << ENABLE_FLAG;
362                     }
363                     if state.inserting {
364                         data[0] |= 1 << INSERTING_FLAG;
365                     }
366                     if state.removing {
367                         data[0] |= 1 << REMOVING_FLAG;
368                     }
369                 }
370                 _ => {
371                     warn!(
372                         "Unexpected offset for accessing memory manager device: {:#}",
373                         offset
374                     );
375                 }
376             }
377         } else {
378             warn!("Out of range memory slot: {}", self.selected_slot);
379         }
380     }
381 
382     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
383         match offset {
384             SELECTION_OFFSET => {
385                 self.selected_slot = usize::from(data[0]);
386             }
387             STATUS_OFFSET => {
388                 if self.selected_slot < self.hotplug_slots.len() {
389                     let state = &mut self.hotplug_slots[self.selected_slot];
390                     // The ACPI code writes back a 1 to acknowledge the insertion
391                     if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
392                         state.inserting = false;
393                     }
394                     // Ditto for removal
395                     if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
396                         state.removing = false;
397                     }
398                     // Trigger removal of "DIMM"
399                     if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
400                         warn!("Ejection of memory not currently supported");
401                     }
402                 } else {
403                     warn!("Out of range memory slot: {}", self.selected_slot);
404                 }
405             }
406             _ => {
407                 warn!(
408                     "Unexpected offset for accessing memory manager device: {:#}",
409                     offset
410                 );
411             }
412         };
413         None
414     }
415 }
416 
417 impl MemoryManager {
418     /// Creates all memory regions based on the available RAM ranges defined
419     /// by `ram_regions`, and based on the description of the memory zones.
420     /// In practice, this function can perform multiple memory mappings of the
421     /// same backing file if there's a hole in the address space between two
422     /// RAM ranges.
423     /// One example might be `ram_regions` containing 2 regions (0-3G and 4G-6G)
424     /// and `zones` describing two zones (sizes 1G and 4G).
425     /// This function will create 3 resulting memory regions:
426     /// - The first one mapping the first memory zone entirely over the 0-1G range
427     /// - The second one mapping part of the second memory zone over the 1G-3G range
428     /// - The third one mapping the rest of the second memory zone over the 4G-6G range
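    /// A sketch of the resulting mappings for that example (illustrative values):
    /// - Region 1: zone 1, GPA 0G-1G, backing-file offset 0
    /// - Region 2: zone 2, GPA 1G-3G, backing-file offset 0
    /// - Region 3: zone 2, GPA 4G-6G, backing-file offset 2G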
429     fn create_memory_regions_from_zones(
430         ram_regions: &[(GuestAddress, usize)],
431         zones: &[MemoryZoneConfig],
432         prefault: Option<bool>,
433     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
434         let mut zones = zones.to_owned();
435         let mut mem_regions = Vec::new();
436         let mut zone = zones.remove(0);
437         let mut zone_offset = 0;
438         let mut memory_zones = HashMap::new();
439 
440         // Add zone id to the list of memory zones.
441         memory_zones.insert(zone.id.clone(), MemoryZone::default());
442 
443         for ram_region in ram_regions.iter() {
444             let mut ram_region_offset = 0;
445             let mut exit = false;
446 
447             loop {
448                 let mut ram_region_consumed = false;
449                 let mut pull_next_zone = false;
450 
451                 let ram_region_sub_size = ram_region.1 - ram_region_offset;
452                 let zone_sub_size = zone.size as usize - zone_offset;
453 
454                 let file_offset = zone_offset as u64;
455                 let region_start = ram_region
456                     .0
457                     .checked_add(ram_region_offset as u64)
458                     .ok_or(Error::GuestAddressOverFlow)?;
459                 let region_size = if zone_sub_size <= ram_region_sub_size {
460                     if zone_sub_size == ram_region_sub_size {
461                         ram_region_consumed = true;
462                     }
463 
464                     ram_region_offset += zone_sub_size;
465                     pull_next_zone = true;
466 
467                     zone_sub_size
468                 } else {
469                     zone_offset += ram_region_sub_size;
470                     ram_region_consumed = true;
471 
472                     ram_region_sub_size
473                 };
474 
475                 let region = MemoryManager::create_ram_region(
476                     &zone.file,
477                     file_offset,
478                     region_start,
479                     region_size,
480                     match prefault {
481                         Some(pf) => pf,
482                         None => zone.prefault,
483                     },
484                     zone.shared,
485                     zone.hugepages,
486                     zone.hugepage_size,
487                     zone.host_numa_node,
488                     None,
489                 )?;
490 
491                 // Add region to the list of regions associated with the
492                 // current memory zone.
493                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
494                     memory_zone.regions.push(region.clone());
495                 }
496 
497                 mem_regions.push(region);
498 
499                 if pull_next_zone {
500                     // Get the next zone and reset the offset.
501                     zone_offset = 0;
502                     if zones.is_empty() {
503                         exit = true;
504                         break;
505                     }
506                     zone = zones.remove(0);
507 
508                     // Check if the zone id already exists. In case it does, return
509                     // an error as we need unique identifiers. Otherwise, add
510                     // the new zone id to the list of memory zones.
511                     if memory_zones.contains_key(&zone.id) {
512                         error!(
513                             "Memory zone identifier '{}' found more than once. \
514                             It must be unique",
515                             zone.id,
516                         );
517                         return Err(Error::DuplicateZoneId);
518                     }
519                     memory_zones.insert(zone.id.clone(), MemoryZone::default());
520                 }
521 
522                 if ram_region_consumed {
523                     break;
524                 }
525             }
526 
527             if exit {
528                 break;
529             }
530         }
531 
532         Ok((mem_regions, memory_zones))
533     }
534 
535     // Restore the GuestMemory regions along with the MemoryZone zones.
536     fn restore_memory_regions_and_zones(
537         guest_ram_mappings: &[GuestRamMapping],
538         zones_config: &[MemoryZoneConfig],
539         prefault: Option<bool>,
540         mut existing_memory_files: HashMap<u32, File>,
541     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
542         let mut memory_regions = Vec::new();
543         let mut memory_zones = HashMap::new();
544 
545         for zone_config in zones_config {
546             memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
547         }
548 
549         for guest_ram_mapping in guest_ram_mappings {
550             for zone_config in zones_config {
551                 if guest_ram_mapping.zone_id == zone_config.id {
552                     let region = MemoryManager::create_ram_region(
553                         &zone_config.file,
554                         guest_ram_mapping.file_offset,
555                         GuestAddress(guest_ram_mapping.gpa),
556                         guest_ram_mapping.size as usize,
557                         match prefault {
558                             Some(pf) => pf,
559                             None => zone_config.prefault,
560                         },
561                         zone_config.shared,
562                         zone_config.hugepages,
563                         zone_config.hugepage_size,
564                         zone_config.host_numa_node,
565                         existing_memory_files.remove(&guest_ram_mapping.slot),
566                     )?;
567                     memory_regions.push(Arc::clone(&region));
568                     if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
569                         if guest_ram_mapping.virtio_mem {
570                             let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
571                             let region_size = region.len();
572                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
573                                 region,
574                                 resize_handler: virtio_devices::Resize::new(hotplugged_size)
575                                     .map_err(Error::EventFdFail)?,
576                                 hotplugged_size,
577                                 hugepages: zone_config.hugepages,
578                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
579                             });
580                         } else {
581                             memory_zone.regions.push(region);
582                         }
583                     }
584                 }
585             }
586         }
587 
588         memory_regions.sort_by_key(|x| x.start_addr());
589 
590         Ok((memory_regions, memory_zones))
591     }
592 
593     fn fill_saved_regions(
594         &mut self,
595         file_path: PathBuf,
596         saved_regions: MemoryRangeTable,
597     ) -> Result<(), Error> {
598         if saved_regions.is_empty() {
599             return Ok(());
600         }
601 
602         // Open (read only) the snapshot file.
603         let mut memory_file = OpenOptions::new()
604             .read(true)
605             .open(file_path)
606             .map_err(Error::SnapshotOpen)?;
607 
608         let guest_memory = self.guest_memory.memory();
609         for range in saved_regions.regions() {
610             let mut offset: u64 = 0;
611             // Here we handle the retry manually in case we can't write the
612             // whole region at once, because we can't use the read_exact_from()
613             // implementation from vm-memory's GuestMemory as it does not
614             // follow the correct behavior. For more info about this issue
615             // see: https://github.com/rust-vmm/vm-memory/issues/174
616             loop {
617                 let bytes_read = guest_memory
618                     .read_from(
619                         GuestAddress(range.gpa + offset),
620                         &mut memory_file,
621                         (range.length - offset) as usize,
622                     )
623                     .map_err(Error::SnapshotCopy)?;
624                 offset += bytes_read as u64;
625 
626                 if offset == range.length {
627                     break;
628                 }
629             }
630         }
631 
632         Ok(())
633     }
634 
635     fn validate_memory_config(
636         config: &MemoryConfig,
637         user_provided_zones: bool,
638     ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
639         let mut allow_mem_hotplug = false;
640 
641         if !user_provided_zones {
642             if config.zones.is_some() {
643                 error!(
644                     "User defined memory regions can't be provided if the \
645                     memory size is not 0"
646                 );
647                 return Err(Error::InvalidMemoryParameters);
648             }
649 
650             if config.hotplug_size.is_some() {
651                 allow_mem_hotplug = true;
652             }
653 
654             if let Some(hotplugged_size) = config.hotplugged_size {
655                 if let Some(hotplug_size) = config.hotplug_size {
656                     if hotplugged_size > hotplug_size {
657                         error!(
658                             "'hotplugged_size' {} can't be bigger than \
659                             'hotplug_size' {}",
660                             hotplugged_size, hotplug_size,
661                         );
662                         return Err(Error::InvalidMemoryParameters);
663                     }
664                 } else {
665                     error!(
666                         "Invalid to define 'hotplugged_size' when there is \
667                         no 'hotplug_size'"
668                     );
669                     return Err(Error::InvalidMemoryParameters);
670                 }
671                 if config.hotplug_method == HotplugMethod::Acpi {
672                     error!(
673                         "Invalid to define 'hotplugged_size' with hotplug \
674                         method 'acpi'"
675                     );
676                     return Err(Error::InvalidMemoryParameters);
677                 }
678             }
679 
680             // Create a single zone from the global memory config. This lets
681             // us reuse the codepath for user defined memory zones.
682             let zones = vec![MemoryZoneConfig {
683                 id: String::from(DEFAULT_MEMORY_ZONE),
684                 size: config.size,
685                 file: None,
686                 shared: config.shared,
687                 hugepages: config.hugepages,
688                 hugepage_size: config.hugepage_size,
689                 host_numa_node: None,
690                 hotplug_size: config.hotplug_size,
691                 hotplugged_size: config.hotplugged_size,
692                 prefault: config.prefault,
693             }];
694 
695             Ok((config.size, zones, allow_mem_hotplug))
696         } else {
697             if config.zones.is_none() {
698                 error!(
699                     "User defined memory regions must be provided if the \
700                     memory size is 0"
701                 );
702                 return Err(Error::MissingMemoryZones);
703             }
704             // Safe to unwrap as we checked right above that there were some
705             // Safe to unwrap as we checked right above there were some
706             // regions.
707             let zones = config.zones.clone().unwrap();
708             if zones.is_empty() {
709                 return Err(Error::MissingMemoryZones);
710             }
711 
712             let mut total_ram_size: u64 = 0;
713             for zone in zones.iter() {
714                 total_ram_size += zone.size;
715 
716                 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
717                     error!(
718                         "Invalid to set host NUMA policy for a memory zone \
719                         backed by a regular file and mapped as 'shared'"
720                     );
721                     return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
722                 }
723 
724                 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
725                     error!("Invalid to set ACPI hotplug method for memory zones");
726                     return Err(Error::InvalidHotplugMethodWithMemoryZones);
727                 }
728 
729                 if let Some(hotplugged_size) = zone.hotplugged_size {
730                     if let Some(hotplug_size) = zone.hotplug_size {
731                         if hotplugged_size > hotplug_size {
732                             error!(
733                                 "'hotplugged_size' {} can't be bigger than \
734                                 'hotplug_size' {}",
735                                 hotplugged_size, hotplug_size,
736                             );
737                             return Err(Error::InvalidMemoryParameters);
738                         }
739                     } else {
740                         error!(
741                             "Invalid to define 'hotplugged_size' when there is \
742                             no 'hotplug_size' for a memory zone"
743                         );
744                         return Err(Error::InvalidMemoryParameters);
745                     }
746                     if config.hotplug_method == HotplugMethod::Acpi {
747                         error!(
748                             "Invalid to define 'hotplugged_size' with hotplug \
749                             method 'acpi'"
750                         );
751                         return Err(Error::InvalidMemoryParameters);
752                     }
753                 }
754             }
755 
756             Ok((total_ram_size, zones, allow_mem_hotplug))
757         }
758     }
759 
760     fn allocate_address_space(&mut self) -> Result<(), Error> {
761         let mut list = Vec::new();
762 
763         for (zone_id, memory_zone) in self.memory_zones.iter() {
764             let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
765                 memory_zone
766                     .regions()
767                     .iter()
768                     .map(|r| (r.clone(), false))
769                     .collect();
770 
771             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
772                 regions.push((virtio_mem_zone.region().clone(), true));
773             }
774 
775             list.push((zone_id.clone(), regions));
776         }
777 
778         for (zone_id, regions) in list {
779             for (region, virtio_mem) in regions {
780                 let slot = self.create_userspace_mapping(
781                     region.start_addr().raw_value(),
782                     region.len() as u64,
783                     region.as_ptr() as u64,
784                     self.mergeable,
785                     false,
786                     self.log_dirty,
787                 )?;
788 
789                 let file_offset = if let Some(file_offset) = region.file_offset() {
790                     file_offset.start()
791                 } else {
792                     0
793                 };
794 
795                 self.guest_ram_mappings.push(GuestRamMapping {
796                     gpa: region.start_addr().raw_value(),
797                     size: region.len(),
798                     slot,
799                     zone_id: zone_id.clone(),
800                     virtio_mem,
801                     file_offset,
802                 });
803                 self.ram_allocator
804                     .allocate(Some(region.start_addr()), region.len(), None)
805                     .ok_or(Error::MemoryRangeAllocation)?;
806             }
807         }
808 
809         // Allocate SubRegion and Reserved address ranges.
810         for region in self.arch_mem_regions.iter() {
811             if region.r_type == RegionType::Ram {
812                 // Ignore the RAM type since ranges have already been allocated
813                 // based on the GuestMemory regions.
814                 continue;
815             }
816             self.ram_allocator
817                 .allocate(
818                     Some(GuestAddress(region.base)),
819                     region.size as GuestUsize,
820                     None,
821                 )
822                 .ok_or(Error::MemoryRangeAllocation)?;
823         }
824 
825         Ok(())
826     }
827 
828     #[allow(clippy::too_many_arguments)]
829     pub fn new(
830         vm: Arc<dyn hypervisor::Vm>,
831         config: &MemoryConfig,
832         prefault: Option<bool>,
833         phys_bits: u8,
834         #[cfg(feature = "tdx")] tdx_enabled: bool,
835         restore_data: Option<&MemoryManagerSnapshotData>,
836         existing_memory_files: Option<HashMap<u32, File>>,
837         #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
838     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
839         let user_provided_zones = config.size == 0;
840 
841         let mmio_address_space_size = mmio_address_space_size(phys_bits);
842         debug_assert_eq!(
843             (((mmio_address_space_size) >> 16) << 16),
844             mmio_address_space_size
845         );
846         let start_of_platform_device_area =
847             GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
848         let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);
849 
850         let (ram_size, zones, allow_mem_hotplug) =
851             Self::validate_memory_config(config, user_provided_zones)?;
852 
853         let (
854             start_of_device_area,
855             boot_ram,
856             current_ram,
857             arch_mem_regions,
858             memory_zones,
859             guest_memory,
860             boot_guest_memory,
861             hotplug_slots,
862             next_memory_slot,
863             selected_slot,
864             next_hotplug_slot,
865         ) = if let Some(data) = restore_data {
866             let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
867                 &data.guest_ram_mappings,
868                 &zones,
869                 prefault,
870                 existing_memory_files.unwrap_or_default(),
871             )?;
872             let guest_memory =
873                 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
874             let boot_guest_memory = guest_memory.clone();
875             (
876                 GuestAddress(data.start_of_device_area),
877                 data.boot_ram,
878                 data.current_ram,
879                 data.arch_mem_regions.clone(),
880                 memory_zones,
881                 guest_memory,
882                 boot_guest_memory,
883                 data.hotplug_slots.clone(),
884                 data.next_memory_slot,
885                 data.selected_slot,
886                 data.next_hotplug_slot,
887             )
888         } else {
889             // Init guest memory
890             let arch_mem_regions = arch::arch_memory_regions(ram_size);
891 
892             let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
893                 .iter()
894                 .filter(|r| r.2 == RegionType::Ram)
895                 .map(|r| (r.0, r.1))
896                 .collect();
897 
898             let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
899                 .iter()
900                 .map(|(a, b, c)| ArchMemRegion {
901                     base: a.0,
902                     size: *b,
903                     r_type: *c,
904                 })
905                 .collect();
906 
907             let (mem_regions, mut memory_zones) =
908                 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault)?;
909 
910             let mut guest_memory =
911                 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;
912 
913             let boot_guest_memory = guest_memory.clone();
914 
915             let mut start_of_device_area =
916                 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;
917 
918             // Update list of memory zones for resize.
919             for zone in zones.iter() {
920                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
921                     if let Some(hotplug_size) = zone.hotplug_size {
922                         if hotplug_size == 0 {
923                             error!("'hotplug_size' can't be 0");
924                             return Err(Error::InvalidHotplugSize);
925                         }
926 
927                         if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
928                             start_of_device_area = start_of_device_area
929                                 .checked_add(hotplug_size)
930                                 .ok_or(Error::GuestAddressOverFlow)?;
931                         } else {
932                             // Alignment must be "natural", i.e. the same as the virtio-mem block size
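                            // The expression below rounds `start_of_device_area` up to
                            // the next multiple of VIRTIO_MEM_ALIGN_SIZE using
                            // (addr + align - 1) / align * align. For instance, with a
                            // 128 MiB alignment (illustrative value), 0xC000_0001 rounds
                            // up to 0xC800_0000.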
933                             let start_addr = GuestAddress(
934                                 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE
935                                     - 1)
936                                     / virtio_devices::VIRTIO_MEM_ALIGN_SIZE
937                                     * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
938                             );
939 
940                             // When `prefault` is set by vm_restore, the memory manager
941                             // creates the RAM region with the `prefault` option from the
942                             // restore config rather than the one from the zone config
943                             let region = MemoryManager::create_ram_region(
944                                 &None,
945                                 0,
946                                 start_addr,
947                                 hotplug_size as usize,
948                                 match prefault {
949                                     Some(pf) => pf,
950                                     None => zone.prefault,
951                                 },
952                                 zone.shared,
953                                 zone.hugepages,
954                                 zone.hugepage_size,
955                                 zone.host_numa_node,
956                                 None,
957                             )?;
958 
959                             guest_memory = guest_memory
960                                 .insert_region(Arc::clone(&region))
961                                 .map_err(Error::GuestMemory)?;
962 
963                             let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
964                             let region_size = region.len();
965                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
966                                 region,
967                                 resize_handler: virtio_devices::Resize::new(hotplugged_size)
968                                     .map_err(Error::EventFdFail)?,
969                                 hotplugged_size,
970                                 hugepages: zone.hugepages,
971                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
972                             });
973 
974                             start_of_device_area = start_addr
975                                 .checked_add(hotplug_size)
976                                 .ok_or(Error::GuestAddressOverFlow)?;
977                         }
978                     }
979                 } else {
980                     return Err(Error::MissingZoneIdentifier);
981                 }
982             }
983 
984             let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
985             hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);
986 
987             (
988                 start_of_device_area,
989                 ram_size,
990                 ram_size,
991                 arch_mem_regions,
992                 memory_zones,
993                 guest_memory,
994                 boot_guest_memory,
995                 hotplug_slots,
996                 0,
997                 0,
998                 0,
999             )
1000         };
1001 
1002         let guest_memory = GuestMemoryAtomic::new(guest_memory);
1003 
1004         // Both MMIO and PIO address spaces start at address 0.
1005         let allocator = Arc::new(Mutex::new(
1006             SystemAllocator::new(
1007                 #[cfg(target_arch = "x86_64")]
1008                 {
1009                     GuestAddress(0)
1010                 },
1011                 #[cfg(target_arch = "x86_64")]
1012                 {
1013                     1 << 16
1014                 },
1015                 start_of_platform_device_area,
1016                 PLATFORM_DEVICE_AREA_SIZE,
1017                 layout::MEM_32BIT_DEVICES_START,
1018                 layout::MEM_32BIT_DEVICES_SIZE,
1019                 #[cfg(target_arch = "x86_64")]
1020                 vec![GsiApic::new(
1021                     X86_64_IRQ_BASE,
1022                     ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
1023                 )],
1024             )
1025             .ok_or(Error::CreateSystemAllocator)?,
1026         ));
1027 
1028         #[cfg(not(feature = "tdx"))]
1029         let dynamic = true;
1030         #[cfg(feature = "tdx")]
1031         let dynamic = !tdx_enabled;
1032 
1033         let acpi_address = if dynamic
1034             && config.hotplug_method == HotplugMethod::Acpi
1035             && (config.hotplug_size.unwrap_or_default() > 0)
1036         {
1037             Some(
1038                 allocator
1039                     .lock()
1040                     .unwrap()
1041                     .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
1042                     .ok_or(Error::AllocateMmioAddress)?,
1043             )
1044         } else {
1045             None
1046         };
1047 
1048         // If running with SGX, the start of the device area and the RAM area may
1049         // diverge later on, but at this point they are next to each other.
1050         let end_of_ram_area = start_of_device_area.unchecked_sub(1);
1051         let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();
1052 
1053         let mut memory_manager = MemoryManager {
1054             boot_guest_memory,
1055             guest_memory,
1056             next_memory_slot,
1057             start_of_device_area,
1058             end_of_device_area,
1059             end_of_ram_area,
1060             vm,
1061             hotplug_slots,
1062             selected_slot,
1063             mergeable: config.mergeable,
1064             allocator,
1065             hotplug_method: config.hotplug_method,
1066             boot_ram,
1067             current_ram,
1068             next_hotplug_slot,
1069             shared: config.shared,
1070             hugepages: config.hugepages,
1071             hugepage_size: config.hugepage_size,
1072             prefault: config.prefault,
1073             #[cfg(target_arch = "x86_64")]
1074             sgx_epc_region: None,
1075             user_provided_zones,
1076             snapshot_memory_ranges: MemoryRangeTable::default(),
1077             memory_zones,
1078             guest_ram_mappings: Vec::new(),
1079             acpi_address,
1080             log_dirty: dynamic, // Cannot log dirty pages on a TD
1081             arch_mem_regions,
1082             ram_allocator,
1083             dynamic,
1084         };
1085 
1086         memory_manager.allocate_address_space()?;
1087         #[cfg(target_arch = "x86_64")]
1088         if let Some(sgx_epc_config) = sgx_epc_config {
1089             memory_manager.setup_sgx(sgx_epc_config)?;
1090         }
1091 
1092         Ok(Arc::new(Mutex::new(memory_manager)))
1093     }
1094 
1095     pub fn new_from_snapshot(
1096         snapshot: &Snapshot,
1097         vm: Arc<dyn hypervisor::Vm>,
1098         config: &MemoryConfig,
1099         source_url: Option<&str>,
1100         prefault: bool,
1101         phys_bits: u8,
1102     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
1103         if let Some(source_url) = source_url {
1104             let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
1105             memory_file_path.push(String::from(SNAPSHOT_FILENAME));
1106 
1107             let mem_snapshot: MemoryManagerSnapshotData = snapshot
1108                 .to_versioned_state(MEMORY_MANAGER_SNAPSHOT_ID)
1109                 .map_err(Error::Restore)?;
1110 
1111             let mm = MemoryManager::new(
1112                 vm,
1113                 config,
1114                 Some(prefault),
1115                 phys_bits,
1116                 #[cfg(feature = "tdx")]
1117                 false,
1118                 Some(&mem_snapshot),
1119                 None,
1120                 #[cfg(target_arch = "x86_64")]
1121                 None,
1122             )?;
1123 
1124             mm.lock()
1125                 .unwrap()
1126                 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;
1127 
1128             Ok(mm)
1129         } else {
1130             Err(Error::RestoreMissingSourceUrl)
1131         }
1132     }
1133 
1134     fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
1135         let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };
1136 
1137         if res < 0 {
1138             Err(io::Error::last_os_error())
1139         } else {
1140             Ok(res as RawFd)
1141         }
1142     }
1143 
1144     fn mbind(
1145         addr: *mut u8,
1146         len: u64,
1147         mode: u32,
1148         nodemask: Vec<u64>,
1149         maxnode: u64,
1150         flags: u32,
1151     ) -> Result<(), io::Error> {
1152         let res = unsafe {
1153             libc::syscall(
1154                 libc::SYS_mbind,
1155                 addr as *mut libc::c_void,
1156                 len,
1157                 mode,
1158                 nodemask.as_ptr(),
1159                 maxnode,
1160                 flags,
1161             )
1162         };
1163 
1164         if res < 0 {
1165             Err(io::Error::last_os_error())
1166         } else {
1167             Ok(())
1168         }
1169     }
1170 
1171     fn open_memory_file(
1172         backing_file: &Option<PathBuf>,
1173         file_offset: u64,
1174         size: usize,
1175         hugepages: bool,
1176         hugepage_size: Option<u64>,
1177     ) -> Result<(File, u64), Error> {
1178         let (f, f_off) = match backing_file {
1179             Some(ref file) => {
1180                 if file.is_dir() {
1181                     // Override file offset as it does not apply in this case.
1182                     info!(
1183                         "Ignoring file offset since the backing file is a \
1184                         temporary file created from the specified directory."
1185                     );
1186                     let fs_str = format!("{}{}", file.display(), "/tmpfile_XXXXXX");
1187                     let fs = ffi::CString::new(fs_str).unwrap();
1188                     let mut path = fs.as_bytes_with_nul().to_owned();
1189                     let path_ptr = path.as_mut_ptr() as *mut _;
1190                     let fd = unsafe { libc::mkstemp(path_ptr) };
1191                     unsafe { libc::unlink(path_ptr) };
1192                     let f = unsafe { File::from_raw_fd(fd) };
1193                     f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1194 
1195                     (f, 0)
1196                 } else {
1197                     let f = OpenOptions::new()
1198                         .read(true)
1199                         .write(true)
1200                         .open(file)
1201                         .map_err(Error::SharedFileCreate)?;
1202 
1203                     (f, file_offset)
1204                 }
1205             }
1206             None => {
1207                 let fd = Self::memfd_create(
1208                     &ffi::CString::new("ch_ram").unwrap(),
1209                     if hugepages {
1210                         libc::MFD_HUGETLB
1211                             | if let Some(hugepage_size) = hugepage_size {
1212                                 /*
1213                                  * From the Linux kernel:
1214                                  * Several system calls take a flag to request "hugetlb" huge pages.
1215                                  * Without further specification, these system calls will use the
1216                                  * system's default huge page size.  If a system supports multiple
1217                                  * huge page sizes, the desired huge page size can be specified in
1218                                  * bits [26:31] of the flag arguments.  The value in these 6 bits
1219                                  * will encode the log2 of the huge page size.
1220                                  */
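                                // For example, with 2 MiB huge pages,
                                // hugepage_size.trailing_zeros() == 21, so the flag
                                // value is 21 << 26, the kernel's MFD_HUGE_2MB encoding.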
1221 
1222                                 hugepage_size.trailing_zeros() << 26
1223                             } else {
1224                                 // Use the system default huge page size
1225                                 0
1226                             }
1227                     } else {
1228                         0
1229                     },
1230                 )
1231                 .map_err(Error::SharedFileCreate)?;
1232 
1233                 let f = unsafe { File::from_raw_fd(fd) };
1234                 f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1235 
1236                 (f, 0)
1237             }
1238         };
1239 
1240         Ok((f, f_off))
1241     }
1242 
1243     #[allow(clippy::too_many_arguments)]
1244     fn create_ram_region(
1245         backing_file: &Option<PathBuf>,
1246         file_offset: u64,
1247         start_addr: GuestAddress,
1248         size: usize,
1249         prefault: bool,
1250         shared: bool,
1251         hugepages: bool,
1252         hugepage_size: Option<u64>,
1253         host_numa_node: Option<u32>,
1254         existing_memory_file: Option<File>,
1255     ) -> Result<Arc<GuestRegionMmap>, Error> {
1256         let (f, f_off) = if let Some(f) = existing_memory_file {
1257             (f, file_offset)
1258         } else {
1259             Self::open_memory_file(backing_file, file_offset, size, hugepages, hugepage_size)?
1260         };
1261 
1262         let mut mmap_flags = libc::MAP_NORESERVE
1263             | if shared {
1264                 libc::MAP_SHARED
1265             } else {
1266                 libc::MAP_PRIVATE
1267             };
1268         if prefault {
1269             mmap_flags |= libc::MAP_POPULATE;
1270         }
1271 
1272         let region = GuestRegionMmap::new(
1273             MmapRegion::build(
1274                 Some(FileOffset::new(f, f_off)),
1275                 size,
1276                 libc::PROT_READ | libc::PROT_WRITE,
1277                 mmap_flags,
1278             )
1279             .map_err(Error::GuestMemoryRegion)?,
1280             start_addr,
1281         )
1282         .map_err(Error::GuestMemory)?;
1283 
1284         // Apply NUMA policy if needed.
1285         if let Some(node) = host_numa_node {
1286             let addr = region.deref().as_ptr();
1287             let len = region.deref().size() as u64;
1288             let mode = MPOL_BIND;
1289             let mut nodemask: Vec<u64> = Vec::new();
1290             let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
1291 
1292             // Linux is kind of buggy in the way it interprets maxnode as it
1293             // will cut off the last node. That's why we have to add 1 to what
1294             // we would consider as the proper maxnode value.
1295             let maxnode = node as u64 + 1 + 1;
1296 
1297             // Allocate the right size for the vector.
1298             nodemask.resize((node as usize / 64) + 1, 0);
1299 
1300             // Fill the global bitmask through the nodemask vector.
1301             let idx = (node / 64) as usize;
1302             let shift = node % 64;
1303             nodemask[idx] |= 1u64 << shift;
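            // As a concrete (illustrative) example, host_numa_node = 65 gives
            // maxnode = 67 and nodemask = [0, 0b10]: two u64 words, with bit 1
            // of the second word set.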
1304 
1305             // Policies are enforced by using MPOL_MF_MOVE flag as it will
1306             // force the kernel to move all pages that might have been already
1307             // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
1308             // used to throw an error if MPOL_MF_MOVE didn't succeed.
1309             // MPOL_BIND is the selected mode as it specifies a strict policy
1310             // that restricts memory allocation to the nodes specified in the
1311             // nodemask.
1312             Self::mbind(addr, len, mode, nodemask, maxnode, flags)
1313                 .map_err(Error::ApplyNumaPolicy)?;
1314         }
1315 
1316         Ok(Arc::new(region))
1317     }
1318 
1319     // Update the GuestMemoryMmap with the new range
1320     fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
1321         let guest_memory = self
1322             .guest_memory
1323             .memory()
1324             .insert_region(region)
1325             .map_err(Error::GuestMemory)?;
1326         self.guest_memory.lock().unwrap().replace(guest_memory);
1327 
1328         Ok(())
1329     }
1330 
1331     //
1332     // Calculate the start address of an area next to RAM.
1333     //
1334     // If memory hotplug is allowed, the start address needs to be aligned
1335     // (rounded up) to a 128MiB boundary.
1336     // If memory hotplug is not allowed, no alignment is required.
1337     // If RAM ends below the 32-bit reserved region, the area starts at the 64-bit RAM start instead.
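    // For example, with hotplug allowed and mem_end = 0x1_4000_0001, the computed
    // start address is the next 128MiB boundary, i.e. 0x1_4800_0000.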
1338     fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
1339         let mut start_addr = if allow_mem_hotplug {
1340             GuestAddress(mem_end.0 | ((128 << 20) - 1))
1341         } else {
1342             mem_end
1343         };
1344 
1345         start_addr = start_addr
1346             .checked_add(1)
1347             .ok_or(Error::GuestAddressOverFlow)?;
1348 
1349         if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
1350             return Ok(arch::layout::RAM_64BIT_START);
1351         }
1352 
1353         Ok(start_addr)
1354     }
1355 
1356     pub fn add_ram_region(
1357         &mut self,
1358         start_addr: GuestAddress,
1359         size: usize,
1360     ) -> Result<Arc<GuestRegionMmap>, Error> {
1361         // Allocate memory for the region
1362         let region = MemoryManager::create_ram_region(
1363             &None,
1364             0,
1365             start_addr,
1366             size,
1367             self.prefault,
1368             self.shared,
1369             self.hugepages,
1370             self.hugepage_size,
1371             None,
1372             None,
1373         )?;
1374 
1375         // Map it into the guest
1376         let slot = self.create_userspace_mapping(
1377             region.start_addr().0,
1378             region.len() as u64,
1379             region.as_ptr() as u64,
1380             self.mergeable,
1381             false,
1382             self.log_dirty,
1383         )?;
1384         self.guest_ram_mappings.push(GuestRamMapping {
1385             gpa: region.start_addr().raw_value(),
1386             size: region.len(),
1387             slot,
1388             zone_id: DEFAULT_MEMORY_ZONE.to_string(),
1389             virtio_mem: false,
1390             file_offset: 0,
1391         });
1392 
1393         self.add_region(Arc::clone(&region))?;
1394 
1395         Ok(region)
1396     }
1397 
1398     fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
1399         info!("Hotplugging new RAM: {}", size);
1400 
1401         // Check that there is a free slot
1402         if self.next_hotplug_slot >= HOTPLUG_COUNT {
1403             return Err(Error::NoSlotAvailable);
1404         }
1405 
1406         // "Inserted" DIMM must have a size that is a multiple of 128MiB
1407         if size % (128 << 20) != 0 {
1408             return Err(Error::InvalidSize);
1409         }
1410 
1411         let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;
1412 
1413         if start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area {
1414             return Err(Error::InsufficientHotplugRam);
1415         }
1416 
1417         let region = self.add_ram_region(start_addr, size)?;
1418 
1419         // Add region to the list of regions associated with the default
1420         // memory zone.
1421         if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
1422             memory_zone.regions.push(Arc::clone(&region));
1423         }
1424 
1425         // Tell the allocator
1426         self.ram_allocator
1427             .allocate(Some(start_addr), size as GuestUsize, None)
1428             .ok_or(Error::MemoryRangeAllocation)?;
1429 
1430         // Update the slot so that it can be queried via the I/O port
1431         let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
1432         slot.active = true;
1433         slot.inserting = true;
1434         slot.base = region.start_addr().0;
1435         slot.length = region.len() as u64;
1436 
1437         self.next_hotplug_slot += 1;
1438 
1439         Ok(region)
1440     }
1441 
1442     pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1443         self.guest_memory.clone()
1444     }
1445 
1446     pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
1447         self.boot_guest_memory.clone()
1448     }
1449 
1450     pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
1451         self.allocator.clone()
1452     }
1453 
1454     pub fn start_of_device_area(&self) -> GuestAddress {
1455         self.start_of_device_area
1456     }
1457 
1458     pub fn end_of_device_area(&self) -> GuestAddress {
1459         self.end_of_device_area
1460     }
1461 
1462     pub fn allocate_memory_slot(&mut self) -> u32 {
1463         let slot_id = self.next_memory_slot;
1464         self.next_memory_slot += 1;
1465         slot_id
1466     }
1467 
1468     pub fn create_userspace_mapping(
1469         &mut self,
1470         guest_phys_addr: u64,
1471         memory_size: u64,
1472         userspace_addr: u64,
1473         mergeable: bool,
1474         readonly: bool,
1475         log_dirty: bool,
1476     ) -> Result<u32, Error> {
1477         let slot = self.allocate_memory_slot();
1478         let mem_region = self.vm.make_user_memory_region(
1479             slot,
1480             guest_phys_addr,
1481             memory_size,
1482             userspace_addr,
1483             readonly,
1484             log_dirty,
1485         );
1486 
1487         info!(
1488             "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
1489             guest_phys_addr, userspace_addr, memory_size, slot
1490         );
1491 
1492         self.vm
1493             .create_user_memory_region(mem_region)
1494             .map_err(Error::CreateUserMemoryRegion)?;
1495 
1496         // Mark the pages as mergeable if explicitly asked for.
1497         if mergeable {
1498             // Safe because the address and size are valid since the
1499             // mmap succeeded.
1500             let ret = unsafe {
1501                 libc::madvise(
1502                     userspace_addr as *mut libc::c_void,
1503                     memory_size as libc::size_t,
1504                     libc::MADV_MERGEABLE,
1505                 )
1506             };
1507             if ret != 0 {
1508                 let err = io::Error::last_os_error();
1509                 // Safe to unwrap because the error is constructed with
1510                 // last_os_error(), which ensures the output will be Some().
1511                 let errno = err.raw_os_error().unwrap();
1512                 if errno == libc::EINVAL {
1513                     warn!("kernel not configured with CONFIG_KSM");
1514                 } else {
1515                     warn!("madvise error: {}", err);
1516                 }
1517                 warn!("failed to mark pages as mergeable");
1518             }
1519         }
1520 
1521         info!(
1522             "Created userspace mapping: {:x} -> {:x} {:x}",
1523             guest_phys_addr, userspace_addr, memory_size
1524         );
1525 
1526         Ok(slot)
1527     }
1528 
1529     pub fn remove_userspace_mapping(
1530         &mut self,
1531         guest_phys_addr: u64,
1532         memory_size: u64,
1533         userspace_addr: u64,
1534         mergeable: bool,
1535         slot: u32,
1536     ) -> Result<(), Error> {
1537         let mem_region = self.vm.make_user_memory_region(
1538             slot,
1539             guest_phys_addr,
1540             memory_size,
1541             userspace_addr,
1542             false, /* readonly -- don't care */
1543             false, /* log dirty */
1544         );
1545 
1546         self.vm
1547             .remove_user_memory_region(mem_region)
1548             .map_err(Error::RemoveUserMemoryRegion)?;
1549 
1550         // Mark the pages as unmergeable if they were previously marked as
1551         // mergeable.
1552         if mergeable {
1553             // Safe because the address and size are valid as the region was
1554             // previously advised.
1555             let ret = unsafe {
1556                 libc::madvise(
1557                     userspace_addr as *mut libc::c_void,
1558                     memory_size as libc::size_t,
1559                     libc::MADV_UNMERGEABLE,
1560                 )
1561             };
1562             if ret != 0 {
1563                 let err = io::Error::last_os_error();
1564                 // Safe to unwrap because the error is constructed with
1565                 // last_os_error(), which ensures the output will be Some().
1566                 let errno = err.raw_os_error().unwrap();
1567                 if errno == libc::EINVAL {
1568                     warn!("kernel not configured with CONFIG_KSM");
1569                 } else {
1570                     warn!("madvise error: {}", err);
1571                 }
1572                 warn!("failed to mark pages as unmergeable");
1573             }
1574         }
1575 
1576         info!(
1577             "Removed userspace mapping: {:x} -> {:x} {:x}",
1578             guest_phys_addr, userspace_addr, memory_size
1579         );
1580 
1581         Ok(())
1582     }
1583 
1584     pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
1585         if let Some(memory_zone) = self.memory_zones.get_mut(id) {
1586             if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
1587                 virtio_mem_zone
1588                     .resize_handler()
1589                     .work(size)
1590                     .map_err(Error::VirtioMemResizeFail)?;
1591 
1592                 // Keep the hotplugged_size up to date.
1593                 virtio_mem_zone.hotplugged_size = size;
1594             } else {
1595                 error!("Failed resizing virtio-mem region: No virtio-mem handler");
1596                 return Err(Error::MissingVirtioMemHandler);
1597             }
1598 
1599             return Ok(());
1600         }
1601 
1602         error!("Failed resizing virtio-mem region: Unknown memory zone");
1603         Err(Error::UnknownMemoryZone)
1604     }
1605 
1606     /// If this function results in adding a new memory region to the
1607     /// guest memory, the new region is returned to the caller. The virtio-mem
1608     /// use case never adds a new region as the whole hotpluggable memory has
1609     /// already been allocated at boot time.
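         /// For example (hypothetical call, assuming a dynamic memory configuration
         /// with 1GiB of boot RAM): with the ACPI hotplug method, resize(2 << 30)
         /// hot-plugs a 1GiB DIMM and returns the new region, whereas with
         /// virtio-mem the existing device is resized and no new region is returned.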
1610     pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
1611         if self.user_provided_zones {
1612             error!(
1613                 "Not allowed to resize guest memory when backed with user \
1614                 defined memory zones."
1615             );
1616             return Err(Error::InvalidResizeWithMemoryZones);
1617         }
1618 
1619         let mut region: Option<Arc<GuestRegionMmap>> = None;
1620         match self.hotplug_method {
1621             HotplugMethod::VirtioMem => {
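                     // virtio-mem expresses the plugged size relative to the boot
                     // RAM, hence the (desired_ram - boot_ram) below.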
1622                 if desired_ram >= self.boot_ram {
1623                     if !self.dynamic {
1624                         return Ok(region);
1625                     }
1626 
1627                     self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
1628                     self.current_ram = desired_ram;
1629                 }
1630             }
1631             HotplugMethod::Acpi => {
1632                 if desired_ram > self.current_ram {
1633                     if !self.dynamic {
1634                         return Ok(region);
1635                     }
1636 
1637                     region =
1638                         Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
1639                     self.current_ram = desired_ram;
1640                 }
1641             }
1642         }
1643         Ok(region)
1644     }
1645 
1646     pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
1647         if !self.user_provided_zones {
1648             error!(
1649                 "Not allowed to resize guest memory zone when no zone is \
1650                 defined."
1651             );
1652             return Err(Error::ResizeZone);
1653         }
1654 
1655         self.virtio_mem_resize(id, virtio_mem_size)
1656     }
1657 
1658     #[cfg(target_arch = "x86_64")]
1659     pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
1660         let file = OpenOptions::new()
1661             .read(true)
1662             .open("/dev/sgx_provision")
1663             .map_err(Error::SgxProvisionOpen)?;
1664         self.vm
1665             .enable_sgx_attribute(file)
1666             .map_err(Error::SgxEnableProvisioning)?;
1667 
1668         // Go over each EPC section and verify its size is a 4k multiple. At
1669         // the same time, calculate the total size needed for the contiguous
1670         // EPC region.
1671         let mut epc_region_size = 0;
1672         for epc_section in sgx_epc_config.iter() {
1673             if epc_section.size == 0 {
1674                 return Err(Error::EpcSectionSizeInvalid);
1675             }
1676             if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
1677                 return Err(Error::EpcSectionSizeInvalid);
1678             }
1679 
1680             epc_region_size += epc_section.size;
1681         }
1682 
1683         // Place the SGX EPC region on a 4k boundary between the RAM and the device area
1684         let epc_region_start = GuestAddress(
1685             ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
1686         );
1687 
1688         self.start_of_device_area = epc_region_start
1689             .checked_add(epc_region_size)
1690             .ok_or(Error::GuestAddressOverFlow)?;
1691 
1692         let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
1693         info!(
1694             "SGX EPC region: 0x{:x} (0x{:x})",
1695             epc_region_start.0, epc_region_size
1696         );
1697 
1698         // Each section can be memory mapped into the allocated region.
1699         let mut epc_section_start = epc_region_start.raw_value();
1700         for epc_section in sgx_epc_config.iter() {
1701             let file = OpenOptions::new()
1702                 .read(true)
1703                 .write(true)
1704                 .open("/dev/sgx_vepc")
1705                 .map_err(Error::SgxVirtEpcOpen)?;
1706 
1707             let prot = PROT_READ | PROT_WRITE;
1708             let mut flags = MAP_NORESERVE | MAP_SHARED;
1709             if epc_section.prefault {
1710                 flags |= MAP_POPULATE;
1711             }
1712 
1713             // We can't use the vm-memory crate to perform the memory mapping
1714             // here as it would try to ensure the size of the backing file
1715             // matches the size of the expected mapping. The /dev/sgx_vepc
1716             // device does not work that way: it provides a file descriptor
1717             // whose size does not match the mapping size, as it's just a way
1718             // to let KVM know that an EPC section is being created for the guest.
1719             let host_addr = unsafe {
1720                 libc::mmap(
1721                     std::ptr::null_mut(),
1722                     epc_section.size as usize,
1723                     prot,
1724                     flags,
1725                     file.as_raw_fd(),
1726                     0,
1727                 )
1728             } as u64;
1729 
1730             info!(
1731                 "Adding SGX EPC section: 0x{:x} (0x{:x})",
1732                 epc_section_start, epc_section.size
1733             );
1734 
1735             let _mem_slot = self.create_userspace_mapping(
1736                 epc_section_start,
1737                 epc_section.size,
1738                 host_addr,
1739                 false,
1740                 false,
1741                 false,
1742             )?;
1743 
1744             sgx_epc_region.insert(
1745                 epc_section.id.clone(),
1746                 SgxEpcSection::new(
1747                     GuestAddress(epc_section_start),
1748                     epc_section.size as GuestUsize,
1749                 ),
1750             );
1751 
1752             epc_section_start += epc_section.size;
1753         }
1754 
1755         self.sgx_epc_region = Some(sgx_epc_region);
1756 
1757         Ok(())
1758     }
1759 
1760     #[cfg(target_arch = "x86_64")]
1761     pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
1762         &self.sgx_epc_region
1763     }
1764 
1765     pub fn is_hardlink(f: &File) -> bool {
1766         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
1767         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
1768         if ret != 0 {
1769             error!("Couldn't fstat the backing file");
1770             return false;
1771         }
1772 
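             // Anonymous files (e.g. memfd) have a link count of 0; a non-zero
             // st_nlink means the backing file is actually linked into a host
             // filesystem.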
1773         unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
1774     }
1775 
1776     pub fn memory_zones(&self) -> &MemoryZones {
1777         &self.memory_zones
1778     }
1779 
1780     pub fn memory_range_table(
1781         &self,
1782         snapshot: bool,
1783     ) -> std::result::Result<MemoryRangeTable, MigratableError> {
1784         let mut table = MemoryRangeTable::default();
1785 
1786         for memory_zone in self.memory_zones.values() {
1787             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
1788                 table.extend(virtio_mem_zone.plugged_ranges());
1789             }
1790 
1791             for region in memory_zone.regions() {
1792                 if snapshot {
1793                     if let Some(file_offset) = region.file_offset() {
1794                         if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
1795                             && Self::is_hardlink(file_offset.file())
1796                         {
1797                             // In this very specific case, we know the memory
1798                             // region is backed by a file on the host filesystem
1799                             // that can be accessed by the user, and additionally
1800                             // the mapping is shared, which means that modifications
1801                             // to the content are written to the actual file.
1802                             // When meeting these conditions, we can skip the
1803                             // copy of the memory content for this specific region,
1804                             // as we can assume the user will have it saved through
1805                             // the backing file already.
1806                             continue;
1807                         }
1808                     }
1809                 }
1810 
1811                 table.push(MemoryRange {
1812                     gpa: region.start_addr().raw_value(),
1813                     length: region.len() as u64,
1814                 });
1815             }
1816         }
1817 
1818         Ok(table)
1819     }
1820 
1821     pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
1822         MemoryManagerSnapshotData {
1823             memory_ranges: self.snapshot_memory_ranges.clone(),
1824             guest_ram_mappings: self.guest_ram_mappings.clone(),
1825             start_of_device_area: self.start_of_device_area.0,
1826             boot_ram: self.boot_ram,
1827             current_ram: self.current_ram,
1828             arch_mem_regions: self.arch_mem_regions.clone(),
1829             hotplug_slots: self.hotplug_slots.clone(),
1830             next_memory_slot: self.next_memory_slot,
1831             selected_slot: self.selected_slot,
1832             next_hotplug_slot: self.next_hotplug_slot,
1833         }
1834     }
1835 
1836     pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
1837         let mut memory_slot_fds = HashMap::new();
1838         for guest_ram_mapping in &self.guest_ram_mappings {
1839             let slot = guest_ram_mapping.slot;
1840             let guest_memory = self.guest_memory.memory();
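                 // Guest RAM regions are expected to be backed by a file descriptor
                 // (a backing file or an anonymous memfd), hence the unwrap() calls
                 // below.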
1841             let file = guest_memory
1842                 .find_region(GuestAddress(guest_ram_mapping.gpa))
1843                 .unwrap()
1844                 .file_offset()
1845                 .unwrap()
1846                 .file();
1847             memory_slot_fds.insert(slot, file.as_raw_fd());
1848         }
1849         memory_slot_fds
1850     }
1851 
1852     pub fn acpi_address(&self) -> Option<GuestAddress> {
1853         self.acpi_address
1854     }
1855 
1856     pub fn num_guest_ram_mappings(&self) -> u32 {
1857         self.guest_ram_mappings.len() as u32
1858     }
1859 
1860     #[cfg(feature = "guest_debug")]
1861     pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
1862         let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
1863         mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);
1864 
1865         let mut mem_offset_in_elf = mem_offset;
1866         let mut ram_maps = BTreeMap::new();
1867         for mapping in mapping_sorted_by_gpa.iter() {
1868             ram_maps.insert(
1869                 mapping.gpa,
1870                 CoredumpMemoryRegion {
1871                     mem_offset_in_elf,
1872                     mem_size: mapping.size,
1873                 },
1874             );
1875             mem_offset_in_elf += mapping.size;
1876         }
1877 
1878         CoredumpMemoryRegions { ram_maps }
1879     }
1880 
1881     #[cfg(feature = "guest_debug")]
1882     pub fn coredump_iterate_save_mem(
1883         &mut self,
1884         dump_state: &DumpState,
1885     ) -> std::result::Result<(), GuestDebuggableError> {
1886         let snapshot_memory_ranges = self
1887             .memory_range_table(false)
1888             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
1889 
1890         if snapshot_memory_ranges.is_empty() {
1891             return Ok(());
1892         }
1893 
1894         let mut coredump_file = dump_state.file.as_ref().unwrap();
1895 
1896         let guest_memory = self.guest_memory.memory();
1897         let mut total_bytes: u64 = 0;
1898 
1899         for range in snapshot_memory_ranges.regions() {
1900             let mut offset: u64 = 0;
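                 // write_to() can perform a short write, so keep looping until the
                 // whole range has been copied to the coredump file.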
1901             loop {
1902                 let bytes_written = guest_memory
1903                     .write_to(
1904                         GuestAddress(range.gpa + offset),
1905                         &mut coredump_file,
1906                         (range.length - offset) as usize,
1907                     )
1908                     .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
1909                 offset += bytes_written as u64;
1910                 total_bytes += bytes_written as u64;
1911 
1912                 if offset == range.length {
1913                     break;
1914                 }
1915             }
1916         }
1917 
1918         debug!("coredump total bytes {}", total_bytes);
1919         Ok(())
1920     }
1921 }
1922 
1923 struct MemoryNotify {
1924     slot_id: usize,
1925 }
1926 
1927 impl Aml for MemoryNotify {
1928     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
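             // Emits: If (Arg0 == slot_id) { Notify(M###, Arg1) }, i.e. one arm of
             // the MTFY dispatch method built in MemoryMethods.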
1929         let object = aml::Path::new(&format!("M{:03}", self.slot_id));
1930         aml::If::new(
1931             &aml::Equal::new(&aml::Arg(0), &self.slot_id),
1932             vec![&aml::Notify::new(&object, &aml::Arg(1))],
1933         )
1934         .append_aml_bytes(bytes)
1935     }
1936 }
1937 
1938 struct MemorySlot {
1939     slot_id: usize,
1940 }
1941 
1942 impl Aml for MemorySlot {
1943     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1944         aml::Device::new(
1945             format!("M{:03}", self.slot_id).as_str().into(),
1946             vec![
1947                 &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0C80")),
1948                 &aml::Name::new("_UID".into(), &self.slot_id),
1949                 /*
1950                 _STA return value:
1951                 Bit [0] – Set if the device is present.
1952                 Bit [1] – Set if the device is enabled and decoding its resources.
1953                 Bit [2] – Set if the device should be shown in the UI.
1954                 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1955                 Bit [4] – Set if the battery is present.
1956                 Bits [31:5] – Reserved (must be cleared).
1957                 */
1958                 &aml::Method::new(
1959                     "_STA".into(),
1960                     0,
1961                     false,
1962                     // Call into MSTA method which will interrogate device
1963                     vec![&aml::Return::new(&aml::MethodCall::new(
1964                         "MSTA".into(),
1965                         vec![&self.slot_id],
1966                     ))],
1967                 ),
1968                 // Get details of memory
1969                 &aml::Method::new(
1970                     "_CRS".into(),
1971                     0,
1972                     false,
1973                     // Call into MCRS which provides actual memory details
1974                     vec![&aml::Return::new(&aml::MethodCall::new(
1975                         "MCRS".into(),
1976                         vec![&self.slot_id],
1977                     ))],
1978                 ),
1979             ],
1980         )
1981         .append_aml_bytes(bytes)
1982     }
1983 }
1984 
1985 struct MemorySlots {
1986     slots: usize,
1987 }
1988 
1989 impl Aml for MemorySlots {
1990     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1991         for slot_id in 0..self.slots {
1992             MemorySlot { slot_id }.append_aml_bytes(bytes);
1993         }
1994     }
1995 }
1996 
1997 struct MemoryMethods {
1998     slots: usize,
1999 }
2000 
2001 impl Aml for MemoryMethods {
2002     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2003         // Add "MTFY" notification method
2004         let mut memory_notifies = Vec::new();
2005         for slot_id in 0..self.slots {
2006             memory_notifies.push(MemoryNotify { slot_id });
2007         }
2008 
2009         let mut memory_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
2010         for memory_notifier in memory_notifies.iter() {
2011             memory_notifies_refs.push(memory_notifier);
2012         }
2013 
2014         aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).append_aml_bytes(bytes);
2015 
2016         // MSCN method
2017         aml::Method::new(
2018             "MSCN".into(),
2019             0,
2020             true,
2021             vec![
2022                 // Take lock defined above
2023                 &aml::Acquire::new("MLCK".into(), 0xffff),
2024                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2025                 &aml::While::new(
2026                     &aml::LessThan::new(&aml::Local(0), &self.slots),
2027                     vec![
2028                         // Write the slot number being scanned (loop counter in Local0) to the selector field
2029                         &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
2030                         // Check if MINS bit is set (inserting)
2031                         &aml::If::new(
2032                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2033                             // Notify device if it is
2034                             vec![
2035                                 &aml::MethodCall::new(
2036                                     "MTFY".into(),
2037                                     vec![&aml::Local(0), &aml::ONE],
2038                                 ),
2039                                 // Reset MINS bit
2040                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2041                             ],
2042                         ),
2043                         // Check if MRMV bit is set
2044                         &aml::If::new(
2045                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2046                             // Notify device if it is (with the eject constant 0x3)
2047                             vec![
2048                                 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
2049                                 // Reset MRMV bit
2050                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2051                             ],
2052                         ),
2053                         &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2054                     ],
2055                 ),
2056                 // Release lock
2057                 &aml::Release::new("MLCK".into()),
2058             ],
2059         )
2060         .append_aml_bytes(bytes);
2061 
2062         // Memory status method
2063         aml::Method::new(
2064             "MSTA".into(),
2065             1,
2066             true,
2067             vec![
2068                 // Take lock defined above
2069                 &aml::Acquire::new("MLCK".into(), 0xffff),
2070                 // Write slot number (in first argument) to I/O port via field
2071                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2072                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2073                 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
2074                 &aml::If::new(
2075                     &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
2076                     vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2077                 ),
2078                 // Release lock
2079                 &aml::Release::new("MLCK".into()),
2080                 // Return 0 or 0xf
2081                 &aml::Return::new(&aml::Local(0)),
2082             ],
2083         )
2084         .append_aml_bytes(bytes);
2085 
2086         // Memory range method
2087         aml::Method::new(
2088             "MCRS".into(),
2089             1,
2090             true,
2091             vec![
2092                 // Take lock defined above
2093                 &aml::Acquire::new("MLCK".into(), 0xffff),
2094                 // Write slot number (in first argument) to I/O port via field
2095                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2096                 &aml::Name::new(
2097                     "MR64".into(),
2098                     &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2099                         aml::AddressSpaceCachable::Cacheable,
2100                         true,
2101                         0x0000_0000_0000_0000u64,
2102                         0xFFFF_FFFF_FFFF_FFFEu64,
2103                     )]),
2104                 ),
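                     // Create named fields over the MR64 buffer so the minimum address,
                     // maximum address and length of the descriptor can be filled in
                     // from the MHPC registers below.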
2105                 &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &14usize, "MINL".into()),
2106                 &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &18usize, "MINH".into()),
2107                 &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &22usize, "MAXL".into()),
2108                 &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &26usize, "MAXH".into()),
2109                 &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &38usize, "LENL".into()),
2110                 &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &42usize, "LENH".into()),
2111                 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
2112                 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
2113                 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
2114                 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
2115                 &aml::Add::new(
2116                     &aml::Path::new("MAXL"),
2117                     &aml::Path::new("MINL"),
2118                     &aml::Path::new("LENL"),
2119                 ),
2120                 &aml::Add::new(
2121                     &aml::Path::new("MAXH"),
2122                     &aml::Path::new("MINH"),
2123                     &aml::Path::new("LENH"),
2124                 ),
2125                 &aml::If::new(
2126                     &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
2127                     vec![&aml::Add::new(
2128                         &aml::Path::new("MAXH"),
2129                         &aml::ONE,
2130                         &aml::Path::new("MAXH"),
2131                     )],
2132                 ),
2133                 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
2134                 // Release lock
2135                 &aml::Release::new("MLCK".into()),
2136                 &aml::Return::new(&aml::Path::new("MR64")),
2137             ],
2138         )
2139         .append_aml_bytes(bytes)
2140     }
2141 }
2142 
2143 impl Aml for MemoryManager {
2144     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2145         if let Some(acpi_address) = self.acpi_address {
2146             // Memory Hotplug Controller
2147             aml::Device::new(
2148                 "_SB_.MHPC".into(),
2149                 vec![
2150                     &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
2151                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2152                     // Mutex to protect concurrent access as we write to select the slot and then read back its status
2153                     &aml::Mutex::new("MLCK".into(), 0),
2154                     &aml::Name::new(
2155                         "_CRS".into(),
2156                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2157                             aml::AddressSpaceCachable::NotCacheable,
2158                             true,
2159                             acpi_address.0 as u64,
2160                             acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
2161                         )]),
2162                     ),
2163                     // OpRegion and Fields map MMIO range into individual field values
2164                     &aml::OpRegion::new(
2165                         "MHPR".into(),
2166                         aml::OpRegionSpace::SystemMemory,
2167                         acpi_address.0 as usize,
2168                         MEMORY_MANAGER_ACPI_SIZE,
2169                     ),
2170                     &aml::Field::new(
2171                         "MHPR".into(),
2172                         aml::FieldAccessType::DWord,
2173                         aml::FieldUpdateRule::Preserve,
2174                         vec![
2175                             aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
2176                             aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
2177                             aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
2178                             aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
2179                         ],
2180                     ),
2181                     &aml::Field::new(
2182                         "MHPR".into(),
2183                         aml::FieldAccessType::DWord,
2184                         aml::FieldUpdateRule::Preserve,
2185                         vec![
2186                             aml::FieldEntry::Reserved(128),
2187                             aml::FieldEntry::Named(*b"MHPX", 32), // PXM
2188                         ],
2189                     ),
2190                     &aml::Field::new(
2191                         "MHPR".into(),
2192                         aml::FieldAccessType::Byte,
2193                         aml::FieldUpdateRule::WriteAsZeroes,
2194                         vec![
2195                             aml::FieldEntry::Reserved(160),
2196                             aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
2197                             aml::FieldEntry::Named(*b"MINS", 1), // Inserting
2198                             aml::FieldEntry::Named(*b"MRMV", 1), // Removing
2199                             aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
2200                         ],
2201                     ),
2202                     &aml::Field::new(
2203                         "MHPR".into(),
2204                         aml::FieldAccessType::DWord,
2205                         aml::FieldUpdateRule::Preserve,
2206                         vec![
2207                             aml::FieldEntry::Named(*b"MSEL", 32), // Selector
2208                             aml::FieldEntry::Named(*b"MOEV", 32), // Event
2209                             aml::FieldEntry::Named(*b"MOSC", 32), // OSC
2210                         ],
2211                     ),
2212                     &MemoryMethods {
2213                         slots: self.hotplug_slots.len(),
2214                     },
2215                     &MemorySlots {
2216                         slots: self.hotplug_slots.len(),
2217                     },
2218                 ],
2219             )
2220             .append_aml_bytes(bytes);
2221         } else {
2222             aml::Device::new(
2223                 "_SB_.MHPC".into(),
2224                 vec![
2225                     &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
2226                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2227                     // Empty MSCN for GED
2228                     &aml::Method::new("MSCN".into(), 0, true, vec![]),
2229                 ],
2230             )
2231             .append_aml_bytes(bytes);
2232         }
2233 
2234         #[cfg(target_arch = "x86_64")]
2235         {
2236             if let Some(sgx_epc_region) = &self.sgx_epc_region {
2237                 let min = sgx_epc_region.start().raw_value() as u64;
2238                 let max = min + sgx_epc_region.size() as u64 - 1;
2239                 // SGX EPC region
2240                 aml::Device::new(
2241                     "_SB_.EPC_".into(),
2242                     vec![
2243                         &aml::Name::new("_HID".into(), &aml::EisaName::new("INT0E0C")),
2244                         // QWORD describing the EPC region start and size
2245                         &aml::Name::new(
2246                             "_CRS".into(),
2247                             &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2248                                 aml::AddressSpaceCachable::NotCacheable,
2249                                 true,
2250                                 min,
2251                                 max,
2252                             )]),
2253                         ),
2254                         &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
2255                     ],
2256                 )
2257                 .append_aml_bytes(bytes);
2258             }
2259         }
2260     }
2261 }
2262 
2263 impl Pausable for MemoryManager {}
2264 
2265 #[derive(Clone, Serialize, Deserialize, Versionize)]
2266 pub struct MemoryManagerSnapshotData {
2267     memory_ranges: MemoryRangeTable,
2268     guest_ram_mappings: Vec<GuestRamMapping>,
2269     start_of_device_area: u64,
2270     boot_ram: u64,
2271     current_ram: u64,
2272     arch_mem_regions: Vec<ArchMemRegion>,
2273     hotplug_slots: Vec<HotPlugState>,
2274     next_memory_slot: u32,
2275     selected_slot: usize,
2276     next_hotplug_slot: usize,
2277 }
2278 
2279 impl VersionMapped for MemoryManagerSnapshotData {}
2280 
2281 impl Snapshottable for MemoryManager {
2282     fn id(&self) -> String {
2283         MEMORY_MANAGER_SNAPSHOT_ID.to_string()
2284     }
2285 
2286     fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
2287         let mut memory_manager_snapshot = Snapshot::new(MEMORY_MANAGER_SNAPSHOT_ID);
2288 
2289         let memory_ranges = self.memory_range_table(true)?;
2290 
2291         // Store locally this list of ranges as it will be used through the
2292         // Transportable::send() implementation. The point is to avoid the
2293         // duplication of code regarding the creation of the path for each
2294         // region. The 'snapshot' step creates the list of memory regions,
2295         // including information about the need to copy a memory region or
2296         // not. This saves the 'send' step having to go through the same
2297         // process, and instead it can directly proceed with storing the
2298         // memory range content for the ranges requiring it.
2299         self.snapshot_memory_ranges = memory_ranges;
2300 
2301         memory_manager_snapshot.add_data_section(SnapshotDataSection::new_from_versioned_state(
2302             MEMORY_MANAGER_SNAPSHOT_ID,
2303             &self.snapshot_data(),
2304         )?);
2305 
2306         Ok(memory_manager_snapshot)
2307     }
2308 }
2309 
2310 impl Transportable for MemoryManager {
2311     fn send(
2312         &self,
2313         _snapshot: &Snapshot,
2314         destination_url: &str,
2315     ) -> result::Result<(), MigratableError> {
2316         if self.snapshot_memory_ranges.is_empty() {
2317             return Ok(());
2318         }
2319 
2320         let mut memory_file_path = url_to_path(destination_url)?;
2321         memory_file_path.push(String::from(SNAPSHOT_FILENAME));
2322 
2323         // Create the snapshot file for the entire memory
2324         let mut memory_file = OpenOptions::new()
2325             .read(true)
2326             .write(true)
2327             .create_new(true)
2328             .open(memory_file_path)
2329             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2330 
2331         let guest_memory = self.guest_memory.memory();
2332 
2333         for range in self.snapshot_memory_ranges.regions() {
2334             let mut offset: u64 = 0;
2335             // Here we manually handle the retry in case we can't copy the
2336             // whole region at once, because the write_all_to() implementation
2337             // from vm-memory::GuestMemory does not follow the correct
2338             // behavior. For more info about this issue
2339             // see: https://github.com/rust-vmm/vm-memory/issues/174
2340             loop {
2341                 let bytes_written = guest_memory
2342                     .write_to(
2343                         GuestAddress(range.gpa + offset),
2344                         &mut memory_file,
2345                         (range.length - offset) as usize,
2346                     )
2347                     .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2348                 offset += bytes_written as u64;
2349 
2350                 if offset == range.length {
2351                     break;
2352                 }
2353             }
2354         }
2355         Ok(())
2356     }
2357 }
2358 
2359 impl Migratable for MemoryManager {
2360     // Start the dirty log in the hypervisor (kvm/mshv).
2361     // Also, reset the dirty bitmap logged by the vmm.
2362     // Just before we do a bulk copy we want to start/clear the dirty log so that
2363     // pages touched during our bulk copy are tracked.
2364     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2365         self.vm.start_dirty_log().map_err(|e| {
2366             MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
2367         })?;
2368 
2369         for r in self.guest_memory.memory().iter() {
2370             r.bitmap().reset();
2371         }
2372 
2373         Ok(())
2374     }
2375 
2376     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2377         self.vm.stop_dirty_log().map_err(|e| {
2378             MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
2379         })?;
2380 
2381         Ok(())
2382     }
2383 
2384     // Generate a table for the pages that are dirty. The dirty pages are collapsed
2385     // together in the table if they are contiguous.
2386     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2387         let mut table = MemoryRangeTable::default();
2388         for r in &self.guest_ram_mappings {
2389             let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
2390                 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
2391             })?;
2392             let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
2393             {
2394                 Some(region) => {
2395                     assert!(region.start_addr().raw_value() == r.gpa);
2396                     assert!(region.len() == r.size);
2397                     region.bitmap().get_and_reset()
2398                 }
2399                 None => {
2400                     return Err(MigratableError::MigrateSend(anyhow!(
2401                         "Error finding 'guest memory region' with address {:x}",
2402                         r.gpa
2403                     )))
2404                 }
2405             };
2406 
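                 // Merge the pages dirtied according to the hypervisor with the
                 // pages the VMM itself dirtied through its mapping of the guest
                 // memory.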
2407             let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
2408                 .iter()
2409                 .zip(vmm_dirty_bitmap.iter())
2410                 .map(|(x, y)| x | y)
2411                 .collect();
2412 
2413             let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);
2414 
2415             if sub_table.regions().is_empty() {
2416                 info!("Dirty Memory Range Table is empty");
2417             } else {
2418                 info!("Dirty Memory Range Table:");
2419                 for range in sub_table.regions() {
2420                     info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
2421                 }
2422             }
2423 
2424             table.extend(sub_table);
2425         }
2426         Ok(table)
2427     }
2428 }
2429