xref: /cloud-hypervisor/vmm/src/memory_manager.rs (revision eea9bcea38e0c5649f444c829f3a4f9c22aa486c)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0
4 //
5 #[cfg(target_arch = "x86_64")]
6 use crate::config::SgxEpcConfig;
7 use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
8 #[cfg(feature = "guest_debug")]
9 use crate::coredump::{CoredumpMemoryRegion, CoredumpMemoryRegions};
10 #[cfg(feature = "guest_debug")]
11 use crate::coredump::{DumpState, GuestDebuggableError};
12 use crate::migration::url_to_path;
13 use crate::MEMORY_MANAGER_SNAPSHOT_ID;
14 use crate::{GuestMemoryMmap, GuestRegionMmap};
15 use acpi_tables::{aml, aml::Aml};
16 use anyhow::anyhow;
17 #[cfg(target_arch = "x86_64")]
18 use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
19 use arch::{layout, RegionType};
20 #[cfg(target_arch = "x86_64")]
21 use devices::ioapic;
22 #[cfg(target_arch = "aarch64")]
23 use hypervisor::HypervisorVmError;
24 #[cfg(target_arch = "x86_64")]
25 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
26 use serde::{Deserialize, Serialize};
27 #[cfg(feature = "guest_debug")]
28 use std::collections::BTreeMap;
29 use std::collections::HashMap;
30 use std::convert::TryInto;
31 use std::ffi;
32 use std::fs::{File, OpenOptions};
33 use std::io;
34 use std::ops::Deref;
35 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
36 use std::path::PathBuf;
37 use std::result;
38 use std::sync::{Arc, Barrier, Mutex};
39 use tracer::trace_scoped;
40 use versionize::{VersionMap, Versionize, VersionizeResult};
41 use versionize_derive::Versionize;
42 use virtio_devices::BlocksState;
43 #[cfg(target_arch = "x86_64")]
44 use vm_allocator::GsiApic;
45 use vm_allocator::{AddressAllocator, SystemAllocator};
46 use vm_device::BusDevice;
47 use vm_memory::bitmap::AtomicBitmap;
48 use vm_memory::guest_memory::FileOffset;
49 use vm_memory::{
50     mmap::MmapRegionError, Address, Bytes, Error as MmapError, GuestAddress, GuestAddressSpace,
51     GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion,
52 };
53 use vm_migration::{
54     protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable,
55     Snapshot, SnapshotDataSection, Snapshottable, Transportable, VersionMapped,
56 };
57 
58 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;
59 
60 const DEFAULT_MEMORY_ZONE: &str = "mem0";
61 
62 const SNAPSHOT_FILENAME: &str = "memory-ranges";
63 
64 #[cfg(target_arch = "x86_64")]
65 const X86_64_IRQ_BASE: u32 = 5;
66 
67 #[cfg(target_arch = "x86_64")]
68 const SGX_PAGE_SIZE: u64 = 1 << 12;
69 
70 const HOTPLUG_COUNT: usize = 8;
71 
72 // Memory policy constants
73 const MPOL_BIND: u32 = 2;
74 const MPOL_MF_STRICT: u32 = 1;
75 const MPOL_MF_MOVE: u32 = 1 << 1;
76 
77 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
78 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;
79 
80 #[derive(Clone, Default, Serialize, Deserialize, Versionize)]
81 struct HotPlugState {
82     base: u64,
83     length: u64,
84     active: bool,
85     inserting: bool,
86     removing: bool,
87 }
88 
89 pub struct VirtioMemZone {
90     region: Arc<GuestRegionMmap>,
91     virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
92     hotplugged_size: u64,
93     hugepages: bool,
94     blocks_state: Arc<Mutex<BlocksState>>,
95 }
96 
97 impl VirtioMemZone {
98     pub fn region(&self) -> &Arc<GuestRegionMmap> {
99         &self.region
100     }
101     pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
102         self.virtio_device = Some(virtio_device);
103     }
104     pub fn hotplugged_size(&self) -> u64 {
105         self.hotplugged_size
106     }
107     pub fn hugepages(&self) -> bool {
108         self.hugepages
109     }
110     pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
111         &self.blocks_state
112     }
113     pub fn plugged_ranges(&self) -> MemoryRangeTable {
114         self.blocks_state
115             .lock()
116             .unwrap()
117             .memory_ranges(self.region.start_addr().raw_value(), true)
118     }
119 }
120 
121 #[derive(Default)]
122 pub struct MemoryZone {
123     regions: Vec<Arc<GuestRegionMmap>>,
124     virtio_mem_zone: Option<VirtioMemZone>,
125 }
126 
127 impl MemoryZone {
128     pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
129         &self.regions
130     }
131     pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
132         &self.virtio_mem_zone
133     }
134     pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
135         self.virtio_mem_zone.as_mut()
136     }
137 }
138 
139 pub type MemoryZones = HashMap<String, MemoryZone>;
140 
141 #[derive(Clone, Serialize, Deserialize, Versionize)]
142 struct GuestRamMapping {
143     slot: u32,
144     gpa: u64,
145     size: u64,
146     zone_id: String,
147     virtio_mem: bool,
148     file_offset: u64,
149 }
150 
151 #[derive(Clone, Serialize, Deserialize, Versionize)]
152 struct ArchMemRegion {
153     base: u64,
154     size: usize,
155     r_type: RegionType,
156 }
157 
158 pub struct MemoryManager {
159     boot_guest_memory: GuestMemoryMmap,
160     guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
161     next_memory_slot: u32,
162     start_of_device_area: GuestAddress,
163     end_of_device_area: GuestAddress,
164     end_of_ram_area: GuestAddress,
165     pub vm: Arc<dyn hypervisor::Vm>,
166     hotplug_slots: Vec<HotPlugState>,
167     selected_slot: usize,
168     mergeable: bool,
169     allocator: Arc<Mutex<SystemAllocator>>,
170     hotplug_method: HotplugMethod,
171     boot_ram: u64,
172     current_ram: u64,
173     next_hotplug_slot: usize,
174     shared: bool,
175     hugepages: bool,
176     hugepage_size: Option<u64>,
177     prefault: bool,
178     #[cfg(target_arch = "x86_64")]
179     sgx_epc_region: Option<SgxEpcRegion>,
180     user_provided_zones: bool,
181     snapshot_memory_ranges: MemoryRangeTable,
182     memory_zones: MemoryZones,
183     log_dirty: bool, // Enable dirty logging for created RAM regions
184     arch_mem_regions: Vec<ArchMemRegion>,
185     ram_allocator: AddressAllocator,
186     dynamic: bool,
187 
188     // Keep track of calls to create_userspace_mapping() for guest RAM.
189     // This is useful for getting the dirty pages, as we need to know the
190     // slots that the mappings were created in.
191     guest_ram_mappings: Vec<GuestRamMapping>,
192 
193     pub acpi_address: Option<GuestAddress>,
194     #[cfg(target_arch = "aarch64")]
195     uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
196 }
197 
198 #[derive(Debug)]
199 pub enum Error {
200     /// Failed to create shared file.
201     SharedFileCreate(io::Error),
202 
203     /// Failed to set shared file length.
204     SharedFileSetLen(io::Error),
205 
206     /// Mmap backed guest memory error
207     GuestMemory(MmapError),
208 
209     /// Failed to allocate a memory range.
210     MemoryRangeAllocation,
211 
212     /// Error from region creation
213     GuestMemoryRegion(MmapRegionError),
214 
215     /// No ACPI slot available
216     NoSlotAvailable,
217 
218     /// Not enough space in the hotplug RAM region
219     InsufficientHotplugRam,
220 
221     /// The requested hotplug memory addition is not a valid size
222     InvalidSize,
223 
224     /// Failed to create the user memory region.
225     CreateUserMemoryRegion(hypervisor::HypervisorVmError),
226 
227     /// Failed to remove the user memory region.
228     RemoveUserMemoryRegion(hypervisor::HypervisorVmError),
229 
230     /// Failed to create EventFd.
231     EventFdFail(io::Error),
232 
233     /// Eventfd write error
234     EventfdError(io::Error),
235 
236     /// Failed to resize virtio-mem
237     VirtioMemResizeFail(virtio_devices::mem::Error),
238 
239     /// Cannot restore VM
240     Restore(MigratableError),
241 
242     /// Cannot restore VM because source URL is missing
243     RestoreMissingSourceUrl,
244 
245     /// Cannot create the system allocator
246     CreateSystemAllocator,
247 
248     /// Invalid SGX EPC section size
249     #[cfg(target_arch = "x86_64")]
250     EpcSectionSizeInvalid,
251 
252     /// Failed allocating SGX EPC region
253     #[cfg(target_arch = "x86_64")]
254     SgxEpcRangeAllocation,
255 
256     /// Failed opening SGX virtual EPC device
257     #[cfg(target_arch = "x86_64")]
258     SgxVirtEpcOpen(io::Error),
259 
260     /// Failed setting the SGX virtual EPC section size
261     #[cfg(target_arch = "x86_64")]
262     SgxVirtEpcFileSetLen(io::Error),
263 
264     /// Failed opening SGX provisioning device
265     #[cfg(target_arch = "x86_64")]
266     SgxProvisionOpen(io::Error),
267 
268     /// Failed enabling SGX provisioning
269     #[cfg(target_arch = "x86_64")]
270     SgxEnableProvisioning(hypervisor::HypervisorVmError),
271 
272     /// Failed creating a new MmapRegion instance.
273     #[cfg(target_arch = "x86_64")]
274     NewMmapRegion(vm_memory::mmap::MmapRegionError),
275 
276     /// No memory zones found.
277     MissingMemoryZones,
278 
279     /// Memory configuration is not valid.
280     InvalidMemoryParameters,
281 
282     /// Forbidden operation. Impossible to resize guest memory if it is
283     /// backed by user defined memory regions.
284     InvalidResizeWithMemoryZones,
285 
286     /// It's invalid to try applying a NUMA policy to a memory zone that is
287     /// memory mapped with MAP_SHARED.
288     InvalidSharedMemoryZoneWithHostNuma,
289 
290     /// Failed applying NUMA memory policy.
291     ApplyNumaPolicy(io::Error),
292 
293     /// Memory zone identifier is not unique.
294     DuplicateZoneId,
295 
296     /// No virtio-mem resizing handler found.
297     MissingVirtioMemHandler,
298 
299     /// Unknown memory zone.
300     UnknownMemoryZone,
301 
302     /// Invalid size for resizing. It can be any value except 0.
303     InvalidHotplugSize,
304 
305     /// Invalid hotplug method associated with memory zones resizing capability.
306     InvalidHotplugMethodWithMemoryZones,
307 
308     /// Could not find specified memory zone identifier from hash map.
309     MissingZoneIdentifier,
310 
311     /// Resizing the memory zone failed.
312     ResizeZone,
313 
314     /// Guest address overflow
315     GuestAddressOverFlow,
316 
317     /// Error opening snapshot file
318     SnapshotOpen(io::Error),
319 
320     /// Error copying snapshot into region
321     SnapshotCopy(GuestMemoryError),
322 
323     /// Failed to allocate MMIO address
324     AllocateMmioAddress,
325 
326     #[cfg(target_arch = "aarch64")]
327     /// Failed to create UEFI flash
328     CreateUefiFlash(HypervisorVmError),
329 }
330 
331 const ENABLE_FLAG: usize = 0;
332 const INSERTING_FLAG: usize = 1;
333 const REMOVING_FLAG: usize = 2;
334 const EJECT_FLAG: usize = 3;
335 
336 const BASE_OFFSET_LOW: u64 = 0;
337 const BASE_OFFSET_HIGH: u64 = 0x4;
338 const LENGTH_OFFSET_LOW: u64 = 0x8;
339 const LENGTH_OFFSET_HIGH: u64 = 0xC;
340 const STATUS_OFFSET: u64 = 0x14;
341 const SELECTION_OFFSET: u64 = 0;
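// A rough sketch of the expected ACPI flow for these registers (implemented by
// the BusDevice impl below): the guest writes a slot index to SELECTION_OFFSET,
// reads STATUS_OFFSET to check the INSERTING/REMOVING flags, reads the
// base/length registers of the selected slot, and finally writes the INSERTING
// (or REMOVING) bit back to STATUS_OFFSET to acknowledge the event.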
342 
343 // The MMIO address space size is reduced by 64k. This is done for the
344 // following reasons:
345 //  - Reduce the addressable space size by at least 4k to work around a Linux
346 //    bug triggered when the VMM allocates devices at the end of the addressable space
347 //  - Windows requires the addressable space size to be 64k aligned
348 fn mmio_address_space_size(phys_bits: u8) -> u64 {
349     (1 << phys_bits) - (1 << 16)
350 }
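// For example, with phys_bits = 40 this returns (1 << 40) - (1 << 16) =
// 0xff_ffff_0000, which is 64k aligned as required above.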
351 
352 impl BusDevice for MemoryManager {
353     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
354         if self.selected_slot < self.hotplug_slots.len() {
355             let state = &self.hotplug_slots[self.selected_slot];
356             match offset {
357                 BASE_OFFSET_LOW => {
358                     data.copy_from_slice(&state.base.to_le_bytes()[..4]);
359                 }
360                 BASE_OFFSET_HIGH => {
361                     data.copy_from_slice(&state.base.to_le_bytes()[4..]);
362                 }
363                 LENGTH_OFFSET_LOW => {
364                     data.copy_from_slice(&state.length.to_le_bytes()[..4]);
365                 }
366                 LENGTH_OFFSET_HIGH => {
367                     data.copy_from_slice(&state.length.to_le_bytes()[4..]);
368                 }
369                 STATUS_OFFSET => {
370                     // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
371                     data.fill(0);
372                     if state.active {
373                         data[0] |= 1 << ENABLE_FLAG;
374                     }
375                     if state.inserting {
376                         data[0] |= 1 << INSERTING_FLAG;
377                     }
378                     if state.removing {
379                         data[0] |= 1 << REMOVING_FLAG;
380                     }
381                 }
382                 _ => {
383                     warn!(
384                         "Unexpected offset for accessing memory manager device: {:#}",
385                         offset
386                     );
387                 }
388             }
389         } else {
390             warn!("Out of range memory slot: {}", self.selected_slot);
391         }
392     }
393 
394     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
395         match offset {
396             SELECTION_OFFSET => {
397                 self.selected_slot = usize::from(data[0]);
398             }
399             STATUS_OFFSET => {
400                 if self.selected_slot < self.hotplug_slots.len() {
401                     let state = &mut self.hotplug_slots[self.selected_slot];
402                     // The ACPI code writes back a 1 to acknowledge the insertion
403                     if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
404                         state.inserting = false;
405                     }
406                     // Ditto for removal
407                     if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
408                         state.removing = false;
409                     }
410                     // Trigger removal of "DIMM"
411                     if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
412                         warn!("Ejection of memory not currently supported");
413                     }
414                 } else {
415                     warn!("Out of range memory slot: {}", self.selected_slot);
416                 }
417             }
418             _ => {
419                 warn!(
420                     "Unexpected offset for accessing memory manager device: {:#}",
421                     offset
422                 );
423             }
424         };
425         None
426     }
427 }
428 
429 impl MemoryManager {
430     /// Creates all memory regions based on the available RAM ranges defined
431     /// by `ram_regions`, and based on the description of the memory zones.
432     /// In practice, this function can perform multiple memory mappings of the
433     /// same backing file if there's a hole in the address space between two
434     /// RAM ranges.
435     /// For example, with ram_regions containing 2 ranges (0-3G and 4G-6G)
436     /// and zones containing two zones (of size 1G and 4G),
437     /// this function will create 3 resulting memory regions:
438     /// - The first one mapping the first memory zone entirely onto the 0-1G range
439     /// - The second one mapping the second memory zone partially onto the 1G-3G range
440     /// - The third one mapping the rest of the second memory zone onto the 4G-6G range
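    /// A minimal sketch of what that example yields (file offsets are relative
    /// to each zone's backing file, when one is provided):
    /// - region 1: zone 1, GPA 0G, size 1G, file offset 0
    /// - region 2: zone 2, GPA 1G, size 2G, file offset 0
    /// - region 3: zone 2, GPA 4G, size 2G, file offset 2G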
441     fn create_memory_regions_from_zones(
442         ram_regions: &[(GuestAddress, usize)],
443         zones: &[MemoryZoneConfig],
444         prefault: Option<bool>,
445     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
446         let mut zones = zones.to_owned();
447         let mut mem_regions = Vec::new();
448         let mut zone = zones.remove(0);
449         let mut zone_offset = 0;
450         let mut memory_zones = HashMap::new();
451 
452         // Add zone id to the list of memory zones.
453         memory_zones.insert(zone.id.clone(), MemoryZone::default());
454 
455         for ram_region in ram_regions.iter() {
456             let mut ram_region_offset = 0;
457             let mut exit = false;
458 
459             loop {
460                 let mut ram_region_consumed = false;
461                 let mut pull_next_zone = false;
462 
463                 let ram_region_sub_size = ram_region.1 - ram_region_offset;
464                 let zone_sub_size = zone.size as usize - zone_offset;
465 
466                 let file_offset = zone_offset as u64;
467                 let region_start = ram_region
468                     .0
469                     .checked_add(ram_region_offset as u64)
470                     .ok_or(Error::GuestAddressOverFlow)?;
471                 let region_size = if zone_sub_size <= ram_region_sub_size {
472                     if zone_sub_size == ram_region_sub_size {
473                         ram_region_consumed = true;
474                     }
475 
476                     ram_region_offset += zone_sub_size;
477                     pull_next_zone = true;
478 
479                     zone_sub_size
480                 } else {
481                     zone_offset += ram_region_sub_size;
482                     ram_region_consumed = true;
483 
484                     ram_region_sub_size
485                 };
486 
487                 let region = MemoryManager::create_ram_region(
488                     &zone.file,
489                     file_offset,
490                     region_start,
491                     region_size,
492                     match prefault {
493                         Some(pf) => pf,
494                         None => zone.prefault,
495                     },
496                     zone.shared,
497                     zone.hugepages,
498                     zone.hugepage_size,
499                     zone.host_numa_node,
500                     None,
501                 )?;
502 
503                 // Add region to the list of regions associated with the
504                 // current memory zone.
505                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
506                     memory_zone.regions.push(region.clone());
507                 }
508 
509                 mem_regions.push(region);
510 
511                 if pull_next_zone {
512                     // Get the next zone and reset the offset.
513                     zone_offset = 0;
514                     if zones.is_empty() {
515                         exit = true;
516                         break;
517                     }
518                     zone = zones.remove(0);
519 
520                     // Check if the zone id already exists. If it does, return
521                     // an error as we need unique identifiers. Otherwise, add
522                     // the new zone id to the list of memory zones.
523                     if memory_zones.contains_key(&zone.id) {
524                         error!(
525                             "Memory zone identifier '{}' found more than once. \
526                             It must be unique",
527                             zone.id,
528                         );
529                         return Err(Error::DuplicateZoneId);
530                     }
531                     memory_zones.insert(zone.id.clone(), MemoryZone::default());
532                 }
533 
534                 if ram_region_consumed {
535                     break;
536                 }
537             }
538 
539             if exit {
540                 break;
541             }
542         }
543 
544         Ok((mem_regions, memory_zones))
545     }
546 
547     // Restore both the GuestMemory regions and the MemoryZones.
548     fn restore_memory_regions_and_zones(
549         guest_ram_mappings: &[GuestRamMapping],
550         zones_config: &[MemoryZoneConfig],
551         prefault: Option<bool>,
552         mut existing_memory_files: HashMap<u32, File>,
553     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
554         let mut memory_regions = Vec::new();
555         let mut memory_zones = HashMap::new();
556 
557         for zone_config in zones_config {
558             memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
559         }
560 
561         for guest_ram_mapping in guest_ram_mappings {
562             for zone_config in zones_config {
563                 if guest_ram_mapping.zone_id == zone_config.id {
564                     let region = MemoryManager::create_ram_region(
565                         &zone_config.file,
566                         guest_ram_mapping.file_offset,
567                         GuestAddress(guest_ram_mapping.gpa),
568                         guest_ram_mapping.size as usize,
569                         match prefault {
570                             Some(pf) => pf,
571                             None => zone_config.prefault,
572                         },
573                         zone_config.shared,
574                         zone_config.hugepages,
575                         zone_config.hugepage_size,
576                         zone_config.host_numa_node,
577                         existing_memory_files.remove(&guest_ram_mapping.slot),
578                     )?;
579                     memory_regions.push(Arc::clone(&region));
580                     if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
581                         if guest_ram_mapping.virtio_mem {
582                             let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
583                             let region_size = region.len();
584                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
585                                 region,
586                                 virtio_device: None,
587                                 hotplugged_size,
588                                 hugepages: zone_config.hugepages,
589                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
590                             });
591                         } else {
592                             memory_zone.regions.push(region);
593                         }
594                     }
595                 }
596             }
597         }
598 
599         memory_regions.sort_by_key(|x| x.start_addr());
600 
601         Ok((memory_regions, memory_zones))
602     }
603 
604     fn fill_saved_regions(
605         &mut self,
606         file_path: PathBuf,
607         saved_regions: MemoryRangeTable,
608     ) -> Result<(), Error> {
609         if saved_regions.is_empty() {
610             return Ok(());
611         }
612 
613         // Open (read only) the snapshot file.
614         let mut memory_file = OpenOptions::new()
615             .read(true)
616             .open(file_path)
617             .map_err(Error::SnapshotOpen)?;
618 
619         let guest_memory = self.guest_memory.memory();
620         for range in saved_regions.regions() {
621             let mut offset: u64 = 0;
622             // Here we manually handle the retry in case we can't write the
623             // whole region at once, because we can't use GuestMemory's
624             // read_exact_from() implementation from vm-memory as it does not
625             // follow the correct behavior. For more info about this issue
626             // see: https://github.com/rust-vmm/vm-memory/issues/174
627             loop {
628                 let bytes_read = guest_memory
629                     .read_from(
630                         GuestAddress(range.gpa + offset),
631                         &mut memory_file,
632                         (range.length - offset) as usize,
633                     )
634                     .map_err(Error::SnapshotCopy)?;
635                 offset += bytes_read as u64;
636 
637                 if offset == range.length {
638                     break;
639                 }
640             }
641         }
642 
643         Ok(())
644     }
645 
646     fn validate_memory_config(
647         config: &MemoryConfig,
648         user_provided_zones: bool,
649     ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
650         let mut allow_mem_hotplug = false;
651 
652         if !user_provided_zones {
653             if config.zones.is_some() {
654                 error!(
655                     "User defined memory regions can't be provided if the \
656                     memory size is not 0"
657                 );
658                 return Err(Error::InvalidMemoryParameters);
659             }
660 
661             if config.hotplug_size.is_some() {
662                 allow_mem_hotplug = true;
663             }
664 
665             if let Some(hotplugged_size) = config.hotplugged_size {
666                 if let Some(hotplug_size) = config.hotplug_size {
667                     if hotplugged_size > hotplug_size {
668                         error!(
669                             "'hotplugged_size' {} can't be bigger than \
670                             'hotplug_size' {}",
671                             hotplugged_size, hotplug_size,
672                         );
673                         return Err(Error::InvalidMemoryParameters);
674                     }
675                 } else {
676                     error!(
677                         "Invalid to define 'hotplugged_size' when there is \
678                         no 'hotplug_size'"
679                     );
680                     return Err(Error::InvalidMemoryParameters);
681                 }
682                 if config.hotplug_method == HotplugMethod::Acpi {
683                     error!(
684                         "Invalid to define 'hotplugged_size' with hotplug \
685                         method 'acpi'"
686                     );
687                     return Err(Error::InvalidMemoryParameters);
688                 }
689             }
690 
691             // Create a single zone from the global memory config. This lets
692             // us reuse the codepath for user defined memory zones.
693             let zones = vec![MemoryZoneConfig {
694                 id: String::from(DEFAULT_MEMORY_ZONE),
695                 size: config.size,
696                 file: None,
697                 shared: config.shared,
698                 hugepages: config.hugepages,
699                 hugepage_size: config.hugepage_size,
700                 host_numa_node: None,
701                 hotplug_size: config.hotplug_size,
702                 hotplugged_size: config.hotplugged_size,
703                 prefault: config.prefault,
704             }];
705 
706             Ok((config.size, zones, allow_mem_hotplug))
707         } else {
708             if config.zones.is_none() {
709                 error!(
710                     "User defined memory regions must be provided if the \
711                     memory size is 0"
712                 );
713                 return Err(Error::MissingMemoryZones);
714             }
715 
716             // Safe to unwrap as we checked right above that some memory
717             // zones were provided.
718             let zones = config.zones.clone().unwrap();
719             if zones.is_empty() {
720                 return Err(Error::MissingMemoryZones);
721             }
722 
723             let mut total_ram_size: u64 = 0;
724             for zone in zones.iter() {
725                 total_ram_size += zone.size;
726 
727                 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
728                     error!(
729                         "Invalid to set host NUMA policy for a memory zone \
730                         backed by a regular file and mapped as 'shared'"
731                     );
732                     return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
733                 }
734 
735                 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
736                     error!("Invalid to set ACPI hotplug method for memory zones");
737                     return Err(Error::InvalidHotplugMethodWithMemoryZones);
738                 }
739 
740                 if let Some(hotplugged_size) = zone.hotplugged_size {
741                     if let Some(hotplug_size) = zone.hotplug_size {
742                         if hotplugged_size > hotplug_size {
743                             error!(
744                                 "'hotplugged_size' {} can't be bigger than \
745                                 'hotplug_size' {}",
746                                 hotplugged_size, hotplug_size,
747                             );
748                             return Err(Error::InvalidMemoryParameters);
749                         }
750                     } else {
751                         error!(
752                             "Invalid to define 'hotplugged_size' when there is \
753                             no 'hotplug_size' for a memory zone"
754                         );
755                         return Err(Error::InvalidMemoryParameters);
756                     }
757                     if config.hotplug_method == HotplugMethod::Acpi {
758                         error!(
759                             "Invalid to define 'hotplugged_size' with hotplug \
760                             method 'acpi'"
761                         );
762                         return Err(Error::InvalidMemoryParameters);
763                     }
764                 }
765             }
766 
767             Ok((total_ram_size, zones, allow_mem_hotplug))
768         }
769     }
770 
771     fn allocate_address_space(&mut self) -> Result<(), Error> {
772         let mut list = Vec::new();
773 
774         for (zone_id, memory_zone) in self.memory_zones.iter() {
775             let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
776                 memory_zone
777                     .regions()
778                     .iter()
779                     .map(|r| (r.clone(), false))
780                     .collect();
781 
782             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
783                 regions.push((virtio_mem_zone.region().clone(), true));
784             }
785 
786             list.push((zone_id.clone(), regions));
787         }
788 
789         for (zone_id, regions) in list {
790             for (region, virtio_mem) in regions {
791                 let slot = self.create_userspace_mapping(
792                     region.start_addr().raw_value(),
793                     region.len() as u64,
794                     region.as_ptr() as u64,
795                     self.mergeable,
796                     false,
797                     self.log_dirty,
798                 )?;
799 
800                 let file_offset = if let Some(file_offset) = region.file_offset() {
801                     file_offset.start()
802                 } else {
803                     0
804                 };
805 
806                 self.guest_ram_mappings.push(GuestRamMapping {
807                     gpa: region.start_addr().raw_value(),
808                     size: region.len(),
809                     slot,
810                     zone_id: zone_id.clone(),
811                     virtio_mem,
812                     file_offset,
813                 });
814                 self.ram_allocator
815                     .allocate(Some(region.start_addr()), region.len(), None)
816                     .ok_or(Error::MemoryRangeAllocation)?;
817             }
818         }
819 
820         // Allocate SubRegion and Reserved address ranges.
821         for region in self.arch_mem_regions.iter() {
822             if region.r_type == RegionType::Ram {
823                 // Ignore the RAM type since ranges have already been allocated
824                 // based on the GuestMemory regions.
825                 continue;
826             }
827             self.ram_allocator
828                 .allocate(
829                     Some(GuestAddress(region.base)),
830                     region.size as GuestUsize,
831                     None,
832                 )
833                 .ok_or(Error::MemoryRangeAllocation)?;
834         }
835 
836         Ok(())
837     }
838 
839     #[cfg(target_arch = "aarch64")]
840     fn add_uefi_flash(&mut self) -> Result<(), Error> {
841         // On AArch64, the UEFI binary requires a flash device at address 0.
842         // 4 MiB of memory is mapped to simulate the flash.
843         let uefi_mem_slot = self.allocate_memory_slot();
844         let uefi_region = GuestRegionMmap::new(
845             MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(),
846             arch::layout::UEFI_START,
847         )
848         .unwrap();
849         let uefi_mem_region = self.vm.make_user_memory_region(
850             uefi_mem_slot,
851             uefi_region.start_addr().raw_value(),
852             uefi_region.len() as u64,
853             uefi_region.as_ptr() as u64,
854             false,
855             false,
856         );
857         self.vm
858             .create_user_memory_region(uefi_mem_region)
859             .map_err(Error::CreateUefiFlash)?;
860 
861         let uefi_flash =
862             GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap());
863 
864         self.uefi_flash = Some(uefi_flash);
865 
866         Ok(())
867     }
868 
869     #[allow(clippy::too_many_arguments)]
870     pub fn new(
871         vm: Arc<dyn hypervisor::Vm>,
872         config: &MemoryConfig,
873         prefault: Option<bool>,
874         phys_bits: u8,
875         #[cfg(feature = "tdx")] tdx_enabled: bool,
876         restore_data: Option<&MemoryManagerSnapshotData>,
877         existing_memory_files: Option<HashMap<u32, File>>,
878         #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
879     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
880         trace_scoped!("MemoryManager::new");
881 
882         let user_provided_zones = config.size == 0;
883 
884         let mmio_address_space_size = mmio_address_space_size(phys_bits);
885         debug_assert_eq!(
886             (((mmio_address_space_size) >> 16) << 16),
887             mmio_address_space_size
888         );
889         let start_of_platform_device_area =
890             GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
891         let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);
892 
893         let (ram_size, zones, allow_mem_hotplug) =
894             Self::validate_memory_config(config, user_provided_zones)?;
895 
896         let (
897             start_of_device_area,
898             boot_ram,
899             current_ram,
900             arch_mem_regions,
901             memory_zones,
902             guest_memory,
903             boot_guest_memory,
904             hotplug_slots,
905             next_memory_slot,
906             selected_slot,
907             next_hotplug_slot,
908         ) = if let Some(data) = restore_data {
909             let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
910                 &data.guest_ram_mappings,
911                 &zones,
912                 prefault,
913                 existing_memory_files.unwrap_or_default(),
914             )?;
915             let guest_memory =
916                 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
917             let boot_guest_memory = guest_memory.clone();
918             (
919                 GuestAddress(data.start_of_device_area),
920                 data.boot_ram,
921                 data.current_ram,
922                 data.arch_mem_regions.clone(),
923                 memory_zones,
924                 guest_memory,
925                 boot_guest_memory,
926                 data.hotplug_slots.clone(),
927                 data.next_memory_slot,
928                 data.selected_slot,
929                 data.next_hotplug_slot,
930             )
931         } else {
932             // Init guest memory
933             let arch_mem_regions = arch::arch_memory_regions(ram_size);
934 
935             let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
936                 .iter()
937                 .filter(|r| r.2 == RegionType::Ram)
938                 .map(|r| (r.0, r.1))
939                 .collect();
940 
941             let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
942                 .iter()
943                 .map(|(a, b, c)| ArchMemRegion {
944                     base: a.0,
945                     size: *b,
946                     r_type: *c,
947                 })
948                 .collect();
949 
950             let (mem_regions, mut memory_zones) =
951                 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault)?;
952 
953             let mut guest_memory =
954                 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;
955 
956             let boot_guest_memory = guest_memory.clone();
957 
958             let mut start_of_device_area =
959                 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;
960 
961             // Update list of memory zones for resize.
962             for zone in zones.iter() {
963                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
964                     if let Some(hotplug_size) = zone.hotplug_size {
965                         if hotplug_size == 0 {
966                             error!("'hotplug_size' can't be 0");
967                             return Err(Error::InvalidHotplugSize);
968                         }
969 
970                         if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
971                             start_of_device_area = start_of_device_area
972                                 .checked_add(hotplug_size)
973                                 .ok_or(Error::GuestAddressOverFlow)?;
974                         } else {
975                             // Alignment must be "natural" i.e. same as size of block
976                             let start_addr = GuestAddress(
977                                 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE
978                                     - 1)
979                                     / virtio_devices::VIRTIO_MEM_ALIGN_SIZE
980                                     * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
981                             );
982 
983                             // When `prefault` is set by vm_restore, the memory manager
984                             // will create the RAM region with the `prefault` option from
985                             // the restore config rather than the one from the zone.
986                             let region = MemoryManager::create_ram_region(
987                                 &None,
988                                 0,
989                                 start_addr,
990                                 hotplug_size as usize,
991                                 match prefault {
992                                     Some(pf) => pf,
993                                     None => zone.prefault,
994                                 },
995                                 zone.shared,
996                                 zone.hugepages,
997                                 zone.hugepage_size,
998                                 zone.host_numa_node,
999                                 None,
1000                             )?;
1001 
1002                             guest_memory = guest_memory
1003                                 .insert_region(Arc::clone(&region))
1004                                 .map_err(Error::GuestMemory)?;
1005 
1006                             let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
1007                             let region_size = region.len();
1008                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
1009                                 region,
1010                                 virtio_device: None,
1011                                 hotplugged_size,
1012                                 hugepages: zone.hugepages,
1013                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
1014                             });
1015 
1016                             start_of_device_area = start_addr
1017                                 .checked_add(hotplug_size)
1018                                 .ok_or(Error::GuestAddressOverFlow)?;
1019                         }
1020                     }
1021                 } else {
1022                     return Err(Error::MissingZoneIdentifier);
1023                 }
1024             }
1025 
1026             let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
1027             hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);
1028 
1029             (
1030                 start_of_device_area,
1031                 ram_size,
1032                 ram_size,
1033                 arch_mem_regions,
1034                 memory_zones,
1035                 guest_memory,
1036                 boot_guest_memory,
1037                 hotplug_slots,
1038                 0,
1039                 0,
1040                 0,
1041             )
1042         };
1043 
1044         let guest_memory = GuestMemoryAtomic::new(guest_memory);
1045 
1046         // Both MMIO and PIO address spaces start at address 0.
1047         let allocator = Arc::new(Mutex::new(
1048             SystemAllocator::new(
1049                 #[cfg(target_arch = "x86_64")]
1050                 {
1051                     GuestAddress(0)
1052                 },
1053                 #[cfg(target_arch = "x86_64")]
1054                 {
1055                     1 << 16
1056                 },
1057                 start_of_platform_device_area,
1058                 PLATFORM_DEVICE_AREA_SIZE,
1059                 layout::MEM_32BIT_DEVICES_START,
1060                 layout::MEM_32BIT_DEVICES_SIZE,
1061                 #[cfg(target_arch = "x86_64")]
1062                 vec![GsiApic::new(
1063                     X86_64_IRQ_BASE,
1064                     ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
1065                 )],
1066             )
1067             .ok_or(Error::CreateSystemAllocator)?,
1068         ));
1069 
1070         #[cfg(not(feature = "tdx"))]
1071         let dynamic = true;
1072         #[cfg(feature = "tdx")]
1073         let dynamic = !tdx_enabled;
1074 
1075         let acpi_address = if dynamic
1076             && config.hotplug_method == HotplugMethod::Acpi
1077             && (config.hotplug_size.unwrap_or_default() > 0)
1078         {
1079             Some(
1080                 allocator
1081                     .lock()
1082                     .unwrap()
1083                     .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
1084                     .ok_or(Error::AllocateMmioAddress)?,
1085             )
1086         } else {
1087             None
1088         };
1089 
1090         // When running with SGX, the start of the device area and the end of the
1091         // RAM area may diverge, but at this point they are next to each other.
1092         let end_of_ram_area = start_of_device_area.unchecked_sub(1);
1093         let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();
1094 
1095         let mut memory_manager = MemoryManager {
1096             boot_guest_memory,
1097             guest_memory,
1098             next_memory_slot,
1099             start_of_device_area,
1100             end_of_device_area,
1101             end_of_ram_area,
1102             vm,
1103             hotplug_slots,
1104             selected_slot,
1105             mergeable: config.mergeable,
1106             allocator,
1107             hotplug_method: config.hotplug_method,
1108             boot_ram,
1109             current_ram,
1110             next_hotplug_slot,
1111             shared: config.shared,
1112             hugepages: config.hugepages,
1113             hugepage_size: config.hugepage_size,
1114             prefault: config.prefault,
1115             #[cfg(target_arch = "x86_64")]
1116             sgx_epc_region: None,
1117             user_provided_zones,
1118             snapshot_memory_ranges: MemoryRangeTable::default(),
1119             memory_zones,
1120             guest_ram_mappings: Vec::new(),
1121             acpi_address,
1122             log_dirty: dynamic, // Cannot log dirty pages on a TD
1123             arch_mem_regions,
1124             ram_allocator,
1125             dynamic,
1126             #[cfg(target_arch = "aarch64")]
1127             uefi_flash: None,
1128         };
1129 
1130         memory_manager.allocate_address_space()?;
1131 
1132         #[cfg(target_arch = "aarch64")]
1133         memory_manager.add_uefi_flash()?;
1134 
1135         #[cfg(target_arch = "x86_64")]
1136         if let Some(sgx_epc_config) = sgx_epc_config {
1137             memory_manager.setup_sgx(sgx_epc_config)?;
1138         }
1139 
1140         Ok(Arc::new(Mutex::new(memory_manager)))
1141     }
1142 
1143     pub fn new_from_snapshot(
1144         snapshot: &Snapshot,
1145         vm: Arc<dyn hypervisor::Vm>,
1146         config: &MemoryConfig,
1147         source_url: Option<&str>,
1148         prefault: bool,
1149         phys_bits: u8,
1150     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
1151         if let Some(source_url) = source_url {
1152             let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
1153             memory_file_path.push(String::from(SNAPSHOT_FILENAME));
1154 
1155             let mem_snapshot: MemoryManagerSnapshotData = snapshot
1156                 .to_versioned_state(MEMORY_MANAGER_SNAPSHOT_ID)
1157                 .map_err(Error::Restore)?;
1158 
1159             let mm = MemoryManager::new(
1160                 vm,
1161                 config,
1162                 Some(prefault),
1163                 phys_bits,
1164                 #[cfg(feature = "tdx")]
1165                 false,
1166                 Some(&mem_snapshot),
1167                 None,
1168                 #[cfg(target_arch = "x86_64")]
1169                 None,
1170             )?;
1171 
1172             mm.lock()
1173                 .unwrap()
1174                 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;
1175 
1176             Ok(mm)
1177         } else {
1178             Err(Error::RestoreMissingSourceUrl)
1179         }
1180     }
1181 
1182     fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
1183         let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };
1184 
1185         if res < 0 {
1186             Err(io::Error::last_os_error())
1187         } else {
1188             Ok(res as RawFd)
1189         }
1190     }
1191 
1192     fn mbind(
1193         addr: *mut u8,
1194         len: u64,
1195         mode: u32,
1196         nodemask: Vec<u64>,
1197         maxnode: u64,
1198         flags: u32,
1199     ) -> Result<(), io::Error> {
1200         let res = unsafe {
1201             libc::syscall(
1202                 libc::SYS_mbind,
1203                 addr as *mut libc::c_void,
1204                 len,
1205                 mode,
1206                 nodemask.as_ptr(),
1207                 maxnode,
1208                 flags,
1209             )
1210         };
1211 
1212         if res < 0 {
1213             Err(io::Error::last_os_error())
1214         } else {
1215             Ok(())
1216         }
1217     }
1218 
1219     fn open_memory_file(
1220         backing_file: &Option<PathBuf>,
1221         file_offset: u64,
1222         size: usize,
1223         hugepages: bool,
1224         hugepage_size: Option<u64>,
1225     ) -> Result<(File, u64), Error> {
1226         let (f, f_off) = match backing_file {
1227             Some(ref file) => {
1228                 if file.is_dir() {
1229                     // Override file offset as it does not apply in this case.
1230                     info!(
1231                         "Ignoring file offset since the backing file is a \
1232                         temporary file created from the specified directory."
1233                     );
1234                     let fs_str = format!("{}{}", file.display(), "/tmpfile_XXXXXX");
1235                     let fs = ffi::CString::new(fs_str).unwrap();
1236                     let mut path = fs.as_bytes_with_nul().to_owned();
1237                     let path_ptr = path.as_mut_ptr() as *mut _;
1238                     let fd = unsafe { libc::mkstemp(path_ptr) };
1239                     unsafe { libc::unlink(path_ptr) };
1240                     let f = unsafe { File::from_raw_fd(fd) };
1241                     f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1242 
1243                     (f, 0)
1244                 } else {
1245                     let f = OpenOptions::new()
1246                         .read(true)
1247                         .write(true)
1248                         .open(file)
1249                         .map_err(Error::SharedFileCreate)?;
1250 
1251                     (f, file_offset)
1252                 }
1253             }
1254             None => {
1255                 let fd = Self::memfd_create(
1256                     &ffi::CString::new("ch_ram").unwrap(),
1257                     if hugepages {
1258                         libc::MFD_HUGETLB
1259                             | if let Some(hugepage_size) = hugepage_size {
1260                                 /*
1261                                  * From the Linux kernel:
1262                                  * Several system calls take a flag to request "hugetlb" huge pages.
1263                                  * Without further specification, these system calls will use the
1264                                  * system's default huge page size.  If a system supports multiple
1265                                  * huge page sizes, the desired huge page size can be specified in
1266                                  * bits [26:31] of the flag arguments.  The value in these 6 bits
1267                                  * will encode the log2 of the huge page size.
1268                                  */
1269 
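                                // For example (illustrative): a 2 MiB huge page
                                // gives trailing_zeros() == 21, so the flag is
                                // 21 << 26, matching the kernel's MFD_HUGE_2MB
                                // encoding.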
1270                                 hugepage_size.trailing_zeros() << 26
1271                             } else {
1272                                 // Use the system default huge page size
1273                                 0
1274                             }
1275                     } else {
1276                         0
1277                     },
1278                 )
1279                 .map_err(Error::SharedFileCreate)?;
1280 
1281                 let f = unsafe { File::from_raw_fd(fd) };
1282                 f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1283 
1284                 (f, 0)
1285             }
1286         };
1287 
1288         Ok((f, f_off))
1289     }
1290 
1291     #[allow(clippy::too_many_arguments)]
1292     fn create_ram_region(
1293         backing_file: &Option<PathBuf>,
1294         file_offset: u64,
1295         start_addr: GuestAddress,
1296         size: usize,
1297         prefault: bool,
1298         shared: bool,
1299         hugepages: bool,
1300         hugepage_size: Option<u64>,
1301         host_numa_node: Option<u32>,
1302         existing_memory_file: Option<File>,
1303     ) -> Result<Arc<GuestRegionMmap>, Error> {
1304         let (f, f_off) = if let Some(f) = existing_memory_file {
1305             (f, file_offset)
1306         } else {
1307             Self::open_memory_file(backing_file, file_offset, size, hugepages, hugepage_size)?
1308         };
1309 
1310         let mut mmap_flags = libc::MAP_NORESERVE
1311             | if shared {
1312                 libc::MAP_SHARED
1313             } else {
1314                 libc::MAP_PRIVATE
1315             };
1316         if prefault {
1317             mmap_flags |= libc::MAP_POPULATE;
1318         }
1319 
1320         let region = GuestRegionMmap::new(
1321             MmapRegion::build(
1322                 Some(FileOffset::new(f, f_off)),
1323                 size,
1324                 libc::PROT_READ | libc::PROT_WRITE,
1325                 mmap_flags,
1326             )
1327             .map_err(Error::GuestMemoryRegion)?,
1328             start_addr,
1329         )
1330         .map_err(Error::GuestMemory)?;
1331 
1332         // Apply NUMA policy if needed.
1333         if let Some(node) = host_numa_node {
1334             let addr = region.deref().as_ptr();
1335             let len = region.deref().size() as u64;
1336             let mode = MPOL_BIND;
1337             let mut nodemask: Vec<u64> = Vec::new();
1338             let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
1339 
1340             // Linux is kind of buggy in the way it interprets maxnode as it
1341             // will cut off the last node. That's why we have to add 1 to what
1342             // we would consider as the proper maxnode value.
1343             let maxnode = node as u64 + 1 + 1;
1344 
1345             // Allocate the right size for the vector.
1346             nodemask.resize((node as usize / 64) + 1, 0);
1347 
1348             // Fill the global bitmask through the nodemask vector.
1349             let idx = (node / 64) as usize;
1350             let shift = node % 64;
1351             nodemask[idx] |= 1u64 << shift;
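            // For example (illustrative): node = 65 gives idx = 1, shift = 1,
            // nodemask = [0, 0b10] and maxnode = 67.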
1352 
1353             // Policies are enforced by using MPOL_MF_MOVE flag as it will
1354             // force the kernel to move all pages that might have been already
1355             // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
1356             // used to throw an error if MPOL_MF_MOVE didn't succeed.
1357             // MPOL_BIND is the selected mode as it specifies a strict policy
1358             // that restricts memory allocation to the nodes specified in the
1359             // nodemask.
1360             Self::mbind(addr, len, mode, nodemask, maxnode, flags)
1361                 .map_err(Error::ApplyNumaPolicy)?;
1362         }
1363 
1364         Ok(Arc::new(region))
1365     }
1366 
1367     // Update the GuestMemoryMmap with the new range
1368     fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
1369         let guest_memory = self
1370             .guest_memory
1371             .memory()
1372             .insert_region(region)
1373             .map_err(Error::GuestMemory)?;
1374         self.guest_memory.lock().unwrap().replace(guest_memory);
1375 
1376         Ok(())
1377     }
1378 
1379     //
1380     // Calculate the start address of an area next to RAM.
1381     //
1382     // If memory hotplug is allowed, the start address needs to be aligned
1383     // (rounded up) to a 128MiB boundary; otherwise, no alignment is required.
1384     // In either case, if RAM ends below the 32-bit reserved area, the area
1385     // must start at the 64-bit RAM start instead.
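    // For example, with hotplug allowed and mem_end = 0x1_2345_6789, OR-ing with
    // (128MiB - 1) and adding 1 rounds the result up to 0x1_2800_0000, the next
    // 128MiB boundary.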
1386     fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
1387         let mut start_addr = if allow_mem_hotplug {
1388             GuestAddress(mem_end.0 | ((128 << 20) - 1))
1389         } else {
1390             mem_end
1391         };
1392 
1393         start_addr = start_addr
1394             .checked_add(1)
1395             .ok_or(Error::GuestAddressOverFlow)?;
1396 
1397         if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
1398             return Ok(arch::layout::RAM_64BIT_START);
1399         }
1400 
1401         Ok(start_addr)
1402     }
1403 
1404     pub fn add_ram_region(
1405         &mut self,
1406         start_addr: GuestAddress,
1407         size: usize,
1408     ) -> Result<Arc<GuestRegionMmap>, Error> {
1409         // Allocate memory for the region
1410         let region = MemoryManager::create_ram_region(
1411             &None,
1412             0,
1413             start_addr,
1414             size,
1415             self.prefault,
1416             self.shared,
1417             self.hugepages,
1418             self.hugepage_size,
1419             None,
1420             None,
1421         )?;
1422 
1423         // Map it into the guest
1424         let slot = self.create_userspace_mapping(
1425             region.start_addr().0,
1426             region.len() as u64,
1427             region.as_ptr() as u64,
1428             self.mergeable,
1429             false,
1430             self.log_dirty,
1431         )?;
1432         self.guest_ram_mappings.push(GuestRamMapping {
1433             gpa: region.start_addr().raw_value(),
1434             size: region.len(),
1435             slot,
1436             zone_id: DEFAULT_MEMORY_ZONE.to_string(),
1437             virtio_mem: false,
1438             file_offset: 0,
1439         });
1440 
1441         self.add_region(Arc::clone(&region))?;
1442 
1443         Ok(region)
1444     }
1445 
1446     fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
1447         info!("Hotplugging new RAM: {}", size);
1448 
1449         // Check that there is a free slot
1450         if self.next_hotplug_slot >= HOTPLUG_COUNT {
1451             return Err(Error::NoSlotAvailable);
1452         }
1453 
1454         // "Inserted" DIMM must have a size that is a multiple of 128MiB
1455         if size % (128 << 20) != 0 {
1456             return Err(Error::InvalidSize);
1457         }
1458 
1459         let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;
1460 
1461         if start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area {
1462             return Err(Error::InsufficientHotplugRam);
1463         }
1464 
1465         let region = self.add_ram_region(start_addr, size)?;
1466 
1467         // Add region to the list of regions associated with the default
1468         // memory zone.
1469         if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
1470             memory_zone.regions.push(Arc::clone(&region));
1471         }
1472 
1473         // Tell the allocator
1474         self.ram_allocator
1475             .allocate(Some(start_addr), size as GuestUsize, None)
1476             .ok_or(Error::MemoryRangeAllocation)?;
1477 
1478         // Update the slot so that it can be queried via the I/O port
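        // The "inserting" flag surfaces as the MINS bit of the hotplug
        // controller's MMIO region, which the MSCN AML method below checks in
        // order to notify the guest about the new DIMM.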
1479         let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
1480         slot.active = true;
1481         slot.inserting = true;
1482         slot.base = region.start_addr().0;
1483         slot.length = region.len() as u64;
1484 
1485         self.next_hotplug_slot += 1;
1486 
1487         Ok(region)
1488     }
1489 
1490     pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1491         self.guest_memory.clone()
1492     }
1493 
1494     pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
1495         self.boot_guest_memory.clone()
1496     }
1497 
1498     pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
1499         self.allocator.clone()
1500     }
1501 
1502     pub fn start_of_device_area(&self) -> GuestAddress {
1503         self.start_of_device_area
1504     }
1505 
1506     pub fn end_of_device_area(&self) -> GuestAddress {
1507         self.end_of_device_area
1508     }
1509 
1510     pub fn allocate_memory_slot(&mut self) -> u32 {
1511         let slot_id = self.next_memory_slot;
1512         self.next_memory_slot += 1;
1513         slot_id
1514     }
1515 
1516     pub fn create_userspace_mapping(
1517         &mut self,
1518         guest_phys_addr: u64,
1519         memory_size: u64,
1520         userspace_addr: u64,
1521         mergeable: bool,
1522         readonly: bool,
1523         log_dirty: bool,
1524     ) -> Result<u32, Error> {
1525         let slot = self.allocate_memory_slot();
1526         let mem_region = self.vm.make_user_memory_region(
1527             slot,
1528             guest_phys_addr,
1529             memory_size,
1530             userspace_addr,
1531             readonly,
1532             log_dirty,
1533         );
1534 
1535         info!(
1536             "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
1537             guest_phys_addr, userspace_addr, memory_size, slot
1538         );
1539 
1540         self.vm
1541             .create_user_memory_region(mem_region)
1542             .map_err(Error::CreateUserMemoryRegion)?;
1543 
1544         // Mark the pages as mergeable if explicitly asked for.
1545         if mergeable {
1546             // Safe because the address and size are valid since the
1547             // mmap succeeded.
1548             let ret = unsafe {
1549                 libc::madvise(
1550                     userspace_addr as *mut libc::c_void,
1551                     memory_size as libc::size_t,
1552                     libc::MADV_MERGEABLE,
1553                 )
1554             };
1555             if ret != 0 {
1556                 let err = io::Error::last_os_error();
1557                 // Safe to unwrap because the error is constructed with
1558                 // last_os_error(), which ensures the output will be Some().
1559                 let errno = err.raw_os_error().unwrap();
1560                 if errno == libc::EINVAL {
1561                     warn!("kernel not configured with CONFIG_KSM");
1562                 } else {
1563                     warn!("madvise error: {}", err);
1564                 }
1565                 warn!("failed to mark pages as mergeable");
1566             }
1567         }
1568 
1569         info!(
1570             "Created userspace mapping: {:x} -> {:x} {:x}",
1571             guest_phys_addr, userspace_addr, memory_size
1572         );
1573 
1574         Ok(slot)
1575     }
1576 
1577     pub fn remove_userspace_mapping(
1578         &mut self,
1579         guest_phys_addr: u64,
1580         memory_size: u64,
1581         userspace_addr: u64,
1582         mergeable: bool,
1583         slot: u32,
1584     ) -> Result<(), Error> {
1585         let mem_region = self.vm.make_user_memory_region(
1586             slot,
1587             guest_phys_addr,
1588             memory_size,
1589             userspace_addr,
1590             false, /* readonly -- don't care */
1591             false, /* log dirty */
1592         );
1593 
1594         self.vm
1595             .remove_user_memory_region(mem_region)
1596             .map_err(Error::RemoveUserMemoryRegion)?;
1597 
1598         // Mark the pages as unmergeable if they were previously marked as
1599         // mergeable.
1600         if mergeable {
1601             // Safe because the address and size are valid as the region was
1602             // previously advised.
1603             let ret = unsafe {
1604                 libc::madvise(
1605                     userspace_addr as *mut libc::c_void,
1606                     memory_size as libc::size_t,
1607                     libc::MADV_UNMERGEABLE,
1608                 )
1609             };
1610             if ret != 0 {
1611                 let err = io::Error::last_os_error();
1612                 // Safe to unwrap because the error is constructed with
1613                 // last_os_error(), which ensures the output will be Some().
1614                 let errno = err.raw_os_error().unwrap();
1615                 if errno == libc::EINVAL {
1616                     warn!("kernel not configured with CONFIG_KSM");
1617                 } else {
1618                     warn!("madvise error: {}", err);
1619                 }
1620                 warn!("failed to mark pages as unmergeable");
1621             }
1622         }
1623 
1624         info!(
1625             "Removed userspace mapping: {:x} -> {:x} {:x}",
1626             guest_phys_addr, userspace_addr, memory_size
1627         );
1628 
1629         Ok(())
1630     }
1631 
1632     pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
1633         if let Some(memory_zone) = self.memory_zones.get_mut(id) {
1634             if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
1635                 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
1636                     virtio_mem_device
1637                         .lock()
1638                         .unwrap()
1639                         .resize(size)
1640                         .map_err(Error::VirtioMemResizeFail)?;
1641                 }
1642 
1643                 // Keep the hotplugged_size up to date.
1644                 virtio_mem_zone.hotplugged_size = size;
1645             } else {
1646                 error!("Failed resizing virtio-mem region: No virtio-mem handler");
1647                 return Err(Error::MissingVirtioMemHandler);
1648             }
1649 
1650             return Ok(());
1651         }
1652 
1653         error!("Failed resizing virtio-mem region: Unknown memory zone");
1654         Err(Error::UnknownMemoryZone)
1655     }
1656 
1657     /// If this function results in a new memory region being added to the
1658     /// guest memory, the new region is returned to the caller. The virtio-mem
1659     /// use case never adds a new region as the whole hotpluggable memory has
1660     /// already been allocated at boot time.
1661     pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
1662         if self.user_provided_zones {
1663             error!(
1664                 "Not allowed to resize guest memory when backed with user \
1665                 defined memory zones."
1666             );
1667             return Err(Error::InvalidResizeWithMemoryZones);
1668         }
1669 
1670         let mut region: Option<Arc<GuestRegionMmap>> = None;
1671         match self.hotplug_method {
1672             HotplugMethod::VirtioMem => {
1673                 if desired_ram >= self.boot_ram {
1674                     if !self.dynamic {
1675                         return Ok(region);
1676                     }
1677 
1678                     self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
1679                     self.current_ram = desired_ram;
1680                 }
1681             }
1682             HotplugMethod::Acpi => {
1683                 if desired_ram > self.current_ram {
1684                     if !self.dynamic {
1685                         return Ok(region);
1686                     }
1687 
1688                     region =
1689                         Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
1690                     self.current_ram = desired_ram;
1691                 }
1692             }
1693         }
1694         Ok(region)
1695     }
1696 
1697     pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
1698         if !self.user_provided_zones {
1699             error!(
1700                 "Not allowed to resize guest memory zone when no zone is \
1701                 defined."
1702             );
1703             return Err(Error::ResizeZone);
1704         }
1705 
1706         self.virtio_mem_resize(id, virtio_mem_size)
1707     }
1708 
1709     #[cfg(target_arch = "x86_64")]
1710     pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
1711         let file = OpenOptions::new()
1712             .read(true)
1713             .open("/dev/sgx_provision")
1714             .map_err(Error::SgxProvisionOpen)?;
1715         self.vm
1716             .enable_sgx_attribute(file)
1717             .map_err(Error::SgxEnableProvisioning)?;
1718 
1719         // Go over each EPC section and verify its size is a 4k multiple. At
1720         // the same time, calculate the total size needed for the contiguous
1721         // EPC region.
1722         let mut epc_region_size = 0;
1723         for epc_section in sgx_epc_config.iter() {
1724             if epc_section.size == 0 {
1725                 return Err(Error::EpcSectionSizeInvalid);
1726             }
1727             if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
1728                 return Err(Error::EpcSectionSizeInvalid);
1729             }
1730 
1731             epc_region_size += epc_section.size;
1732         }
1733 
1734         // Place the SGX EPC region on a 4k boundary between the RAM and the device area
1735         let epc_region_start = GuestAddress(
1736             ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
1737         );
1738 
1739         self.start_of_device_area = epc_region_start
1740             .checked_add(epc_region_size)
1741             .ok_or(Error::GuestAddressOverFlow)?;
1742 
1743         let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
1744         info!(
1745             "SGX EPC region: 0x{:x} (0x{:x})",
1746             epc_region_start.0, epc_region_size
1747         );
1748 
1749         // Each section can be memory mapped into the allocated region.
1750         let mut epc_section_start = epc_region_start.raw_value();
1751         for epc_section in sgx_epc_config.iter() {
1752             let file = OpenOptions::new()
1753                 .read(true)
1754                 .write(true)
1755                 .open("/dev/sgx_vepc")
1756                 .map_err(Error::SgxVirtEpcOpen)?;
1757 
1758             let prot = PROT_READ | PROT_WRITE;
1759             let mut flags = MAP_NORESERVE | MAP_SHARED;
1760             if epc_section.prefault {
1761                 flags |= MAP_POPULATE;
1762             }
1763 
1764             // We can't use the vm-memory crate to perform the memory mapping
1765             // here as it would try to ensure the size of the backing file
1766             // matches the size of the expected mapping. The /dev/sgx_vepc
1767             // device does not work that way: it provides a file descriptor
1768             // whose size does not match the mapping size, as it's just a way
1769             // to let KVM know that an EPC section is being created for the guest.
1770             let host_addr = unsafe {
1771                 libc::mmap(
1772                     std::ptr::null_mut(),
1773                     epc_section.size as usize,
1774                     prot,
1775                     flags,
1776                     file.as_raw_fd(),
1777                     0,
1778                 )
1779             } as u64;
1780 
1781             info!(
1782                 "Adding SGX EPC section: 0x{:x} (0x{:x})",
1783                 epc_section_start, epc_section.size
1784             );
1785 
1786             let _mem_slot = self.create_userspace_mapping(
1787                 epc_section_start,
1788                 epc_section.size,
1789                 host_addr,
1790                 false,
1791                 false,
1792                 false,
1793             )?;
1794 
1795             sgx_epc_region.insert(
1796                 epc_section.id.clone(),
1797                 SgxEpcSection::new(
1798                     GuestAddress(epc_section_start),
1799                     epc_section.size as GuestUsize,
1800                 ),
1801             );
1802 
1803             epc_section_start += epc_section.size;
1804         }
1805 
1806         self.sgx_epc_region = Some(sgx_epc_region);
1807 
1808         Ok(())
1809     }
1810 
1811     #[cfg(target_arch = "x86_64")]
1812     pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
1813         &self.sgx_epc_region
1814     }
1815 
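    // Note: st_nlink > 0 means the file still has at least one directory
    // entry on the host filesystem, i.e. it was not created anonymously
    // (e.g. via memfd) nor unlinked, so the user can still reach its content
    // through a path.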
1816     pub fn is_hardlink(f: &File) -> bool {
1817         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
1818         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
1819         if ret != 0 {
1820             error!("Couldn't fstat the backing file");
1821             return false;
1822         }
1823 
1824         unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
1825     }
1826 
1827     pub fn memory_zones(&self) -> &MemoryZones {
1828         &self.memory_zones
1829     }
1830 
1831     pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
1832         &mut self.memory_zones
1833     }
1834 
1835     pub fn memory_range_table(
1836         &self,
1837         snapshot: bool,
1838     ) -> std::result::Result<MemoryRangeTable, MigratableError> {
1839         let mut table = MemoryRangeTable::default();
1840 
1841         for memory_zone in self.memory_zones.values() {
1842             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
1843                 table.extend(virtio_mem_zone.plugged_ranges());
1844             }
1845 
1846             for region in memory_zone.regions() {
1847                 if snapshot {
1848                     if let Some(file_offset) = region.file_offset() {
1849                         if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
1850                             && Self::is_hardlink(file_offset.file())
1851                         {
1852                             // In this very specific case, we know the memory
1853                             // region is backed by a file on the host filesystem
1854                             // that can be accessed by the user, and additionally
1855                             // the mapping is shared, which means that modifications
1856                             // to the content are written to the actual file.
1857                             // When these conditions are met, we can skip copying
1858                             // the memory content for this specific region, as we
1859                             // can assume the user already has it saved through
1860                             // the backing file.
1861                             continue;
1862                         }
1863                     }
1864                 }
1865 
1866                 table.push(MemoryRange {
1867                     gpa: region.start_addr().raw_value(),
1868                     length: region.len() as u64,
1869                 });
1870             }
1871         }
1872 
1873         Ok(table)
1874     }
1875 
1876     pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
1877         MemoryManagerSnapshotData {
1878             memory_ranges: self.snapshot_memory_ranges.clone(),
1879             guest_ram_mappings: self.guest_ram_mappings.clone(),
1880             start_of_device_area: self.start_of_device_area.0,
1881             boot_ram: self.boot_ram,
1882             current_ram: self.current_ram,
1883             arch_mem_regions: self.arch_mem_regions.clone(),
1884             hotplug_slots: self.hotplug_slots.clone(),
1885             next_memory_slot: self.next_memory_slot,
1886             selected_slot: self.selected_slot,
1887             next_hotplug_slot: self.next_hotplug_slot,
1888         }
1889     }
1890 
1891     pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
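        // Every guest RAM mapping is expected to be backed by a file here,
        // hence the unwrap() on file_offset() below.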
1892         let mut memory_slot_fds = HashMap::new();
1893         for guest_ram_mapping in &self.guest_ram_mappings {
1894             let slot = guest_ram_mapping.slot;
1895             let guest_memory = self.guest_memory.memory();
1896             let file = guest_memory
1897                 .find_region(GuestAddress(guest_ram_mapping.gpa))
1898                 .unwrap()
1899                 .file_offset()
1900                 .unwrap()
1901                 .file();
1902             memory_slot_fds.insert(slot, file.as_raw_fd());
1903         }
1904         memory_slot_fds
1905     }
1906 
1907     pub fn acpi_address(&self) -> Option<GuestAddress> {
1908         self.acpi_address
1909     }
1910 
1911     pub fn num_guest_ram_mappings(&self) -> u32 {
1912         self.guest_ram_mappings.len() as u32
1913     }
1914 
1915     #[cfg(target_arch = "aarch64")]
1916     pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1917         self.uefi_flash.as_ref().unwrap().clone()
1918     }
1919 
1920     #[cfg(feature = "guest_debug")]
1921     pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
1922         let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
1923         mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);
1924 
1925         let mut mem_offset_in_elf = mem_offset;
1926         let mut ram_maps = BTreeMap::new();
1927         for mapping in mapping_sorted_by_gpa.iter() {
1928             ram_maps.insert(
1929                 mapping.gpa,
1930                 CoredumpMemoryRegion {
1931                     mem_offset_in_elf,
1932                     mem_size: mapping.size,
1933                 },
1934             );
1935             mem_offset_in_elf += mapping.size;
1936         }
1937 
1938         CoredumpMemoryRegions { ram_maps }
1939     }
1940 
1941     #[cfg(feature = "guest_debug")]
1942     pub fn coredump_iterate_save_mem(
1943         &mut self,
1944         dump_state: &DumpState,
1945     ) -> std::result::Result<(), GuestDebuggableError> {
1946         let snapshot_memory_ranges = self
1947             .memory_range_table(false)
1948             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
1949 
1950         if snapshot_memory_ranges.is_empty() {
1951             return Ok(());
1952         }
1953 
1954         let mut coredump_file = dump_state.file.as_ref().unwrap();
1955 
1956         let guest_memory = self.guest_memory.memory();
1957         let mut total_bytes: u64 = 0;
1958 
1959         for range in snapshot_memory_ranges.regions() {
1960             let mut offset: u64 = 0;
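            // write_to() may copy fewer bytes than requested, so keep going
            // until the whole range has been written out.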
1961             loop {
1962                 let bytes_written = guest_memory
1963                     .write_to(
1964                         GuestAddress(range.gpa + offset),
1965                         &mut coredump_file,
1966                         (range.length - offset) as usize,
1967                     )
1968                     .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
1969                 offset += bytes_written as u64;
1970                 total_bytes += bytes_written as u64;
1971 
1972                 if offset == range.length {
1973                     break;
1974                 }
1975             }
1976         }
1977 
1978         debug!("coredump total bytes {}", total_bytes);
1979         Ok(())
1980     }
1981 }
1982 
1983 struct MemoryNotify {
1984     slot_id: usize,
1985 }
1986 
1987 impl Aml for MemoryNotify {
1988     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1989         let object = aml::Path::new(&format!("M{:03}", self.slot_id));
1990         aml::If::new(
1991             &aml::Equal::new(&aml::Arg(0), &self.slot_id),
1992             vec![&aml::Notify::new(&object, &aml::Arg(1))],
1993         )
1994         .append_aml_bytes(bytes)
1995     }
1996 }
1997 
1998 struct MemorySlot {
1999     slot_id: usize,
2000 }
2001 
2002 impl Aml for MemorySlot {
2003     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2004         aml::Device::new(
2005             format!("M{:03}", self.slot_id).as_str().into(),
2006             vec![
2007                 &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0C80")),
2008                 &aml::Name::new("_UID".into(), &self.slot_id),
2009                 /*
2010                 _STA return value:
2011                 Bit [0] – Set if the device is present.
2012                 Bit [1] – Set if the device is enabled and decoding its resources.
2013                 Bit [2] – Set if the device should be shown in the UI.
2014                 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
2015                 Bit [4] – Set if the battery is present.
2016                 Bits [31:5] – Reserved (must be cleared).
2017                 */
2018                 &aml::Method::new(
2019                     "_STA".into(),
2020                     0,
2021                     false,
2022                     // Call into MSTA method which will interrogate device
2023                     vec![&aml::Return::new(&aml::MethodCall::new(
2024                         "MSTA".into(),
2025                         vec![&self.slot_id],
2026                     ))],
2027                 ),
2028                 // Get details of memory
2029                 &aml::Method::new(
2030                     "_CRS".into(),
2031                     0,
2032                     false,
2033                     // Call into MCRS which provides actual memory details
2034                     vec![&aml::Return::new(&aml::MethodCall::new(
2035                         "MCRS".into(),
2036                         vec![&self.slot_id],
2037                     ))],
2038                 ),
2039             ],
2040         )
2041         .append_aml_bytes(bytes)
2042     }
2043 }
2044 
2045 struct MemorySlots {
2046     slots: usize,
2047 }
2048 
2049 impl Aml for MemorySlots {
2050     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2051         for slot_id in 0..self.slots {
2052             MemorySlot { slot_id }.append_aml_bytes(bytes);
2053         }
2054     }
2055 }
2056 
2057 struct MemoryMethods {
2058     slots: usize,
2059 }
2060 
2061 impl Aml for MemoryMethods {
2062     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2063         // Add "MTFY" notification method
2064         let mut memory_notifies = Vec::new();
2065         for slot_id in 0..self.slots {
2066             memory_notifies.push(MemoryNotify { slot_id });
2067         }
2068 
2069         let mut memory_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
2070         for memory_notifier in memory_notifies.iter() {
2071             memory_notifies_refs.push(memory_notifier);
2072         }
2073 
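        // MTFY(slot, value) forwards an ACPI Notify carrying "value" to the
        // MXXX device whose index matches the slot number passed as Arg0.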
2074         aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).append_aml_bytes(bytes);
2075 
2076         // MSCN method
2077         aml::Method::new(
2078             "MSCN".into(),
2079             0,
2080             true,
2081             vec![
2082                 // Take lock defined above
2083                 &aml::Acquire::new("MLCK".into(), 0xffff),
2084                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2085                 &aml::While::new(
2086                     &aml::LessThan::new(&aml::Local(0), &self.slots),
2087                     vec![
2088                         // Write slot number (in first argument) to I/O port via field
2089                         &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
2090                         // Check if MINS bit is set (inserting)
2091                         &aml::If::new(
2092                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2093                             // Notify device if it is
2094                             vec![
2095                                 &aml::MethodCall::new(
2096                                     "MTFY".into(),
2097                                     vec![&aml::Local(0), &aml::ONE],
2098                                 ),
2099                                 // Reset MINS bit
2100                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2101                             ],
2102                         ),
2103                         // Check if MRMV bit is set
2104                         &aml::If::new(
2105                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2106                             // Notify device if it is (with the eject constant 0x3)
2107                             vec![
2108                                 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
2109                                 // Reset MRMV bit
2110                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2111                             ],
2112                         ),
2113                         &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2114                     ],
2115                 ),
2116                 // Release lock
2117                 &aml::Release::new("MLCK".into()),
2118             ],
2119         )
2120         .append_aml_bytes(bytes);
2121 
2122         // Memory status method
2123         aml::Method::new(
2124             "MSTA".into(),
2125             1,
2126             true,
2127             vec![
2128                 // Take lock defined above
2129                 &aml::Acquire::new("MLCK".into(), 0xffff),
2130                 // Write slot number (in first argument) to I/O port via field
2131                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2132                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2133                 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
2134                 &aml::If::new(
2135                     &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
2136                     vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2137                 ),
2138                 // Release lock
2139                 &aml::Release::new("MLCK".into()),
2140                 // Return 0 or 0xf
2141                 &aml::Return::new(&aml::Local(0)),
2142             ],
2143         )
2144         .append_aml_bytes(bytes);
2145 
2146         // Memory range method
2147         aml::Method::new(
2148             "MCRS".into(),
2149             1,
2150             true,
2151             vec![
2152                 // Take lock defined above
2153                 &aml::Acquire::new("MLCK".into(), 0xffff),
2154                 // Write slot number (in first argument) to I/O port via field
2155                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2156                 &aml::Name::new(
2157                     "MR64".into(),
2158                     &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2159                         aml::AddressSpaceCachable::Cacheable,
2160                         true,
2161                         0x0000_0000_0000_0000u64,
2162                         0xFFFF_FFFF_FFFF_FFFEu64,
2163                     )]),
2164                 ),
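                // MINL/MINH, MAXL/MAXH and LENL/LENH overlay the low/high
                // halves of the minimum, maximum and length fields of the
                // address space descriptor inside the MR64 buffer.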
2165                 &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &14usize, "MINL".into()),
2166                 &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &18usize, "MINH".into()),
2167                 &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &22usize, "MAXL".into()),
2168                 &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &26usize, "MAXH".into()),
2169                 &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &38usize, "LENL".into()),
2170                 &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &42usize, "LENH".into()),
2171                 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
2172                 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
2173                 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
2174                 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
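                // Compute the range maximum as base + length - 1 using the
                // 32-bit halves, carrying into MAXH when the low-half
                // addition wraps around.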
2175                 &aml::Add::new(
2176                     &aml::Path::new("MAXL"),
2177                     &aml::Path::new("MINL"),
2178                     &aml::Path::new("LENL"),
2179                 ),
2180                 &aml::Add::new(
2181                     &aml::Path::new("MAXH"),
2182                     &aml::Path::new("MINH"),
2183                     &aml::Path::new("LENH"),
2184                 ),
2185                 &aml::If::new(
2186                     &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
2187                     vec![&aml::Add::new(
2188                         &aml::Path::new("MAXH"),
2189                         &aml::ONE,
2190                         &aml::Path::new("MAXH"),
2191                     )],
2192                 ),
2193                 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
2194                 // Release lock
2195                 &aml::Release::new("MLCK".into()),
2196                 &aml::Return::new(&aml::Path::new("MR64")),
2197             ],
2198         )
2199         .append_aml_bytes(bytes)
2200     }
2201 }
2202 
2203 impl Aml for MemoryManager {
2204     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2205         if let Some(acpi_address) = self.acpi_address {
2206             // Memory Hotplug Controller
2207             aml::Device::new(
2208                 "_SB_.MHPC".into(),
2209                 vec![
2210                     &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
2211                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2212                     // Mutex to protect concurrent access as we write to choose slot and then read back status
2213                     &aml::Mutex::new("MLCK".into(), 0),
2214                     &aml::Name::new(
2215                         "_CRS".into(),
2216                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2217                             aml::AddressSpaceCachable::NotCacheable,
2218                             true,
2219                             acpi_address.0 as u64,
2220                             acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
2221                         )]),
2222                     ),
2223                     // OpRegion and Fields map MMIO range into individual field values
2224                     &aml::OpRegion::new(
2225                         "MHPR".into(),
2226                         aml::OpRegionSpace::SystemMemory,
2227                         acpi_address.0 as usize,
2228                         MEMORY_MANAGER_ACPI_SIZE,
2229                     ),
2230                     &aml::Field::new(
2231                         "MHPR".into(),
2232                         aml::FieldAccessType::DWord,
2233                         aml::FieldUpdateRule::Preserve,
2234                         vec![
2235                             aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
2236                             aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
2237                             aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
2238                             aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
2239                         ],
2240                     ),
2241                     &aml::Field::new(
2242                         "MHPR".into(),
2243                         aml::FieldAccessType::DWord,
2244                         aml::FieldUpdateRule::Preserve,
2245                         vec![
2246                             aml::FieldEntry::Reserved(128),
2247                             aml::FieldEntry::Named(*b"MHPX", 32), // PXM
2248                         ],
2249                     ),
2250                     &aml::Field::new(
2251                         "MHPR".into(),
2252                         aml::FieldAccessType::Byte,
2253                         aml::FieldUpdateRule::WriteAsZeroes,
2254                         vec![
2255                             aml::FieldEntry::Reserved(160),
2256                             aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
2257                             aml::FieldEntry::Named(*b"MINS", 1), // Inserting
2258                             aml::FieldEntry::Named(*b"MRMV", 1), // Removing
2259                             aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
2260                         ],
2261                     ),
2262                     &aml::Field::new(
2263                         "MHPR".into(),
2264                         aml::FieldAccessType::DWord,
2265                         aml::FieldUpdateRule::Preserve,
2266                         vec![
2267                             aml::FieldEntry::Named(*b"MSEL", 32), // Selector
2268                             aml::FieldEntry::Named(*b"MOEV", 32), // Event
2269                             aml::FieldEntry::Named(*b"MOSC", 32), // OSC
2270                         ],
2271                     ),
2272                     &MemoryMethods {
2273                         slots: self.hotplug_slots.len(),
2274                     },
2275                     &MemorySlots {
2276                         slots: self.hotplug_slots.len(),
2277                     },
2278                 ],
2279             )
2280             .append_aml_bytes(bytes);
2281         } else {
2282             aml::Device::new(
2283                 "_SB_.MHPC".into(),
2284                 vec![
2285                     &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
2286                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2287                     // Empty MSCN for GED
2288                     &aml::Method::new("MSCN".into(), 0, true, vec![]),
2289                 ],
2290             )
2291             .append_aml_bytes(bytes);
2292         }
2293 
2294         #[cfg(target_arch = "x86_64")]
2295         {
2296             if let Some(sgx_epc_region) = &self.sgx_epc_region {
2297                 let min = sgx_epc_region.start().raw_value() as u64;
2298                 let max = min + sgx_epc_region.size() as u64 - 1;
2299                 // SGX EPC region
2300                 aml::Device::new(
2301                     "_SB_.EPC_".into(),
2302                     vec![
2303                         &aml::Name::new("_HID".into(), &aml::EisaName::new("INT0E0C")),
2304                         // QWORD describing the EPC region start and size
2305                         &aml::Name::new(
2306                             "_CRS".into(),
2307                             &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2308                                 aml::AddressSpaceCachable::NotCacheable,
2309                                 true,
2310                                 min,
2311                                 max,
2312                             )]),
2313                         ),
2314                         &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
2315                     ],
2316                 )
2317                 .append_aml_bytes(bytes);
2318             }
2319         }
2320     }
2321 }
2322 
2323 impl Pausable for MemoryManager {}
2324 
2325 #[derive(Clone, Serialize, Deserialize, Versionize)]
2326 pub struct MemoryManagerSnapshotData {
2327     memory_ranges: MemoryRangeTable,
2328     guest_ram_mappings: Vec<GuestRamMapping>,
2329     start_of_device_area: u64,
2330     boot_ram: u64,
2331     current_ram: u64,
2332     arch_mem_regions: Vec<ArchMemRegion>,
2333     hotplug_slots: Vec<HotPlugState>,
2334     next_memory_slot: u32,
2335     selected_slot: usize,
2336     next_hotplug_slot: usize,
2337 }
2338 
2339 impl VersionMapped for MemoryManagerSnapshotData {}
2340 
2341 impl Snapshottable for MemoryManager {
2342     fn id(&self) -> String {
2343         MEMORY_MANAGER_SNAPSHOT_ID.to_string()
2344     }
2345 
2346     fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
2347         let mut memory_manager_snapshot = Snapshot::new(MEMORY_MANAGER_SNAPSHOT_ID);
2348 
2349         let memory_ranges = self.memory_range_table(true)?;
2350 
2351         // Store locally this list of ranges as it will be used through the
2352         // Transportable::send() implementation. The point is to avoid the
2353         // duplication of code regarding the creation of the path for each
2354         // region. The 'snapshot' step creates the list of memory regions,
2355         // including information about the need to copy a memory region or
2356         // not. This saves the 'send' step having to go through the same
2357         // process, and instead it can directly proceed with storing the
2358         // memory range content for the ranges requiring it.
2359         self.snapshot_memory_ranges = memory_ranges;
2360 
2361         memory_manager_snapshot.add_data_section(SnapshotDataSection::new_from_versioned_state(
2362             MEMORY_MANAGER_SNAPSHOT_ID,
2363             &self.snapshot_data(),
2364         )?);
2365 
2366         Ok(memory_manager_snapshot)
2367     }
2368 }
2369 
2370 impl Transportable for MemoryManager {
2371     fn send(
2372         &self,
2373         _snapshot: &Snapshot,
2374         destination_url: &str,
2375     ) -> result::Result<(), MigratableError> {
2376         if self.snapshot_memory_ranges.is_empty() {
2377             return Ok(());
2378         }
2379 
2380         let mut memory_file_path = url_to_path(destination_url)?;
2381         memory_file_path.push(String::from(SNAPSHOT_FILENAME));
2382 
2383         // Create the snapshot file for the entire memory
2384         let mut memory_file = OpenOptions::new()
2385             .read(true)
2386             .write(true)
2387             .create_new(true)
2388             .open(memory_file_path)
2389             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2390 
2391         let guest_memory = self.guest_memory.memory();
2392 
2393         for range in self.snapshot_memory_ranges.regions() {
2394             let mut offset: u64 = 0;
2395             // Here we manually handle the retry in case the whole region
2396             // can't be transferred at once, because we can't use the
2397             // write_all_to() implementation from vm-memory::GuestMemory as
2398             // it does not follow the correct behavior. For more info about
2399             // this issue see: https://github.com/rust-vmm/vm-memory/issues/174
2400             loop {
2401                 let bytes_written = guest_memory
2402                     .write_to(
2403                         GuestAddress(range.gpa + offset),
2404                         &mut memory_file,
2405                         (range.length - offset) as usize,
2406                     )
2407                     .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2408                 offset += bytes_written as u64;
2409 
2410                 if offset == range.length {
2411                     break;
2412                 }
2413             }
2414         }
2415         Ok(())
2416     }
2417 }
2418 
2419 impl Migratable for MemoryManager {
2420     // Start the dirty log in the hypervisor (kvm/mshv).
2421     // Also, reset the dirty bitmap logged by the vmm.
2422     // Just before we do a bulk copy we want to start/clear the dirty log so that
2423     // pages touched during our bulk copy are tracked.
2424     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2425         self.vm.start_dirty_log().map_err(|e| {
2426             MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
2427         })?;
2428 
2429         for r in self.guest_memory.memory().iter() {
2430             r.bitmap().reset();
2431         }
2432 
2433         Ok(())
2434     }
2435 
2436     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2437         self.vm.stop_dirty_log().map_err(|e| {
2438             MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
2439         })?;
2440 
2441         Ok(())
2442     }
2443 
2444     // Generate a table for the pages that are dirty. The dirty pages are collapsed
2445     // together in the table if they are contiguous.
2446     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2447         let mut table = MemoryRangeTable::default();
2448         for r in &self.guest_ram_mappings {
2449             let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
2450                 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
2451             })?;
2452             let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
2453             {
2454                 Some(region) => {
2455                     assert!(region.start_addr().raw_value() == r.gpa);
2456                     assert!(region.len() == r.size);
2457                     region.bitmap().get_and_reset()
2458                 }
2459                 None => {
2460                     return Err(MigratableError::MigrateSend(anyhow!(
2461                         "Error finding 'guest memory region' with address {:x}",
2462                         r.gpa
2463                     )))
2464                 }
2465             };
2466 
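            // A page is considered dirty if either the hypervisor log or the
            // VMM bitmap flagged it, hence the bitwise OR of the two.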
2467             let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
2468                 .iter()
2469                 .zip(vmm_dirty_bitmap.iter())
2470                 .map(|(x, y)| x | y)
2471                 .collect();
2472 
2473             let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);
2474 
2475             if sub_table.regions().is_empty() {
2476                 info!("Dirty Memory Range Table is empty");
2477             } else {
2478                 info!("Dirty Memory Range Table:");
2479                 for range in sub_table.regions() {
2480                     info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
2481                 }
2482             }
2483 
2484             table.extend(sub_table);
2485         }
2486         Ok(table)
2487     }
2488 }
2489