xref: /cloud-hypervisor/vmm/src/memory_manager.rs (revision 7d7bfb2034001d4cb15df2ddc56d2d350c8da30f)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0
4 //
5 #[cfg(target_arch = "x86_64")]
6 use crate::config::SgxEpcConfig;
7 use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
8 use crate::migration::url_to_path;
9 use crate::MEMORY_MANAGER_SNAPSHOT_ID;
10 use crate::{GuestMemoryMmap, GuestRegionMmap};
11 use acpi_tables::{aml, aml::Aml};
12 use anyhow::anyhow;
13 #[cfg(target_arch = "x86_64")]
14 use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
15 use arch::{layout, RegionType};
16 #[cfg(target_arch = "x86_64")]
17 use devices::ioapic;
18 #[cfg(target_arch = "x86_64")]
19 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
use serde::{Deserialize, Serialize};
20 use std::collections::HashMap;
21 use std::convert::TryInto;
22 use std::ffi;
23 use std::fs::{File, OpenOptions};
24 use std::io;
25 use std::ops::Deref;
26 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
27 use std::path::PathBuf;
28 use std::result;
29 use std::sync::{Arc, Barrier, Mutex};
30 use versionize::{VersionMap, Versionize, VersionizeResult};
31 use versionize_derive::Versionize;
32 use virtio_devices::BlocksState;
33 #[cfg(target_arch = "x86_64")]
34 use vm_allocator::GsiApic;
35 use vm_allocator::{AddressAllocator, SystemAllocator};
36 use vm_device::BusDevice;
37 use vm_memory::bitmap::AtomicBitmap;
38 use vm_memory::guest_memory::FileOffset;
39 use vm_memory::{
40     mmap::MmapRegionError, Address, Bytes, Error as MmapError, GuestAddress, GuestAddressSpace,
41     GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion,
42 };
43 use vm_migration::{
44     protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable,
45     Snapshot, SnapshotDataSection, Snapshottable, Transportable, VersionMapped,
46 };
47 
48 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;
49 
50 const DEFAULT_MEMORY_ZONE: &str = "mem0";
51 
52 const SNAPSHOT_FILENAME: &str = "memory-ranges";
53 
54 #[cfg(target_arch = "x86_64")]
55 const X86_64_IRQ_BASE: u32 = 5;
56 
57 #[cfg(target_arch = "x86_64")]
58 const SGX_PAGE_SIZE: u64 = 1 << 12;
59 
60 const HOTPLUG_COUNT: usize = 8;
61 
62 // Memory policy constants
63 const MPOL_BIND: u32 = 2;
64 const MPOL_MF_STRICT: u32 = 1;
65 const MPOL_MF_MOVE: u32 = 1 << 1;
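// Note (added for clarity, not in the original source): these values mirror the
// Linux UAPI definitions from <linux/mempolicy.h> expected by the raw SYS_mbind
// syscall in MemoryManager::mbind() below. MPOL_BIND selects a strict bind
// policy, while MPOL_MF_STRICT | MPOL_MF_MOVE make the kernel move (or report)
// pages already allocated on other nodes.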
66 
67 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
68 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;
69 
70 #[derive(Clone, Default, Serialize, Deserialize, Versionize)]
71 struct HotPlugState {
72     base: u64,
73     length: u64,
74     active: bool,
75     inserting: bool,
76     removing: bool,
77 }
78 
79 pub struct VirtioMemZone {
80     region: Arc<GuestRegionMmap>,
81     resize_handler: virtio_devices::Resize,
82     hotplugged_size: u64,
83     hugepages: bool,
84     blocks_state: Arc<Mutex<BlocksState>>,
85 }
86 
87 impl VirtioMemZone {
88     pub fn region(&self) -> &Arc<GuestRegionMmap> {
89         &self.region
90     }
91     pub fn resize_handler(&self) -> &virtio_devices::Resize {
92         &self.resize_handler
93     }
94     pub fn hotplugged_size(&self) -> u64 {
95         self.hotplugged_size
96     }
97     pub fn hugepages(&self) -> bool {
98         self.hugepages
99     }
100     pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
101         &self.blocks_state
102     }
103     pub fn plugged_ranges(&self) -> MemoryRangeTable {
104         self.blocks_state
105             .lock()
106             .unwrap()
107             .memory_ranges(self.region.start_addr().raw_value(), true)
108     }
109 }
110 
111 #[derive(Default)]
112 pub struct MemoryZone {
113     regions: Vec<Arc<GuestRegionMmap>>,
114     virtio_mem_zone: Option<VirtioMemZone>,
115 }
116 
117 impl MemoryZone {
118     pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
119         &self.regions
120     }
121     pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
122         &self.virtio_mem_zone
123     }
124 }
125 
126 pub type MemoryZones = HashMap<String, MemoryZone>;
127 
128 #[derive(Clone, Serialize, Deserialize, Versionize)]
129 struct GuestRamMapping {
130     slot: u32,
131     gpa: u64,
132     size: u64,
133     zone_id: String,
134     virtio_mem: bool,
135     file_offset: u64,
136 }
137 
138 #[derive(Clone, Serialize, Deserialize, Versionize)]
139 struct ArchMemRegion {
140     base: u64,
141     size: usize,
142     r_type: RegionType,
143 }
144 
145 pub struct MemoryManager {
146     boot_guest_memory: GuestMemoryMmap,
147     guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
148     next_memory_slot: u32,
149     start_of_device_area: GuestAddress,
150     end_of_device_area: GuestAddress,
151     end_of_ram_area: GuestAddress,
152     pub vm: Arc<dyn hypervisor::Vm>,
153     hotplug_slots: Vec<HotPlugState>,
154     selected_slot: usize,
155     mergeable: bool,
156     allocator: Arc<Mutex<SystemAllocator>>,
157     hotplug_method: HotplugMethod,
158     boot_ram: u64,
159     current_ram: u64,
160     next_hotplug_slot: usize,
161     shared: bool,
162     hugepages: bool,
163     hugepage_size: Option<u64>,
164     prefault: bool,
165     #[cfg(target_arch = "x86_64")]
166     sgx_epc_region: Option<SgxEpcRegion>,
167     user_provided_zones: bool,
168     snapshot_memory_ranges: MemoryRangeTable,
169     memory_zones: MemoryZones,
170     log_dirty: bool, // Enable dirty logging for created RAM regions
171     arch_mem_regions: Vec<ArchMemRegion>,
172     ram_allocator: AddressAllocator,
173     dynamic: bool,
174 
175     // Keep track of calls to create_userspace_mapping() for guest RAM.
176     // This is useful for getting the dirty pages as we need to know the
177     // slots that the mapping is created in.
178     guest_ram_mappings: Vec<GuestRamMapping>,
179 
180     pub acpi_address: Option<GuestAddress>,
181 }
182 
183 #[derive(Debug)]
184 pub enum Error {
185     /// Failed to create shared file.
186     SharedFileCreate(io::Error),
187 
188     /// Failed to set shared file length.
189     SharedFileSetLen(io::Error),
190 
191     /// Mmap backed guest memory error
192     GuestMemory(MmapError),
193 
194     /// Failed to allocate a memory range.
195     MemoryRangeAllocation,
196 
197     /// Error from region creation
198     GuestMemoryRegion(MmapRegionError),
199 
200     /// No ACPI slot available
201     NoSlotAvailable,
202 
203     /// Not enough space in the hotplug RAM region
204     InsufficientHotplugRam,
205 
206     /// The requested hotplug memory addition is not a valid size
207     InvalidSize,
208 
209     /// Failed to create the user memory region.
210     CreateUserMemoryRegion(hypervisor::HypervisorVmError),
211 
212     /// Failed to remove the user memory region.
213     RemoveUserMemoryRegion(hypervisor::HypervisorVmError),
214 
215     /// Failed to create the EventFd.
216     EventFdFail(io::Error),
217 
218     /// Eventfd write error
219     EventfdError(io::Error),
220 
221     /// Failed to resize virtio-mem
222     VirtioMemResizeFail(virtio_devices::mem::Error),
223 
224     /// Cannot restore VM
225     Restore(MigratableError),
226 
227     /// Cannot restore VM because source URL is missing
228     RestoreMissingSourceUrl,
229 
230     /// Cannot create the system allocator
231     CreateSystemAllocator,
232 
233     /// Invalid SGX EPC section size
234     #[cfg(target_arch = "x86_64")]
235     EpcSectionSizeInvalid,
236 
237     /// Failed allocating SGX EPC region
238     #[cfg(target_arch = "x86_64")]
239     SgxEpcRangeAllocation,
240 
241     /// Failed opening SGX virtual EPC device
242     #[cfg(target_arch = "x86_64")]
243     SgxVirtEpcOpen(io::Error),
244 
245     /// Failed setting the SGX virtual EPC section size
246     #[cfg(target_arch = "x86_64")]
247     SgxVirtEpcFileSetLen(io::Error),
248 
249     /// Failed opening SGX provisioning device
250     #[cfg(target_arch = "x86_64")]
251     SgxProvisionOpen(io::Error),
252 
253     /// Failed enabling SGX provisioning
254     #[cfg(target_arch = "x86_64")]
255     SgxEnableProvisioning(hypervisor::HypervisorVmError),
256 
257     /// Failed creating a new MmapRegion instance.
258     #[cfg(target_arch = "x86_64")]
259     NewMmapRegion(vm_memory::mmap::MmapRegionError),
260 
261     /// No memory zones found.
262     MissingMemoryZones,
263 
264     /// Memory configuration is not valid.
265     InvalidMemoryParameters,
266 
267     /// Forbidden operation. The guest memory can't be resized if it is
268     /// backed by user defined memory regions.
269     InvalidResizeWithMemoryZones,
270 
271     /// It's invalid to try applying a NUMA policy to a memory zone that is
272     /// memory mapped with MAP_SHARED.
273     InvalidSharedMemoryZoneWithHostNuma,
274 
275     /// Failed applying NUMA memory policy.
276     ApplyNumaPolicy(io::Error),
277 
278     /// Memory zone identifier is not unique.
279     DuplicateZoneId,
280 
281     /// No virtio-mem resizing handler found.
282     MissingVirtioMemHandler,
283 
284     /// Unknown memory zone.
285     UnknownMemoryZone,
286 
287     /// Invalid size for resizing. The size can be anything except 0.
288     InvalidHotplugSize,
289 
290     /// Invalid hotplug method associated with memory zones resizing capability.
291     InvalidHotplugMethodWithMemoryZones,
292 
293     /// Could not find specified memory zone identifier from hash map.
294     MissingZoneIdentifier,
295 
296     /// Resizing the memory zone failed.
297     ResizeZone,
298 
299     /// Guest address overflow
300     GuestAddressOverFlow,
301 
302     /// Error opening snapshot file
303     SnapshotOpen(io::Error),
304 
305     /// Error copying snapshot into region
306     SnapshotCopy(GuestMemoryError),
307 
308     /// Failed to allocate MMIO address
309     AllocateMmioAddress,
310 }
311 
312 const ENABLE_FLAG: usize = 0;
313 const INSERTING_FLAG: usize = 1;
314 const REMOVING_FLAG: usize = 2;
315 const EJECT_FLAG: usize = 3;
316 
317 const BASE_OFFSET_LOW: u64 = 0;
318 const BASE_OFFSET_HIGH: u64 = 0x4;
319 const LENGTH_OFFSET_LOW: u64 = 0x8;
320 const LENGTH_OFFSET_HIGH: u64 = 0xC;
321 const STATUS_OFFSET: u64 = 0x14;
322 const SELECTION_OFFSET: u64 = 0;
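// Illustrative register flow (summarizing the BusDevice impl below, not part of
// the original source): the guest first writes a slot index to SELECTION_OFFSET,
// then reads the slot's base and length from the BASE_*/LENGTH_* registers and
// its flags from STATUS_OFFSET; writing the INSERTING/REMOVING bits back to
// STATUS_OFFSET acknowledges a pending hotplug event.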
323 
324 // 64k is subtracted from the MMIO address space size. This is done for the
325 // following reasons:
326 //  - Reduce the addressable space size by at least 4k to work around a Linux
327 //    bug when the VMM allocates devices at the end of the addressable space
328 //  - Windows requires the addressable space size to be 64k aligned
329 fn mmio_address_space_size(phys_bits: u8) -> u64 {
330     (1 << phys_bits) - (1 << 16)
331 }
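// Worked example (illustrative, not part of the original source):
// mmio_address_space_size(40) == (1 << 40) - (1 << 16) == 0xff_ffff_0000,
// i.e. 1 TiB minus 64 KiB, which satisfies the 64 KiB alignment checked by the
// debug_assert_eq!() in MemoryManager::new().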
332 
333 impl BusDevice for MemoryManager {
334     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
335         if self.selected_slot < self.hotplug_slots.len() {
336             let state = &self.hotplug_slots[self.selected_slot];
337             match offset {
338                 BASE_OFFSET_LOW => {
339                     data.copy_from_slice(&state.base.to_le_bytes()[..4]);
340                 }
341                 BASE_OFFSET_HIGH => {
342                     data.copy_from_slice(&state.base.to_le_bytes()[4..]);
343                 }
344                 LENGTH_OFFSET_LOW => {
345                     data.copy_from_slice(&state.length.to_le_bytes()[..4]);
346                 }
347                 LENGTH_OFFSET_HIGH => {
348                     data.copy_from_slice(&state.length.to_le_bytes()[4..]);
349                 }
350                 STATUS_OFFSET => {
351                     // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
352                     data.fill(0);
353                     if state.active {
354                         data[0] |= 1 << ENABLE_FLAG;
355                     }
356                     if state.inserting {
357                         data[0] |= 1 << INSERTING_FLAG;
358                     }
359                     if state.removing {
360                         data[0] |= 1 << REMOVING_FLAG;
361                     }
362                 }
363                 _ => {
364                     warn!(
365                         "Unexpected offset for accessing memory manager device: {:#}",
366                         offset
367                     );
368                 }
369             }
370         } else {
371             warn!("Out of range memory slot: {}", self.selected_slot);
372         }
373     }
374 
375     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
376         match offset {
377             SELECTION_OFFSET => {
378                 self.selected_slot = usize::from(data[0]);
379             }
380             STATUS_OFFSET => {
381                 if self.selected_slot < self.hotplug_slots.len() {
382                     let state = &mut self.hotplug_slots[self.selected_slot];
383                     // The ACPI code writes back a 1 to acknowledge the insertion
384                     if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
385                         state.inserting = false;
386                     }
387                     // Ditto for removal
388                     if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
389                         state.removing = false;
390                     }
391                     // Trigger removal of "DIMM"
392                     if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
393                         warn!("Ejection of memory not currently supported");
394                     }
395                 } else {
396                     warn!("Out of range memory slot: {}", self.selected_slot);
397                 }
398             }
399             _ => {
400                 warn!(
401                     "Unexpected offset for accessing memory manager device: {:#}",
402                     offset
403                 );
404             }
405         };
406         None
407     }
408 }
409 
410 impl MemoryManager {
411     /// Creates all memory regions based on the available RAM ranges defined
412     /// by `ram_regions`, and based on the description of the memory zones.
413     /// In practice, this function can perform multiple memory mappings of the
414     /// same backing file if there's a hole in the address space between two
415     /// RAM ranges.
416     /// For example, if `ram_regions` contains 2 regions (0-3G and 4G-6G)
417     /// and `zones` contains two zones (of size 1G and 4G),
418     /// this function will create 3 resulting memory regions:
419     /// - The first maps the first memory zone entirely over the 0-1G range
420     /// - The second maps part of the second memory zone over the 1G-3G range
421     /// - The third maps the rest of the second memory zone over the 4G-6G range
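    /// Note that when a zone is split across RAM ranges (the second zone in
    /// this example), the later mapping reuses the same backing file at the
    /// corresponding offset (2G here), since the file offset tracks how much
    /// of the zone has already been mapped.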
422     fn create_memory_regions_from_zones(
423         ram_regions: &[(GuestAddress, usize)],
424         zones: &[MemoryZoneConfig],
425         prefault: Option<bool>,
426     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
427         let mut zones = zones.to_owned();
428         let mut mem_regions = Vec::new();
429         let mut zone = zones.remove(0);
430         let mut zone_offset = 0;
431         let mut memory_zones = HashMap::new();
432 
433         // Add zone id to the list of memory zones.
434         memory_zones.insert(zone.id.clone(), MemoryZone::default());
435 
436         for ram_region in ram_regions.iter() {
437             let mut ram_region_offset = 0;
438             let mut exit = false;
439 
440             loop {
441                 let mut ram_region_consumed = false;
442                 let mut pull_next_zone = false;
443 
444                 let ram_region_sub_size = ram_region.1 - ram_region_offset;
445                 let zone_sub_size = zone.size as usize - zone_offset;
446 
447                 let file_offset = zone_offset as u64;
448                 let region_start = ram_region
449                     .0
450                     .checked_add(ram_region_offset as u64)
451                     .ok_or(Error::GuestAddressOverFlow)?;
452                 let region_size = if zone_sub_size <= ram_region_sub_size {
453                     if zone_sub_size == ram_region_sub_size {
454                         ram_region_consumed = true;
455                     }
456 
457                     ram_region_offset += zone_sub_size;
458                     pull_next_zone = true;
459 
460                     zone_sub_size
461                 } else {
462                     zone_offset += ram_region_sub_size;
463                     ram_region_consumed = true;
464 
465                     ram_region_sub_size
466                 };
467 
468                 let region = MemoryManager::create_ram_region(
469                     &zone.file,
470                     file_offset,
471                     region_start,
472                     region_size,
473                     match prefault {
474                         Some(pf) => pf,
475                         None => zone.prefault,
476                     },
477                     zone.shared,
478                     zone.hugepages,
479                     zone.hugepage_size,
480                     zone.host_numa_node,
481                     None,
482                 )?;
483 
484                 // Add region to the list of regions associated with the
485                 // current memory zone.
486                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
487                     memory_zone.regions.push(region.clone());
488                 }
489 
490                 mem_regions.push(region);
491 
492                 if pull_next_zone {
493                     // Get the next zone and reset the offset.
494                     zone_offset = 0;
495                     if zones.is_empty() {
496                         exit = true;
497                         break;
498                     }
499                     zone = zones.remove(0);
500 
501                     // Check if the zone id already exists. If it does, return
502                     // an error as we need unique identifiers. Otherwise, add
503                     // the new zone id to the list of memory zones.
504                     if memory_zones.contains_key(&zone.id) {
505                         error!(
506                             "Memory zone identifier '{}' found more than once. \
507                             It must be unique",
508                             zone.id,
509                         );
510                         return Err(Error::DuplicateZoneId);
511                     }
512                     memory_zones.insert(zone.id.clone(), MemoryZone::default());
513                 }
514 
515                 if ram_region_consumed {
516                     break;
517                 }
518             }
519 
520             if exit {
521                 break;
522             }
523         }
524 
525         Ok((mem_regions, memory_zones))
526     }
527 
528     // Restore the GuestMemory regions along with the MemoryZone zones.
529     fn restore_memory_regions_and_zones(
530         guest_ram_mappings: &[GuestRamMapping],
531         zones_config: &[MemoryZoneConfig],
532         prefault: Option<bool>,
533         mut existing_memory_files: HashMap<u32, File>,
534     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
535         let mut memory_regions = Vec::new();
536         let mut memory_zones = HashMap::new();
537 
538         for zone_config in zones_config {
539             memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
540         }
541 
542         for guest_ram_mapping in guest_ram_mappings {
543             for zone_config in zones_config {
544                 if guest_ram_mapping.zone_id == zone_config.id {
545                     let region = MemoryManager::create_ram_region(
546                         &zone_config.file,
547                         guest_ram_mapping.file_offset,
548                         GuestAddress(guest_ram_mapping.gpa),
549                         guest_ram_mapping.size as usize,
550                         match prefault {
551                             Some(pf) => pf,
552                             None => zone_config.prefault,
553                         },
554                         zone_config.shared,
555                         zone_config.hugepages,
556                         zone_config.hugepage_size,
557                         zone_config.host_numa_node,
558                         existing_memory_files.remove(&guest_ram_mapping.slot),
559                     )?;
560                     memory_regions.push(Arc::clone(&region));
561                     if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
562                         if guest_ram_mapping.virtio_mem {
563                             let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
564                             let region_size = region.len();
565                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
566                                 region,
567                                 resize_handler: virtio_devices::Resize::new(hotplugged_size)
568                                     .map_err(Error::EventFdFail)?,
569                                 hotplugged_size,
570                                 hugepages: zone_config.hugepages,
571                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
572                             });
573                         } else {
574                             memory_zone.regions.push(region);
575                         }
576                     }
577                 }
578             }
579         }
580 
581         memory_regions.sort_by_key(|x| x.start_addr());
582 
583         Ok((memory_regions, memory_zones))
584     }
585 
586     fn fill_saved_regions(
587         &mut self,
588         file_path: PathBuf,
589         saved_regions: MemoryRangeTable,
590     ) -> Result<(), Error> {
591         if saved_regions.is_empty() {
592             return Ok(());
593         }
594 
595         // Open (read only) the snapshot file.
596         let mut memory_file = OpenOptions::new()
597             .read(true)
598             .open(file_path)
599             .map_err(Error::SnapshotOpen)?;
600 
601         let guest_memory = self.guest_memory.memory();
602         for range in saved_regions.regions() {
603             let mut offset: u64 = 0;
604             // We handle the retry manually here, in case the whole region
605             // can't be filled at once, because the read_exact_from()
606             // implementation from vm_memory::GuestMemory can't be used as it
607             // does not follow the correct behavior. For more info about this
608             // issue see: https://github.com/rust-vmm/vm-memory/issues/174
609             loop {
610                 let bytes_read = guest_memory
611                     .read_from(
612                         GuestAddress(range.gpa + offset),
613                         &mut memory_file,
614                         (range.length - offset) as usize,
615                     )
616                     .map_err(Error::SnapshotCopy)?;
617                 offset += bytes_read as u64;
618 
619                 if offset == range.length {
620                     break;
621                 }
622             }
623         }
624 
625         Ok(())
626     }
627 
628     fn validate_memory_config(
629         config: &MemoryConfig,
630         user_provided_zones: bool,
631     ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
632         let mut allow_mem_hotplug = false;
633 
634         if !user_provided_zones {
635             if config.zones.is_some() {
636                 error!(
637                     "User defined memory regions can't be provided if the \
638                     memory size is not 0"
639                 );
640                 return Err(Error::InvalidMemoryParameters);
641             }
642 
643             if config.hotplug_size.is_some() {
644                 allow_mem_hotplug = true;
645             }
646 
647             if let Some(hotplugged_size) = config.hotplugged_size {
648                 if let Some(hotplug_size) = config.hotplug_size {
649                     if hotplugged_size > hotplug_size {
650                         error!(
651                             "'hotplugged_size' {} can't be bigger than \
652                             'hotplug_size' {}",
653                             hotplugged_size, hotplug_size,
654                         );
655                         return Err(Error::InvalidMemoryParameters);
656                     }
657                 } else {
658                     error!(
659                         "Invalid to define 'hotplugged_size' when there is \
660                         no 'hotplug_size'"
661                     );
662                     return Err(Error::InvalidMemoryParameters);
663                 }
664                 if config.hotplug_method == HotplugMethod::Acpi {
665                     error!(
666                         "Invalid to define 'hotplugged_size' with hotplug \
667                         method 'acpi'"
668                     );
669                     return Err(Error::InvalidMemoryParameters);
670                 }
671             }
672 
673             // Create a single zone from the global memory config. This lets
674             // us reuse the codepath for user defined memory zones.
675             let zones = vec![MemoryZoneConfig {
676                 id: String::from(DEFAULT_MEMORY_ZONE),
677                 size: config.size,
678                 file: None,
679                 shared: config.shared,
680                 hugepages: config.hugepages,
681                 hugepage_size: config.hugepage_size,
682                 host_numa_node: None,
683                 hotplug_size: config.hotplug_size,
684                 hotplugged_size: config.hotplugged_size,
685                 prefault: config.prefault,
686             }];
687 
688             Ok((config.size, zones, allow_mem_hotplug))
689         } else {
690             if config.zones.is_none() {
691                 error!(
692                     "User defined memory regions must be provided if the \
693                     memory size is 0"
694                 );
695                 return Err(Error::MissingMemoryZones);
696             }
697 
698             // Safe to unwrap as we checked right above that some zones
699             // were defined.
700             let zones = config.zones.clone().unwrap();
701             if zones.is_empty() {
702                 return Err(Error::MissingMemoryZones);
703             }
704 
705             let mut total_ram_size: u64 = 0;
706             for zone in zones.iter() {
707                 total_ram_size += zone.size;
708 
709                 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
710                     error!(
711                         "Invalid to set host NUMA policy for a memory zone \
712                         backed by a regular file and mapped as 'shared'"
713                     );
714                     return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
715                 }
716 
717                 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
718                     error!("Invalid to set ACPI hotplug method for memory zones");
719                     return Err(Error::InvalidHotplugMethodWithMemoryZones);
720                 }
721 
722                 if let Some(hotplugged_size) = zone.hotplugged_size {
723                     if let Some(hotplug_size) = zone.hotplug_size {
724                         if hotplugged_size > hotplug_size {
725                             error!(
726                                 "'hotplugged_size' {} can't be bigger than \
727                                 'hotplug_size' {}",
728                                 hotplugged_size, hotplug_size,
729                             );
730                             return Err(Error::InvalidMemoryParameters);
731                         }
732                     } else {
733                         error!(
734                             "Invalid to define 'hotplugged_size' when there is \
735                             no 'hotplug_size' for a memory zone"
736                         );
737                         return Err(Error::InvalidMemoryParameters);
738                     }
739                     if config.hotplug_method == HotplugMethod::Acpi {
740                         error!(
741                             "Invalid to define 'hotplugged_size' with hotplug \
742                             method 'acpi'"
743                         );
744                         return Err(Error::InvalidMemoryParameters);
745                     }
746                 }
747             }
748 
749             Ok((total_ram_size, zones, allow_mem_hotplug))
750         }
751     }
752 
753     fn allocate_address_space(&mut self) -> Result<(), Error> {
754         let mut list = Vec::new();
755 
756         for (zone_id, memory_zone) in self.memory_zones.iter() {
757             let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
758                 memory_zone
759                     .regions()
760                     .iter()
761                     .map(|r| (r.clone(), false))
762                     .collect();
763 
764             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
765                 regions.push((virtio_mem_zone.region().clone(), true));
766             }
767 
768             list.push((zone_id.clone(), regions));
769         }
770 
771         for (zone_id, regions) in list {
772             for (region, virtio_mem) in regions {
773                 let slot = self.create_userspace_mapping(
774                     region.start_addr().raw_value(),
775                     region.len() as u64,
776                     region.as_ptr() as u64,
777                     self.mergeable,
778                     false,
779                     self.log_dirty,
780                 )?;
781 
782                 let file_offset = if let Some(file_offset) = region.file_offset() {
783                     file_offset.start()
784                 } else {
785                     0
786                 };
787 
788                 self.guest_ram_mappings.push(GuestRamMapping {
789                     gpa: region.start_addr().raw_value(),
790                     size: region.len(),
791                     slot,
792                     zone_id: zone_id.clone(),
793                     virtio_mem,
794                     file_offset,
795                 });
796                 self.ram_allocator
797                     .allocate(Some(region.start_addr()), region.len(), None)
798                     .ok_or(Error::MemoryRangeAllocation)?;
799             }
800         }
801 
802         // Allocate SubRegion and Reserved address ranges.
803         for region in self.arch_mem_regions.iter() {
804             if region.r_type == RegionType::Ram {
805                 // Ignore the RAM type since ranges have already been allocated
806                 // based on the GuestMemory regions.
807                 continue;
808             }
809             self.ram_allocator
810                 .allocate(
811                     Some(GuestAddress(region.base)),
812                     region.size as GuestUsize,
813                     None,
814                 )
815                 .ok_or(Error::MemoryRangeAllocation)?;
816         }
817 
818         Ok(())
819     }
820 
821     #[allow(clippy::too_many_arguments)]
822     pub fn new(
823         vm: Arc<dyn hypervisor::Vm>,
824         config: &MemoryConfig,
825         prefault: Option<bool>,
826         phys_bits: u8,
827         #[cfg(feature = "tdx")] tdx_enabled: bool,
828         restore_data: Option<&MemoryManagerSnapshotData>,
829         existing_memory_files: Option<HashMap<u32, File>>,
830         #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
831     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
832         let user_provided_zones = config.size == 0;
833 
834         let mmio_address_space_size = mmio_address_space_size(phys_bits);
835         debug_assert_eq!(
836             (((mmio_address_space_size) >> 16) << 16),
837             mmio_address_space_size
838         );
839         let start_of_platform_device_area =
840             GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
841         let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);
842 
843         let (ram_size, zones, allow_mem_hotplug) =
844             Self::validate_memory_config(config, user_provided_zones)?;
845 
846         let (
847             start_of_device_area,
848             boot_ram,
849             current_ram,
850             arch_mem_regions,
851             memory_zones,
852             guest_memory,
853             boot_guest_memory,
854             hotplug_slots,
855             next_memory_slot,
856             selected_slot,
857             next_hotplug_slot,
858         ) = if let Some(data) = restore_data {
859             let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
860                 &data.guest_ram_mappings,
861                 &zones,
862                 prefault,
863                 existing_memory_files.unwrap_or_default(),
864             )?;
865             let guest_memory =
866                 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
867             let boot_guest_memory = guest_memory.clone();
868             (
869                 GuestAddress(data.start_of_device_area),
870                 data.boot_ram,
871                 data.current_ram,
872                 data.arch_mem_regions.clone(),
873                 memory_zones,
874                 guest_memory,
875                 boot_guest_memory,
876                 data.hotplug_slots.clone(),
877                 data.next_memory_slot,
878                 data.selected_slot,
879                 data.next_hotplug_slot,
880             )
881         } else {
882             // Init guest memory
883             let arch_mem_regions = arch::arch_memory_regions(ram_size);
884 
885             let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
886                 .iter()
887                 .filter(|r| r.2 == RegionType::Ram)
888                 .map(|r| (r.0, r.1))
889                 .collect();
890 
891             let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
892                 .iter()
893                 .map(|(a, b, c)| ArchMemRegion {
894                     base: a.0,
895                     size: *b,
896                     r_type: *c,
897                 })
898                 .collect();
899 
900             let (mem_regions, mut memory_zones) =
901                 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault)?;
902 
903             let mut guest_memory =
904                 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;
905 
906             let boot_guest_memory = guest_memory.clone();
907 
908             let mut start_of_device_area =
909                 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;
910 
911             // Update list of memory zones for resize.
912             for zone in zones.iter() {
913                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
914                     if let Some(hotplug_size) = zone.hotplug_size {
915                         if hotplug_size == 0 {
916                             error!("'hotplug_size' can't be 0");
917                             return Err(Error::InvalidHotplugSize);
918                         }
919 
920                         if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
921                             start_of_device_area = start_of_device_area
922                                 .checked_add(hotplug_size)
923                                 .ok_or(Error::GuestAddressOverFlow)?;
924                         } else {
925                             // Alignment must be "natural", i.e. the same as the block size
926                             let start_addr = GuestAddress(
927                                 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE
928                                     - 1)
929                                     / virtio_devices::VIRTIO_MEM_ALIGN_SIZE
930                                     * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
931                             );
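                            // Round-up sketch (illustrative, not in the original
                            // source, assuming a 128 MiB VIRTIO_MEM_ALIGN_SIZE):
                            // a start of 0x1_4100_0000 becomes 0x1_4800_0000,
                            // while an already aligned 0x1_4800_0000 is left
                            // unchanged by (x + align - 1) / align * align.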
932 
933                             // When `prefault` is set by vm_restore, the memory manager
934                             // creates the RAM region with the `prefault` option from the
935                             // restore config rather than the one from the zone config.
936                             let region = MemoryManager::create_ram_region(
937                                 &None,
938                                 0,
939                                 start_addr,
940                                 hotplug_size as usize,
941                                 match prefault {
942                                     Some(pf) => pf,
943                                     None => zone.prefault,
944                                 },
945                                 zone.shared,
946                                 zone.hugepages,
947                                 zone.hugepage_size,
948                                 zone.host_numa_node,
949                                 None,
950                             )?;
951 
952                             guest_memory = guest_memory
953                                 .insert_region(Arc::clone(&region))
954                                 .map_err(Error::GuestMemory)?;
955 
956                             let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
957                             let region_size = region.len();
958                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
959                                 region,
960                                 resize_handler: virtio_devices::Resize::new(hotplugged_size)
961                                     .map_err(Error::EventFdFail)?,
962                                 hotplugged_size,
963                                 hugepages: zone.hugepages,
964                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
965                             });
966 
967                             start_of_device_area = start_addr
968                                 .checked_add(hotplug_size)
969                                 .ok_or(Error::GuestAddressOverFlow)?;
970                         }
971                     }
972                 } else {
973                     return Err(Error::MissingZoneIdentifier);
974                 }
975             }
976 
977             let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
978             hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);
979 
980             (
981                 start_of_device_area,
982                 ram_size,
983                 ram_size,
984                 arch_mem_regions,
985                 memory_zones,
986                 guest_memory,
987                 boot_guest_memory,
988                 hotplug_slots,
989                 0,
990                 0,
991                 0,
992             )
993         };
994 
995         let guest_memory = GuestMemoryAtomic::new(guest_memory);
996 
997         // Both MMIO and PIO address spaces start at address 0.
998         let allocator = Arc::new(Mutex::new(
999             SystemAllocator::new(
1000                 #[cfg(target_arch = "x86_64")]
1001                 {
1002                     GuestAddress(0)
1003                 },
1004                 #[cfg(target_arch = "x86_64")]
1005                 {
1006                     1 << 16
1007                 },
1008                 start_of_platform_device_area,
1009                 PLATFORM_DEVICE_AREA_SIZE,
1010                 layout::MEM_32BIT_DEVICES_START,
1011                 layout::MEM_32BIT_DEVICES_SIZE,
1012                 #[cfg(target_arch = "x86_64")]
1013                 vec![GsiApic::new(
1014                     X86_64_IRQ_BASE,
1015                     ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
1016                 )],
1017             )
1018             .ok_or(Error::CreateSystemAllocator)?,
1019         ));
1020 
1021         #[cfg(not(feature = "tdx"))]
1022         let dynamic = true;
1023         #[cfg(feature = "tdx")]
1024         let dynamic = !tdx_enabled;
1025 
1026         let hotplug_method = config.hotplug_method.clone();
1027 
1028         let acpi_address = if dynamic && hotplug_method == HotplugMethod::Acpi {
1029             Some(
1030                 allocator
1031                     .lock()
1032                     .unwrap()
1033                     .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
1034                     .ok_or(Error::AllocateMmioAddress)?,
1035             )
1036         } else {
1037             None
1038         };
1039 
1040         // When running with SGX, the start of the device area and the RAM area may
1041         // diverge, but at this point they are next to each other.
1042         let end_of_ram_area = start_of_device_area.unchecked_sub(1);
1043         let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();
1044 
1045         let mut memory_manager = MemoryManager {
1046             boot_guest_memory,
1047             guest_memory,
1048             next_memory_slot,
1049             start_of_device_area,
1050             end_of_device_area,
1051             end_of_ram_area,
1052             vm,
1053             hotplug_slots,
1054             selected_slot,
1055             mergeable: config.mergeable,
1056             allocator,
1057             hotplug_method,
1058             boot_ram,
1059             current_ram,
1060             next_hotplug_slot,
1061             shared: config.shared,
1062             hugepages: config.hugepages,
1063             hugepage_size: config.hugepage_size,
1064             prefault: config.prefault,
1065             #[cfg(target_arch = "x86_64")]
1066             sgx_epc_region: None,
1067             user_provided_zones,
1068             snapshot_memory_ranges: MemoryRangeTable::default(),
1069             memory_zones,
1070             guest_ram_mappings: Vec::new(),
1071 
1072             acpi_address,
1073             log_dirty: dynamic, // Cannot log dirty pages on a TD
1074             arch_mem_regions,
1075             ram_allocator,
1076             dynamic,
1077         };
1078 
1079         memory_manager.allocate_address_space()?;
1080         #[cfg(target_arch = "x86_64")]
1081         if let Some(sgx_epc_config) = sgx_epc_config {
1082             memory_manager.setup_sgx(sgx_epc_config)?;
1083         }
1084 
1085         Ok(Arc::new(Mutex::new(memory_manager)))
1086     }
1087 
1088     pub fn new_from_snapshot(
1089         snapshot: &Snapshot,
1090         vm: Arc<dyn hypervisor::Vm>,
1091         config: &MemoryConfig,
1092         source_url: Option<&str>,
1093         prefault: bool,
1094         phys_bits: u8,
1095     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
1096         if let Some(source_url) = source_url {
1097             let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
1098             memory_file_path.push(String::from(SNAPSHOT_FILENAME));
1099 
1100             let mem_snapshot: MemoryManagerSnapshotData = snapshot
1101                 .to_versioned_state(MEMORY_MANAGER_SNAPSHOT_ID)
1102                 .map_err(Error::Restore)?;
1103 
1104             let mm = MemoryManager::new(
1105                 vm,
1106                 config,
1107                 Some(prefault),
1108                 phys_bits,
1109                 #[cfg(feature = "tdx")]
1110                 false,
1111                 Some(&mem_snapshot),
1112                 None,
1113                 #[cfg(target_arch = "x86_64")]
1114                 None,
1115             )?;
1116 
1117             mm.lock()
1118                 .unwrap()
1119                 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;
1120 
1121             Ok(mm)
1122         } else {
1123             Err(Error::RestoreMissingSourceUrl)
1124         }
1125     }
1126 
1127     fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
1128         let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };
1129 
1130         if res < 0 {
1131             Err(io::Error::last_os_error())
1132         } else {
1133             Ok(res as RawFd)
1134         }
1135     }
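    // Usage note (added for clarity, not part of the original source):
    // open_memory_file() below relies on this wrapper to create an anonymous
    // "ch_ram" backing file, optionally passing MFD_HUGETLB (plus an encoded
    // huge page size) when hugepages are requested.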
1136 
1137     fn mbind(
1138         addr: *mut u8,
1139         len: u64,
1140         mode: u32,
1141         nodemask: Vec<u64>,
1142         maxnode: u64,
1143         flags: u32,
1144     ) -> Result<(), io::Error> {
1145         let res = unsafe {
1146             libc::syscall(
1147                 libc::SYS_mbind,
1148                 addr as *mut libc::c_void,
1149                 len,
1150                 mode,
1151                 nodemask.as_ptr(),
1152                 maxnode,
1153                 flags,
1154             )
1155         };
1156 
1157         if res < 0 {
1158             Err(io::Error::last_os_error())
1159         } else {
1160             Ok(())
1161         }
1162     }
1163 
1164     fn open_memory_file(
1165         backing_file: &Option<PathBuf>,
1166         file_offset: u64,
1167         size: usize,
1168         hugepages: bool,
1169         hugepage_size: Option<u64>,
1170     ) -> Result<(File, u64), Error> {
1171         let (f, f_off) = match backing_file {
1172             Some(ref file) => {
1173                 if file.is_dir() {
1174                     // Override file offset as it does not apply in this case.
1175                     info!(
1176                         "Ignoring file offset since the backing file is a \
1177                         temporary file created from the specified directory."
1178                     );
1179                     let fs_str = format!("{}{}", file.display(), "/tmpfile_XXXXXX");
1180                     let fs = ffi::CString::new(fs_str).unwrap();
1181                     let mut path = fs.as_bytes_with_nul().to_owned();
1182                     let path_ptr = path.as_mut_ptr() as *mut _;
1183                     let fd = unsafe { libc::mkstemp(path_ptr) };
1184                     unsafe { libc::unlink(path_ptr) };
1185                     let f = unsafe { File::from_raw_fd(fd) };
1186                     f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1187 
1188                     (f, 0)
1189                 } else {
1190                     let f = OpenOptions::new()
1191                         .read(true)
1192                         .write(true)
1193                         .open(file)
1194                         .map_err(Error::SharedFileCreate)?;
1195 
1196                     (f, file_offset)
1197                 }
1198             }
1199             None => {
1200                 let fd = Self::memfd_create(
1201                     &ffi::CString::new("ch_ram").unwrap(),
1202                     if hugepages {
1203                         libc::MFD_HUGETLB
1204                             | if let Some(hugepage_size) = hugepage_size {
1205                                 /*
1206                                  * From the Linux kernel:
1207                                  * Several system calls take a flag to request "hugetlb" huge pages.
1208                                  * Without further specification, these system calls will use the
1209                                  * system's default huge page size.  If a system supports multiple
1210                                  * huge page sizes, the desired huge page size can be specified in
1211                                  * bits [26:31] of the flag arguments.  The value in these 6 bits
1212                                  * will encode the log2 of the huge page size.
1213                                  */
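                                // Worked example (illustrative, not in the
                                // original source): for 2 MiB huge pages,
                                // hugepage_size.trailing_zeros() == 21, so the
                                // value ORed into the flags is 21 << 26, which
                                // matches the kernel's MFD_HUGE_2MB definition.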
1214 
1215                                 hugepage_size.trailing_zeros() << 26
1216                             } else {
1217                                 // Use the system default huge page size
1218                                 0
1219                             }
1220                     } else {
1221                         0
1222                     },
1223                 )
1224                 .map_err(Error::SharedFileCreate)?;
1225 
1226                 let f = unsafe { File::from_raw_fd(fd) };
1227                 f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1228 
1229                 (f, 0)
1230             }
1231         };
1232 
1233         Ok((f, f_off))
1234     }
1235 
1236     #[allow(clippy::too_many_arguments)]
1237     fn create_ram_region(
1238         backing_file: &Option<PathBuf>,
1239         file_offset: u64,
1240         start_addr: GuestAddress,
1241         size: usize,
1242         prefault: bool,
1243         shared: bool,
1244         hugepages: bool,
1245         hugepage_size: Option<u64>,
1246         host_numa_node: Option<u32>,
1247         existing_memory_file: Option<File>,
1248     ) -> Result<Arc<GuestRegionMmap>, Error> {
1249         let (f, f_off) = if let Some(f) = existing_memory_file {
1250             (f, file_offset)
1251         } else {
1252             Self::open_memory_file(backing_file, file_offset, size, hugepages, hugepage_size)?
1253         };
1254 
1255         let mut mmap_flags = libc::MAP_NORESERVE
1256             | if shared {
1257                 libc::MAP_SHARED
1258             } else {
1259                 libc::MAP_PRIVATE
1260             };
1261         if prefault {
1262             mmap_flags |= libc::MAP_POPULATE;
1263         }
1264 
1265         let region = GuestRegionMmap::new(
1266             MmapRegion::build(
1267                 Some(FileOffset::new(f, f_off)),
1268                 size,
1269                 libc::PROT_READ | libc::PROT_WRITE,
1270                 mmap_flags,
1271             )
1272             .map_err(Error::GuestMemoryRegion)?,
1273             start_addr,
1274         )
1275         .map_err(Error::GuestMemory)?;
1276 
1277         // Apply NUMA policy if needed.
1278         if let Some(node) = host_numa_node {
1279             let addr = region.deref().as_ptr();
1280             let len = region.deref().size() as u64;
1281             let mode = MPOL_BIND;
1282             let mut nodemask: Vec<u64> = Vec::new();
1283             let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
1284 
1285             // Linux is kind of buggy in the way it interprets maxnode as it
1286             // will cut off the last node. That's why we have to add 1 to what
1287             // we would consider as the proper maxnode value.
1288             let maxnode = node as u64 + 1 + 1;
1289 
1290             // Allocate the right size for the vector.
1291             nodemask.resize((node as usize / 64) + 1, 0);
1292 
1293             // Fill the global bitmask through the nodemask vector.
1294             let idx = (node / 64) as usize;
1295             let shift = node % 64;
1296             nodemask[idx] |= 1u64 << shift;
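            // Worked example (illustrative, not part of the original source):
            // for node == 2, maxnode == 4 and nodemask == vec![0b100], so only
            // host NUMA node 2 is allowed by the MPOL_BIND policy below.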
1297 
1298             // Policies are enforced by using MPOL_MF_MOVE flag as it will
1299             // force the kernel to move all pages that might have been already
1300             // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
1301             // used to throw an error if MPOL_MF_MOVE didn't succeed.
1302             // MPOL_BIND is the selected mode as it specifies a strict policy
1303             // that restricts memory allocation to the nodes specified in the
1304             // nodemask.
1305             Self::mbind(addr, len, mode, nodemask, maxnode, flags)
1306                 .map_err(Error::ApplyNumaPolicy)?;
1307         }
1308 
1309         Ok(Arc::new(region))
1310     }
1311 
1312     // Update the GuestMemoryMmap with the new range
1313     fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
1314         let guest_memory = self
1315             .guest_memory
1316             .memory()
1317             .insert_region(region)
1318             .map_err(Error::GuestMemory)?;
1319         self.guest_memory.lock().unwrap().replace(guest_memory);
1320 
1321         Ok(())
1322     }
1323 
1324     //
1325     // Calculate the start address of an area next to RAM.
1326     //
1327     // If memory hotplug is allowed, the start address needs to be aligned
1328     // (rounded up) to a 128MiB boundary.
1329     // If memory hotplug is not allowed, there is no alignment required.
1330     // In both cases the area must start no lower than the 64-bit RAM start.
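    //
    // Worked example (illustrative, not part of the original source): with
    // hotplug allowed and mem_end == 0x1_3fff_ffff, the returned start address
    // is 0x1_4000_0000, the next 128MiB boundary. If RAM ends below
    // MEM_32BIT_RESERVED_START, RAM_64BIT_START is returned instead so the
    // area always begins in the 64-bit address range.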
1331     fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
1332         let mut start_addr = if allow_mem_hotplug {
1333             GuestAddress(mem_end.0 | ((128 << 20) - 1))
1334         } else {
1335             mem_end
1336         };
1337 
1338         start_addr = start_addr
1339             .checked_add(1)
1340             .ok_or(Error::GuestAddressOverFlow)?;
1341 
1342         if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
1343             return Ok(arch::layout::RAM_64BIT_START);
1344         }
1345 
1346         Ok(start_addr)
1347     }
1348 
1349     pub fn add_ram_region(
1350         &mut self,
1351         start_addr: GuestAddress,
1352         size: usize,
1353     ) -> Result<Arc<GuestRegionMmap>, Error> {
1354         // Allocate memory for the region
1355         let region = MemoryManager::create_ram_region(
1356             &None,
1357             0,
1358             start_addr,
1359             size,
1360             self.prefault,
1361             self.shared,
1362             self.hugepages,
1363             self.hugepage_size,
1364             None,
1365             None,
1366         )?;
1367 
1368         // Map it into the guest
1369         let slot = self.create_userspace_mapping(
1370             region.start_addr().0,
1371             region.len() as u64,
1372             region.as_ptr() as u64,
1373             self.mergeable,
1374             false,
1375             self.log_dirty,
1376         )?;
1377         self.guest_ram_mappings.push(GuestRamMapping {
1378             gpa: region.start_addr().raw_value(),
1379             size: region.len(),
1380             slot,
1381             zone_id: DEFAULT_MEMORY_ZONE.to_string(),
1382             virtio_mem: false,
1383             file_offset: 0,
1384         });
1385 
1386         self.add_region(Arc::clone(&region))?;
1387 
1388         Ok(region)
1389     }
1390 
1391     fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
1392         info!("Hotplugging new RAM: {}", size);
1393 
1394         // Check that there is a free slot
1395         if self.next_hotplug_slot >= HOTPLUG_COUNT {
1396             return Err(Error::NoSlotAvailable);
1397         }
1398 
1399         // "Inserted" DIMM must have a size that is a multiple of 128MiB
1400         if size % (128 << 20) != 0 {
1401             return Err(Error::InvalidSize);
1402         }
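        // Illustrative check (not part of the original source): hotplugging
        // 1 GiB (0x4000_0000 bytes) is accepted since it is 8 * 128MiB, while
        // e.g. 100 MiB would be rejected with Error::InvalidSize.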
1403 
1404         let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;
1405 
1406         if start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area {
1407             return Err(Error::InsufficientHotplugRam);
1408         }
1409 
1410         let region = self.add_ram_region(start_addr, size)?;
1411 
1412         // Add region to the list of regions associated with the default
1413         // memory zone.
1414         if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
1415             memory_zone.regions.push(Arc::clone(&region));
1416         }
1417 
1418         // Tell the allocator
1419         self.ram_allocator
1420             .allocate(Some(start_addr), size as GuestUsize, None)
1421             .ok_or(Error::MemoryRangeAllocation)?;
1422 
1423         // Update the slot so that it can be queried via the I/O port
1424         let mut slot = &mut self.hotplug_slots[self.next_hotplug_slot];
1425         slot.active = true;
1426         slot.inserting = true;
1427         slot.base = region.start_addr().0;
1428         slot.length = region.len() as u64;
1429 
1430         self.next_hotplug_slot += 1;
1431 
1432         Ok(region)
1433     }
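    // Note: after a successful hotplug, the 'inserting' flag stays set until
    // the guest runs the MSCN AML method defined further down, which selects
    // the slot through MSEL, sees MINS set, sends a Notify to the matching
    // memory device and acknowledges the insertion by writing the bit back.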
1434 
1435     pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1436         self.guest_memory.clone()
1437     }
1438 
1439     pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
1440         self.boot_guest_memory.clone()
1441     }
1442 
1443     pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
1444         self.allocator.clone()
1445     }
1446 
1447     pub fn start_of_device_area(&self) -> GuestAddress {
1448         self.start_of_device_area
1449     }
1450 
1451     pub fn end_of_device_area(&self) -> GuestAddress {
1452         self.end_of_device_area
1453     }
1454 
1455     pub fn allocate_memory_slot(&mut self) -> u32 {
1456         let slot_id = self.next_memory_slot;
1457         self.next_memory_slot += 1;
1458         slot_id
1459     }
1460 
1461     pub fn create_userspace_mapping(
1462         &mut self,
1463         guest_phys_addr: u64,
1464         memory_size: u64,
1465         userspace_addr: u64,
1466         mergeable: bool,
1467         readonly: bool,
1468         log_dirty: bool,
1469     ) -> Result<u32, Error> {
1470         let slot = self.allocate_memory_slot();
1471         let mem_region = self.vm.make_user_memory_region(
1472             slot,
1473             guest_phys_addr,
1474             memory_size,
1475             userspace_addr,
1476             readonly,
1477             log_dirty,
1478         );
1479 
1480         info!(
1481             "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
1482             guest_phys_addr, userspace_addr, memory_size, slot
1483         );
1484 
1485         self.vm
1486             .create_user_memory_region(mem_region)
1487             .map_err(Error::CreateUserMemoryRegion)?;
1488 
1489         // Mark the pages as mergeable if explicitly asked for.
1490         if mergeable {
1491             // Safe because the address and size are valid since the
1492             // mmap succeeded.
1493             let ret = unsafe {
1494                 libc::madvise(
1495                     userspace_addr as *mut libc::c_void,
1496                     memory_size as libc::size_t,
1497                     libc::MADV_MERGEABLE,
1498                 )
1499             };
1500             if ret != 0 {
1501                 let err = io::Error::last_os_error();
1502                 // Safe to unwrap because the error is constructed with
1503                 // last_os_error(), which ensures the output will be Some().
1504                 let errno = err.raw_os_error().unwrap();
1505                 if errno == libc::EINVAL {
1506                     warn!("kernel not configured with CONFIG_KSM");
1507                 } else {
1508                     warn!("madvise error: {}", err);
1509                 }
1510                 warn!("failed to mark pages as mergeable");
1511             }
1512         }
1513 
1514         info!(
1515             "Created userspace mapping: {:x} -> {:x} {:x}",
1516             guest_phys_addr, userspace_addr, memory_size
1517         );
1518 
1519         Ok(slot)
1520     }
1521 
1522     pub fn remove_userspace_mapping(
1523         &mut self,
1524         guest_phys_addr: u64,
1525         memory_size: u64,
1526         userspace_addr: u64,
1527         mergeable: bool,
1528         slot: u32,
1529     ) -> Result<(), Error> {
1530         let mem_region = self.vm.make_user_memory_region(
1531             slot,
1532             guest_phys_addr,
1533             memory_size,
1534             userspace_addr,
1535             false, /* readonly -- don't care */
1536             false, /* log dirty */
1537         );
1538 
1539         self.vm
1540             .remove_user_memory_region(mem_region)
1541             .map_err(Error::RemoveUserMemoryRegion)?;
1542 
1543         // Mark the pages as unmergeable if they were previously marked as
1544         // mergeable.
1545         if mergeable {
1546             // Safe because the address and size are valid as the region was
1547             // previously advised.
1548             let ret = unsafe {
1549                 libc::madvise(
1550                     userspace_addr as *mut libc::c_void,
1551                     memory_size as libc::size_t,
1552                     libc::MADV_UNMERGEABLE,
1553                 )
1554             };
1555             if ret != 0 {
1556                 let err = io::Error::last_os_error();
1557                 // Safe to unwrap because the error is constructed with
1558                 // last_os_error(), which ensures the output will be Some().
1559                 let errno = err.raw_os_error().unwrap();
1560                 if errno == libc::EINVAL {
1561                     warn!("kernel not configured with CONFIG_KSM");
1562                 } else {
1563                     warn!("madvise error: {}", err);
1564                 }
1565                 warn!("failed to mark pages as unmergeable");
1566             }
1567         }
1568 
1569         info!(
1570             "Removed userspace mapping: {:x} -> {:x} {:x}",
1571             guest_phys_addr, userspace_addr, memory_size
1572         );
1573 
1574         Ok(())
1575     }
1576 
1577     pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
1578         if let Some(memory_zone) = self.memory_zones.get_mut(id) {
1579             if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
1580                 virtio_mem_zone
1581                     .resize_handler()
1582                     .work(size)
1583                     .map_err(Error::VirtioMemResizeFail)?;
1584 
1585                 // Keep the hotplugged_size up to date.
1586                 virtio_mem_zone.hotplugged_size = size;
1587             } else {
1588                 error!("Failed resizing virtio-mem region: No virtio-mem handler");
1589                 return Err(Error::MissingVirtioMemHandler);
1590             }
1591 
1592             return Ok(());
1593         }
1594 
1595         error!("Failed resizing virtio-mem region: Unknown memory zone");
1596         Err(Error::UnknownMemoryZone)
1597     }
1598 
1599     /// If this function results in adding a new memory region to the
1600     /// guest memory, the new region is returned to the caller. The virtio-mem
1601     /// use case never adds a new region as the whole hotpluggable memory has
1602     /// already been allocated at boot time.
1603     pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
1604         if self.user_provided_zones {
1605             error!(
1606                 "Not allowed to resize guest memory when backed with user \
1607                 defined memory zones."
1608             );
1609             return Err(Error::InvalidResizeWithMemoryZones);
1610         }
1611 
1612         let mut region: Option<Arc<GuestRegionMmap>> = None;
1613         match self.hotplug_method {
1614             HotplugMethod::VirtioMem => {
1615                 if desired_ram >= self.boot_ram {
1616                     if !self.dynamic {
1617                         return Ok(region);
1618                     }
1619 
1620                     self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
1621                     self.current_ram = desired_ram;
1622                 }
1623             }
1624             HotplugMethod::Acpi => {
1625                 if desired_ram > self.current_ram {
1626                     if !self.dynamic {
1627                         return Ok(region);
1628                     }
1629 
1630                     region =
1631                         Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
1632                     self.current_ram = desired_ram;
1633                 }
1634             }
1635         }
1636         Ok(region)
1637     }
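    // Illustrative sketch (hypothetical sizes): the two hotplug methods behave
    // differently on resize. Assuming a dynamic guest booted with 1 GiB and
    // resized to 3 GiB:
    //
    //     HotplugMethod::VirtioMem => virtio_mem_resize(DEFAULT_MEMORY_ZONE, 2 GiB)
    //                                 and Ok(None), no new region is added
    //     HotplugMethod::Acpi      => hotplug_ram_region(2 GiB)
    //                                 and Ok(Some(new_region))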
1638 
1639     pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
1640         if !self.user_provided_zones {
1641             error!(
1642                 "Not allowed to resize guest memory zone when no zone is \
1643                 defined."
1644             );
1645             return Err(Error::ResizeZone);
1646         }
1647 
1648         self.virtio_mem_resize(id, virtio_mem_size)
1649     }
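    // Illustrative usage (hypothetical zone id): for a user-defined zone
    // "mem1" configured with a hotplug_size, plugging 512 MiB of virtio-mem
    // memory goes through:
    //
    //     memory_manager.resize_zone("mem1", 512 << 20)?;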
1650 
1651     #[cfg(target_arch = "x86_64")]
1652     pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
1653         let file = OpenOptions::new()
1654             .read(true)
1655             .open("/dev/sgx_provision")
1656             .map_err(Error::SgxProvisionOpen)?;
1657         self.vm
1658             .enable_sgx_attribute(file)
1659             .map_err(Error::SgxEnableProvisioning)?;
1660 
1661         // Go over each EPC section and verify its size is a 4k multiple. At
1662         // the same time, calculate the total size needed for the contiguous
1663         // EPC region.
1664         let mut epc_region_size = 0;
1665         for epc_section in sgx_epc_config.iter() {
1666             if epc_section.size == 0 {
1667                 return Err(Error::EpcSectionSizeInvalid);
1668             }
1669             if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
1670                 return Err(Error::EpcSectionSizeInvalid);
1671             }
1672 
1673             epc_region_size += epc_section.size;
1674         }
1675 
1676         // Place the SGX EPC region on a 4k boundary between the RAM and the device area
1677         let epc_region_start = GuestAddress(
1678             ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
1679         );
1680 
1681         self.start_of_device_area = epc_region_start
1682             .checked_add(epc_region_size)
1683             .ok_or(Error::GuestAddressOverFlow)?;
1684 
1685         let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
1686         info!(
1687             "SGX EPC region: 0x{:x} (0x{:x})",
1688             epc_region_start.0, epc_region_size
1689         );
1690 
1691         // Each section can be memory mapped into the allocated region.
1692         let mut epc_section_start = epc_region_start.raw_value();
1693         for epc_section in sgx_epc_config.iter() {
1694             let file = OpenOptions::new()
1695                 .read(true)
1696                 .write(true)
1697                 .open("/dev/sgx_vepc")
1698                 .map_err(Error::SgxVirtEpcOpen)?;
1699 
1700             let prot = PROT_READ | PROT_WRITE;
1701             let mut flags = MAP_NORESERVE | MAP_SHARED;
1702             if epc_section.prefault {
1703                 flags |= MAP_POPULATE;
1704             }
1705 
1706             // We can't use the vm-memory crate to perform the memory mapping
1707             // here as it would try to ensure that the size of the backing file
1708             // matches the size of the expected mapping. The /dev/sgx_vepc
1709             // device does not work that way: it provides a file descriptor
1710             // whose size does not match the mapping size, as it's just a way to
1711             // let KVM know that an EPC section is being created for the guest.
1712             let host_addr = unsafe {
1713                 libc::mmap(
1714                     std::ptr::null_mut(),
1715                     epc_section.size as usize,
1716                     prot,
1717                     flags,
1718                     file.as_raw_fd(),
1719                     0,
1720                 )
1721             } as u64;
1722 
1723             info!(
1724                 "Adding SGX EPC section: 0x{:x} (0x{:x})",
1725                 epc_section_start, epc_section.size
1726             );
1727 
1728             let _mem_slot = self.create_userspace_mapping(
1729                 epc_section_start,
1730                 epc_section.size,
1731                 host_addr,
1732                 false,
1733                 false,
1734                 false,
1735             )?;
1736 
1737             sgx_epc_region.insert(
1738                 epc_section.id.clone(),
1739                 SgxEpcSection::new(
1740                     GuestAddress(epc_section_start),
1741                     epc_section.size as GuestUsize,
1742                 ),
1743             );
1744 
1745             epc_section_start += epc_section.size;
1746         }
1747 
1748         self.sgx_epc_region = Some(sgx_epc_region);
1749 
1750         Ok(())
1751     }
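    // Illustrative sketch (hypothetical sizes): with two EPC sections of
    // 64 MiB and 32 MiB, the contiguous EPC region is laid out between RAM and
    // the device area, 4 KiB aligned:
    //
    //     start_of_device_area (before) = 0x1_8000_0100   // not 4 KiB aligned
    //     epc_region_start              = 0x1_8000_1000   // rounded up
    //     section 0 (64 MiB)            = 0x1_8000_1000..0x1_8400_1000
    //     section 1 (32 MiB)            = 0x1_8400_1000..0x1_8600_1000
    //     start_of_device_area (after)  = 0x1_8600_1000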
1752 
1753     #[cfg(target_arch = "x86_64")]
1754     pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
1755         &self.sgx_epc_region
1756     }
1757 
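    // Note: despite its name, this helper only checks that the backing file is
    // still linked somewhere on the host filesystem (st_nlink > 0). Anonymous
    // or unlinked files (e.g. ones created with memfd_create() or O_TMPFILE)
    // typically report st_nlink == 0, so regions backed by them are not
    // skipped by memory_range_table() below.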
1758     pub fn is_hardlink(f: &File) -> bool {
1759         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
1760         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
1761         if ret != 0 {
1762             error!("Couldn't fstat the backing file");
1763             return false;
1764         }
1765 
1766         unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
1767     }
1768 
1769     pub fn memory_zones(&self) -> &MemoryZones {
1770         &self.memory_zones
1771     }
1772 
1773     pub fn memory_range_table(
1774         &self,
1775         snapshot: bool,
1776     ) -> std::result::Result<MemoryRangeTable, MigratableError> {
1777         let mut table = MemoryRangeTable::default();
1778 
1779         for memory_zone in self.memory_zones.values() {
1780             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
1781                 table.extend(virtio_mem_zone.plugged_ranges());
1782             }
1783 
1784             for region in memory_zone.regions() {
1785                 if snapshot {
1786                     if let Some(file_offset) = region.file_offset() {
1787                         if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
1788                             && Self::is_hardlink(file_offset.file())
1789                         {
1790                             // In this very specific case, we know the memory
1791                             // region is backed by a file on the host filesystem
1792                             // that can be accessed by the user, and additionally
1793                             // the mapping is shared, which means that modifications
1794                             // to the content are written to the actual file.
1795                             // When these conditions are met, we can skip copying
1796                             // the memory content for this specific region, as we
1797                             // can assume the user already has it saved through
1798                             // the backing file.
1799                             continue;
1800                         }
1801                     }
1802                 }
1803 
1804                 table.push(MemoryRange {
1805                     gpa: region.start_addr().raw_value(),
1806                     length: region.len() as u64,
1807                 });
1808             }
1809         }
1810 
1811         Ok(table)
1812     }
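    // Illustrative sketch: for a snapshot, the skip condition above boils down
    // to a predicate like:
    //
    //     fn can_skip_copy(region: &GuestRegionMmap) -> bool {
    //         match region.file_offset() {
    //             // Shared mapping of a file that still exists on the host
    //             // filesystem: its content is already persisted there.
    //             Some(fo) => (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
    //                 && MemoryManager::is_hardlink(fo.file()),
    //             // Anonymous memory always has to be copied.
    //             None => false,
    //         }
    //     }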
1813 
1814     pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
1815         MemoryManagerSnapshotData {
1816             memory_ranges: self.snapshot_memory_ranges.clone(),
1817             guest_ram_mappings: self.guest_ram_mappings.clone(),
1818             start_of_device_area: self.start_of_device_area.0,
1819             boot_ram: self.boot_ram,
1820             current_ram: self.current_ram,
1821             arch_mem_regions: self.arch_mem_regions.clone(),
1822             hotplug_slots: self.hotplug_slots.clone(),
1823             next_memory_slot: self.next_memory_slot,
1824             selected_slot: self.selected_slot,
1825             next_hotplug_slot: self.next_hotplug_slot,
1826         }
1827     }
1828 
1829     pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
1830         let mut memory_slot_fds = HashMap::new();
1831         for guest_ram_mapping in &self.guest_ram_mappings {
1832             let slot = guest_ram_mapping.slot;
1833             let guest_memory = self.guest_memory.memory();
1834             let file = guest_memory
1835                 .find_region(GuestAddress(guest_ram_mapping.gpa))
1836                 .unwrap()
1837                 .file_offset()
1838                 .unwrap()
1839                 .file();
1840             memory_slot_fds.insert(slot, file.as_raw_fd());
1841         }
1842         memory_slot_fds
1843     }
1844 
1845     pub fn acpi_address(&self) -> Option<GuestAddress> {
1846         self.acpi_address
1847     }
1848 }
1849 
1850 struct MemoryNotify {
1851     slot_id: usize,
1852 }
1853 
1854 impl Aml for MemoryNotify {
1855     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1856         let object = aml::Path::new(&format!("M{:03}", self.slot_id));
1857         aml::If::new(
1858             &aml::Equal::new(&aml::Arg(0), &self.slot_id),
1859             vec![&aml::Notify::new(&object, &aml::Arg(1))],
1860         )
1861         .append_aml_bytes(bytes)
1862     }
1863 }
1864 
1865 struct MemorySlot {
1866     slot_id: usize,
1867 }
1868 
1869 impl Aml for MemorySlot {
1870     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1871         aml::Device::new(
1872             format!("M{:03}", self.slot_id).as_str().into(),
1873             vec![
1874                 &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0C80")),
1875                 &aml::Name::new("_UID".into(), &self.slot_id),
1876                 /*
1877                 _STA return value:
1878                 Bit [0] – Set if the device is present.
1879                 Bit [1] – Set if the device is enabled and decoding its resources.
1880                 Bit [2] – Set if the device should be shown in the UI.
1881                 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1882                 Bit [4] – Set if the battery is present.
1883                 Bits [31:5] – Reserved (must be cleared).
1884                 */
1885                 &aml::Method::new(
1886                     "_STA".into(),
1887                     0,
1888                     false,
1889                     // Call into the MSTA method which will interrogate the device
1890                     vec![&aml::Return::new(&aml::MethodCall::new(
1891                         "MSTA".into(),
1892                         vec![&self.slot_id],
1893                     ))],
1894                 ),
1895                 // Get details of memory
1896                 &aml::Method::new(
1897                     "_CRS".into(),
1898                     0,
1899                     false,
1900                     // Call into MCRS which provides actual memory details
1901                     vec![&aml::Return::new(&aml::MethodCall::new(
1902                         "MCRS".into(),
1903                         vec![&self.slot_id],
1904                     ))],
1905                 ),
1906             ],
1907         )
1908         .append_aml_bytes(bytes)
1909     }
1910 }
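// Illustrative sketch: for slot_id 0 the generated AML corresponds roughly to
// the following ASL:
//
//     Device (M000) {
//         Name (_HID, EisaId ("PNP0C80"))  // ACPI memory device
//         Name (_UID, Zero)
//         Method (_STA, 0) { Return (MSTA (Zero)) }
//         Method (_CRS, 0) { Return (MCRS (Zero)) }
//     }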
1911 
1912 struct MemorySlots {
1913     slots: usize,
1914 }
1915 
1916 impl Aml for MemorySlots {
1917     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1918         for slot_id in 0..self.slots {
1919             MemorySlot { slot_id }.append_aml_bytes(bytes);
1920         }
1921     }
1922 }
1923 
1924 struct MemoryMethods {
1925     slots: usize,
1926 }
1927 
1928 impl Aml for MemoryMethods {
1929     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1930         // Add "MTFY" notification method
1931         let mut memory_notifies = Vec::new();
1932         for slot_id in 0..self.slots {
1933             memory_notifies.push(MemoryNotify { slot_id });
1934         }
1935 
1936         let mut memory_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
1937         for memory_notifier in memory_notifies.iter() {
1938             memory_notifies_refs.push(memory_notifier);
1939         }
1940 
1941         aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).append_aml_bytes(bytes);
1942 
1943         // MSCN method
1944         aml::Method::new(
1945             "MSCN".into(),
1946             0,
1947             true,
1948             vec![
1949                 // Take lock defined above
1950                 &aml::Acquire::new("MLCK".into(), 0xffff),
1951                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
1952                 &aml::While::new(
1953                     &aml::LessThan::new(&aml::Local(0), &self.slots),
1954                     vec![
1955                         // Write the slot number (loop counter in Local0) to the I/O port via the MSEL field
1956                         &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
1957                         // Check if MINS bit is set (inserting)
1958                         &aml::If::new(
1959                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
1960                             // Notify device if it is
1961                             vec![
1962                                 &aml::MethodCall::new(
1963                                     "MTFY".into(),
1964                                     vec![&aml::Local(0), &aml::ONE],
1965                                 ),
1966                                 // Reset MINS bit
1967                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
1968                             ],
1969                         ),
1970                         // Check if MRMV bit is set
1971                         &aml::If::new(
1972                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
1973                             // Notify device if it is (with the eject constant 0x3)
1974                             vec![
1975                                 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
1976                                 // Reset MRMV bit
1977                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
1978                             ],
1979                         ),
1980                         &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
1981                     ],
1982                 ),
1983                 // Release lock
1984                 &aml::Release::new("MLCK".into()),
1985             ],
1986         )
1987         .append_aml_bytes(bytes);
1988 
1989         // Memory status method
1990         aml::Method::new(
1991             "MSTA".into(),
1992             1,
1993             true,
1994             vec![
1995                 // Take lock defined above
1996                 &aml::Acquire::new("MLCK".into(), 0xffff),
1997                 // Write slot number (in first argument) to I/O port via field
1998                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
1999                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2000                 // Check if the MEN_ bit is set; if so, set the local variable to 0xf (see _STA for the meaning)
2001                 &aml::If::new(
2002                     &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
2003                     vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2004                 ),
2005                 // Release lock
2006                 &aml::Release::new("MLCK".into()),
2007                 // Return 0 or 0xf
2008                 &aml::Return::new(&aml::Local(0)),
2009             ],
2010         )
2011         .append_aml_bytes(bytes);
2012 
2013         // Memory range method
2014         aml::Method::new(
2015             "MCRS".into(),
2016             1,
2017             true,
2018             vec![
2019                 // Take lock defined above
2020                 &aml::Acquire::new("MLCK".into(), 0xffff),
2021                 // Write slot number (in first argument) to I/O port via field
2022                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2023                 &aml::Name::new(
2024                     "MR64".into(),
2025                     &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2026                         aml::AddressSpaceCachable::Cacheable,
2027                         true,
2028                         0x0000_0000_0000_0000u64,
2029                         0xFFFF_FFFF_FFFF_FFFEu64,
2030                     )]),
2031                 ),
2032                 &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &14usize, "MINL".into()),
2033                 &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &18usize, "MINH".into()),
2034                 &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &22usize, "MAXL".into()),
2035                 &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &26usize, "MAXH".into()),
2036                 &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &38usize, "LENL".into()),
2037                 &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &42usize, "LENH".into()),
2038                 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
2039                 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
2040                 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
2041                 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
2042                 &aml::Add::new(
2043                     &aml::Path::new("MAXL"),
2044                     &aml::Path::new("MINL"),
2045                     &aml::Path::new("LENL"),
2046                 ),
2047                 &aml::Add::new(
2048                     &aml::Path::new("MAXH"),
2049                     &aml::Path::new("MINH"),
2050                     &aml::Path::new("LENH"),
2051                 ),
2052                 &aml::If::new(
2053                     &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
2054                     vec![&aml::Add::new(
2055                         &aml::Path::new("MAXH"),
2056                         &aml::ONE,
2057                         &aml::Path::new("MAXH"),
2058                     )],
2059                 ),
2060                 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
2061                 // Release lock
2062                 &aml::Release::new("MLCK".into()),
2063                 &aml::Return::new(&aml::Path::new("MR64")),
2064             ],
2065         )
2066         .append_aml_bytes(bytes)
2067     }
2068 }
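// Illustrative sketch (hypothetical register values): MCRS patches the 64-bit
// address range inside the MR64 descriptor from the MHPC registers, e.g. for
// the selected slot:
//
//     MIN (MHBH:MHBL) = 0x0000_0001_4000_0000   // base   = 5 GiB
//     LEN (MHLH:MHLL) = 0x0000_0000_1000_0000   // length = 256 MiB
//     MAX = MIN + LEN - 1 = 0x0000_0001_4FFF_FFFF
//
// The low and high dwords are added separately; if the low-dword addition
// wraps around (MAXL < MINL) a carry is added to MAXH before MAXL is
// decremented to make the range end inclusive.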
2069 
2070 impl Aml for MemoryManager {
2071     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2072         if let Some(acpi_address) = self.acpi_address {
2073             // Memory Hotplug Controller
2074             aml::Device::new(
2075                 "_SB_.MHPC".into(),
2076                 vec![
2077                     &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
2078                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2079                     // Mutex to protect concurrent access, as we write to select a slot and then read back its status
2080                     &aml::Mutex::new("MLCK".into(), 0),
2081                     &aml::Name::new(
2082                         "_CRS".into(),
2083                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2084                             aml::AddressSpaceCachable::NotCacheable,
2085                             true,
2086                             acpi_address.0 as u64,
2087                             acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
2088                         )]),
2089                     ),
2090                     // OpRegion and Fields map MMIO range into individual field values
2091                     &aml::OpRegion::new(
2092                         "MHPR".into(),
2093                         aml::OpRegionSpace::SystemMemory,
2094                         acpi_address.0 as usize,
2095                         MEMORY_MANAGER_ACPI_SIZE,
2096                     ),
2097                     &aml::Field::new(
2098                         "MHPR".into(),
2099                         aml::FieldAccessType::DWord,
2100                         aml::FieldUpdateRule::Preserve,
2101                         vec![
2102                             aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
2103                             aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
2104                             aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
2105                             aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
2106                         ],
2107                     ),
2108                     &aml::Field::new(
2109                         "MHPR".into(),
2110                         aml::FieldAccessType::DWord,
2111                         aml::FieldUpdateRule::Preserve,
2112                         vec![
2113                             aml::FieldEntry::Reserved(128),
2114                             aml::FieldEntry::Named(*b"MHPX", 32), // PXM
2115                         ],
2116                     ),
2117                     &aml::Field::new(
2118                         "MHPR".into(),
2119                         aml::FieldAccessType::Byte,
2120                         aml::FieldUpdateRule::WriteAsZeroes,
2121                         vec![
2122                             aml::FieldEntry::Reserved(160),
2123                             aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
2124                             aml::FieldEntry::Named(*b"MINS", 1), // Inserting
2125                             aml::FieldEntry::Named(*b"MRMV", 1), // Removing
2126                             aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
2127                         ],
2128                     ),
2129                     &aml::Field::new(
2130                         "MHPR".into(),
2131                         aml::FieldAccessType::DWord,
2132                         aml::FieldUpdateRule::Preserve,
2133                         vec![
2134                             aml::FieldEntry::Named(*b"MSEL", 32), // Selector
2135                             aml::FieldEntry::Named(*b"MOEV", 32), // Event
2136                             aml::FieldEntry::Named(*b"MOSC", 32), // OSC
2137                         ],
2138                     ),
2139                     &MemoryMethods {
2140                         slots: self.hotplug_slots.len(),
2141                     },
2142                     &MemorySlots {
2143                         slots: self.hotplug_slots.len(),
2144                     },
2145                 ],
2146             )
2147             .append_aml_bytes(bytes);
2148         } else {
2149             aml::Device::new(
2150                 "_SB_.MHPC".into(),
2151                 vec![
2152                     &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
2153                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2154                     // Empty MSCN for GED
2155                     &aml::Method::new("MSCN".into(), 0, true, vec![]),
2156                 ],
2157             )
2158             .append_aml_bytes(bytes);
2159         }
2160 
2161         #[cfg(target_arch = "x86_64")]
2162         {
2163             if let Some(sgx_epc_region) = &self.sgx_epc_region {
2164                 let min = sgx_epc_region.start().raw_value() as u64;
2165                 let max = min + sgx_epc_region.size() as u64 - 1;
2166                 // SGX EPC region
2167                 aml::Device::new(
2168                     "_SB_.EPC_".into(),
2169                     vec![
2170                         &aml::Name::new("_HID".into(), &aml::EisaName::new("INT0E0C")),
2171                         // QWORD describing the EPC region start and size
2172                         &aml::Name::new(
2173                             "_CRS".into(),
2174                             &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2175                                 aml::AddressSpaceCachable::NotCacheable,
2176                                 true,
2177                                 min,
2178                                 max,
2179                             )]),
2180                         ),
2181                         &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
2182                     ],
2183                 )
2184                 .append_aml_bytes(bytes);
2185             }
2186         }
2187     }
2188 }
2189 
2190 impl Pausable for MemoryManager {}
2191 
2192 #[derive(Clone, Serialize, Deserialize, Versionize)]
2193 pub struct MemoryManagerSnapshotData {
2194     memory_ranges: MemoryRangeTable,
2195     guest_ram_mappings: Vec<GuestRamMapping>,
2196     start_of_device_area: u64,
2197     boot_ram: u64,
2198     current_ram: u64,
2199     arch_mem_regions: Vec<ArchMemRegion>,
2200     hotplug_slots: Vec<HotPlugState>,
2201     next_memory_slot: u32,
2202     selected_slot: usize,
2203     next_hotplug_slot: usize,
2204 }
2205 
2206 impl VersionMapped for MemoryManagerSnapshotData {}
2207 
2208 impl Snapshottable for MemoryManager {
2209     fn id(&self) -> String {
2210         MEMORY_MANAGER_SNAPSHOT_ID.to_string()
2211     }
2212 
2213     fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
2214         let mut memory_manager_snapshot = Snapshot::new(MEMORY_MANAGER_SNAPSHOT_ID);
2215 
2216         let memory_ranges = self.memory_range_table(true)?;
2217 
2218         // Store this list of ranges locally as it will be used through the
2219         // Transportable::send() implementation. The point is to avoid
2220         // duplicating the code that creates the path for each region.
2221         // The 'snapshot' step creates the list of memory regions, including
2222         // information about whether a memory region needs to be copied.
2223         // This saves the 'send' step from having to go through the same
2224         // process, and instead it can directly proceed with storing the
2225         // memory range content for the ranges requiring it.
2226         self.snapshot_memory_ranges = memory_ranges;
2227 
2228         memory_manager_snapshot.add_data_section(SnapshotDataSection::new_from_versioned_state(
2229             MEMORY_MANAGER_SNAPSHOT_ID,
2230             &self.snapshot_data(),
2231         )?);
2232 
2233         Ok(memory_manager_snapshot)
2234     }
2235 }
2236 
2237 impl Transportable for MemoryManager {
2238     fn send(
2239         &self,
2240         _snapshot: &Snapshot,
2241         destination_url: &str,
2242     ) -> result::Result<(), MigratableError> {
2243         if self.snapshot_memory_ranges.is_empty() {
2244             return Ok(());
2245         }
2246 
2247         let mut memory_file_path = url_to_path(destination_url)?;
2248         memory_file_path.push(String::from(SNAPSHOT_FILENAME));
2249 
2250         // Create the snapshot file for the entire memory
2251         let mut memory_file = OpenOptions::new()
2252             .read(true)
2253             .write(true)
2254             .create_new(true)
2255             .open(memory_file_path)
2256             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2257 
2258         let guest_memory = self.guest_memory.memory();
2259 
2260         for range in self.snapshot_memory_ranges.regions() {
2261             let mut offset: u64 = 0;
2262             // Here we manually handle the retry in case we can't copy the
2263             // whole region at once, because we can't use the write_all_to()
2264             // implementation from vm-memory's GuestMemory: it does not follow
2265             // the correct behavior. For more info about this issue see:
2266             // https://github.com/rust-vmm/vm-memory/issues/174
2267             loop {
2268                 let bytes_written = guest_memory
2269                     .write_to(
2270                         GuestAddress(range.gpa + offset),
2271                         &mut memory_file,
2272                         (range.length - offset) as usize,
2273                     )
2274                     .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2275                 offset += bytes_written as u64;
2276 
2277                 if offset == range.length {
2278                     break;
2279                 }
2280             }
2281         }
2282         Ok(())
2283     }
2284 }
2285 
2286 impl Migratable for MemoryManager {
2287     // Start the dirty log in the hypervisor (kvm/mshv).
2288     // Also, reset the dirty bitmap logged by the vmm.
2289     // Just before we do a bulk copy we want to start/clear the dirty log so that
2290     // pages touched during our bulk copy are tracked.
2291     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2292         self.vm.start_dirty_log().map_err(|e| {
2293             MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
2294         })?;
2295 
2296         for r in self.guest_memory.memory().iter() {
2297             r.bitmap().reset();
2298         }
2299 
2300         Ok(())
2301     }
2302 
2303     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2304         self.vm.stop_dirty_log().map_err(|e| {
2305             MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
2306         })?;
2307 
2308         Ok(())
2309     }
2310 
2311     // Generate a table for the pages that are dirty. The dirty pages are collapsed
2312     // together in the table if they are contiguous.
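    // For example (hypothetical values): if the merged bitmap word covering
    // the start of a region at GPA 0x1_0000_0000 is 0b0011_1000, pages 3..=5
    // are dirty and from_bitmap() emits the single range
    // { gpa: 0x1_0000_0000 + 3 * 4096, length: 3 * 4096 }.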
2313     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2314         let mut table = MemoryRangeTable::default();
2315         for r in &self.guest_ram_mappings {
2316             let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
2317                 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
2318             })?;
2319             let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
2320             {
2321                 Some(region) => {
2322                     assert_eq!(region.start_addr().raw_value(), r.gpa);
2323                     assert_eq!(region.len(), r.size);
2324                     region.bitmap().get_and_reset()
2325                 }
2326                 None => {
2327                     return Err(MigratableError::MigrateSend(anyhow!(
2328                         "Error finding 'guest memory region' with address {:x}",
2329                         r.gpa
2330                     )))
2331                 }
2332             };
2333 
2334             let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
2335                 .iter()
2336                 .zip(vmm_dirty_bitmap.iter())
2337                 .map(|(x, y)| x | y)
2338                 .collect();
2339 
2340             let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);
2341 
2342             if sub_table.regions().is_empty() {
2343                 info!("Dirty Memory Range Table is empty");
2344             } else {
2345                 info!("Dirty Memory Range Table:");
2346                 for range in sub_table.regions() {
2347                     info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
2348                 }
2349             }
2350 
2351             table.extend(sub_table);
2352         }
2353         Ok(table)
2354     }
2355 }
2356