1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0
4 //
5
6 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
7 use std::collections::BTreeMap;
8 use std::collections::HashMap;
9 use std::fs::{File, OpenOptions};
10 use std::io::{self};
11 use std::ops::{BitAnd, Deref, Not, Sub};
12 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
13 use std::os::fd::AsFd;
14 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
15 use std::path::PathBuf;
16 use std::sync::atomic::{AtomicU32, Ordering};
17 use std::sync::{Arc, Barrier, Mutex};
18 use std::{ffi, result, thread};
19
20 use acpi_tables::{aml, Aml};
21 use anyhow::anyhow;
22 #[cfg(target_arch = "x86_64")]
23 use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
24 use arch::RegionType;
25 #[cfg(target_arch = "x86_64")]
26 use devices::ioapic;
27 #[cfg(target_arch = "aarch64")]
28 use hypervisor::HypervisorVmError;
29 use libc::_SC_NPROCESSORS_ONLN;
30 #[cfg(target_arch = "x86_64")]
31 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
32 use serde::{Deserialize, Serialize};
33 use thiserror::Error;
34 use tracer::trace_scoped;
35 use virtio_devices::BlocksState;
36 #[cfg(target_arch = "x86_64")]
37 use vm_allocator::GsiApic;
38 use vm_allocator::{AddressAllocator, MemorySlotAllocator, SystemAllocator};
39 use vm_device::BusDevice;
40 use vm_memory::bitmap::AtomicBitmap;
41 use vm_memory::guest_memory::FileOffset;
42 use vm_memory::mmap::MmapRegionError;
43 use vm_memory::{
44 Address, Error as MmapError, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
45 GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, ReadVolatile,
46 };
47 use vm_migration::protocol::{MemoryRange, MemoryRangeTable};
48 use vm_migration::{
49 Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, Transportable,
50 };
51
52 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
53 use crate::coredump::{
54 CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError,
55 };
56 use crate::migration::url_to_path;
57 #[cfg(target_arch = "x86_64")]
58 use crate::vm_config::SgxEpcConfig;
59 use crate::vm_config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
60 use crate::{GuestMemoryMmap, GuestRegionMmap, MEMORY_MANAGER_SNAPSHOT_ID};
61
62 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;
63
64 const DEFAULT_MEMORY_ZONE: &str = "mem0";
65
66 const SNAPSHOT_FILENAME: &str = "memory-ranges";
67
68 #[cfg(target_arch = "x86_64")]
69 const X86_64_IRQ_BASE: u32 = 5;
70
71 #[cfg(target_arch = "x86_64")]
72 const SGX_PAGE_SIZE: u64 = 1 << 12;
73
74 const HOTPLUG_COUNT: usize = 8;
75
76 // Memory policy constants
77 const MPOL_BIND: u32 = 2;
78 const MPOL_MF_STRICT: u32 = 1;
79 const MPOL_MF_MOVE: u32 = 1 << 1;
80
81 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
82 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;
83
84 const MAX_PREFAULT_THREAD_COUNT: usize = 16;
85
86 #[derive(Clone, Default, Serialize, Deserialize)]
87 struct HotPlugState {
88 base: u64,
89 length: u64,
90 active: bool,
91 inserting: bool,
92 removing: bool,
93 }
94
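/// A memory zone region managed by a virtio-mem device, allowing the amount of
/// guest RAM plugged into this zone to be resized at runtime.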
95 pub struct VirtioMemZone {
96 region: Arc<GuestRegionMmap>,
97 virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
98 hotplugged_size: u64,
99 hugepages: bool,
100 blocks_state: Arc<Mutex<BlocksState>>,
101 }
102
103 impl VirtioMemZone {
    pub fn region(&self) -> &Arc<GuestRegionMmap> {
105 &self.region
106 }
    pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
108 self.virtio_device = Some(virtio_device);
109 }
    pub fn hotplugged_size(&self) -> u64 {
111 self.hotplugged_size
112 }
    pub fn hugepages(&self) -> bool {
114 self.hugepages
115 }
    pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
117 &self.blocks_state
118 }
    pub fn plugged_ranges(&self) -> MemoryRangeTable {
120 self.blocks_state
121 .lock()
122 .unwrap()
123 .memory_ranges(self.region.start_addr().raw_value(), true)
124 }
125 }
126
127 #[derive(Default)]
128 pub struct MemoryZone {
129 regions: Vec<Arc<GuestRegionMmap>>,
130 virtio_mem_zone: Option<VirtioMemZone>,
131 }
132
133 impl MemoryZone {
    pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
135 &self.regions
136 }
    pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
138 &self.virtio_mem_zone
139 }
    pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
141 self.virtio_mem_zone.as_mut()
142 }
143 }
144
145 pub type MemoryZones = HashMap<String, MemoryZone>;
146
147 #[derive(Clone, Serialize, Deserialize)]
148 struct GuestRamMapping {
149 slot: u32,
150 gpa: u64,
151 size: u64,
152 zone_id: String,
153 virtio_mem: bool,
154 file_offset: u64,
155 }
156
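/// Bookkeeping for a single guest RAM mapping created through
/// create_userspace_mapping(), keeping the memory slot, GPA, size and backing
/// file offset so that dirty page tracking and snapshot/restore can be handled
/// per mapping.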
157 #[derive(Clone, Serialize, Deserialize)]
158 struct ArchMemRegion {
159 base: u64,
160 size: usize,
161 r_type: RegionType,
162 }
163
164 pub struct MemoryManager {
165 boot_guest_memory: GuestMemoryMmap,
166 guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
167 next_memory_slot: Arc<AtomicU32>,
168 memory_slot_free_list: Arc<Mutex<Vec<u32>>>,
169 start_of_device_area: GuestAddress,
170 end_of_device_area: GuestAddress,
171 end_of_ram_area: GuestAddress,
172 pub vm: Arc<dyn hypervisor::Vm>,
173 hotplug_slots: Vec<HotPlugState>,
174 selected_slot: usize,
175 mergeable: bool,
176 allocator: Arc<Mutex<SystemAllocator>>,
177 hotplug_method: HotplugMethod,
178 boot_ram: u64,
179 current_ram: u64,
180 next_hotplug_slot: usize,
181 shared: bool,
182 hugepages: bool,
183 hugepage_size: Option<u64>,
184 prefault: bool,
185 thp: bool,
186 #[cfg(target_arch = "x86_64")]
187 sgx_epc_region: Option<SgxEpcRegion>,
188 user_provided_zones: bool,
189 snapshot_memory_ranges: MemoryRangeTable,
190 memory_zones: MemoryZones,
191 log_dirty: bool, // Enable dirty logging for created RAM regions
192 arch_mem_regions: Vec<ArchMemRegion>,
193 ram_allocator: AddressAllocator,
194 dynamic: bool,
195
196 // Keep track of calls to create_userspace_mapping() for guest RAM.
197 // This is useful for getting the dirty pages as we need to know the
198 // slots that the mapping is created in.
199 guest_ram_mappings: Vec<GuestRamMapping>,
200
201 pub acpi_address: Option<GuestAddress>,
202 #[cfg(target_arch = "aarch64")]
203 uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
204 }
205
206 #[derive(Error, Debug)]
207 pub enum Error {
208 /// Failed to create shared file.
209 #[error("Failed to create shared file")]
210 SharedFileCreate(#[source] io::Error),
211
212 /// Failed to set shared file length.
213 #[error("Failed to set shared file length")]
214 SharedFileSetLen(#[source] io::Error),
215
216 /// Mmap backed guest memory error
217 #[error("Mmap backed guest memory error")]
218 GuestMemory(#[source] MmapError),
219
220 /// Failed to allocate a memory range.
221 #[error("Failed to allocate a memory range")]
222 MemoryRangeAllocation,
223
224 /// Error from region creation
225 #[error("Error from region creation")]
226 GuestMemoryRegion(#[source] MmapRegionError),
227
228 /// No ACPI slot available
229 #[error("No ACPI slot available")]
230 NoSlotAvailable,
231
232 /// Not enough space in the hotplug RAM region
233 #[error("Not enough space in the hotplug RAM region")]
234 InsufficientHotplugRam,
235
236 /// The requested hotplug memory addition is not a valid size
237 #[error("The requested hotplug memory addition is not a valid size")]
238 InvalidSize,
239
240 /// Failed to create the user memory region.
241 #[error("Failed to create the user memory region")]
242 CreateUserMemoryRegion(#[source] hypervisor::HypervisorVmError),
243
244 /// Failed to remove the user memory region.
245 #[error("Failed to remove the user memory region")]
246 RemoveUserMemoryRegion(#[source] hypervisor::HypervisorVmError),
247
    /// Failed to create the EventFd.
    #[error("Failed to create the EventFd")]
    EventFdFail(#[source] io::Error),
251
252 /// Eventfd write error
253 #[error("Eventfd write error")]
254 EventfdError(#[source] io::Error),
255
    /// Failed to resize the virtio-mem region.
    #[error("Failed to resize the virtio-mem region")]
    VirtioMemResizeFail(#[source] virtio_devices::mem::Error),
259
260 /// Cannot restore VM
261 #[error("Cannot restore VM")]
262 Restore(#[source] MigratableError),
263
264 /// Cannot restore VM because source URL is missing
265 #[error("Cannot restore VM because source URL is missing")]
266 RestoreMissingSourceUrl,
267
268 /// Cannot create the system allocator
269 #[error("Cannot create the system allocator")]
270 CreateSystemAllocator,
271
272 /// Invalid SGX EPC section size
273 #[cfg(target_arch = "x86_64")]
274 #[error("Invalid SGX EPC section size")]
275 EpcSectionSizeInvalid,
276
277 /// Failed allocating SGX EPC region
278 #[cfg(target_arch = "x86_64")]
279 #[error("Failed allocating SGX EPC region")]
280 SgxEpcRangeAllocation,
281
282 /// Failed opening SGX virtual EPC device
283 #[cfg(target_arch = "x86_64")]
284 #[error("Failed opening SGX virtual EPC device")]
285 SgxVirtEpcOpen(#[source] io::Error),
286
287 /// Failed setting the SGX virtual EPC section size
288 #[cfg(target_arch = "x86_64")]
289 #[error("Failed setting the SGX virtual EPC section size")]
290 SgxVirtEpcFileSetLen(#[source] io::Error),
291
292 /// Failed opening SGX provisioning device
293 #[cfg(target_arch = "x86_64")]
294 #[error("Failed opening SGX provisioning device")]
295 SgxProvisionOpen(#[source] io::Error),
296
297 /// Failed enabling SGX provisioning
298 #[cfg(target_arch = "x86_64")]
299 #[error("Failed enabling SGX provisioning")]
300 SgxEnableProvisioning(#[source] hypervisor::HypervisorVmError),
301
302 /// Failed creating a new MmapRegion instance.
303 #[cfg(target_arch = "x86_64")]
304 #[error("Failed creating a new MmapRegion instance")]
305 NewMmapRegion(#[source] vm_memory::mmap::MmapRegionError),
306
307 /// No memory zones found.
308 #[error("No memory zones found")]
309 MissingMemoryZones,
310
311 /// Memory configuration is not valid.
312 #[error("Memory configuration is not valid")]
313 InvalidMemoryParameters,
314
315 /// Forbidden operation. Impossible to resize guest memory if it is
316 /// backed by user defined memory regions.
317 #[error("Impossible to resize guest memory if it is backed by user defined memory regions")]
318 InvalidResizeWithMemoryZones,
319
320 /// It's invalid to try applying a NUMA policy to a memory zone that is
321 /// memory mapped with MAP_SHARED.
322 #[error("Invalid to try applying a NUMA policy to a memory zone that is memory mapped with MAP_SHARED")]
323 InvalidSharedMemoryZoneWithHostNuma,
324
325 /// Failed applying NUMA memory policy.
326 #[error("Failed applying NUMA memory policy")]
327 ApplyNumaPolicy(#[source] io::Error),
328
329 /// Memory zone identifier is not unique.
330 #[error("Memory zone identifier is not unique")]
331 DuplicateZoneId,
332
333 /// No virtio-mem resizing handler found.
334 #[error("No virtio-mem resizing handler found")]
335 MissingVirtioMemHandler,
336
337 /// Unknown memory zone.
338 #[error("Unknown memory zone")]
339 UnknownMemoryZone,
340
    /// Invalid size for resizing. The size must be non-zero.
    #[error("Invalid size for resizing. The size must be non-zero")]
    InvalidHotplugSize,
344
345 /// Invalid hotplug method associated with memory zones resizing capability.
346 #[error("Invalid hotplug method associated with memory zones resizing capability")]
347 InvalidHotplugMethodWithMemoryZones,
348
349 /// Could not find specified memory zone identifier from hash map.
350 #[error("Could not find specified memory zone identifier from hash map")]
351 MissingZoneIdentifier,
352
353 /// Resizing the memory zone failed.
354 #[error("Resizing the memory zone failed")]
355 ResizeZone,
356
357 /// Guest address overflow
358 #[error("Guest address overflow")]
359 GuestAddressOverFlow,
360
361 /// Error opening snapshot file
362 #[error("Error opening snapshot file")]
363 SnapshotOpen(#[source] io::Error),
364
    /// Error copying snapshot into region
366 #[error("Error copying snapshot into region")]
367 SnapshotCopy(#[source] GuestMemoryError),
368
369 /// Failed to allocate MMIO address
370 #[error("Failed to allocate MMIO address")]
371 AllocateMmioAddress,
372
373 #[cfg(target_arch = "aarch64")]
374 /// Failed to create UEFI flash
375 #[error("Failed to create UEFI flash")]
376 CreateUefiFlash(#[source] HypervisorVmError),
377
378 /// Using a directory as a backing file for memory is not supported
379 #[error("Using a directory as a backing file for memory is not supported")]
380 DirectoryAsBackingFileForMemory,
381
382 /// Failed to stat filesystem
383 #[error("Failed to stat filesystem")]
384 GetFileSystemBlockSize(#[source] io::Error),
385
386 /// Memory size is misaligned with default page size or its hugepage size
387 #[error("Memory size is misaligned with default page size or its hugepage size")]
388 MisalignedMemorySize,
389 }
390
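// Bit positions reported through the status register, and register offsets of
// the memory hotplug control device implemented by the BusDevice trait below.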
391 const ENABLE_FLAG: usize = 0;
392 const INSERTING_FLAG: usize = 1;
393 const REMOVING_FLAG: usize = 2;
394 const EJECT_FLAG: usize = 3;
395
396 const BASE_OFFSET_LOW: u64 = 0;
397 const BASE_OFFSET_HIGH: u64 = 0x4;
398 const LENGTH_OFFSET_LOW: u64 = 0x8;
399 const LENGTH_OFFSET_HIGH: u64 = 0xC;
400 const STATUS_OFFSET: u64 = 0x14;
401 const SELECTION_OFFSET: u64 = 0;
402
// 64 KiB is subtracted from the MMIO address space size. This is done for the
// following reasons:
// - Reduce the addressable space size by at least 4 KiB to work around a Linux
//   bug when the VMM allocates devices at the end of the addressable space
// - Windows requires the addressable space size to be 64 KiB aligned
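// e.g. phys_bits = 40 yields 1 TiB - 64 KiB of addressable MMIO space.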
fn mmio_address_space_size(phys_bits: u8) -> u64 {
409 (1 << phys_bits) - (1 << 16)
410 }
411
// The `statfs` call can be used to query a hugetlbfs mount; the hugepage size
// is reported in the `f_bsize` field.
414 //
415 // See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169
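// e.g. for a hugetlbfs mount backed by 2 MiB huge pages, `f_bsize` is 2 MiB.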
fn statfs_get_bsize(path: &str) -> Result<u64, Error> {
417 let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?;
418 let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit();
419
420 // SAFETY: FFI call with a valid path and buffer
421 let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) };
422 if ret != 0 {
423 return Err(Error::GetFileSystemBlockSize(
424 std::io::Error::last_os_error(),
425 ));
426 }
427
428 // SAFETY: `buf` is valid at this point
429 // Because this value is always positive, just convert it directly.
430 // Note that the `f_bsize` is `i64` in glibc and `u64` in musl, using `as u64` will be warned
431 // by `clippy` on musl target. To avoid the warning, there should be `as _` instead of
432 // `as u64`.
433 let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _;
434 Ok(bsize)
435 }
436
fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> {
438 // SAFETY: FFI call. Trivially safe.
439 let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
440
    // If there is no backing file and `hugepages` is disabled, just use the system page size.
442 if zone.file.is_none() && !zone.hugepages {
443 return Ok(page_size);
444 }
445
    // If `hugepages` is enabled and `hugepage_size` is specified, use it directly.
447 if zone.hugepages && zone.hugepage_size.is_some() {
448 return Ok(zone.hugepage_size.unwrap());
449 }
450
451 // There are two scenarios here:
452 // - `hugepages` is enabled but `hugepage_size` is not specified:
453 // Call `statfs` for `/dev/hugepages` for getting the default size of hugepage
454 // - The backing file is specified:
455 // Call `statfs` for the file and get its `f_bsize`. If the value is larger than the page
456 // size of normal page, just use the `f_bsize` because the file is in a hugetlbfs. If the
457 // value is less than or equal to the page size, just use the page size.
458 let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| {
459 pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
460 })?;
461
462 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
463
464 Ok(align_size)
465 }
466
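/// Round `val` down to the nearest multiple of `align`.
/// The bitwise trick below relies on `align` being a power of two.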
467 #[inline]
fn align_down<T>(val: T, align: T) -> T
469 where
470 T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>,
471 {
472 val & !(align - 1u8.into())
473 }
474
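/// Check whether `val` is a multiple of `align` (`align` must be a power of two).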
475 #[inline]
fn is_aligned<T>(val: T, align: T) -> bool
477 where
478 T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq,
479 {
480 (val & (align - 1u8.into())) == 0u8.into()
481 }
482
483 impl BusDevice for MemoryManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
485 if self.selected_slot < self.hotplug_slots.len() {
486 let state = &self.hotplug_slots[self.selected_slot];
487 match offset {
488 BASE_OFFSET_LOW => {
489 data.copy_from_slice(&state.base.to_le_bytes()[..4]);
490 }
491 BASE_OFFSET_HIGH => {
492 data.copy_from_slice(&state.base.to_le_bytes()[4..]);
493 }
494 LENGTH_OFFSET_LOW => {
495 data.copy_from_slice(&state.length.to_le_bytes()[..4]);
496 }
497 LENGTH_OFFSET_HIGH => {
498 data.copy_from_slice(&state.length.to_le_bytes()[4..]);
499 }
500 STATUS_OFFSET => {
501 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
502 data.fill(0);
503 if state.active {
504 data[0] |= 1 << ENABLE_FLAG;
505 }
506 if state.inserting {
507 data[0] |= 1 << INSERTING_FLAG;
508 }
509 if state.removing {
510 data[0] |= 1 << REMOVING_FLAG;
511 }
512 }
513 _ => {
514 warn!(
515 "Unexpected offset for accessing memory manager device: {:#}",
516 offset
517 );
518 }
519 }
520 } else {
521 warn!("Out of range memory slot: {}", self.selected_slot);
522 }
523 }
524
    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
526 match offset {
527 SELECTION_OFFSET => {
528 self.selected_slot = usize::from(data[0]);
529 }
530 STATUS_OFFSET => {
531 if self.selected_slot < self.hotplug_slots.len() {
532 let state = &mut self.hotplug_slots[self.selected_slot];
533 // The ACPI code writes back a 1 to acknowledge the insertion
534 if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
535 state.inserting = false;
536 }
537 // Ditto for removal
538 if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
539 state.removing = false;
540 }
541 // Trigger removal of "DIMM"
542 if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
543 warn!("Ejection of memory not currently supported");
544 }
545 } else {
546 warn!("Out of range memory slot: {}", self.selected_slot);
547 }
548 }
549 _ => {
550 warn!(
551 "Unexpected offset for accessing memory manager device: {:#}",
552 offset
553 );
554 }
555 };
556 None
557 }
558 }
559
560 impl MemoryManager {
561 /// Creates all memory regions based on the available RAM ranges defined
562 /// by `ram_regions`, and based on the description of the memory zones.
563 /// In practice, this function can perform multiple memory mappings of the
564 /// same backing file if there's a hole in the address space between two
565 /// RAM ranges.
566 ///
567 /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G)
568 /// and zones containing two zones (size 1G and size 4G).
569 ///
570 /// This function will create 3 resulting memory regions:
571 /// - First one mapping entirely the first memory zone on 0-1G range
572 /// - Second one mapping partially the second memory zone on 1G-3G range
573 /// - Third one mapping partially the second memory zone on 4G-6G range
574 ///
575 /// Also, all memory regions are page-size aligned (e.g. their sizes must
576 /// be multiple of page-size), which may leave an additional hole in the
577 /// address space when hugepage is used.
    fn create_memory_regions_from_zones(
579 ram_regions: &[(GuestAddress, usize)],
580 zones: &[MemoryZoneConfig],
581 prefault: Option<bool>,
582 thp: bool,
583 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
584 let mut zone_iter = zones.iter();
585 let mut mem_regions = Vec::new();
586 let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?;
587 let mut zone_align_size = memory_zone_get_align_size(zone)?;
588 let mut zone_offset = 0u64;
589 let mut memory_zones = HashMap::new();
590
591 if !is_aligned(zone.size, zone_align_size) {
592 return Err(Error::MisalignedMemorySize);
593 }
594
595 // Add zone id to the list of memory zones.
596 memory_zones.insert(zone.id.clone(), MemoryZone::default());
597
598 for ram_region in ram_regions.iter() {
599 let mut ram_region_offset = 0;
600 let mut exit = false;
601
602 loop {
603 let mut ram_region_consumed = false;
604 let mut pull_next_zone = false;
605
606 let ram_region_available_size =
607 align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size);
608 if ram_region_available_size == 0 {
609 break;
610 }
611 let zone_sub_size = zone.size - zone_offset;
612
613 let file_offset = zone_offset;
614 let region_start = ram_region
615 .0
616 .checked_add(ram_region_offset)
617 .ok_or(Error::GuestAddressOverFlow)?;
618 let region_size = if zone_sub_size <= ram_region_available_size {
619 if zone_sub_size == ram_region_available_size {
620 ram_region_consumed = true;
621 }
622
623 ram_region_offset += zone_sub_size;
624 pull_next_zone = true;
625
626 zone_sub_size
627 } else {
628 zone_offset += ram_region_available_size;
629 ram_region_consumed = true;
630
631 ram_region_available_size
632 };
633
634 info!(
635 "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}",
636 zone.id,
637 region_start.raw_value(),
638 region_size
639 );
640 let region = MemoryManager::create_ram_region(
641 &zone.file,
642 file_offset,
643 region_start,
644 region_size as usize,
645 prefault.unwrap_or(zone.prefault),
646 zone.shared,
647 zone.hugepages,
648 zone.hugepage_size,
649 zone.host_numa_node,
650 None,
651 thp,
652 )?;
653
654 // Add region to the list of regions associated with the
655 // current memory zone.
656 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
657 memory_zone.regions.push(region.clone());
658 }
659
660 mem_regions.push(region);
661
662 if pull_next_zone {
663 // Get the next zone and reset the offset.
664 zone_offset = 0;
665 if let Some(z) = zone_iter.next() {
666 zone = z;
667 } else {
668 exit = true;
669 break;
670 }
671 zone_align_size = memory_zone_get_align_size(zone)?;
672 if !is_aligned(zone.size, zone_align_size) {
673 return Err(Error::MisalignedMemorySize);
674 }
675
676 // Check if zone id already exist. In case it does, throw
677 // an error as we need unique identifiers. Otherwise, add
678 // the new zone id to the list of memory zones.
679 if memory_zones.contains_key(&zone.id) {
680 error!(
681 "Memory zone identifier '{}' found more than once. \
682 It must be unique",
683 zone.id,
684 );
685 return Err(Error::DuplicateZoneId);
686 }
687 memory_zones.insert(zone.id.clone(), MemoryZone::default());
688 }
689
690 if ram_region_consumed {
691 break;
692 }
693 }
694
695 if exit {
696 break;
697 }
698 }
699
700 Ok((mem_regions, memory_zones))
701 }
702
703 // Restore both GuestMemory regions along with MemoryZone zones.
    fn restore_memory_regions_and_zones(
705 guest_ram_mappings: &[GuestRamMapping],
706 zones_config: &[MemoryZoneConfig],
707 prefault: Option<bool>,
708 mut existing_memory_files: HashMap<u32, File>,
709 thp: bool,
710 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
711 let mut memory_regions = Vec::new();
712 let mut memory_zones = HashMap::new();
713
714 for zone_config in zones_config {
715 memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
716 }
717
718 for guest_ram_mapping in guest_ram_mappings {
719 for zone_config in zones_config {
720 if guest_ram_mapping.zone_id == zone_config.id {
721 let region = MemoryManager::create_ram_region(
722 if guest_ram_mapping.virtio_mem {
723 &None
724 } else {
725 &zone_config.file
726 },
727 guest_ram_mapping.file_offset,
728 GuestAddress(guest_ram_mapping.gpa),
729 guest_ram_mapping.size as usize,
730 prefault.unwrap_or(zone_config.prefault),
731 zone_config.shared,
732 zone_config.hugepages,
733 zone_config.hugepage_size,
734 zone_config.host_numa_node,
735 existing_memory_files.remove(&guest_ram_mapping.slot),
736 thp,
737 )?;
738 memory_regions.push(Arc::clone(®ion));
739 if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
740 if guest_ram_mapping.virtio_mem {
741 let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
742 let region_size = region.len();
743 memory_zone.virtio_mem_zone = Some(VirtioMemZone {
744 region,
745 virtio_device: None,
746 hotplugged_size,
747 hugepages: zone_config.hugepages,
748 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
749 });
750 } else {
751 memory_zone.regions.push(region);
752 }
753 }
754 }
755 }
756 }
757
758 memory_regions.sort_by_key(|x| x.start_addr());
759
760 Ok((memory_regions, memory_zones))
761 }
762
    fn fill_saved_regions(
764 &mut self,
765 file_path: PathBuf,
766 saved_regions: MemoryRangeTable,
767 ) -> Result<(), Error> {
768 if saved_regions.is_empty() {
769 return Ok(());
770 }
771
772 // Open (read only) the snapshot file.
773 let mut memory_file = OpenOptions::new()
774 .read(true)
775 .open(file_path)
776 .map_err(Error::SnapshotOpen)?;
777
778 let guest_memory = self.guest_memory.memory();
779 for range in saved_regions.regions() {
780 let mut offset: u64 = 0;
781 // Here we are manually handling the retry in case we can't write
782 // the whole region at once because we can't use the implementation
783 // from vm-memory::GuestMemory of read_exact_from() as it is not
784 // following the correct behavior. For more info about this issue
785 // see: https://github.com/rust-vmm/vm-memory/issues/174
786 loop {
787 let bytes_read = guest_memory
788 .read_volatile_from(
789 GuestAddress(range.gpa + offset),
790 &mut memory_file,
791 (range.length - offset) as usize,
792 )
793 .map_err(Error::SnapshotCopy)?;
794 offset += bytes_read as u64;
795
796 if offset == range.length {
797 break;
798 }
799 }
800 }
801
802 Ok(())
803 }
804
    fn validate_memory_config(
806 config: &MemoryConfig,
807 user_provided_zones: bool,
808 ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
809 let mut allow_mem_hotplug = false;
810
811 if !user_provided_zones {
812 if config.zones.is_some() {
813 error!(
814 "User defined memory regions can't be provided if the \
815 memory size is not 0"
816 );
817 return Err(Error::InvalidMemoryParameters);
818 }
819
820 if config.hotplug_size.is_some() {
821 allow_mem_hotplug = true;
822 }
823
824 if let Some(hotplugged_size) = config.hotplugged_size {
825 if let Some(hotplug_size) = config.hotplug_size {
826 if hotplugged_size > hotplug_size {
827 error!(
828 "'hotplugged_size' {} can't be bigger than \
829 'hotplug_size' {}",
830 hotplugged_size, hotplug_size,
831 );
832 return Err(Error::InvalidMemoryParameters);
833 }
834 } else {
835 error!(
836 "Invalid to define 'hotplugged_size' when there is\
837 no 'hotplug_size'"
838 );
839 return Err(Error::InvalidMemoryParameters);
840 }
841 if config.hotplug_method == HotplugMethod::Acpi {
842 error!(
843 "Invalid to define 'hotplugged_size' with hotplug \
844 method 'acpi'"
845 );
846 return Err(Error::InvalidMemoryParameters);
847 }
848 }
849
850 // Create a single zone from the global memory config. This lets
851 // us reuse the codepath for user defined memory zones.
852 let zones = vec![MemoryZoneConfig {
853 id: String::from(DEFAULT_MEMORY_ZONE),
854 size: config.size,
855 file: None,
856 shared: config.shared,
857 hugepages: config.hugepages,
858 hugepage_size: config.hugepage_size,
859 host_numa_node: None,
860 hotplug_size: config.hotplug_size,
861 hotplugged_size: config.hotplugged_size,
862 prefault: config.prefault,
863 }];
864
865 Ok((config.size, zones, allow_mem_hotplug))
866 } else {
867 if config.zones.is_none() {
868 error!(
869 "User defined memory regions must be provided if the \
870 memory size is 0"
871 );
872 return Err(Error::MissingMemoryZones);
873 }
874
875 // Safe to unwrap as we checked right above there were some
876 // regions.
877 let zones = config.zones.clone().unwrap();
878 if zones.is_empty() {
879 return Err(Error::MissingMemoryZones);
880 }
881
882 let mut total_ram_size: u64 = 0;
883 for zone in zones.iter() {
884 total_ram_size += zone.size;
885
886 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
887 error!(
888 "Invalid to set host NUMA policy for a memory zone \
889 backed by a regular file and mapped as 'shared'"
890 );
891 return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
892 }
893
894 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
895 error!("Invalid to set ACPI hotplug method for memory zones");
896 return Err(Error::InvalidHotplugMethodWithMemoryZones);
897 }
898
899 if let Some(hotplugged_size) = zone.hotplugged_size {
900 if let Some(hotplug_size) = zone.hotplug_size {
901 if hotplugged_size > hotplug_size {
902 error!(
903 "'hotplugged_size' {} can't be bigger than \
904 'hotplug_size' {}",
905 hotplugged_size, hotplug_size,
906 );
907 return Err(Error::InvalidMemoryParameters);
908 }
909 } else {
910 error!(
911 "Invalid to define 'hotplugged_size' when there is\
912 no 'hotplug_size' for a memory zone"
913 );
914 return Err(Error::InvalidMemoryParameters);
915 }
916 if config.hotplug_method == HotplugMethod::Acpi {
917 error!(
918 "Invalid to define 'hotplugged_size' with hotplug \
919 method 'acpi'"
920 );
921 return Err(Error::InvalidMemoryParameters);
922 }
923 }
924 }
925
926 Ok((total_ram_size, zones, allow_mem_hotplug))
927 }
928 }
929
    pub fn allocate_address_space(&mut self) -> Result<(), Error> {
931 let mut list = Vec::new();
932
933 for (zone_id, memory_zone) in self.memory_zones.iter() {
934 let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
935 memory_zone
936 .regions()
937 .iter()
938 .map(|r| (r.clone(), false))
939 .collect();
940
941 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
942 regions.push((virtio_mem_zone.region().clone(), true));
943 }
944
945 list.push((zone_id.clone(), regions));
946 }
947
948 for (zone_id, regions) in list {
949 for (region, virtio_mem) in regions {
950 let slot = self.create_userspace_mapping(
951 region.start_addr().raw_value(),
952 region.len(),
953 region.as_ptr() as u64,
954 self.mergeable,
955 false,
956 self.log_dirty,
957 )?;
958
959 let file_offset = if let Some(file_offset) = region.file_offset() {
960 file_offset.start()
961 } else {
962 0
963 };
964
965 self.guest_ram_mappings.push(GuestRamMapping {
966 gpa: region.start_addr().raw_value(),
967 size: region.len(),
968 slot,
969 zone_id: zone_id.clone(),
970 virtio_mem,
971 file_offset,
972 });
973 self.ram_allocator
974 .allocate(Some(region.start_addr()), region.len(), None)
975 .ok_or(Error::MemoryRangeAllocation)?;
976 }
977 }
978
979 // Allocate SubRegion and Reserved address ranges.
980 for region in self.arch_mem_regions.iter() {
981 if region.r_type == RegionType::Ram {
982 // Ignore the RAM type since ranges have already been allocated
983 // based on the GuestMemory regions.
984 continue;
985 }
986 self.ram_allocator
987 .allocate(
988 Some(GuestAddress(region.base)),
989 region.size as GuestUsize,
990 None,
991 )
992 .ok_or(Error::MemoryRangeAllocation)?;
993 }
994
995 Ok(())
996 }
997
998 #[cfg(target_arch = "aarch64")]
    pub fn add_uefi_flash(&mut self) -> Result<(), Error> {
1000 // On AArch64, the UEFI binary requires a flash device at address 0.
1001 // 4 MiB memory is mapped to simulate the flash.
1002 let uefi_mem_slot = self.allocate_memory_slot();
1003 let uefi_region = GuestRegionMmap::new(
1004 MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(),
1005 arch::layout::UEFI_START,
1006 )
1007 .unwrap();
1008 let uefi_mem_region = self.vm.make_user_memory_region(
1009 uefi_mem_slot,
1010 uefi_region.start_addr().raw_value(),
1011 uefi_region.len(),
1012 uefi_region.as_ptr() as u64,
1013 false,
1014 false,
1015 );
1016 self.vm
1017 .create_user_memory_region(uefi_mem_region)
1018 .map_err(Error::CreateUefiFlash)?;
1019
1020 let uefi_flash =
1021 GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap());
1022
1023 self.uefi_flash = Some(uefi_flash);
1024
1025 Ok(())
1026 }
1027
1028 #[allow(clippy::too_many_arguments)]
    pub fn new(
1030 vm: Arc<dyn hypervisor::Vm>,
1031 config: &MemoryConfig,
1032 prefault: Option<bool>,
1033 phys_bits: u8,
1034 #[cfg(feature = "tdx")] tdx_enabled: bool,
1035 restore_data: Option<&MemoryManagerSnapshotData>,
1036 existing_memory_files: Option<HashMap<u32, File>>,
1037 #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
1038 ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
1039 trace_scoped!("MemoryManager::new");
1040
1041 let user_provided_zones = config.size == 0;
1042
1043 let mmio_address_space_size = mmio_address_space_size(phys_bits);
1044 debug_assert_eq!(
1045 (((mmio_address_space_size) >> 16) << 16),
1046 mmio_address_space_size
1047 );
1048 let start_of_platform_device_area =
1049 GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
1050 let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);
1051
1052 let (ram_size, zones, allow_mem_hotplug) =
1053 Self::validate_memory_config(config, user_provided_zones)?;
1054
1055 let (
1056 start_of_device_area,
1057 boot_ram,
1058 current_ram,
1059 arch_mem_regions,
1060 memory_zones,
1061 guest_memory,
1062 boot_guest_memory,
1063 hotplug_slots,
1064 next_memory_slot,
1065 selected_slot,
1066 next_hotplug_slot,
1067 ) = if let Some(data) = restore_data {
1068 let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
1069 &data.guest_ram_mappings,
1070 &zones,
1071 prefault,
1072 existing_memory_files.unwrap_or_default(),
1073 config.thp,
1074 )?;
1075 let guest_memory =
1076 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
1077 let boot_guest_memory = guest_memory.clone();
1078 (
1079 GuestAddress(data.start_of_device_area),
1080 data.boot_ram,
1081 data.current_ram,
1082 data.arch_mem_regions.clone(),
1083 memory_zones,
1084 guest_memory,
1085 boot_guest_memory,
1086 data.hotplug_slots.clone(),
1087 data.next_memory_slot,
1088 data.selected_slot,
1089 data.next_hotplug_slot,
1090 )
1091 } else {
1092 // Init guest memory
1093 let arch_mem_regions = arch::arch_memory_regions();
1094
1095 let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
1096 .iter()
1097 .filter(|r| r.2 == RegionType::Ram)
1098 .map(|r| (r.0, r.1))
1099 .collect();
1100
1101 let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
1102 .iter()
1103 .map(|(a, b, c)| ArchMemRegion {
1104 base: a.0,
1105 size: *b,
1106 r_type: *c,
1107 })
1108 .collect();
1109
1110 let (mem_regions, mut memory_zones) =
1111 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?;
1112
1113 let mut guest_memory =
1114 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;
1115
1116 let boot_guest_memory = guest_memory.clone();
1117
1118 let mut start_of_device_area =
1119 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;
1120
1121 // Update list of memory zones for resize.
1122 for zone in zones.iter() {
1123 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
1124 if let Some(hotplug_size) = zone.hotplug_size {
1125 if hotplug_size == 0 {
1126 error!("'hotplug_size' can't be 0");
1127 return Err(Error::InvalidHotplugSize);
1128 }
1129
1130 if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
1131 start_of_device_area = start_of_device_area
1132 .checked_add(hotplug_size)
1133 .ok_or(Error::GuestAddressOverFlow)?;
1134 } else {
1135 // Alignment must be "natural" i.e. same as size of block
1136 let start_addr = GuestAddress(
1137 start_of_device_area
1138 .0
1139 .div_ceil(virtio_devices::VIRTIO_MEM_ALIGN_SIZE)
1140 * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
1141 );
1142
                        // When `prefault` is set by vm_restore, the memory manager
                        // creates the RAM region with the `prefault` option from the
                        // restore config rather than the option from the zone.
1146 let region = MemoryManager::create_ram_region(
1147 &None,
1148 0,
1149 start_addr,
1150 hotplug_size as usize,
1151 prefault.unwrap_or(zone.prefault),
1152 zone.shared,
1153 zone.hugepages,
1154 zone.hugepage_size,
1155 zone.host_numa_node,
1156 None,
1157 config.thp,
1158 )?;
1159
1160 guest_memory = guest_memory
1161 .insert_region(Arc::clone(®ion))
1162 .map_err(Error::GuestMemory)?;
1163
1164 let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
1165 let region_size = region.len();
1166 memory_zone.virtio_mem_zone = Some(VirtioMemZone {
1167 region,
1168 virtio_device: None,
1169 hotplugged_size,
1170 hugepages: zone.hugepages,
1171 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
1172 });
1173
1174 start_of_device_area = start_addr
1175 .checked_add(hotplug_size)
1176 .ok_or(Error::GuestAddressOverFlow)?;
1177 }
1178 }
1179 } else {
1180 return Err(Error::MissingZoneIdentifier);
1181 }
1182 }
1183
1184 let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
1185 hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);
1186
1187 (
1188 start_of_device_area,
1189 ram_size,
1190 ram_size,
1191 arch_mem_regions,
1192 memory_zones,
1193 guest_memory,
1194 boot_guest_memory,
1195 hotplug_slots,
1196 0,
1197 0,
1198 0,
1199 )
1200 };
1201
1202 let guest_memory = GuestMemoryAtomic::new(guest_memory);
1203
1204 let allocator = Arc::new(Mutex::new(
1205 SystemAllocator::new(
1206 GuestAddress(0),
1207 1 << 16,
1208 start_of_platform_device_area,
1209 PLATFORM_DEVICE_AREA_SIZE,
1210 #[cfg(target_arch = "x86_64")]
1211 vec![GsiApic::new(
1212 X86_64_IRQ_BASE,
1213 ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
1214 )],
1215 )
1216 .ok_or(Error::CreateSystemAllocator)?,
1217 ));
1218
1219 #[cfg(not(feature = "tdx"))]
1220 let dynamic = true;
1221 #[cfg(feature = "tdx")]
1222 let dynamic = !tdx_enabled;
1223
1224 let acpi_address = if dynamic
1225 && config.hotplug_method == HotplugMethod::Acpi
1226 && (config.hotplug_size.unwrap_or_default() > 0)
1227 {
1228 Some(
1229 allocator
1230 .lock()
1231 .unwrap()
1232 .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
1233 .ok_or(Error::AllocateMmioAddress)?,
1234 )
1235 } else {
1236 None
1237 };
1238
1239 // If running on SGX the start of device area and RAM area may diverge but
1240 // at this point they are next to each other.
1241 let end_of_ram_area = start_of_device_area.unchecked_sub(1);
1242 let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();
1243
1244 #[allow(unused_mut)]
1245 let mut memory_manager = MemoryManager {
1246 boot_guest_memory,
1247 guest_memory,
1248 next_memory_slot: Arc::new(AtomicU32::new(next_memory_slot)),
1249 memory_slot_free_list: Arc::new(Mutex::new(Vec::new())),
1250 start_of_device_area,
1251 end_of_device_area,
1252 end_of_ram_area,
1253 vm,
1254 hotplug_slots,
1255 selected_slot,
1256 mergeable: config.mergeable,
1257 allocator,
1258 hotplug_method: config.hotplug_method,
1259 boot_ram,
1260 current_ram,
1261 next_hotplug_slot,
1262 shared: config.shared,
1263 hugepages: config.hugepages,
1264 hugepage_size: config.hugepage_size,
1265 prefault: config.prefault,
1266 #[cfg(target_arch = "x86_64")]
1267 sgx_epc_region: None,
1268 user_provided_zones,
1269 snapshot_memory_ranges: MemoryRangeTable::default(),
1270 memory_zones,
1271 guest_ram_mappings: Vec::new(),
1272 acpi_address,
1273 log_dirty: dynamic, // Cannot log dirty pages on a TD
1274 arch_mem_regions,
1275 ram_allocator,
1276 dynamic,
1277 #[cfg(target_arch = "aarch64")]
1278 uefi_flash: None,
1279 thp: config.thp,
1280 };
1281
1282 #[cfg(target_arch = "x86_64")]
1283 if let Some(sgx_epc_config) = sgx_epc_config {
1284 memory_manager.setup_sgx(sgx_epc_config)?;
1285 }
1286
1287 Ok(Arc::new(Mutex::new(memory_manager)))
1288 }
1289
    pub fn new_from_snapshot(
1291 snapshot: &Snapshot,
1292 vm: Arc<dyn hypervisor::Vm>,
1293 config: &MemoryConfig,
1294 source_url: Option<&str>,
1295 prefault: bool,
1296 phys_bits: u8,
1297 ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
1298 if let Some(source_url) = source_url {
1299 let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
1300 memory_file_path.push(String::from(SNAPSHOT_FILENAME));
1301
1302 let mem_snapshot: MemoryManagerSnapshotData =
1303 snapshot.to_state().map_err(Error::Restore)?;
1304
1305 let mm = MemoryManager::new(
1306 vm,
1307 config,
1308 Some(prefault),
1309 phys_bits,
1310 #[cfg(feature = "tdx")]
1311 false,
1312 Some(&mem_snapshot),
1313 None,
1314 #[cfg(target_arch = "x86_64")]
1315 None,
1316 )?;
1317
1318 mm.lock()
1319 .unwrap()
1320 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;
1321
1322 Ok(mm)
1323 } else {
1324 Err(Error::RestoreMissingSourceUrl)
1325 }
1326 }
1327
    fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
1329 // SAFETY: FFI call with correct arguments
1330 let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };
1331
1332 if res < 0 {
1333 Err(io::Error::last_os_error())
1334 } else {
1335 Ok(res as RawFd)
1336 }
1337 }
1338
    fn mbind(
1340 addr: *mut u8,
1341 len: u64,
1342 mode: u32,
1343 nodemask: Vec<u64>,
1344 maxnode: u64,
1345 flags: u32,
1346 ) -> Result<(), io::Error> {
1347 // SAFETY: FFI call with correct arguments
1348 let res = unsafe {
1349 libc::syscall(
1350 libc::SYS_mbind,
1351 addr as *mut libc::c_void,
1352 len,
1353 mode,
1354 nodemask.as_ptr(),
1355 maxnode,
1356 flags,
1357 )
1358 };
1359
1360 if res < 0 {
1361 Err(io::Error::last_os_error())
1362 } else {
1363 Ok(())
1364 }
1365 }
1366
    fn create_anonymous_file(
1368 size: usize,
1369 hugepages: bool,
1370 hugepage_size: Option<u64>,
1371 ) -> Result<FileOffset, Error> {
1372 let fd = Self::memfd_create(
1373 &ffi::CString::new("ch_ram").unwrap(),
1374 libc::MFD_CLOEXEC
1375 | if hugepages {
1376 libc::MFD_HUGETLB
1377 | if let Some(hugepage_size) = hugepage_size {
1378 /*
1379 * From the Linux kernel:
1380 * Several system calls take a flag to request "hugetlb" huge pages.
1381 * Without further specification, these system calls will use the
1382 * system's default huge page size. If a system supports multiple
1383 * huge page sizes, the desired huge page size can be specified in
1384 * bits [26:31] of the flag arguments. The value in these 6 bits
1385 * will encode the log2 of the huge page size.
1386 */
1387
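                            // e.g. 2 MiB huge pages: trailing_zeros() = 21, so
                            // bits [26:31] of the flags encode 21.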
1388 hugepage_size.trailing_zeros() << 26
1389 } else {
1390 // Use the system default huge page size
1391 0
1392 }
1393 } else {
1394 0
1395 },
1396 )
1397 .map_err(Error::SharedFileCreate)?;
1398
1399 // SAFETY: fd is valid
1400 let f = unsafe { File::from_raw_fd(fd) };
1401 f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1402
1403 Ok(FileOffset::new(f, 0))
1404 }
1405
    fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> {
1407 if backing_file.is_dir() {
1408 Err(Error::DirectoryAsBackingFileForMemory)
1409 } else {
1410 let f = OpenOptions::new()
1411 .read(true)
1412 .write(true)
1413 .open(backing_file)
1414 .map_err(Error::SharedFileCreate)?;
1415
1416 Ok(FileOffset::new(f, file_offset))
1417 }
1418 }
1419
1420 #[allow(clippy::too_many_arguments)]
    pub fn create_ram_region(
1422 backing_file: &Option<PathBuf>,
1423 file_offset: u64,
1424 start_addr: GuestAddress,
1425 size: usize,
1426 prefault: bool,
1427 shared: bool,
1428 hugepages: bool,
1429 hugepage_size: Option<u64>,
1430 host_numa_node: Option<u32>,
1431 existing_memory_file: Option<File>,
1432 thp: bool,
1433 ) -> Result<Arc<GuestRegionMmap>, Error> {
1434 let mut mmap_flags = libc::MAP_NORESERVE;
1435
1436 // The duplication of mmap_flags ORing here is unfortunate but it also makes
1437 // the complexity of the handling clear.
1438 let fo = if let Some(f) = existing_memory_file {
1439 // It must be MAP_SHARED as we wouldn't already have an FD
1440 mmap_flags |= libc::MAP_SHARED;
1441 Some(FileOffset::new(f, file_offset))
1442 } else if let Some(backing_file) = backing_file {
1443 if shared {
1444 mmap_flags |= libc::MAP_SHARED;
1445 } else {
1446 mmap_flags |= libc::MAP_PRIVATE;
1447 }
1448 Some(Self::open_backing_file(backing_file, file_offset)?)
1449 } else if shared || hugepages {
1450 // For hugepages we must also MAP_SHARED otherwise we will trigger #4805
1451 // because the MAP_PRIVATE will trigger CoW against the backing file with
1452 // the VFIO pinning
1453 mmap_flags |= libc::MAP_SHARED;
1454 Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?)
1455 } else {
1456 mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
1457 None
1458 };
1459
1460 let region = GuestRegionMmap::new(
1461 MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
1462 .map_err(Error::GuestMemoryRegion)?,
1463 start_addr,
1464 )
1465 .map_err(Error::GuestMemory)?;
1466
1467 // Apply NUMA policy if needed.
1468 if let Some(node) = host_numa_node {
1469 let addr = region.deref().as_ptr();
1470 let len = region.deref().size() as u64;
1471 let mode = MPOL_BIND;
1472 let mut nodemask: Vec<u64> = Vec::new();
1473 let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
1474
1475 // Linux is kind of buggy in the way it interprets maxnode as it
1476 // will cut off the last node. That's why we have to add 1 to what
1477 // we would consider as the proper maxnode value.
1478 let maxnode = node as u64 + 1 + 1;
1479
1480 // Allocate the right size for the vector.
1481 nodemask.resize((node as usize / 64) + 1, 0);
1482
1483 // Fill the global bitmask through the nodemask vector.
1484 let idx = (node / 64) as usize;
1485 let shift = node % 64;
1486 nodemask[idx] |= 1u64 << shift;
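            // e.g. host_numa_node = 3 gives nodemask[0] = 0b1000 and
            // maxnode = 5 (node index 3, + 1, + 1 for the quirk above).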
1487
1488 // Policies are enforced by using MPOL_MF_MOVE flag as it will
1489 // force the kernel to move all pages that might have been already
1490 // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
1491 // used to throw an error if MPOL_MF_MOVE didn't succeed.
1492 // MPOL_BIND is the selected mode as it specifies a strict policy
1493 // that restricts memory allocation to the nodes specified in the
1494 // nodemask.
1495 Self::mbind(addr, len, mode, nodemask, maxnode, flags)
1496 .map_err(Error::ApplyNumaPolicy)?;
1497 }
1498
1499 // Prefault the region if needed, in parallel.
1500 if prefault {
1501 let page_size =
1502 Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize;
1503
1504 if !is_aligned(size, page_size) {
1505 warn!(
1506 "Prefaulting memory size {} misaligned with page size {}",
1507 size, page_size
1508 );
1509 }
1510
1511 let num_pages = size / page_size;
1512
1513 let num_threads = Self::get_prefault_num_threads(page_size, num_pages);
1514
1515 let pages_per_thread = num_pages / num_threads;
1516 let remainder = num_pages % num_threads;
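            // e.g. 10 pages over 4 threads: pages_per_thread = 2, remainder = 2,
            // so the threads fault in 3, 3, 2 and 2 pages respectively.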
1517
1518 let barrier = Arc::new(Barrier::new(num_threads));
1519 thread::scope(|s| {
1520 let r = ®ion;
1521 for i in 0..num_threads {
1522 let barrier = Arc::clone(&barrier);
1523 s.spawn(move || {
1524 // Wait until all threads have been spawned to avoid contention
1525 // over mmap_sem between thread stack allocation and page faulting.
1526 barrier.wait();
1527 let pages = pages_per_thread + if i < remainder { 1 } else { 0 };
1528 let offset =
1529 page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder));
1530 // SAFETY: FFI call with correct arguments
1531 let ret = unsafe {
1532 let addr = r.as_ptr().add(offset);
1533 libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE)
1534 };
1535 if ret != 0 {
1536 let e = io::Error::last_os_error();
1537 warn!("Failed to prefault pages: {}", e);
1538 }
1539 });
1540 }
1541 });
1542 }
1543
1544 if region.file_offset().is_none() && thp {
1545 info!(
1546 "Anonymous mapping at 0x{:x} (size = 0x{:x})",
1547 region.as_ptr() as u64,
1548 size
1549 );
1550 // SAFETY: FFI call with correct arguments
1551 let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
1552 if ret != 0 {
1553 let e = io::Error::last_os_error();
1554 warn!("Failed to mark pages as THP eligible: {}", e);
1555 }
1556 }
1557
1558 Ok(Arc::new(region))
1559 }
1560
1561 // Duplicate of `memory_zone_get_align_size` that does not require a `zone`
    fn get_prefault_align_size(
1563 backing_file: &Option<PathBuf>,
1564 hugepages: bool,
1565 hugepage_size: Option<u64>,
1566 ) -> Result<u64, Error> {
1567 // SAFETY: FFI call. Trivially safe.
1568 let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
1569 match (hugepages, hugepage_size, backing_file) {
1570 (false, _, _) => Ok(page_size),
1571 (true, Some(hugepage_size), _) => Ok(hugepage_size),
1572 (true, None, _) => {
1573 // There are two scenarios here:
1574 // - `hugepages` is enabled but `hugepage_size` is not specified:
1575 // Call `statfs` for `/dev/hugepages` for getting the default size of hugepage
1576 // - The backing file is specified:
1577 // Call `statfs` for the file and get its `f_bsize`. If the value is larger than the page
1578 // size of normal page, just use the `f_bsize` because the file is in a hugetlbfs. If the
1579 // value is less than or equal to the page size, just use the page size.
1580 let path = backing_file
1581 .as_ref()
1582 .map_or(Ok("/dev/hugepages"), |pathbuf| {
1583 pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
1584 })?;
1585 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
1586 Ok(align_size)
1587 }
1588 }
1589 }
1590
    fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize {
1592 let mut n: usize = 1;
1593
1594 // Do not create more threads than processors available.
1595 // SAFETY: FFI call. Trivially safe.
1596 let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) };
1597 if procs > 0 {
1598 n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT);
1599 }
1600
1601 // Do not create more threads than pages being allocated.
1602 n = std::cmp::min(n, num_pages);
1603
1604 // Do not create threads to allocate less than 64 MiB of memory.
1605 n = std::cmp::min(
1606 n,
1607 std::cmp::max(1, page_size * num_pages / (64 * (1 << 26))),
1608 );
1609
1610 n
1611 }
1612
1613 // Update the GuestMemoryMmap with the new range
    fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
1615 let guest_memory = self
1616 .guest_memory
1617 .memory()
1618 .insert_region(region)
1619 .map_err(Error::GuestMemory)?;
1620 self.guest_memory.lock().unwrap().replace(guest_memory);
1621
1622 Ok(())
1623 }
1624
1625 //
1626 // Calculate the start address of an area next to RAM.
1627 //
    // If memory hotplug is allowed, the start address needs to be aligned
    // (rounded-up) to a 128 MiB boundary.
    // If memory hotplug is not allowed, there is no alignment required.
    // If RAM ends below the 32-bit reserved region, the area starts at the
    // 64-bit RAM start instead.
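    // e.g. with hotplug allowed and RAM ending at 0x1_7FFF_FFFF (6 GiB - 1),
    // the returned start address is 0x1_8000_0000.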
    fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
1633 let mut start_addr = if allow_mem_hotplug {
1634 GuestAddress(mem_end.0 | ((128 << 20) - 1))
1635 } else {
1636 mem_end
1637 };
1638
1639 start_addr = start_addr
1640 .checked_add(1)
1641 .ok_or(Error::GuestAddressOverFlow)?;
1642
1643 #[cfg(not(target_arch = "riscv64"))]
1644 if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
1645 return Ok(arch::layout::RAM_64BIT_START);
1646 }
1647
1648 Ok(start_addr)
1649 }
1650
    pub fn add_ram_region(
1652 &mut self,
1653 start_addr: GuestAddress,
1654 size: usize,
1655 ) -> Result<Arc<GuestRegionMmap>, Error> {
1656 // Allocate memory for the region
1657 let region = MemoryManager::create_ram_region(
1658 &None,
1659 0,
1660 start_addr,
1661 size,
1662 self.prefault,
1663 self.shared,
1664 self.hugepages,
1665 self.hugepage_size,
1666 None,
1667 None,
1668 self.thp,
1669 )?;
1670
1671 // Map it into the guest
1672 let slot = self.create_userspace_mapping(
1673 region.start_addr().0,
1674 region.len(),
1675 region.as_ptr() as u64,
1676 self.mergeable,
1677 false,
1678 self.log_dirty,
1679 )?;
1680 self.guest_ram_mappings.push(GuestRamMapping {
1681 gpa: region.start_addr().raw_value(),
1682 size: region.len(),
1683 slot,
1684 zone_id: DEFAULT_MEMORY_ZONE.to_string(),
1685 virtio_mem: false,
1686 file_offset: 0,
1687 });
1688
1689 self.add_region(Arc::clone(®ion))?;
1690
1691 Ok(region)
1692 }
1693
    fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
1695 info!("Hotplugging new RAM: {}", size);
1696
1697 // Check that there is a free slot
1698 if self.next_hotplug_slot >= HOTPLUG_COUNT {
1699 return Err(Error::NoSlotAvailable);
1700 }
1701
1702 // "Inserted" DIMM must have a size that is a multiple of 128MiB
1703 if size % (128 << 20) != 0 {
1704 return Err(Error::InvalidSize);
1705 }
1706
1707 let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;
1708
1709 if start_addr
1710 .checked_add((size - 1).try_into().unwrap())
1711 .unwrap()
1712 > self.end_of_ram_area
1713 {
1714 return Err(Error::InsufficientHotplugRam);
1715 }
1716
1717 let region = self.add_ram_region(start_addr, size)?;
1718
1719 // Add region to the list of regions associated with the default
1720 // memory zone.
1721 if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
1722 memory_zone.regions.push(Arc::clone(&region));
1723 }
1724
1725 // Tell the allocator
1726 self.ram_allocator
1727 .allocate(Some(start_addr), size as GuestUsize, None)
1728 .ok_or(Error::MemoryRangeAllocation)?;
1729
1730 // Update the slot so that it can be queried via the I/O port
1731 let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
1732 slot.active = true;
1733 slot.inserting = true;
1734 slot.base = region.start_addr().0;
1735 slot.length = region.len();
1736
1737 self.next_hotplug_slot += 1;
1738
1739 Ok(region)
1740 }
1741
1742 pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1743 self.guest_memory.clone()
1744 }
1745
1746 pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
1747 self.boot_guest_memory.clone()
1748 }
1749
1750 pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
1751 self.allocator.clone()
1752 }
1753
1754 pub fn start_of_device_area(&self) -> GuestAddress {
1755 self.start_of_device_area
1756 }
1757
1758 pub fn end_of_device_area(&self) -> GuestAddress {
1759 self.end_of_device_area
1760 }
1761
1762 pub fn memory_slot_allocator(&mut self) -> MemorySlotAllocator {
1763 let memory_slot_free_list = Arc::clone(&self.memory_slot_free_list);
1764 let next_memory_slot = Arc::clone(&self.next_memory_slot);
1765 MemorySlotAllocator::new(next_memory_slot, memory_slot_free_list)
1766 }
1767
1768 pub fn allocate_memory_slot(&mut self) -> u32 {
1769 self.memory_slot_allocator().next_memory_slot()
1770 }
1771
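/// Registers a host memory range with the hypervisor as a user memory
/// region and returns the allocated memory slot. The mapping is marked
/// MADV_DONTDUMP, and MADV_MERGEABLE when `mergeable` is set.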
1772 pub fn create_userspace_mapping(
1773 &mut self,
1774 guest_phys_addr: u64,
1775 memory_size: u64,
1776 userspace_addr: u64,
1777 mergeable: bool,
1778 readonly: bool,
1779 log_dirty: bool,
1780 ) -> Result<u32, Error> {
1781 let slot = self.allocate_memory_slot();
1782 let mem_region = self.vm.make_user_memory_region(
1783 slot,
1784 guest_phys_addr,
1785 memory_size,
1786 userspace_addr,
1787 readonly,
1788 log_dirty,
1789 );
1790
1791 info!(
1792 "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
1793 guest_phys_addr, userspace_addr, memory_size, slot
1794 );
1795
1796 self.vm
1797 .create_user_memory_region(mem_region)
1798 .map_err(Error::CreateUserMemoryRegion)?;
1799
1800 // SAFETY: the address and size are valid since the
1801 // mmap succeeded.
1802 let ret = unsafe {
1803 libc::madvise(
1804 userspace_addr as *mut libc::c_void,
1805 memory_size as libc::size_t,
1806 libc::MADV_DONTDUMP,
1807 )
1808 };
1809 if ret != 0 {
1810 let e = io::Error::last_os_error();
1811 warn!("Failed to mark mapping as MADV_DONTDUMP: {}", e);
1812 }
1813
1814 // Mark the pages as mergeable if explicitly asked for.
1815 if mergeable {
1816 // SAFETY: the address and size are valid since the
1817 // mmap succeeded.
1818 let ret = unsafe {
1819 libc::madvise(
1820 userspace_addr as *mut libc::c_void,
1821 memory_size as libc::size_t,
1822 libc::MADV_MERGEABLE,
1823 )
1824 };
1825 if ret != 0 {
1826 let err = io::Error::last_os_error();
1827 // Safe to unwrap because the error is constructed with
1828 // last_os_error(), which ensures the output will be Some().
1829 let errno = err.raw_os_error().unwrap();
1830 if errno == libc::EINVAL {
1831 warn!("kernel not configured with CONFIG_KSM");
1832 } else {
1833 warn!("madvise error: {}", err);
1834 }
1835 warn!("failed to mark pages as mergeable");
1836 }
1837 }
1838
1839 info!(
1840 "Created userspace mapping: {:x} -> {:x} {:x}",
1841 guest_phys_addr, userspace_addr, memory_size
1842 );
1843
1844 Ok(slot)
1845 }
1846
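/// Removes a previously created user memory region from the hypervisor
/// and, if the pages were marked mergeable, advises MADV_UNMERGEABLE.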
1847 pub fn remove_userspace_mapping(
1848 &mut self,
1849 guest_phys_addr: u64,
1850 memory_size: u64,
1851 userspace_addr: u64,
1852 mergeable: bool,
1853 slot: u32,
1854 ) -> Result<(), Error> {
1855 let mem_region = self.vm.make_user_memory_region(
1856 slot,
1857 guest_phys_addr,
1858 memory_size,
1859 userspace_addr,
1860 false, /* readonly -- don't care */
1861 false, /* log dirty */
1862 );
1863
1864 self.vm
1865 .remove_user_memory_region(mem_region)
1866 .map_err(Error::RemoveUserMemoryRegion)?;
1867
1868 // Mark the pages as unmergeable if they were previously marked as
1869 // mergeable.
1870 if mergeable {
1871 // SAFETY: the address and size are valid as the region was
1872 // previously advised.
1873 let ret = unsafe {
1874 libc::madvise(
1875 userspace_addr as *mut libc::c_void,
1876 memory_size as libc::size_t,
1877 libc::MADV_UNMERGEABLE,
1878 )
1879 };
1880 if ret != 0 {
1881 let err = io::Error::last_os_error();
1882 // Safe to unwrap because the error is constructed with
1883 // last_os_error(), which ensures the output will be Some().
1884 let errno = err.raw_os_error().unwrap();
1885 if errno == libc::EINVAL {
1886 warn!("kernel not configured with CONFIG_KSM");
1887 } else {
1888 warn!("madvise error: {}", err);
1889 }
1890 warn!("failed to mark pages as unmergeable");
1891 }
1892 }
1893
1894 info!(
1895 "Removed userspace mapping: {:x} -> {:x} {:x}",
1896 guest_phys_addr, userspace_addr, memory_size
1897 );
1898
1899 Ok(())
1900 }
1901
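/// Resizes the virtio-mem device backing the memory zone `id` to `size`
/// bytes of hotplugged memory and keeps the zone's hotplugged size in sync.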
1902 pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
1903 if let Some(memory_zone) = self.memory_zones.get_mut(id) {
1904 if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
1905 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
1906 virtio_mem_device
1907 .lock()
1908 .unwrap()
1909 .resize(size)
1910 .map_err(Error::VirtioMemResizeFail)?;
1911 }
1912
1913 // Keep the hotplugged_size up to date.
1914 virtio_mem_zone.hotplugged_size = size;
1915 } else {
1916 error!("Failed resizing virtio-mem region: No virtio-mem handler");
1917 return Err(Error::MissingVirtioMemHandler);
1918 }
1919
1920 return Ok(());
1921 }
1922
1923 error!("Failed resizing virtio-mem region: Unknown memory zone");
1924 Err(Error::UnknownMemoryZone)
1925 }
1926
1927 /// In case this function resulted in adding a new memory region to the
1928 /// guest memory, the new region is returned to the caller. The virtio-mem
1929 /// use case never adds a new region as the whole hotpluggable memory has
1930 /// already been allocated at boot time.
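/// With the virtio-mem hotplug method, the existing device is resized to
/// `desired_ram - boot_ram`; with the ACPI method, a new region of
/// `desired_ram - current_ram` bytes is hotplugged.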
1931 pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
1932 if self.user_provided_zones {
1933 error!(
1934 "Not allowed to resize guest memory when backed with user \
1935 defined memory zones."
1936 );
1937 return Err(Error::InvalidResizeWithMemoryZones);
1938 }
1939
1940 let mut region: Option<Arc<GuestRegionMmap>> = None;
1941 match self.hotplug_method {
1942 HotplugMethod::VirtioMem => {
1943 if desired_ram >= self.boot_ram {
1944 if !self.dynamic {
1945 return Ok(region);
1946 }
1947
1948 self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
1949 self.current_ram = desired_ram;
1950 }
1951 }
1952 HotplugMethod::Acpi => {
1953 if desired_ram > self.current_ram {
1954 if !self.dynamic {
1955 return Ok(region);
1956 }
1957
1958 region =
1959 Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
1960 self.current_ram = desired_ram;
1961 }
1962 }
1963 }
1964 Ok(region)
1965 }
1966
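/// Resizes the virtio-mem region of the user-defined memory zone `id`.
/// Only valid when the guest memory is backed by user-defined zones.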
1967 pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
1968 if !self.user_provided_zones {
1969 error!(
1970 "Not allowed to resize guest memory zone when no zone is \
1971 defined."
1972 );
1973 return Err(Error::ResizeZone);
1974 }
1975
1976 self.virtio_mem_resize(id, virtio_mem_size)
1977 }
1978
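/// Sets up the SGX EPC region: validates that every EPC section size is a
/// multiple of 4 KiB, maps each section from /dev/sgx_vepc, and carves the
/// contiguous EPC region out of the start of the device area.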
1979 #[cfg(target_arch = "x86_64")]
1980 pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
1981 let file = OpenOptions::new()
1982 .read(true)
1983 .open("/dev/sgx_provision")
1984 .map_err(Error::SgxProvisionOpen)?;
1985 self.vm
1986 .enable_sgx_attribute(file)
1987 .map_err(Error::SgxEnableProvisioning)?;
1988
1989 // Go over each EPC section and verify its size is a 4k multiple. At
1990 // the same time, calculate the total size needed for the contiguous
1991 // EPC region.
1992 let mut epc_region_size = 0;
1993 for epc_section in sgx_epc_config.iter() {
1994 if epc_section.size == 0 {
1995 return Err(Error::EpcSectionSizeInvalid);
1996 }
1997 if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
1998 return Err(Error::EpcSectionSizeInvalid);
1999 }
2000
2001 epc_region_size += epc_section.size;
2002 }
2003
2004 // Place the SGX EPC region on a 4k boundary between the RAM and the device area
2005 let epc_region_start =
2006 GuestAddress(self.start_of_device_area.0.div_ceil(SGX_PAGE_SIZE) * SGX_PAGE_SIZE);
2007
2008 self.start_of_device_area = epc_region_start
2009 .checked_add(epc_region_size)
2010 .ok_or(Error::GuestAddressOverFlow)?;
2011
2012 let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
2013 info!(
2014 "SGX EPC region: 0x{:x} (0x{:x})",
2015 epc_region_start.0, epc_region_size
2016 );
2017
2018 // Each section can be memory mapped into the allocated region.
2019 let mut epc_section_start = epc_region_start.raw_value();
2020 for epc_section in sgx_epc_config.iter() {
2021 let file = OpenOptions::new()
2022 .read(true)
2023 .write(true)
2024 .open("/dev/sgx_vepc")
2025 .map_err(Error::SgxVirtEpcOpen)?;
2026
2027 let prot = PROT_READ | PROT_WRITE;
2028 let mut flags = MAP_NORESERVE | MAP_SHARED;
2029 if epc_section.prefault {
2030 flags |= MAP_POPULATE;
2031 }
2032
2033 // We can't use the vm-memory crate to perform the memory mapping
2034 // here as it would try to ensure that the size of the backing file
2035 // matches the size of the expected mapping. The /dev/sgx_vepc
2036 // device does not work that way: it provides a file descriptor
2037 // whose size does not match the mapping size, as it's just a way to
2038 // let KVM know that an EPC section is being created for the guest.
2039 // SAFETY: FFI call with correct arguments
2040 let host_addr = unsafe {
2041 libc::mmap(
2042 std::ptr::null_mut(),
2043 epc_section.size as usize,
2044 prot,
2045 flags,
2046 file.as_raw_fd(),
2047 0,
2048 )
2049 };
2050
2051 if host_addr == libc::MAP_FAILED {
2052 error!(
2053 "Could not add SGX EPC section (size 0x{:x})",
2054 epc_section.size
2055 );
2056 return Err(Error::SgxEpcRangeAllocation);
2057 }
2058
2059 info!(
2060 "Adding SGX EPC section: 0x{:x} (0x{:x})",
2061 epc_section_start, epc_section.size
2062 );
2063
2064 let _mem_slot = self.create_userspace_mapping(
2065 epc_section_start,
2066 epc_section.size,
2067 host_addr as u64,
2068 false,
2069 false,
2070 false,
2071 )?;
2072
2073 sgx_epc_region.insert(
2074 epc_section.id.clone(),
2075 SgxEpcSection::new(
2076 GuestAddress(epc_section_start),
2077 epc_section.size as GuestUsize,
2078 ),
2079 );
2080
2081 epc_section_start += epc_section.size;
2082 }
2083
2084 self.sgx_epc_region = Some(sgx_epc_region);
2085
2086 Ok(())
2087 }
2088
2089 #[cfg(target_arch = "x86_64")]
2090 pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
2091 &self.sgx_epc_region
2092 }
2093
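/// Returns true if fstat() on the backing file succeeds and reports a
/// non-zero link count.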
2094 pub fn is_hardlink(f: &File) -> bool {
2095 let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
2096 // SAFETY: FFI call with correct arguments
2097 let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
2098 if ret != 0 {
2099 error!("Couldn't fstat the backing file");
2100 return false;
2101 }
2102
2103 // SAFETY: stat is valid
2104 unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
2105 }
2106
2107 pub fn memory_zones(&self) -> &MemoryZones {
2108 &self.memory_zones
2109 }
2110
2111 pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
2112 &mut self.memory_zones
2113 }
2114
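/// Builds the table of guest memory ranges to transfer. When `snapshot` is
/// true, ranges backed by shared mappings of hard-linked files are skipped,
/// since their contents are already persisted through the backing file.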
2115 pub fn memory_range_table(
2116 &self,
2117 snapshot: bool,
2118 ) -> std::result::Result<MemoryRangeTable, MigratableError> {
2119 let mut table = MemoryRangeTable::default();
2120
2121 for memory_zone in self.memory_zones.values() {
2122 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
2123 table.extend(virtio_mem_zone.plugged_ranges());
2124 }
2125
2126 for region in memory_zone.regions() {
2127 if snapshot {
2128 if let Some(file_offset) = region.file_offset() {
2129 if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
2130 && Self::is_hardlink(file_offset.file())
2131 {
2132 // In this very specific case, we know the memory
2133 // region is backed by a file on the host filesystem
2134 // that can be accessed by the user, and additionally
2135 // the mapping is shared, which means that modifications
2136 // to the content are written to the actual file.
2137 // When meeting these conditions, we can skip the
2138 // copy of the memory content for this specific region,
2139 // as we can assume the user will have it saved through
2140 // the backing file already.
2141 continue;
2142 }
2143 }
2144 }
2145
2146 table.push(MemoryRange {
2147 gpa: region.start_addr().raw_value(),
2148 length: region.len(),
2149 });
2150 }
2151 }
2152
2153 Ok(table)
2154 }
2155
2156 pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
2157 MemoryManagerSnapshotData {
2158 memory_ranges: self.snapshot_memory_ranges.clone(),
2159 guest_ram_mappings: self.guest_ram_mappings.clone(),
2160 start_of_device_area: self.start_of_device_area.0,
2161 boot_ram: self.boot_ram,
2162 current_ram: self.current_ram,
2163 arch_mem_regions: self.arch_mem_regions.clone(),
2164 hotplug_slots: self.hotplug_slots.clone(),
2165 next_memory_slot: self.next_memory_slot.load(Ordering::SeqCst),
2166 selected_slot: self.selected_slot,
2167 next_hotplug_slot: self.next_hotplug_slot,
2168 }
2169 }
2170
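/// Returns the raw fd of the backing file for every guest RAM mapping,
/// keyed by hypervisor memory slot.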
2171 pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
2172 let mut memory_slot_fds = HashMap::new();
2173 for guest_ram_mapping in &self.guest_ram_mappings {
2174 let slot = guest_ram_mapping.slot;
2175 let guest_memory = self.guest_memory.memory();
2176 let file = guest_memory
2177 .find_region(GuestAddress(guest_ram_mapping.gpa))
2178 .unwrap()
2179 .file_offset()
2180 .unwrap()
2181 .file();
2182 memory_slot_fds.insert(slot, file.as_raw_fd());
2183 }
2184 memory_slot_fds
2185 }
2186
2187 pub fn acpi_address(&self) -> Option<GuestAddress> {
2188 self.acpi_address
2189 }
2190
2191 pub fn num_guest_ram_mappings(&self) -> u32 {
2192 self.guest_ram_mappings.len() as u32
2193 }
2194
2195 #[cfg(target_arch = "aarch64")]
2196 pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
2197 self.uefi_flash.as_ref().unwrap().clone()
2198 }
2199
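/// Builds the list of RAM regions for a guest coredump, sorted by GPA, with
/// each region's offset into the ELF core file starting at `mem_offset`.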
2200 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2201 pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
2202 let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
2203 mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);
2204
2205 let mut mem_offset_in_elf = mem_offset;
2206 let mut ram_maps = BTreeMap::new();
2207 for mapping in mapping_sorted_by_gpa.iter() {
2208 ram_maps.insert(
2209 mapping.gpa,
2210 CoredumpMemoryRegion {
2211 mem_offset_in_elf,
2212 mem_size: mapping.size,
2213 },
2214 );
2215 mem_offset_in_elf += mapping.size;
2216 }
2217
2218 CoredumpMemoryRegions { ram_maps }
2219 }
2220
2221 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2222 pub fn coredump_iterate_save_mem(
2223 &mut self,
2224 dump_state: &DumpState,
2225 ) -> std::result::Result<(), GuestDebuggableError> {
2226 let snapshot_memory_ranges = self
2227 .memory_range_table(false)
2228 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2229
2230 if snapshot_memory_ranges.is_empty() {
2231 return Ok(());
2232 }
2233
2234 let coredump_file = dump_state.file.as_ref().unwrap();
2235
2236 let guest_memory = self.guest_memory.memory();
2237 let mut total_bytes: u64 = 0;
2238
2239 for range in snapshot_memory_ranges.regions() {
2240 let mut offset: u64 = 0;
2241 loop {
2242 let bytes_written = guest_memory
2243 .write_volatile_to(
2244 GuestAddress(range.gpa + offset),
2245 &mut coredump_file.as_fd(),
2246 (range.length - offset) as usize,
2247 )
2248 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2249 offset += bytes_written as u64;
2250 total_bytes += bytes_written as u64;
2251
2252 if offset == range.length {
2253 break;
2254 }
2255 }
2256 }
2257
2258 debug!("coredump total bytes {}", total_bytes);
2259 Ok(())
2260 }
2261
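/// Reads the memory content for each range in `ranges` from `fd` and writes
/// it into guest memory at the corresponding GPA. Used on the receiving side
/// of a migration.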
2262 pub fn receive_memory_regions<F>(
2263 &mut self,
2264 ranges: &MemoryRangeTable,
2265 fd: &mut F,
2266 ) -> std::result::Result<(), MigratableError>
2267 where
2268 F: ReadVolatile,
2269 {
2270 let guest_memory = self.guest_memory();
2271 let mem = guest_memory.memory();
2272
2273 for range in ranges.regions() {
2274 let mut offset: u64 = 0;
2275 // Here we are manually handling the retry in case we can't read the
2276 // whole region at once because we can't use the implementation
2277 // from vm-memory::GuestMemory of read_exact_from() as it is not
2278 // following the correct behavior. For more info about this issue
2279 // see: https://github.com/rust-vmm/vm-memory/issues/174
2280 loop {
2281 let bytes_read = mem
2282 .read_volatile_from(
2283 GuestAddress(range.gpa + offset),
2284 fd,
2285 (range.length - offset) as usize,
2286 )
2287 .map_err(|e| {
2288 MigratableError::MigrateReceive(anyhow!(
2289 "Error receiving memory from socket: {}",
2290 e
2291 ))
2292 })?;
2293 offset += bytes_read as u64;
2294
2295 if offset == range.length {
2296 break;
2297 }
2298 }
2299 }
2300
2301 Ok(())
2302 }
2303 }
2304
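// ACPI helper that emits a Notify on memory device M<slot_id> when the MTFY
// method is invoked with the matching slot id.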
2305 struct MemoryNotify {
2306 slot_id: usize,
2307 }
2308
2309 impl Aml for MemoryNotify {
2310 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2311 let object = aml::Path::new(&format!("M{:03}", self.slot_id));
2312 aml::If::new(
2313 &aml::Equal::new(&aml::Arg(0), &self.slot_id),
2314 vec![&aml::Notify::new(&object, &aml::Arg(1))],
2315 )
2316 .to_aml_bytes(sink)
2317 }
2318 }
2319
2320 struct MemorySlot {
2321 slot_id: usize,
2322 }
2323
2324 impl Aml for MemorySlot {
2325 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2326 aml::Device::new(
2327 format!("M{:03}", self.slot_id).as_str().into(),
2328 vec![
2329 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")),
2330 &aml::Name::new("_UID".into(), &self.slot_id),
2331 /*
2332 _STA return value:
2333 Bit [0] – Set if the device is present.
2334 Bit [1] – Set if the device is enabled and decoding its resources.
2335 Bit [2] – Set if the device should be shown in the UI.
2336 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
2337 Bit [4] – Set if the battery is present.
2338 Bits [31:5] – Reserved (must be cleared).
2339 */
2340 &aml::Method::new(
2341 "_STA".into(),
2342 0,
2343 false,
2344 // Call into MSTA method which will interrogate device
2345 vec![&aml::Return::new(&aml::MethodCall::new(
2346 "MSTA".into(),
2347 vec![&self.slot_id],
2348 ))],
2349 ),
2350 // Get details of memory
2351 &aml::Method::new(
2352 "_CRS".into(),
2353 0,
2354 false,
2355 // Call into MCRS which provides actual memory details
2356 vec![&aml::Return::new(&aml::MethodCall::new(
2357 "MCRS".into(),
2358 vec![&self.slot_id],
2359 ))],
2360 ),
2361 ],
2362 )
2363 .to_aml_bytes(sink)
2364 }
2365 }
2366
2367 struct MemorySlots {
2368 slots: usize,
2369 }
2370
2371 impl Aml for MemorySlots {
2372 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2373 for slot_id in 0..self.slots {
2374 MemorySlot { slot_id }.to_aml_bytes(sink);
2375 }
2376 }
2377 }
2378
2379 struct MemoryMethods {
2380 slots: usize,
2381 }
2382
2383 impl Aml for MemoryMethods {
2384 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2385 // Add "MTFY" notification method
2386 let mut memory_notifies = Vec::new();
2387 for slot_id in 0..self.slots {
2388 memory_notifies.push(MemoryNotify { slot_id });
2389 }
2390
2391 let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new();
2392 for memory_notifier in memory_notifies.iter() {
2393 memory_notifies_refs.push(memory_notifier);
2394 }
2395
2396 aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink);
2397
2398 // MSCN method
2399 aml::Method::new(
2400 "MSCN".into(),
2401 0,
2402 true,
2403 vec![
2404 // Take lock defined above
2405 &aml::Acquire::new("MLCK".into(), 0xffff),
2406 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2407 &aml::While::new(
2408 &aml::LessThan::new(&aml::Local(0), &self.slots),
2409 vec![
2410 // Write slot number (in first argument) to I/O port via field
2411 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
2412 // Check if MINS bit is set (inserting)
2413 &aml::If::new(
2414 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2415 // Notify device if it is
2416 vec![
2417 &aml::MethodCall::new(
2418 "MTFY".into(),
2419 vec![&aml::Local(0), &aml::ONE],
2420 ),
2421 // Reset MINS bit
2422 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2423 ],
2424 ),
2425 // Check if MRMV bit is set
2426 &aml::If::new(
2427 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2428 // Notify device if it is (with the eject constant 0x3)
2429 vec![
2430 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
2431 // Reset MRMV bit
2432 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2433 ],
2434 ),
2435 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2436 ],
2437 ),
2438 // Release lock
2439 &aml::Release::new("MLCK".into()),
2440 ],
2441 )
2442 .to_aml_bytes(sink);
2443
2444 // Memory status method
2445 aml::Method::new(
2446 "MSTA".into(),
2447 1,
2448 true,
2449 vec![
2450 // Take lock defined above
2451 &aml::Acquire::new("MLCK".into(), 0xffff),
2452 // Write slot number (in first argument) to I/O port via field
2453 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2454 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2455 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
2456 &aml::If::new(
2457 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
2458 vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2459 ),
2460 // Release lock
2461 &aml::Release::new("MLCK".into()),
2462 // Return 0 or 0xf
2463 &aml::Return::new(&aml::Local(0)),
2464 ],
2465 )
2466 .to_aml_bytes(sink);
2467
2468 // Memory range method
2469 aml::Method::new(
2470 "MCRS".into(),
2471 1,
2472 true,
2473 vec![
2474 // Take lock defined above
2475 &aml::Acquire::new("MLCK".into(), 0xffff),
2476 // Write slot number (in first argument) to I/O port via field
2477 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2478 &aml::Name::new(
2479 "MR64".into(),
2480 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2481 aml::AddressSpaceCacheable::Cacheable,
2482 true,
2483 0x0000_0000_0000_0000u64,
2484 0xFFFF_FFFF_FFFF_FFFEu64,
2485 None,
2486 )]),
2487 ),
2488 &aml::CreateQWordField::new(
2489 &aml::Path::new("MINL"),
2490 &aml::Path::new("MR64"),
2491 &14usize,
2492 ),
2493 &aml::CreateDWordField::new(
2494 &aml::Path::new("MINH"),
2495 &aml::Path::new("MR64"),
2496 &18usize,
2497 ),
2498 &aml::CreateQWordField::new(
2499 &aml::Path::new("MAXL"),
2500 &aml::Path::new("MR64"),
2501 &22usize,
2502 ),
2503 &aml::CreateDWordField::new(
2504 &aml::Path::new("MAXH"),
2505 &aml::Path::new("MR64"),
2506 &26usize,
2507 ),
2508 &aml::CreateQWordField::new(
2509 &aml::Path::new("LENL"),
2510 &aml::Path::new("MR64"),
2511 &38usize,
2512 ),
2513 &aml::CreateDWordField::new(
2514 &aml::Path::new("LENH"),
2515 &aml::Path::new("MR64"),
2516 &42usize,
2517 ),
2518 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
2519 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
2520 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
2521 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
2522 &aml::Add::new(
2523 &aml::Path::new("MAXL"),
2524 &aml::Path::new("MINL"),
2525 &aml::Path::new("LENL"),
2526 ),
2527 &aml::Add::new(
2528 &aml::Path::new("MAXH"),
2529 &aml::Path::new("MINH"),
2530 &aml::Path::new("LENH"),
2531 ),
2532 &aml::If::new(
2533 &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
2534 vec![&aml::Add::new(
2535 &aml::Path::new("MAXH"),
2536 &aml::ONE,
2537 &aml::Path::new("MAXH"),
2538 )],
2539 ),
2540 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
2541 // Release lock
2542 &aml::Release::new("MLCK".into()),
2543 &aml::Return::new(&aml::Path::new("MR64")),
2544 ],
2545 )
2546 .to_aml_bytes(sink)
2547 }
2548 }
2549
2550 impl Aml for MemoryManager {
2551 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2552 if let Some(acpi_address) = self.acpi_address {
2553 // Memory Hotplug Controller
2554 aml::Device::new(
2555 "_SB_.MHPC".into(),
2556 vec![
2557 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2558 &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2559 // Mutex to protect concurrent access as we write to choose slot and then read back status
2560 &aml::Mutex::new("MLCK".into(), 0),
2561 &aml::Name::new(
2562 "_CRS".into(),
2563 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2564 aml::AddressSpaceCacheable::NotCacheable,
2565 true,
2566 acpi_address.0,
2567 acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
2568 None,
2569 )]),
2570 ),
2571 // OpRegion and Fields map MMIO range into individual field values
2572 &aml::OpRegion::new(
2573 "MHPR".into(),
2574 aml::OpRegionSpace::SystemMemory,
2575 &(acpi_address.0 as usize),
2576 &MEMORY_MANAGER_ACPI_SIZE,
2577 ),
2578 &aml::Field::new(
2579 "MHPR".into(),
2580 aml::FieldAccessType::DWord,
2581 aml::FieldLockRule::NoLock,
2582 aml::FieldUpdateRule::Preserve,
2583 vec![
2584 aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
2585 aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
2586 aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
2587 aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
2588 ],
2589 ),
2590 &aml::Field::new(
2591 "MHPR".into(),
2592 aml::FieldAccessType::DWord,
2593 aml::FieldLockRule::NoLock,
2594 aml::FieldUpdateRule::Preserve,
2595 vec![
2596 aml::FieldEntry::Reserved(128),
2597 aml::FieldEntry::Named(*b"MHPX", 32), // PXM
2598 ],
2599 ),
2600 &aml::Field::new(
2601 "MHPR".into(),
2602 aml::FieldAccessType::Byte,
2603 aml::FieldLockRule::NoLock,
2604 aml::FieldUpdateRule::WriteAsZeroes,
2605 vec![
2606 aml::FieldEntry::Reserved(160),
2607 aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
2608 aml::FieldEntry::Named(*b"MINS", 1), // Inserting
2609 aml::FieldEntry::Named(*b"MRMV", 1), // Removing
2610 aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
2611 ],
2612 ),
2613 &aml::Field::new(
2614 "MHPR".into(),
2615 aml::FieldAccessType::DWord,
2616 aml::FieldLockRule::NoLock,
2617 aml::FieldUpdateRule::Preserve,
2618 vec![
2619 aml::FieldEntry::Named(*b"MSEL", 32), // Selector
2620 aml::FieldEntry::Named(*b"MOEV", 32), // Event
2621 aml::FieldEntry::Named(*b"MOSC", 32), // OSC
2622 ],
2623 ),
2624 &MemoryMethods {
2625 slots: self.hotplug_slots.len(),
2626 },
2627 &MemorySlots {
2628 slots: self.hotplug_slots.len(),
2629 },
2630 ],
2631 )
2632 .to_aml_bytes(sink);
2633 } else {
2634 aml::Device::new(
2635 "_SB_.MHPC".into(),
2636 vec![
2637 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2638 &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2639 // Empty MSCN for GED
2640 &aml::Method::new("MSCN".into(), 0, true, vec![]),
2641 ],
2642 )
2643 .to_aml_bytes(sink);
2644 }
2645
2646 #[cfg(target_arch = "x86_64")]
2647 {
2648 if let Some(sgx_epc_region) = &self.sgx_epc_region {
2649 let min = sgx_epc_region.start().raw_value();
2650 let max = min + sgx_epc_region.size() - 1;
2651 // SGX EPC region
2652 aml::Device::new(
2653 "_SB_.EPC_".into(),
2654 vec![
2655 &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")),
2656 // QWORD describing the EPC region start and size
2657 &aml::Name::new(
2658 "_CRS".into(),
2659 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2660 aml::AddressSpaceCacheable::NotCacheable,
2661 true,
2662 min,
2663 max,
2664 None,
2665 )]),
2666 ),
2667 &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
2668 ],
2669 )
2670 .to_aml_bytes(sink);
2671 }
2672 }
2673 }
2674 }
2675
2676 impl Pausable for MemoryManager {}
2677
2678 #[derive(Clone, Serialize, Deserialize)]
2679 pub struct MemoryManagerSnapshotData {
2680 memory_ranges: MemoryRangeTable,
2681 guest_ram_mappings: Vec<GuestRamMapping>,
2682 start_of_device_area: u64,
2683 boot_ram: u64,
2684 current_ram: u64,
2685 arch_mem_regions: Vec<ArchMemRegion>,
2686 hotplug_slots: Vec<HotPlugState>,
2687 next_memory_slot: u32,
2688 selected_slot: usize,
2689 next_hotplug_slot: usize,
2690 }
2691
2692 impl Snapshottable for MemoryManager {
2693 fn id(&self) -> String {
2694 MEMORY_MANAGER_SNAPSHOT_ID.to_string()
2695 }
2696
2697 fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
2698 let memory_ranges = self.memory_range_table(true)?;
2699
2700 // Store locally this list of ranges as it will be used through the
2701 // Transportable::send() implementation. The point is to avoid the
2702 // duplication of code regarding the creation of the path for each
2703 // region. The 'snapshot' step creates the list of memory regions,
2704 // including information about the need to copy a memory region or
2705 // not. This saves the 'send' step having to go through the same
2706 // process, and instead it can directly proceed with storing the
2707 // memory range content for the ranges requiring it.
2708 self.snapshot_memory_ranges = memory_ranges;
2709
2710 Ok(Snapshot::from_data(SnapshotData::new_from_state(
2711 &self.snapshot_data(),
2712 )?))
2713 }
2714 }
2715
2716 impl Transportable for MemoryManager {
2717 fn send(
2718 &self,
2719 _snapshot: &Snapshot,
2720 destination_url: &str,
2721 ) -> result::Result<(), MigratableError> {
2722 if self.snapshot_memory_ranges.is_empty() {
2723 return Ok(());
2724 }
2725
2726 let mut memory_file_path = url_to_path(destination_url)?;
2727 memory_file_path.push(String::from(SNAPSHOT_FILENAME));
2728
2729 // Create the snapshot file for the entire memory
2730 let mut memory_file = OpenOptions::new()
2731 .read(true)
2732 .write(true)
2733 .create_new(true)
2734 .open(memory_file_path)
2735 .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2736
2737 let guest_memory = self.guest_memory.memory();
2738
2739 for range in self.snapshot_memory_ranges.regions() {
2740 let mut offset: u64 = 0;
2741 // Here we are manually handling the retry in case we can't read
2742 // the whole region at once because we can't use the implementation
2743 // from vm-memory::GuestMemory of write_all_to() as it is not
2744 // following the correct behavior. For more info about this issue
2745 // see: https://github.com/rust-vmm/vm-memory/issues/174
2746 loop {
2747 let bytes_written = guest_memory
2748 .write_volatile_to(
2749 GuestAddress(range.gpa + offset),
2750 &mut memory_file,
2751 (range.length - offset) as usize,
2752 )
2753 .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2754 offset += bytes_written as u64;
2755
2756 if offset == range.length {
2757 break;
2758 }
2759 }
2760 }
2761 Ok(())
2762 }
2763 }
2764
2765 impl Migratable for MemoryManager {
2766 // Start the dirty log in the hypervisor (kvm/mshv).
2767 // Also, reset the dirty bitmap logged by the vmm.
2768 // Just before we do a bulk copy we want to start/clear the dirty log so that
2769 // pages touched during our bulk copy are tracked.
2770 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2771 self.vm.start_dirty_log().map_err(|e| {
2772 MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
2773 })?;
2774
2775 for r in self.guest_memory.memory().iter() {
2776 (**r).bitmap().reset();
2777 }
2778
2779 Ok(())
2780 }
2781
2782 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2783 self.vm.stop_dirty_log().map_err(|e| {
2784 MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
2785 })?;
2786
2787 Ok(())
2788 }
2789
2790 // Generate a table for the pages that are dirty. The dirty pages are collapsed
2791 // together in the table if they are contiguous.
2792 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2793 let mut table = MemoryRangeTable::default();
2794 for r in &self.guest_ram_mappings {
2795 let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
2796 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
2797 })?;
2798 let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
2799 {
2800 Some(region) => {
2801 assert!(region.start_addr().raw_value() == r.gpa);
2802 assert!(region.len() == r.size);
2803 (**region).bitmap().get_and_reset()
2804 }
2805 None => {
2806 return Err(MigratableError::MigrateSend(anyhow!(
2807 "Error finding 'guest memory region' with address {:x}",
2808 r.gpa
2809 )))
2810 }
2811 };
2812
2813 let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
2814 .iter()
2815 .zip(vmm_dirty_bitmap.iter())
2816 .map(|(x, y)| x | y)
2817 .collect();
2818
2819 let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);
2820
2821 if sub_table.regions().is_empty() {
2822 info!("Dirty Memory Range Table is empty");
2823 } else {
2824 info!("Dirty Memory Range Table:");
2825 for range in sub_table.regions() {
2826 info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
2827 }
2828 }
2829
2830 table.extend(sub_table);
2831 }
2832 Ok(table)
2833 }
2834 }
2835