// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
#[cfg(target_arch = "x86_64")]
use crate::config::SgxEpcConfig;
use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
#[cfg(feature = "guest_debug")]
use crate::coredump::{CoredumpMemoryRegion, CoredumpMemoryRegions};
#[cfg(feature = "guest_debug")]
use crate::coredump::{DumpState, GuestDebuggableError};
use crate::migration::url_to_path;
use crate::MEMORY_MANAGER_SNAPSHOT_ID;
use crate::{GuestMemoryMmap, GuestRegionMmap};
use acpi_tables::{aml, aml::Aml};
use anyhow::anyhow;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
use arch::{layout, RegionType};
#[cfg(target_arch = "x86_64")]
use devices::ioapic;
#[cfg(target_arch = "aarch64")]
use hypervisor::HypervisorVmError;
#[cfg(target_arch = "x86_64")]
use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
use serde::{Deserialize, Serialize};
#[cfg(feature = "guest_debug")]
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::convert::TryInto;
use std::ffi;
use std::fs::{File, OpenOptions};
use std::io;
use std::ops::Deref;
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
use std::path::PathBuf;
use std::result;
use std::sync::{Arc, Barrier, Mutex};
use tracer::trace_scoped;
use versionize::{VersionMap, Versionize, VersionizeResult};
use versionize_derive::Versionize;
use virtio_devices::BlocksState;
#[cfg(target_arch = "x86_64")]
use vm_allocator::GsiApic;
use vm_allocator::{AddressAllocator, SystemAllocator};
use vm_device::BusDevice;
use vm_memory::bitmap::AtomicBitmap;
use vm_memory::guest_memory::FileOffset;
use vm_memory::{
    mmap::MmapRegionError, Address, Bytes, Error as MmapError, GuestAddress, GuestAddressSpace,
    GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion,
};
use vm_migration::{
    protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable,
    Snapshot, SnapshotDataSection, Snapshottable, Transportable, VersionMapped,
};

pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;

const DEFAULT_MEMORY_ZONE: &str = "mem0";

const SNAPSHOT_FILENAME: &str = "memory-ranges";

#[cfg(target_arch = "x86_64")]
const X86_64_IRQ_BASE: u32 = 5;

#[cfg(target_arch = "x86_64")]
const SGX_PAGE_SIZE: u64 = 1 << 12;

const HOTPLUG_COUNT: usize = 8;

// Memory policy constants
const MPOL_BIND: u32 = 2;
const MPOL_MF_STRICT: u32 = 1;
const MPOL_MF_MOVE: u32 = 1 << 1;

// Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;

#[derive(Clone, Default, Serialize, Deserialize, Versionize)]
struct HotPlugState {
    base: u64,
    length: u64,
    active: bool,
    inserting: bool,
    removing: bool,
}

pub struct VirtioMemZone {
    region: Arc<GuestRegionMmap>,
    virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
    hotplugged_size: u64,
    hugepages: bool,
    blocks_state: Arc<Mutex<BlocksState>>,
}

impl VirtioMemZone {
    pub fn region(&self) -> &Arc<GuestRegionMmap> {
        &self.region
    }
    pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
        self.virtio_device = Some(virtio_device);
    }
    pub fn hotplugged_size(&self) -> u64 {
        self.hotplugged_size
    }
    pub fn hugepages(&self) -> bool {
        self.hugepages
    }
    pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
        &self.blocks_state
    }
    pub fn plugged_ranges(&self) -> MemoryRangeTable {
        self.blocks_state
            .lock()
            .unwrap()
            .memory_ranges(self.region.start_addr().raw_value(), true)
    }
}

#[derive(Default)]
pub struct MemoryZone {
    regions: Vec<Arc<GuestRegionMmap>>,
    virtio_mem_zone: Option<VirtioMemZone>,
}

impl MemoryZone {
    pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
        &self.regions
    }
    pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
        &self.virtio_mem_zone
    }
    pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
        self.virtio_mem_zone.as_mut()
    }
}

pub type MemoryZones = HashMap<String, MemoryZone>;

#[derive(Clone, Serialize, Deserialize, Versionize)]
struct GuestRamMapping {
    slot: u32,
    gpa: u64,
    size: u64,
    zone_id: String,
    virtio_mem: bool,
    file_offset: u64,
}

#[derive(Clone, Serialize, Deserialize, Versionize)]
struct ArchMemRegion {
    base: u64,
    size: usize,
    r_type: RegionType,
}

pub struct MemoryManager {
    boot_guest_memory: GuestMemoryMmap,
    guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
    next_memory_slot: u32,
    start_of_device_area: GuestAddress,
    end_of_device_area: GuestAddress,
    end_of_ram_area: GuestAddress,
    pub vm: Arc<dyn hypervisor::Vm>,
    hotplug_slots: Vec<HotPlugState>,
    selected_slot: usize,
    mergeable: bool,
    allocator: Arc<Mutex<SystemAllocator>>,
    hotplug_method: HotplugMethod,
    boot_ram: u64,
    current_ram: u64,
    next_hotplug_slot: usize,
    shared: bool,
    hugepages: bool,
    hugepage_size: Option<u64>,
    prefault: bool,
    #[cfg(target_arch = "x86_64")]
    sgx_epc_region: Option<SgxEpcRegion>,
    user_provided_zones: bool,
    snapshot_memory_ranges: MemoryRangeTable,
    memory_zones: MemoryZones,
    log_dirty: bool, // Enable dirty logging for created RAM regions
    arch_mem_regions: Vec<ArchMemRegion>,
    ram_allocator: AddressAllocator,
    dynamic: bool,

    // Keep track of calls to create_userspace_mapping() for guest RAM.
    // This is useful for getting the dirty pages as we need to know the
    // slots that the mapping is created in.
    guest_ram_mappings: Vec<GuestRamMapping>,

    pub acpi_address: Option<GuestAddress>,
    #[cfg(target_arch = "aarch64")]
    uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
}

#[derive(Debug)]
pub enum Error {
    /// Failed to create shared file.
    SharedFileCreate(io::Error),

    /// Failed to set shared file length.
    SharedFileSetLen(io::Error),

    /// Mmap backed guest memory error
    GuestMemory(MmapError),

    /// Failed to allocate a memory range.
    MemoryRangeAllocation,

    /// Error from region creation
    GuestMemoryRegion(MmapRegionError),

    /// No ACPI slot available
    NoSlotAvailable,

    /// Not enough space in the hotplug RAM region
    InsufficientHotplugRam,

    /// The requested hotplug memory addition is not a valid size
    InvalidSize,

    /// Failed to create the user memory region.
    CreateUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to remove the user memory region.
    RemoveUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to create the EventFd.
    EventFdFail(io::Error),

    /// Eventfd write error
    EventfdError(io::Error),

    /// Failed to resize the virtio-mem region
    VirtioMemResizeFail(virtio_devices::mem::Error),

    /// Cannot restore VM
    Restore(MigratableError),

    /// Cannot restore VM because source URL is missing
    RestoreMissingSourceUrl,

    /// Cannot create the system allocator
    CreateSystemAllocator,

    /// Invalid SGX EPC section size
    #[cfg(target_arch = "x86_64")]
    EpcSectionSizeInvalid,

    /// Failed allocating SGX EPC region
    #[cfg(target_arch = "x86_64")]
    SgxEpcRangeAllocation,

    /// Failed opening SGX virtual EPC device
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcOpen(io::Error),

    /// Failed setting the SGX virtual EPC section size
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcFileSetLen(io::Error),

    /// Failed opening SGX provisioning device
    #[cfg(target_arch = "x86_64")]
    SgxProvisionOpen(io::Error),

    /// Failed enabling SGX provisioning
    #[cfg(target_arch = "x86_64")]
    SgxEnableProvisioning(hypervisor::HypervisorVmError),

    /// Failed creating a new MmapRegion instance.
    #[cfg(target_arch = "x86_64")]
    NewMmapRegion(vm_memory::mmap::MmapRegionError),

    /// No memory zones found.
    MissingMemoryZones,

    /// Memory configuration is not valid.
    InvalidMemoryParameters,

    /// Forbidden operation. Impossible to resize guest memory if it is
    /// backed by user defined memory regions.
    InvalidResizeWithMemoryZones,

    /// It's invalid to try applying a NUMA policy to a memory zone that is
    /// memory mapped with MAP_SHARED.
    InvalidSharedMemoryZoneWithHostNuma,

    /// Failed applying NUMA memory policy.
    ApplyNumaPolicy(io::Error),

    /// Memory zone identifier is not unique.
    DuplicateZoneId,

    /// No virtio-mem resizing handler found.
    MissingVirtioMemHandler,

    /// Unknown memory zone.
    UnknownMemoryZone,

    /// Invalid size for resizing. Can be anything except 0.
    InvalidHotplugSize,

    /// Invalid hotplug method associated with memory zones resizing capability.
    InvalidHotplugMethodWithMemoryZones,

    /// Could not find specified memory zone identifier from hash map.
    MissingZoneIdentifier,

    /// Resizing the memory zone failed.
    ResizeZone,

    /// Guest address overflow
    GuestAddressOverFlow,

    /// Error opening snapshot file
    SnapshotOpen(io::Error),

    /// Error copying snapshot into region
    SnapshotCopy(GuestMemoryError),

    /// Failed to allocate MMIO address
    AllocateMmioAddress,

    #[cfg(target_arch = "aarch64")]
    /// Failed to create UEFI flash
    CreateUefiFlash(HypervisorVmError),
}

const ENABLE_FLAG: usize = 0;
const INSERTING_FLAG: usize = 1;
const REMOVING_FLAG: usize = 2;
const EJECT_FLAG: usize = 3;

const BASE_OFFSET_LOW: u64 = 0;
const BASE_OFFSET_HIGH: u64 = 0x4;
const LENGTH_OFFSET_LOW: u64 = 0x8;
const LENGTH_OFFSET_HIGH: u64 = 0xC;
const STATUS_OFFSET: u64 = 0x14;
const SELECTION_OFFSET: u64 = 0;

// The MMIO address space size is reduced by 64k. This is done for the
// following reasons:
// - Reduce the addressable space size by at least 4k to work around a Linux
//   bug when the VMM allocates devices at the end of the addressable space
// - Windows requires the addressable space size to be 64k aligned
fn mmio_address_space_size(phys_bits: u8) -> u64 {
    (1 << phys_bits) - (1 << 16)
}

impl BusDevice for MemoryManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        if self.selected_slot < self.hotplug_slots.len() {
            let state = &self.hotplug_slots[self.selected_slot];
            match offset {
                BASE_OFFSET_LOW => {
                    data.copy_from_slice(&state.base.to_le_bytes()[..4]);
                }
                BASE_OFFSET_HIGH => {
                    data.copy_from_slice(&state.base.to_le_bytes()[4..]);
                }
                LENGTH_OFFSET_LOW => {
                    data.copy_from_slice(&state.length.to_le_bytes()[..4]);
                }
                LENGTH_OFFSET_HIGH => {
                    data.copy_from_slice(&state.length.to_le_bytes()[4..]);
                }
                STATUS_OFFSET => {
                    // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
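                    // The status byte packs the slot state into the low bits defined
                    // above: bit 0 (ENABLE_FLAG) marks the slot as active, bit 1
                    // (INSERTING_FLAG) flags a pending insertion and bit 2
                    // (REMOVING_FLAG) a pending removal. For example, a freshly
                    // hotplugged slot that the guest has not acknowledged yet reads
                    // back as 0b0000_0011 (active + inserting).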
                    data.fill(0);
                    if state.active {
                        data[0] |= 1 << ENABLE_FLAG;
                    }
                    if state.inserting {
                        data[0] |= 1 << INSERTING_FLAG;
                    }
                    if state.removing {
                        data[0] |= 1 << REMOVING_FLAG;
                    }
                }
                _ => {
                    warn!(
                        "Unexpected offset for accessing memory manager device: {:#}",
                        offset
                    );
                }
            }
        } else {
            warn!("Out of range memory slot: {}", self.selected_slot);
        }
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            SELECTION_OFFSET => {
                self.selected_slot = usize::from(data[0]);
            }
            STATUS_OFFSET => {
                if self.selected_slot < self.hotplug_slots.len() {
                    let state = &mut self.hotplug_slots[self.selected_slot];
                    // The ACPI code writes back a 1 to acknowledge the insertion
                    if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
                        state.inserting = false;
                    }
                    // Ditto for removal
                    if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
                        state.removing = false;
                    }
                    // Trigger removal of "DIMM"
                    if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
                        warn!("Ejection of memory not currently supported");
                    }
                } else {
                    warn!("Out of range memory slot: {}", self.selected_slot);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing memory manager device: {:#}",
                    offset
                );
            }
        };
        None
    }
}

impl MemoryManager {
    /// Creates all memory regions based on the available RAM ranges defined
    /// by `ram_regions`, and based on the description of the memory zones.
    /// In practice, this function can perform multiple memory mappings of the
    /// same backing file if there's a hole in the address space between two
    /// RAM ranges.
    /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G)
    /// and zones containing two zones (size 1G and size 4G).
    /// This function will create 3 resulting memory regions:
    /// - First one mapping entirely the first memory zone on 0-1G range
    /// - Second one mapping partially the second memory zone on 1G-3G range
    /// - Third one mapping partially the second memory zone on 4G-6G range
    fn create_memory_regions_from_zones(
        ram_regions: &[(GuestAddress, usize)],
        zones: &[MemoryZoneConfig],
        prefault: Option<bool>,
    ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
        let mut zones = zones.to_owned();
        let mut mem_regions = Vec::new();
        let mut zone = zones.remove(0);
        let mut zone_offset = 0;
        let mut memory_zones = HashMap::new();

        // Add zone id to the list of memory zones.
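        // Worked example, following the doc comment above: with
        // ram_regions = [(0, 3G), (4G, 2G)] and zones = [1G, 4G],
        //   - pass 1: the 1G zone fits within the first RAM range, so a 0-1G
        //     region is created and the next zone is pulled in;
        //   - pass 2: the 4G zone is larger than the 2G left in the first RAM
        //     range, so a 1G-3G region is created and that range is consumed;
        //   - pass 3: the remaining 2G of the zone exactly covers the second
        //     RAM range, producing the final 4G-6G region.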
        memory_zones.insert(zone.id.clone(), MemoryZone::default());

        for ram_region in ram_regions.iter() {
            let mut ram_region_offset = 0;
            let mut exit = false;

            loop {
                let mut ram_region_consumed = false;
                let mut pull_next_zone = false;

                let ram_region_sub_size = ram_region.1 - ram_region_offset;
                let zone_sub_size = zone.size as usize - zone_offset;

                let file_offset = zone_offset as u64;
                let region_start = ram_region
                    .0
                    .checked_add(ram_region_offset as u64)
                    .ok_or(Error::GuestAddressOverFlow)?;
                let region_size = if zone_sub_size <= ram_region_sub_size {
                    if zone_sub_size == ram_region_sub_size {
                        ram_region_consumed = true;
                    }

                    ram_region_offset += zone_sub_size;
                    pull_next_zone = true;

                    zone_sub_size
                } else {
                    zone_offset += ram_region_sub_size;
                    ram_region_consumed = true;

                    ram_region_sub_size
                };

                let region = MemoryManager::create_ram_region(
                    &zone.file,
                    file_offset,
                    region_start,
                    region_size,
                    match prefault {
                        Some(pf) => pf,
                        None => zone.prefault,
                    },
                    zone.shared,
                    zone.hugepages,
                    zone.hugepage_size,
                    zone.host_numa_node,
                    None,
                )?;

                // Add region to the list of regions associated with the
                // current memory zone.
                if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
                    memory_zone.regions.push(region.clone());
                }

                mem_regions.push(region);

                if pull_next_zone {
                    // Get the next zone and reset the offset.
                    zone_offset = 0;
                    if zones.is_empty() {
                        exit = true;
                        break;
                    }
                    zone = zones.remove(0);

                    // Check if the zone id already exists. In case it does, throw
                    // an error as we need unique identifiers. Otherwise, add
                    // the new zone id to the list of memory zones.
                    if memory_zones.contains_key(&zone.id) {
                        error!(
                            "Memory zone identifier '{}' found more than once. \
                            It must be unique",
                            zone.id,
                        );
                        return Err(Error::DuplicateZoneId);
                    }
                    memory_zones.insert(zone.id.clone(), MemoryZone::default());
                }

                if ram_region_consumed {
                    break;
                }
            }

            if exit {
                break;
            }
        }

        Ok((mem_regions, memory_zones))
    }

    // Restore both GuestMemory regions along with MemoryZone zones.
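    // Each GuestRamMapping recorded at snapshot time carries the slot, GPA,
    // size, zone id and file offset of a mapping, which is enough to rebuild
    // the exact same regions against the zone configs, optionally reusing the
    // memory files handed over for the restore (keyed by memory slot).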
548 fn restore_memory_regions_and_zones( 549 guest_ram_mappings: &[GuestRamMapping], 550 zones_config: &[MemoryZoneConfig], 551 prefault: Option<bool>, 552 mut existing_memory_files: HashMap<u32, File>, 553 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 554 let mut memory_regions = Vec::new(); 555 let mut memory_zones = HashMap::new(); 556 557 for zone_config in zones_config { 558 memory_zones.insert(zone_config.id.clone(), MemoryZone::default()); 559 } 560 561 for guest_ram_mapping in guest_ram_mappings { 562 for zone_config in zones_config { 563 if guest_ram_mapping.zone_id == zone_config.id { 564 let region = MemoryManager::create_ram_region( 565 &zone_config.file, 566 guest_ram_mapping.file_offset, 567 GuestAddress(guest_ram_mapping.gpa), 568 guest_ram_mapping.size as usize, 569 match prefault { 570 Some(pf) => pf, 571 None => zone_config.prefault, 572 }, 573 zone_config.shared, 574 zone_config.hugepages, 575 zone_config.hugepage_size, 576 zone_config.host_numa_node, 577 existing_memory_files.remove(&guest_ram_mapping.slot), 578 )?; 579 memory_regions.push(Arc::clone(®ion)); 580 if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) { 581 if guest_ram_mapping.virtio_mem { 582 let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0); 583 let region_size = region.len(); 584 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 585 region, 586 virtio_device: None, 587 hotplugged_size, 588 hugepages: zone_config.hugepages, 589 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 590 }); 591 } else { 592 memory_zone.regions.push(region); 593 } 594 } 595 } 596 } 597 } 598 599 memory_regions.sort_by_key(|x| x.start_addr()); 600 601 Ok((memory_regions, memory_zones)) 602 } 603 604 fn fill_saved_regions( 605 &mut self, 606 file_path: PathBuf, 607 saved_regions: MemoryRangeTable, 608 ) -> Result<(), Error> { 609 if saved_regions.is_empty() { 610 return Ok(()); 611 } 612 613 // Open (read only) the snapshot file. 614 let mut memory_file = OpenOptions::new() 615 .read(true) 616 .open(file_path) 617 .map_err(Error::SnapshotOpen)?; 618 619 let guest_memory = self.guest_memory.memory(); 620 for range in saved_regions.regions() { 621 let mut offset: u64 = 0; 622 // Here we are manually handling the retry in case we can't write 623 // the whole region at once because we can't use the implementation 624 // from vm-memory::GuestMemory of read_exact_from() as it is not 625 // following the correct behavior. 
For more info about this issue 626 // see: https://github.com/rust-vmm/vm-memory/issues/174 627 loop { 628 let bytes_read = guest_memory 629 .read_from( 630 GuestAddress(range.gpa + offset), 631 &mut memory_file, 632 (range.length - offset) as usize, 633 ) 634 .map_err(Error::SnapshotCopy)?; 635 offset += bytes_read as u64; 636 637 if offset == range.length { 638 break; 639 } 640 } 641 } 642 643 Ok(()) 644 } 645 646 fn validate_memory_config( 647 config: &MemoryConfig, 648 user_provided_zones: bool, 649 ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> { 650 let mut allow_mem_hotplug = false; 651 652 if !user_provided_zones { 653 if config.zones.is_some() { 654 error!( 655 "User defined memory regions can't be provided if the \ 656 memory size is not 0" 657 ); 658 return Err(Error::InvalidMemoryParameters); 659 } 660 661 if config.hotplug_size.is_some() { 662 allow_mem_hotplug = true; 663 } 664 665 if let Some(hotplugged_size) = config.hotplugged_size { 666 if let Some(hotplug_size) = config.hotplug_size { 667 if hotplugged_size > hotplug_size { 668 error!( 669 "'hotplugged_size' {} can't be bigger than \ 670 'hotplug_size' {}", 671 hotplugged_size, hotplug_size, 672 ); 673 return Err(Error::InvalidMemoryParameters); 674 } 675 } else { 676 error!( 677 "Invalid to define 'hotplugged_size' when there is\ 678 no 'hotplug_size'" 679 ); 680 return Err(Error::InvalidMemoryParameters); 681 } 682 if config.hotplug_method == HotplugMethod::Acpi { 683 error!( 684 "Invalid to define 'hotplugged_size' with hotplug \ 685 method 'acpi'" 686 ); 687 return Err(Error::InvalidMemoryParameters); 688 } 689 } 690 691 // Create a single zone from the global memory config. This lets 692 // us reuse the codepath for user defined memory zones. 693 let zones = vec![MemoryZoneConfig { 694 id: String::from(DEFAULT_MEMORY_ZONE), 695 size: config.size, 696 file: None, 697 shared: config.shared, 698 hugepages: config.hugepages, 699 hugepage_size: config.hugepage_size, 700 host_numa_node: None, 701 hotplug_size: config.hotplug_size, 702 hotplugged_size: config.hotplugged_size, 703 prefault: config.prefault, 704 }]; 705 706 Ok((config.size, zones, allow_mem_hotplug)) 707 } else { 708 if config.zones.is_none() { 709 error!( 710 "User defined memory regions must be provided if the \ 711 memory size is 0" 712 ); 713 return Err(Error::MissingMemoryZones); 714 } 715 716 // Safe to unwrap as we checked right above there were some 717 // regions. 
718 let zones = config.zones.clone().unwrap(); 719 if zones.is_empty() { 720 return Err(Error::MissingMemoryZones); 721 } 722 723 let mut total_ram_size: u64 = 0; 724 for zone in zones.iter() { 725 total_ram_size += zone.size; 726 727 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() { 728 error!( 729 "Invalid to set host NUMA policy for a memory zone \ 730 backed by a regular file and mapped as 'shared'" 731 ); 732 return Err(Error::InvalidSharedMemoryZoneWithHostNuma); 733 } 734 735 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi { 736 error!("Invalid to set ACPI hotplug method for memory zones"); 737 return Err(Error::InvalidHotplugMethodWithMemoryZones); 738 } 739 740 if let Some(hotplugged_size) = zone.hotplugged_size { 741 if let Some(hotplug_size) = zone.hotplug_size { 742 if hotplugged_size > hotplug_size { 743 error!( 744 "'hotplugged_size' {} can't be bigger than \ 745 'hotplug_size' {}", 746 hotplugged_size, hotplug_size, 747 ); 748 return Err(Error::InvalidMemoryParameters); 749 } 750 } else { 751 error!( 752 "Invalid to define 'hotplugged_size' when there is\ 753 no 'hotplug_size' for a memory zone" 754 ); 755 return Err(Error::InvalidMemoryParameters); 756 } 757 if config.hotplug_method == HotplugMethod::Acpi { 758 error!( 759 "Invalid to define 'hotplugged_size' with hotplug \ 760 method 'acpi'" 761 ); 762 return Err(Error::InvalidMemoryParameters); 763 } 764 } 765 } 766 767 Ok((total_ram_size, zones, allow_mem_hotplug)) 768 } 769 } 770 771 fn allocate_address_space(&mut self) -> Result<(), Error> { 772 let mut list = Vec::new(); 773 774 for (zone_id, memory_zone) in self.memory_zones.iter() { 775 let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> = 776 memory_zone 777 .regions() 778 .iter() 779 .map(|r| (r.clone(), false)) 780 .collect(); 781 782 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() { 783 regions.push((virtio_mem_zone.region().clone(), true)); 784 } 785 786 list.push((zone_id.clone(), regions)); 787 } 788 789 for (zone_id, regions) in list { 790 for (region, virtio_mem) in regions { 791 let slot = self.create_userspace_mapping( 792 region.start_addr().raw_value(), 793 region.len() as u64, 794 region.as_ptr() as u64, 795 self.mergeable, 796 false, 797 self.log_dirty, 798 )?; 799 800 let file_offset = if let Some(file_offset) = region.file_offset() { 801 file_offset.start() 802 } else { 803 0 804 }; 805 806 self.guest_ram_mappings.push(GuestRamMapping { 807 gpa: region.start_addr().raw_value(), 808 size: region.len(), 809 slot, 810 zone_id: zone_id.clone(), 811 virtio_mem, 812 file_offset, 813 }); 814 self.ram_allocator 815 .allocate(Some(region.start_addr()), region.len(), None) 816 .ok_or(Error::MemoryRangeAllocation)?; 817 } 818 } 819 820 // Allocate SubRegion and Reserved address ranges. 821 for region in self.arch_mem_regions.iter() { 822 if region.r_type == RegionType::Ram { 823 // Ignore the RAM type since ranges have already been allocated 824 // based on the GuestMemory regions. 825 continue; 826 } 827 self.ram_allocator 828 .allocate( 829 Some(GuestAddress(region.base)), 830 region.size as GuestUsize, 831 None, 832 ) 833 .ok_or(Error::MemoryRangeAllocation)?; 834 } 835 836 Ok(()) 837 } 838 839 #[cfg(target_arch = "aarch64")] 840 fn add_uefi_flash(&mut self) -> Result<(), Error> { 841 // On AArch64, the UEFI binary requires a flash device at address 0. 842 // 4 MiB memory is mapped to simulate the flash. 
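        // Note that the flash mapping is kept in a dedicated GuestMemoryAtomic
        // (self.uefi_flash) rather than being inserted into the regular guest
        // RAM map, since it is not part of guest RAM proper.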
843 let uefi_mem_slot = self.allocate_memory_slot(); 844 let uefi_region = GuestRegionMmap::new( 845 MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(), 846 arch::layout::UEFI_START, 847 ) 848 .unwrap(); 849 let uefi_mem_region = self.vm.make_user_memory_region( 850 uefi_mem_slot, 851 uefi_region.start_addr().raw_value(), 852 uefi_region.len() as u64, 853 uefi_region.as_ptr() as u64, 854 false, 855 false, 856 ); 857 self.vm 858 .create_user_memory_region(uefi_mem_region) 859 .map_err(Error::CreateUefiFlash)?; 860 861 let uefi_flash = 862 GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap()); 863 864 self.uefi_flash = Some(uefi_flash); 865 866 Ok(()) 867 } 868 869 #[allow(clippy::too_many_arguments)] 870 pub fn new( 871 vm: Arc<dyn hypervisor::Vm>, 872 config: &MemoryConfig, 873 prefault: Option<bool>, 874 phys_bits: u8, 875 #[cfg(feature = "tdx")] tdx_enabled: bool, 876 restore_data: Option<&MemoryManagerSnapshotData>, 877 existing_memory_files: Option<HashMap<u32, File>>, 878 #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>, 879 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 880 trace_scoped!("MemoryManager::new"); 881 882 let user_provided_zones = config.size == 0; 883 884 let mmio_address_space_size = mmio_address_space_size(phys_bits); 885 debug_assert_eq!( 886 (((mmio_address_space_size) >> 16) << 16), 887 mmio_address_space_size 888 ); 889 let start_of_platform_device_area = 890 GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE); 891 let end_of_device_area = start_of_platform_device_area.unchecked_sub(1); 892 893 let (ram_size, zones, allow_mem_hotplug) = 894 Self::validate_memory_config(config, user_provided_zones)?; 895 896 let ( 897 start_of_device_area, 898 boot_ram, 899 current_ram, 900 arch_mem_regions, 901 memory_zones, 902 guest_memory, 903 boot_guest_memory, 904 hotplug_slots, 905 next_memory_slot, 906 selected_slot, 907 next_hotplug_slot, 908 ) = if let Some(data) = restore_data { 909 let (regions, memory_zones) = Self::restore_memory_regions_and_zones( 910 &data.guest_ram_mappings, 911 &zones, 912 prefault, 913 existing_memory_files.unwrap_or_default(), 914 )?; 915 let guest_memory = 916 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?; 917 let boot_guest_memory = guest_memory.clone(); 918 ( 919 GuestAddress(data.start_of_device_area), 920 data.boot_ram, 921 data.current_ram, 922 data.arch_mem_regions.clone(), 923 memory_zones, 924 guest_memory, 925 boot_guest_memory, 926 data.hotplug_slots.clone(), 927 data.next_memory_slot, 928 data.selected_slot, 929 data.next_hotplug_slot, 930 ) 931 } else { 932 // Init guest memory 933 let arch_mem_regions = arch::arch_memory_regions(ram_size); 934 935 let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions 936 .iter() 937 .filter(|r| r.2 == RegionType::Ram) 938 .map(|r| (r.0, r.1)) 939 .collect(); 940 941 let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions 942 .iter() 943 .map(|(a, b, c)| ArchMemRegion { 944 base: a.0, 945 size: *b, 946 r_type: *c, 947 }) 948 .collect(); 949 950 let (mem_regions, mut memory_zones) = 951 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault)?; 952 953 let mut guest_memory = 954 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?; 955 956 let boot_guest_memory = guest_memory.clone(); 957 958 let mut start_of_device_area = 959 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?; 960 961 // Update list of memory zones for resize. 
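            // Two cases are handled below for zones that define a 'hotplug_size':
            //   - ACPI hotplug (only valid for the single implicit zone): just
            //     reserve 'hotplug_size' bytes of address space after RAM by
            //     pushing back the start of the device area.
            //   - virtio-mem: create the whole hotpluggable region up front
            //     (aligned to the virtio-mem block size) so that later resizes
            //     only plug or unplug blocks within it.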
962 for zone in zones.iter() { 963 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 964 if let Some(hotplug_size) = zone.hotplug_size { 965 if hotplug_size == 0 { 966 error!("'hotplug_size' can't be 0"); 967 return Err(Error::InvalidHotplugSize); 968 } 969 970 if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi { 971 start_of_device_area = start_of_device_area 972 .checked_add(hotplug_size) 973 .ok_or(Error::GuestAddressOverFlow)?; 974 } else { 975 // Alignment must be "natural" i.e. same as size of block 976 let start_addr = GuestAddress( 977 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE 978 - 1) 979 / virtio_devices::VIRTIO_MEM_ALIGN_SIZE 980 * virtio_devices::VIRTIO_MEM_ALIGN_SIZE, 981 ); 982 983 // When `prefault` is set by vm_restore, memory manager 984 // will create ram region with `prefault` option in 985 // restore config rather than same option in zone 986 let region = MemoryManager::create_ram_region( 987 &None, 988 0, 989 start_addr, 990 hotplug_size as usize, 991 match prefault { 992 Some(pf) => pf, 993 None => zone.prefault, 994 }, 995 zone.shared, 996 zone.hugepages, 997 zone.hugepage_size, 998 zone.host_numa_node, 999 None, 1000 )?; 1001 1002 guest_memory = guest_memory 1003 .insert_region(Arc::clone(®ion)) 1004 .map_err(Error::GuestMemory)?; 1005 1006 let hotplugged_size = zone.hotplugged_size.unwrap_or(0); 1007 let region_size = region.len(); 1008 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 1009 region, 1010 virtio_device: None, 1011 hotplugged_size, 1012 hugepages: zone.hugepages, 1013 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 1014 }); 1015 1016 start_of_device_area = start_addr 1017 .checked_add(hotplug_size) 1018 .ok_or(Error::GuestAddressOverFlow)?; 1019 } 1020 } 1021 } else { 1022 return Err(Error::MissingZoneIdentifier); 1023 } 1024 } 1025 1026 let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT); 1027 hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default); 1028 1029 ( 1030 start_of_device_area, 1031 ram_size, 1032 ram_size, 1033 arch_mem_regions, 1034 memory_zones, 1035 guest_memory, 1036 boot_guest_memory, 1037 hotplug_slots, 1038 0, 1039 0, 1040 0, 1041 ) 1042 }; 1043 1044 let guest_memory = GuestMemoryAtomic::new(guest_memory); 1045 1046 // Both MMIO and PIO address spaces start at address 0. 1047 let allocator = Arc::new(Mutex::new( 1048 SystemAllocator::new( 1049 #[cfg(target_arch = "x86_64")] 1050 { 1051 GuestAddress(0) 1052 }, 1053 #[cfg(target_arch = "x86_64")] 1054 { 1055 1 << 16 1056 }, 1057 start_of_platform_device_area, 1058 PLATFORM_DEVICE_AREA_SIZE, 1059 layout::MEM_32BIT_DEVICES_START, 1060 layout::MEM_32BIT_DEVICES_SIZE, 1061 #[cfg(target_arch = "x86_64")] 1062 vec![GsiApic::new( 1063 X86_64_IRQ_BASE, 1064 ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE, 1065 )], 1066 ) 1067 .ok_or(Error::CreateSystemAllocator)?, 1068 )); 1069 1070 #[cfg(not(feature = "tdx"))] 1071 let dynamic = true; 1072 #[cfg(feature = "tdx")] 1073 let dynamic = !tdx_enabled; 1074 1075 let acpi_address = if dynamic 1076 && config.hotplug_method == HotplugMethod::Acpi 1077 && (config.hotplug_size.unwrap_or_default() > 0) 1078 { 1079 Some( 1080 allocator 1081 .lock() 1082 .unwrap() 1083 .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None) 1084 .ok_or(Error::AllocateMmioAddress)?, 1085 ) 1086 } else { 1087 None 1088 }; 1089 1090 // If running on SGX the start of device area and RAM area may diverge but 1091 // at this point they are next to each other. 
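        // At this point the address space is laid out roughly as:
        //   [0, end_of_ram_area]                        RAM (plus any hotplug area)
        //   [start_of_device_area, end_of_device_area]  device MMIO allocations
        //   last 1 MiB below the MMIO space end         platform devices (e.g. ACPI)
        // with end_of_ram_area == start_of_device_area - 1 and the RAM
        // allocator below covering [0, start_of_device_area).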
1092 let end_of_ram_area = start_of_device_area.unchecked_sub(1); 1093 let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap(); 1094 1095 let mut memory_manager = MemoryManager { 1096 boot_guest_memory, 1097 guest_memory, 1098 next_memory_slot, 1099 start_of_device_area, 1100 end_of_device_area, 1101 end_of_ram_area, 1102 vm, 1103 hotplug_slots, 1104 selected_slot, 1105 mergeable: config.mergeable, 1106 allocator, 1107 hotplug_method: config.hotplug_method, 1108 boot_ram, 1109 current_ram, 1110 next_hotplug_slot, 1111 shared: config.shared, 1112 hugepages: config.hugepages, 1113 hugepage_size: config.hugepage_size, 1114 prefault: config.prefault, 1115 #[cfg(target_arch = "x86_64")] 1116 sgx_epc_region: None, 1117 user_provided_zones, 1118 snapshot_memory_ranges: MemoryRangeTable::default(), 1119 memory_zones, 1120 guest_ram_mappings: Vec::new(), 1121 acpi_address, 1122 log_dirty: dynamic, // Cannot log dirty pages on a TD 1123 arch_mem_regions, 1124 ram_allocator, 1125 dynamic, 1126 #[cfg(target_arch = "aarch64")] 1127 uefi_flash: None, 1128 }; 1129 1130 memory_manager.allocate_address_space()?; 1131 1132 #[cfg(target_arch = "aarch64")] 1133 memory_manager.add_uefi_flash()?; 1134 1135 #[cfg(target_arch = "x86_64")] 1136 if let Some(sgx_epc_config) = sgx_epc_config { 1137 memory_manager.setup_sgx(sgx_epc_config)?; 1138 } 1139 1140 Ok(Arc::new(Mutex::new(memory_manager))) 1141 } 1142 1143 pub fn new_from_snapshot( 1144 snapshot: &Snapshot, 1145 vm: Arc<dyn hypervisor::Vm>, 1146 config: &MemoryConfig, 1147 source_url: Option<&str>, 1148 prefault: bool, 1149 phys_bits: u8, 1150 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 1151 if let Some(source_url) = source_url { 1152 let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?; 1153 memory_file_path.push(String::from(SNAPSHOT_FILENAME)); 1154 1155 let mem_snapshot: MemoryManagerSnapshotData = snapshot 1156 .to_versioned_state(MEMORY_MANAGER_SNAPSHOT_ID) 1157 .map_err(Error::Restore)?; 1158 1159 let mm = MemoryManager::new( 1160 vm, 1161 config, 1162 Some(prefault), 1163 phys_bits, 1164 #[cfg(feature = "tdx")] 1165 false, 1166 Some(&mem_snapshot), 1167 None, 1168 #[cfg(target_arch = "x86_64")] 1169 None, 1170 )?; 1171 1172 mm.lock() 1173 .unwrap() 1174 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?; 1175 1176 Ok(mm) 1177 } else { 1178 Err(Error::RestoreMissingSourceUrl) 1179 } 1180 } 1181 1182 fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> { 1183 let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) }; 1184 1185 if res < 0 { 1186 Err(io::Error::last_os_error()) 1187 } else { 1188 Ok(res as RawFd) 1189 } 1190 } 1191 1192 fn mbind( 1193 addr: *mut u8, 1194 len: u64, 1195 mode: u32, 1196 nodemask: Vec<u64>, 1197 maxnode: u64, 1198 flags: u32, 1199 ) -> Result<(), io::Error> { 1200 let res = unsafe { 1201 libc::syscall( 1202 libc::SYS_mbind, 1203 addr as *mut libc::c_void, 1204 len, 1205 mode, 1206 nodemask.as_ptr(), 1207 maxnode, 1208 flags, 1209 ) 1210 }; 1211 1212 if res < 0 { 1213 Err(io::Error::last_os_error()) 1214 } else { 1215 Ok(()) 1216 } 1217 } 1218 1219 fn open_memory_file( 1220 backing_file: &Option<PathBuf>, 1221 file_offset: u64, 1222 size: usize, 1223 hugepages: bool, 1224 hugepage_size: Option<u64>, 1225 ) -> Result<(File, u64), Error> { 1226 let (f, f_off) = match backing_file { 1227 Some(ref file) => { 1228 if file.is_dir() { 1229 // Override file offset as it does not apply in this case. 
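                    // The temporary file is freshly created (and unlinked right
                    // away) below, so its data necessarily starts at offset 0.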
                    info!(
                        "Ignoring file offset since the backing file is a \
                        temporary file created from the specified directory."
                    );
                    let fs_str = format!("{}{}", file.display(), "/tmpfile_XXXXXX");
                    let fs = ffi::CString::new(fs_str).unwrap();
                    let mut path = fs.as_bytes_with_nul().to_owned();
                    let path_ptr = path.as_mut_ptr() as *mut _;
                    let fd = unsafe { libc::mkstemp(path_ptr) };
                    unsafe { libc::unlink(path_ptr) };
                    let f = unsafe { File::from_raw_fd(fd) };
                    f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;

                    (f, 0)
                } else {
                    let f = OpenOptions::new()
                        .read(true)
                        .write(true)
                        .open(file)
                        .map_err(Error::SharedFileCreate)?;

                    (f, file_offset)
                }
            }
            None => {
                let fd = Self::memfd_create(
                    &ffi::CString::new("ch_ram").unwrap(),
                    if hugepages {
                        libc::MFD_HUGETLB
                            | if let Some(hugepage_size) = hugepage_size {
                                /*
                                 * From the Linux kernel:
                                 * Several system calls take a flag to request "hugetlb" huge pages.
                                 * Without further specification, these system calls will use the
                                 * system's default huge page size. If a system supports multiple
                                 * huge page sizes, the desired huge page size can be specified in
                                 * bits [26:31] of the flag arguments. The value in these 6 bits
                                 * will encode the log2 of the huge page size.
                                 */

                                hugepage_size.trailing_zeros() << 26
                            } else {
                                // Use the system default huge page size
                                0
                            }
                    } else {
                        0
                    },
                )
                .map_err(Error::SharedFileCreate)?;

                let f = unsafe { File::from_raw_fd(fd) };
                f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;

                (f, 0)
            }
        };

        Ok((f, f_off))
    }

    #[allow(clippy::too_many_arguments)]
    fn create_ram_region(
        backing_file: &Option<PathBuf>,
        file_offset: u64,
        start_addr: GuestAddress,
        size: usize,
        prefault: bool,
        shared: bool,
        hugepages: bool,
        hugepage_size: Option<u64>,
        host_numa_node: Option<u32>,
        existing_memory_file: Option<File>,
    ) -> Result<Arc<GuestRegionMmap>, Error> {
        let (f, f_off) = if let Some(f) = existing_memory_file {
            (f, file_offset)
        } else {
            Self::open_memory_file(backing_file, file_offset, size, hugepages, hugepage_size)?
        };

        let mut mmap_flags = libc::MAP_NORESERVE
            | if shared {
                libc::MAP_SHARED
            } else {
                libc::MAP_PRIVATE
            };
        if prefault {
            mmap_flags |= libc::MAP_POPULATE;
        }

        let region = GuestRegionMmap::new(
            MmapRegion::build(
                Some(FileOffset::new(f, f_off)),
                size,
                libc::PROT_READ | libc::PROT_WRITE,
                mmap_flags,
            )
            .map_err(Error::GuestMemoryRegion)?,
            start_addr,
        )
        .map_err(Error::GuestMemory)?;

        // Apply NUMA policy if needed.
        if let Some(node) = host_numa_node {
            let addr = region.deref().as_ptr();
            let len = region.deref().size() as u64;
            let mode = MPOL_BIND;
            let mut nodemask: Vec<u64> = Vec::new();
            let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;

            // Linux is kind of buggy in the way it interprets maxnode as it
            // will cut off the last node. That's why we have to add 1 to what
            // we would consider as the proper maxnode value.
            let maxnode = node as u64 + 1 + 1;

            // Allocate the right size for the vector.
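            // For example, binding to host node 65 (hypothetical value) gives:
            //   nodemask length = 65 / 64 + 1 = 2 u64 words,
            //   idx = 1 and shift = 1, i.e. nodemask = [0x0, 0x2],
            //   maxnode = 65 + 1 + 1 = 67 (the extra +1 accounts for the
            //   kernel quirk described above).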
1346 nodemask.resize((node as usize / 64) + 1, 0); 1347 1348 // Fill the global bitmask through the nodemask vector. 1349 let idx = (node / 64) as usize; 1350 let shift = node % 64; 1351 nodemask[idx] |= 1u64 << shift; 1352 1353 // Policies are enforced by using MPOL_MF_MOVE flag as it will 1354 // force the kernel to move all pages that might have been already 1355 // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is 1356 // used to throw an error if MPOL_MF_MOVE didn't succeed. 1357 // MPOL_BIND is the selected mode as it specifies a strict policy 1358 // that restricts memory allocation to the nodes specified in the 1359 // nodemask. 1360 Self::mbind(addr, len, mode, nodemask, maxnode, flags) 1361 .map_err(Error::ApplyNumaPolicy)?; 1362 } 1363 1364 Ok(Arc::new(region)) 1365 } 1366 1367 // Update the GuestMemoryMmap with the new range 1368 fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> { 1369 let guest_memory = self 1370 .guest_memory 1371 .memory() 1372 .insert_region(region) 1373 .map_err(Error::GuestMemory)?; 1374 self.guest_memory.lock().unwrap().replace(guest_memory); 1375 1376 Ok(()) 1377 } 1378 1379 // 1380 // Calculate the start address of an area next to RAM. 1381 // 1382 // If memory hotplug is allowed, the start address needs to be aligned 1383 // (rounded-up) to 128MiB boundary. 1384 // If memory hotplug is not allowed, there is no alignment required. 1385 // And it must also start at the 64bit start. 1386 fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> { 1387 let mut start_addr = if allow_mem_hotplug { 1388 GuestAddress(mem_end.0 | ((128 << 20) - 1)) 1389 } else { 1390 mem_end 1391 }; 1392 1393 start_addr = start_addr 1394 .checked_add(1) 1395 .ok_or(Error::GuestAddressOverFlow)?; 1396 1397 if mem_end < arch::layout::MEM_32BIT_RESERVED_START { 1398 return Ok(arch::layout::RAM_64BIT_START); 1399 } 1400 1401 Ok(start_addr) 1402 } 1403 1404 pub fn add_ram_region( 1405 &mut self, 1406 start_addr: GuestAddress, 1407 size: usize, 1408 ) -> Result<Arc<GuestRegionMmap>, Error> { 1409 // Allocate memory for the region 1410 let region = MemoryManager::create_ram_region( 1411 &None, 1412 0, 1413 start_addr, 1414 size, 1415 self.prefault, 1416 self.shared, 1417 self.hugepages, 1418 self.hugepage_size, 1419 None, 1420 None, 1421 )?; 1422 1423 // Map it into the guest 1424 let slot = self.create_userspace_mapping( 1425 region.start_addr().0, 1426 region.len() as u64, 1427 region.as_ptr() as u64, 1428 self.mergeable, 1429 false, 1430 self.log_dirty, 1431 )?; 1432 self.guest_ram_mappings.push(GuestRamMapping { 1433 gpa: region.start_addr().raw_value(), 1434 size: region.len(), 1435 slot, 1436 zone_id: DEFAULT_MEMORY_ZONE.to_string(), 1437 virtio_mem: false, 1438 file_offset: 0, 1439 }); 1440 1441 self.add_region(Arc::clone(®ion))?; 1442 1443 Ok(region) 1444 } 1445 1446 fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> { 1447 info!("Hotplugging new RAM: {}", size); 1448 1449 // Check that there is a free slot 1450 if self.next_hotplug_slot >= HOTPLUG_COUNT { 1451 return Err(Error::NoSlotAvailable); 1452 } 1453 1454 // "Inserted" DIMM must have a size that is a multiple of 128MiB 1455 if size % (128 << 20) != 0 { 1456 return Err(Error::InvalidSize); 1457 } 1458 1459 let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?; 1460 1461 if start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area { 1462 return 
Err(Error::InsufficientHotplugRam); 1463 } 1464 1465 let region = self.add_ram_region(start_addr, size)?; 1466 1467 // Add region to the list of regions associated with the default 1468 // memory zone. 1469 if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) { 1470 memory_zone.regions.push(Arc::clone(®ion)); 1471 } 1472 1473 // Tell the allocator 1474 self.ram_allocator 1475 .allocate(Some(start_addr), size as GuestUsize, None) 1476 .ok_or(Error::MemoryRangeAllocation)?; 1477 1478 // Update the slot so that it can be queried via the I/O port 1479 let mut slot = &mut self.hotplug_slots[self.next_hotplug_slot]; 1480 slot.active = true; 1481 slot.inserting = true; 1482 slot.base = region.start_addr().0; 1483 slot.length = region.len() as u64; 1484 1485 self.next_hotplug_slot += 1; 1486 1487 Ok(region) 1488 } 1489 1490 pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> { 1491 self.guest_memory.clone() 1492 } 1493 1494 pub fn boot_guest_memory(&self) -> GuestMemoryMmap { 1495 self.boot_guest_memory.clone() 1496 } 1497 1498 pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> { 1499 self.allocator.clone() 1500 } 1501 1502 pub fn start_of_device_area(&self) -> GuestAddress { 1503 self.start_of_device_area 1504 } 1505 1506 pub fn end_of_device_area(&self) -> GuestAddress { 1507 self.end_of_device_area 1508 } 1509 1510 pub fn allocate_memory_slot(&mut self) -> u32 { 1511 let slot_id = self.next_memory_slot; 1512 self.next_memory_slot += 1; 1513 slot_id 1514 } 1515 1516 pub fn create_userspace_mapping( 1517 &mut self, 1518 guest_phys_addr: u64, 1519 memory_size: u64, 1520 userspace_addr: u64, 1521 mergeable: bool, 1522 readonly: bool, 1523 log_dirty: bool, 1524 ) -> Result<u32, Error> { 1525 let slot = self.allocate_memory_slot(); 1526 let mem_region = self.vm.make_user_memory_region( 1527 slot, 1528 guest_phys_addr, 1529 memory_size, 1530 userspace_addr, 1531 readonly, 1532 log_dirty, 1533 ); 1534 1535 info!( 1536 "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}", 1537 guest_phys_addr, userspace_addr, memory_size, slot 1538 ); 1539 1540 self.vm 1541 .create_user_memory_region(mem_region) 1542 .map_err(Error::CreateUserMemoryRegion)?; 1543 1544 // Mark the pages as mergeable if explicitly asked for. 1545 if mergeable { 1546 // Safe because the address and size are valid since the 1547 // mmap succeeded. 1548 let ret = unsafe { 1549 libc::madvise( 1550 userspace_addr as *mut libc::c_void, 1551 memory_size as libc::size_t, 1552 libc::MADV_MERGEABLE, 1553 ) 1554 }; 1555 if ret != 0 { 1556 let err = io::Error::last_os_error(); 1557 // Safe to unwrap because the error is constructed with 1558 // last_os_error(), which ensures the output will be Some(). 
1559 let errno = err.raw_os_error().unwrap(); 1560 if errno == libc::EINVAL { 1561 warn!("kernel not configured with CONFIG_KSM"); 1562 } else { 1563 warn!("madvise error: {}", err); 1564 } 1565 warn!("failed to mark pages as mergeable"); 1566 } 1567 } 1568 1569 info!( 1570 "Created userspace mapping: {:x} -> {:x} {:x}", 1571 guest_phys_addr, userspace_addr, memory_size 1572 ); 1573 1574 Ok(slot) 1575 } 1576 1577 pub fn remove_userspace_mapping( 1578 &mut self, 1579 guest_phys_addr: u64, 1580 memory_size: u64, 1581 userspace_addr: u64, 1582 mergeable: bool, 1583 slot: u32, 1584 ) -> Result<(), Error> { 1585 let mem_region = self.vm.make_user_memory_region( 1586 slot, 1587 guest_phys_addr, 1588 memory_size, 1589 userspace_addr, 1590 false, /* readonly -- don't care */ 1591 false, /* log dirty */ 1592 ); 1593 1594 self.vm 1595 .remove_user_memory_region(mem_region) 1596 .map_err(Error::RemoveUserMemoryRegion)?; 1597 1598 // Mark the pages as unmergeable if there were previously marked as 1599 // mergeable. 1600 if mergeable { 1601 // Safe because the address and size are valid as the region was 1602 // previously advised. 1603 let ret = unsafe { 1604 libc::madvise( 1605 userspace_addr as *mut libc::c_void, 1606 memory_size as libc::size_t, 1607 libc::MADV_UNMERGEABLE, 1608 ) 1609 }; 1610 if ret != 0 { 1611 let err = io::Error::last_os_error(); 1612 // Safe to unwrap because the error is constructed with 1613 // last_os_error(), which ensures the output will be Some(). 1614 let errno = err.raw_os_error().unwrap(); 1615 if errno == libc::EINVAL { 1616 warn!("kernel not configured with CONFIG_KSM"); 1617 } else { 1618 warn!("madvise error: {}", err); 1619 } 1620 warn!("failed to mark pages as unmergeable"); 1621 } 1622 } 1623 1624 info!( 1625 "Removed userspace mapping: {:x} -> {:x} {:x}", 1626 guest_phys_addr, userspace_addr, memory_size 1627 ); 1628 1629 Ok(()) 1630 } 1631 1632 pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> { 1633 if let Some(memory_zone) = self.memory_zones.get_mut(id) { 1634 if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone { 1635 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() { 1636 virtio_mem_device 1637 .lock() 1638 .unwrap() 1639 .resize(size) 1640 .map_err(Error::VirtioMemResizeFail)?; 1641 } 1642 1643 // Keep the hotplugged_size up to date. 1644 virtio_mem_zone.hotplugged_size = size; 1645 } else { 1646 error!("Failed resizing virtio-mem region: No virtio-mem handler"); 1647 return Err(Error::MissingVirtioMemHandler); 1648 } 1649 1650 return Ok(()); 1651 } 1652 1653 error!("Failed resizing virtio-mem region: Unknown memory zone"); 1654 Err(Error::UnknownMemoryZone) 1655 } 1656 1657 /// In case this function resulted in adding a new memory region to the 1658 /// guest memory, the new region is returned to the caller. The virtio-mem 1659 /// use case never adds a new region as the whole hotpluggable memory has 1660 /// already been allocated at boot time. 1661 pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> { 1662 if self.user_provided_zones { 1663 error!( 1664 "Not allowed to resize guest memory when backed with user \ 1665 defined memory zones." 
            );
            return Err(Error::InvalidResizeWithMemoryZones);
        }

        let mut region: Option<Arc<GuestRegionMmap>> = None;
        match self.hotplug_method {
            HotplugMethod::VirtioMem => {
                if desired_ram >= self.boot_ram {
                    if !self.dynamic {
                        return Ok(region);
                    }

                    self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
                    self.current_ram = desired_ram;
                }
            }
            HotplugMethod::Acpi => {
                if desired_ram > self.current_ram {
                    if !self.dynamic {
                        return Ok(region);
                    }

                    region =
                        Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
                    self.current_ram = desired_ram;
                }
            }
        }
        Ok(region)
    }

    pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
        if !self.user_provided_zones {
            error!(
                "Not allowed to resize guest memory zone when no zone is \
                defined."
            );
            return Err(Error::ResizeZone);
        }

        self.virtio_mem_resize(id, virtio_mem_size)
    }

    #[cfg(target_arch = "x86_64")]
    pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
        let file = OpenOptions::new()
            .read(true)
            .open("/dev/sgx_provision")
            .map_err(Error::SgxProvisionOpen)?;
        self.vm
            .enable_sgx_attribute(file)
            .map_err(Error::SgxEnableProvisioning)?;

        // Go over each EPC section and verify its size is a 4k multiple. At
        // the same time, calculate the total size needed for the contiguous
        // EPC region.
        let mut epc_region_size = 0;
        for epc_section in sgx_epc_config.iter() {
            if epc_section.size == 0 {
                return Err(Error::EpcSectionSizeInvalid);
            }
            if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
                return Err(Error::EpcSectionSizeInvalid);
            }

            epc_region_size += epc_section.size;
        }

        // Place the SGX EPC region on a 4k boundary between the RAM and the device area
        let epc_region_start = GuestAddress(
            ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
        );

        self.start_of_device_area = epc_region_start
            .checked_add(epc_region_size)
            .ok_or(Error::GuestAddressOverFlow)?;

        let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
        info!(
            "SGX EPC region: 0x{:x} (0x{:x})",
            epc_region_start.0, epc_region_size
        );

        // Each section can be memory mapped into the allocated region.
        let mut epc_section_start = epc_region_start.raw_value();
        for epc_section in sgx_epc_config.iter() {
            let file = OpenOptions::new()
                .read(true)
                .write(true)
                .open("/dev/sgx_vepc")
                .map_err(Error::SgxVirtEpcOpen)?;

            let prot = PROT_READ | PROT_WRITE;
            let mut flags = MAP_NORESERVE | MAP_SHARED;
            if epc_section.prefault {
                flags |= MAP_POPULATE;
            }

            // We can't use the vm-memory crate to perform the memory mapping
            // here as it would try to ensure the size of the backing file is
            // matching the size of the expected mapping. The /dev/sgx_vepc
            // device does not work that way, it provides a file descriptor
            // which does not match the mapping size, as it's just a way to
            // let KVM know that an EPC section is being created for the guest.
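            // As an illustration (hypothetical sizes): two EPC sections of
            // 64 MiB and 32 MiB make up a 96 MiB contiguous EPC region; the
            // first section is mapped at epc_region_start and the second one
            // 64 MiB after it, since epc_section_start advances by each
            // section's size at the end of this loop.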
1770 let host_addr = unsafe { 1771 libc::mmap( 1772 std::ptr::null_mut(), 1773 epc_section.size as usize, 1774 prot, 1775 flags, 1776 file.as_raw_fd(), 1777 0, 1778 ) 1779 } as u64; 1780 1781 info!( 1782 "Adding SGX EPC section: 0x{:x} (0x{:x})", 1783 epc_section_start, epc_section.size 1784 ); 1785 1786 let _mem_slot = self.create_userspace_mapping( 1787 epc_section_start, 1788 epc_section.size, 1789 host_addr, 1790 false, 1791 false, 1792 false, 1793 )?; 1794 1795 sgx_epc_region.insert( 1796 epc_section.id.clone(), 1797 SgxEpcSection::new( 1798 GuestAddress(epc_section_start), 1799 epc_section.size as GuestUsize, 1800 ), 1801 ); 1802 1803 epc_section_start += epc_section.size; 1804 } 1805 1806 self.sgx_epc_region = Some(sgx_epc_region); 1807 1808 Ok(()) 1809 } 1810 1811 #[cfg(target_arch = "x86_64")] 1812 pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> { 1813 &self.sgx_epc_region 1814 } 1815 1816 pub fn is_hardlink(f: &File) -> bool { 1817 let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit(); 1818 let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) }; 1819 if ret != 0 { 1820 error!("Couldn't fstat the backing file"); 1821 return false; 1822 } 1823 1824 unsafe { (*stat.as_ptr()).st_nlink as usize > 0 } 1825 } 1826 1827 pub fn memory_zones(&self) -> &MemoryZones { 1828 &self.memory_zones 1829 } 1830 1831 pub fn memory_zones_mut(&mut self) -> &mut MemoryZones { 1832 &mut self.memory_zones 1833 } 1834 1835 pub fn memory_range_table( 1836 &self, 1837 snapshot: bool, 1838 ) -> std::result::Result<MemoryRangeTable, MigratableError> { 1839 let mut table = MemoryRangeTable::default(); 1840 1841 for memory_zone in self.memory_zones.values() { 1842 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() { 1843 table.extend(virtio_mem_zone.plugged_ranges()); 1844 } 1845 1846 for region in memory_zone.regions() { 1847 if snapshot { 1848 if let Some(file_offset) = region.file_offset() { 1849 if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED) 1850 && Self::is_hardlink(file_offset.file()) 1851 { 1852 // In this very specific case, we know the memory 1853 // region is backed by a file on the host filesystem 1854 // that can be accessed by the user, and additionally 1855 // the mapping is shared, which means that modifications 1856 // to the content are written to the actual file. 1857 // When meeting these conditions, we can skip the 1858 // copy of the memory content for this specific region, 1859 // as we can assume the user will have it saved through 1860 // the backing file already. 
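                            // (e.g. a zone configured with an explicit backing
                            // file and 'shared=on')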
1861 continue; 1862 } 1863 } 1864 } 1865 1866 table.push(MemoryRange { 1867 gpa: region.start_addr().raw_value(), 1868 length: region.len() as u64, 1869 }); 1870 } 1871 } 1872 1873 Ok(table) 1874 } 1875 1876 pub fn snapshot_data(&self) -> MemoryManagerSnapshotData { 1877 MemoryManagerSnapshotData { 1878 memory_ranges: self.snapshot_memory_ranges.clone(), 1879 guest_ram_mappings: self.guest_ram_mappings.clone(), 1880 start_of_device_area: self.start_of_device_area.0, 1881 boot_ram: self.boot_ram, 1882 current_ram: self.current_ram, 1883 arch_mem_regions: self.arch_mem_regions.clone(), 1884 hotplug_slots: self.hotplug_slots.clone(), 1885 next_memory_slot: self.next_memory_slot, 1886 selected_slot: self.selected_slot, 1887 next_hotplug_slot: self.next_hotplug_slot, 1888 } 1889 } 1890 1891 pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> { 1892 let mut memory_slot_fds = HashMap::new(); 1893 for guest_ram_mapping in &self.guest_ram_mappings { 1894 let slot = guest_ram_mapping.slot; 1895 let guest_memory = self.guest_memory.memory(); 1896 let file = guest_memory 1897 .find_region(GuestAddress(guest_ram_mapping.gpa)) 1898 .unwrap() 1899 .file_offset() 1900 .unwrap() 1901 .file(); 1902 memory_slot_fds.insert(slot, file.as_raw_fd()); 1903 } 1904 memory_slot_fds 1905 } 1906 1907 pub fn acpi_address(&self) -> Option<GuestAddress> { 1908 self.acpi_address 1909 } 1910 1911 pub fn num_guest_ram_mappings(&self) -> u32 { 1912 self.guest_ram_mappings.len() as u32 1913 } 1914 1915 #[cfg(target_arch = "aarch64")] 1916 pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> { 1917 self.uefi_flash.as_ref().unwrap().clone() 1918 } 1919 1920 #[cfg(feature = "guest_debug")] 1921 pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions { 1922 let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone(); 1923 mapping_sorted_by_gpa.sort_by_key(|m| m.gpa); 1924 1925 let mut mem_offset_in_elf = mem_offset; 1926 let mut ram_maps = BTreeMap::new(); 1927 for mapping in mapping_sorted_by_gpa.iter() { 1928 ram_maps.insert( 1929 mapping.gpa, 1930 CoredumpMemoryRegion { 1931 mem_offset_in_elf, 1932 mem_size: mapping.size, 1933 }, 1934 ); 1935 mem_offset_in_elf += mapping.size; 1936 } 1937 1938 CoredumpMemoryRegions { ram_maps } 1939 } 1940 1941 #[cfg(feature = "guest_debug")] 1942 pub fn coredump_iterate_save_mem( 1943 &mut self, 1944 dump_state: &DumpState, 1945 ) -> std::result::Result<(), GuestDebuggableError> { 1946 let snapshot_memory_ranges = self 1947 .memory_range_table(false) 1948 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 1949 1950 if snapshot_memory_ranges.is_empty() { 1951 return Ok(()); 1952 } 1953 1954 let mut coredump_file = dump_state.file.as_ref().unwrap(); 1955 1956 let guest_memory = self.guest_memory.memory(); 1957 let mut total_bytes: u64 = 0; 1958 1959 for range in snapshot_memory_ranges.regions() { 1960 let mut offset: u64 = 0; 1961 loop { 1962 let bytes_written = guest_memory 1963 .write_to( 1964 GuestAddress(range.gpa + offset), 1965 &mut coredump_file, 1966 (range.length - offset) as usize, 1967 ) 1968 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 1969 offset += bytes_written as u64; 1970 total_bytes += bytes_written as u64; 1971 1972 if offset == range.length { 1973 break; 1974 } 1975 } 1976 } 1977 1978 debug!("coredump total bytes {}", total_bytes); 1979 Ok(()) 1980 } 1981 } 1982 1983 struct MemoryNotify { 1984 slot_id: usize, 1985 } 1986 1987 impl Aml for MemoryNotify { 1988 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1989 let 
        let object = aml::Path::new(&format!("M{:03}", self.slot_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.slot_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .append_aml_bytes(bytes)
    }
}

struct MemorySlot {
    slot_id: usize,
}

impl Aml for MemorySlot {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        aml::Device::new(
            format!("M{:03}", self.slot_id).as_str().into(),
            vec![
                &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0C80")),
                &aml::Name::new("_UID".into(), &self.slot_id),
                /*
                _STA return value:
                Bit [0] – Set if the device is present.
                Bit [1] – Set if the device is enabled and decoding its resources.
                Bit [2] – Set if the device should be shown in the UI.
                Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
                Bit [4] – Set if the battery is present.
                Bits [31:5] – Reserved (must be cleared).
                */
                &aml::Method::new(
                    "_STA".into(),
                    0,
                    false,
                    // Call into the MSTA method, which will interrogate the device
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "MSTA".into(),
                        vec![&self.slot_id],
                    ))],
                ),
                // Get details of memory
                &aml::Method::new(
                    "_CRS".into(),
                    0,
                    false,
                    // Call into MCRS, which provides the actual memory details
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "MCRS".into(),
                        vec![&self.slot_id],
                    ))],
                ),
            ],
        )
        .append_aml_bytes(bytes)
    }
}

struct MemorySlots {
    slots: usize,
}

impl Aml for MemorySlots {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        for slot_id in 0..self.slots {
            MemorySlot { slot_id }.append_aml_bytes(bytes);
        }
    }
}

struct MemoryMethods {
    slots: usize,
}

impl Aml for MemoryMethods {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        // Add "MTFY" notification method
        let mut memory_notifies = Vec::new();
        for slot_id in 0..self.slots {
            memory_notifies.push(MemoryNotify { slot_id });
        }

        let mut memory_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
        for memory_notifier in memory_notifies.iter() {
            memory_notifies_refs.push(memory_notifier);
        }

        aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).append_aml_bytes(bytes);

        // MSCN method
        aml::Method::new(
            "MSCN".into(),
            0,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                &aml::While::new(
                    &aml::LessThan::new(&aml::Local(0), &self.slots),
                    vec![
                        // Select the slot by writing the loop counter (Local0) to the MSEL field
                        &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
                        // Check if MINS bit is set (inserting)
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            // Notify device if it is
                            vec![
                                &aml::MethodCall::new(
                                    "MTFY".into(),
                                    vec![&aml::Local(0), &aml::ONE],
                                ),
                                // Reset MINS bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            ],
                        ),
                        // Check if MRMV bit is set
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            // Notify device if it is (with the eject constant 0x3)
                            vec![
                                &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
                                // Reset MRMV bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            ],
                        ),
                        &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                    ],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
            ],
        )
        .append_aml_bytes(bytes);

        // Memory status method
        aml::Method::new(
            "MSTA".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to the MSEL selector field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                &aml::If::new(
                    &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
                    vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
                // Return 0 or 0xf
                &aml::Return::new(&aml::Local(0)),
            ],
        )
        .append_aml_bytes(bytes);

        // Memory range method
        aml::Method::new(
            "MCRS".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to the MSEL selector field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Name::new(
                    "MR64".into(),
                    &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                        aml::AddressSpaceCachable::Cacheable,
                        true,
                        0x0000_0000_0000_0000u64,
                        0xFFFF_FFFF_FFFF_FFFEu64,
                    )]),
                ),
                // MR64 holds a QWORD memory AddressSpace descriptor; within that buffer
                // the range minimum starts at byte 14, the range maximum at byte 22 and
                // the length at byte 38. The fields below patch those values, in low/high
                // halves, from the MHPC MMIO registers.
                &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &14usize, "MINL".into()),
                &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &18usize, "MINH".into()),
                &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &22usize, "MAXL".into()),
                &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &26usize, "MAXH".into()),
                &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &38usize, "LENL".into()),
                &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &42usize, "LENH".into()),
                &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
                &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
                &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
                &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
                &aml::Add::new(
                    &aml::Path::new("MAXL"),
                    &aml::Path::new("MINL"),
                    &aml::Path::new("LENL"),
                ),
                &aml::Add::new(
                    &aml::Path::new("MAXH"),
                    &aml::Path::new("MINH"),
                    &aml::Path::new("LENH"),
                ),
                &aml::If::new(
                    &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
                    vec![&aml::Add::new(
                        &aml::Path::new("MAXH"),
                        &aml::ONE,
                        &aml::Path::new("MAXH"),
                    )],
                ),
                &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
                // Release lock
                &aml::Release::new("MLCK".into()),
                &aml::Return::new(&aml::Path::new("MR64")),
            ],
        )
        .append_aml_bytes(bytes)
    }
}

impl Aml for MemoryManager {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        if let Some(acpi_address) = self.acpi_address {
            // Memory Hotplug Controller
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
&aml::Name::new("_UID".into(), &"Memory Hotplug Controller"), 2212 // Mutex to protect concurrent access as we write to choose slot and then read back status 2213 &aml::Mutex::new("MLCK".into(), 0), 2214 &aml::Name::new( 2215 "_CRS".into(), 2216 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2217 aml::AddressSpaceCachable::NotCacheable, 2218 true, 2219 acpi_address.0 as u64, 2220 acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1, 2221 )]), 2222 ), 2223 // OpRegion and Fields map MMIO range into individual field values 2224 &aml::OpRegion::new( 2225 "MHPR".into(), 2226 aml::OpRegionSpace::SystemMemory, 2227 acpi_address.0 as usize, 2228 MEMORY_MANAGER_ACPI_SIZE, 2229 ), 2230 &aml::Field::new( 2231 "MHPR".into(), 2232 aml::FieldAccessType::DWord, 2233 aml::FieldUpdateRule::Preserve, 2234 vec![ 2235 aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes) 2236 aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes) 2237 aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes) 2238 aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes) 2239 ], 2240 ), 2241 &aml::Field::new( 2242 "MHPR".into(), 2243 aml::FieldAccessType::DWord, 2244 aml::FieldUpdateRule::Preserve, 2245 vec![ 2246 aml::FieldEntry::Reserved(128), 2247 aml::FieldEntry::Named(*b"MHPX", 32), // PXM 2248 ], 2249 ), 2250 &aml::Field::new( 2251 "MHPR".into(), 2252 aml::FieldAccessType::Byte, 2253 aml::FieldUpdateRule::WriteAsZeroes, 2254 vec![ 2255 aml::FieldEntry::Reserved(160), 2256 aml::FieldEntry::Named(*b"MEN_", 1), // Enabled 2257 aml::FieldEntry::Named(*b"MINS", 1), // Inserting 2258 aml::FieldEntry::Named(*b"MRMV", 1), // Removing 2259 aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting 2260 ], 2261 ), 2262 &aml::Field::new( 2263 "MHPR".into(), 2264 aml::FieldAccessType::DWord, 2265 aml::FieldUpdateRule::Preserve, 2266 vec![ 2267 aml::FieldEntry::Named(*b"MSEL", 32), // Selector 2268 aml::FieldEntry::Named(*b"MOEV", 32), // Event 2269 aml::FieldEntry::Named(*b"MOSC", 32), // OSC 2270 ], 2271 ), 2272 &MemoryMethods { 2273 slots: self.hotplug_slots.len(), 2274 }, 2275 &MemorySlots { 2276 slots: self.hotplug_slots.len(), 2277 }, 2278 ], 2279 ) 2280 .append_aml_bytes(bytes); 2281 } else { 2282 aml::Device::new( 2283 "_SB_.MHPC".into(), 2284 vec![ 2285 &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")), 2286 &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"), 2287 // Empty MSCN for GED 2288 &aml::Method::new("MSCN".into(), 0, true, vec![]), 2289 ], 2290 ) 2291 .append_aml_bytes(bytes); 2292 } 2293 2294 #[cfg(target_arch = "x86_64")] 2295 { 2296 if let Some(sgx_epc_region) = &self.sgx_epc_region { 2297 let min = sgx_epc_region.start().raw_value() as u64; 2298 let max = min + sgx_epc_region.size() as u64 - 1; 2299 // SGX EPC region 2300 aml::Device::new( 2301 "_SB_.EPC_".into(), 2302 vec![ 2303 &aml::Name::new("_HID".into(), &aml::EisaName::new("INT0E0C")), 2304 // QWORD describing the EPC region start and size 2305 &aml::Name::new( 2306 "_CRS".into(), 2307 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2308 aml::AddressSpaceCachable::NotCacheable, 2309 true, 2310 min, 2311 max, 2312 )]), 2313 ), 2314 &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]), 2315 ], 2316 ) 2317 .append_aml_bytes(bytes); 2318 } 2319 } 2320 } 2321 } 2322 2323 impl Pausable for MemoryManager {} 2324 2325 #[derive(Clone, Serialize, Deserialize, Versionize)] 2326 pub struct MemoryManagerSnapshotData { 2327 memory_ranges: MemoryRangeTable, 2328 
    guest_ram_mappings: Vec<GuestRamMapping>,
    start_of_device_area: u64,
    boot_ram: u64,
    current_ram: u64,
    arch_mem_regions: Vec<ArchMemRegion>,
    hotplug_slots: Vec<HotPlugState>,
    next_memory_slot: u32,
    selected_slot: usize,
    next_hotplug_slot: usize,
}

impl VersionMapped for MemoryManagerSnapshotData {}

impl Snapshottable for MemoryManager {
    fn id(&self) -> String {
        MEMORY_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
        let mut memory_manager_snapshot = Snapshot::new(MEMORY_MANAGER_SNAPSHOT_ID);

        let memory_ranges = self.memory_range_table(true)?;

        // Store this list of ranges locally, as it will be used by the
        // Transportable::send() implementation. The point is to avoid
        // duplicating work between the two steps: the 'snapshot' step creates
        // the list of memory regions, including whether each region needs its
        // content copied, so the 'send' step does not have to go through the
        // same process and can directly store the memory content for the
        // ranges that require it.
        self.snapshot_memory_ranges = memory_ranges;

        memory_manager_snapshot.add_data_section(SnapshotDataSection::new_from_versioned_state(
            MEMORY_MANAGER_SNAPSHOT_ID,
            &self.snapshot_data(),
        )?);

        Ok(memory_manager_snapshot)
    }
}

impl Transportable for MemoryManager {
    fn send(
        &self,
        _snapshot: &Snapshot,
        destination_url: &str,
    ) -> result::Result<(), MigratableError> {
        if self.snapshot_memory_ranges.is_empty() {
            return Ok(());
        }

        let mut memory_file_path = url_to_path(destination_url)?;
        memory_file_path.push(String::from(SNAPSHOT_FILENAME));

        // Create the snapshot file for the entire memory
        let mut memory_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(memory_file_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        let guest_memory = self.guest_memory.memory();

        for range in self.snapshot_memory_ranges.regions() {
            let mut offset: u64 = 0;
            // Manually handle the retry in case we can't write the whole
            // region at once: we can't use vm-memory's
            // GuestMemory::write_all_to() implementation as it does not
            // behave correctly. For more information about this issue see:
            // https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_written = guest_memory
                    .write_to(
                        GuestAddress(range.gpa + offset),
                        &mut memory_file,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| MigratableError::MigrateSend(e.into()))?;
                offset += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }
        Ok(())
    }
}

impl Migratable for MemoryManager {
    // Start the dirty log in the hypervisor (KVM/MSHV) and reset the dirty
    // bitmap tracked by the VMM. Just before a bulk copy we want to
    // start/clear the dirty log so that pages touched during the bulk copy
    // are tracked.
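    // Note that the two mechanisms are complementary: the hypervisor log
    // records guest writes, while the per-region VMM bitmaps record writes the
    // VMM performs through its own mapping; dirty_log() below merges both.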
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.start_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
        })?;

        for r in self.guest_memory.memory().iter() {
            r.bitmap().reset();
        }

        Ok(())
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.stop_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
        })?;

        Ok(())
    }

    // Generate a table for the pages that are dirty. The dirty pages are collapsed
    // together in the table if they are contiguous.
    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        let mut table = MemoryRangeTable::default();
        for r in &self.guest_ram_mappings {
            let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
                MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
            })?;
            let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
            {
                Some(region) => {
                    assert!(region.start_addr().raw_value() == r.gpa);
                    assert!(region.len() == r.size);
                    region.bitmap().get_and_reset()
                }
                None => {
                    return Err(MigratableError::MigrateSend(anyhow!(
                        "Error finding 'guest memory region' with address {:x}",
                        r.gpa
                    )))
                }
            };

            let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
                .iter()
                .zip(vmm_dirty_bitmap.iter())
                .map(|(x, y)| x | y)
                .collect();

            let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);

            if sub_table.regions().is_empty() {
                info!("Dirty Memory Range Table is empty");
            } else {
                info!("Dirty Memory Range Table:");
                for range in sub_table.regions() {
                    info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
                }
            }

            table.extend(sub_table);
        }
        Ok(table)
    }
}
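
// Illustrative sketch only (not part of the original file): a minimal test showing how
// MemoryRangeTable::from_bitmap() is expected to collapse contiguous dirty pages into a
// single MemoryRange, matching the way dirty_log() above builds its sub-tables. The module
// name and the exact collapsing behaviour are assumptions, not upstream code.
#[cfg(test)]
mod dirty_bitmap_sketch {
    use super::*;

    #[test]
    fn contiguous_dirty_pages_collapse_into_one_range() {
        // One u64 word covers 64 pages; bits 0, 1 and 2 mark the first three
        // 4 KiB pages after GPA 0x10000 as dirty.
        let bitmap: Vec<u64> = vec![0b111];
        let table = MemoryRangeTable::from_bitmap(bitmap, 0x10000, 4096);

        // Expect a single 12 KiB range starting at the base GPA.
        assert_eq!(table.regions().len(), 1);
        assert_eq!(table.regions()[0].gpa, 0x10000);
        assert_eq!(table.regions()[0].length, 3 * 4096_u64);
    }
}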