// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
#[cfg(target_arch = "x86_64")]
use crate::config::SgxEpcConfig;
use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
#[cfg(feature = "guest_debug")]
use crate::coredump::{CoredumpMemoryRegion, CoredumpMemoryRegions};
#[cfg(feature = "guest_debug")]
use crate::coredump::{DumpState, GuestDebuggableError};
use crate::migration::url_to_path;
use crate::MEMORY_MANAGER_SNAPSHOT_ID;
use crate::{GuestMemoryMmap, GuestRegionMmap};
use acpi_tables::{aml, aml::Aml};
use anyhow::anyhow;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
use arch::{layout, RegionType};
#[cfg(target_arch = "x86_64")]
use devices::ioapic;
#[cfg(target_arch = "x86_64")]
use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
use serde::{Deserialize, Serialize};
#[cfg(feature = "guest_debug")]
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::convert::TryInto;
use std::ffi;
use std::fs::{File, OpenOptions};
use std::io;
use std::ops::Deref;
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
use std::path::PathBuf;
use std::result;
use std::sync::{Arc, Barrier, Mutex};
use versionize::{VersionMap, Versionize, VersionizeResult};
use versionize_derive::Versionize;
use virtio_devices::BlocksState;
#[cfg(target_arch = "x86_64")]
use vm_allocator::GsiApic;
use vm_allocator::{AddressAllocator, SystemAllocator};
use vm_device::BusDevice;
use vm_memory::bitmap::AtomicBitmap;
use vm_memory::guest_memory::FileOffset;
use vm_memory::{
    mmap::MmapRegionError, Address, Bytes, Error as MmapError, GuestAddress, GuestAddressSpace,
    GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion,
};
use vm_migration::{
    protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable,
    Snapshot, SnapshotDataSection, Snapshottable, Transportable, VersionMapped,
};

pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;

const DEFAULT_MEMORY_ZONE: &str = "mem0";

const SNAPSHOT_FILENAME: &str = "memory-ranges";

#[cfg(target_arch = "x86_64")]
const X86_64_IRQ_BASE: u32 = 5;

#[cfg(target_arch = "x86_64")]
const SGX_PAGE_SIZE: u64 = 1 << 12;

const HOTPLUG_COUNT: usize = 8;

// Memory policy constants
const MPOL_BIND: u32 = 2;
const MPOL_MF_STRICT: u32 = 1;
const MPOL_MF_MOVE: u32 = 1 << 1;
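// Note: these mirror the Linux UAPI memory policy constants (MPOL_BIND and
// the MPOL_MF_* flags from linux/mempolicy.h); they are handed straight to
// the raw SYS_mbind syscall issued by create_ram_region() when a memory zone
// requests a host NUMA node.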
// Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;

#[derive(Clone, Default, Serialize, Deserialize, Versionize)]
struct HotPlugState {
    base: u64,
    length: u64,
    active: bool,
    inserting: bool,
    removing: bool,
}

pub struct VirtioMemZone {
    region: Arc<GuestRegionMmap>,
    resize_handler: virtio_devices::Resize,
    hotplugged_size: u64,
    hugepages: bool,
    blocks_state: Arc<Mutex<BlocksState>>,
}

impl VirtioMemZone {
    pub fn region(&self) -> &Arc<GuestRegionMmap> {
        &self.region
    }
    pub fn resize_handler(&self) -> &virtio_devices::Resize {
        &self.resize_handler
    }
    pub fn hotplugged_size(&self) -> u64 {
        self.hotplugged_size
    }
    pub fn hugepages(&self) -> bool {
        self.hugepages
    }
    pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
        &self.blocks_state
    }
    pub fn plugged_ranges(&self) -> MemoryRangeTable {
        self.blocks_state
            .lock()
            .unwrap()
            .memory_ranges(self.region.start_addr().raw_value(), true)
    }
}

#[derive(Default)]
pub struct MemoryZone {
    regions: Vec<Arc<GuestRegionMmap>>,
    virtio_mem_zone: Option<VirtioMemZone>,
}

impl MemoryZone {
    pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
        &self.regions
    }
    pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
        &self.virtio_mem_zone
    }
}

pub type MemoryZones = HashMap<String, MemoryZone>;

#[derive(Clone, Serialize, Deserialize, Versionize)]
struct GuestRamMapping {
    slot: u32,
    gpa: u64,
    size: u64,
    zone_id: String,
    virtio_mem: bool,
    file_offset: u64,
}

#[derive(Clone, Serialize, Deserialize, Versionize)]
struct ArchMemRegion {
    base: u64,
    size: usize,
    r_type: RegionType,
}

pub struct MemoryManager {
    boot_guest_memory: GuestMemoryMmap,
    guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
    next_memory_slot: u32,
    start_of_device_area: GuestAddress,
    end_of_device_area: GuestAddress,
    end_of_ram_area: GuestAddress,
    pub vm: Arc<dyn hypervisor::Vm>,
    hotplug_slots: Vec<HotPlugState>,
    selected_slot: usize,
    mergeable: bool,
    allocator: Arc<Mutex<SystemAllocator>>,
    hotplug_method: HotplugMethod,
    boot_ram: u64,
    current_ram: u64,
    next_hotplug_slot: usize,
    shared: bool,
    hugepages: bool,
    hugepage_size: Option<u64>,
    prefault: bool,
    #[cfg(target_arch = "x86_64")]
    sgx_epc_region: Option<SgxEpcRegion>,
    user_provided_zones: bool,
    snapshot_memory_ranges: MemoryRangeTable,
    memory_zones: MemoryZones,
    log_dirty: bool, // Enable dirty logging for created RAM regions
    arch_mem_regions: Vec<ArchMemRegion>,
    ram_allocator: AddressAllocator,
    dynamic: bool,

    // Keep track of calls to create_userspace_mapping() for guest RAM.
    // This is useful for getting the dirty pages as we need to know the
    // slots that the mapping is created in.
    guest_ram_mappings: Vec<GuestRamMapping>,

    pub acpi_address: Option<GuestAddress>,
}
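// The fields above roughly split into three groups: the live guest memory
// views (boot_guest_memory, guest_memory), hotplug/resize bookkeeping
// (hotplug_slots, selected_slot, next_hotplug_slot, current_ram), and the
// state persisted through MemoryManagerSnapshotData on snapshot/restore
// (guest_ram_mappings, arch_mem_regions, boot_ram, ...).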
#[derive(Debug)]
pub enum Error {
    /// Failed to create shared file.
    SharedFileCreate(io::Error),

    /// Failed to set shared file length.
    SharedFileSetLen(io::Error),

    /// Mmap backed guest memory error
    GuestMemory(MmapError),

    /// Failed to allocate a memory range.
    MemoryRangeAllocation,

    /// Error from region creation
    GuestMemoryRegion(MmapRegionError),

    /// No ACPI slot available
    NoSlotAvailable,

    /// Not enough space in the hotplug RAM region
    InsufficientHotplugRam,

    /// The requested hotplug memory addition is not a valid size
    InvalidSize,

    /// Failed to create the user memory region.
    CreateUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to remove the user memory region.
    RemoveUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to create an EventFd.
    EventFdFail(io::Error),

    /// EventFd write error
    EventfdError(io::Error),

    /// Failed to resize the virtio-mem region
    VirtioMemResizeFail(virtio_devices::mem::Error),

    /// Cannot restore VM
    Restore(MigratableError),

    /// Cannot restore VM because source URL is missing
    RestoreMissingSourceUrl,

    /// Cannot create the system allocator
    CreateSystemAllocator,

    /// Invalid SGX EPC section size
    #[cfg(target_arch = "x86_64")]
    EpcSectionSizeInvalid,

    /// Failed allocating SGX EPC region
    #[cfg(target_arch = "x86_64")]
    SgxEpcRangeAllocation,

    /// Failed opening SGX virtual EPC device
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcOpen(io::Error),

    /// Failed setting the SGX virtual EPC section size
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcFileSetLen(io::Error),

    /// Failed opening SGX provisioning device
    #[cfg(target_arch = "x86_64")]
    SgxProvisionOpen(io::Error),

    /// Failed enabling SGX provisioning
    #[cfg(target_arch = "x86_64")]
    SgxEnableProvisioning(hypervisor::HypervisorVmError),

    /// Failed creating a new MmapRegion instance.
    #[cfg(target_arch = "x86_64")]
    NewMmapRegion(vm_memory::mmap::MmapRegionError),

    /// No memory zones found.
    MissingMemoryZones,

    /// Memory configuration is not valid.
    InvalidMemoryParameters,

    /// Forbidden operation. Impossible to resize guest memory if it is
    /// backed by user defined memory regions.
    InvalidResizeWithMemoryZones,

    /// It's invalid to try applying a NUMA policy to a memory zone that is
    /// memory mapped with MAP_SHARED.
    InvalidSharedMemoryZoneWithHostNuma,

    /// Failed applying NUMA memory policy.
    ApplyNumaPolicy(io::Error),

    /// Memory zone identifier is not unique.
    DuplicateZoneId,

    /// No virtio-mem resizing handler found.
    MissingVirtioMemHandler,

    /// Unknown memory zone.
    UnknownMemoryZone,

    /// Invalid size for resizing. It can be any value except 0.
    InvalidHotplugSize,

    /// Invalid hotplug method associated with memory zones resizing capability.
    InvalidHotplugMethodWithMemoryZones,

    /// Could not find specified memory zone identifier from hash map.
    MissingZoneIdentifier,

    /// Resizing the memory zone failed.
    ResizeZone,

    /// Guest address overflow
    GuestAddressOverFlow,

    /// Error opening snapshot file
    SnapshotOpen(io::Error),

    /// Error copying snapshot into region
    SnapshotCopy(GuestMemoryError),

    /// Failed to allocate MMIO address
    AllocateMmioAddress,
}

const ENABLE_FLAG: usize = 0;
const INSERTING_FLAG: usize = 1;
const REMOVING_FLAG: usize = 2;
const EJECT_FLAG: usize = 3;

const BASE_OFFSET_LOW: u64 = 0;
const BASE_OFFSET_HIGH: u64 = 0x4;
const LENGTH_OFFSET_LOW: u64 = 0x8;
const LENGTH_OFFSET_HIGH: u64 = 0xC;
const STATUS_OFFSET: u64 = 0x14;
const SELECTION_OFFSET: u64 = 0;

// The MMIO address space size is reduced by 64 KiB. This is done for the
// following reasons:
//  - Reduce the addressable space size by at least 4 KiB to work around a
//    Linux bug when the VMM allocates devices at the end of the addressable
//    space
//  - Windows requires the addressable space size to be 64 KiB aligned
fn mmio_address_space_size(phys_bits: u8) -> u64 {
    (1 << phys_bits) - (1 << 16)
}

impl BusDevice for MemoryManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        if self.selected_slot < self.hotplug_slots.len() {
            let state = &self.hotplug_slots[self.selected_slot];
            match offset {
                BASE_OFFSET_LOW => {
                    data.copy_from_slice(&state.base.to_le_bytes()[..4]);
                }
                BASE_OFFSET_HIGH => {
                    data.copy_from_slice(&state.base.to_le_bytes()[4..]);
                }
                LENGTH_OFFSET_LOW => {
                    data.copy_from_slice(&state.length.to_le_bytes()[..4]);
                }
                LENGTH_OFFSET_HIGH => {
                    data.copy_from_slice(&state.length.to_le_bytes()[4..]);
                }
                STATUS_OFFSET => {
                    // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
                    data.fill(0);
                    if state.active {
                        data[0] |= 1 << ENABLE_FLAG;
                    }
                    if state.inserting {
                        data[0] |= 1 << INSERTING_FLAG;
                    }
                    if state.removing {
                        data[0] |= 1 << REMOVING_FLAG;
                    }
                }
                _ => {
                    warn!(
                        "Unexpected offset for accessing memory manager device: {:#}",
                        offset
                    );
                }
            }
        } else {
            warn!("Out of range memory slot: {}", self.selected_slot);
        }
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            SELECTION_OFFSET => {
                self.selected_slot = usize::from(data[0]);
            }
            STATUS_OFFSET => {
                if self.selected_slot < self.hotplug_slots.len() {
                    let state = &mut self.hotplug_slots[self.selected_slot];
                    // The ACPI code writes back a 1 to acknowledge the insertion
                    if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
                        state.inserting = false;
                    }
                    // Ditto for removal
                    if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
                        state.removing = false;
                    }
                    // Trigger removal of "DIMM"
                    if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
                        warn!("Ejection of memory not currently supported");
                    }
                } else {
                    warn!("Out of range memory slot: {}", self.selected_slot);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing memory manager device: {:#}",
                    offset
                );
            }
        };
        None
    }
}
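// For reference, the register block served by the BusDevice impl above (and
// driven by the ACPI MHPC device emitted later in this file) looks like:
//   0x00  slot selector (write) / selected slot base address, low 32 bits (read)
//   0x04  base address, high 32 bits (read)
//   0x08  slot length, low 32 bits (read)
//   0x0c  slot length, high 32 bits (read)
//   0x14  status: bit 0 enabled, bit 1 inserting, bit 2 removing, bit 3 eject (write only)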
impl MemoryManager {
    /// Creates all memory regions based on the available RAM ranges defined
    /// by `ram_regions`, and based on the description of the memory zones.
    /// In practice, this function can perform multiple memory mappings of the
    /// same backing file if there's a hole in the address space between two
    /// RAM ranges.
    /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G)
    /// and zones containing two zones (size 1G and size 4G).
    /// This function will create 3 resulting memory regions:
    /// - First one mapping entirely the first memory zone on 0-1G range
    /// - Second one mapping partially the second memory zone on 1G-3G range
    /// - Third one mapping partially the second memory zone on 4G-6G range
    fn create_memory_regions_from_zones(
        ram_regions: &[(GuestAddress, usize)],
        zones: &[MemoryZoneConfig],
        prefault: Option<bool>,
    ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
        let mut zones = zones.to_owned();
        let mut mem_regions = Vec::new();
        let mut zone = zones.remove(0);
        let mut zone_offset = 0;
        let mut memory_zones = HashMap::new();

        // Add zone id to the list of memory zones.
        memory_zones.insert(zone.id.clone(), MemoryZone::default());

        for ram_region in ram_regions.iter() {
            let mut ram_region_offset = 0;
            let mut exit = false;

            loop {
                let mut ram_region_consumed = false;
                let mut pull_next_zone = false;

                let ram_region_sub_size = ram_region.1 - ram_region_offset;
                let zone_sub_size = zone.size as usize - zone_offset;

                let file_offset = zone_offset as u64;
                let region_start = ram_region
                    .0
                    .checked_add(ram_region_offset as u64)
                    .ok_or(Error::GuestAddressOverFlow)?;
                let region_size = if zone_sub_size <= ram_region_sub_size {
                    if zone_sub_size == ram_region_sub_size {
                        ram_region_consumed = true;
                    }

                    ram_region_offset += zone_sub_size;
                    pull_next_zone = true;

                    zone_sub_size
                } else {
                    zone_offset += ram_region_sub_size;
                    ram_region_consumed = true;

                    ram_region_sub_size
                };

                let region = MemoryManager::create_ram_region(
                    &zone.file,
                    file_offset,
                    region_start,
                    region_size,
                    match prefault {
                        Some(pf) => pf,
                        None => zone.prefault,
                    },
                    zone.shared,
                    zone.hugepages,
                    zone.hugepage_size,
                    zone.host_numa_node,
                    None,
                )?;

                // Add region to the list of regions associated with the
                // current memory zone.
                if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
                    memory_zone.regions.push(region.clone());
                }

                mem_regions.push(region);

                if pull_next_zone {
                    // Get the next zone and reset the offset.
                    zone_offset = 0;
                    if zones.is_empty() {
                        exit = true;
                        break;
                    }
                    zone = zones.remove(0);

                    // Check if the zone id already exists. In case it does,
                    // throw an error as we need unique identifiers. Otherwise,
                    // add the new zone id to the list of memory zones.
                    if memory_zones.contains_key(&zone.id) {
                        error!(
                            "Memory zone identifier '{}' found more than once. \
                            It must be unique",
                            zone.id,
                        );
                        return Err(Error::DuplicateZoneId);
                    }
                    memory_zones.insert(zone.id.clone(), MemoryZone::default());
                }

                if ram_region_consumed {
                    break;
                }
            }

            if exit {
                break;
            }
        }

        Ok((mem_regions, memory_zones))
    }
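    // Note: `zone_offset` above doubles as the offset into the zone's backing
    // file, so a zone that straddles a hole in the RAM ranges (such as the
    // example in the doc comment) becomes several mappings of the same file
    // at increasing offsets.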
    // Restore both GuestMemory regions along with MemoryZone zones.
    fn restore_memory_regions_and_zones(
        guest_ram_mappings: &[GuestRamMapping],
        zones_config: &[MemoryZoneConfig],
        prefault: Option<bool>,
        mut existing_memory_files: HashMap<u32, File>,
    ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
        let mut memory_regions = Vec::new();
        let mut memory_zones = HashMap::new();

        for zone_config in zones_config {
            memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
        }

        for guest_ram_mapping in guest_ram_mappings {
            for zone_config in zones_config {
                if guest_ram_mapping.zone_id == zone_config.id {
                    let region = MemoryManager::create_ram_region(
                        &zone_config.file,
                        guest_ram_mapping.file_offset,
                        GuestAddress(guest_ram_mapping.gpa),
                        guest_ram_mapping.size as usize,
                        match prefault {
                            Some(pf) => pf,
                            None => zone_config.prefault,
                        },
                        zone_config.shared,
                        zone_config.hugepages,
                        zone_config.hugepage_size,
                        zone_config.host_numa_node,
                        existing_memory_files.remove(&guest_ram_mapping.slot),
                    )?;
                    memory_regions.push(Arc::clone(&region));
                    if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
                        if guest_ram_mapping.virtio_mem {
                            let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
                            let region_size = region.len();
                            memory_zone.virtio_mem_zone = Some(VirtioMemZone {
                                region,
                                resize_handler: virtio_devices::Resize::new(hotplugged_size)
                                    .map_err(Error::EventFdFail)?,
                                hotplugged_size,
                                hugepages: zone_config.hugepages,
                                blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
                            });
                        } else {
                            memory_zone.regions.push(region);
                        }
                    }
                }
            }
        }

        memory_regions.sort_by_key(|x| x.start_addr());

        Ok((memory_regions, memory_zones))
    }

    fn fill_saved_regions(
        &mut self,
        file_path: PathBuf,
        saved_regions: MemoryRangeTable,
    ) -> Result<(), Error> {
        if saved_regions.is_empty() {
            return Ok(());
        }

        // Open (read only) the snapshot file.
        let mut memory_file = OpenOptions::new()
            .read(true)
            .open(file_path)
            .map_err(Error::SnapshotOpen)?;

        let guest_memory = self.guest_memory.memory();
        for range in saved_regions.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't write
            // the whole region at once because we can't use the implementation
            // from vm-memory::GuestMemory of read_exact_from() as it is not
            // following the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_read = guest_memory
                    .read_from(
                        GuestAddress(range.gpa + offset),
                        &mut memory_file,
                        (range.length - offset) as usize,
                    )
                    .map_err(Error::SnapshotCopy)?;
                offset += bytes_read as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        Ok(())
    }

    fn validate_memory_config(
        config: &MemoryConfig,
        user_provided_zones: bool,
    ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
        let mut allow_mem_hotplug = false;

        if !user_provided_zones {
            if config.zones.is_some() {
                error!(
                    "User defined memory regions can't be provided if the \
                    memory size is not 0"
                );
                return Err(Error::InvalidMemoryParameters);
            }

            if config.hotplug_size.is_some() {
                allow_mem_hotplug = true;
            }

            if let Some(hotplugged_size) = config.hotplugged_size {
                if let Some(hotplug_size) = config.hotplug_size {
                    if hotplugged_size > hotplug_size {
                        error!(
                            "'hotplugged_size' {} can't be bigger than \
                            'hotplug_size' {}",
                            hotplugged_size, hotplug_size,
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                } else {
                    error!(
                        "Invalid to define 'hotplugged_size' when there is \
                        no 'hotplug_size'"
                    );
                    return Err(Error::InvalidMemoryParameters);
                }
                if config.hotplug_method == HotplugMethod::Acpi {
                    error!(
                        "Invalid to define 'hotplugged_size' with hotplug \
                        method 'acpi'"
                    );
                    return Err(Error::InvalidMemoryParameters);
                }
            }

            // Create a single zone from the global memory config. This lets
            // us reuse the codepath for user defined memory zones.
            let zones = vec![MemoryZoneConfig {
                id: String::from(DEFAULT_MEMORY_ZONE),
                size: config.size,
                file: None,
                shared: config.shared,
                hugepages: config.hugepages,
                hugepage_size: config.hugepage_size,
                host_numa_node: None,
                hotplug_size: config.hotplug_size,
                hotplugged_size: config.hotplugged_size,
                prefault: config.prefault,
            }];

            Ok((config.size, zones, allow_mem_hotplug))
        } else {
            if config.zones.is_none() {
                error!(
                    "User defined memory regions must be provided if the \
                    memory size is 0"
                );
                return Err(Error::MissingMemoryZones);
            }

            // Safe to unwrap as we checked right above there were some
            // regions.
            let zones = config.zones.clone().unwrap();
            if zones.is_empty() {
                return Err(Error::MissingMemoryZones);
            }

            let mut total_ram_size: u64 = 0;
            for zone in zones.iter() {
                total_ram_size += zone.size;

                if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
                    error!(
                        "Invalid to set host NUMA policy for a memory zone \
                        backed by a regular file and mapped as 'shared'"
                    );
                    return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
                }

                if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
                    error!("Invalid to set ACPI hotplug method for memory zones");
                    return Err(Error::InvalidHotplugMethodWithMemoryZones);
                }

                if let Some(hotplugged_size) = zone.hotplugged_size {
                    if let Some(hotplug_size) = zone.hotplug_size {
                        if hotplugged_size > hotplug_size {
                            error!(
                                "'hotplugged_size' {} can't be bigger than \
                                'hotplug_size' {}",
                                hotplugged_size, hotplug_size,
                            );
                            return Err(Error::InvalidMemoryParameters);
                        }
                    } else {
                        error!(
                            "Invalid to define 'hotplugged_size' when there is \
                            no 'hotplug_size' for a memory zone"
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                    if config.hotplug_method == HotplugMethod::Acpi {
                        error!(
                            "Invalid to define 'hotplugged_size' with hotplug \
                            method 'acpi'"
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                }
            }

            Ok((total_ram_size, zones, allow_mem_hotplug))
        }
    }
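    // For reference, the two configurations accepted above are either a
    // non-zero `MemoryConfig::size` with no `zones` (a single implicit
    // DEFAULT_MEMORY_ZONE zone is synthesized), or `size == 0` with a
    // non-empty `zones` list, in which case per-zone hotplug must go through
    // virtio-mem rather than ACPI.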
    fn allocate_address_space(&mut self) -> Result<(), Error> {
        let mut list = Vec::new();

        for (zone_id, memory_zone) in self.memory_zones.iter() {
            let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
                memory_zone
                    .regions()
                    .iter()
                    .map(|r| (r.clone(), false))
                    .collect();

            if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
                regions.push((virtio_mem_zone.region().clone(), true));
            }

            list.push((zone_id.clone(), regions));
        }

        for (zone_id, regions) in list {
            for (region, virtio_mem) in regions {
                let slot = self.create_userspace_mapping(
                    region.start_addr().raw_value(),
                    region.len() as u64,
                    region.as_ptr() as u64,
                    self.mergeable,
                    false,
                    self.log_dirty,
                )?;

                let file_offset = if let Some(file_offset) = region.file_offset() {
                    file_offset.start()
                } else {
                    0
                };

                self.guest_ram_mappings.push(GuestRamMapping {
                    gpa: region.start_addr().raw_value(),
                    size: region.len(),
                    slot,
                    zone_id: zone_id.clone(),
                    virtio_mem,
                    file_offset,
                });
                self.ram_allocator
                    .allocate(Some(region.start_addr()), region.len(), None)
                    .ok_or(Error::MemoryRangeAllocation)?;
            }
        }

        // Allocate SubRegion and Reserved address ranges.
        for region in self.arch_mem_regions.iter() {
            if region.r_type == RegionType::Ram {
                // Ignore the RAM type since ranges have already been allocated
                // based on the GuestMemory regions.
                continue;
            }
            self.ram_allocator
                .allocate(
                    Some(GuestAddress(region.base)),
                    region.size as GuestUsize,
                    None,
                )
                .ok_or(Error::MemoryRangeAllocation)?;
        }

        Ok(())
    }

    #[allow(clippy::too_many_arguments)]
    pub fn new(
        vm: Arc<dyn hypervisor::Vm>,
        config: &MemoryConfig,
        prefault: Option<bool>,
        phys_bits: u8,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        restore_data: Option<&MemoryManagerSnapshotData>,
        existing_memory_files: Option<HashMap<u32, File>>,
        #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
    ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
        let user_provided_zones = config.size == 0;

        let mmio_address_space_size = mmio_address_space_size(phys_bits);
        debug_assert_eq!(
            (((mmio_address_space_size) >> 16) << 16),
            mmio_address_space_size
        );
        let start_of_platform_device_area =
            GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
        let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);

        let (ram_size, zones, allow_mem_hotplug) =
            Self::validate_memory_config(config, user_provided_zones)?;

        let (
            start_of_device_area,
            boot_ram,
            current_ram,
            arch_mem_regions,
            memory_zones,
            guest_memory,
            boot_guest_memory,
            hotplug_slots,
            next_memory_slot,
            selected_slot,
            next_hotplug_slot,
        ) = if let Some(data) = restore_data {
            let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
                &data.guest_ram_mappings,
                &zones,
                prefault,
                existing_memory_files.unwrap_or_default(),
            )?;
            let guest_memory =
                GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
            let boot_guest_memory = guest_memory.clone();
            (
                GuestAddress(data.start_of_device_area),
                data.boot_ram,
                data.current_ram,
                data.arch_mem_regions.clone(),
                memory_zones,
                guest_memory,
                boot_guest_memory,
                data.hotplug_slots.clone(),
                data.next_memory_slot,
                data.selected_slot,
                data.next_hotplug_slot,
            )
        } else {
            // Init guest memory
            let arch_mem_regions = arch::arch_memory_regions(ram_size);

            let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
                .iter()
                .filter(|r| r.2 == RegionType::Ram)
                .map(|r| (r.0, r.1))
                .collect();

            let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
                .iter()
                .map(|(a, b, c)| ArchMemRegion {
                    base: a.0,
                    size: *b,
                    r_type: *c,
                })
                .collect();

            let (mem_regions, mut memory_zones) =
                Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault)?;

            let mut guest_memory =
                GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;

            let boot_guest_memory = guest_memory.clone();

            let mut start_of_device_area =
                MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;

            // Update list of memory zones for resize.
            for zone in zones.iter() {
                if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
                    if let Some(hotplug_size) = zone.hotplug_size {
                        if hotplug_size == 0 {
                            error!("'hotplug_size' can't be 0");
                            return Err(Error::InvalidHotplugSize);
                        }

                        if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
                            start_of_device_area = start_of_device_area
                                .checked_add(hotplug_size)
                                .ok_or(Error::GuestAddressOverFlow)?;
                        } else {
                            // Alignment must be "natural" i.e. same as size of block
                            let start_addr = GuestAddress(
                                (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE
                                    - 1)
                                    / virtio_devices::VIRTIO_MEM_ALIGN_SIZE
                                    * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
                            );

                            // When `prefault` is set by vm_restore, memory manager
                            // will create ram region with `prefault` option in
                            // restore config rather than same option in zone
                            let region = MemoryManager::create_ram_region(
                                &None,
                                0,
                                start_addr,
                                hotplug_size as usize,
                                match prefault {
                                    Some(pf) => pf,
                                    None => zone.prefault,
                                },
                                zone.shared,
                                zone.hugepages,
                                zone.hugepage_size,
                                zone.host_numa_node,
                                None,
                            )?;

                            guest_memory = guest_memory
                                .insert_region(Arc::clone(&region))
                                .map_err(Error::GuestMemory)?;

                            let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
                            let region_size = region.len();
                            memory_zone.virtio_mem_zone = Some(VirtioMemZone {
                                region,
                                resize_handler: virtio_devices::Resize::new(hotplugged_size)
                                    .map_err(Error::EventFdFail)?,
                                hotplugged_size,
                                hugepages: zone.hugepages,
                                blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
                            });

                            start_of_device_area = start_addr
                                .checked_add(hotplug_size)
                                .ok_or(Error::GuestAddressOverFlow)?;
                        }
                    }
                } else {
                    return Err(Error::MissingZoneIdentifier);
                }
            }

            let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
            hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);

            (
                start_of_device_area,
                ram_size,
                ram_size,
                arch_mem_regions,
                memory_zones,
                guest_memory,
                boot_guest_memory,
                hotplug_slots,
                0,
                0,
                0,
            )
        };

        let guest_memory = GuestMemoryAtomic::new(guest_memory);

        // Both MMIO and PIO address spaces start at address 0.
        let allocator = Arc::new(Mutex::new(
            SystemAllocator::new(
                #[cfg(target_arch = "x86_64")]
                {
                    GuestAddress(0)
                },
                #[cfg(target_arch = "x86_64")]
                {
                    1 << 16
                },
                start_of_platform_device_area,
                PLATFORM_DEVICE_AREA_SIZE,
                layout::MEM_32BIT_DEVICES_START,
                layout::MEM_32BIT_DEVICES_SIZE,
                #[cfg(target_arch = "x86_64")]
                vec![GsiApic::new(
                    X86_64_IRQ_BASE,
                    ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
                )],
            )
            .ok_or(Error::CreateSystemAllocator)?,
        ));

        #[cfg(not(feature = "tdx"))]
        let dynamic = true;
        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;

        let acpi_address = if dynamic
            && config.hotplug_method == HotplugMethod::Acpi
            && (config.hotplug_size.unwrap_or_default() > 0)
        {
            Some(
                allocator
                    .lock()
                    .unwrap()
                    .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
                    .ok_or(Error::AllocateMmioAddress)?,
            )
        } else {
            None
        };
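        // The MEMORY_MANAGER_ACPI_SIZE (0x18 byte) MMIO window allocated above
        // is the register block implemented by the BusDevice read()/write()
        // handlers earlier in this file and described to the guest by the ACPI
        // MHPC device.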
        // If running on SGX the start of device area and RAM area may diverge but
        // at this point they are next to each other.
        let end_of_ram_area = start_of_device_area.unchecked_sub(1);
        let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();

        let mut memory_manager = MemoryManager {
            boot_guest_memory,
            guest_memory,
            next_memory_slot,
            start_of_device_area,
            end_of_device_area,
            end_of_ram_area,
            vm,
            hotplug_slots,
            selected_slot,
            mergeable: config.mergeable,
            allocator,
            hotplug_method: config.hotplug_method,
            boot_ram,
            current_ram,
            next_hotplug_slot,
            shared: config.shared,
            hugepages: config.hugepages,
            hugepage_size: config.hugepage_size,
            prefault: config.prefault,
            #[cfg(target_arch = "x86_64")]
            sgx_epc_region: None,
            user_provided_zones,
            snapshot_memory_ranges: MemoryRangeTable::default(),
            memory_zones,
            guest_ram_mappings: Vec::new(),
            acpi_address,
            log_dirty: dynamic, // Cannot log dirty pages on a TD
            arch_mem_regions,
            ram_allocator,
            dynamic,
        };

        memory_manager.allocate_address_space()?;
        #[cfg(target_arch = "x86_64")]
        if let Some(sgx_epc_config) = sgx_epc_config {
            memory_manager.setup_sgx(sgx_epc_config)?;
        }

        Ok(Arc::new(Mutex::new(memory_manager)))
    }

    pub fn new_from_snapshot(
        snapshot: &Snapshot,
        vm: Arc<dyn hypervisor::Vm>,
        config: &MemoryConfig,
        source_url: Option<&str>,
        prefault: bool,
        phys_bits: u8,
    ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
        if let Some(source_url) = source_url {
            let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
            memory_file_path.push(String::from(SNAPSHOT_FILENAME));

            let mem_snapshot: MemoryManagerSnapshotData = snapshot
                .to_versioned_state(MEMORY_MANAGER_SNAPSHOT_ID)
                .map_err(Error::Restore)?;

            let mm = MemoryManager::new(
                vm,
                config,
                Some(prefault),
                phys_bits,
                #[cfg(feature = "tdx")]
                false,
                Some(&mem_snapshot),
                None,
                #[cfg(target_arch = "x86_64")]
                None,
            )?;

            mm.lock()
                .unwrap()
                .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;

            Ok(mm)
        } else {
            Err(Error::RestoreMissingSourceUrl)
        }
    }

    fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
        let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };

        if res < 0 {
            Err(io::Error::last_os_error())
        } else {
            Ok(res as RawFd)
        }
    }

    fn mbind(
        addr: *mut u8,
        len: u64,
        mode: u32,
        nodemask: Vec<u64>,
        maxnode: u64,
        flags: u32,
    ) -> Result<(), io::Error> {
        let res = unsafe {
            libc::syscall(
                libc::SYS_mbind,
                addr as *mut libc::c_void,
                len,
                mode,
                nodemask.as_ptr(),
                maxnode,
                flags,
            )
        };

        if res < 0 {
            Err(io::Error::last_os_error())
        } else {
            Ok(())
        }
    }
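    // Worked example for the mbind() arguments built in create_ram_region()
    // below: with host_numa_node = 70, the nodemask vector holds
    // (70 / 64) + 1 = 2 u64 words, word 70 / 64 = 1 gets bit 70 % 64 = 6 set,
    // and maxnode is passed as 70 + 1 + 1 = 72 to account for the off-by-one
    // described there.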
    fn open_memory_file(
        backing_file: &Option<PathBuf>,
        file_offset: u64,
        size: usize,
        hugepages: bool,
        hugepage_size: Option<u64>,
    ) -> Result<(File, u64), Error> {
        let (f, f_off) = match backing_file {
            Some(ref file) => {
                if file.is_dir() {
                    // Override file offset as it does not apply in this case.
                    info!(
                        "Ignoring file offset since the backing file is a \
                        temporary file created from the specified directory."
                    );
                    let fs_str = format!("{}{}", file.display(), "/tmpfile_XXXXXX");
                    let fs = ffi::CString::new(fs_str).unwrap();
                    let mut path = fs.as_bytes_with_nul().to_owned();
                    let path_ptr = path.as_mut_ptr() as *mut _;
                    let fd = unsafe { libc::mkstemp(path_ptr) };
                    unsafe { libc::unlink(path_ptr) };
                    let f = unsafe { File::from_raw_fd(fd) };
                    f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;

                    (f, 0)
                } else {
                    let f = OpenOptions::new()
                        .read(true)
                        .write(true)
                        .open(file)
                        .map_err(Error::SharedFileCreate)?;

                    (f, file_offset)
                }
            }
            None => {
                let fd = Self::memfd_create(
                    &ffi::CString::new("ch_ram").unwrap(),
                    if hugepages {
                        libc::MFD_HUGETLB
                            | if let Some(hugepage_size) = hugepage_size {
                                /*
                                 * From the Linux kernel:
                                 * Several system calls take a flag to request "hugetlb" huge pages.
                                 * Without further specification, these system calls will use the
                                 * system's default huge page size. If a system supports multiple
                                 * huge page sizes, the desired huge page size can be specified in
                                 * bits [26:31] of the flag arguments. The value in these 6 bits
                                 * will encode the log2 of the huge page size.
                                 */
                                hugepage_size.trailing_zeros() << 26
                            } else {
                                // Use the system default huge page size
                                0
                            }
                    } else {
                        0
                    },
                )
                .map_err(Error::SharedFileCreate)?;

                let f = unsafe { File::from_raw_fd(fd) };
                f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;

                (f, 0)
            }
        };

        Ok((f, f_off))
    }

    #[allow(clippy::too_many_arguments)]
    fn create_ram_region(
        backing_file: &Option<PathBuf>,
        file_offset: u64,
        start_addr: GuestAddress,
        size: usize,
        prefault: bool,
        shared: bool,
        hugepages: bool,
        hugepage_size: Option<u64>,
        host_numa_node: Option<u32>,
        existing_memory_file: Option<File>,
    ) -> Result<Arc<GuestRegionMmap>, Error> {
        let (f, f_off) = if let Some(f) = existing_memory_file {
            (f, file_offset)
        } else {
            Self::open_memory_file(backing_file, file_offset, size, hugepages, hugepage_size)?
        };

        let mut mmap_flags = libc::MAP_NORESERVE
            | if shared {
                libc::MAP_SHARED
            } else {
                libc::MAP_PRIVATE
            };
        if prefault {
            mmap_flags |= libc::MAP_POPULATE;
        }

        let region = GuestRegionMmap::new(
            MmapRegion::build(
                Some(FileOffset::new(f, f_off)),
                size,
                libc::PROT_READ | libc::PROT_WRITE,
                mmap_flags,
            )
            .map_err(Error::GuestMemoryRegion)?,
            start_addr,
        )
        .map_err(Error::GuestMemory)?;

        // Apply NUMA policy if needed.
        if let Some(node) = host_numa_node {
            let addr = region.deref().as_ptr();
            let len = region.deref().size() as u64;
            let mode = MPOL_BIND;
            let mut nodemask: Vec<u64> = Vec::new();
            let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;

            // Linux is kind of buggy in the way it interprets maxnode as it
            // will cut off the last node. That's why we have to add 1 to what
            // we would consider as the proper maxnode value.
            let maxnode = node as u64 + 1 + 1;

            // Allocate the right size for the vector.
            nodemask.resize((node as usize / 64) + 1, 0);

            // Fill the global bitmask through the nodemask vector.
            let idx = (node / 64) as usize;
            let shift = node % 64;
            nodemask[idx] |= 1u64 << shift;

            // Policies are enforced by using MPOL_MF_MOVE flag as it will
            // force the kernel to move all pages that might have been already
            // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
            // used to throw an error if MPOL_MF_MOVE didn't succeed.
            // MPOL_BIND is the selected mode as it specifies a strict policy
            // that restricts memory allocation to the nodes specified in the
            // nodemask.
            Self::mbind(addr, len, mode, nodemask, maxnode, flags)
                .map_err(Error::ApplyNumaPolicy)?;
        }

        Ok(Arc::new(region))
    }

    // Update the GuestMemoryMmap with the new range
    fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
        let guest_memory = self
            .guest_memory
            .memory()
            .insert_region(region)
            .map_err(Error::GuestMemory)?;
        self.guest_memory.lock().unwrap().replace(guest_memory);

        Ok(())
    }

    //
    // Calculate the start address of an area next to RAM.
    //
    // If memory hotplug is allowed, the start address needs to be aligned
    // (rounded-up) to a 128 MiB boundary.
    // If memory hotplug is not allowed, there is no alignment required.
    // In either case it must also start at the 64-bit RAM start.
    fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
        let mut start_addr = if allow_mem_hotplug {
            GuestAddress(mem_end.0 | ((128 << 20) - 1))
        } else {
            mem_end
        };

        start_addr = start_addr
            .checked_add(1)
            .ok_or(Error::GuestAddressOverFlow)?;

        if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
            return Ok(arch::layout::RAM_64BIT_START);
        }

        Ok(start_addr)
    }

    pub fn add_ram_region(
        &mut self,
        start_addr: GuestAddress,
        size: usize,
    ) -> Result<Arc<GuestRegionMmap>, Error> {
        // Allocate memory for the region
        let region = MemoryManager::create_ram_region(
            &None,
            0,
            start_addr,
            size,
            self.prefault,
            self.shared,
            self.hugepages,
            self.hugepage_size,
            None,
            None,
        )?;

        // Map it into the guest
        let slot = self.create_userspace_mapping(
            region.start_addr().0,
            region.len() as u64,
            region.as_ptr() as u64,
            self.mergeable,
            false,
            self.log_dirty,
        )?;
        self.guest_ram_mappings.push(GuestRamMapping {
            gpa: region.start_addr().raw_value(),
            size: region.len(),
            slot,
            zone_id: DEFAULT_MEMORY_ZONE.to_string(),
            virtio_mem: false,
            file_offset: 0,
        });

        self.add_region(Arc::clone(&region))?;

        Ok(region)
    }

    fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
        info!("Hotplugging new RAM: {}", size);

        // Check that there is a free slot
        if self.next_hotplug_slot >= HOTPLUG_COUNT {
            return Err(Error::NoSlotAvailable);
        }

        // "Inserted" DIMM must have a size that is a multiple of 128MiB
        if size % (128 << 20) != 0 {
            return Err(Error::InvalidSize);
        }

        let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;

        if start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area {
            return Err(Error::InsufficientHotplugRam);
        }

        let region = self.add_ram_region(start_addr, size)?;

        // Add region
to the list of regions associated with the default 1420 // memory zone. 1421 if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) { 1422 memory_zone.regions.push(Arc::clone(®ion)); 1423 } 1424 1425 // Tell the allocator 1426 self.ram_allocator 1427 .allocate(Some(start_addr), size as GuestUsize, None) 1428 .ok_or(Error::MemoryRangeAllocation)?; 1429 1430 // Update the slot so that it can be queried via the I/O port 1431 let mut slot = &mut self.hotplug_slots[self.next_hotplug_slot]; 1432 slot.active = true; 1433 slot.inserting = true; 1434 slot.base = region.start_addr().0; 1435 slot.length = region.len() as u64; 1436 1437 self.next_hotplug_slot += 1; 1438 1439 Ok(region) 1440 } 1441 1442 pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> { 1443 self.guest_memory.clone() 1444 } 1445 1446 pub fn boot_guest_memory(&self) -> GuestMemoryMmap { 1447 self.boot_guest_memory.clone() 1448 } 1449 1450 pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> { 1451 self.allocator.clone() 1452 } 1453 1454 pub fn start_of_device_area(&self) -> GuestAddress { 1455 self.start_of_device_area 1456 } 1457 1458 pub fn end_of_device_area(&self) -> GuestAddress { 1459 self.end_of_device_area 1460 } 1461 1462 pub fn allocate_memory_slot(&mut self) -> u32 { 1463 let slot_id = self.next_memory_slot; 1464 self.next_memory_slot += 1; 1465 slot_id 1466 } 1467 1468 pub fn create_userspace_mapping( 1469 &mut self, 1470 guest_phys_addr: u64, 1471 memory_size: u64, 1472 userspace_addr: u64, 1473 mergeable: bool, 1474 readonly: bool, 1475 log_dirty: bool, 1476 ) -> Result<u32, Error> { 1477 let slot = self.allocate_memory_slot(); 1478 let mem_region = self.vm.make_user_memory_region( 1479 slot, 1480 guest_phys_addr, 1481 memory_size, 1482 userspace_addr, 1483 readonly, 1484 log_dirty, 1485 ); 1486 1487 info!( 1488 "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}", 1489 guest_phys_addr, userspace_addr, memory_size, slot 1490 ); 1491 1492 self.vm 1493 .create_user_memory_region(mem_region) 1494 .map_err(Error::CreateUserMemoryRegion)?; 1495 1496 // Mark the pages as mergeable if explicitly asked for. 1497 if mergeable { 1498 // Safe because the address and size are valid since the 1499 // mmap succeeded. 1500 let ret = unsafe { 1501 libc::madvise( 1502 userspace_addr as *mut libc::c_void, 1503 memory_size as libc::size_t, 1504 libc::MADV_MERGEABLE, 1505 ) 1506 }; 1507 if ret != 0 { 1508 let err = io::Error::last_os_error(); 1509 // Safe to unwrap because the error is constructed with 1510 // last_os_error(), which ensures the output will be Some(). 
1511 let errno = err.raw_os_error().unwrap(); 1512 if errno == libc::EINVAL { 1513 warn!("kernel not configured with CONFIG_KSM"); 1514 } else { 1515 warn!("madvise error: {}", err); 1516 } 1517 warn!("failed to mark pages as mergeable"); 1518 } 1519 } 1520 1521 info!( 1522 "Created userspace mapping: {:x} -> {:x} {:x}", 1523 guest_phys_addr, userspace_addr, memory_size 1524 ); 1525 1526 Ok(slot) 1527 } 1528 1529 pub fn remove_userspace_mapping( 1530 &mut self, 1531 guest_phys_addr: u64, 1532 memory_size: u64, 1533 userspace_addr: u64, 1534 mergeable: bool, 1535 slot: u32, 1536 ) -> Result<(), Error> { 1537 let mem_region = self.vm.make_user_memory_region( 1538 slot, 1539 guest_phys_addr, 1540 memory_size, 1541 userspace_addr, 1542 false, /* readonly -- don't care */ 1543 false, /* log dirty */ 1544 ); 1545 1546 self.vm 1547 .remove_user_memory_region(mem_region) 1548 .map_err(Error::RemoveUserMemoryRegion)?; 1549 1550 // Mark the pages as unmergeable if there were previously marked as 1551 // mergeable. 1552 if mergeable { 1553 // Safe because the address and size are valid as the region was 1554 // previously advised. 1555 let ret = unsafe { 1556 libc::madvise( 1557 userspace_addr as *mut libc::c_void, 1558 memory_size as libc::size_t, 1559 libc::MADV_UNMERGEABLE, 1560 ) 1561 }; 1562 if ret != 0 { 1563 let err = io::Error::last_os_error(); 1564 // Safe to unwrap because the error is constructed with 1565 // last_os_error(), which ensures the output will be Some(). 1566 let errno = err.raw_os_error().unwrap(); 1567 if errno == libc::EINVAL { 1568 warn!("kernel not configured with CONFIG_KSM"); 1569 } else { 1570 warn!("madvise error: {}", err); 1571 } 1572 warn!("failed to mark pages as unmergeable"); 1573 } 1574 } 1575 1576 info!( 1577 "Removed userspace mapping: {:x} -> {:x} {:x}", 1578 guest_phys_addr, userspace_addr, memory_size 1579 ); 1580 1581 Ok(()) 1582 } 1583 1584 pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> { 1585 if let Some(memory_zone) = self.memory_zones.get_mut(id) { 1586 if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone { 1587 virtio_mem_zone 1588 .resize_handler() 1589 .work(size) 1590 .map_err(Error::VirtioMemResizeFail)?; 1591 1592 // Keep the hotplugged_size up to date. 1593 virtio_mem_zone.hotplugged_size = size; 1594 } else { 1595 error!("Failed resizing virtio-mem region: No virtio-mem handler"); 1596 return Err(Error::MissingVirtioMemHandler); 1597 } 1598 1599 return Ok(()); 1600 } 1601 1602 error!("Failed resizing virtio-mem region: Unknown memory zone"); 1603 Err(Error::UnknownMemoryZone) 1604 } 1605 1606 /// In case this function resulted in adding a new memory region to the 1607 /// guest memory, the new region is returned to the caller. The virtio-mem 1608 /// use case never adds a new region as the whole hotpluggable memory has 1609 /// already been allocated at boot time. 1610 pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> { 1611 if self.user_provided_zones { 1612 error!( 1613 "Not allowed to resize guest memory when backed with user \ 1614 defined memory zones." 
1615 ); 1616 return Err(Error::InvalidResizeWithMemoryZones); 1617 } 1618 1619 let mut region: Option<Arc<GuestRegionMmap>> = None; 1620 match self.hotplug_method { 1621 HotplugMethod::VirtioMem => { 1622 if desired_ram >= self.boot_ram { 1623 if !self.dynamic { 1624 return Ok(region); 1625 } 1626 1627 self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?; 1628 self.current_ram = desired_ram; 1629 } 1630 } 1631 HotplugMethod::Acpi => { 1632 if desired_ram > self.current_ram { 1633 if !self.dynamic { 1634 return Ok(region); 1635 } 1636 1637 region = 1638 Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?); 1639 self.current_ram = desired_ram; 1640 } 1641 } 1642 } 1643 Ok(region) 1644 } 1645 1646 pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> { 1647 if !self.user_provided_zones { 1648 error!( 1649 "Not allowed to resize guest memory zone when no zone is \ 1650 defined." 1651 ); 1652 return Err(Error::ResizeZone); 1653 } 1654 1655 self.virtio_mem_resize(id, virtio_mem_size) 1656 } 1657 1658 #[cfg(target_arch = "x86_64")] 1659 pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> { 1660 let file = OpenOptions::new() 1661 .read(true) 1662 .open("/dev/sgx_provision") 1663 .map_err(Error::SgxProvisionOpen)?; 1664 self.vm 1665 .enable_sgx_attribute(file) 1666 .map_err(Error::SgxEnableProvisioning)?; 1667 1668 // Go over each EPC section and verify its size is a 4k multiple. At 1669 // the same time, calculate the total size needed for the contiguous 1670 // EPC region. 1671 let mut epc_region_size = 0; 1672 for epc_section in sgx_epc_config.iter() { 1673 if epc_section.size == 0 { 1674 return Err(Error::EpcSectionSizeInvalid); 1675 } 1676 if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 { 1677 return Err(Error::EpcSectionSizeInvalid); 1678 } 1679 1680 epc_region_size += epc_section.size; 1681 } 1682 1683 // Place the SGX EPC region on a 4k boundary between the RAM and the device area 1684 let epc_region_start = GuestAddress( 1685 ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE, 1686 ); 1687 1688 self.start_of_device_area = epc_region_start 1689 .checked_add(epc_region_size) 1690 .ok_or(Error::GuestAddressOverFlow)?; 1691 1692 let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize); 1693 info!( 1694 "SGX EPC region: 0x{:x} (0x{:x})", 1695 epc_region_start.0, epc_region_size 1696 ); 1697 1698 // Each section can be memory mapped into the allocated region. 1699 let mut epc_section_start = epc_region_start.raw_value(); 1700 for epc_section in sgx_epc_config.iter() { 1701 let file = OpenOptions::new() 1702 .read(true) 1703 .write(true) 1704 .open("/dev/sgx_vepc") 1705 .map_err(Error::SgxVirtEpcOpen)?; 1706 1707 let prot = PROT_READ | PROT_WRITE; 1708 let mut flags = MAP_NORESERVE | MAP_SHARED; 1709 if epc_section.prefault { 1710 flags |= MAP_POPULATE; 1711 } 1712 1713 // We can't use the vm-memory crate to perform the memory mapping 1714 // here as it would try to ensure the size of the backing file is 1715 // matching the size of the expected mapping. The /dev/sgx_vepc 1716 // device does not work that way, it provides a file descriptor 1717 // which is not matching the mapping size, as it's a just a way to 1718 // let KVM know that an EPC section is being created for the guest. 
1719 let host_addr = unsafe { 1720 libc::mmap( 1721 std::ptr::null_mut(), 1722 epc_section.size as usize, 1723 prot, 1724 flags, 1725 file.as_raw_fd(), 1726 0, 1727 ) 1728 } as u64; 1729 1730 info!( 1731 "Adding SGX EPC section: 0x{:x} (0x{:x})", 1732 epc_section_start, epc_section.size 1733 ); 1734 1735 let _mem_slot = self.create_userspace_mapping( 1736 epc_section_start, 1737 epc_section.size, 1738 host_addr, 1739 false, 1740 false, 1741 false, 1742 )?; 1743 1744 sgx_epc_region.insert( 1745 epc_section.id.clone(), 1746 SgxEpcSection::new( 1747 GuestAddress(epc_section_start), 1748 epc_section.size as GuestUsize, 1749 ), 1750 ); 1751 1752 epc_section_start += epc_section.size; 1753 } 1754 1755 self.sgx_epc_region = Some(sgx_epc_region); 1756 1757 Ok(()) 1758 } 1759 1760 #[cfg(target_arch = "x86_64")] 1761 pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> { 1762 &self.sgx_epc_region 1763 } 1764 1765 pub fn is_hardlink(f: &File) -> bool { 1766 let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit(); 1767 let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) }; 1768 if ret != 0 { 1769 error!("Couldn't fstat the backing file"); 1770 return false; 1771 } 1772 1773 unsafe { (*stat.as_ptr()).st_nlink as usize > 0 } 1774 } 1775 1776 pub fn memory_zones(&self) -> &MemoryZones { 1777 &self.memory_zones 1778 } 1779 1780 pub fn memory_range_table( 1781 &self, 1782 snapshot: bool, 1783 ) -> std::result::Result<MemoryRangeTable, MigratableError> { 1784 let mut table = MemoryRangeTable::default(); 1785 1786 for memory_zone in self.memory_zones.values() { 1787 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() { 1788 table.extend(virtio_mem_zone.plugged_ranges()); 1789 } 1790 1791 for region in memory_zone.regions() { 1792 if snapshot { 1793 if let Some(file_offset) = region.file_offset() { 1794 if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED) 1795 && Self::is_hardlink(file_offset.file()) 1796 { 1797 // In this very specific case, we know the memory 1798 // region is backed by a file on the host filesystem 1799 // that can be accessed by the user, and additionally 1800 // the mapping is shared, which means that modifications 1801 // to the content are written to the actual file. 1802 // When meeting these conditions, we can skip the 1803 // copy of the memory content for this specific region, 1804 // as we can assume the user will have it saved through 1805 // the backing file already. 
1806 continue; 1807 } 1808 } 1809 } 1810 1811 table.push(MemoryRange { 1812 gpa: region.start_addr().raw_value(), 1813 length: region.len() as u64, 1814 }); 1815 } 1816 } 1817 1818 Ok(table) 1819 } 1820 1821 pub fn snapshot_data(&self) -> MemoryManagerSnapshotData { 1822 MemoryManagerSnapshotData { 1823 memory_ranges: self.snapshot_memory_ranges.clone(), 1824 guest_ram_mappings: self.guest_ram_mappings.clone(), 1825 start_of_device_area: self.start_of_device_area.0, 1826 boot_ram: self.boot_ram, 1827 current_ram: self.current_ram, 1828 arch_mem_regions: self.arch_mem_regions.clone(), 1829 hotplug_slots: self.hotplug_slots.clone(), 1830 next_memory_slot: self.next_memory_slot, 1831 selected_slot: self.selected_slot, 1832 next_hotplug_slot: self.next_hotplug_slot, 1833 } 1834 } 1835 1836 pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> { 1837 let mut memory_slot_fds = HashMap::new(); 1838 for guest_ram_mapping in &self.guest_ram_mappings { 1839 let slot = guest_ram_mapping.slot; 1840 let guest_memory = self.guest_memory.memory(); 1841 let file = guest_memory 1842 .find_region(GuestAddress(guest_ram_mapping.gpa)) 1843 .unwrap() 1844 .file_offset() 1845 .unwrap() 1846 .file(); 1847 memory_slot_fds.insert(slot, file.as_raw_fd()); 1848 } 1849 memory_slot_fds 1850 } 1851 1852 pub fn acpi_address(&self) -> Option<GuestAddress> { 1853 self.acpi_address 1854 } 1855 1856 pub fn num_guest_ram_mappings(&self) -> u32 { 1857 self.guest_ram_mappings.len() as u32 1858 } 1859 1860 #[cfg(feature = "guest_debug")] 1861 pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions { 1862 let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone(); 1863 mapping_sorted_by_gpa.sort_by_key(|m| m.gpa); 1864 1865 let mut mem_offset_in_elf = mem_offset; 1866 let mut ram_maps = BTreeMap::new(); 1867 for mapping in mapping_sorted_by_gpa.iter() { 1868 ram_maps.insert( 1869 mapping.gpa, 1870 CoredumpMemoryRegion { 1871 mem_offset_in_elf, 1872 mem_size: mapping.size, 1873 }, 1874 ); 1875 mem_offset_in_elf += mapping.size; 1876 } 1877 1878 CoredumpMemoryRegions { ram_maps } 1879 } 1880 1881 #[cfg(feature = "guest_debug")] 1882 pub fn coredump_iterate_save_mem( 1883 &mut self, 1884 dump_state: &DumpState, 1885 ) -> std::result::Result<(), GuestDebuggableError> { 1886 let snapshot_memory_ranges = self 1887 .memory_range_table(false) 1888 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 1889 1890 if snapshot_memory_ranges.is_empty() { 1891 return Ok(()); 1892 } 1893 1894 let mut coredump_file = dump_state.file.as_ref().unwrap(); 1895 1896 let guest_memory = self.guest_memory.memory(); 1897 let mut total_bytes: u64 = 0; 1898 1899 for range in snapshot_memory_ranges.regions() { 1900 let mut offset: u64 = 0; 1901 loop { 1902 let bytes_written = guest_memory 1903 .write_to( 1904 GuestAddress(range.gpa + offset), 1905 &mut coredump_file, 1906 (range.length - offset) as usize, 1907 ) 1908 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 1909 offset += bytes_written as u64; 1910 total_bytes += bytes_written as u64; 1911 1912 if offset == range.length { 1913 break; 1914 } 1915 } 1916 } 1917 1918 debug!("coredump total bytes {}", total_bytes); 1919 Ok(()) 1920 } 1921 } 1922 1923 struct MemoryNotify { 1924 slot_id: usize, 1925 } 1926 1927 impl Aml for MemoryNotify { 1928 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1929 let object = aml::Path::new(&format!("M{:03}", self.slot_id)); 1930 aml::If::new( 1931 &aml::Equal::new(&aml::Arg(0), &self.slot_id), 1932 vec![&aml::Notify::new(&object, 
&aml::Arg(1))], 1933 ) 1934 .append_aml_bytes(bytes) 1935 } 1936 } 1937 1938 struct MemorySlot { 1939 slot_id: usize, 1940 } 1941 1942 impl Aml for MemorySlot { 1943 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1944 aml::Device::new( 1945 format!("M{:03}", self.slot_id).as_str().into(), 1946 vec![ 1947 &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0C80")), 1948 &aml::Name::new("_UID".into(), &self.slot_id), 1949 /* 1950 _STA return value: 1951 Bit [0] – Set if the device is present. 1952 Bit [1] – Set if the device is enabled and decoding its resources. 1953 Bit [2] – Set if the device should be shown in the UI. 1954 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1955 Bit [4] – Set if the battery is present. 1956 Bits [31:5] – Reserved (must be cleared). 1957 */ 1958 &aml::Method::new( 1959 "_STA".into(), 1960 0, 1961 false, 1962 // Call into MSTA method which will interrogate device 1963 vec![&aml::Return::new(&aml::MethodCall::new( 1964 "MSTA".into(), 1965 vec![&self.slot_id], 1966 ))], 1967 ), 1968 // Get details of memory 1969 &aml::Method::new( 1970 "_CRS".into(), 1971 0, 1972 false, 1973 // Call into MCRS which provides actual memory details 1974 vec![&aml::Return::new(&aml::MethodCall::new( 1975 "MCRS".into(), 1976 vec![&self.slot_id], 1977 ))], 1978 ), 1979 ], 1980 ) 1981 .append_aml_bytes(bytes) 1982 } 1983 } 1984 1985 struct MemorySlots { 1986 slots: usize, 1987 } 1988 1989 impl Aml for MemorySlots { 1990 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1991 for slot_id in 0..self.slots { 1992 MemorySlot { slot_id }.append_aml_bytes(bytes); 1993 } 1994 } 1995 } 1996 1997 struct MemoryMethods { 1998 slots: usize, 1999 } 2000 2001 impl Aml for MemoryMethods { 2002 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 2003 // Add "MTFY" notification method 2004 let mut memory_notifies = Vec::new(); 2005 for slot_id in 0..self.slots { 2006 memory_notifies.push(MemoryNotify { slot_id }); 2007 } 2008 2009 let mut memory_notifies_refs: Vec<&dyn aml::Aml> = Vec::new(); 2010 for memory_notifier in memory_notifies.iter() { 2011 memory_notifies_refs.push(memory_notifier); 2012 } 2013 2014 aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).append_aml_bytes(bytes); 2015 2016 // MSCN method 2017 aml::Method::new( 2018 "MSCN".into(), 2019 0, 2020 true, 2021 vec![ 2022 // Take lock defined above 2023 &aml::Acquire::new("MLCK".into(), 0xffff), 2024 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2025 &aml::While::new( 2026 &aml::LessThan::new(&aml::Local(0), &self.slots), 2027 vec![ 2028 // Write slot number (in first argument) to I/O port via field 2029 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)), 2030 // Check if MINS bit is set (inserting) 2031 &aml::If::new( 2032 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE), 2033 // Notify device if it is 2034 vec![ 2035 &aml::MethodCall::new( 2036 "MTFY".into(), 2037 vec![&aml::Local(0), &aml::ONE], 2038 ), 2039 // Reset MINS bit 2040 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE), 2041 ], 2042 ), 2043 // Check if MRMV bit is set 2044 &aml::If::new( 2045 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE), 2046 // Notify device if it is (with the eject constant 0x3) 2047 vec![ 2048 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]), 2049 // Reset MRMV bit 2050 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE), 2051 ], 2052 ), 2053 &aml::Add::new(&aml::Local(0), &aml::Local(0), 
struct MemorySlots {
    slots: usize,
}

impl Aml for MemorySlots {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        for slot_id in 0..self.slots {
            MemorySlot { slot_id }.append_aml_bytes(bytes);
        }
    }
}

struct MemoryMethods {
    slots: usize,
}

impl Aml for MemoryMethods {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        // Add "MTFY" notification method
        let mut memory_notifies = Vec::new();
        for slot_id in 0..self.slots {
            memory_notifies.push(MemoryNotify { slot_id });
        }

        let mut memory_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
        for memory_notifier in memory_notifies.iter() {
            memory_notifies_refs.push(memory_notifier);
        }

        aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).append_aml_bytes(bytes);

        // MSCN method
        aml::Method::new(
            "MSCN".into(),
            0,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                &aml::While::new(
                    &aml::LessThan::new(&aml::Local(0), &self.slots),
                    vec![
                        // Write the current slot number (loop index in Local0) to the selector field
                        &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
                        // Check if MINS bit is set (inserting)
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            // Notify device if it is
                            vec![
                                &aml::MethodCall::new(
                                    "MTFY".into(),
                                    vec![&aml::Local(0), &aml::ONE],
                                ),
                                // Reset MINS bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            ],
                        ),
                        // Check if MRMV bit is set
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            // Notify device if it is (with the eject constant 0x3)
                            vec![
                                &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
                                // Reset MRMV bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            ],
                        ),
                        &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                    ],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
            ],
        )
        .append_aml_bytes(bytes);

        // Memory status method
        aml::Method::new(
            "MSTA".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to I/O port via field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                &aml::If::new(
                    &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
                    vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
                // Return 0 or 0xf
                &aml::Return::new(&aml::Local(0)),
            ],
        )
        .append_aml_bytes(bytes);

        // Memory range method
        aml::Method::new(
            "MCRS".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to I/O port via field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Name::new(
                    "MR64".into(),
                    &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                        aml::AddressSpaceCachable::Cacheable,
                        true,
                        0x0000_0000_0000_0000u64,
                        0xFFFF_FFFF_FFFF_FFFEu64,
                    )]),
                ),
                // The numeric values below are byte offsets into the MR64 resource
                // buffer, pointing at the Minimum, Maximum and Length fields of the
                // QWordMemory descriptor so they can be patched in place.
                &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &14usize, "MINL".into()),
                &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &18usize, "MINH".into()),
                &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &22usize, "MAXL".into()),
                &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &26usize, "MAXH".into()),
                &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &38usize, "LENL".into()),
                &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &42usize, "LENH".into()),
                &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
                &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
                &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
                &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
                &aml::Add::new(
                    &aml::Path::new("MAXL"),
                    &aml::Path::new("MINL"),
                    &aml::Path::new("LENL"),
                ),
                &aml::Add::new(
                    &aml::Path::new("MAXH"),
                    &aml::Path::new("MINH"),
                    &aml::Path::new("LENH"),
                ),
                &aml::If::new(
                    &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
                    vec![&aml::Add::new(
                        &aml::Path::new("MAXH"),
                        &aml::ONE,
                        &aml::Path::new("MAXH"),
                    )],
                ),
                &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
                // Release lock
                &aml::Release::new("MLCK".into()),
                &aml::Return::new(&aml::Path::new("MR64")),
            ],
        )
        .append_aml_bytes(bytes)
    }
}
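
// For reference (illustrative sketch only): the MSCN scan method defined above
// corresponds to ASL roughly like:
//
//     Method (MSCN, 0, Serialized)
//     {
//         Acquire (MLCK, 0xFFFF)
//         Local0 = Zero
//         While (Local0 < <number of slots>)
//         {
//             \_SB.MHPC.MSEL = Local0
//             If (\_SB.MHPC.MINS == One)
//             {
//                 MTFY (Local0, One)      // Notify 0x1: device check
//                 \_SB.MHPC.MINS = One    // acknowledge/reset the insert flag
//             }
//             If (\_SB.MHPC.MRMV == One)
//             {
//                 MTFY (Local0, 0x03)     // Notify 0x3: eject request
//                 \_SB.MHPC.MRMV = One    // acknowledge/reset the remove flag
//             }
//             Local0++
//         }
//         Release (MLCK)
//     }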
impl Aml for MemoryManager {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        if let Some(acpi_address) = self.acpi_address {
            // Memory Hotplug Controller
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose slot and then read back status
                    &aml::Mutex::new("MLCK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCachable::NotCacheable,
                            true,
                            acpi_address.0 as u64,
                            acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
                        )]),
                    ),
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "MHPR".into(),
                        aml::OpRegionSpace::SystemMemory,
                        acpi_address.0 as usize,
                        MEMORY_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
                            aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Reserved(128),
                            aml::FieldEntry::Named(*b"MHPX", 32), // PXM
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(160),
                            aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
                            aml::FieldEntry::Named(*b"MINS", 1), // Inserting
                            aml::FieldEntry::Named(*b"MRMV", 1), // Removing
                            aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MSEL", 32), // Selector
                            aml::FieldEntry::Named(*b"MOEV", 32), // Event
                            aml::FieldEntry::Named(*b"MOSC", 32), // OSC
                        ],
                    ),
                    &MemoryMethods {
                        slots: self.hotplug_slots.len(),
                    },
                    &MemorySlots {
                        slots: self.hotplug_slots.len(),
                    },
                ],
            )
            .append_aml_bytes(bytes);
        } else {
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Empty MSCN for GED
                    &aml::Method::new("MSCN".into(), 0, true, vec![]),
                ],
            )
            .append_aml_bytes(bytes);
        }

        #[cfg(target_arch = "x86_64")]
        {
            if let Some(sgx_epc_region) = &self.sgx_epc_region {
                let min = sgx_epc_region.start().raw_value() as u64;
                let max = min + sgx_epc_region.size() as u64 - 1;
                // SGX EPC region
                aml::Device::new(
                    "_SB_.EPC_".into(),
                    vec![
                        &aml::Name::new("_HID".into(), &aml::EisaName::new("INT0E0C")),
                        // QWORD describing the EPC region start and size
                        &aml::Name::new(
                            "_CRS".into(),
                            &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                                aml::AddressSpaceCachable::NotCacheable,
                                true,
                                min,
                                max,
                            )]),
                        ),
                        &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
                    ],
                )
                .append_aml_bytes(bytes);
            }
        }
    }
}

impl Pausable for MemoryManager {}
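
/// Serialized counterpart of the MemoryManager runtime state (see
/// `snapshot_data()` above): the memory ranges included in the snapshot, the
/// guest RAM mappings, and the bookkeeping (boot/current RAM sizes, hotplug
/// slots, slot counters) needed to rebuild an equivalent memory layout on
/// restore.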
#[derive(Clone, Serialize, Deserialize, Versionize)]
pub struct MemoryManagerSnapshotData {
    memory_ranges: MemoryRangeTable,
    guest_ram_mappings: Vec<GuestRamMapping>,
    start_of_device_area: u64,
    boot_ram: u64,
    current_ram: u64,
    arch_mem_regions: Vec<ArchMemRegion>,
    hotplug_slots: Vec<HotPlugState>,
    next_memory_slot: u32,
    selected_slot: usize,
    next_hotplug_slot: usize,
}

impl VersionMapped for MemoryManagerSnapshotData {}

impl Snapshottable for MemoryManager {
    fn id(&self) -> String {
        MEMORY_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
        let mut memory_manager_snapshot = Snapshot::new(MEMORY_MANAGER_SNAPSHOT_ID);

        let memory_ranges = self.memory_range_table(true)?;

        // Store this list of ranges locally, as it will be used through the
        // Transportable::send() implementation. The point is to avoid
        // duplicating the code that builds the path for each region. The
        // 'snapshot' step creates the list of memory ranges, including the
        // information about whether a given range needs to be copied. This
        // saves the 'send' step from going through the same process, so it can
        // directly store the memory range content for the ranges requiring it.
        self.snapshot_memory_ranges = memory_ranges;

        memory_manager_snapshot.add_data_section(SnapshotDataSection::new_from_versioned_state(
            MEMORY_MANAGER_SNAPSHOT_ID,
            &self.snapshot_data(),
        )?);

        Ok(memory_manager_snapshot)
    }
}

impl Transportable for MemoryManager {
    fn send(
        &self,
        _snapshot: &Snapshot,
        destination_url: &str,
    ) -> result::Result<(), MigratableError> {
        if self.snapshot_memory_ranges.is_empty() {
            return Ok(());
        }

        let mut memory_file_path = url_to_path(destination_url)?;
        memory_file_path.push(String::from(SNAPSHOT_FILENAME));

        // Create the snapshot file for the entire memory
        let mut memory_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(memory_file_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        let guest_memory = self.guest_memory.memory();

        for range in self.snapshot_memory_ranges.regions() {
            let mut offset: u64 = 0;
            // Manually handle the retry in case the whole region can't be
            // written at once, because vm-memory's GuestMemory::write_all_to()
            // implementation does not follow the correct behavior. For more
            // info about this issue see:
            // https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_written = guest_memory
                    .write_to(
                        GuestAddress(range.gpa + offset),
                        &mut memory_file,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| MigratableError::MigrateSend(e.into()))?;
                offset += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }
        Ok(())
    }
}
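
// Rough calling sequence expected from the migration code (sketch based on the
// comments below, not a guarantee of the caller's behavior): start_dirty_log()
// is invoked just before the bulk copy of guest RAM so that pages touched
// during the copy are tracked, dirty_log() is then polled to transfer only the
// pages dirtied since the previous pass, and stop_dirty_log() turns tracking
// off once the incremental passes are done.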
impl Migratable for MemoryManager {
    // Start the dirty log in the hypervisor (kvm/mshv).
    // Also, reset the dirty bitmap logged by the vmm.
    // Just before we do a bulk copy we want to start/clear the dirty log so that
    // pages touched during our bulk copy are tracked.
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.start_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
        })?;

        for r in self.guest_memory.memory().iter() {
            r.bitmap().reset();
        }

        Ok(())
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.stop_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
        })?;

        Ok(())
    }

    // Generate a table for the pages that are dirty. The dirty pages are collapsed
    // together in the table if they are contiguous.
    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        let mut table = MemoryRangeTable::default();
        for r in &self.guest_ram_mappings {
            let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
                MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
            })?;
            let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
            {
                Some(region) => {
                    assert!(region.start_addr().raw_value() == r.gpa);
                    assert!(region.len() == r.size);
                    region.bitmap().get_and_reset()
                }
                None => {
                    return Err(MigratableError::MigrateSend(anyhow!(
                        "Error finding 'guest memory region' with address {:x}",
                        r.gpa
                    )))
                }
            };

            // Merge the pages reported dirty by the hypervisor with the ones
            // tracked by the VMM itself, as a page may have been dirtied by
            // either side.
            let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
                .iter()
                .zip(vmm_dirty_bitmap.iter())
                .map(|(x, y)| x | y)
                .collect();

            let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);

            if sub_table.regions().is_empty() {
                info!("Dirty Memory Range Table is empty");
            } else {
                info!("Dirty Memory Range Table:");
                for range in sub_table.regions() {
                    info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
                }
            }

            table.extend(sub_table);
        }
        Ok(table)
    }
}
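
// Illustrative sketch only (not part of the original file): a minimal example
// of how the MemoryRangeTable operations used above (push/extend/regions/
// is_empty) compose, under the assumption that extend() simply appends the
// ranges of the per-slot table to the global one, as dirty_log() relies on.
#[cfg(test)]
mod memory_range_table_example {
    use super::*;

    #[test]
    fn build_and_merge_tables() {
        // Table describing two dirty pages starting at GPA 0x1_0000.
        let mut table = MemoryRangeTable::default();
        table.push(MemoryRange {
            gpa: 0x1_0000,
            length: 2 * 4096,
        });

        // Per-slot table with a single dirty page at GPA 0x10_0000.
        let mut other = MemoryRangeTable::default();
        other.push(MemoryRange {
            gpa: 0x10_0000,
            length: 4096,
        });

        // Merge the per-slot table into the global one, as dirty_log() does.
        table.extend(other);

        assert!(!table.is_empty());
        assert_eq!(table.regions().len(), 2);
    }
}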