1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // SPDX-License-Identifier: Apache-2.0 5 // 6 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 7 // Use of this source code is governed by a BSD-style license that can be 8 // found in the LICENSE-BSD-3-Clause file. 9 use std::sync::Arc; 10 pub mod interrupts; 11 pub mod layout; 12 mod mpspec; 13 mod mptable; 14 pub mod regs; 15 use crate::GuestMemoryMmap; 16 use crate::InitramfsConfig; 17 use crate::RegionType; 18 use hypervisor::arch::x86::{CpuIdEntry, CPUID_FLAG_VALID_INDEX}; 19 use hypervisor::{CpuVendor, HypervisorCpuError, HypervisorError}; 20 use linux_loader::loader::bootparam::{boot_params, setup_header}; 21 use linux_loader::loader::elf::start_info::{ 22 hvm_memmap_table_entry, hvm_modlist_entry, hvm_start_info, 23 }; 24 use std::collections::BTreeMap; 25 use std::mem; 26 use thiserror::Error; 27 use vm_memory::{ 28 Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, 29 GuestMemoryRegion, GuestUsize, 30 }; 31 mod smbios; 32 use std::arch::x86_64; 33 #[cfg(feature = "tdx")] 34 pub mod tdx; 35 36 // CPUID feature bits 37 const TSC_DEADLINE_TIMER_ECX_BIT: u8 = 24; // tsc deadline timer ecx bit. 38 const HYPERVISOR_ECX_BIT: u8 = 31; // Hypervisor ecx bit. 39 const MTRR_EDX_BIT: u8 = 12; // Hypervisor ecx bit. 40 const INVARIANT_TSC_EDX_BIT: u8 = 8; // Invariant TSC bit on 0x8000_0007 EDX 41 const AMX_BF16: u8 = 22; // AMX tile computation on bfloat16 numbers 42 const AMX_TILE: u8 = 24; // AMX tile load/store instructions 43 const AMX_INT8: u8 = 25; // AMX tile computation on 8-bit integers 44 45 // KVM feature bits 46 #[cfg(feature = "tdx")] 47 const KVM_FEATURE_CLOCKSOURCE_BIT: u8 = 0; 48 #[cfg(feature = "tdx")] 49 const KVM_FEATURE_CLOCKSOURCE2_BIT: u8 = 3; 50 #[cfg(feature = "tdx")] 51 const KVM_FEATURE_CLOCKSOURCE_STABLE_BIT: u8 = 24; 52 #[cfg(feature = "tdx")] 53 const KVM_FEATURE_ASYNC_PF_BIT: u8 = 4; 54 #[cfg(feature = "tdx")] 55 const KVM_FEATURE_ASYNC_PF_VMEXIT_BIT: u8 = 10; 56 #[cfg(feature = "tdx")] 57 const KVM_FEATURE_STEAL_TIME_BIT: u8 = 5; 58 59 pub const _NSIG: i32 = 65; 60 61 #[derive(Debug, Copy, Clone)] 62 /// Specifies the entry point address where the guest must start 63 /// executing code, as well as which of the supported boot protocols 64 /// is to be used to configure the guest initial state. 65 pub struct EntryPoint { 66 /// Address in guest memory where the guest must start execution 67 pub entry_addr: GuestAddress, 68 /// This field is used for bzImage to fill the zero page 69 pub setup_header: Option<setup_header>, 70 } 71 72 const E820_RAM: u32 = 1; 73 const E820_RESERVED: u32 = 2; 74 75 #[derive(Clone)] 76 pub struct SgxEpcSection { 77 start: GuestAddress, 78 size: GuestUsize, 79 } 80 81 impl SgxEpcSection { 82 pub fn new(start: GuestAddress, size: GuestUsize) -> Self { 83 SgxEpcSection { start, size } 84 } 85 pub fn start(&self) -> GuestAddress { 86 self.start 87 } 88 pub fn size(&self) -> GuestUsize { 89 self.size 90 } 91 } 92 93 #[derive(Clone)] 94 pub struct SgxEpcRegion { 95 start: GuestAddress, 96 size: GuestUsize, 97 epc_sections: BTreeMap<String, SgxEpcSection>, 98 } 99 100 impl SgxEpcRegion { 101 pub fn new(start: GuestAddress, size: GuestUsize) -> Self { 102 SgxEpcRegion { 103 start, 104 size, 105 epc_sections: BTreeMap::new(), 106 } 107 } 108 pub fn start(&self) -> GuestAddress { 109 self.start 110 } 111 pub fn size(&self) -> GuestUsize { 112 self.size 113 } 114 pub fn epc_sections(&self) -> &BTreeMap<String, SgxEpcSection> { 115 &self.epc_sections 116 } 117 pub fn insert(&mut self, id: String, epc_section: SgxEpcSection) { 118 self.epc_sections.insert(id, epc_section); 119 } 120 } 121 122 pub struct CpuidConfig { 123 pub sgx_epc_sections: Option<Vec<SgxEpcSection>>, 124 pub phys_bits: u8, 125 pub kvm_hyperv: bool, 126 #[cfg(feature = "tdx")] 127 pub tdx: bool, 128 pub amx: bool, 129 } 130 131 #[derive(Debug, Error)] 132 pub enum Error { 133 /// Error writing MP table to memory. 134 #[error("Error writing MP table to memory: {0}")] 135 MpTableSetup(mptable::Error), 136 137 /// Error configuring the general purpose registers 138 #[error("Error configuring the general purpose registers: {0}")] 139 RegsConfiguration(regs::Error), 140 141 /// Error configuring the special registers 142 #[error("Error configuring the special registers: {0}")] 143 SregsConfiguration(regs::Error), 144 145 /// Error configuring the floating point related registers 146 #[error("Error configuring the floating point related registers: {0}")] 147 FpuConfiguration(regs::Error), 148 149 /// Error configuring the MSR registers 150 #[error("Error configuring the MSR registers: {0}")] 151 MsrsConfiguration(regs::Error), 152 153 /// Failed to set supported CPUs. 154 #[error("Failed to set supported CPUs: {0}")] 155 SetSupportedCpusFailed(anyhow::Error), 156 157 /// Cannot set the local interruption due to bad configuration. 158 #[error("Cannot set the local interruption due to bad configuration: {0}")] 159 LocalIntConfiguration(anyhow::Error), 160 161 /// Error setting up SMBIOS table 162 #[error("Error setting up SMBIOS table: {0}")] 163 SmbiosSetup(smbios::Error), 164 165 /// Could not find any SGX EPC section 166 #[error("Could not find any SGX EPC section")] 167 NoSgxEpcSection, 168 169 /// Missing SGX CPU feature 170 #[error("Missing SGX CPU feature")] 171 MissingSgxFeature, 172 173 /// Missing SGX_LC CPU feature 174 #[error("Missing SGX_LC CPU feature")] 175 MissingSgxLaunchControlFeature, 176 177 /// Error getting supported CPUID through the hypervisor (kvm/mshv) API 178 #[error("Error getting supported CPUID through the hypervisor API: {0}")] 179 CpuidGetSupported(HypervisorError), 180 181 /// Error populating CPUID with KVM HyperV emulation details 182 #[error("Error populating CPUID with KVM HyperV emulation details: {0}")] 183 CpuidKvmHyperV(vmm_sys_util::fam::Error), 184 185 /// Error populating CPUID with CPU identification 186 #[error("Error populating CPUID with CPU identification: {0}")] 187 CpuidIdentification(vmm_sys_util::fam::Error), 188 189 /// Error checking CPUID compatibility 190 #[error("Error checking CPUID compatibility")] 191 CpuidCheckCompatibility, 192 193 // Error writing EBDA address 194 #[error("Error writing EBDA address: {0}")] 195 EbdaSetup(vm_memory::GuestMemoryError), 196 197 // Error getting CPU TSC frequency 198 #[error("Error getting CPU TSC frequency: {0}")] 199 GetTscFrequency(HypervisorCpuError), 200 201 /// Error retrieving TDX capabilities through the hypervisor (kvm/mshv) API 202 #[cfg(feature = "tdx")] 203 #[error("Error retrieving TDX capabilities through the hypervisor API: {0}")] 204 TdxCapabilities(HypervisorError), 205 206 /// Failed to configure E820 map for bzImage 207 #[error("Failed to configure E820 map for bzImage")] 208 E820Configuration, 209 } 210 211 impl From<Error> for super::Error { 212 fn from(e: Error) -> super::Error { 213 super::Error::PlatformSpecific(e) 214 } 215 } 216 217 pub fn get_x2apic_id(cpu_id: u32, topology: Option<(u8, u8, u8)>) -> u32 { 218 if let Some(t) = topology { 219 let thread_mask_width = u8::BITS - (t.0 - 1).leading_zeros(); 220 let core_mask_width = u8::BITS - (t.1 - 1).leading_zeros(); 221 let die_mask_width = u8::BITS - (t.2 - 1).leading_zeros(); 222 223 let thread_id = cpu_id % (t.0 as u32); 224 let core_id = cpu_id / (t.0 as u32) % (t.1 as u32); 225 let die_id = cpu_id / ((t.0 * t.1) as u32) % (t.2 as u32); 226 let socket_id = cpu_id / ((t.0 * t.1 * t.2) as u32); 227 228 return thread_id 229 | (core_id << thread_mask_width) 230 | (die_id << (thread_mask_width + core_mask_width)) 231 | (socket_id << (thread_mask_width + core_mask_width + die_mask_width)); 232 } 233 234 cpu_id 235 } 236 237 #[derive(Copy, Clone, Debug)] 238 pub enum CpuidReg { 239 EAX, 240 EBX, 241 ECX, 242 EDX, 243 } 244 245 pub struct CpuidPatch { 246 pub function: u32, 247 pub index: u32, 248 pub flags_bit: Option<u8>, 249 pub eax_bit: Option<u8>, 250 pub ebx_bit: Option<u8>, 251 pub ecx_bit: Option<u8>, 252 pub edx_bit: Option<u8>, 253 } 254 255 impl CpuidPatch { 256 pub fn get_cpuid_reg( 257 cpuid: &[CpuIdEntry], 258 function: u32, 259 index: Option<u32>, 260 reg: CpuidReg, 261 ) -> Option<u32> { 262 for entry in cpuid.iter() { 263 if entry.function == function && (index.is_none() || index.unwrap() == entry.index) { 264 return match reg { 265 CpuidReg::EAX => Some(entry.eax), 266 CpuidReg::EBX => Some(entry.ebx), 267 CpuidReg::ECX => Some(entry.ecx), 268 CpuidReg::EDX => Some(entry.edx), 269 }; 270 } 271 } 272 273 None 274 } 275 276 pub fn set_cpuid_reg( 277 cpuid: &mut Vec<CpuIdEntry>, 278 function: u32, 279 index: Option<u32>, 280 reg: CpuidReg, 281 value: u32, 282 ) { 283 let mut entry_found = false; 284 for entry in cpuid.iter_mut() { 285 if entry.function == function && (index.is_none() || index.unwrap() == entry.index) { 286 entry_found = true; 287 match reg { 288 CpuidReg::EAX => { 289 entry.eax = value; 290 } 291 CpuidReg::EBX => { 292 entry.ebx = value; 293 } 294 CpuidReg::ECX => { 295 entry.ecx = value; 296 } 297 CpuidReg::EDX => { 298 entry.edx = value; 299 } 300 } 301 } 302 } 303 304 if entry_found { 305 return; 306 } 307 308 // Entry not found, so let's add it. 309 if let Some(index) = index { 310 let mut entry = CpuIdEntry { 311 function, 312 index, 313 flags: CPUID_FLAG_VALID_INDEX, 314 ..Default::default() 315 }; 316 match reg { 317 CpuidReg::EAX => { 318 entry.eax = value; 319 } 320 CpuidReg::EBX => { 321 entry.ebx = value; 322 } 323 CpuidReg::ECX => { 324 entry.ecx = value; 325 } 326 CpuidReg::EDX => { 327 entry.edx = value; 328 } 329 } 330 331 cpuid.push(entry); 332 } 333 } 334 335 pub fn patch_cpuid(cpuid: &mut [CpuIdEntry], patches: Vec<CpuidPatch>) { 336 for entry in cpuid { 337 for patch in patches.iter() { 338 if entry.function == patch.function && entry.index == patch.index { 339 if let Some(flags_bit) = patch.flags_bit { 340 entry.flags |= 1 << flags_bit; 341 } 342 if let Some(eax_bit) = patch.eax_bit { 343 entry.eax |= 1 << eax_bit; 344 } 345 if let Some(ebx_bit) = patch.ebx_bit { 346 entry.ebx |= 1 << ebx_bit; 347 } 348 if let Some(ecx_bit) = patch.ecx_bit { 349 entry.ecx |= 1 << ecx_bit; 350 } 351 if let Some(edx_bit) = patch.edx_bit { 352 entry.edx |= 1 << edx_bit; 353 } 354 } 355 } 356 } 357 } 358 359 pub fn is_feature_enabled( 360 cpuid: &[CpuIdEntry], 361 function: u32, 362 index: u32, 363 reg: CpuidReg, 364 feature_bit: usize, 365 ) -> bool { 366 let mask = 1 << feature_bit; 367 368 for entry in cpuid { 369 if entry.function == function && entry.index == index { 370 let reg_val = match reg { 371 CpuidReg::EAX => entry.eax, 372 CpuidReg::EBX => entry.ebx, 373 CpuidReg::ECX => entry.ecx, 374 CpuidReg::EDX => entry.edx, 375 }; 376 377 return (reg_val & mask) == mask; 378 } 379 } 380 381 false 382 } 383 } 384 385 #[derive(Debug)] 386 enum CpuidCompatibleCheck { 387 BitwiseSubset, // bitwise subset 388 Equal, // equal in value 389 NumNotGreater, // smaller or equal as a number 390 } 391 392 pub struct CpuidFeatureEntry { 393 function: u32, 394 index: u32, 395 feature_reg: CpuidReg, 396 compatible_check: CpuidCompatibleCheck, 397 } 398 399 impl CpuidFeatureEntry { 400 fn checked_feature_entry_list() -> Vec<CpuidFeatureEntry> { 401 vec![ 402 // The following list includes all hardware features bits from 403 // the CPUID Wiki Page: https://en.wikipedia.org/wiki/CPUID 404 // Leaf 0x1, ECX/EDX, feature bits 405 CpuidFeatureEntry { 406 function: 1, 407 index: 0, 408 feature_reg: CpuidReg::ECX, 409 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 410 }, 411 CpuidFeatureEntry { 412 function: 1, 413 index: 0, 414 feature_reg: CpuidReg::EDX, 415 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 416 }, 417 // Leaf 0x7, EAX/EBX/ECX/EDX, extended features 418 CpuidFeatureEntry { 419 function: 7, 420 index: 0, 421 feature_reg: CpuidReg::EAX, 422 compatible_check: CpuidCompatibleCheck::NumNotGreater, 423 }, 424 CpuidFeatureEntry { 425 function: 7, 426 index: 0, 427 feature_reg: CpuidReg::EBX, 428 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 429 }, 430 CpuidFeatureEntry { 431 function: 7, 432 index: 0, 433 feature_reg: CpuidReg::ECX, 434 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 435 }, 436 CpuidFeatureEntry { 437 function: 7, 438 index: 0, 439 feature_reg: CpuidReg::EDX, 440 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 441 }, 442 // Leaf 0x7 subleaf 0x1, EAX, extended features 443 CpuidFeatureEntry { 444 function: 7, 445 index: 1, 446 feature_reg: CpuidReg::EAX, 447 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 448 }, 449 // Leaf 0x8000_0001, ECX/EDX, CPUID features bits 450 CpuidFeatureEntry { 451 function: 0x8000_0001, 452 index: 0, 453 feature_reg: CpuidReg::ECX, 454 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 455 }, 456 CpuidFeatureEntry { 457 function: 0x8000_0001, 458 index: 0, 459 feature_reg: CpuidReg::EDX, 460 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 461 }, 462 // KVM CPUID bits: https://www.kernel.org/doc/html/latest/virt/kvm/cpuid.html 463 // Leaf 0x4000_0000, EAX/EBX/ECX/EDX, KVM CPUID SIGNATURE 464 CpuidFeatureEntry { 465 function: 0x4000_0000, 466 index: 0, 467 feature_reg: CpuidReg::EAX, 468 compatible_check: CpuidCompatibleCheck::NumNotGreater, 469 }, 470 CpuidFeatureEntry { 471 function: 0x4000_0000, 472 index: 0, 473 feature_reg: CpuidReg::EBX, 474 compatible_check: CpuidCompatibleCheck::Equal, 475 }, 476 CpuidFeatureEntry { 477 function: 0x4000_0000, 478 index: 0, 479 feature_reg: CpuidReg::ECX, 480 compatible_check: CpuidCompatibleCheck::Equal, 481 }, 482 CpuidFeatureEntry { 483 function: 0x4000_0000, 484 index: 0, 485 feature_reg: CpuidReg::EDX, 486 compatible_check: CpuidCompatibleCheck::Equal, 487 }, 488 // Leaf 0x4000_0001, EAX/EBX/ECX/EDX, KVM CPUID features 489 CpuidFeatureEntry { 490 function: 0x4000_0001, 491 index: 0, 492 feature_reg: CpuidReg::EAX, 493 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 494 }, 495 CpuidFeatureEntry { 496 function: 0x4000_0001, 497 index: 0, 498 feature_reg: CpuidReg::EBX, 499 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 500 }, 501 CpuidFeatureEntry { 502 function: 0x4000_0001, 503 index: 0, 504 feature_reg: CpuidReg::ECX, 505 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 506 }, 507 CpuidFeatureEntry { 508 function: 0x4000_0001, 509 index: 0, 510 feature_reg: CpuidReg::EDX, 511 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 512 }, 513 ] 514 } 515 516 fn get_features_from_cpuid( 517 cpuid: &[CpuIdEntry], 518 feature_entry_list: &[CpuidFeatureEntry], 519 ) -> Vec<u32> { 520 let mut features = vec![0; feature_entry_list.len()]; 521 for (i, feature_entry) in feature_entry_list.iter().enumerate() { 522 for cpuid_entry in cpuid { 523 if cpuid_entry.function == feature_entry.function 524 && cpuid_entry.index == feature_entry.index 525 { 526 match feature_entry.feature_reg { 527 CpuidReg::EAX => { 528 features[i] = cpuid_entry.eax; 529 } 530 CpuidReg::EBX => { 531 features[i] = cpuid_entry.ebx; 532 } 533 CpuidReg::ECX => { 534 features[i] = cpuid_entry.ecx; 535 } 536 CpuidReg::EDX => { 537 features[i] = cpuid_entry.edx; 538 } 539 } 540 541 break; 542 } 543 } 544 } 545 546 features 547 } 548 549 // The function returns `Error` (a.k.a. "incompatible"), when the CPUID features from `src_vm_cpuid` 550 // is not a subset of those of the `dest_vm_cpuid`. 551 pub fn check_cpuid_compatibility( 552 src_vm_cpuid: &[CpuIdEntry], 553 dest_vm_cpuid: &[CpuIdEntry], 554 ) -> Result<(), Error> { 555 let feature_entry_list = &Self::checked_feature_entry_list(); 556 let src_vm_features = Self::get_features_from_cpuid(src_vm_cpuid, feature_entry_list); 557 let dest_vm_features = Self::get_features_from_cpuid(dest_vm_cpuid, feature_entry_list); 558 559 // Loop on feature bit and check if the 'source vm' feature is a subset 560 // of those of the 'destination vm' feature 561 let mut compatible = true; 562 for (i, (src_vm_feature, dest_vm_feature)) in src_vm_features 563 .iter() 564 .zip(dest_vm_features.iter()) 565 .enumerate() 566 { 567 let entry = &feature_entry_list[i]; 568 let entry_compatible = match entry.compatible_check { 569 CpuidCompatibleCheck::BitwiseSubset => { 570 let different_feature_bits = src_vm_feature ^ dest_vm_feature; 571 let src_vm_feature_bits_only = different_feature_bits & src_vm_feature; 572 src_vm_feature_bits_only == 0 573 } 574 CpuidCompatibleCheck::Equal => src_vm_feature == dest_vm_feature, 575 CpuidCompatibleCheck::NumNotGreater => src_vm_feature <= dest_vm_feature, 576 }; 577 if !entry_compatible { 578 error!( 579 "Detected incompatible CPUID entry: leaf={:#02x} (subleaf={:#02x}), register='{:?}', \ 580 compatible_check='{:?}', source VM feature='{:#04x}', destination VM feature'{:#04x}'.", 581 entry.function, entry.index, entry.feature_reg, 582 entry.compatible_check, src_vm_feature, dest_vm_feature 583 ); 584 585 compatible = false; 586 } 587 } 588 589 if compatible { 590 info!("No CPU incompatibility detected."); 591 Ok(()) 592 } else { 593 Err(Error::CpuidCheckCompatibility) 594 } 595 } 596 } 597 598 pub fn generate_common_cpuid( 599 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 600 config: &CpuidConfig, 601 ) -> super::Result<Vec<CpuIdEntry>> { 602 // SAFETY: cpuid called with valid leaves 603 if unsafe { x86_64::__cpuid(1) }.ecx & 1 << HYPERVISOR_ECX_BIT == 1 << HYPERVISOR_ECX_BIT { 604 // SAFETY: cpuid called with valid leaves 605 let hypervisor_cpuid = unsafe { x86_64::__cpuid(0x4000_0000) }; 606 607 let mut identifier: [u8; 12] = [0; 12]; 608 identifier[0..4].copy_from_slice(&hypervisor_cpuid.ebx.to_le_bytes()[..]); 609 identifier[4..8].copy_from_slice(&hypervisor_cpuid.ecx.to_le_bytes()[..]); 610 identifier[8..12].copy_from_slice(&hypervisor_cpuid.edx.to_le_bytes()[..]); 611 612 info!( 613 "Running under nested virtualisation. Hypervisor string: {}", 614 String::from_utf8_lossy(&identifier) 615 ); 616 } 617 618 info!( 619 "Generating guest CPUID for with physical address size: {}", 620 config.phys_bits 621 ); 622 let cpuid_patches = vec![ 623 // Patch tsc deadline timer bit 624 CpuidPatch { 625 function: 1, 626 index: 0, 627 flags_bit: None, 628 eax_bit: None, 629 ebx_bit: None, 630 ecx_bit: Some(TSC_DEADLINE_TIMER_ECX_BIT), 631 edx_bit: None, 632 }, 633 // Patch hypervisor bit 634 CpuidPatch { 635 function: 1, 636 index: 0, 637 flags_bit: None, 638 eax_bit: None, 639 ebx_bit: None, 640 ecx_bit: Some(HYPERVISOR_ECX_BIT), 641 edx_bit: None, 642 }, 643 // Enable MTRR feature 644 CpuidPatch { 645 function: 1, 646 index: 0, 647 flags_bit: None, 648 eax_bit: None, 649 ebx_bit: None, 650 ecx_bit: None, 651 edx_bit: Some(MTRR_EDX_BIT), 652 }, 653 ]; 654 655 // Supported CPUID 656 let mut cpuid = hypervisor 657 .get_supported_cpuid() 658 .map_err(Error::CpuidGetSupported)?; 659 660 CpuidPatch::patch_cpuid(&mut cpuid, cpuid_patches); 661 662 if let Some(sgx_epc_sections) = &config.sgx_epc_sections { 663 update_cpuid_sgx(&mut cpuid, sgx_epc_sections)?; 664 } 665 666 #[cfg(feature = "tdx")] 667 let tdx_capabilities = if config.tdx { 668 let caps = hypervisor 669 .tdx_capabilities() 670 .map_err(Error::TdxCapabilities)?; 671 info!("TDX capabilities {:#?}", caps); 672 Some(caps) 673 } else { 674 None 675 }; 676 677 // Update some existing CPUID 678 for entry in cpuid.as_mut_slice().iter_mut() { 679 match entry.function { 680 // Clear AMX related bits if the AMX feature is not enabled 681 0x7 => { 682 if !config.amx && entry.index == 0 { 683 entry.edx &= !(1 << AMX_BF16 | 1 << AMX_TILE | 1 << AMX_INT8) 684 } 685 } 686 0xd => 687 { 688 #[cfg(feature = "tdx")] 689 if let Some(caps) = &tdx_capabilities { 690 let xcr0_mask: u64 = 0x82ff; 691 let xss_mask: u64 = !xcr0_mask; 692 if entry.index == 0 { 693 entry.eax &= (caps.xfam_fixed0 as u32) & (xcr0_mask as u32); 694 entry.eax |= (caps.xfam_fixed1 as u32) & (xcr0_mask as u32); 695 entry.edx &= ((caps.xfam_fixed0 & xcr0_mask) >> 32) as u32; 696 entry.edx |= ((caps.xfam_fixed1 & xcr0_mask) >> 32) as u32; 697 } else if entry.index == 1 { 698 entry.ecx &= (caps.xfam_fixed0 as u32) & (xss_mask as u32); 699 entry.ecx |= (caps.xfam_fixed1 as u32) & (xss_mask as u32); 700 entry.edx &= ((caps.xfam_fixed0 & xss_mask) >> 32) as u32; 701 entry.edx |= ((caps.xfam_fixed1 & xss_mask) >> 32) as u32; 702 } 703 } 704 } 705 // Copy host L1 cache details if not populated by KVM 706 0x8000_0005 => { 707 if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 { 708 // SAFETY: cpuid called with valid leaves 709 if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0005 { 710 // SAFETY: cpuid called with valid leaves 711 let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0005) }; 712 entry.eax = leaf.eax; 713 entry.ebx = leaf.ebx; 714 entry.ecx = leaf.ecx; 715 entry.edx = leaf.edx; 716 } 717 } 718 } 719 // Copy host L2 cache details if not populated by KVM 720 0x8000_0006 => { 721 if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 { 722 // SAFETY: cpuid called with valid leaves 723 if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 { 724 // SAFETY: cpuid called with valid leaves 725 let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) }; 726 entry.eax = leaf.eax; 727 entry.ebx = leaf.ebx; 728 entry.ecx = leaf.ecx; 729 entry.edx = leaf.edx; 730 } 731 } 732 } 733 // Set CPU physical bits 734 0x8000_0008 => { 735 entry.eax = (entry.eax & 0xffff_ff00) | (config.phys_bits as u32 & 0xff); 736 } 737 0x4000_0001 => { 738 // These features are not supported by TDX 739 #[cfg(feature = "tdx")] 740 if config.tdx { 741 entry.eax &= !(1 << KVM_FEATURE_CLOCKSOURCE_BIT 742 | 1 << KVM_FEATURE_CLOCKSOURCE2_BIT 743 | 1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 744 | 1 << KVM_FEATURE_ASYNC_PF_BIT 745 | 1 << KVM_FEATURE_ASYNC_PF_VMEXIT_BIT 746 | 1 << KVM_FEATURE_STEAL_TIME_BIT) 747 } 748 } 749 _ => {} 750 } 751 } 752 753 // Copy CPU identification string 754 for i in 0x8000_0002..=0x8000_0004 { 755 cpuid.retain(|c| c.function != i); 756 // SAFETY: call cpuid with valid leaves 757 let leaf = unsafe { std::arch::x86_64::__cpuid(i) }; 758 cpuid.push(CpuIdEntry { 759 function: i, 760 eax: leaf.eax, 761 ebx: leaf.ebx, 762 ecx: leaf.ecx, 763 edx: leaf.edx, 764 ..Default::default() 765 }); 766 } 767 768 if config.kvm_hyperv { 769 // Remove conflicting entries 770 cpuid.retain(|c| c.function != 0x4000_0000); 771 cpuid.retain(|c| c.function != 0x4000_0001); 772 // See "Hypervisor Top Level Functional Specification" for details 773 // Compliance with "Hv#1" requires leaves up to 0x4000_000a 774 cpuid.push(CpuIdEntry { 775 function: 0x40000000, 776 eax: 0x4000000a, // Maximum cpuid leaf 777 ebx: 0x756e694c, // "Linu" 778 ecx: 0x564b2078, // "x KV" 779 edx: 0x7648204d, // "M Hv" 780 ..Default::default() 781 }); 782 cpuid.push(CpuIdEntry { 783 function: 0x40000001, 784 eax: 0x31237648, // "Hv#1" 785 ..Default::default() 786 }); 787 cpuid.push(CpuIdEntry { 788 function: 0x40000002, 789 eax: 0x3839, // "Build number" 790 ebx: 0xa0000, // "Version" 791 ..Default::default() 792 }); 793 cpuid.push(CpuIdEntry { 794 function: 0x4000_0003, 795 eax: 1 << 1 // AccessPartitionReferenceCounter 796 | 1 << 2 // AccessSynicRegs 797 | 1 << 3 // AccessSyntheticTimerRegs 798 | 1 << 9, // AccessPartitionReferenceTsc 799 edx: 1 << 3, // CPU dynamic partitioning 800 ..Default::default() 801 }); 802 cpuid.push(CpuIdEntry { 803 function: 0x4000_0004, 804 eax: 1 << 5, // Recommend relaxed timing 805 ..Default::default() 806 }); 807 for i in 0x4000_0005..=0x4000_000a { 808 cpuid.push(CpuIdEntry { 809 function: i, 810 ..Default::default() 811 }); 812 } 813 } 814 815 Ok(cpuid) 816 } 817 818 pub fn configure_vcpu( 819 vcpu: &Arc<dyn hypervisor::Vcpu>, 820 id: u8, 821 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 822 cpuid: Vec<CpuIdEntry>, 823 kvm_hyperv: bool, 824 cpu_vendor: CpuVendor, 825 topology: Option<(u8, u8, u8)>, 826 ) -> super::Result<()> { 827 let x2apic_id = get_x2apic_id(id as u32, topology); 828 829 // Per vCPU CPUID changes; common are handled via generate_common_cpuid() 830 let mut cpuid = cpuid; 831 CpuidPatch::set_cpuid_reg(&mut cpuid, 0xb, None, CpuidReg::EDX, x2apic_id); 832 CpuidPatch::set_cpuid_reg(&mut cpuid, 0x1f, None, CpuidReg::EDX, x2apic_id); 833 if matches!(cpu_vendor, CpuVendor::AMD) { 834 CpuidPatch::set_cpuid_reg(&mut cpuid, 0x8000_001e, Some(0), CpuidReg::EAX, x2apic_id); 835 } 836 837 // Set ApicId in cpuid for each vcpu - found in cpuid ebx when eax = 1 838 let mut apic_id_patched = false; 839 for entry in &mut cpuid { 840 if entry.function == 1 { 841 entry.ebx &= 0xffffff; 842 entry.ebx |= x2apic_id << 24; 843 apic_id_patched = true; 844 break; 845 } 846 } 847 assert!(apic_id_patched); 848 849 if let Some(t) = topology { 850 update_cpuid_topology(&mut cpuid, t.0, t.1, t.2, cpu_vendor, id); 851 } 852 853 // The TSC frequency CPUID leaf should not be included when running with HyperV emulation 854 if !kvm_hyperv { 855 if let Some(tsc_khz) = vcpu.tsc_khz().map_err(Error::GetTscFrequency)? { 856 // Need to check that the TSC doesn't vary with dynamic frequency 857 // SAFETY: cpuid called with valid leaves 858 if unsafe { std::arch::x86_64::__cpuid(0x8000_0007) }.edx 859 & (1u32 << INVARIANT_TSC_EDX_BIT) 860 > 0 861 { 862 CpuidPatch::set_cpuid_reg( 863 &mut cpuid, 864 0x4000_0000, 865 None, 866 CpuidReg::EAX, 867 0x4000_0010, 868 ); 869 cpuid.retain(|c| c.function != 0x4000_0010); 870 cpuid.push(CpuIdEntry { 871 function: 0x4000_0010, 872 eax: tsc_khz, 873 ebx: 1000000, /* LAPIC resolution of 1ns (freq: 1GHz) is hardcoded in KVM's 874 * APIC_BUS_CYCLE_NS */ 875 ..Default::default() 876 }); 877 }; 878 } 879 } 880 881 vcpu.set_cpuid2(&cpuid) 882 .map_err(|e| Error::SetSupportedCpusFailed(e.into()))?; 883 884 if kvm_hyperv { 885 vcpu.enable_hyperv_synic().unwrap(); 886 } 887 888 regs::setup_msrs(vcpu).map_err(Error::MsrsConfiguration)?; 889 if let Some((kernel_entry_point, guest_memory)) = boot_setup { 890 regs::setup_regs(vcpu, kernel_entry_point).map_err(Error::RegsConfiguration)?; 891 regs::setup_fpu(vcpu).map_err(Error::FpuConfiguration)?; 892 regs::setup_sregs(&guest_memory.memory(), vcpu).map_err(Error::SregsConfiguration)?; 893 } 894 interrupts::set_lint(vcpu).map_err(|e| Error::LocalIntConfiguration(e.into()))?; 895 Ok(()) 896 } 897 898 /// Returns a Vec of the valid memory addresses. 899 /// These should be used to configure the GuestMemory structure for the platform. 900 /// For x86_64 all addresses are valid from the start of the kernel except a 901 /// carve out at the end of 32bit address space. 902 pub fn arch_memory_regions() -> Vec<(GuestAddress, usize, RegionType)> { 903 vec![ 904 // 0 GiB ~ 3GiB: memory before the gap 905 ( 906 GuestAddress(0), 907 layout::MEM_32BIT_RESERVED_START.raw_value() as usize, 908 RegionType::Ram, 909 ), 910 // 4 GiB ~ inf: memory after the gap 911 (layout::RAM_64BIT_START, usize::MAX, RegionType::Ram), 912 // 3 GiB ~ 3712 MiB: 32-bit device memory hole 913 ( 914 layout::MEM_32BIT_RESERVED_START, 915 layout::MEM_32BIT_DEVICES_SIZE as usize, 916 RegionType::SubRegion, 917 ), 918 // 3712 MiB ~ 3968 MiB: 32-bit reserved memory hole 919 ( 920 layout::MEM_32BIT_RESERVED_START.unchecked_add(layout::MEM_32BIT_DEVICES_SIZE), 921 (layout::MEM_32BIT_RESERVED_SIZE - layout::MEM_32BIT_DEVICES_SIZE) as usize, 922 RegionType::Reserved, 923 ), 924 ] 925 } 926 927 /// Configures the system and should be called once per vm before starting vcpu threads. 928 /// 929 /// # Arguments 930 /// 931 /// * `guest_mem` - The memory to be used by the guest. 932 /// * `cmdline_addr` - Address in `guest_mem` where the kernel command line was loaded. 933 /// * `cmdline_size` - Size of the kernel command line in bytes including the null terminator. 934 /// * `num_cpus` - Number of virtual CPUs the guest will have. 935 #[allow(clippy::too_many_arguments)] 936 pub fn configure_system( 937 guest_mem: &GuestMemoryMmap, 938 cmdline_addr: GuestAddress, 939 cmdline_size: usize, 940 initramfs: &Option<InitramfsConfig>, 941 _num_cpus: u8, 942 setup_header: Option<setup_header>, 943 rsdp_addr: Option<GuestAddress>, 944 sgx_epc_region: Option<SgxEpcRegion>, 945 serial_number: Option<&str>, 946 uuid: Option<&str>, 947 oem_strings: Option<&[&str]>, 948 topology: Option<(u8, u8, u8)>, 949 ) -> super::Result<()> { 950 // Write EBDA address to location where ACPICA expects to find it 951 guest_mem 952 .write_obj((layout::EBDA_START.0 >> 4) as u16, layout::EBDA_POINTER) 953 .map_err(Error::EbdaSetup)?; 954 955 let size = smbios::setup_smbios(guest_mem, serial_number, uuid, oem_strings) 956 .map_err(Error::SmbiosSetup)?; 957 958 // Place the MP table after the SMIOS table aligned to 16 bytes 959 let offset = GuestAddress(layout::SMBIOS_START).unchecked_add(size); 960 let offset = GuestAddress((offset.0 + 16) & !0xf); 961 mptable::setup_mptable(offset, guest_mem, _num_cpus, topology).map_err(Error::MpTableSetup)?; 962 963 // Check that the RAM is not smaller than the RSDP start address 964 if let Some(rsdp_addr) = rsdp_addr { 965 if rsdp_addr.0 > guest_mem.last_addr().0 { 966 return Err(super::Error::RsdpPastRamEnd); 967 } 968 } 969 970 match setup_header { 971 Some(hdr) => configure_32bit_entry( 972 guest_mem, 973 cmdline_addr, 974 cmdline_size, 975 initramfs, 976 hdr, 977 rsdp_addr, 978 sgx_epc_region, 979 ), 980 None => configure_pvh( 981 guest_mem, 982 cmdline_addr, 983 initramfs, 984 rsdp_addr, 985 sgx_epc_region, 986 ), 987 } 988 } 989 990 type RamRange = (u64, u64); 991 992 /// Returns usable physical memory ranges for the guest 993 /// These should be used to create e820_RAM memory maps 994 pub fn generate_ram_ranges(guest_mem: &GuestMemoryMmap) -> super::Result<Vec<RamRange>> { 995 // Merge continuous memory regions into one region. 996 // Note: memory regions from "GuestMemory" are sorted and non-zero sized. 997 let ram_regions = { 998 let mut ram_regions = Vec::new(); 999 let mut current_start = guest_mem 1000 .iter() 1001 .next() 1002 .map(GuestMemoryRegion::start_addr) 1003 .expect("GuestMemory must have one memory region at least") 1004 .raw_value(); 1005 let mut current_end = current_start; 1006 1007 for (start, size) in guest_mem 1008 .iter() 1009 .map(|m| (m.start_addr().raw_value(), m.len())) 1010 { 1011 if current_end == start { 1012 // This zone is continuous with the previous one. 1013 current_end += size; 1014 } else { 1015 ram_regions.push((current_start, current_end)); 1016 1017 current_start = start; 1018 current_end = start + size; 1019 } 1020 } 1021 1022 ram_regions.push((current_start, current_end)); 1023 1024 ram_regions 1025 }; 1026 1027 // Create the memory map entry for memory region before the gap 1028 let mut ram_ranges = vec![]; 1029 1030 // Generate the first usable physical memory range before the gap. The e820 map 1031 // should only report memory above 1MiB. 1032 let first_ram_range = { 1033 let (first_region_start, first_region_end) = 1034 ram_regions.first().ok_or(super::Error::MemmapTableSetup)?; 1035 let high_ram_start = layout::HIGH_RAM_START.raw_value(); 1036 let mem_32bit_reserved_start = layout::MEM_32BIT_RESERVED_START.raw_value(); 1037 1038 if !((first_region_start <= &high_ram_start) 1039 && (first_region_end > &high_ram_start) 1040 && (first_region_end <= &mem_32bit_reserved_start)) 1041 { 1042 error!( 1043 "Unexpected first memory region layout: (start: 0x{:08x}, end: 0x{:08x}). 1044 high_ram_start: 0x{:08x}, mem_32bit_reserved_start: 0x{:08x}", 1045 first_region_start, first_region_end, high_ram_start, mem_32bit_reserved_start 1046 ); 1047 1048 return Err(super::Error::MemmapTableSetup); 1049 } 1050 1051 info!( 1052 "first usable physical memory range, start: 0x{:08x}, end: 0x{:08x}", 1053 high_ram_start, first_region_end 1054 ); 1055 1056 (high_ram_start, *first_region_end) 1057 }; 1058 ram_ranges.push(first_ram_range); 1059 1060 // Generate additional usable physical memory range after the gap if any. 1061 for ram_region in ram_regions.iter().skip(1) { 1062 info!( 1063 "found usable physical memory range, start: 0x{:08x}, end: 0x{:08x}", 1064 ram_region.0, ram_region.1 1065 ); 1066 1067 ram_ranges.push(*ram_region); 1068 } 1069 1070 Ok(ram_ranges) 1071 } 1072 1073 fn configure_pvh( 1074 guest_mem: &GuestMemoryMmap, 1075 cmdline_addr: GuestAddress, 1076 initramfs: &Option<InitramfsConfig>, 1077 rsdp_addr: Option<GuestAddress>, 1078 sgx_epc_region: Option<SgxEpcRegion>, 1079 ) -> super::Result<()> { 1080 const XEN_HVM_START_MAGIC_VALUE: u32 = 0x336ec578; 1081 1082 let mut start_info = hvm_start_info { 1083 magic: XEN_HVM_START_MAGIC_VALUE, 1084 version: 1, // pvh has version 1 1085 nr_modules: 0, 1086 cmdline_paddr: cmdline_addr.raw_value(), 1087 memmap_paddr: layout::MEMMAP_START.raw_value(), 1088 ..Default::default() 1089 }; 1090 1091 if let Some(rsdp_addr) = rsdp_addr { 1092 start_info.rsdp_paddr = rsdp_addr.0; 1093 } 1094 1095 if let Some(initramfs_config) = initramfs { 1096 // The initramfs has been written to guest memory already, here we just need to 1097 // create the module structure that describes it. 1098 let ramdisk_mod = hvm_modlist_entry { 1099 paddr: initramfs_config.address.raw_value(), 1100 size: initramfs_config.size as u64, 1101 ..Default::default() 1102 }; 1103 1104 start_info.nr_modules += 1; 1105 start_info.modlist_paddr = layout::MODLIST_START.raw_value(); 1106 1107 // Write the modlist struct to guest memory. 1108 guest_mem 1109 .write_obj(ramdisk_mod, layout::MODLIST_START) 1110 .map_err(super::Error::ModlistSetup)?; 1111 } 1112 1113 // Vector to hold the memory maps which needs to be written to guest memory 1114 // at MEMMAP_START after all of the mappings are recorded. 1115 let mut memmap: Vec<hvm_memmap_table_entry> = Vec::new(); 1116 1117 // Create the memory map entries. 1118 add_memmap_entry(&mut memmap, 0, layout::EBDA_START.raw_value(), E820_RAM); 1119 1120 // Get usable physical memory ranges 1121 let ram_ranges = generate_ram_ranges(guest_mem)?; 1122 1123 // Create e820 memory map entries 1124 for ram_range in ram_ranges { 1125 info!( 1126 "create_memmap_entry, start: 0x{:08x}, end: 0x{:08x}", 1127 ram_range.0, ram_range.1 1128 ); 1129 add_memmap_entry( 1130 &mut memmap, 1131 ram_range.0, 1132 ram_range.1 - ram_range.0, 1133 E820_RAM, 1134 ); 1135 } 1136 1137 add_memmap_entry( 1138 &mut memmap, 1139 layout::PCI_MMCONFIG_START.0, 1140 layout::PCI_MMCONFIG_SIZE, 1141 E820_RESERVED, 1142 ); 1143 1144 if let Some(sgx_epc_region) = sgx_epc_region { 1145 add_memmap_entry( 1146 &mut memmap, 1147 sgx_epc_region.start().raw_value(), 1148 sgx_epc_region.size(), 1149 E820_RESERVED, 1150 ); 1151 } 1152 1153 start_info.memmap_entries = memmap.len() as u32; 1154 1155 // Copy the vector with the memmap table to the MEMMAP_START address 1156 // which is already saved in the memmap_paddr field of hvm_start_info struct. 1157 let mut memmap_start_addr = layout::MEMMAP_START; 1158 1159 guest_mem 1160 .checked_offset( 1161 memmap_start_addr, 1162 mem::size_of::<hvm_memmap_table_entry>() * start_info.memmap_entries as usize, 1163 ) 1164 .ok_or(super::Error::MemmapTablePastRamEnd)?; 1165 1166 // For every entry in the memmap vector, write it to guest memory. 1167 for memmap_entry in memmap { 1168 guest_mem 1169 .write_obj(memmap_entry, memmap_start_addr) 1170 .map_err(|_| super::Error::MemmapTableSetup)?; 1171 memmap_start_addr = 1172 memmap_start_addr.unchecked_add(mem::size_of::<hvm_memmap_table_entry>() as u64); 1173 } 1174 1175 // The hvm_start_info struct itself must be stored at PVH_START_INFO 1176 // address, and %rbx will be initialized to contain PVH_INFO_START prior to 1177 // starting the guest, as required by the PVH ABI. 1178 let start_info_addr = layout::PVH_INFO_START; 1179 1180 guest_mem 1181 .checked_offset(start_info_addr, mem::size_of::<hvm_start_info>()) 1182 .ok_or(super::Error::StartInfoPastRamEnd)?; 1183 1184 // Write the start_info struct to guest memory. 1185 guest_mem 1186 .write_obj(start_info, start_info_addr) 1187 .map_err(|_| super::Error::StartInfoSetup)?; 1188 1189 Ok(()) 1190 } 1191 1192 fn configure_32bit_entry( 1193 guest_mem: &GuestMemoryMmap, 1194 cmdline_addr: GuestAddress, 1195 cmdline_size: usize, 1196 initramfs: &Option<InitramfsConfig>, 1197 setup_hdr: setup_header, 1198 rsdp_addr: Option<GuestAddress>, 1199 sgx_epc_region: Option<SgxEpcRegion>, 1200 ) -> super::Result<()> { 1201 const KERNEL_LOADER_OTHER: u8 = 0xff; 1202 1203 // Use the provided setup header 1204 let mut params = boot_params { 1205 hdr: setup_hdr, 1206 ..Default::default() 1207 }; 1208 1209 // Common bootparams settings 1210 if params.hdr.type_of_loader == 0 { 1211 params.hdr.type_of_loader = KERNEL_LOADER_OTHER; 1212 } 1213 params.hdr.cmd_line_ptr = cmdline_addr.raw_value() as u32; 1214 params.hdr.cmdline_size = cmdline_size as u32; 1215 1216 if let Some(initramfs_config) = initramfs { 1217 params.hdr.ramdisk_image = initramfs_config.address.raw_value() as u32; 1218 params.hdr.ramdisk_size = initramfs_config.size as u32; 1219 } 1220 1221 add_e820_entry(&mut params, 0, layout::EBDA_START.raw_value(), E820_RAM)?; 1222 1223 let mem_end = guest_mem.last_addr(); 1224 if mem_end < layout::MEM_32BIT_RESERVED_START { 1225 add_e820_entry( 1226 &mut params, 1227 layout::HIGH_RAM_START.raw_value(), 1228 mem_end.unchecked_offset_from(layout::HIGH_RAM_START) + 1, 1229 E820_RAM, 1230 )?; 1231 } else { 1232 add_e820_entry( 1233 &mut params, 1234 layout::HIGH_RAM_START.raw_value(), 1235 layout::MEM_32BIT_RESERVED_START.unchecked_offset_from(layout::HIGH_RAM_START), 1236 E820_RAM, 1237 )?; 1238 if mem_end > layout::RAM_64BIT_START { 1239 add_e820_entry( 1240 &mut params, 1241 layout::RAM_64BIT_START.raw_value(), 1242 mem_end.unchecked_offset_from(layout::RAM_64BIT_START) + 1, 1243 E820_RAM, 1244 )?; 1245 } 1246 } 1247 1248 add_e820_entry( 1249 &mut params, 1250 layout::PCI_MMCONFIG_START.0, 1251 layout::PCI_MMCONFIG_SIZE, 1252 E820_RESERVED, 1253 )?; 1254 1255 if let Some(sgx_epc_region) = sgx_epc_region { 1256 add_e820_entry( 1257 &mut params, 1258 sgx_epc_region.start().raw_value(), 1259 sgx_epc_region.size(), 1260 E820_RESERVED, 1261 )?; 1262 } 1263 1264 if let Some(rsdp_addr) = rsdp_addr { 1265 params.acpi_rsdp_addr = rsdp_addr.0; 1266 } 1267 1268 let zero_page_addr = layout::ZERO_PAGE_START; 1269 guest_mem 1270 .checked_offset(zero_page_addr, mem::size_of::<boot_params>()) 1271 .ok_or(super::Error::ZeroPagePastRamEnd)?; 1272 guest_mem 1273 .write_obj(params, zero_page_addr) 1274 .map_err(super::Error::ZeroPageSetup)?; 1275 1276 Ok(()) 1277 } 1278 1279 /// Add an e820 region to the e820 map. 1280 /// Returns Ok(()) if successful, or an error if there is no space left in the map. 1281 fn add_e820_entry( 1282 params: &mut boot_params, 1283 addr: u64, 1284 size: u64, 1285 mem_type: u32, 1286 ) -> Result<(), Error> { 1287 if params.e820_entries >= params.e820_table.len() as u8 { 1288 return Err(Error::E820Configuration); 1289 } 1290 1291 params.e820_table[params.e820_entries as usize].addr = addr; 1292 params.e820_table[params.e820_entries as usize].size = size; 1293 params.e820_table[params.e820_entries as usize].type_ = mem_type; 1294 params.e820_entries += 1; 1295 1296 Ok(()) 1297 } 1298 1299 fn add_memmap_entry(memmap: &mut Vec<hvm_memmap_table_entry>, addr: u64, size: u64, mem_type: u32) { 1300 // Add the table entry to the vector 1301 memmap.push(hvm_memmap_table_entry { 1302 addr, 1303 size, 1304 type_: mem_type, 1305 reserved: 0, 1306 }); 1307 } 1308 1309 /// Returns the memory address where the initramfs could be loaded. 1310 pub fn initramfs_load_addr( 1311 guest_mem: &GuestMemoryMmap, 1312 initramfs_size: usize, 1313 ) -> super::Result<u64> { 1314 let first_region = guest_mem 1315 .find_region(GuestAddress::new(0)) 1316 .ok_or(super::Error::InitramfsAddress)?; 1317 // It's safe to cast to usize because the size of a region can't be greater than usize. 1318 let lowmem_size = first_region.len() as usize; 1319 1320 if lowmem_size < initramfs_size { 1321 return Err(super::Error::InitramfsAddress); 1322 } 1323 1324 let aligned_addr: u64 = ((lowmem_size - initramfs_size) & !(crate::pagesize() - 1)) as u64; 1325 Ok(aligned_addr) 1326 } 1327 1328 pub fn get_host_cpu_phys_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>) -> u8 { 1329 // SAFETY: call cpuid with valid leaves 1330 unsafe { 1331 let leaf = x86_64::__cpuid(0x8000_0000); 1332 1333 // Detect and handle AMD SME (Secure Memory Encryption) properly. 1334 // Some physical address bits may become reserved when the feature is enabled. 1335 // See AMD64 Architecture Programmer's Manual Volume 2, Section 7.10.1 1336 let reduced = if leaf.eax >= 0x8000_001f 1337 && matches!(hypervisor.get_cpu_vendor(), CpuVendor::AMD) 1338 && x86_64::__cpuid(0x8000_001f).eax & 0x1 != 0 1339 { 1340 (x86_64::__cpuid(0x8000_001f).ebx >> 6) & 0x3f 1341 } else { 1342 0 1343 }; 1344 1345 if leaf.eax >= 0x8000_0008 { 1346 let leaf = x86_64::__cpuid(0x8000_0008); 1347 ((leaf.eax & 0xff) - reduced) as u8 1348 } else { 1349 36 1350 } 1351 } 1352 } 1353 1354 fn update_cpuid_topology( 1355 cpuid: &mut Vec<CpuIdEntry>, 1356 threads_per_core: u8, 1357 cores_per_die: u8, 1358 dies_per_package: u8, 1359 cpu_vendor: CpuVendor, 1360 id: u8, 1361 ) { 1362 let x2apic_id = get_x2apic_id( 1363 id as u32, 1364 Some((threads_per_core, cores_per_die, dies_per_package)), 1365 ); 1366 1367 let thread_width = 8 - (threads_per_core - 1).leading_zeros(); 1368 let core_width = (8 - (cores_per_die - 1).leading_zeros()) + thread_width; 1369 let die_width = (8 - (dies_per_package - 1).leading_zeros()) + core_width; 1370 1371 let mut cpu_ebx = CpuidPatch::get_cpuid_reg(cpuid, 0x1, None, CpuidReg::EBX).unwrap_or(0); 1372 cpu_ebx |= ((dies_per_package as u32) * (cores_per_die as u32) * (threads_per_core as u32)) 1373 & 0xff << 16; 1374 CpuidPatch::set_cpuid_reg(cpuid, 0x1, None, CpuidReg::EBX, cpu_ebx); 1375 1376 let mut cpu_edx = CpuidPatch::get_cpuid_reg(cpuid, 0x1, None, CpuidReg::EDX).unwrap_or(0); 1377 cpu_edx |= 1 << 28; 1378 CpuidPatch::set_cpuid_reg(cpuid, 0x1, None, CpuidReg::EDX, cpu_edx); 1379 1380 // CPU Topology leaf 0xb 1381 CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(0), CpuidReg::EAX, thread_width); 1382 CpuidPatch::set_cpuid_reg( 1383 cpuid, 1384 0xb, 1385 Some(0), 1386 CpuidReg::EBX, 1387 u32::from(threads_per_core), 1388 ); 1389 CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(0), CpuidReg::ECX, 1 << 8); 1390 1391 CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::EAX, die_width); 1392 CpuidPatch::set_cpuid_reg( 1393 cpuid, 1394 0xb, 1395 Some(1), 1396 CpuidReg::EBX, 1397 u32::from(dies_per_package * cores_per_die * threads_per_core), 1398 ); 1399 CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::ECX, 2 << 8); 1400 1401 // CPU Topology leaf 0x1f 1402 CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(0), CpuidReg::EAX, thread_width); 1403 CpuidPatch::set_cpuid_reg( 1404 cpuid, 1405 0x1f, 1406 Some(0), 1407 CpuidReg::EBX, 1408 u32::from(threads_per_core), 1409 ); 1410 CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(0), CpuidReg::ECX, 1 << 8); 1411 1412 CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(1), CpuidReg::EAX, core_width); 1413 CpuidPatch::set_cpuid_reg( 1414 cpuid, 1415 0x1f, 1416 Some(1), 1417 CpuidReg::EBX, 1418 u32::from(cores_per_die * threads_per_core), 1419 ); 1420 CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(1), CpuidReg::ECX, 2 << 8); 1421 1422 CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(2), CpuidReg::EAX, die_width); 1423 CpuidPatch::set_cpuid_reg( 1424 cpuid, 1425 0x1f, 1426 Some(2), 1427 CpuidReg::EBX, 1428 u32::from(dies_per_package * cores_per_die * threads_per_core), 1429 ); 1430 CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(2), CpuidReg::ECX, 5 << 8); 1431 1432 if matches!(cpu_vendor, CpuVendor::AMD) { 1433 CpuidPatch::set_cpuid_reg( 1434 cpuid, 1435 0x8000_001e, 1436 Some(0), 1437 CpuidReg::EBX, 1438 ((threads_per_core as u32 - 1) << 8) | (x2apic_id & 0xff), 1439 ); 1440 CpuidPatch::set_cpuid_reg( 1441 cpuid, 1442 0x8000_001e, 1443 Some(0), 1444 CpuidReg::ECX, 1445 ((dies_per_package as u32 - 1) << 8) | (thread_width + die_width) & 0xff, 1446 ); 1447 CpuidPatch::set_cpuid_reg(cpuid, 0x8000_001e, Some(0), CpuidReg::EDX, 0); 1448 if cores_per_die * threads_per_core > 1 { 1449 let ecx = 1450 CpuidPatch::get_cpuid_reg(cpuid, 0x8000_0001, Some(0), CpuidReg::ECX).unwrap_or(0); 1451 CpuidPatch::set_cpuid_reg( 1452 cpuid, 1453 0x8000_0001, 1454 Some(0), 1455 CpuidReg::ECX, 1456 ecx | (1u32 << 1) | (1u32 << 22), 1457 ); 1458 CpuidPatch::set_cpuid_reg( 1459 cpuid, 1460 0x0000_0001, 1461 Some(0), 1462 CpuidReg::EBX, 1463 (x2apic_id << 24) | (8 << 8) | (((cores_per_die * threads_per_core) as u32) << 16), 1464 ); 1465 let cpuid_patches = vec![ 1466 // Patch tsc deadline timer bit 1467 CpuidPatch { 1468 function: 1, 1469 index: 0, 1470 flags_bit: None, 1471 eax_bit: None, 1472 ebx_bit: None, 1473 ecx_bit: None, 1474 edx_bit: Some(28), 1475 }, 1476 ]; 1477 CpuidPatch::patch_cpuid(cpuid, cpuid_patches); 1478 CpuidPatch::set_cpuid_reg( 1479 cpuid, 1480 0x8000_0008, 1481 Some(0), 1482 CpuidReg::ECX, 1483 ((thread_width + core_width + die_width) << 12) 1484 | ((cores_per_die * threads_per_core) - 1) as u32, 1485 ); 1486 } else { 1487 CpuidPatch::set_cpuid_reg(cpuid, 0x8000_0008, Some(0), CpuidReg::ECX, 0u32); 1488 } 1489 } 1490 } 1491 1492 // The goal is to update the CPUID sub-leaves to reflect the number of EPC 1493 // sections exposed to the guest. 1494 fn update_cpuid_sgx( 1495 cpuid: &mut Vec<CpuIdEntry>, 1496 epc_sections: &[SgxEpcSection], 1497 ) -> Result<(), Error> { 1498 // Something's wrong if there's no EPC section. 1499 if epc_sections.is_empty() { 1500 return Err(Error::NoSgxEpcSection); 1501 } 1502 // We can't go further if the hypervisor does not support SGX feature. 1503 if !CpuidPatch::is_feature_enabled(cpuid, 0x7, 0, CpuidReg::EBX, 2) { 1504 return Err(Error::MissingSgxFeature); 1505 } 1506 // We can't go further if the hypervisor does not support SGX_LC feature. 1507 if !CpuidPatch::is_feature_enabled(cpuid, 0x7, 0, CpuidReg::ECX, 30) { 1508 return Err(Error::MissingSgxLaunchControlFeature); 1509 } 1510 1511 // Get host CPUID for leaf 0x12, subleaf 0x2. This is to retrieve EPC 1512 // properties such as confidentiality and integrity. 1513 // SAFETY: call cpuid with valid leaves 1514 let leaf = unsafe { std::arch::x86_64::__cpuid_count(0x12, 0x2) }; 1515 1516 for (i, epc_section) in epc_sections.iter().enumerate() { 1517 let subleaf_idx = i + 2; 1518 let start = epc_section.start().raw_value(); 1519 let size = epc_section.size(); 1520 let eax = (start & 0xffff_f000) as u32 | 0x1; 1521 let ebx = (start >> 32) as u32; 1522 let ecx = (size & 0xffff_f000) as u32 | (leaf.ecx & 0xf); 1523 let edx = (size >> 32) as u32; 1524 // CPU Topology leaf 0x12 1525 CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EAX, eax); 1526 CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EBX, ebx); 1527 CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::ECX, ecx); 1528 CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EDX, edx); 1529 } 1530 1531 // Add one NULL entry to terminate the dynamic list 1532 let subleaf_idx = epc_sections.len() + 2; 1533 // CPU Topology leaf 0x12 1534 CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EAX, 0); 1535 CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EBX, 0); 1536 CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::ECX, 0); 1537 CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EDX, 0); 1538 1539 Ok(()) 1540 } 1541 1542 #[cfg(test)] 1543 mod tests { 1544 use super::*; 1545 use linux_loader::loader::bootparam::boot_e820_entry; 1546 1547 #[test] 1548 fn regions_base_addr() { 1549 let regions = arch_memory_regions(); 1550 assert_eq!(4, regions.len()); 1551 assert_eq!(GuestAddress(0), regions[0].0); 1552 assert_eq!(GuestAddress(1 << 32), regions[1].0); 1553 } 1554 1555 #[test] 1556 fn test_system_configuration() { 1557 let no_vcpus = 4; 1558 let gm = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); 1559 let config_err = configure_system( 1560 &gm, 1561 GuestAddress(0), 1562 0, 1563 &None, 1564 1, 1565 None, 1566 Some(layout::RSDP_POINTER), 1567 None, 1568 None, 1569 None, 1570 None, 1571 None, 1572 ); 1573 assert!(config_err.is_err()); 1574 1575 // Now assigning some memory that falls before the 32bit memory hole. 1576 let arch_mem_regions = arch_memory_regions(); 1577 let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions 1578 .iter() 1579 .filter(|r| r.2 == RegionType::Ram && r.1 != usize::MAX) 1580 .map(|r| (r.0, r.1)) 1581 .collect(); 1582 let gm = GuestMemoryMmap::from_ranges(&ram_regions).unwrap(); 1583 1584 configure_system( 1585 &gm, 1586 GuestAddress(0), 1587 0, 1588 &None, 1589 no_vcpus, 1590 None, 1591 None, 1592 None, 1593 None, 1594 None, 1595 None, 1596 None, 1597 ) 1598 .unwrap(); 1599 1600 // Now assigning some memory that falls after the 32bit memory hole. 1601 let arch_mem_regions = arch_memory_regions(); 1602 let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions 1603 .iter() 1604 .filter(|r| r.2 == RegionType::Ram) 1605 .map(|r| { 1606 if r.1 == usize::MAX { 1607 (r.0, 128 << 20) 1608 } else { 1609 (r.0, r.1) 1610 } 1611 }) 1612 .collect(); 1613 let gm = GuestMemoryMmap::from_ranges(&ram_regions).unwrap(); 1614 configure_system( 1615 &gm, 1616 GuestAddress(0), 1617 0, 1618 &None, 1619 no_vcpus, 1620 None, 1621 None, 1622 None, 1623 None, 1624 None, 1625 None, 1626 None, 1627 ) 1628 .unwrap(); 1629 1630 configure_system( 1631 &gm, 1632 GuestAddress(0), 1633 0, 1634 &None, 1635 no_vcpus, 1636 None, 1637 None, 1638 None, 1639 None, 1640 None, 1641 None, 1642 None, 1643 ) 1644 .unwrap(); 1645 } 1646 1647 #[test] 1648 fn test_add_e820_entry() { 1649 let e820_table = [(boot_e820_entry { 1650 addr: 0x1, 1651 size: 4, 1652 type_: 1, 1653 }); 128]; 1654 1655 let expected_params = boot_params { 1656 e820_table, 1657 e820_entries: 1, 1658 ..Default::default() 1659 }; 1660 1661 let mut params: boot_params = Default::default(); 1662 add_e820_entry( 1663 &mut params, 1664 e820_table[0].addr, 1665 e820_table[0].size, 1666 e820_table[0].type_, 1667 ) 1668 .unwrap(); 1669 assert_eq!( 1670 format!("{:?}", params.e820_table[0]), 1671 format!("{:?}", expected_params.e820_table[0]) 1672 ); 1673 assert_eq!(params.e820_entries, expected_params.e820_entries); 1674 1675 // Exercise the scenario where the field storing the length of the e820 entry table is 1676 // is bigger than the allocated memory. 1677 params.e820_entries = params.e820_table.len() as u8 + 1; 1678 assert!(add_e820_entry( 1679 &mut params, 1680 e820_table[0].addr, 1681 e820_table[0].size, 1682 e820_table[0].type_ 1683 ) 1684 .is_err()); 1685 } 1686 1687 #[test] 1688 fn test_add_memmap_entry() { 1689 let mut memmap: Vec<hvm_memmap_table_entry> = Vec::new(); 1690 1691 let expected_memmap = vec![ 1692 hvm_memmap_table_entry { 1693 addr: 0x0, 1694 size: 0x1000, 1695 type_: E820_RAM, 1696 ..Default::default() 1697 }, 1698 hvm_memmap_table_entry { 1699 addr: 0x10000, 1700 size: 0xa000, 1701 type_: E820_RESERVED, 1702 ..Default::default() 1703 }, 1704 ]; 1705 1706 add_memmap_entry(&mut memmap, 0, 0x1000, E820_RAM); 1707 add_memmap_entry(&mut memmap, 0x10000, 0xa000, E820_RESERVED); 1708 1709 assert_eq!(format!("{memmap:?}"), format!("{expected_memmap:?}")); 1710 } 1711 1712 #[test] 1713 fn test_get_x2apic_id() { 1714 let x2apic_id = get_x2apic_id(0, Some((2, 3, 1))); 1715 assert_eq!(x2apic_id, 0); 1716 1717 let x2apic_id = get_x2apic_id(1, Some((2, 3, 1))); 1718 assert_eq!(x2apic_id, 1); 1719 1720 let x2apic_id = get_x2apic_id(2, Some((2, 3, 1))); 1721 assert_eq!(x2apic_id, 2); 1722 1723 let x2apic_id = get_x2apic_id(6, Some((2, 3, 1))); 1724 assert_eq!(x2apic_id, 8); 1725 1726 let x2apic_id = get_x2apic_id(7, Some((2, 3, 1))); 1727 assert_eq!(x2apic_id, 9); 1728 1729 let x2apic_id = get_x2apic_id(8, Some((2, 3, 1))); 1730 assert_eq!(x2apic_id, 10); 1731 } 1732 } 1733