1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use std::collections::BTreeMap; 15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 16 use std::io::Write; 17 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 18 use std::mem::size_of; 19 use std::os::unix::thread::JoinHandleExt; 20 use std::sync::atomic::{AtomicBool, Ordering}; 21 use std::sync::{Arc, Barrier, Mutex}; 22 use std::{cmp, io, result, thread}; 23 24 use acpi_tables::sdt::Sdt; 25 use acpi_tables::{aml, Aml}; 26 use anyhow::anyhow; 27 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 28 use arch::aarch64::regs; 29 #[cfg(target_arch = "x86_64")] 30 use arch::x86_64::get_x2apic_id; 31 use arch::{EntryPoint, NumaNodes}; 32 #[cfg(target_arch = "aarch64")] 33 use devices::gic::Gic; 34 use devices::interrupt_controller::InterruptController; 35 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 36 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 37 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 38 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs}; 39 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 40 use hypervisor::arch::x86::msr_index; 41 #[cfg(target_arch = "x86_64")] 42 use hypervisor::arch::x86::CpuIdEntry; 43 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 44 use hypervisor::arch::x86::MsrEntry; 45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 46 use hypervisor::arch::x86::SpecialRegisters; 47 #[cfg(target_arch = "aarch64")] 48 use hypervisor::kvm::kvm_bindings; 49 #[cfg(all(target_arch = "aarch64", feature = "kvm"))] 50 use hypervisor::kvm::kvm_ioctls::Cap; 51 #[cfg(feature = "tdx")] 52 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus}; 53 #[cfg(target_arch = "x86_64")] 54 use hypervisor::CpuVendor; 55 #[cfg(feature = "kvm")] 56 use hypervisor::HypervisorType; 57 #[cfg(feature = "guest_debug")] 58 use hypervisor::StandardRegisters; 59 use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps}; 60 use libc::{c_void, siginfo_t}; 61 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 62 use linux_loader::elf::Elf64_Nhdr; 63 use seccompiler::{apply_filter, SeccompAction}; 64 use thiserror::Error; 65 use tracer::trace_scoped; 66 use vm_device::BusDevice; 67 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 68 use vm_memory::ByteValued; 69 #[cfg(feature = "guest_debug")] 70 use vm_memory::{Bytes, GuestAddressSpace}; 71 use vm_memory::{GuestAddress, GuestMemoryAtomic}; 72 use vm_migration::{ 73 snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, 74 Transportable, 75 }; 76 use vmm_sys_util::eventfd::EventFd; 77 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN}; 78 use zerocopy::AsBytes; 79 80 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 81 use crate::coredump::{ 82 CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable, 83 GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE, 84 NT_PRSTATUS, 85 }; 86 #[cfg(feature = "guest_debug")] 87 use 
crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
#[cfg(target_arch = "x86_64")]
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::vm_config::CpusConfig;
use crate::{GuestMemoryMmap, CPU_MANAGER_SNAPSHOT_ID};

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
macro_rules! extract_bits_64_without_offset {
    ($value: tt, $length: tt) => {
        $value & (!0u64 >> (64 - $length))
    };
}

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[error("vCPU removal is still pending")]
    VcpuPendingRemovedVcpu,

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error finalising vCPU: {0}")]
    VcpuArmFinalize(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up AMX: {0}")]
AmxEnable(#[source] anyhow::Error), 192 193 #[error("Maximum number of vCPUs exceeds host limit")] 194 MaximumVcpusExceeded, 195 196 #[cfg(feature = "sev_snp")] 197 #[error("Failed to set sev control register: {0}")] 198 SetSevControlRegister(#[source] hypervisor::HypervisorCpuError), 199 200 #[cfg(target_arch = "x86_64")] 201 #[error("Failed to inject NMI")] 202 NmiError(hypervisor::HypervisorCpuError), 203 } 204 pub type Result<T> = result::Result<T, Error>; 205 206 #[cfg(target_arch = "x86_64")] 207 #[allow(dead_code)] 208 #[repr(packed)] 209 #[derive(AsBytes)] 210 struct LocalX2Apic { 211 pub r#type: u8, 212 pub length: u8, 213 pub _reserved: u16, 214 pub apic_id: u32, 215 pub flags: u32, 216 pub processor_id: u32, 217 } 218 219 #[allow(dead_code)] 220 #[repr(packed)] 221 #[derive(Default, AsBytes)] 222 struct Ioapic { 223 pub r#type: u8, 224 pub length: u8, 225 pub ioapic_id: u8, 226 _reserved: u8, 227 pub apic_address: u32, 228 pub gsi_base: u32, 229 } 230 231 #[cfg(target_arch = "aarch64")] 232 #[allow(dead_code)] 233 #[repr(packed)] 234 #[derive(AsBytes)] 235 struct GicC { 236 pub r#type: u8, 237 pub length: u8, 238 pub reserved0: u16, 239 pub cpu_interface_number: u32, 240 pub uid: u32, 241 pub flags: u32, 242 pub parking_version: u32, 243 pub performance_interrupt: u32, 244 pub parked_address: u64, 245 pub base_address: u64, 246 pub gicv_base_address: u64, 247 pub gich_base_address: u64, 248 pub vgic_interrupt: u32, 249 pub gicr_base_address: u64, 250 pub mpidr: u64, 251 pub proc_power_effi_class: u8, 252 pub reserved1: u8, 253 pub spe_overflow_interrupt: u16, 254 } 255 256 #[cfg(target_arch = "aarch64")] 257 #[allow(dead_code)] 258 #[repr(packed)] 259 #[derive(AsBytes)] 260 struct GicD { 261 pub r#type: u8, 262 pub length: u8, 263 pub reserved0: u16, 264 pub gic_id: u32, 265 pub base_address: u64, 266 pub global_irq_base: u32, 267 pub version: u8, 268 pub reserved1: [u8; 3], 269 } 270 271 #[cfg(target_arch = "aarch64")] 272 #[allow(dead_code)] 273 #[repr(packed)] 274 #[derive(AsBytes)] 275 struct GicR { 276 pub r#type: u8, 277 pub length: u8, 278 pub reserved: u16, 279 pub base_address: u64, 280 pub range_length: u32, 281 } 282 283 #[cfg(target_arch = "aarch64")] 284 #[allow(dead_code)] 285 #[repr(packed)] 286 #[derive(AsBytes)] 287 struct GicIts { 288 pub r#type: u8, 289 pub length: u8, 290 pub reserved0: u16, 291 pub translation_id: u32, 292 pub base_address: u64, 293 pub reserved1: u32, 294 } 295 296 #[cfg(target_arch = "aarch64")] 297 #[allow(dead_code)] 298 #[repr(packed)] 299 #[derive(AsBytes)] 300 struct ProcessorHierarchyNode { 301 pub r#type: u8, 302 pub length: u8, 303 pub reserved: u16, 304 pub flags: u32, 305 pub parent: u32, 306 pub acpi_processor_id: u32, 307 pub num_private_resources: u32, 308 } 309 310 #[allow(dead_code)] 311 #[repr(packed)] 312 #[derive(Default, AsBytes)] 313 struct InterruptSourceOverride { 314 pub r#type: u8, 315 pub length: u8, 316 pub bus: u8, 317 pub source: u8, 318 pub gsi: u32, 319 pub flags: u16, 320 } 321 322 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 323 macro_rules! round_up { 324 ($n:expr,$d:expr) => { 325 (($n / ($d + 1)) + 1) * $d 326 }; 327 } 328 329 /// A wrapper around creating and using a kvm-based VCPU. 330 pub struct Vcpu { 331 // The hypervisor abstracted CPU. 
332 vcpu: Arc<dyn hypervisor::Vcpu>, 333 id: u8, 334 #[cfg(target_arch = "aarch64")] 335 mpidr: u64, 336 saved_state: Option<CpuState>, 337 #[cfg(target_arch = "x86_64")] 338 vendor: CpuVendor, 339 } 340 341 impl Vcpu { 342 /// Constructs a new VCPU for `vm`. 343 /// 344 /// # Arguments 345 /// 346 /// * `id` - Represents the CPU number between [0, max vcpus). 347 /// * `vm` - The virtual machine this vcpu will get attached to. 348 /// * `vm_ops` - Optional object for exit handling. 349 /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0) 350 pub fn new( 351 id: u8, 352 apic_id: u8, 353 vm: &Arc<dyn hypervisor::Vm>, 354 vm_ops: Option<Arc<dyn VmOps>>, 355 #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor, 356 ) -> Result<Self> { 357 let vcpu = vm 358 .create_vcpu(apic_id, vm_ops) 359 .map_err(|e| Error::VcpuCreate(e.into()))?; 360 // Initially the cpuid per vCPU is the one supported by this VM. 361 Ok(Vcpu { 362 vcpu, 363 id, 364 #[cfg(target_arch = "aarch64")] 365 mpidr: 0, 366 saved_state: None, 367 #[cfg(target_arch = "x86_64")] 368 vendor: cpu_vendor, 369 }) 370 } 371 372 /// Configures a vcpu and should be called once per vcpu when created. 373 /// 374 /// # Arguments 375 /// 376 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 377 /// * `guest_memory` - Guest memory. 378 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 379 pub fn configure( 380 &mut self, 381 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 382 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 383 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 384 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 385 #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>, 386 ) -> Result<()> { 387 #[cfg(target_arch = "aarch64")] 388 { 389 self.init(vm)?; 390 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup) 391 .map_err(Error::VcpuConfiguration)?; 392 } 393 info!("Configuring vCPU: cpu_id = {}", self.id); 394 #[cfg(target_arch = "x86_64")] 395 arch::configure_vcpu( 396 &self.vcpu, 397 self.id, 398 boot_setup, 399 cpuid, 400 kvm_hyperv, 401 self.vendor, 402 topology, 403 ) 404 .map_err(Error::VcpuConfiguration)?; 405 406 Ok(()) 407 } 408 409 /// Gets the MPIDR register value. 410 #[cfg(target_arch = "aarch64")] 411 pub fn get_mpidr(&self) -> u64 { 412 self.mpidr 413 } 414 415 /// Gets the saved vCPU state. 416 #[cfg(target_arch = "aarch64")] 417 pub fn get_saved_state(&self) -> Option<CpuState> { 418 self.saved_state.clone() 419 } 420 421 /// Initializes an aarch64 specific vcpu for booting Linux. 422 #[cfg(target_arch = "aarch64")] 423 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 424 use std::arch::is_aarch64_feature_detected; 425 let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); 426 #[allow(clippy::nonminimal_bool)] 427 let sve_supported = 428 is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2"); 429 // This reads back the kernel's preferred target type. 430 vm.get_preferred_target(&mut kvi) 431 .map_err(Error::VcpuArmPreferredTarget)?; 432 // We already checked that the capability is supported. 
433 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; 434 if vm 435 .as_any() 436 .downcast_ref::<hypervisor::kvm::KvmVm>() 437 .unwrap() 438 .check_extension(Cap::ArmPmuV3) 439 { 440 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3; 441 } 442 443 if sve_supported 444 && vm 445 .as_any() 446 .downcast_ref::<hypervisor::kvm::KvmVm>() 447 .unwrap() 448 .check_extension(Cap::ArmSve) 449 { 450 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_SVE; 451 } 452 453 // Non-boot cpus are powered off initially. 454 if self.id > 0 { 455 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; 456 } 457 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)?; 458 if sve_supported { 459 self.vcpu 460 .vcpu_finalize(kvm_bindings::KVM_ARM_VCPU_SVE as i32) 461 .map_err(Error::VcpuArmFinalize)?; 462 } 463 Ok(()) 464 } 465 466 /// Runs the VCPU until it exits, returning the reason. 467 /// 468 /// Note that the state of the VCPU and associated VM must be setup first for this to do 469 /// anything useful. 470 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 471 self.vcpu.run() 472 } 473 474 #[cfg(feature = "sev_snp")] 475 pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> { 476 self.vcpu 477 .set_sev_control_register(vmsa_pfn) 478 .map_err(Error::SetSevControlRegister) 479 } 480 } 481 482 impl Pausable for Vcpu {} 483 impl Snapshottable for Vcpu { 484 fn id(&self) -> String { 485 self.id.to_string() 486 } 487 488 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 489 let saved_state = self 490 .vcpu 491 .state() 492 .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?; 493 494 self.saved_state = Some(saved_state.clone()); 495 496 Ok(Snapshot::from_data(SnapshotData::new_from_state( 497 &saved_state, 498 )?)) 499 } 500 } 501 502 pub struct CpuManager { 503 config: CpusConfig, 504 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 505 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 506 #[cfg(target_arch = "x86_64")] 507 cpuid: Vec<CpuIdEntry>, 508 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 509 vm: Arc<dyn hypervisor::Vm>, 510 vcpus_kill_signalled: Arc<AtomicBool>, 511 vcpus_pause_signalled: Arc<AtomicBool>, 512 vcpus_kick_signalled: Arc<AtomicBool>, 513 exit_evt: EventFd, 514 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 515 reset_evt: EventFd, 516 #[cfg(feature = "guest_debug")] 517 vm_debug_evt: EventFd, 518 vcpu_states: Vec<VcpuState>, 519 selected_cpu: u8, 520 vcpus: Vec<Arc<Mutex<Vcpu>>>, 521 seccomp_action: SeccompAction, 522 vm_ops: Arc<dyn VmOps>, 523 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 524 acpi_address: Option<GuestAddress>, 525 proximity_domain_per_cpu: BTreeMap<u8, u32>, 526 affinity: BTreeMap<u8, Vec<usize>>, 527 dynamic: bool, 528 hypervisor: Arc<dyn hypervisor::Hypervisor>, 529 #[cfg(feature = "sev_snp")] 530 sev_snp_enabled: bool, 531 } 532 533 const CPU_ENABLE_FLAG: usize = 0; 534 const CPU_INSERTING_FLAG: usize = 1; 535 const CPU_REMOVING_FLAG: usize = 2; 536 const CPU_EJECT_FLAG: usize = 3; 537 538 const CPU_STATUS_OFFSET: u64 = 4; 539 const CPU_SELECTION_OFFSET: u64 = 0; 540 541 impl BusDevice for CpuManager { 542 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 543 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
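        // Descriptive note on the register layout handled here (mirroring the
        // constants above): offset 0 selects a vCPU, and offset 4 is a per-vCPU
        // status byte whose bits are CPU_ENABLE_FLAG, CPU_INSERTING_FLAG,
        // CPU_REMOVING_FLAG and CPU_EJECT_FLAG. The ACPI methods emitted later in
        // this file reach these bits through the CSEL/CPEN/CINS/CRMV/CEJ0 fields.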
544 data.fill(0); 545 546 match offset { 547 CPU_SELECTION_OFFSET => { 548 data[0] = self.selected_cpu; 549 } 550 CPU_STATUS_OFFSET => { 551 if self.selected_cpu < self.max_vcpus() { 552 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 553 if state.active() { 554 data[0] |= 1 << CPU_ENABLE_FLAG; 555 } 556 if state.inserting { 557 data[0] |= 1 << CPU_INSERTING_FLAG; 558 } 559 if state.removing { 560 data[0] |= 1 << CPU_REMOVING_FLAG; 561 } 562 } else { 563 warn!("Out of range vCPU id: {}", self.selected_cpu); 564 } 565 } 566 _ => { 567 warn!( 568 "Unexpected offset for accessing CPU manager device: {:#}", 569 offset 570 ); 571 } 572 } 573 } 574 575 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 576 match offset { 577 CPU_SELECTION_OFFSET => { 578 self.selected_cpu = data[0]; 579 } 580 CPU_STATUS_OFFSET => { 581 if self.selected_cpu < self.max_vcpus() { 582 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 583 // The ACPI code writes back a 1 to acknowledge the insertion 584 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 585 && state.inserting 586 { 587 state.inserting = false; 588 } 589 // Ditto for removal 590 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 591 && state.removing 592 { 593 state.removing = false; 594 } 595 // Trigger removal of vCPU 596 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 597 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 598 error!("Error removing vCPU: {:?}", e); 599 } 600 } 601 } else { 602 warn!("Out of range vCPU id: {}", self.selected_cpu); 603 } 604 } 605 _ => { 606 warn!( 607 "Unexpected offset for accessing CPU manager device: {:#}", 608 offset 609 ); 610 } 611 } 612 None 613 } 614 } 615 616 #[derive(Default)] 617 struct VcpuState { 618 inserting: bool, 619 removing: bool, 620 pending_removal: Arc<AtomicBool>, 621 handle: Option<thread::JoinHandle<()>>, 622 kill: Arc<AtomicBool>, 623 vcpu_run_interrupted: Arc<AtomicBool>, 624 paused: Arc<AtomicBool>, 625 } 626 627 impl VcpuState { 628 fn active(&self) -> bool { 629 self.handle.is_some() 630 } 631 632 fn signal_thread(&self) { 633 if let Some(handle) = self.handle.as_ref() { 634 loop { 635 // SAFETY: FFI call with correct arguments 636 unsafe { 637 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 638 } 639 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 640 break; 641 } else { 642 // This is more effective than thread::yield_now() at 643 // avoiding a priority inversion with the vCPU thread 644 thread::sleep(std::time::Duration::from_millis(1)); 645 } 646 } 647 } 648 } 649 650 fn join_thread(&mut self) -> Result<()> { 651 if let Some(handle) = self.handle.take() { 652 handle.join().map_err(Error::ThreadCleanup)? 
653 } 654 655 Ok(()) 656 } 657 658 fn unpark_thread(&self) { 659 if let Some(handle) = self.handle.as_ref() { 660 handle.thread().unpark() 661 } 662 } 663 } 664 665 impl CpuManager { 666 #[allow(unused_variables)] 667 #[allow(clippy::too_many_arguments)] 668 pub fn new( 669 config: &CpusConfig, 670 vm: Arc<dyn hypervisor::Vm>, 671 exit_evt: EventFd, 672 reset_evt: EventFd, 673 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 674 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 675 seccomp_action: SeccompAction, 676 vm_ops: Arc<dyn VmOps>, 677 #[cfg(feature = "tdx")] tdx_enabled: bool, 678 numa_nodes: &NumaNodes, 679 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 680 ) -> Result<Arc<Mutex<CpuManager>>> { 681 if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() { 682 return Err(Error::MaximumVcpusExceeded); 683 } 684 685 let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); 686 vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); 687 let hypervisor_type = hypervisor.hypervisor_type(); 688 #[cfg(target_arch = "x86_64")] 689 let cpu_vendor = hypervisor.get_cpu_vendor(); 690 691 #[cfg(target_arch = "x86_64")] 692 if config.features.amx { 693 const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; 694 const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; 695 const XFEATURE_XTILEDATA: usize = 18; 696 const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; 697 698 // SAFETY: the syscall is only modifying kernel internal 699 // data structures that the kernel is itself expected to safeguard. 700 let amx_tile = unsafe { 701 libc::syscall( 702 libc::SYS_arch_prctl, 703 ARCH_REQ_XCOMP_GUEST_PERM, 704 XFEATURE_XTILEDATA, 705 ) 706 }; 707 708 if amx_tile != 0 { 709 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 710 } else { 711 let mask: usize = 0; 712 // SAFETY: the mask being modified (not marked mutable as it is 713 // modified in unsafe only which is permitted) isn't in use elsewhere. 
714 let result = unsafe { 715 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 716 }; 717 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 718 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 719 } 720 } 721 } 722 723 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 724 let mut cpu_list = Vec::new(); 725 for (proximity_domain, numa_node) in numa_nodes.iter() { 726 for cpu in numa_node.cpus.iter() { 727 cpu_list.push((*cpu, *proximity_domain)) 728 } 729 } 730 cpu_list 731 } 732 .into_iter() 733 .collect(); 734 735 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 736 cpu_affinity 737 .iter() 738 .map(|a| (a.vcpu, a.host_cpus.clone())) 739 .collect() 740 } else { 741 BTreeMap::new() 742 }; 743 744 #[cfg(feature = "tdx")] 745 let dynamic = !tdx_enabled; 746 #[cfg(not(feature = "tdx"))] 747 let dynamic = true; 748 749 Ok(Arc::new(Mutex::new(CpuManager { 750 config: config.clone(), 751 interrupt_controller: None, 752 #[cfg(target_arch = "x86_64")] 753 cpuid: Vec::new(), 754 vm, 755 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 756 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 757 vcpus_kick_signalled: Arc::new(AtomicBool::new(false)), 758 vcpu_states, 759 exit_evt, 760 reset_evt, 761 #[cfg(feature = "guest_debug")] 762 vm_debug_evt, 763 selected_cpu: 0, 764 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 765 seccomp_action, 766 vm_ops, 767 acpi_address: None, 768 proximity_domain_per_cpu, 769 affinity, 770 dynamic, 771 hypervisor: hypervisor.clone(), 772 #[cfg(feature = "sev_snp")] 773 sev_snp_enabled, 774 }))) 775 } 776 777 #[cfg(target_arch = "x86_64")] 778 pub fn populate_cpuid( 779 &mut self, 780 memory_manager: &Arc<Mutex<MemoryManager>>, 781 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 782 #[cfg(feature = "tdx")] tdx: bool, 783 ) -> Result<()> { 784 let sgx_epc_sections = memory_manager 785 .lock() 786 .unwrap() 787 .sgx_epc_region() 788 .as_ref() 789 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 790 791 self.cpuid = { 792 let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits); 793 arch::generate_common_cpuid( 794 hypervisor, 795 &arch::CpuidConfig { 796 sgx_epc_sections, 797 phys_bits, 798 kvm_hyperv: self.config.kvm_hyperv, 799 #[cfg(feature = "tdx")] 800 tdx, 801 amx: self.config.features.amx, 802 }, 803 ) 804 .map_err(Error::CommonCpuId)? 805 }; 806 807 Ok(()) 808 } 809 810 fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> { 811 info!("Creating vCPU: cpu_id = {}", cpu_id); 812 813 #[cfg(target_arch = "x86_64")] 814 let topology = self.get_vcpu_topology(); 815 #[cfg(target_arch = "x86_64")] 816 let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology); 817 #[cfg(target_arch = "aarch64")] 818 let x2apic_id = cpu_id as u32; 819 820 let mut vcpu = Vcpu::new( 821 cpu_id, 822 x2apic_id as u8, 823 &self.vm, 824 Some(self.vm_ops.clone()), 825 #[cfg(target_arch = "x86_64")] 826 self.hypervisor.get_cpu_vendor(), 827 )?; 828 829 if let Some(snapshot) = snapshot { 830 // AArch64 vCPUs should be initialized after created. 
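            // Note (assumption based on KVM behaviour): a vCPU must be initialized
            // with KVM_ARM_VCPU_INIT before its register state can be set, which is
            // why init() runs before set_state() below.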
831 #[cfg(target_arch = "aarch64")] 832 vcpu.init(&self.vm)?; 833 834 let state: CpuState = snapshot.to_state().map_err(|e| { 835 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e)) 836 })?; 837 vcpu.vcpu 838 .set_state(&state) 839 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?; 840 841 vcpu.saved_state = Some(state); 842 } 843 844 let vcpu = Arc::new(Mutex::new(vcpu)); 845 846 // Adding vCPU to the CpuManager's vCPU list. 847 self.vcpus.push(vcpu.clone()); 848 849 Ok(vcpu) 850 } 851 852 pub fn configure_vcpu( 853 &self, 854 vcpu: Arc<Mutex<Vcpu>>, 855 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 856 ) -> Result<()> { 857 let mut vcpu = vcpu.lock().unwrap(); 858 859 #[cfg(feature = "sev_snp")] 860 if self.sev_snp_enabled { 861 if let Some((kernel_entry_point, _)) = boot_setup { 862 vcpu.set_sev_control_register( 863 kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE, 864 )?; 865 } 866 867 // Traditional way to configure vcpu doesn't work for SEV-SNP guests. 868 // All the vCPU configuration for SEV-SNP guest is provided via VMSA. 869 return Ok(()); 870 } 871 872 #[cfg(target_arch = "x86_64")] 873 assert!(!self.cpuid.is_empty()); 874 875 #[cfg(target_arch = "x86_64")] 876 let topology = self.config.topology.clone().map_or_else( 877 || Some((1, self.boot_vcpus(), 1)), 878 |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)), 879 ); 880 #[cfg(target_arch = "x86_64")] 881 vcpu.configure( 882 boot_setup, 883 self.cpuid.clone(), 884 self.config.kvm_hyperv, 885 topology, 886 )?; 887 888 #[cfg(target_arch = "aarch64")] 889 vcpu.configure(&self.vm, boot_setup)?; 890 891 Ok(()) 892 } 893 894 /// Only create new vCPUs if there aren't any inactive ones to reuse 895 fn create_vcpus( 896 &mut self, 897 desired_vcpus: u8, 898 snapshot: Option<Snapshot>, 899 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 900 let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![]; 901 info!( 902 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 903 desired_vcpus, 904 self.config.max_vcpus, 905 self.vcpus.len(), 906 self.present_vcpus() 907 ); 908 909 if desired_vcpus > self.config.max_vcpus { 910 return Err(Error::DesiredVCpuCountExceedsMax); 911 } 912 913 // Only create vCPUs in excess of all the allocated vCPUs. 914 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 915 vcpus.push(self.create_vcpu( 916 cpu_id, 917 // TODO: The special format of the CPU id can be removed once 918 // ready to break live upgrade. 919 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()), 920 )?); 921 } 922 923 Ok(vcpus) 924 } 925 926 #[cfg(target_arch = "aarch64")] 927 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 928 for cpu in self.vcpus.iter() { 929 let cpu = cpu.lock().unwrap(); 930 // Check if PMU attr is available, if not, log the information. 
931 if cpu.vcpu.has_pmu_support() { 932 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 933 } else { 934 debug!( 935 "PMU attribute is not supported in vCPU{}, skip PMU init!", 936 cpu.id 937 ); 938 return Ok(false); 939 } 940 } 941 942 Ok(true) 943 } 944 945 pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> { 946 self.vcpus.clone() 947 } 948 949 fn start_vcpu( 950 &mut self, 951 vcpu: Arc<Mutex<Vcpu>>, 952 vcpu_id: u8, 953 vcpu_thread_barrier: Arc<Barrier>, 954 inserting: bool, 955 ) -> Result<()> { 956 let reset_evt = self.reset_evt.try_clone().unwrap(); 957 let exit_evt = self.exit_evt.try_clone().unwrap(); 958 #[cfg(feature = "kvm")] 959 let hypervisor_type = self.hypervisor.hypervisor_type(); 960 #[cfg(feature = "guest_debug")] 961 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 962 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 963 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 964 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 965 let vcpu_kick_signalled = self.vcpus_kick_signalled.clone(); 966 967 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 968 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 969 .vcpu_run_interrupted 970 .clone(); 971 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 972 let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone(); 973 974 // Prepare the CPU set the current vCPU is expected to run onto. 975 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 976 // SAFETY: all zeros is a valid pattern 977 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 978 // SAFETY: FFI call, trivially safe 979 unsafe { libc::CPU_ZERO(&mut cpuset) }; 980 for host_cpu in host_cpus { 981 // SAFETY: FFI call, trivially safe 982 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) }; 983 } 984 cpuset 985 }); 986 987 // Retrieve seccomp filter for vcpu thread 988 let vcpu_seccomp_filter = get_seccomp_filter( 989 &self.seccomp_action, 990 Thread::Vcpu, 991 self.hypervisor.hypervisor_type(), 992 ) 993 .map_err(Error::CreateSeccompFilter)?; 994 995 #[cfg(target_arch = "x86_64")] 996 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 997 998 info!("Starting vCPU: cpu_id = {}", vcpu_id); 999 1000 let handle = Some( 1001 thread::Builder::new() 1002 .name(format!("vcpu{vcpu_id}")) 1003 .spawn(move || { 1004 // Schedule the thread to run on the expected CPU set 1005 if let Some(cpuset) = cpuset.as_ref() { 1006 // SAFETY: FFI call with correct arguments 1007 let ret = unsafe { 1008 libc::sched_setaffinity( 1009 0, 1010 std::mem::size_of::<libc::cpu_set_t>(), 1011 cpuset as *const libc::cpu_set_t, 1012 ) 1013 }; 1014 1015 if ret != 0 { 1016 error!( 1017 "Failed scheduling the vCPU {} on the expected CPU set: {}", 1018 vcpu_id, 1019 io::Error::last_os_error() 1020 ); 1021 return; 1022 } 1023 } 1024 1025 // Apply seccomp filter for vcpu thread. 1026 if !vcpu_seccomp_filter.is_empty() { 1027 if let Err(e) = 1028 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 1029 { 1030 error!("Error applying seccomp filter: {:?}", e); 1031 return; 1032 } 1033 } 1034 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 1035 // This uses an async signal safe handler to kill the vcpu handles. 1036 register_signal_handler(SIGRTMIN(), handle_signal) 1037 .expect("Failed to register vcpu signal handler"); 1038 // Block until all CPUs are ready. 
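                    // activate_vcpus() sizes this barrier as the number of newly
                    // started vCPU threads plus one (for the calling thread) and
                    // waits on it last, so no vCPU enters its run loop until every
                    // sibling thread has been spawned and set up.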
1039 vcpu_thread_barrier.wait(); 1040 1041 std::panic::catch_unwind(move || { 1042 loop { 1043 // If we are being told to pause, we park the thread 1044 // until the pause boolean is toggled. 1045 // The resume operation is responsible for toggling 1046 // the boolean and unpark the thread. 1047 // We enter a loop because park() could spuriously 1048 // return. We will then park() again unless the 1049 // pause boolean has been toggled. 1050 1051 // Need to use Ordering::SeqCst as we have multiple 1052 // loads and stores to different atomics and we need 1053 // to see them in a consistent order in all threads 1054 1055 if vcpu_pause_signalled.load(Ordering::SeqCst) { 1056 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are 1057 // completed by returning to KVM_RUN. From the kernel docs: 1058 // 1059 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, 1060 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding 1061 // operations are complete (and guest state is consistent) only after userspace 1062 // has re-entered the kernel with KVM_RUN. The kernel side will first finish 1063 // incomplete operations and then check for pending signals. 1064 // The pending state of the operation is not preserved in state which is 1065 // visible to userspace, thus userspace should ensure that the operation is 1066 // completed before performing a live migration. Userspace can re-enter the 1067 // guest with an unmasked signal pending or with the immediate_exit field set 1068 // to complete pending operations without allowing any further instructions 1069 // to be executed. 1070 1071 #[cfg(feature = "kvm")] 1072 if matches!(hypervisor_type, HypervisorType::Kvm) { 1073 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); 1074 if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { 1075 error!("Unexpected VM exit on \"immediate_exit\" run"); 1076 break; 1077 } 1078 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); 1079 } 1080 1081 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1082 1083 vcpu_paused.store(true, Ordering::SeqCst); 1084 while vcpu_pause_signalled.load(Ordering::SeqCst) { 1085 thread::park(); 1086 } 1087 vcpu_run_interrupted.store(false, Ordering::SeqCst); 1088 } 1089 1090 if vcpu_kick_signalled.load(Ordering::SeqCst) { 1091 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1092 #[cfg(target_arch = "x86_64")] 1093 match vcpu.lock().as_ref().unwrap().vcpu.nmi() { 1094 Ok(()) => {}, 1095 Err(e) => { 1096 error!("Error when inject nmi {}", e); 1097 break; 1098 } 1099 } 1100 } 1101 1102 // We've been told to terminate 1103 if vcpu_kill_signalled.load(Ordering::SeqCst) 1104 || vcpu_kill.load(Ordering::SeqCst) 1105 { 1106 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1107 break; 1108 } 1109 1110 #[cfg(feature = "tdx")] 1111 let mut vcpu = vcpu.lock().unwrap(); 1112 #[cfg(not(feature = "tdx"))] 1113 let vcpu = vcpu.lock().unwrap(); 1114 // vcpu.run() returns false on a triple-fault so trigger a reset 1115 match vcpu.run() { 1116 Ok(run) => match run { 1117 #[cfg(feature = "kvm")] 1118 VmExit::Debug => { 1119 info!("VmExit::Debug"); 1120 #[cfg(feature = "guest_debug")] 1121 { 1122 vcpu_pause_signalled.store(true, Ordering::SeqCst); 1123 let raw_tid = get_raw_tid(vcpu_id as usize); 1124 vm_debug_evt.write(raw_tid as u64).unwrap(); 1125 } 1126 } 1127 #[cfg(target_arch = "x86_64")] 1128 VmExit::IoapicEoi(vector) => { 1129 if let Some(interrupt_controller) = 1130 &interrupt_controller_clone 
1131 { 1132 interrupt_controller 1133 .lock() 1134 .unwrap() 1135 .end_of_interrupt(vector); 1136 } 1137 } 1138 VmExit::Ignore => {} 1139 VmExit::Hyperv => {} 1140 VmExit::Reset => { 1141 info!("VmExit::Reset"); 1142 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1143 reset_evt.write(1).unwrap(); 1144 break; 1145 } 1146 VmExit::Shutdown => { 1147 info!("VmExit::Shutdown"); 1148 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1149 exit_evt.write(1).unwrap(); 1150 break; 1151 } 1152 #[cfg(feature = "tdx")] 1153 VmExit::Tdx => { 1154 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1155 match vcpu.get_tdx_exit_details() { 1156 Ok(details) => match details { 1157 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1158 TdxExitDetails::SetupEventNotifyInterrupt => { 1159 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1160 } 1161 }, 1162 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1163 } 1164 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1165 } else { 1166 // We should never reach this code as 1167 // this means the design from the code 1168 // is wrong. 1169 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1170 } 1171 } 1172 }, 1173 1174 Err(e) => { 1175 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1176 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1177 exit_evt.write(1).unwrap(); 1178 break; 1179 } 1180 } 1181 1182 // We've been told to terminate 1183 if vcpu_kill_signalled.load(Ordering::SeqCst) 1184 || vcpu_kill.load(Ordering::SeqCst) 1185 { 1186 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1187 break; 1188 } 1189 } 1190 }) 1191 .or_else(|_| { 1192 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1193 error!("vCPU thread panicked"); 1194 panic_exit_evt.write(1) 1195 }) 1196 .ok(); 1197 }) 1198 .map_err(Error::VcpuSpawn)?, 1199 ); 1200 1201 // On hot plug calls into this function entry_point is None. It is for 1202 // those hotplug CPU additions that we need to set the inserting flag. 1203 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1204 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1205 1206 Ok(()) 1207 } 1208 1209 /// Start up as many vCPUs threads as needed to reach `desired_vcpus` 1210 fn activate_vcpus( 1211 &mut self, 1212 desired_vcpus: u8, 1213 inserting: bool, 1214 paused: Option<bool>, 1215 ) -> Result<()> { 1216 if desired_vcpus > self.config.max_vcpus { 1217 return Err(Error::DesiredVCpuCountExceedsMax); 1218 } 1219 1220 let vcpu_thread_barrier = Arc::new(Barrier::new( 1221 (desired_vcpus - self.present_vcpus() + 1) as usize, 1222 )); 1223 1224 if let Some(paused) = paused { 1225 self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); 1226 } 1227 1228 info!( 1229 "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", 1230 desired_vcpus, 1231 self.vcpus.len(), 1232 self.present_vcpus(), 1233 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1234 ); 1235 1236 // This reuses any inactive vCPUs as well as any that were newly created 1237 for vcpu_id in self.present_vcpus()..desired_vcpus { 1238 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1239 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1240 } 1241 1242 // Unblock all CPU threads. 
1243 vcpu_thread_barrier.wait(); 1244 Ok(()) 1245 } 1246 1247 fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) { 1248 // Mark vCPUs for removal, actual removal happens on ejection 1249 for cpu_id in desired_vcpus..self.present_vcpus() { 1250 self.vcpu_states[usize::from(cpu_id)].removing = true; 1251 self.vcpu_states[usize::from(cpu_id)] 1252 .pending_removal 1253 .store(true, Ordering::SeqCst); 1254 } 1255 } 1256 1257 pub fn check_pending_removed_vcpu(&mut self) -> bool { 1258 for state in self.vcpu_states.iter() { 1259 if state.active() && state.pending_removal.load(Ordering::SeqCst) { 1260 return true; 1261 } 1262 } 1263 false 1264 } 1265 1266 fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> { 1267 info!("Removing vCPU: cpu_id = {}", cpu_id); 1268 let state = &mut self.vcpu_states[usize::from(cpu_id)]; 1269 state.kill.store(true, Ordering::SeqCst); 1270 state.signal_thread(); 1271 state.join_thread()?; 1272 state.handle = None; 1273 1274 // Once the thread has exited, clear the "kill" so that it can reused 1275 state.kill.store(false, Ordering::SeqCst); 1276 state.pending_removal.store(false, Ordering::SeqCst); 1277 1278 Ok(()) 1279 } 1280 1281 pub fn create_boot_vcpus( 1282 &mut self, 1283 snapshot: Option<Snapshot>, 1284 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 1285 trace_scoped!("create_boot_vcpus"); 1286 1287 self.create_vcpus(self.boot_vcpus(), snapshot) 1288 } 1289 1290 // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running. 1291 pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> { 1292 self.activate_vcpus(self.boot_vcpus(), false, Some(paused)) 1293 } 1294 1295 pub fn start_restored_vcpus(&mut self) -> Result<()> { 1296 self.activate_vcpus(self.vcpus.len() as u8, false, Some(true)) 1297 .map_err(|e| { 1298 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) 1299 })?; 1300 1301 Ok(()) 1302 } 1303 1304 pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> { 1305 if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal { 1306 return Ok(false); 1307 } 1308 1309 if !self.dynamic { 1310 return Ok(false); 1311 } 1312 1313 if self.check_pending_removed_vcpu() { 1314 return Err(Error::VcpuPendingRemovedVcpu); 1315 } 1316 1317 match desired_vcpus.cmp(&self.present_vcpus()) { 1318 cmp::Ordering::Greater => { 1319 let vcpus = self.create_vcpus(desired_vcpus, None)?; 1320 for vcpu in vcpus { 1321 self.configure_vcpu(vcpu, None)? 1322 } 1323 self.activate_vcpus(desired_vcpus, true, None)?; 1324 Ok(true) 1325 } 1326 cmp::Ordering::Less => { 1327 self.mark_vcpus_for_removal(desired_vcpus); 1328 Ok(true) 1329 } 1330 _ => Ok(false), 1331 } 1332 } 1333 1334 pub fn shutdown(&mut self) -> Result<()> { 1335 // Tell the vCPUs to stop themselves next time they go through the loop 1336 self.vcpus_kill_signalled.store(true, Ordering::SeqCst); 1337 1338 // Toggle the vCPUs pause boolean 1339 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 1340 1341 // Unpark all the VCPU threads. 1342 for state in self.vcpu_states.iter() { 1343 state.unpark_thread(); 1344 } 1345 1346 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 1347 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 1348 // above. 1349 for state in self.vcpu_states.iter() { 1350 state.signal_thread(); 1351 } 1352 1353 // Wait for all the threads to finish. This removes the state from the vector. 1354 for mut state in self.vcpu_states.drain(..) 
{ 1355 state.join_thread()?; 1356 } 1357 1358 Ok(()) 1359 } 1360 1361 #[cfg(feature = "tdx")] 1362 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1363 for vcpu in &self.vcpus { 1364 vcpu.lock() 1365 .unwrap() 1366 .vcpu 1367 .tdx_init(hob_address) 1368 .map_err(Error::InitializeTdx)?; 1369 } 1370 Ok(()) 1371 } 1372 1373 pub fn boot_vcpus(&self) -> u8 { 1374 self.config.boot_vcpus 1375 } 1376 1377 pub fn max_vcpus(&self) -> u8 { 1378 self.config.max_vcpus 1379 } 1380 1381 #[cfg(target_arch = "x86_64")] 1382 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1383 assert!(!self.cpuid.is_empty()); 1384 self.cpuid.clone() 1385 } 1386 1387 fn present_vcpus(&self) -> u8 { 1388 self.vcpu_states 1389 .iter() 1390 .fold(0, |acc, state| acc + state.active() as u8) 1391 } 1392 1393 #[cfg(target_arch = "aarch64")] 1394 pub fn get_mpidrs(&self) -> Vec<u64> { 1395 self.vcpus 1396 .iter() 1397 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1398 .collect() 1399 } 1400 1401 #[cfg(target_arch = "aarch64")] 1402 pub fn get_saved_states(&self) -> Vec<CpuState> { 1403 self.vcpus 1404 .iter() 1405 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1406 .collect() 1407 } 1408 1409 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1410 self.config 1411 .topology 1412 .clone() 1413 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1414 } 1415 1416 pub fn create_madt(&self) -> Sdt { 1417 use crate::acpi; 1418 // This is also checked in the commandline parsing. 1419 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1420 1421 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1422 #[cfg(target_arch = "x86_64")] 1423 { 1424 madt.write(36, arch::layout::APIC_START.0); 1425 1426 for cpu in 0..self.config.max_vcpus { 1427 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology()); 1428 1429 let lapic = LocalX2Apic { 1430 r#type: acpi::ACPI_X2APIC_PROCESSOR, 1431 length: 16, 1432 processor_id: cpu.into(), 1433 apic_id: x2apic_id, 1434 flags: if cpu < self.config.boot_vcpus { 1435 1 << MADT_CPU_ENABLE_FLAG 1436 } else { 1437 0 1438 } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG, 1439 _reserved: 0, 1440 }; 1441 madt.append(lapic); 1442 } 1443 1444 madt.append(Ioapic { 1445 r#type: acpi::ACPI_APIC_IO, 1446 length: 12, 1447 ioapic_id: 0, 1448 apic_address: arch::layout::IOAPIC_START.0 as u32, 1449 gsi_base: 0, 1450 ..Default::default() 1451 }); 1452 1453 madt.append(InterruptSourceOverride { 1454 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1455 length: 10, 1456 bus: 0, 1457 source: 4, 1458 gsi: 4, 1459 flags: 0, 1460 }); 1461 } 1462 1463 #[cfg(target_arch = "aarch64")] 1464 { 1465 /* Notes: 1466 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1467 */ 1468 1469 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 
1470 for cpu in 0..self.config.boot_vcpus { 1471 let vcpu = &self.vcpus[cpu as usize]; 1472 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1473 /* ARMv8 MPIDR format: 1474 Bits [63:40] Must be zero 1475 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1476 Bits [31:24] Must be zero 1477 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1478 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1479 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1480 */ 1481 let mpidr_mask = 0xff_00ff_ffff; 1482 let gicc = GicC { 1483 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1484 length: 80, 1485 reserved0: 0, 1486 cpu_interface_number: cpu as u32, 1487 uid: cpu as u32, 1488 flags: 1, 1489 parking_version: 0, 1490 performance_interrupt: 0, 1491 parked_address: 0, 1492 base_address: 0, 1493 gicv_base_address: 0, 1494 gich_base_address: 0, 1495 vgic_interrupt: 0, 1496 gicr_base_address: 0, 1497 mpidr: mpidr & mpidr_mask, 1498 proc_power_effi_class: 0, 1499 reserved1: 0, 1500 spe_overflow_interrupt: 0, 1501 }; 1502 1503 madt.append(gicc); 1504 } 1505 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1506 1507 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1508 let gicd = GicD { 1509 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1510 length: 24, 1511 reserved0: 0, 1512 gic_id: 0, 1513 base_address: vgic_config.dist_addr, 1514 global_irq_base: 0, 1515 version: 3, 1516 reserved1: [0; 3], 1517 }; 1518 madt.append(gicd); 1519 1520 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1521 let gicr = GicR { 1522 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1523 length: 16, 1524 reserved: 0, 1525 base_address: vgic_config.redists_addr, 1526 range_length: vgic_config.redists_size as u32, 1527 }; 1528 madt.append(gicr); 1529 1530 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 1531 let gicits = GicIts { 1532 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1533 length: 20, 1534 reserved0: 0, 1535 translation_id: 0, 1536 base_address: vgic_config.msi_addr, 1537 reserved1: 0, 1538 }; 1539 madt.append(gicits); 1540 1541 madt.update_checksum(); 1542 } 1543 1544 madt 1545 } 1546 1547 #[cfg(target_arch = "aarch64")] 1548 pub fn create_pptt(&self) -> Sdt { 1549 let pptt_start = 0; 1550 let mut cpus = 0; 1551 let mut uid = 0; 1552 // If topology is not specified, the default setting is: 1553 // 1 package, multiple cores, 1 thread per core 1554 // This is also the behavior when PPTT is missing. 
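        // For example, with max_vcpus = 8 and no explicit topology this falls back
        // to (threads_per_core, cores_per_package, packages) = (1, 8, 1): a single
        // package node containing eight leaf processor nodes.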
1555 let (threads_per_core, cores_per_package, packages) = 1556 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1557 1558 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1559 1560 for cluster_idx in 0..packages { 1561 if cpus < self.config.boot_vcpus as usize { 1562 let cluster_offset = pptt.len() - pptt_start; 1563 let cluster_hierarchy_node = ProcessorHierarchyNode { 1564 r#type: 0, 1565 length: 20, 1566 reserved: 0, 1567 flags: 0x2, 1568 parent: 0, 1569 acpi_processor_id: cluster_idx as u32, 1570 num_private_resources: 0, 1571 }; 1572 pptt.append(cluster_hierarchy_node); 1573 1574 for core_idx in 0..cores_per_package { 1575 let core_offset = pptt.len() - pptt_start; 1576 1577 if threads_per_core > 1 { 1578 let core_hierarchy_node = ProcessorHierarchyNode { 1579 r#type: 0, 1580 length: 20, 1581 reserved: 0, 1582 flags: 0x2, 1583 parent: cluster_offset as u32, 1584 acpi_processor_id: core_idx as u32, 1585 num_private_resources: 0, 1586 }; 1587 pptt.append(core_hierarchy_node); 1588 1589 for _thread_idx in 0..threads_per_core { 1590 let thread_hierarchy_node = ProcessorHierarchyNode { 1591 r#type: 0, 1592 length: 20, 1593 reserved: 0, 1594 flags: 0xE, 1595 parent: core_offset as u32, 1596 acpi_processor_id: uid as u32, 1597 num_private_resources: 0, 1598 }; 1599 pptt.append(thread_hierarchy_node); 1600 uid += 1; 1601 } 1602 } else { 1603 let thread_hierarchy_node = ProcessorHierarchyNode { 1604 r#type: 0, 1605 length: 20, 1606 reserved: 0, 1607 flags: 0xA, 1608 parent: cluster_offset as u32, 1609 acpi_processor_id: uid as u32, 1610 num_private_resources: 0, 1611 }; 1612 pptt.append(thread_hierarchy_node); 1613 uid += 1; 1614 } 1615 } 1616 cpus += (cores_per_package * threads_per_core) as usize; 1617 } 1618 } 1619 1620 pptt.update_checksum(); 1621 pptt 1622 } 1623 1624 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1625 fn create_standard_regs(&self, cpu_id: u8) -> StandardRegisters { 1626 self.vcpus[usize::from(cpu_id)] 1627 .lock() 1628 .unwrap() 1629 .vcpu 1630 .create_standard_regs() 1631 } 1632 1633 #[cfg(feature = "guest_debug")] 1634 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1635 self.vcpus[usize::from(cpu_id)] 1636 .lock() 1637 .unwrap() 1638 .vcpu 1639 .get_regs() 1640 .map_err(Error::CpuDebug) 1641 } 1642 1643 #[cfg(feature = "guest_debug")] 1644 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1645 self.vcpus[usize::from(cpu_id)] 1646 .lock() 1647 .unwrap() 1648 .vcpu 1649 .set_regs(regs) 1650 .map_err(Error::CpuDebug) 1651 } 1652 1653 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1654 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1655 self.vcpus[usize::from(cpu_id)] 1656 .lock() 1657 .unwrap() 1658 .vcpu 1659 .get_sregs() 1660 .map_err(Error::CpuDebug) 1661 } 1662 1663 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1664 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1665 self.vcpus[usize::from(cpu_id)] 1666 .lock() 1667 .unwrap() 1668 .vcpu 1669 .set_sregs(sregs) 1670 .map_err(Error::CpuDebug) 1671 } 1672 1673 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1674 fn translate_gva( 1675 &self, 1676 _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1677 cpu_id: u8, 1678 gva: u64, 1679 ) -> Result<u64> { 1680 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1681 .lock() 1682 .unwrap() 1683 .vcpu 1684 .translate_gva(gva, /* flags: unused */ 0) 1685 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 
        Ok(gpa)
    }

    ///
    /// On AArch64, the `translate_gva` API is not provided by KVM. We implement
    /// it in the VMM by walking through the translation tables.
    ///
    /// Address translation is a big topic; here we only focus on the scenario
    /// that arises while the VMM is debugging the guest kernel. This
    /// `translate_gva` implementation is restricted to:
    /// - Exception Level 1
    /// - Translating the high address range only (kernel space)
    ///
    /// This implementation supports the following Armv8-A features related to
    /// address translation:
    /// - FEAT_LPA
    /// - FEAT_LVA
    /// - FEAT_LPA2
    ///
    #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
    fn translate_gva(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: u8,
        gva: u64,
    ) -> Result<u64> {
        let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TCR_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TTBR1_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::ID_AA64MMFR0_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;

        // Bit 55 of the VA determines the range, high (0xFFFxxx...)
        // or low (0x000xxx...).
        let high_range = extract_bits_64!(gva, 55, 1);
        if high_range == 0 {
            info!("VA (0x{:x}) range is not supported!", gva);
            return Ok(gva);
        }

        // High range size offset
        let tsz = extract_bits_64!(tcr_el1, 16, 6);
        // Granule size
        let tg = extract_bits_64!(tcr_el1, 30, 2);
        // Indication of 48-bit (0) or 52-bit (1) output addresses for FEAT_LPA2
        let ds = extract_bits_64!(tcr_el1, 59, 1);

        if tsz == 0 {
            info!("VA translation is not ready!");
            return Ok(gva);
        }

        // VA size is determined by TCR_EL1.T1SZ
        let va_size = 64 - tsz;
        // Number of bits in VA consumed in each level of translation
        let stride = match tg {
            3 => 13, // 64KB granule size
            1 => 11, // 16KB granule size
            _ => 9,  // 4KB, default
        };
        // Starting level of walking
        let mut level = 4 - (va_size - 4) / stride;

        // Determine the PA or IPA size
        let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
        let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
        // The IPA size in TCR_EL1 and the PA Range in ID_AA64MMFR0_EL1 should match.
        // To be safe, we use the minimum value if they are different.
        let pa_range = std::cmp::min(tcr_ips, pa_range);
        // PA size in bits
        let pa_size = match pa_range {
            0 => 32,
            1 => 36,
            2 => 40,
            3 => 42,
            4 => 44,
            5 => 48,
            6 => 52,
            _ => {
                return Err(Error::TranslateVirtualAddress(anyhow!(format!(
                    "PA range not supported {pa_range}"
                ))))
            }
        };

        let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
        let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
        // If FEAT_LPA2 is present, the translation table descriptor holds
        // 50 bits of the table address of the next level.
        // Otherwise, it is 48 bits.
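        // Worked example derived from the values above: with a 4KB granule
        // (stride = 9) and T1SZ = 16, va_size is 64 - 16 = 48 and the walk starts
        // at level 4 - (48 - 4) / 9 = 0; with a 64KB granule (stride = 13) it
        // starts at level 4 - (48 - 4) / 13 = 1. For the 4KB case the grain-size
        // index mask covers stride + 3 = 12 bits, so `descaddrmask` below keeps
        // descriptor address bits [47:12] (or [49:12] with FEAT_LPA2).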
1789 let descaddrmask = if ds == 1 { 1790 !0u64 >> (64 - 50) // mask with 50 least significant bits 1791 } else { 1792 !0u64 >> (64 - 48) // mask with 48 least significant bits 1793 }; 1794 let descaddrmask = descaddrmask & !indexmask_grainsize; 1795 1796 // Translation table base address 1797 let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48); 1798 // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table 1799 // address bits [48:51] comes from TTBR1_EL1 bits [2:5]. 1800 if pa_size == 52 { 1801 descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48; 1802 } 1803 1804 // Loop through tables of each level 1805 loop { 1806 // Table offset for current level 1807 let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask; 1808 descaddr |= table_offset; 1809 descaddr &= !7u64; 1810 1811 let mut buf = [0; 8]; 1812 guest_memory 1813 .memory() 1814 .read(&mut buf, GuestAddress(descaddr)) 1815 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1816 let descriptor = u64::from_le_bytes(buf); 1817 1818 descaddr = descriptor & descaddrmask; 1819 // In the case of FEAT_LPA, the next-level translation table address 1820 // bits [48:51] comes from bits [12:15] of the current descriptor. 1821 // For FEAT_LPA2, the next-level translation table address 1822 // bits [50:51] comes from bits [8:9] of the current descriptor, 1823 // bits [48:49] comes from bits [48:49] of the descriptor which was 1824 // handled previously. 1825 if pa_size == 52 { 1826 if ds == 1 { 1827 // FEAT_LPA2 1828 descaddr |= extract_bits_64!(descriptor, 8, 2) << 50; 1829 } else { 1830 // FEAT_LPA 1831 descaddr |= extract_bits_64!(descriptor, 12, 4) << 48; 1832 } 1833 } 1834 1835 if (descriptor & 2) != 0 && (level < 3) { 1836 // This is a table entry. Go down to next level. 
1837 level += 1; 1838 indexmask = indexmask_grainsize; 1839 continue; 1840 } 1841 1842 break; 1843 } 1844 1845 // We have reached either: 1846 // - a page entry at level 3 or 1847 // - a block entry at level 1 or 2 1848 let page_size = 1u64 << ((stride * (4 - level)) + 3); 1849 descaddr &= !(page_size - 1); 1850 descaddr |= gva & (page_size - 1); 1851 1852 Ok(descaddr) 1853 } 1854 1855 pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) { 1856 self.acpi_address = Some(acpi_address); 1857 } 1858 1859 pub(crate) fn set_interrupt_controller( 1860 &mut self, 1861 interrupt_controller: Arc<Mutex<dyn InterruptController>>, 1862 ) { 1863 self.interrupt_controller = Some(interrupt_controller); 1864 } 1865 1866 pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> { 1867 &self.vcpus_kill_signalled 1868 } 1869 1870 #[cfg(feature = "igvm")] 1871 pub(crate) fn get_cpuid_leaf( 1872 &self, 1873 cpu_id: u8, 1874 eax: u32, 1875 ecx: u32, 1876 xfem: u64, 1877 xss: u64, 1878 ) -> Result<[u32; 4]> { 1879 let leaf_info = self.vcpus[usize::from(cpu_id)] 1880 .lock() 1881 .unwrap() 1882 .vcpu 1883 .get_cpuid_values(eax, ecx, xfem, xss) 1884 .unwrap(); 1885 Ok(leaf_info) 1886 } 1887 1888 #[cfg(feature = "sev_snp")] 1889 pub(crate) fn sev_snp_enabled(&self) -> bool { 1890 self.sev_snp_enabled 1891 } 1892 1893 pub(crate) fn nmi(&self) -> Result<()> { 1894 self.vcpus_kick_signalled.store(true, Ordering::SeqCst); 1895 1896 for state in self.vcpu_states.iter() { 1897 state.signal_thread(); 1898 } 1899 1900 self.vcpus_kick_signalled.store(false, Ordering::SeqCst); 1901 1902 Ok(()) 1903 } 1904 } 1905 1906 struct Cpu { 1907 cpu_id: u8, 1908 proximity_domain: u32, 1909 dynamic: bool, 1910 #[cfg(target_arch = "x86_64")] 1911 topology: Option<(u8, u8, u8)>, 1912 } 1913 1914 #[cfg(target_arch = "x86_64")] 1915 const MADT_CPU_ENABLE_FLAG: usize = 0; 1916 1917 #[cfg(target_arch = "x86_64")] 1918 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; 1919 1920 impl Cpu { 1921 #[cfg(target_arch = "x86_64")] 1922 fn generate_mat(&self) -> Vec<u8> { 1923 let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology); 1924 1925 let lapic = LocalX2Apic { 1926 r#type: crate::acpi::ACPI_X2APIC_PROCESSOR, 1927 length: 16, 1928 processor_id: self.cpu_id.into(), 1929 apic_id: x2apic_id, 1930 flags: 1 << MADT_CPU_ENABLE_FLAG, 1931 _reserved: 0, 1932 }; 1933 1934 let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)]; 1935 // SAFETY: mat_data is large enough to hold lapic 1936 unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic }; 1937 1938 mat_data 1939 } 1940 } 1941 1942 impl Aml for Cpu { 1943 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1944 #[cfg(target_arch = "x86_64")] 1945 let mat_data: Vec<u8> = self.generate_mat(); 1946 #[allow(clippy::if_same_then_else)] 1947 if self.dynamic { 1948 aml::Device::new( 1949 format!("C{:03X}", self.cpu_id).as_str().into(), 1950 vec![ 1951 &aml::Name::new("_HID".into(), &"ACPI0007"), 1952 &aml::Name::new("_UID".into(), &self.cpu_id), 1953 // Currently, AArch64 cannot support following fields. 1954 /* 1955 _STA return value: 1956 Bit [0] – Set if the device is present. 1957 Bit [1] – Set if the device is enabled and decoding its resources. 1958 Bit [2] – Set if the device should be shown in the UI. 1959 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1960 Bit [4] – Set if the battery is present. 1961 Bits [31:5] – Reserved (must be cleared). 
1962 */ 1963 #[cfg(target_arch = "x86_64")] 1964 &aml::Method::new( 1965 "_STA".into(), 1966 0, 1967 false, 1968 // Call into CSTA method which will interrogate device 1969 vec![&aml::Return::new(&aml::MethodCall::new( 1970 "CSTA".into(), 1971 vec![&self.cpu_id], 1972 ))], 1973 ), 1974 &aml::Method::new( 1975 "_PXM".into(), 1976 0, 1977 false, 1978 vec![&aml::Return::new(&self.proximity_domain)], 1979 ), 1980 // The Linux kernel expects every CPU device to have a _MAT entry 1981 // containing the LAPIC for this processor with the enabled bit set 1982 // even it if is disabled in the MADT (non-boot CPU) 1983 #[cfg(target_arch = "x86_64")] 1984 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 1985 // Trigger CPU ejection 1986 #[cfg(target_arch = "x86_64")] 1987 &aml::Method::new( 1988 "_EJ0".into(), 1989 1, 1990 false, 1991 // Call into CEJ0 method which will actually eject device 1992 vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])], 1993 ), 1994 ], 1995 ) 1996 .to_aml_bytes(sink); 1997 } else { 1998 aml::Device::new( 1999 format!("C{:03X}", self.cpu_id).as_str().into(), 2000 vec![ 2001 &aml::Name::new("_HID".into(), &"ACPI0007"), 2002 &aml::Name::new("_UID".into(), &self.cpu_id), 2003 #[cfg(target_arch = "x86_64")] 2004 &aml::Method::new( 2005 "_STA".into(), 2006 0, 2007 false, 2008 // Mark CPU present see CSTA implementation 2009 vec![&aml::Return::new(&0xfu8)], 2010 ), 2011 &aml::Method::new( 2012 "_PXM".into(), 2013 0, 2014 false, 2015 vec![&aml::Return::new(&self.proximity_domain)], 2016 ), 2017 // The Linux kernel expects every CPU device to have a _MAT entry 2018 // containing the LAPIC for this processor with the enabled bit set 2019 // even it if is disabled in the MADT (non-boot CPU) 2020 #[cfg(target_arch = "x86_64")] 2021 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 2022 ], 2023 ) 2024 .to_aml_bytes(sink); 2025 } 2026 } 2027 } 2028 2029 struct CpuNotify { 2030 cpu_id: u8, 2031 } 2032 2033 impl Aml for CpuNotify { 2034 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2035 let object = aml::Path::new(&format!("C{:03X}", self.cpu_id)); 2036 aml::If::new( 2037 &aml::Equal::new(&aml::Arg(0), &self.cpu_id), 2038 vec![&aml::Notify::new(&object, &aml::Arg(1))], 2039 ) 2040 .to_aml_bytes(sink) 2041 } 2042 } 2043 2044 struct CpuMethods { 2045 max_vcpus: u8, 2046 dynamic: bool, 2047 } 2048 2049 impl Aml for CpuMethods { 2050 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2051 if self.dynamic { 2052 // CPU status method 2053 aml::Method::new( 2054 "CSTA".into(), 2055 1, 2056 true, 2057 vec![ 2058 // Take lock defined above 2059 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2060 // Write CPU number (in first argument) to I/O port via field 2061 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 2062 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2063 // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning) 2064 &aml::If::new( 2065 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE), 2066 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 2067 ), 2068 // Release lock 2069 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2070 // Return 0 or 0xf 2071 &aml::Return::new(&aml::Local(0)), 2072 ], 2073 ) 2074 .to_aml_bytes(sink); 2075 2076 let mut cpu_notifies = Vec::new(); 2077 for cpu_id in 0..self.max_vcpus { 2078 cpu_notifies.push(CpuNotify { cpu_id }); 2079 } 2080 2081 let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new(); 2082 for cpu_id in 
0..self.max_vcpus { 2083 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 2084 } 2085 2086 aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink); 2087 2088 aml::Method::new( 2089 "CEJ0".into(), 2090 1, 2091 true, 2092 vec![ 2093 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2094 // Write CPU number (in first argument) to I/O port via field 2095 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 2096 // Set CEJ0 bit 2097 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 2098 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2099 ], 2100 ) 2101 .to_aml_bytes(sink); 2102 2103 aml::Method::new( 2104 "CSCN".into(), 2105 0, 2106 true, 2107 vec![ 2108 // Take lock defined above 2109 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2110 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2111 &aml::While::new( 2112 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 2113 vec![ 2114 // Write CPU number (in first argument) to I/O port via field 2115 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 2116 // Check if CINS bit is set 2117 &aml::If::new( 2118 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 2119 // Notify device if it is 2120 vec![ 2121 &aml::MethodCall::new( 2122 "CTFY".into(), 2123 vec![&aml::Local(0), &aml::ONE], 2124 ), 2125 // Reset CINS bit 2126 &aml::Store::new( 2127 &aml::Path::new("\\_SB_.PRES.CINS"), 2128 &aml::ONE, 2129 ), 2130 ], 2131 ), 2132 // Check if CRMV bit is set 2133 &aml::If::new( 2134 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 2135 // Notify device if it is (with the eject constant 0x3) 2136 vec![ 2137 &aml::MethodCall::new( 2138 "CTFY".into(), 2139 vec![&aml::Local(0), &3u8], 2140 ), 2141 // Reset CRMV bit 2142 &aml::Store::new( 2143 &aml::Path::new("\\_SB_.PRES.CRMV"), 2144 &aml::ONE, 2145 ), 2146 ], 2147 ), 2148 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 2149 ], 2150 ), 2151 // Release lock 2152 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2153 ], 2154 ) 2155 .to_aml_bytes(sink) 2156 } else { 2157 aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink) 2158 } 2159 } 2160 } 2161 2162 impl Aml for CpuManager { 2163 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2164 #[cfg(target_arch = "x86_64")] 2165 if let Some(acpi_address) = self.acpi_address { 2166 // CPU hotplug controller 2167 aml::Device::new( 2168 "_SB_.PRES".into(), 2169 vec![ 2170 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 2171 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 2172 // Mutex to protect concurrent access as we write to choose CPU and then read back status 2173 &aml::Mutex::new("CPLK".into(), 0), 2174 &aml::Name::new( 2175 "_CRS".into(), 2176 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2177 aml::AddressSpaceCacheable::NotCacheable, 2178 true, 2179 acpi_address.0, 2180 acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 2181 None, 2182 )]), 2183 ), 2184 // OpRegion and Fields map MMIO range into individual field values 2185 &aml::OpRegion::new( 2186 "PRST".into(), 2187 aml::OpRegionSpace::SystemMemory, 2188 &(acpi_address.0 as usize), 2189 &CPU_MANAGER_ACPI_SIZE, 2190 ), 2191 &aml::Field::new( 2192 "PRST".into(), 2193 aml::FieldAccessType::Byte, 2194 aml::FieldLockRule::NoLock, 2195 aml::FieldUpdateRule::WriteAsZeroes, 2196 vec![ 2197 aml::FieldEntry::Reserved(32), 2198 aml::FieldEntry::Named(*b"CPEN", 1), 2199 aml::FieldEntry::Named(*b"CINS", 1), 2200 
                            aml::FieldEntry::Named(*b"CRMV", 1),
                            aml::FieldEntry::Named(*b"CEJ0", 1),
                            aml::FieldEntry::Reserved(4),
                            aml::FieldEntry::Named(*b"CCMD", 8),
                        ],
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"CSEL", 32),
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CDAT", 32),
                        ],
                    ),
                ],
            )
            .to_aml_bytes(sink);
        }

        // CPU devices
        let hid = aml::Name::new("_HID".into(), &"ACPI0010");
        let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
        // Bundle methods together under a common object
        let methods = CpuMethods {
            max_vcpus: self.config.max_vcpus,
            dynamic: self.dynamic,
        };
        let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];

        #[cfg(target_arch = "x86_64")]
        let topology = self.get_vcpu_topology();
        let mut cpu_devices = Vec::new();
        for cpu_id in 0..self.config.max_vcpus {
            let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
            let cpu_device = Cpu {
                cpu_id,
                proximity_domain,
                dynamic: self.dynamic,
                #[cfg(target_arch = "x86_64")]
                topology,
            };

            cpu_devices.push(cpu_device);
        }

        for cpu_device in cpu_devices.iter() {
            cpu_data_inner.push(cpu_device);
        }

        aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
    }
}

impl Pausable for CpuManager {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        // Tell the vCPUs to pause themselves the next time they exit
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl(), allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        for vcpu in self.vcpus.iter() {
            let mut vcpu = vcpu.lock().unwrap();
            vcpu.pause()?;
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            if !self.config.kvm_hyperv {
                vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
                    MigratableError::Pause(anyhow!(
                        "Could not notify guest it has been paused {:?}",
                        e
                    ))
                })?;
            }
        }

        // The vCPU thread will change its paused state before parking; wait here for each
        // activated vCPU to change its state, to ensure it has actually parked.
        for state in self.vcpu_states.iter() {
            if state.active() {
                while !state.paused.load(Ordering::SeqCst) {
                    // To avoid a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }

        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        for vcpu in self.vcpus.iter() {
            vcpu.lock().unwrap().resume()?;
        }

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the vCPU threads.
        // Once unparked, the next thing they will do is check the pause boolean.
        // Since it will be set to false, they will exit their pause loop
        // and go back to running the guest.
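        // For reference, the vCPU thread side of this handshake looks roughly like
        // the following (illustrative sketch only, not the exact run-loop code):
        //
        //     while vcpus_pause_signalled.load(Ordering::SeqCst) {
        //         vcpu_state.paused.store(true, Ordering::SeqCst);
        //         thread::park();
        //     }
        //
        // Clearing `paused` and unparking below therefore lets every thread re-check
        // the (now cleared) pause flag and leave its pause loop.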
2308 for state in self.vcpu_states.iter() { 2309 state.paused.store(false, Ordering::SeqCst); 2310 state.unpark_thread(); 2311 } 2312 Ok(()) 2313 } 2314 } 2315 2316 impl Snapshottable for CpuManager { 2317 fn id(&self) -> String { 2318 CPU_MANAGER_SNAPSHOT_ID.to_string() 2319 } 2320 2321 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2322 let mut cpu_manager_snapshot = Snapshot::default(); 2323 2324 // The CpuManager snapshot is a collection of all vCPUs snapshots. 2325 for vcpu in &self.vcpus { 2326 let mut vcpu = vcpu.lock().unwrap(); 2327 cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?); 2328 } 2329 2330 Ok(cpu_manager_snapshot) 2331 } 2332 } 2333 2334 impl Transportable for CpuManager {} 2335 impl Migratable for CpuManager {} 2336 2337 #[cfg(feature = "guest_debug")] 2338 impl Debuggable for CpuManager { 2339 #[cfg(feature = "kvm")] 2340 fn set_guest_debug( 2341 &self, 2342 cpu_id: usize, 2343 addrs: &[GuestAddress], 2344 singlestep: bool, 2345 ) -> std::result::Result<(), DebuggableError> { 2346 self.vcpus[cpu_id] 2347 .lock() 2348 .unwrap() 2349 .vcpu 2350 .set_guest_debug(addrs, singlestep) 2351 .map_err(DebuggableError::SetDebug) 2352 } 2353 2354 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2355 Ok(()) 2356 } 2357 2358 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2359 Ok(()) 2360 } 2361 2362 #[cfg(target_arch = "x86_64")] 2363 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2364 // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15 2365 let gregs = self 2366 .get_regs(cpu_id as u8) 2367 .map_err(DebuggableError::ReadRegs)?; 2368 let regs = [ 2369 gregs.get_rax(), 2370 gregs.get_rbx(), 2371 gregs.get_rcx(), 2372 gregs.get_rdx(), 2373 gregs.get_rsi(), 2374 gregs.get_rdi(), 2375 gregs.get_rbp(), 2376 gregs.get_rsp(), 2377 gregs.get_r8(), 2378 gregs.get_r9(), 2379 gregs.get_r10(), 2380 gregs.get_r11(), 2381 gregs.get_r12(), 2382 gregs.get_r13(), 2383 gregs.get_r14(), 2384 gregs.get_r15(), 2385 ]; 2386 2387 // GDB exposes 32-bit eflags instead of 64-bit rflags. 
        // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
        let eflags = gregs.get_rflags() as u32;
        let rip = gregs.get_rip();

        // Segment registers: CS, SS, DS, ES, FS, GS
        let sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let segments = X86SegmentRegs {
            cs: sregs.cs.selector as u32,
            ss: sregs.ss.selector as u32,
            ds: sregs.ds.selector as u32,
            es: sregs.es.selector as u32,
            fs: sregs.fs.selector as u32,
            gs: sregs.gs.selector as u32,
        };

        // TODO: Add other registers

        Ok(CoreRegs {
            regs,
            eflags,
            rip,
            segments,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "aarch64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        Ok(CoreRegs {
            x: gregs.get_regs(),
            sp: gregs.get_sp(),
            pc: gregs.get_pc(),
            ..Default::default()
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let orig_gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let mut gregs = self.create_standard_regs(cpu_id as u8);
        gregs.set_rax(regs.regs[0]);
        gregs.set_rbx(regs.regs[1]);
        gregs.set_rcx(regs.regs[2]);
        gregs.set_rdx(regs.regs[3]);
        gregs.set_rsi(regs.regs[4]);
        gregs.set_rdi(regs.regs[5]);
        gregs.set_rbp(regs.regs[6]);
        gregs.set_rsp(regs.regs[7]);
        gregs.set_r8(regs.regs[8]);
        gregs.set_r9(regs.regs[9]);
        gregs.set_r10(regs.regs[10]);
        gregs.set_r11(regs.regs[11]);
        gregs.set_r12(regs.regs[12]);
        gregs.set_r13(regs.regs[13]);
        gregs.set_r14(regs.regs[14]);
        gregs.set_r15(regs.regs[15]);
        gregs.set_rip(regs.rip);
        // Update the lower 32 bits of rflags.
        gregs.set_rflags((orig_gregs.get_rflags() & !(u32::MAX as u64)) | (regs.eflags as u64));

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // Since GDB cares only about the selectors, we call get_sregs() first.
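        // Only the visible selector values are overwritten with what GDB sent; the
        // hidden parts of each segment register (base, limit, attributes) that were
        // read back above are written back unchanged.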
2464 let mut sregs = self 2465 .get_sregs(cpu_id as u8) 2466 .map_err(DebuggableError::ReadRegs)?; 2467 sregs.cs.selector = regs.segments.cs as u16; 2468 sregs.ss.selector = regs.segments.ss as u16; 2469 sregs.ds.selector = regs.segments.ds as u16; 2470 sregs.es.selector = regs.segments.es as u16; 2471 sregs.fs.selector = regs.segments.fs as u16; 2472 sregs.gs.selector = regs.segments.gs as u16; 2473 2474 self.set_sregs(cpu_id as u8, &sregs) 2475 .map_err(DebuggableError::WriteRegs)?; 2476 2477 // TODO: Add other registers 2478 2479 Ok(()) 2480 } 2481 2482 #[cfg(target_arch = "aarch64")] 2483 fn write_regs( 2484 &self, 2485 cpu_id: usize, 2486 regs: &CoreRegs, 2487 ) -> std::result::Result<(), DebuggableError> { 2488 let mut gregs = self 2489 .get_regs(cpu_id as u8) 2490 .map_err(DebuggableError::ReadRegs)?; 2491 2492 gregs.set_regs(regs.x); 2493 gregs.set_sp(regs.sp); 2494 gregs.set_pc(regs.pc); 2495 2496 self.set_regs(cpu_id as u8, &gregs) 2497 .map_err(DebuggableError::WriteRegs)?; 2498 2499 Ok(()) 2500 } 2501 2502 fn read_mem( 2503 &self, 2504 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2505 cpu_id: usize, 2506 vaddr: GuestAddress, 2507 len: usize, 2508 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2509 let mut buf = vec![0; len]; 2510 let mut total_read = 0_u64; 2511 2512 while total_read < len as u64 { 2513 let gaddr = vaddr.0 + total_read; 2514 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2515 Ok(paddr) => paddr, 2516 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 2517 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2518 }; 2519 let psize = arch::PAGE_SIZE as u64; 2520 let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1))); 2521 guest_memory 2522 .memory() 2523 .read( 2524 &mut buf[total_read as usize..total_read as usize + read_len as usize], 2525 GuestAddress(paddr), 2526 ) 2527 .map_err(DebuggableError::ReadMem)?; 2528 total_read += read_len; 2529 } 2530 Ok(buf) 2531 } 2532 2533 fn write_mem( 2534 &self, 2535 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2536 cpu_id: usize, 2537 vaddr: &GuestAddress, 2538 data: &[u8], 2539 ) -> std::result::Result<(), DebuggableError> { 2540 let mut total_written = 0_u64; 2541 2542 while total_written < data.len() as u64 { 2543 let gaddr = vaddr.0 + total_written; 2544 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2545 Ok(paddr) => paddr, 2546 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 
2547 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2548 }; 2549 let psize = arch::PAGE_SIZE as u64; 2550 let write_len = std::cmp::min( 2551 data.len() as u64 - total_written, 2552 psize - (paddr & (psize - 1)), 2553 ); 2554 guest_memory 2555 .memory() 2556 .write( 2557 &data[total_written as usize..total_written as usize + write_len as usize], 2558 GuestAddress(paddr), 2559 ) 2560 .map_err(DebuggableError::WriteMem)?; 2561 total_written += write_len; 2562 } 2563 Ok(()) 2564 } 2565 2566 fn active_vcpus(&self) -> usize { 2567 self.present_vcpus() as usize 2568 } 2569 } 2570 2571 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2572 impl Elf64Writable for CpuManager {} 2573 2574 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2575 impl CpuElf64Writable for CpuManager { 2576 fn cpu_write_elf64_note( 2577 &mut self, 2578 dump_state: &DumpState, 2579 ) -> std::result::Result<(), GuestDebuggableError> { 2580 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2581 for vcpu in &self.vcpus { 2582 let note_size = self.get_note_size(NoteDescType::Elf, 1); 2583 let mut pos: usize = 0; 2584 let mut buf = vec![0; note_size as usize]; 2585 let descsz = size_of::<X86_64ElfPrStatus>(); 2586 let vcpu_id = vcpu.lock().unwrap().id; 2587 2588 let note = Elf64_Nhdr { 2589 n_namesz: COREDUMP_NAME_SIZE, 2590 n_descsz: descsz as u32, 2591 n_type: NT_PRSTATUS, 2592 }; 2593 2594 let bytes: &[u8] = note.as_slice(); 2595 buf.splice(0.., bytes.to_vec()); 2596 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2597 buf.resize(pos + 4, 0); 2598 buf.splice(pos.., "CORE".to_string().into_bytes()); 2599 2600 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2601 buf.resize(pos + 32 + 4, 0); 2602 let pid = vcpu_id as u64; 2603 let bytes: &[u8] = pid.as_slice(); 2604 buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */ 2605 2606 pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>(); 2607 2608 let orig_rax: u64 = 0; 2609 let gregs = self.vcpus[usize::from(vcpu_id)] 2610 .lock() 2611 .unwrap() 2612 .vcpu 2613 .get_regs() 2614 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2615 2616 let regs1 = [ 2617 gregs.get_r15(), 2618 gregs.get_r14(), 2619 gregs.get_r13(), 2620 gregs.get_r12(), 2621 gregs.get_rbp(), 2622 gregs.get_rbx(), 2623 gregs.get_r11(), 2624 gregs.get_r10(), 2625 ]; 2626 let regs2 = [ 2627 gregs.get_r9(), 2628 gregs.get_r8(), 2629 gregs.get_rax(), 2630 gregs.get_rcx(), 2631 gregs.get_rdx(), 2632 gregs.get_rsi(), 2633 gregs.get_rdi(), 2634 orig_rax, 2635 ]; 2636 2637 let sregs = self.vcpus[usize::from(vcpu_id)] 2638 .lock() 2639 .unwrap() 2640 .vcpu 2641 .get_sregs() 2642 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2643 2644 debug!( 2645 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}", 2646 gregs.get_rip(), 2647 gregs.get_rsp(), 2648 sregs.gs.base, 2649 sregs.cs.selector, 2650 sregs.ss.selector, 2651 sregs.ds.selector, 2652 ); 2653 2654 let regs = X86_64UserRegs { 2655 regs1, 2656 regs2, 2657 rip: gregs.get_rip(), 2658 cs: sregs.cs.selector as u64, 2659 eflags: gregs.get_rflags(), 2660 rsp: gregs.get_rsp(), 2661 ss: sregs.ss.selector as u64, 2662 fs_base: sregs.fs.base, 2663 gs_base: sregs.gs.base, 2664 ds: sregs.ds.selector as u64, 2665 es: sregs.es.selector as u64, 2666 fs: sregs.fs.selector as u64, 2667 gs: sregs.gs.selector as u64, 2668 }; 2669 2670 // let bytes: &[u8] = unsafe { any_as_u8_slice(®s) }; 2671 let bytes: &[u8] = regs.as_slice(); 2672 buf.resize(note_size as usize, 0); 2673 
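            // At this point `pos` has been advanced past the note header, the "CORE"
            // name and the prstatus fields that precede the register block, so the
            // serialized user registers land in the right place inside the descriptor;
            // the resize that follows the splice pads the buffer back out to the full
            // note size.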
buf.splice(pos.., bytes.to_vec()); 2674 buf.resize(note_size as usize, 0); 2675 2676 coredump_file 2677 .write(&buf) 2678 .map_err(GuestDebuggableError::CoredumpFile)?; 2679 } 2680 2681 Ok(()) 2682 } 2683 2684 fn cpu_write_vmm_note( 2685 &mut self, 2686 dump_state: &DumpState, 2687 ) -> std::result::Result<(), GuestDebuggableError> { 2688 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2689 for vcpu in &self.vcpus { 2690 let note_size = self.get_note_size(NoteDescType::Vmm, 1); 2691 let mut pos: usize = 0; 2692 let mut buf = vec![0; note_size as usize]; 2693 let descsz = size_of::<DumpCpusState>(); 2694 let vcpu_id = vcpu.lock().unwrap().id; 2695 2696 let note = Elf64_Nhdr { 2697 n_namesz: COREDUMP_NAME_SIZE, 2698 n_descsz: descsz as u32, 2699 n_type: 0, 2700 }; 2701 2702 let bytes: &[u8] = note.as_slice(); 2703 buf.splice(0.., bytes.to_vec()); 2704 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2705 2706 buf.resize(pos + 4, 0); 2707 buf.splice(pos.., "QEMU".to_string().into_bytes()); 2708 2709 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2710 2711 let gregs = self.vcpus[usize::from(vcpu_id)] 2712 .lock() 2713 .unwrap() 2714 .vcpu 2715 .get_regs() 2716 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2717 2718 let regs1 = [ 2719 gregs.get_rax(), 2720 gregs.get_rbx(), 2721 gregs.get_rcx(), 2722 gregs.get_rdx(), 2723 gregs.get_rsi(), 2724 gregs.get_rdi(), 2725 gregs.get_rsp(), 2726 gregs.get_rbp(), 2727 ]; 2728 2729 let regs2 = [ 2730 gregs.get_r8(), 2731 gregs.get_r9(), 2732 gregs.get_r10(), 2733 gregs.get_r11(), 2734 gregs.get_r12(), 2735 gregs.get_r13(), 2736 gregs.get_r14(), 2737 gregs.get_r15(), 2738 ]; 2739 2740 let sregs = self.vcpus[usize::from(vcpu_id)] 2741 .lock() 2742 .unwrap() 2743 .vcpu 2744 .get_sregs() 2745 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2746 2747 let mut msrs = vec![MsrEntry { 2748 index: msr_index::MSR_KERNEL_GS_BASE, 2749 ..Default::default() 2750 }]; 2751 2752 self.vcpus[vcpu_id as usize] 2753 .lock() 2754 .unwrap() 2755 .vcpu 2756 .get_msrs(&mut msrs) 2757 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?; 2758 let kernel_gs_base = msrs[0].data; 2759 2760 let cs = CpuSegment::new(sregs.cs); 2761 let ds = CpuSegment::new(sregs.ds); 2762 let es = CpuSegment::new(sregs.es); 2763 let fs = CpuSegment::new(sregs.fs); 2764 let gs = CpuSegment::new(sregs.gs); 2765 let ss = CpuSegment::new(sregs.ss); 2766 let ldt = CpuSegment::new(sregs.ldt); 2767 let tr = CpuSegment::new(sregs.tr); 2768 let gdt = CpuSegment::new_from_table(sregs.gdt); 2769 let idt = CpuSegment::new_from_table(sregs.idt); 2770 let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4]; 2771 let regs = DumpCpusState { 2772 version: 1, 2773 size: size_of::<DumpCpusState>() as u32, 2774 regs1, 2775 regs2, 2776 rip: gregs.get_rip(), 2777 rflags: gregs.get_rflags(), 2778 cs, 2779 ds, 2780 es, 2781 fs, 2782 gs, 2783 ss, 2784 ldt, 2785 tr, 2786 gdt, 2787 idt, 2788 cr, 2789 kernel_gs_base, 2790 }; 2791 2792 let bytes: &[u8] = regs.as_slice(); 2793 buf.resize(note_size as usize, 0); 2794 buf.splice(pos.., bytes.to_vec()); 2795 buf.resize(note_size as usize, 0); 2796 2797 coredump_file 2798 .write(&buf) 2799 .map_err(GuestDebuggableError::CoredumpFile)?; 2800 } 2801 2802 Ok(()) 2803 } 2804 } 2805 2806 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2807 #[cfg(test)] 2808 mod tests { 2809 use arch::layout::{BOOT_STACK_POINTER, ZERO_PAGE_START}; 2810 use arch::x86_64::interrupts::*; 2811 use 
arch::x86_64::regs::*;
    use hypervisor::arch::x86::{FpuState, LapicState};
    use hypervisor::StandardRegisters;
    use linux_loader::loader::bootparam::setup_header;

    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        hv.check_required_extensions().unwrap();
        // Calling get_lapic will fail if there is no irqchip beforehand.
        vm.create_irq_chip().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
        let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
        let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should
        // remove it at all.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }];

        // get_msrs returns the number of MSRs that it succeeded in reading. We only want to
        // read one in this test case.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we did setup_msrs. We need to assert that the
        // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
        // expect.
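        // (Index 9 in the assertion below corresponds to that tenth entry.)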
2888 let entry_vec = vcpu.boot_msr_entries(); 2889 assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]); 2890 } 2891 2892 #[test] 2893 fn test_setup_regs_for_pvh() { 2894 let hv = hypervisor::new().unwrap(); 2895 let vm = hv.create_vm().expect("new VM fd creation failed"); 2896 let vcpu = vm.create_vcpu(0, None).unwrap(); 2897 2898 let mut expected_regs: StandardRegisters = vcpu.create_standard_regs(); 2899 expected_regs.set_rflags(0x0000000000000002u64); 2900 expected_regs.set_rbx(arch::layout::PVH_INFO_START.0); 2901 expected_regs.set_rip(1); 2902 2903 setup_regs( 2904 &vcpu, 2905 arch::EntryPoint { 2906 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()), 2907 setup_header: None, 2908 }, 2909 ) 2910 .unwrap(); 2911 2912 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2913 assert_eq!(actual_regs, expected_regs); 2914 } 2915 2916 #[test] 2917 fn test_setup_regs_for_bzimage() { 2918 let hv = hypervisor::new().unwrap(); 2919 let vm = hv.create_vm().expect("new VM fd creation failed"); 2920 let vcpu = vm.create_vcpu(0, None).unwrap(); 2921 2922 let mut expected_regs: StandardRegisters = vcpu.create_standard_regs(); 2923 expected_regs.set_rflags(0x0000000000000002u64); 2924 expected_regs.set_rip(1); 2925 expected_regs.set_rsp(BOOT_STACK_POINTER.0); 2926 expected_regs.set_rsi(ZERO_PAGE_START.0); 2927 2928 setup_regs( 2929 &vcpu, 2930 arch::EntryPoint { 2931 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()), 2932 setup_header: Some(setup_header { 2933 ..Default::default() 2934 }), 2935 }, 2936 ) 2937 .unwrap(); 2938 2939 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2940 assert_eq!(actual_regs, expected_regs); 2941 } 2942 } 2943 2944 #[cfg(target_arch = "aarch64")] 2945 #[cfg(test)] 2946 mod tests { 2947 use std::mem; 2948 2949 use arch::aarch64::regs; 2950 use arch::layout; 2951 use hypervisor::kvm::aarch64::is_system_register; 2952 use hypervisor::kvm::kvm_bindings::{ 2953 kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_ARM_CORE, 2954 KVM_REG_SIZE_U64, 2955 }; 2956 use hypervisor::{arm64_core_reg_id, offset_of}; 2957 2958 #[test] 2959 fn test_setup_regs() { 2960 let hv = hypervisor::new().unwrap(); 2961 let vm = hv.create_vm().unwrap(); 2962 let vcpu = vm.create_vcpu(0, None).unwrap(); 2963 2964 // Must fail when vcpu is not initialized yet. 2965 vcpu.setup_regs(0, 0x0, layout::FDT_START.0).unwrap_err(); 2966 2967 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2968 vm.get_preferred_target(&mut kvi).unwrap(); 2969 vcpu.vcpu_init(&kvi).unwrap(); 2970 2971 vcpu.setup_regs(0, 0x0, layout::FDT_START.0).unwrap(); 2972 } 2973 2974 #[test] 2975 fn test_read_mpidr() { 2976 let hv = hypervisor::new().unwrap(); 2977 let vm = hv.create_vm().unwrap(); 2978 let vcpu = vm.create_vcpu(0, None).unwrap(); 2979 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2980 vm.get_preferred_target(&mut kvi).unwrap(); 2981 2982 // Must fail when vcpu is not initialized yet. 
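        // (Reading a system register requires the vCPU to have been initialised with
        // vcpu_init() first, hence the expected error here.)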
        vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap_err();

        vcpu.vcpu_init(&kvi).unwrap();
        assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
    }

    #[test]
    fn test_is_system_register() {
        let offset = offset_of!(user_pt_regs, pc);
        let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
        assert!(!is_system_register(regid));
        let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
        assert!(is_system_register(regid));
    }

    #[test]
    fn test_save_restore_core_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        assert_eq!(
            format!("{}", vcpu.get_regs().unwrap_err()),
            "Failed to get aarch64 core register: Exec format error (os error 8)"
        );

        let mut state = vcpu.create_standard_regs();
        assert_eq!(
            format!("{}", vcpu.set_regs(&state).unwrap_err()),
            "Failed to set aarch64 core register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        state = vcpu.get_regs().unwrap();
        assert_eq!(state.get_pstate(), 0x3C5);

        vcpu.set_regs(&state).unwrap();
    }

    #[test]
    fn test_get_set_mpstate() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        let state = vcpu.get_mp_state().unwrap();
        vcpu.set_mp_state(state).unwrap();
    }
}
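
// A minimal sketch added for illustration: it exercises the bit-extraction macros
// defined near the top of this file and is gated the same way they are. The module
// name and the chosen example values are ours, not part of the original test suite.
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
#[cfg(test)]
mod bit_extract_tests {
    #[test]
    fn test_extract_bits_64() {
        // 0xF0 has bits [7:4] set; extracting 4 bits starting at offset 4 yields 0xF.
        assert_eq!(extract_bits_64!(0xF0u64, 4, 4), 0xF);
        // Keeping only the 8 least significant bits of 0x1234 yields 0x34.
        assert_eq!(extract_bits_64_without_offset!(0x1234u64, 8), 0x34);
    }
}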