1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use std::collections::BTreeMap; 15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 16 use std::io::Write; 17 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 18 use std::mem::size_of; 19 use std::os::unix::thread::JoinHandleExt; 20 use std::sync::atomic::{AtomicBool, Ordering}; 21 use std::sync::{Arc, Barrier, Mutex}; 22 use std::{cmp, io, result, thread}; 23 24 use acpi_tables::sdt::Sdt; 25 use acpi_tables::{aml, Aml}; 26 use anyhow::anyhow; 27 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 28 use arch::aarch64::regs; 29 #[cfg(target_arch = "x86_64")] 30 use arch::x86_64::get_x2apic_id; 31 use arch::{EntryPoint, NumaNodes}; 32 #[cfg(target_arch = "aarch64")] 33 use devices::gic::Gic; 34 use devices::interrupt_controller::InterruptController; 35 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 36 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 37 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 38 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs}; 39 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 40 use hypervisor::arch::x86::msr_index; 41 #[cfg(target_arch = "x86_64")] 42 use hypervisor::arch::x86::CpuIdEntry; 43 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 44 use hypervisor::arch::x86::MsrEntry; 45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 46 use hypervisor::arch::x86::SpecialRegisters; 47 #[cfg(target_arch = "aarch64")] 48 use hypervisor::kvm::kvm_bindings; 49 #[cfg(all(target_arch = "aarch64", feature = "kvm"))] 50 use hypervisor::kvm::kvm_ioctls::Cap; 51 #[cfg(feature = "tdx")] 52 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus}; 53 #[cfg(target_arch = "x86_64")] 54 use hypervisor::CpuVendor; 55 #[cfg(feature = "kvm")] 56 use hypervisor::HypervisorType; 57 #[cfg(feature = "guest_debug")] 58 use hypervisor::StandardRegisters; 59 use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps}; 60 use libc::{c_void, siginfo_t}; 61 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 62 use linux_loader::elf::Elf64_Nhdr; 63 use seccompiler::{apply_filter, SeccompAction}; 64 use thiserror::Error; 65 use tracer::trace_scoped; 66 use vm_device::BusDevice; 67 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 68 use vm_memory::ByteValued; 69 #[cfg(feature = "guest_debug")] 70 use vm_memory::{Bytes, GuestAddressSpace}; 71 use vm_memory::{GuestAddress, GuestMemoryAtomic}; 72 use vm_migration::{ 73 snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, 74 Transportable, 75 }; 76 use vmm_sys_util::eventfd::EventFd; 77 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN}; 78 use zerocopy::AsBytes; 79 80 use crate::config::CpusConfig; 81 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 82 use crate::coredump::{ 83 CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable, 84 GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE, 85 NT_PRSTATUS, 86 }; 87 
#[cfg(feature = "guest_debug")] 88 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError}; 89 #[cfg(target_arch = "x86_64")] 90 use crate::memory_manager::MemoryManager; 91 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 92 #[cfg(target_arch = "x86_64")] 93 use crate::vm::physical_bits; 94 use crate::{GuestMemoryMmap, CPU_MANAGER_SNAPSHOT_ID}; 95 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 96 /// Extract the specified bits of a 64-bit integer. 97 /// For example, to extrace 2 bits from offset 1 (zero based) of `6u64`, 98 /// following expression should return 3 (`0b11`): 99 /// `extract_bits_64!(0b0000_0110u64, 1, 2)` 100 /// 101 macro_rules! extract_bits_64 { 102 ($value: tt, $offset: tt, $length: tt) => { 103 ($value >> $offset) & (!0u64 >> (64 - $length)) 104 }; 105 } 106 107 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 108 macro_rules! extract_bits_64_without_offset { 109 ($value: tt, $length: tt) => { 110 $value & (!0u64 >> (64 - $length)) 111 }; 112 } 113 114 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc; 115 116 #[derive(Debug, Error)] 117 pub enum Error { 118 #[error("Error creating vCPU: {0}")] 119 VcpuCreate(#[source] anyhow::Error), 120 121 #[error("Error running bCPU: {0}")] 122 VcpuRun(#[source] anyhow::Error), 123 124 #[error("Error spawning vCPU thread: {0}")] 125 VcpuSpawn(#[source] io::Error), 126 127 #[error("Error generating common CPUID: {0}")] 128 CommonCpuId(#[source] arch::Error), 129 130 #[error("Error configuring vCPU: {0}")] 131 VcpuConfiguration(#[source] arch::Error), 132 133 #[error("Still pending removed vcpu")] 134 VcpuPendingRemovedVcpu, 135 136 #[cfg(target_arch = "aarch64")] 137 #[error("Error fetching preferred target: {0}")] 138 VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError), 139 140 #[cfg(target_arch = "aarch64")] 141 #[error("Error initialising vCPU: {0}")] 142 VcpuArmInit(#[source] hypervisor::HypervisorCpuError), 143 144 #[cfg(target_arch = "aarch64")] 145 #[error("Error finalising vCPU: {0}")] 146 VcpuArmFinalize(#[source] hypervisor::HypervisorCpuError), 147 148 #[error("Failed to join on vCPU threads: {0:?}")] 149 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 150 151 #[error("Error adding CpuManager to MMIO bus: {0}")] 152 BusError(#[source] vm_device::BusError), 153 154 #[error("Requested vCPUs exceed maximum")] 155 DesiredVCpuCountExceedsMax, 156 157 #[error("Cannot create seccomp filter: {0}")] 158 CreateSeccompFilter(#[source] seccompiler::Error), 159 160 #[error("Cannot apply seccomp filter: {0}")] 161 ApplySeccompFilter(#[source] seccompiler::Error), 162 163 #[error("Error starting vCPU after restore: {0}")] 164 StartRestoreVcpu(#[source] anyhow::Error), 165 166 #[error("Unexpected VmExit")] 167 UnexpectedVmExit, 168 169 #[error("Failed to allocate MMIO address for CpuManager")] 170 AllocateMmmioAddress, 171 172 #[cfg(feature = "tdx")] 173 #[error("Error initializing TDX: {0}")] 174 InitializeTdx(#[source] hypervisor::HypervisorCpuError), 175 176 #[cfg(target_arch = "aarch64")] 177 #[error("Error initializing PMU: {0}")] 178 InitPmu(#[source] hypervisor::HypervisorCpuError), 179 180 #[cfg(feature = "guest_debug")] 181 #[error("Error during CPU debug: {0}")] 182 CpuDebug(#[source] hypervisor::HypervisorCpuError), 183 184 #[cfg(feature = "guest_debug")] 185 #[error("Error translating virtual address: {0}")] 186 TranslateVirtualAddress(#[source] anyhow::Error), 187 188 #[cfg(target_arch = "x86_64")] 189 #[error("Error setting up AMX: {0}")] 190 
AmxEnable(#[source] anyhow::Error), 191 192 #[error("Maximum number of vCPUs exceeds host limit")] 193 MaximumVcpusExceeded, 194 195 #[cfg(feature = "sev_snp")] 196 #[error("Failed to set sev control register: {0}")] 197 SetSevControlRegister(#[source] hypervisor::HypervisorCpuError), 198 199 #[cfg(target_arch = "x86_64")] 200 #[error("Failed to inject NMI")] 201 NmiError(hypervisor::HypervisorCpuError), 202 } 203 pub type Result<T> = result::Result<T, Error>; 204 205 #[cfg(target_arch = "x86_64")] 206 #[allow(dead_code)] 207 #[repr(packed)] 208 #[derive(AsBytes)] 209 struct LocalX2Apic { 210 pub r#type: u8, 211 pub length: u8, 212 pub _reserved: u16, 213 pub apic_id: u32, 214 pub flags: u32, 215 pub processor_id: u32, 216 } 217 218 #[allow(dead_code)] 219 #[repr(packed)] 220 #[derive(Default, AsBytes)] 221 struct Ioapic { 222 pub r#type: u8, 223 pub length: u8, 224 pub ioapic_id: u8, 225 _reserved: u8, 226 pub apic_address: u32, 227 pub gsi_base: u32, 228 } 229 230 #[cfg(target_arch = "aarch64")] 231 #[allow(dead_code)] 232 #[repr(packed)] 233 #[derive(AsBytes)] 234 struct GicC { 235 pub r#type: u8, 236 pub length: u8, 237 pub reserved0: u16, 238 pub cpu_interface_number: u32, 239 pub uid: u32, 240 pub flags: u32, 241 pub parking_version: u32, 242 pub performance_interrupt: u32, 243 pub parked_address: u64, 244 pub base_address: u64, 245 pub gicv_base_address: u64, 246 pub gich_base_address: u64, 247 pub vgic_interrupt: u32, 248 pub gicr_base_address: u64, 249 pub mpidr: u64, 250 pub proc_power_effi_class: u8, 251 pub reserved1: u8, 252 pub spe_overflow_interrupt: u16, 253 } 254 255 #[cfg(target_arch = "aarch64")] 256 #[allow(dead_code)] 257 #[repr(packed)] 258 #[derive(AsBytes)] 259 struct GicD { 260 pub r#type: u8, 261 pub length: u8, 262 pub reserved0: u16, 263 pub gic_id: u32, 264 pub base_address: u64, 265 pub global_irq_base: u32, 266 pub version: u8, 267 pub reserved1: [u8; 3], 268 } 269 270 #[cfg(target_arch = "aarch64")] 271 #[allow(dead_code)] 272 #[repr(packed)] 273 #[derive(AsBytes)] 274 struct GicR { 275 pub r#type: u8, 276 pub length: u8, 277 pub reserved: u16, 278 pub base_address: u64, 279 pub range_length: u32, 280 } 281 282 #[cfg(target_arch = "aarch64")] 283 #[allow(dead_code)] 284 #[repr(packed)] 285 #[derive(AsBytes)] 286 struct GicIts { 287 pub r#type: u8, 288 pub length: u8, 289 pub reserved0: u16, 290 pub translation_id: u32, 291 pub base_address: u64, 292 pub reserved1: u32, 293 } 294 295 #[cfg(target_arch = "aarch64")] 296 #[allow(dead_code)] 297 #[repr(packed)] 298 #[derive(AsBytes)] 299 struct ProcessorHierarchyNode { 300 pub r#type: u8, 301 pub length: u8, 302 pub reserved: u16, 303 pub flags: u32, 304 pub parent: u32, 305 pub acpi_processor_id: u32, 306 pub num_private_resources: u32, 307 } 308 309 #[allow(dead_code)] 310 #[repr(packed)] 311 #[derive(Default, AsBytes)] 312 struct InterruptSourceOverride { 313 pub r#type: u8, 314 pub length: u8, 315 pub bus: u8, 316 pub source: u8, 317 pub gsi: u32, 318 pub flags: u16, 319 } 320 321 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 322 macro_rules! round_up { 323 ($n:expr,$d:expr) => { 324 (($n / ($d + 1)) + 1) * $d 325 }; 326 } 327 328 /// A wrapper around creating and using a kvm-based VCPU. 329 pub struct Vcpu { 330 // The hypervisor abstracted CPU. 
331 vcpu: Arc<dyn hypervisor::Vcpu>, 332 id: u8, 333 #[cfg(target_arch = "aarch64")] 334 mpidr: u64, 335 saved_state: Option<CpuState>, 336 #[cfg(target_arch = "x86_64")] 337 vendor: CpuVendor, 338 } 339 340 impl Vcpu { 341 /// Constructs a new VCPU for `vm`. 342 /// 343 /// # Arguments 344 /// 345 /// * `id` - Represents the CPU number between [0, max vcpus). 346 /// * `vm` - The virtual machine this vcpu will get attached to. 347 /// * `vm_ops` - Optional object for exit handling. 348 /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0) 349 pub fn new( 350 id: u8, 351 apic_id: u8, 352 vm: &Arc<dyn hypervisor::Vm>, 353 vm_ops: Option<Arc<dyn VmOps>>, 354 #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor, 355 ) -> Result<Self> { 356 let vcpu = vm 357 .create_vcpu(apic_id, vm_ops) 358 .map_err(|e| Error::VcpuCreate(e.into()))?; 359 // Initially the cpuid per vCPU is the one supported by this VM. 360 Ok(Vcpu { 361 vcpu, 362 id, 363 #[cfg(target_arch = "aarch64")] 364 mpidr: 0, 365 saved_state: None, 366 #[cfg(target_arch = "x86_64")] 367 vendor: cpu_vendor, 368 }) 369 } 370 371 /// Configures a vcpu and should be called once per vcpu when created. 372 /// 373 /// # Arguments 374 /// 375 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 376 /// * `guest_memory` - Guest memory. 377 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 378 pub fn configure( 379 &mut self, 380 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 381 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 382 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 383 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 384 #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>, 385 ) -> Result<()> { 386 #[cfg(target_arch = "aarch64")] 387 { 388 self.init(vm)?; 389 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup) 390 .map_err(Error::VcpuConfiguration)?; 391 } 392 info!("Configuring vCPU: cpu_id = {}", self.id); 393 #[cfg(target_arch = "x86_64")] 394 arch::configure_vcpu( 395 &self.vcpu, 396 self.id, 397 boot_setup, 398 cpuid, 399 kvm_hyperv, 400 self.vendor, 401 topology, 402 ) 403 .map_err(Error::VcpuConfiguration)?; 404 405 Ok(()) 406 } 407 408 /// Gets the MPIDR register value. 409 #[cfg(target_arch = "aarch64")] 410 pub fn get_mpidr(&self) -> u64 { 411 self.mpidr 412 } 413 414 /// Gets the saved vCPU state. 415 #[cfg(target_arch = "aarch64")] 416 pub fn get_saved_state(&self) -> Option<CpuState> { 417 self.saved_state.clone() 418 } 419 420 /// Initializes an aarch64 specific vcpu for booting Linux. 421 #[cfg(target_arch = "aarch64")] 422 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 423 use std::arch::is_aarch64_feature_detected; 424 let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); 425 #[allow(clippy::nonminimal_bool)] 426 let sve_supported = 427 is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2"); 428 // This reads back the kernel's preferred target type. 429 vm.get_preferred_target(&mut kvi) 430 .map_err(Error::VcpuArmPreferredTarget)?; 431 // We already checked that the capability is supported. 
432 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; 433 if vm 434 .as_any() 435 .downcast_ref::<hypervisor::kvm::KvmVm>() 436 .unwrap() 437 .check_extension(Cap::ArmPmuV3) 438 { 439 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3; 440 } 441 442 if sve_supported 443 && vm 444 .as_any() 445 .downcast_ref::<hypervisor::kvm::KvmVm>() 446 .unwrap() 447 .check_extension(Cap::ArmSve) 448 { 449 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_SVE; 450 } 451 452 // Non-boot cpus are powered off initially. 453 if self.id > 0 { 454 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; 455 } 456 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)?; 457 if sve_supported { 458 self.vcpu 459 .vcpu_finalize(kvm_bindings::KVM_ARM_VCPU_SVE as i32) 460 .map_err(Error::VcpuArmFinalize)?; 461 } 462 Ok(()) 463 } 464 465 /// Runs the VCPU until it exits, returning the reason. 466 /// 467 /// Note that the state of the VCPU and associated VM must be setup first for this to do 468 /// anything useful. 469 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 470 self.vcpu.run() 471 } 472 473 #[cfg(feature = "sev_snp")] 474 pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> { 475 self.vcpu 476 .set_sev_control_register(vmsa_pfn) 477 .map_err(Error::SetSevControlRegister) 478 } 479 } 480 481 impl Pausable for Vcpu {} 482 impl Snapshottable for Vcpu { 483 fn id(&self) -> String { 484 self.id.to_string() 485 } 486 487 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 488 let saved_state = self 489 .vcpu 490 .state() 491 .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?; 492 493 self.saved_state = Some(saved_state.clone()); 494 495 Ok(Snapshot::from_data(SnapshotData::new_from_state( 496 &saved_state, 497 )?)) 498 } 499 } 500 501 pub struct CpuManager { 502 config: CpusConfig, 503 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 504 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 505 #[cfg(target_arch = "x86_64")] 506 cpuid: Vec<CpuIdEntry>, 507 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 508 vm: Arc<dyn hypervisor::Vm>, 509 vcpus_kill_signalled: Arc<AtomicBool>, 510 vcpus_pause_signalled: Arc<AtomicBool>, 511 vcpus_kick_signalled: Arc<AtomicBool>, 512 exit_evt: EventFd, 513 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 514 reset_evt: EventFd, 515 #[cfg(feature = "guest_debug")] 516 vm_debug_evt: EventFd, 517 vcpu_states: Vec<VcpuState>, 518 selected_cpu: u8, 519 vcpus: Vec<Arc<Mutex<Vcpu>>>, 520 seccomp_action: SeccompAction, 521 vm_ops: Arc<dyn VmOps>, 522 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 523 acpi_address: Option<GuestAddress>, 524 proximity_domain_per_cpu: BTreeMap<u8, u32>, 525 affinity: BTreeMap<u8, Vec<usize>>, 526 dynamic: bool, 527 hypervisor: Arc<dyn hypervisor::Hypervisor>, 528 #[cfg(feature = "sev_snp")] 529 sev_snp_enabled: bool, 530 } 531 532 const CPU_ENABLE_FLAG: usize = 0; 533 const CPU_INSERTING_FLAG: usize = 1; 534 const CPU_REMOVING_FLAG: usize = 2; 535 const CPU_EJECT_FLAG: usize = 3; 536 537 const CPU_STATUS_OFFSET: u64 = 4; 538 const CPU_SELECTION_OFFSET: u64 = 0; 539 540 impl BusDevice for CpuManager { 541 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 542 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
543 data.fill(0); 544 545 match offset { 546 CPU_SELECTION_OFFSET => { 547 data[0] = self.selected_cpu; 548 } 549 CPU_STATUS_OFFSET => { 550 if self.selected_cpu < self.max_vcpus() { 551 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 552 if state.active() { 553 data[0] |= 1 << CPU_ENABLE_FLAG; 554 } 555 if state.inserting { 556 data[0] |= 1 << CPU_INSERTING_FLAG; 557 } 558 if state.removing { 559 data[0] |= 1 << CPU_REMOVING_FLAG; 560 } 561 } else { 562 warn!("Out of range vCPU id: {}", self.selected_cpu); 563 } 564 } 565 _ => { 566 warn!( 567 "Unexpected offset for accessing CPU manager device: {:#}", 568 offset 569 ); 570 } 571 } 572 } 573 574 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 575 match offset { 576 CPU_SELECTION_OFFSET => { 577 self.selected_cpu = data[0]; 578 } 579 CPU_STATUS_OFFSET => { 580 if self.selected_cpu < self.max_vcpus() { 581 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 582 // The ACPI code writes back a 1 to acknowledge the insertion 583 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 584 && state.inserting 585 { 586 state.inserting = false; 587 } 588 // Ditto for removal 589 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 590 && state.removing 591 { 592 state.removing = false; 593 } 594 // Trigger removal of vCPU 595 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 596 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 597 error!("Error removing vCPU: {:?}", e); 598 } 599 } 600 } else { 601 warn!("Out of range vCPU id: {}", self.selected_cpu); 602 } 603 } 604 _ => { 605 warn!( 606 "Unexpected offset for accessing CPU manager device: {:#}", 607 offset 608 ); 609 } 610 } 611 None 612 } 613 } 614 615 #[derive(Default)] 616 struct VcpuState { 617 inserting: bool, 618 removing: bool, 619 pending_removal: Arc<AtomicBool>, 620 handle: Option<thread::JoinHandle<()>>, 621 kill: Arc<AtomicBool>, 622 vcpu_run_interrupted: Arc<AtomicBool>, 623 paused: Arc<AtomicBool>, 624 } 625 626 impl VcpuState { 627 fn active(&self) -> bool { 628 self.handle.is_some() 629 } 630 631 fn signal_thread(&self) { 632 if let Some(handle) = self.handle.as_ref() { 633 loop { 634 // SAFETY: FFI call with correct arguments 635 unsafe { 636 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 637 } 638 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 639 break; 640 } else { 641 // This is more effective than thread::yield_now() at 642 // avoiding a priority inversion with the vCPU thread 643 thread::sleep(std::time::Duration::from_millis(1)); 644 } 645 } 646 } 647 } 648 649 fn join_thread(&mut self) -> Result<()> { 650 if let Some(handle) = self.handle.take() { 651 handle.join().map_err(Error::ThreadCleanup)? 
652 } 653 654 Ok(()) 655 } 656 657 fn unpark_thread(&self) { 658 if let Some(handle) = self.handle.as_ref() { 659 handle.thread().unpark() 660 } 661 } 662 } 663 664 impl CpuManager { 665 #[allow(unused_variables)] 666 #[allow(clippy::too_many_arguments)] 667 pub fn new( 668 config: &CpusConfig, 669 vm: Arc<dyn hypervisor::Vm>, 670 exit_evt: EventFd, 671 reset_evt: EventFd, 672 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 673 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 674 seccomp_action: SeccompAction, 675 vm_ops: Arc<dyn VmOps>, 676 #[cfg(feature = "tdx")] tdx_enabled: bool, 677 numa_nodes: &NumaNodes, 678 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 679 ) -> Result<Arc<Mutex<CpuManager>>> { 680 if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() { 681 return Err(Error::MaximumVcpusExceeded); 682 } 683 684 let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); 685 vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); 686 let hypervisor_type = hypervisor.hypervisor_type(); 687 #[cfg(target_arch = "x86_64")] 688 let cpu_vendor = hypervisor.get_cpu_vendor(); 689 690 #[cfg(target_arch = "x86_64")] 691 if config.features.amx { 692 const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; 693 const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; 694 const XFEATURE_XTILEDATA: usize = 18; 695 const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; 696 697 // SAFETY: the syscall is only modifying kernel internal 698 // data structures that the kernel is itself expected to safeguard. 699 let amx_tile = unsafe { 700 libc::syscall( 701 libc::SYS_arch_prctl, 702 ARCH_REQ_XCOMP_GUEST_PERM, 703 XFEATURE_XTILEDATA, 704 ) 705 }; 706 707 if amx_tile != 0 { 708 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 709 } else { 710 let mask: usize = 0; 711 // SAFETY: the mask being modified (not marked mutable as it is 712 // modified in unsafe only which is permitted) isn't in use elsewhere. 
713 let result = unsafe { 714 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 715 }; 716 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 717 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 718 } 719 } 720 } 721 722 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 723 let mut cpu_list = Vec::new(); 724 for (proximity_domain, numa_node) in numa_nodes.iter() { 725 for cpu in numa_node.cpus.iter() { 726 cpu_list.push((*cpu, *proximity_domain)) 727 } 728 } 729 cpu_list 730 } 731 .into_iter() 732 .collect(); 733 734 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 735 cpu_affinity 736 .iter() 737 .map(|a| (a.vcpu, a.host_cpus.clone())) 738 .collect() 739 } else { 740 BTreeMap::new() 741 }; 742 743 #[cfg(feature = "tdx")] 744 let dynamic = !tdx_enabled; 745 #[cfg(not(feature = "tdx"))] 746 let dynamic = true; 747 748 Ok(Arc::new(Mutex::new(CpuManager { 749 config: config.clone(), 750 interrupt_controller: None, 751 #[cfg(target_arch = "x86_64")] 752 cpuid: Vec::new(), 753 vm, 754 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 755 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 756 vcpus_kick_signalled: Arc::new(AtomicBool::new(false)), 757 vcpu_states, 758 exit_evt, 759 reset_evt, 760 #[cfg(feature = "guest_debug")] 761 vm_debug_evt, 762 selected_cpu: 0, 763 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 764 seccomp_action, 765 vm_ops, 766 acpi_address: None, 767 proximity_domain_per_cpu, 768 affinity, 769 dynamic, 770 hypervisor: hypervisor.clone(), 771 #[cfg(feature = "sev_snp")] 772 sev_snp_enabled, 773 }))) 774 } 775 776 #[cfg(target_arch = "x86_64")] 777 pub fn populate_cpuid( 778 &mut self, 779 memory_manager: &Arc<Mutex<MemoryManager>>, 780 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 781 #[cfg(feature = "tdx")] tdx: bool, 782 ) -> Result<()> { 783 let sgx_epc_sections = memory_manager 784 .lock() 785 .unwrap() 786 .sgx_epc_region() 787 .as_ref() 788 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 789 790 self.cpuid = { 791 let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits); 792 arch::generate_common_cpuid( 793 hypervisor, 794 &arch::CpuidConfig { 795 sgx_epc_sections, 796 phys_bits, 797 kvm_hyperv: self.config.kvm_hyperv, 798 #[cfg(feature = "tdx")] 799 tdx, 800 amx: self.config.features.amx, 801 }, 802 ) 803 .map_err(Error::CommonCpuId)? 804 }; 805 806 Ok(()) 807 } 808 809 fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> { 810 info!("Creating vCPU: cpu_id = {}", cpu_id); 811 812 #[cfg(target_arch = "x86_64")] 813 let topology = self.get_vcpu_topology(); 814 #[cfg(target_arch = "x86_64")] 815 let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology); 816 #[cfg(target_arch = "aarch64")] 817 let x2apic_id = cpu_id as u32; 818 819 let mut vcpu = Vcpu::new( 820 cpu_id, 821 x2apic_id as u8, 822 &self.vm, 823 Some(self.vm_ops.clone()), 824 #[cfg(target_arch = "x86_64")] 825 self.hypervisor.get_cpu_vendor(), 826 )?; 827 828 if let Some(snapshot) = snapshot { 829 // AArch64 vCPUs should be initialized after created. 
830 #[cfg(target_arch = "aarch64")] 831 vcpu.init(&self.vm)?; 832 833 let state: CpuState = snapshot.to_state().map_err(|e| { 834 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e)) 835 })?; 836 vcpu.vcpu 837 .set_state(&state) 838 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?; 839 840 vcpu.saved_state = Some(state); 841 } 842 843 let vcpu = Arc::new(Mutex::new(vcpu)); 844 845 // Adding vCPU to the CpuManager's vCPU list. 846 self.vcpus.push(vcpu.clone()); 847 848 Ok(vcpu) 849 } 850 851 pub fn configure_vcpu( 852 &self, 853 vcpu: Arc<Mutex<Vcpu>>, 854 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 855 ) -> Result<()> { 856 let mut vcpu = vcpu.lock().unwrap(); 857 858 #[cfg(feature = "sev_snp")] 859 if self.sev_snp_enabled { 860 if let Some((kernel_entry_point, _)) = boot_setup { 861 vcpu.set_sev_control_register( 862 kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE, 863 )?; 864 } 865 866 // Traditional way to configure vcpu doesn't work for SEV-SNP guests. 867 // All the vCPU configuration for SEV-SNP guest is provided via VMSA. 868 return Ok(()); 869 } 870 871 #[cfg(target_arch = "x86_64")] 872 assert!(!self.cpuid.is_empty()); 873 874 #[cfg(target_arch = "x86_64")] 875 let topology = self.config.topology.clone().map_or_else( 876 || Some((1, self.boot_vcpus(), 1)), 877 |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)), 878 ); 879 #[cfg(target_arch = "x86_64")] 880 vcpu.configure( 881 boot_setup, 882 self.cpuid.clone(), 883 self.config.kvm_hyperv, 884 topology, 885 )?; 886 887 #[cfg(target_arch = "aarch64")] 888 vcpu.configure(&self.vm, boot_setup)?; 889 890 Ok(()) 891 } 892 893 /// Only create new vCPUs if there aren't any inactive ones to reuse 894 fn create_vcpus( 895 &mut self, 896 desired_vcpus: u8, 897 snapshot: Option<Snapshot>, 898 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 899 let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![]; 900 info!( 901 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 902 desired_vcpus, 903 self.config.max_vcpus, 904 self.vcpus.len(), 905 self.present_vcpus() 906 ); 907 908 if desired_vcpus > self.config.max_vcpus { 909 return Err(Error::DesiredVCpuCountExceedsMax); 910 } 911 912 // Only create vCPUs in excess of all the allocated vCPUs. 913 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 914 vcpus.push(self.create_vcpu( 915 cpu_id, 916 // TODO: The special format of the CPU id can be removed once 917 // ready to break live upgrade. 918 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()), 919 )?); 920 } 921 922 Ok(vcpus) 923 } 924 925 #[cfg(target_arch = "aarch64")] 926 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 927 for cpu in self.vcpus.iter() { 928 let cpu = cpu.lock().unwrap(); 929 // Check if PMU attr is available, if not, log the information. 
930 if cpu.vcpu.has_pmu_support() { 931 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 932 } else { 933 debug!( 934 "PMU attribute is not supported in vCPU{}, skip PMU init!", 935 cpu.id 936 ); 937 return Ok(false); 938 } 939 } 940 941 Ok(true) 942 } 943 944 pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> { 945 self.vcpus.clone() 946 } 947 948 fn start_vcpu( 949 &mut self, 950 vcpu: Arc<Mutex<Vcpu>>, 951 vcpu_id: u8, 952 vcpu_thread_barrier: Arc<Barrier>, 953 inserting: bool, 954 ) -> Result<()> { 955 let reset_evt = self.reset_evt.try_clone().unwrap(); 956 let exit_evt = self.exit_evt.try_clone().unwrap(); 957 #[cfg(feature = "kvm")] 958 let hypervisor_type = self.hypervisor.hypervisor_type(); 959 #[cfg(feature = "guest_debug")] 960 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 961 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 962 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 963 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 964 let vcpu_kick_signalled = self.vcpus_kick_signalled.clone(); 965 966 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 967 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 968 .vcpu_run_interrupted 969 .clone(); 970 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 971 let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone(); 972 973 // Prepare the CPU set the current vCPU is expected to run onto. 974 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 975 // SAFETY: all zeros is a valid pattern 976 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 977 // SAFETY: FFI call, trivially safe 978 unsafe { libc::CPU_ZERO(&mut cpuset) }; 979 for host_cpu in host_cpus { 980 // SAFETY: FFI call, trivially safe 981 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) }; 982 } 983 cpuset 984 }); 985 986 // Retrieve seccomp filter for vcpu thread 987 let vcpu_seccomp_filter = get_seccomp_filter( 988 &self.seccomp_action, 989 Thread::Vcpu, 990 self.hypervisor.hypervisor_type(), 991 ) 992 .map_err(Error::CreateSeccompFilter)?; 993 994 #[cfg(target_arch = "x86_64")] 995 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 996 997 info!("Starting vCPU: cpu_id = {}", vcpu_id); 998 999 let handle = Some( 1000 thread::Builder::new() 1001 .name(format!("vcpu{vcpu_id}")) 1002 .spawn(move || { 1003 // Schedule the thread to run on the expected CPU set 1004 if let Some(cpuset) = cpuset.as_ref() { 1005 // SAFETY: FFI call with correct arguments 1006 let ret = unsafe { 1007 libc::sched_setaffinity( 1008 0, 1009 std::mem::size_of::<libc::cpu_set_t>(), 1010 cpuset as *const libc::cpu_set_t, 1011 ) 1012 }; 1013 1014 if ret != 0 { 1015 error!( 1016 "Failed scheduling the vCPU {} on the expected CPU set: {}", 1017 vcpu_id, 1018 io::Error::last_os_error() 1019 ); 1020 return; 1021 } 1022 } 1023 1024 // Apply seccomp filter for vcpu thread. 1025 if !vcpu_seccomp_filter.is_empty() { 1026 if let Err(e) = 1027 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 1028 { 1029 error!("Error applying seccomp filter: {:?}", e); 1030 return; 1031 } 1032 } 1033 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 1034 // This uses an async signal safe handler to kill the vcpu handles. 1035 register_signal_handler(SIGRTMIN(), handle_signal) 1036 .expect("Failed to register vcpu signal handler"); 1037 // Block until all CPUs are ready. 
1038 vcpu_thread_barrier.wait(); 1039 1040 std::panic::catch_unwind(move || { 1041 loop { 1042 // If we are being told to pause, we park the thread 1043 // until the pause boolean is toggled. 1044 // The resume operation is responsible for toggling 1045 // the boolean and unpark the thread. 1046 // We enter a loop because park() could spuriously 1047 // return. We will then park() again unless the 1048 // pause boolean has been toggled. 1049 1050 // Need to use Ordering::SeqCst as we have multiple 1051 // loads and stores to different atomics and we need 1052 // to see them in a consistent order in all threads 1053 1054 if vcpu_pause_signalled.load(Ordering::SeqCst) { 1055 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are 1056 // completed by returning to KVM_RUN. From the kernel docs: 1057 // 1058 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, 1059 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding 1060 // operations are complete (and guest state is consistent) only after userspace 1061 // has re-entered the kernel with KVM_RUN. The kernel side will first finish 1062 // incomplete operations and then check for pending signals. 1063 // The pending state of the operation is not preserved in state which is 1064 // visible to userspace, thus userspace should ensure that the operation is 1065 // completed before performing a live migration. Userspace can re-enter the 1066 // guest with an unmasked signal pending or with the immediate_exit field set 1067 // to complete pending operations without allowing any further instructions 1068 // to be executed. 1069 1070 #[cfg(feature = "kvm")] 1071 if matches!(hypervisor_type, HypervisorType::Kvm) { 1072 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); 1073 if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { 1074 error!("Unexpected VM exit on \"immediate_exit\" run"); 1075 break; 1076 } 1077 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); 1078 } 1079 1080 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1081 1082 vcpu_paused.store(true, Ordering::SeqCst); 1083 while vcpu_pause_signalled.load(Ordering::SeqCst) { 1084 thread::park(); 1085 } 1086 vcpu_run_interrupted.store(false, Ordering::SeqCst); 1087 } 1088 1089 if vcpu_kick_signalled.load(Ordering::SeqCst) { 1090 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1091 #[cfg(target_arch = "x86_64")] 1092 match vcpu.lock().as_ref().unwrap().vcpu.nmi() { 1093 Ok(()) => {}, 1094 Err(e) => { 1095 error!("Error when inject nmi {}", e); 1096 break; 1097 } 1098 } 1099 } 1100 1101 // We've been told to terminate 1102 if vcpu_kill_signalled.load(Ordering::SeqCst) 1103 || vcpu_kill.load(Ordering::SeqCst) 1104 { 1105 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1106 break; 1107 } 1108 1109 #[cfg(feature = "tdx")] 1110 let mut vcpu = vcpu.lock().unwrap(); 1111 #[cfg(not(feature = "tdx"))] 1112 let vcpu = vcpu.lock().unwrap(); 1113 // vcpu.run() returns false on a triple-fault so trigger a reset 1114 match vcpu.run() { 1115 Ok(run) => match run { 1116 #[cfg(feature = "kvm")] 1117 VmExit::Debug => { 1118 info!("VmExit::Debug"); 1119 #[cfg(feature = "guest_debug")] 1120 { 1121 vcpu_pause_signalled.store(true, Ordering::SeqCst); 1122 let raw_tid = get_raw_tid(vcpu_id as usize); 1123 vm_debug_evt.write(raw_tid as u64).unwrap(); 1124 } 1125 } 1126 #[cfg(target_arch = "x86_64")] 1127 VmExit::IoapicEoi(vector) => { 1128 if let Some(interrupt_controller) = 1129 &interrupt_controller_clone 
1130 { 1131 interrupt_controller 1132 .lock() 1133 .unwrap() 1134 .end_of_interrupt(vector); 1135 } 1136 } 1137 VmExit::Ignore => {} 1138 VmExit::Hyperv => {} 1139 VmExit::Reset => { 1140 info!("VmExit::Reset"); 1141 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1142 reset_evt.write(1).unwrap(); 1143 break; 1144 } 1145 VmExit::Shutdown => { 1146 info!("VmExit::Shutdown"); 1147 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1148 exit_evt.write(1).unwrap(); 1149 break; 1150 } 1151 #[cfg(feature = "tdx")] 1152 VmExit::Tdx => { 1153 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1154 match vcpu.get_tdx_exit_details() { 1155 Ok(details) => match details { 1156 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1157 TdxExitDetails::SetupEventNotifyInterrupt => { 1158 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1159 } 1160 }, 1161 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1162 } 1163 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1164 } else { 1165 // We should never reach this code as 1166 // this means the design from the code 1167 // is wrong. 1168 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1169 } 1170 } 1171 }, 1172 1173 Err(e) => { 1174 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1175 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1176 exit_evt.write(1).unwrap(); 1177 break; 1178 } 1179 } 1180 1181 // We've been told to terminate 1182 if vcpu_kill_signalled.load(Ordering::SeqCst) 1183 || vcpu_kill.load(Ordering::SeqCst) 1184 { 1185 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1186 break; 1187 } 1188 } 1189 }) 1190 .or_else(|_| { 1191 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1192 error!("vCPU thread panicked"); 1193 panic_exit_evt.write(1) 1194 }) 1195 .ok(); 1196 }) 1197 .map_err(Error::VcpuSpawn)?, 1198 ); 1199 1200 // On hot plug calls into this function entry_point is None. It is for 1201 // those hotplug CPU additions that we need to set the inserting flag. 1202 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1203 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1204 1205 Ok(()) 1206 } 1207 1208 /// Start up as many vCPUs threads as needed to reach `desired_vcpus` 1209 fn activate_vcpus( 1210 &mut self, 1211 desired_vcpus: u8, 1212 inserting: bool, 1213 paused: Option<bool>, 1214 ) -> Result<()> { 1215 if desired_vcpus > self.config.max_vcpus { 1216 return Err(Error::DesiredVCpuCountExceedsMax); 1217 } 1218 1219 let vcpu_thread_barrier = Arc::new(Barrier::new( 1220 (desired_vcpus - self.present_vcpus() + 1) as usize, 1221 )); 1222 1223 if let Some(paused) = paused { 1224 self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); 1225 } 1226 1227 info!( 1228 "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", 1229 desired_vcpus, 1230 self.vcpus.len(), 1231 self.present_vcpus(), 1232 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1233 ); 1234 1235 // This reuses any inactive vCPUs as well as any that were newly created 1236 for vcpu_id in self.present_vcpus()..desired_vcpus { 1237 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1238 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1239 } 1240 1241 // Unblock all CPU threads. 
1242 vcpu_thread_barrier.wait(); 1243 Ok(()) 1244 } 1245 1246 fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) { 1247 // Mark vCPUs for removal, actual removal happens on ejection 1248 for cpu_id in desired_vcpus..self.present_vcpus() { 1249 self.vcpu_states[usize::from(cpu_id)].removing = true; 1250 self.vcpu_states[usize::from(cpu_id)] 1251 .pending_removal 1252 .store(true, Ordering::SeqCst); 1253 } 1254 } 1255 1256 pub fn check_pending_removed_vcpu(&mut self) -> bool { 1257 for state in self.vcpu_states.iter() { 1258 if state.active() && state.pending_removal.load(Ordering::SeqCst) { 1259 return true; 1260 } 1261 } 1262 false 1263 } 1264 1265 fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> { 1266 info!("Removing vCPU: cpu_id = {}", cpu_id); 1267 let state = &mut self.vcpu_states[usize::from(cpu_id)]; 1268 state.kill.store(true, Ordering::SeqCst); 1269 state.signal_thread(); 1270 state.join_thread()?; 1271 state.handle = None; 1272 1273 // Once the thread has exited, clear the "kill" so that it can reused 1274 state.kill.store(false, Ordering::SeqCst); 1275 state.pending_removal.store(false, Ordering::SeqCst); 1276 1277 Ok(()) 1278 } 1279 1280 pub fn create_boot_vcpus( 1281 &mut self, 1282 snapshot: Option<Snapshot>, 1283 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 1284 trace_scoped!("create_boot_vcpus"); 1285 1286 self.create_vcpus(self.boot_vcpus(), snapshot) 1287 } 1288 1289 // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running. 1290 pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> { 1291 self.activate_vcpus(self.boot_vcpus(), false, Some(paused)) 1292 } 1293 1294 pub fn start_restored_vcpus(&mut self) -> Result<()> { 1295 self.activate_vcpus(self.vcpus.len() as u8, false, Some(true)) 1296 .map_err(|e| { 1297 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) 1298 })?; 1299 1300 Ok(()) 1301 } 1302 1303 pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> { 1304 if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal { 1305 return Ok(false); 1306 } 1307 1308 if !self.dynamic { 1309 return Ok(false); 1310 } 1311 1312 if self.check_pending_removed_vcpu() { 1313 return Err(Error::VcpuPendingRemovedVcpu); 1314 } 1315 1316 match desired_vcpus.cmp(&self.present_vcpus()) { 1317 cmp::Ordering::Greater => { 1318 let vcpus = self.create_vcpus(desired_vcpus, None)?; 1319 for vcpu in vcpus { 1320 self.configure_vcpu(vcpu, None)? 1321 } 1322 self.activate_vcpus(desired_vcpus, true, None)?; 1323 Ok(true) 1324 } 1325 cmp::Ordering::Less => { 1326 self.mark_vcpus_for_removal(desired_vcpus); 1327 Ok(true) 1328 } 1329 _ => Ok(false), 1330 } 1331 } 1332 1333 pub fn shutdown(&mut self) -> Result<()> { 1334 // Tell the vCPUs to stop themselves next time they go through the loop 1335 self.vcpus_kill_signalled.store(true, Ordering::SeqCst); 1336 1337 // Toggle the vCPUs pause boolean 1338 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 1339 1340 // Unpark all the VCPU threads. 1341 for state in self.vcpu_states.iter() { 1342 state.unpark_thread(); 1343 } 1344 1345 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 1346 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 1347 // above. 1348 for state in self.vcpu_states.iter() { 1349 state.signal_thread(); 1350 } 1351 1352 // Wait for all the threads to finish. This removes the state from the vector. 1353 for mut state in self.vcpu_states.drain(..) 
{ 1354 state.join_thread()?; 1355 } 1356 1357 Ok(()) 1358 } 1359 1360 #[cfg(feature = "tdx")] 1361 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1362 for vcpu in &self.vcpus { 1363 vcpu.lock() 1364 .unwrap() 1365 .vcpu 1366 .tdx_init(hob_address) 1367 .map_err(Error::InitializeTdx)?; 1368 } 1369 Ok(()) 1370 } 1371 1372 pub fn boot_vcpus(&self) -> u8 { 1373 self.config.boot_vcpus 1374 } 1375 1376 pub fn max_vcpus(&self) -> u8 { 1377 self.config.max_vcpus 1378 } 1379 1380 #[cfg(target_arch = "x86_64")] 1381 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1382 assert!(!self.cpuid.is_empty()); 1383 self.cpuid.clone() 1384 } 1385 1386 fn present_vcpus(&self) -> u8 { 1387 self.vcpu_states 1388 .iter() 1389 .fold(0, |acc, state| acc + state.active() as u8) 1390 } 1391 1392 #[cfg(target_arch = "aarch64")] 1393 pub fn get_mpidrs(&self) -> Vec<u64> { 1394 self.vcpus 1395 .iter() 1396 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1397 .collect() 1398 } 1399 1400 #[cfg(target_arch = "aarch64")] 1401 pub fn get_saved_states(&self) -> Vec<CpuState> { 1402 self.vcpus 1403 .iter() 1404 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1405 .collect() 1406 } 1407 1408 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1409 self.config 1410 .topology 1411 .clone() 1412 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1413 } 1414 1415 pub fn create_madt(&self) -> Sdt { 1416 use crate::acpi; 1417 // This is also checked in the commandline parsing. 1418 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1419 1420 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1421 #[cfg(target_arch = "x86_64")] 1422 { 1423 madt.write(36, arch::layout::APIC_START.0); 1424 1425 for cpu in 0..self.config.max_vcpus { 1426 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology()); 1427 1428 let lapic = LocalX2Apic { 1429 r#type: acpi::ACPI_X2APIC_PROCESSOR, 1430 length: 16, 1431 processor_id: cpu.into(), 1432 apic_id: x2apic_id, 1433 flags: if cpu < self.config.boot_vcpus { 1434 1 << MADT_CPU_ENABLE_FLAG 1435 } else { 1436 0 1437 } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG, 1438 _reserved: 0, 1439 }; 1440 madt.append(lapic); 1441 } 1442 1443 madt.append(Ioapic { 1444 r#type: acpi::ACPI_APIC_IO, 1445 length: 12, 1446 ioapic_id: 0, 1447 apic_address: arch::layout::IOAPIC_START.0 as u32, 1448 gsi_base: 0, 1449 ..Default::default() 1450 }); 1451 1452 madt.append(InterruptSourceOverride { 1453 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1454 length: 10, 1455 bus: 0, 1456 source: 4, 1457 gsi: 4, 1458 flags: 0, 1459 }); 1460 } 1461 1462 #[cfg(target_arch = "aarch64")] 1463 { 1464 /* Notes: 1465 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1466 */ 1467 1468 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 
1469 for cpu in 0..self.config.boot_vcpus { 1470 let vcpu = &self.vcpus[cpu as usize]; 1471 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1472 /* ARMv8 MPIDR format: 1473 Bits [63:40] Must be zero 1474 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1475 Bits [31:24] Must be zero 1476 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1477 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1478 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1479 */ 1480 let mpidr_mask = 0xff_00ff_ffff; 1481 let gicc = GicC { 1482 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1483 length: 80, 1484 reserved0: 0, 1485 cpu_interface_number: cpu as u32, 1486 uid: cpu as u32, 1487 flags: 1, 1488 parking_version: 0, 1489 performance_interrupt: 0, 1490 parked_address: 0, 1491 base_address: 0, 1492 gicv_base_address: 0, 1493 gich_base_address: 0, 1494 vgic_interrupt: 0, 1495 gicr_base_address: 0, 1496 mpidr: mpidr & mpidr_mask, 1497 proc_power_effi_class: 0, 1498 reserved1: 0, 1499 spe_overflow_interrupt: 0, 1500 }; 1501 1502 madt.append(gicc); 1503 } 1504 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1505 1506 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1507 let gicd = GicD { 1508 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1509 length: 24, 1510 reserved0: 0, 1511 gic_id: 0, 1512 base_address: vgic_config.dist_addr, 1513 global_irq_base: 0, 1514 version: 3, 1515 reserved1: [0; 3], 1516 }; 1517 madt.append(gicd); 1518 1519 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1520 let gicr = GicR { 1521 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1522 length: 16, 1523 reserved: 0, 1524 base_address: vgic_config.redists_addr, 1525 range_length: vgic_config.redists_size as u32, 1526 }; 1527 madt.append(gicr); 1528 1529 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 1530 let gicits = GicIts { 1531 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1532 length: 20, 1533 reserved0: 0, 1534 translation_id: 0, 1535 base_address: vgic_config.msi_addr, 1536 reserved1: 0, 1537 }; 1538 madt.append(gicits); 1539 1540 madt.update_checksum(); 1541 } 1542 1543 madt 1544 } 1545 1546 #[cfg(target_arch = "aarch64")] 1547 pub fn create_pptt(&self) -> Sdt { 1548 let pptt_start = 0; 1549 let mut cpus = 0; 1550 let mut uid = 0; 1551 // If topology is not specified, the default setting is: 1552 // 1 package, multiple cores, 1 thread per core 1553 // This is also the behavior when PPTT is missing. 
1554 let (threads_per_core, cores_per_package, packages) = 1555 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1556 1557 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1558 1559 for cluster_idx in 0..packages { 1560 if cpus < self.config.boot_vcpus as usize { 1561 let cluster_offset = pptt.len() - pptt_start; 1562 let cluster_hierarchy_node = ProcessorHierarchyNode { 1563 r#type: 0, 1564 length: 20, 1565 reserved: 0, 1566 flags: 0x2, 1567 parent: 0, 1568 acpi_processor_id: cluster_idx as u32, 1569 num_private_resources: 0, 1570 }; 1571 pptt.append(cluster_hierarchy_node); 1572 1573 for core_idx in 0..cores_per_package { 1574 let core_offset = pptt.len() - pptt_start; 1575 1576 if threads_per_core > 1 { 1577 let core_hierarchy_node = ProcessorHierarchyNode { 1578 r#type: 0, 1579 length: 20, 1580 reserved: 0, 1581 flags: 0x2, 1582 parent: cluster_offset as u32, 1583 acpi_processor_id: core_idx as u32, 1584 num_private_resources: 0, 1585 }; 1586 pptt.append(core_hierarchy_node); 1587 1588 for _thread_idx in 0..threads_per_core { 1589 let thread_hierarchy_node = ProcessorHierarchyNode { 1590 r#type: 0, 1591 length: 20, 1592 reserved: 0, 1593 flags: 0xE, 1594 parent: core_offset as u32, 1595 acpi_processor_id: uid as u32, 1596 num_private_resources: 0, 1597 }; 1598 pptt.append(thread_hierarchy_node); 1599 uid += 1; 1600 } 1601 } else { 1602 let thread_hierarchy_node = ProcessorHierarchyNode { 1603 r#type: 0, 1604 length: 20, 1605 reserved: 0, 1606 flags: 0xA, 1607 parent: cluster_offset as u32, 1608 acpi_processor_id: uid as u32, 1609 num_private_resources: 0, 1610 }; 1611 pptt.append(thread_hierarchy_node); 1612 uid += 1; 1613 } 1614 } 1615 cpus += (cores_per_package * threads_per_core) as usize; 1616 } 1617 } 1618 1619 pptt.update_checksum(); 1620 pptt 1621 } 1622 1623 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1624 fn create_standard_regs(&self, cpu_id: u8) -> StandardRegisters { 1625 self.vcpus[usize::from(cpu_id)] 1626 .lock() 1627 .unwrap() 1628 .vcpu 1629 .create_standard_regs() 1630 } 1631 1632 #[cfg(feature = "guest_debug")] 1633 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1634 self.vcpus[usize::from(cpu_id)] 1635 .lock() 1636 .unwrap() 1637 .vcpu 1638 .get_regs() 1639 .map_err(Error::CpuDebug) 1640 } 1641 1642 #[cfg(feature = "guest_debug")] 1643 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1644 self.vcpus[usize::from(cpu_id)] 1645 .lock() 1646 .unwrap() 1647 .vcpu 1648 .set_regs(regs) 1649 .map_err(Error::CpuDebug) 1650 } 1651 1652 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1653 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1654 self.vcpus[usize::from(cpu_id)] 1655 .lock() 1656 .unwrap() 1657 .vcpu 1658 .get_sregs() 1659 .map_err(Error::CpuDebug) 1660 } 1661 1662 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1663 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1664 self.vcpus[usize::from(cpu_id)] 1665 .lock() 1666 .unwrap() 1667 .vcpu 1668 .set_sregs(sregs) 1669 .map_err(Error::CpuDebug) 1670 } 1671 1672 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1673 fn translate_gva( 1674 &self, 1675 _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1676 cpu_id: u8, 1677 gva: u64, 1678 ) -> Result<u64> { 1679 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1680 .lock() 1681 .unwrap() 1682 .vcpu 1683 .translate_gva(gva, /* flags: unused */ 0) 1684 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 
1685 Ok(gpa) 1686 } 1687 1688 /// 1689 /// On AArch64, `translate_gva` API is not provided by KVM. We implemented 1690 /// it in VMM by walking through translation tables. 1691 /// 1692 /// Address translation is big topic, here we only focus the scenario that 1693 /// happens in VMM while debugging kernel. This `translate_gva` 1694 /// implementation is restricted to: 1695 /// - Exception Level 1 1696 /// - Translate high address range only (kernel space) 1697 /// 1698 /// This implementation supports following Arm-v8a features related to 1699 /// address translation: 1700 /// - FEAT_LPA 1701 /// - FEAT_LVA 1702 /// - FEAT_LPA2 1703 /// 1704 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 1705 fn translate_gva( 1706 &self, 1707 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1708 cpu_id: u8, 1709 gva: u64, 1710 ) -> Result<u64> { 1711 let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)] 1712 .lock() 1713 .unwrap() 1714 .vcpu 1715 .get_sys_reg(regs::TCR_EL1) 1716 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1717 let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)] 1718 .lock() 1719 .unwrap() 1720 .vcpu 1721 .get_sys_reg(regs::TTBR1_EL1) 1722 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1723 let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)] 1724 .lock() 1725 .unwrap() 1726 .vcpu 1727 .get_sys_reg(regs::ID_AA64MMFR0_EL1) 1728 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1729 1730 // Bit 55 of the VA determines the range, high (0xFFFxxx...) 1731 // or low (0x000xxx...). 1732 let high_range = extract_bits_64!(gva, 55, 1); 1733 if high_range == 0 { 1734 info!("VA (0x{:x}) range is not supported!", gva); 1735 return Ok(gva); 1736 } 1737 1738 // High range size offset 1739 let tsz = extract_bits_64!(tcr_el1, 16, 6); 1740 // Granule size 1741 let tg = extract_bits_64!(tcr_el1, 30, 2); 1742 // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2 1743 let ds = extract_bits_64!(tcr_el1, 59, 1); 1744 1745 if tsz == 0 { 1746 info!("VA translation is not ready!"); 1747 return Ok(gva); 1748 } 1749 1750 // VA size is determined by TCR_BL1.T1SZ 1751 let va_size = 64 - tsz; 1752 // Number of bits in VA consumed in each level of translation 1753 let stride = match tg { 1754 3 => 13, // 64KB granule size 1755 1 => 11, // 16KB granule size 1756 _ => 9, // 4KB, default 1757 }; 1758 // Starting level of walking 1759 let mut level = 4 - (va_size - 4) / stride; 1760 1761 // PA or IPA size is determined 1762 let tcr_ips = extract_bits_64!(tcr_el1, 32, 3); 1763 let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4); 1764 // The IPA size in TCR_BL1 and PA Range in ID_AA64MMFR0_EL1 should match. 1765 // To be safe, we use the minimum value if they are different. 1766 let pa_range = std::cmp::min(tcr_ips, pa_range); 1767 // PA size in bits 1768 let pa_size = match pa_range { 1769 0 => 32, 1770 1 => 36, 1771 2 => 40, 1772 3 => 42, 1773 4 => 44, 1774 5 => 48, 1775 6 => 52, 1776 _ => { 1777 return Err(Error::TranslateVirtualAddress(anyhow!(format!( 1778 "PA range not supported {pa_range}" 1779 )))) 1780 } 1781 }; 1782 1783 let indexmask_grainsize = (!0u64) >> (64 - (stride + 3)); 1784 let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level)))); 1785 // If FEAT_LPA2 is present, the translation table descriptor holds 1786 // 50 bits of the table address of next level. 1787 // Otherwise, it is 48 bits. 
1788 let descaddrmask = if ds == 1 { 1789 !0u64 >> (64 - 50) // mask with 50 least significant bits 1790 } else { 1791 !0u64 >> (64 - 48) // mask with 48 least significant bits 1792 }; 1793 let descaddrmask = descaddrmask & !indexmask_grainsize; 1794 1795 // Translation table base address 1796 let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48); 1797 // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table 1798 // address bits [48:51] comes from TTBR1_EL1 bits [2:5]. 1799 if pa_size == 52 { 1800 descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48; 1801 } 1802 1803 // Loop through tables of each level 1804 loop { 1805 // Table offset for current level 1806 let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask; 1807 descaddr |= table_offset; 1808 descaddr &= !7u64; 1809 1810 let mut buf = [0; 8]; 1811 guest_memory 1812 .memory() 1813 .read(&mut buf, GuestAddress(descaddr)) 1814 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1815 let descriptor = u64::from_le_bytes(buf); 1816 1817 descaddr = descriptor & descaddrmask; 1818 // In the case of FEAT_LPA, the next-level translation table address 1819 // bits [48:51] comes from bits [12:15] of the current descriptor. 1820 // For FEAT_LPA2, the next-level translation table address 1821 // bits [50:51] comes from bits [8:9] of the current descriptor, 1822 // bits [48:49] comes from bits [48:49] of the descriptor which was 1823 // handled previously. 1824 if pa_size == 52 { 1825 if ds == 1 { 1826 // FEAT_LPA2 1827 descaddr |= extract_bits_64!(descriptor, 8, 2) << 50; 1828 } else { 1829 // FEAT_LPA 1830 descaddr |= extract_bits_64!(descriptor, 12, 4) << 48; 1831 } 1832 } 1833 1834 if (descriptor & 2) != 0 && (level < 3) { 1835 // This is a table entry. Go down to next level. 
1836 level += 1; 1837 indexmask = indexmask_grainsize; 1838 continue; 1839 } 1840 1841 break; 1842 } 1843 1844 // We have reached either: 1845 // - a page entry at level 3 or 1846 // - a block entry at level 1 or 2 1847 let page_size = 1u64 << ((stride * (4 - level)) + 3); 1848 descaddr &= !(page_size - 1); 1849 descaddr |= gva & (page_size - 1); 1850 1851 Ok(descaddr) 1852 } 1853 1854 pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) { 1855 self.acpi_address = Some(acpi_address); 1856 } 1857 1858 pub(crate) fn set_interrupt_controller( 1859 &mut self, 1860 interrupt_controller: Arc<Mutex<dyn InterruptController>>, 1861 ) { 1862 self.interrupt_controller = Some(interrupt_controller); 1863 } 1864 1865 pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> { 1866 &self.vcpus_kill_signalled 1867 } 1868 1869 #[cfg(feature = "igvm")] 1870 pub(crate) fn get_cpuid_leaf( 1871 &self, 1872 cpu_id: u8, 1873 eax: u32, 1874 ecx: u32, 1875 xfem: u64, 1876 xss: u64, 1877 ) -> Result<[u32; 4]> { 1878 let leaf_info = self.vcpus[usize::from(cpu_id)] 1879 .lock() 1880 .unwrap() 1881 .vcpu 1882 .get_cpuid_values(eax, ecx, xfem, xss) 1883 .unwrap(); 1884 Ok(leaf_info) 1885 } 1886 1887 #[cfg(feature = "sev_snp")] 1888 pub(crate) fn sev_snp_enabled(&self) -> bool { 1889 self.sev_snp_enabled 1890 } 1891 1892 pub(crate) fn nmi(&self) -> Result<()> { 1893 self.vcpus_kick_signalled.store(true, Ordering::SeqCst); 1894 1895 for state in self.vcpu_states.iter() { 1896 state.signal_thread(); 1897 } 1898 1899 self.vcpus_kick_signalled.store(false, Ordering::SeqCst); 1900 1901 Ok(()) 1902 } 1903 } 1904 1905 struct Cpu { 1906 cpu_id: u8, 1907 proximity_domain: u32, 1908 dynamic: bool, 1909 #[cfg(target_arch = "x86_64")] 1910 topology: Option<(u8, u8, u8)>, 1911 } 1912 1913 #[cfg(target_arch = "x86_64")] 1914 const MADT_CPU_ENABLE_FLAG: usize = 0; 1915 1916 #[cfg(target_arch = "x86_64")] 1917 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; 1918 1919 impl Cpu { 1920 #[cfg(target_arch = "x86_64")] 1921 fn generate_mat(&self) -> Vec<u8> { 1922 let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology); 1923 1924 let lapic = LocalX2Apic { 1925 r#type: crate::acpi::ACPI_X2APIC_PROCESSOR, 1926 length: 16, 1927 processor_id: self.cpu_id.into(), 1928 apic_id: x2apic_id, 1929 flags: 1 << MADT_CPU_ENABLE_FLAG, 1930 _reserved: 0, 1931 }; 1932 1933 let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)]; 1934 // SAFETY: mat_data is large enough to hold lapic 1935 unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic }; 1936 1937 mat_data 1938 } 1939 } 1940 1941 impl Aml for Cpu { 1942 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1943 #[cfg(target_arch = "x86_64")] 1944 let mat_data: Vec<u8> = self.generate_mat(); 1945 #[allow(clippy::if_same_then_else)] 1946 if self.dynamic { 1947 aml::Device::new( 1948 format!("C{:03X}", self.cpu_id).as_str().into(), 1949 vec![ 1950 &aml::Name::new("_HID".into(), &"ACPI0007"), 1951 &aml::Name::new("_UID".into(), &self.cpu_id), 1952 // Currently, AArch64 cannot support following fields. 1953 /* 1954 _STA return value: 1955 Bit [0] – Set if the device is present. 1956 Bit [1] – Set if the device is enabled and decoding its resources. 1957 Bit [2] – Set if the device should be shown in the UI. 1958 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1959 Bit [4] – Set if the battery is present. 1960 Bits [31:5] – Reserved (must be cleared). 
1961 */ 1962 #[cfg(target_arch = "x86_64")] 1963 &aml::Method::new( 1964 "_STA".into(), 1965 0, 1966 false, 1967 // Call into CSTA method which will interrogate device 1968 vec![&aml::Return::new(&aml::MethodCall::new( 1969 "CSTA".into(), 1970 vec![&self.cpu_id], 1971 ))], 1972 ), 1973 &aml::Method::new( 1974 "_PXM".into(), 1975 0, 1976 false, 1977 vec![&aml::Return::new(&self.proximity_domain)], 1978 ), 1979 // The Linux kernel expects every CPU device to have a _MAT entry 1980 // containing the LAPIC for this processor with the enabled bit set 1981 // even if it is disabled in the MADT (non-boot CPU) 1982 #[cfg(target_arch = "x86_64")] 1983 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 1984 // Trigger CPU ejection 1985 #[cfg(target_arch = "x86_64")] 1986 &aml::Method::new( 1987 "_EJ0".into(), 1988 1, 1989 false, 1990 // Call into CEJ0 method which will actually eject device 1991 vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])], 1992 ), 1993 ], 1994 ) 1995 .to_aml_bytes(sink); 1996 } else { 1997 aml::Device::new( 1998 format!("C{:03X}", self.cpu_id).as_str().into(), 1999 vec![ 2000 &aml::Name::new("_HID".into(), &"ACPI0007"), 2001 &aml::Name::new("_UID".into(), &self.cpu_id), 2002 #[cfg(target_arch = "x86_64")] 2003 &aml::Method::new( 2004 "_STA".into(), 2005 0, 2006 false, 2007 // Mark the CPU as present; see the CSTA implementation 2008 vec![&aml::Return::new(&0xfu8)], 2009 ), 2010 &aml::Method::new( 2011 "_PXM".into(), 2012 0, 2013 false, 2014 vec![&aml::Return::new(&self.proximity_domain)], 2015 ), 2016 // The Linux kernel expects every CPU device to have a _MAT entry 2017 // containing the LAPIC for this processor with the enabled bit set 2018 // even if it is disabled in the MADT (non-boot CPU) 2019 #[cfg(target_arch = "x86_64")] 2020 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 2021 ], 2022 ) 2023 .to_aml_bytes(sink); 2024 } 2025 } 2026 } 2027 2028 struct CpuNotify { 2029 cpu_id: u8, 2030 } 2031 2032 impl Aml for CpuNotify { 2033 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2034 let object = aml::Path::new(&format!("C{:03X}", self.cpu_id)); 2035 aml::If::new( 2036 &aml::Equal::new(&aml::Arg(0), &self.cpu_id), 2037 vec![&aml::Notify::new(&object, &aml::Arg(1))], 2038 ) 2039 .to_aml_bytes(sink) 2040 } 2041 } 2042 2043 struct CpuMethods { 2044 max_vcpus: u8, 2045 dynamic: bool, 2046 } 2047 2048 impl Aml for CpuMethods { 2049 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2050 if self.dynamic { 2051 // CPU status method 2052 aml::Method::new( 2053 "CSTA".into(), 2054 1, 2055 true, 2056 vec![ 2057 // Take lock defined above 2058 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2059 // Write CPU number (in first argument) to I/O port via field 2060 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 2061 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2062 // Check if the CPEN bit is set; if so, set the local variable to 0xf (see _STA for details of meaning) 2063 &aml::If::new( 2064 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE), 2065 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 2066 ), 2067 // Release lock 2068 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2069 // Return 0 or 0xf 2070 &aml::Return::new(&aml::Local(0)), 2071 ], 2072 ) 2073 .to_aml_bytes(sink); 2074 2075 let mut cpu_notifies = Vec::new(); 2076 for cpu_id in 0..self.max_vcpus { 2077 cpu_notifies.push(CpuNotify { cpu_id }); 2078 } 2079 2080 let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new(); 2081 for cpu_id in
0..self.max_vcpus { 2082 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 2083 } 2084 2085 aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink); 2086 2087 aml::Method::new( 2088 "CEJ0".into(), 2089 1, 2090 true, 2091 vec![ 2092 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2093 // Write CPU number (in first argument) to I/O port via field 2094 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 2095 // Set CEJ0 bit 2096 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 2097 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2098 ], 2099 ) 2100 .to_aml_bytes(sink); 2101 2102 aml::Method::new( 2103 "CSCN".into(), 2104 0, 2105 true, 2106 vec![ 2107 // Take lock defined above 2108 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2109 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2110 &aml::While::new( 2111 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 2112 vec![ 2113 // Write CPU number (in first argument) to I/O port via field 2114 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 2115 // Check if CINS bit is set 2116 &aml::If::new( 2117 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 2118 // Notify device if it is 2119 vec![ 2120 &aml::MethodCall::new( 2121 "CTFY".into(), 2122 vec![&aml::Local(0), &aml::ONE], 2123 ), 2124 // Reset CINS bit 2125 &aml::Store::new( 2126 &aml::Path::new("\\_SB_.PRES.CINS"), 2127 &aml::ONE, 2128 ), 2129 ], 2130 ), 2131 // Check if CRMV bit is set 2132 &aml::If::new( 2133 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 2134 // Notify device if it is (with the eject constant 0x3) 2135 vec![ 2136 &aml::MethodCall::new( 2137 "CTFY".into(), 2138 vec![&aml::Local(0), &3u8], 2139 ), 2140 // Reset CRMV bit 2141 &aml::Store::new( 2142 &aml::Path::new("\\_SB_.PRES.CRMV"), 2143 &aml::ONE, 2144 ), 2145 ], 2146 ), 2147 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 2148 ], 2149 ), 2150 // Release lock 2151 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2152 ], 2153 ) 2154 .to_aml_bytes(sink) 2155 } else { 2156 aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink) 2157 } 2158 } 2159 } 2160 2161 impl Aml for CpuManager { 2162 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2163 #[cfg(target_arch = "x86_64")] 2164 if let Some(acpi_address) = self.acpi_address { 2165 // CPU hotplug controller 2166 aml::Device::new( 2167 "_SB_.PRES".into(), 2168 vec![ 2169 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 2170 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 2171 // Mutex to protect concurrent access as we write to choose CPU and then read back status 2172 &aml::Mutex::new("CPLK".into(), 0), 2173 &aml::Name::new( 2174 "_CRS".into(), 2175 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2176 aml::AddressSpaceCacheable::NotCacheable, 2177 true, 2178 acpi_address.0, 2179 acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 2180 None, 2181 )]), 2182 ), 2183 // OpRegion and Fields map MMIO range into individual field values 2184 &aml::OpRegion::new( 2185 "PRST".into(), 2186 aml::OpRegionSpace::SystemMemory, 2187 &(acpi_address.0 as usize), 2188 &CPU_MANAGER_ACPI_SIZE, 2189 ), 2190 &aml::Field::new( 2191 "PRST".into(), 2192 aml::FieldAccessType::Byte, 2193 aml::FieldLockRule::NoLock, 2194 aml::FieldUpdateRule::WriteAsZeroes, 2195 vec![ 2196 aml::FieldEntry::Reserved(32), 2197 aml::FieldEntry::Named(*b"CPEN", 1), 2198 aml::FieldEntry::Named(*b"CINS", 1), 2199 
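// CINS above is the "inserting" event bit polled by the CSCN scan method;
// CRMV below is the matching "removal requested" bit, and CEJ0 is written by
// the CEJ0 method (invoked from _EJ0) to signal the completed ejection back
// to the VMM through this MMIO region.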
aml::FieldEntry::Named(*b"CRMV", 1), 2200 aml::FieldEntry::Named(*b"CEJ0", 1), 2201 aml::FieldEntry::Reserved(4), 2202 aml::FieldEntry::Named(*b"CCMD", 8), 2203 ], 2204 ), 2205 &aml::Field::new( 2206 "PRST".into(), 2207 aml::FieldAccessType::DWord, 2208 aml::FieldLockRule::NoLock, 2209 aml::FieldUpdateRule::Preserve, 2210 vec![ 2211 aml::FieldEntry::Named(*b"CSEL", 32), 2212 aml::FieldEntry::Reserved(32), 2213 aml::FieldEntry::Named(*b"CDAT", 32), 2214 ], 2215 ), 2216 ], 2217 ) 2218 .to_aml_bytes(sink); 2219 } 2220 2221 // CPU devices 2222 let hid = aml::Name::new("_HID".into(), &"ACPI0010"); 2223 let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05")); 2224 // Bundle methods together under a common object 2225 let methods = CpuMethods { 2226 max_vcpus: self.config.max_vcpus, 2227 dynamic: self.dynamic, 2228 }; 2229 let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods]; 2230 2231 #[cfg(target_arch = "x86_64")] 2232 let topology = self.get_vcpu_topology(); 2233 let mut cpu_devices = Vec::new(); 2234 for cpu_id in 0..self.config.max_vcpus { 2235 let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0); 2236 let cpu_device = Cpu { 2237 cpu_id, 2238 proximity_domain, 2239 dynamic: self.dynamic, 2240 #[cfg(target_arch = "x86_64")] 2241 topology, 2242 }; 2243 2244 cpu_devices.push(cpu_device); 2245 } 2246 2247 for cpu_device in cpu_devices.iter() { 2248 cpu_data_inner.push(cpu_device); 2249 } 2250 2251 aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink) 2252 } 2253 } 2254 2255 impl Pausable for CpuManager { 2256 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2257 // Tell the vCPUs to pause themselves next time they exit 2258 self.vcpus_pause_signalled.store(true, Ordering::SeqCst); 2259 2260 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 2261 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 2262 // above. 2263 for state in self.vcpu_states.iter() { 2264 state.signal_thread(); 2265 } 2266 2267 for vcpu in self.vcpus.iter() { 2268 let mut vcpu = vcpu.lock().unwrap(); 2269 vcpu.pause()?; 2270 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2271 if !self.config.kvm_hyperv { 2272 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { 2273 MigratableError::Pause(anyhow!( 2274 "Could not notify guest it has been paused {:?}", 2275 e 2276 )) 2277 })?; 2278 } 2279 } 2280 2281 // The vCPU thread will change its paused state before parking; wait here until each 2282 // activated vCPU has changed its state, to ensure they have all parked. 2283 for state in self.vcpu_states.iter() { 2284 if state.active() { 2285 while !state.paused.load(Ordering::SeqCst) { 2286 // To avoid a priority inversion with the vCPU thread 2287 thread::sleep(std::time::Duration::from_millis(1)); 2288 } 2289 } 2290 } 2291 2292 Ok(()) 2293 } 2294 2295 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2296 for vcpu in self.vcpus.iter() { 2297 vcpu.lock().unwrap().resume()?; 2298 } 2299 2300 // Clear the vCPUs' pause boolean 2301 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 2302 2303 // Unpark all the vCPU threads. 2304 // Once unparked, the next thing they will do is check the pause 2305 // boolean. Since it will be set to false, they will exit their pause loop 2306 // and go back to running the guest.
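// As a rough, illustrative sketch (not the literal implementation), the vCPU
// run loop elsewhere in this file behaves like:
//
//     while vcpus_pause_signalled {
//         state.paused.store(true, Ordering::SeqCst); // observed by pause() above
//         thread::park();                             // woken by unpark_thread() below
//     }
//     // pause flag now clear: fall through and re-enter the guest via vcpu.run()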
2307 for state in self.vcpu_states.iter() { 2308 state.paused.store(false, Ordering::SeqCst); 2309 state.unpark_thread(); 2310 } 2311 Ok(()) 2312 } 2313 } 2314 2315 impl Snapshottable for CpuManager { 2316 fn id(&self) -> String { 2317 CPU_MANAGER_SNAPSHOT_ID.to_string() 2318 } 2319 2320 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2321 let mut cpu_manager_snapshot = Snapshot::default(); 2322 2323 // The CpuManager snapshot is a collection of all vCPUs snapshots. 2324 for vcpu in &self.vcpus { 2325 let mut vcpu = vcpu.lock().unwrap(); 2326 cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?); 2327 } 2328 2329 Ok(cpu_manager_snapshot) 2330 } 2331 } 2332 2333 impl Transportable for CpuManager {} 2334 impl Migratable for CpuManager {} 2335 2336 #[cfg(feature = "guest_debug")] 2337 impl Debuggable for CpuManager { 2338 #[cfg(feature = "kvm")] 2339 fn set_guest_debug( 2340 &self, 2341 cpu_id: usize, 2342 addrs: &[GuestAddress], 2343 singlestep: bool, 2344 ) -> std::result::Result<(), DebuggableError> { 2345 self.vcpus[cpu_id] 2346 .lock() 2347 .unwrap() 2348 .vcpu 2349 .set_guest_debug(addrs, singlestep) 2350 .map_err(DebuggableError::SetDebug) 2351 } 2352 2353 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2354 Ok(()) 2355 } 2356 2357 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2358 Ok(()) 2359 } 2360 2361 #[cfg(target_arch = "x86_64")] 2362 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2363 // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15 2364 let gregs = self 2365 .get_regs(cpu_id as u8) 2366 .map_err(DebuggableError::ReadRegs)?; 2367 let regs = [ 2368 gregs.get_rax(), 2369 gregs.get_rbx(), 2370 gregs.get_rcx(), 2371 gregs.get_rdx(), 2372 gregs.get_rsi(), 2373 gregs.get_rdi(), 2374 gregs.get_rbp(), 2375 gregs.get_rsp(), 2376 gregs.get_r8(), 2377 gregs.get_r9(), 2378 gregs.get_r10(), 2379 gregs.get_r11(), 2380 gregs.get_r12(), 2381 gregs.get_r13(), 2382 gregs.get_r14(), 2383 gregs.get_r15(), 2384 ]; 2385 2386 // GDB exposes 32-bit eflags instead of 64-bit rflags. 
2387 // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml 2388 let eflags = gregs.get_rflags() as u32; 2389 let rip = gregs.get_rip(); 2390 2391 // Segment registers: CS, SS, DS, ES, FS, GS 2392 let sregs = self 2393 .get_sregs(cpu_id as u8) 2394 .map_err(DebuggableError::ReadRegs)?; 2395 let segments = X86SegmentRegs { 2396 cs: sregs.cs.selector as u32, 2397 ss: sregs.ss.selector as u32, 2398 ds: sregs.ds.selector as u32, 2399 es: sregs.es.selector as u32, 2400 fs: sregs.fs.selector as u32, 2401 gs: sregs.gs.selector as u32, 2402 }; 2403 2404 // TODO: Add other registers 2405 2406 Ok(CoreRegs { 2407 regs, 2408 eflags, 2409 rip, 2410 segments, 2411 ..Default::default() 2412 }) 2413 } 2414 2415 #[cfg(target_arch = "aarch64")] 2416 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2417 let gregs = self 2418 .get_regs(cpu_id as u8) 2419 .map_err(DebuggableError::ReadRegs)?; 2420 Ok(CoreRegs { 2421 x: gregs.get_regs(), 2422 sp: gregs.get_sp(), 2423 pc: gregs.get_pc(), 2424 ..Default::default() 2425 }) 2426 } 2427 2428 #[cfg(target_arch = "x86_64")] 2429 fn write_regs( 2430 &self, 2431 cpu_id: usize, 2432 regs: &CoreRegs, 2433 ) -> std::result::Result<(), DebuggableError> { 2434 let orig_gregs = self 2435 .get_regs(cpu_id as u8) 2436 .map_err(DebuggableError::ReadRegs)?; 2437 let mut gregs = self.create_standard_regs(cpu_id as u8); 2438 gregs.set_rax(regs.regs[0]); 2439 gregs.set_rbx(regs.regs[1]); 2440 gregs.set_rcx(regs.regs[2]); 2441 gregs.set_rdx(regs.regs[3]); 2442 gregs.set_rsi(regs.regs[4]); 2443 gregs.set_rdi(regs.regs[5]); 2444 gregs.set_rbp(regs.regs[6]); 2445 gregs.set_rsp(regs.regs[7]); 2446 gregs.set_r8(regs.regs[8]); 2447 gregs.set_r9(regs.regs[9]); 2448 gregs.set_r10(regs.regs[10]); 2449 gregs.set_r11(regs.regs[11]); 2450 gregs.set_r12(regs.regs[12]); 2451 gregs.set_r13(regs.regs[13]); 2452 gregs.set_r14(regs.regs[14]); 2453 gregs.set_r15(regs.regs[15]); 2454 gregs.set_rip(regs.rip); 2455 // Update only the lower 32 bits of rflags. 2456 gregs.set_rflags((orig_gregs.get_rflags() & !(u32::MAX as u64)) | (regs.eflags as u64)); 2457 2458 self.set_regs(cpu_id as u8, &gregs) 2459 .map_err(DebuggableError::WriteRegs)?; 2460 2461 // Segment registers: CS, SS, DS, ES, FS, GS 2462 // Since GDB cares only about the selectors, we call get_sregs() first.
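// Note that only the selector fields are overwritten below; the base, limit
// and access-right fields returned by get_sregs() are preserved and written
// back unchanged by set_sregs().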
2463 let mut sregs = self 2464 .get_sregs(cpu_id as u8) 2465 .map_err(DebuggableError::ReadRegs)?; 2466 sregs.cs.selector = regs.segments.cs as u16; 2467 sregs.ss.selector = regs.segments.ss as u16; 2468 sregs.ds.selector = regs.segments.ds as u16; 2469 sregs.es.selector = regs.segments.es as u16; 2470 sregs.fs.selector = regs.segments.fs as u16; 2471 sregs.gs.selector = regs.segments.gs as u16; 2472 2473 self.set_sregs(cpu_id as u8, &sregs) 2474 .map_err(DebuggableError::WriteRegs)?; 2475 2476 // TODO: Add other registers 2477 2478 Ok(()) 2479 } 2480 2481 #[cfg(target_arch = "aarch64")] 2482 fn write_regs( 2483 &self, 2484 cpu_id: usize, 2485 regs: &CoreRegs, 2486 ) -> std::result::Result<(), DebuggableError> { 2487 let mut gregs = self 2488 .get_regs(cpu_id as u8) 2489 .map_err(DebuggableError::ReadRegs)?; 2490 2491 gregs.set_regs(regs.x); 2492 gregs.set_sp(regs.sp); 2493 gregs.set_pc(regs.pc); 2494 2495 self.set_regs(cpu_id as u8, &gregs) 2496 .map_err(DebuggableError::WriteRegs)?; 2497 2498 Ok(()) 2499 } 2500 2501 fn read_mem( 2502 &self, 2503 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2504 cpu_id: usize, 2505 vaddr: GuestAddress, 2506 len: usize, 2507 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2508 let mut buf = vec![0; len]; 2509 let mut total_read = 0_u64; 2510 2511 while total_read < len as u64 { 2512 let gaddr = vaddr.0 + total_read; 2513 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2514 Ok(paddr) => paddr, 2515 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 2516 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2517 }; 2518 let psize = arch::PAGE_SIZE as u64; 2519 let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1))); 2520 guest_memory 2521 .memory() 2522 .read( 2523 &mut buf[total_read as usize..total_read as usize + read_len as usize], 2524 GuestAddress(paddr), 2525 ) 2526 .map_err(DebuggableError::ReadMem)?; 2527 total_read += read_len; 2528 } 2529 Ok(buf) 2530 } 2531 2532 fn write_mem( 2533 &self, 2534 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2535 cpu_id: usize, 2536 vaddr: &GuestAddress, 2537 data: &[u8], 2538 ) -> std::result::Result<(), DebuggableError> { 2539 let mut total_written = 0_u64; 2540 2541 while total_written < data.len() as u64 { 2542 let gaddr = vaddr.0 + total_written; 2543 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2544 Ok(paddr) => paddr, 2545 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 
2546 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2547 }; 2548 let psize = arch::PAGE_SIZE as u64; 2549 let write_len = std::cmp::min( 2550 data.len() as u64 - total_written, 2551 psize - (paddr & (psize - 1)), 2552 ); 2553 guest_memory 2554 .memory() 2555 .write( 2556 &data[total_written as usize..total_written as usize + write_len as usize], 2557 GuestAddress(paddr), 2558 ) 2559 .map_err(DebuggableError::WriteMem)?; 2560 total_written += write_len; 2561 } 2562 Ok(()) 2563 } 2564 2565 fn active_vcpus(&self) -> usize { 2566 self.present_vcpus() as usize 2567 } 2568 } 2569 2570 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2571 impl Elf64Writable for CpuManager {} 2572 2573 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2574 impl CpuElf64Writable for CpuManager { 2575 fn cpu_write_elf64_note( 2576 &mut self, 2577 dump_state: &DumpState, 2578 ) -> std::result::Result<(), GuestDebuggableError> { 2579 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2580 for vcpu in &self.vcpus { 2581 let note_size = self.get_note_size(NoteDescType::Elf, 1); 2582 let mut pos: usize = 0; 2583 let mut buf = vec![0; note_size as usize]; 2584 let descsz = size_of::<X86_64ElfPrStatus>(); 2585 let vcpu_id = vcpu.lock().unwrap().id; 2586 2587 let note = Elf64_Nhdr { 2588 n_namesz: COREDUMP_NAME_SIZE, 2589 n_descsz: descsz as u32, 2590 n_type: NT_PRSTATUS, 2591 }; 2592 2593 let bytes: &[u8] = note.as_slice(); 2594 buf.splice(0.., bytes.to_vec()); 2595 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2596 buf.resize(pos + 4, 0); 2597 buf.splice(pos.., "CORE".to_string().into_bytes()); 2598 2599 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2600 buf.resize(pos + 32 + 4, 0); 2601 let pid = vcpu_id as u64; 2602 let bytes: &[u8] = pid.as_slice(); 2603 buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */ 2604 2605 pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>(); 2606 2607 let orig_rax: u64 = 0; 2608 let gregs = self.vcpus[usize::from(vcpu_id)] 2609 .lock() 2610 .unwrap() 2611 .vcpu 2612 .get_regs() 2613 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2614 2615 let regs1 = [ 2616 gregs.get_r15(), 2617 gregs.get_r14(), 2618 gregs.get_r13(), 2619 gregs.get_r12(), 2620 gregs.get_rbp(), 2621 gregs.get_rbx(), 2622 gregs.get_r11(), 2623 gregs.get_r10(), 2624 ]; 2625 let regs2 = [ 2626 gregs.get_r9(), 2627 gregs.get_r8(), 2628 gregs.get_rax(), 2629 gregs.get_rcx(), 2630 gregs.get_rdx(), 2631 gregs.get_rsi(), 2632 gregs.get_rdi(), 2633 orig_rax, 2634 ]; 2635 2636 let sregs = self.vcpus[usize::from(vcpu_id)] 2637 .lock() 2638 .unwrap() 2639 .vcpu 2640 .get_sregs() 2641 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2642 2643 debug!( 2644 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}", 2645 gregs.get_rip(), 2646 gregs.get_rsp(), 2647 sregs.gs.base, 2648 sregs.cs.selector, 2649 sregs.ss.selector, 2650 sregs.ds.selector, 2651 ); 2652 2653 let regs = X86_64UserRegs { 2654 regs1, 2655 regs2, 2656 rip: gregs.get_rip(), 2657 cs: sregs.cs.selector as u64, 2658 eflags: gregs.get_rflags(), 2659 rsp: gregs.get_rsp(), 2660 ss: sregs.ss.selector as u64, 2661 fs_base: sregs.fs.base, 2662 gs_base: sregs.gs.base, 2663 ds: sregs.ds.selector as u64, 2664 es: sregs.es.selector as u64, 2665 fs: sregs.fs.selector as u64, 2666 gs: sregs.gs.selector as u64, 2667 }; 2668 2669 // let bytes: &[u8] = unsafe { any_as_u8_slice(®s) }; 2670 let bytes: &[u8] = regs.as_slice(); 2671 buf.resize(note_size as usize, 0); 2672 
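// splice(pos..) replaces everything from `pos` onwards with the serialized
// registers, and the trailing resize pads the buffer back to the declared
// note size.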
buf.splice(pos.., bytes.to_vec()); 2673 buf.resize(note_size as usize, 0); 2674 2675 coredump_file 2676 .write(&buf) 2677 .map_err(GuestDebuggableError::CoredumpFile)?; 2678 } 2679 2680 Ok(()) 2681 } 2682 2683 fn cpu_write_vmm_note( 2684 &mut self, 2685 dump_state: &DumpState, 2686 ) -> std::result::Result<(), GuestDebuggableError> { 2687 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2688 for vcpu in &self.vcpus { 2689 let note_size = self.get_note_size(NoteDescType::Vmm, 1); 2690 let mut pos: usize = 0; 2691 let mut buf = vec![0; note_size as usize]; 2692 let descsz = size_of::<DumpCpusState>(); 2693 let vcpu_id = vcpu.lock().unwrap().id; 2694 2695 let note = Elf64_Nhdr { 2696 n_namesz: COREDUMP_NAME_SIZE, 2697 n_descsz: descsz as u32, 2698 n_type: 0, 2699 }; 2700 2701 let bytes: &[u8] = note.as_slice(); 2702 buf.splice(0.., bytes.to_vec()); 2703 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2704 2705 buf.resize(pos + 4, 0); 2706 buf.splice(pos.., "QEMU".to_string().into_bytes()); 2707 2708 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2709 2710 let gregs = self.vcpus[usize::from(vcpu_id)] 2711 .lock() 2712 .unwrap() 2713 .vcpu 2714 .get_regs() 2715 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2716 2717 let regs1 = [ 2718 gregs.get_rax(), 2719 gregs.get_rbx(), 2720 gregs.get_rcx(), 2721 gregs.get_rdx(), 2722 gregs.get_rsi(), 2723 gregs.get_rdi(), 2724 gregs.get_rsp(), 2725 gregs.get_rbp(), 2726 ]; 2727 2728 let regs2 = [ 2729 gregs.get_r8(), 2730 gregs.get_r9(), 2731 gregs.get_r10(), 2732 gregs.get_r11(), 2733 gregs.get_r12(), 2734 gregs.get_r13(), 2735 gregs.get_r14(), 2736 gregs.get_r15(), 2737 ]; 2738 2739 let sregs = self.vcpus[usize::from(vcpu_id)] 2740 .lock() 2741 .unwrap() 2742 .vcpu 2743 .get_sregs() 2744 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2745 2746 let mut msrs = vec![MsrEntry { 2747 index: msr_index::MSR_KERNEL_GS_BASE, 2748 ..Default::default() 2749 }]; 2750 2751 self.vcpus[vcpu_id as usize] 2752 .lock() 2753 .unwrap() 2754 .vcpu 2755 .get_msrs(&mut msrs) 2756 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?; 2757 let kernel_gs_base = msrs[0].data; 2758 2759 let cs = CpuSegment::new(sregs.cs); 2760 let ds = CpuSegment::new(sregs.ds); 2761 let es = CpuSegment::new(sregs.es); 2762 let fs = CpuSegment::new(sregs.fs); 2763 let gs = CpuSegment::new(sregs.gs); 2764 let ss = CpuSegment::new(sregs.ss); 2765 let ldt = CpuSegment::new(sregs.ldt); 2766 let tr = CpuSegment::new(sregs.tr); 2767 let gdt = CpuSegment::new_from_table(sregs.gdt); 2768 let idt = CpuSegment::new_from_table(sregs.idt); 2769 let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4]; 2770 let regs = DumpCpusState { 2771 version: 1, 2772 size: size_of::<DumpCpusState>() as u32, 2773 regs1, 2774 regs2, 2775 rip: gregs.get_rip(), 2776 rflags: gregs.get_rflags(), 2777 cs, 2778 ds, 2779 es, 2780 fs, 2781 gs, 2782 ss, 2783 ldt, 2784 tr, 2785 gdt, 2786 idt, 2787 cr, 2788 kernel_gs_base, 2789 }; 2790 2791 let bytes: &[u8] = regs.as_slice(); 2792 buf.resize(note_size as usize, 0); 2793 buf.splice(pos.., bytes.to_vec()); 2794 buf.resize(note_size as usize, 0); 2795 2796 coredump_file 2797 .write(&buf) 2798 .map_err(GuestDebuggableError::CoredumpFile)?; 2799 } 2800 2801 Ok(()) 2802 } 2803 } 2804 2805 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2806 #[cfg(test)] 2807 mod tests { 2808 use arch::layout::{BOOT_STACK_POINTER, ZERO_PAGE_START}; 2809 use arch::x86_64::interrupts::*; 2810 use 
arch::x86_64::regs::*; 2811 use hypervisor::arch::x86::{FpuState, LapicState}; 2812 use hypervisor::StandardRegisters; 2813 use linux_loader::loader::bootparam::setup_header; 2814 2815 #[test] 2816 fn test_setlint() { 2817 let hv = hypervisor::new().unwrap(); 2818 let vm = hv.create_vm().expect("new VM fd creation failed"); 2819 assert!(hv.check_required_extensions().is_ok()); 2820 // Calling get_lapic will fail if there is no irqchip beforehand. 2821 assert!(vm.create_irq_chip().is_ok()); 2822 let vcpu = vm.create_vcpu(0, None).unwrap(); 2823 let klapic_before: LapicState = vcpu.get_lapic().unwrap(); 2824 2825 // Compute the value that is expected to represent LVT0 and LVT1. 2826 let lint0 = klapic_before.get_klapic_reg(APIC_LVT0); 2827 let lint1 = klapic_before.get_klapic_reg(APIC_LVT1); 2828 let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT); 2829 let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI); 2830 2831 set_lint(&vcpu).unwrap(); 2832 2833 // Compute the value that represents LVT0 and LVT1 after set_lint. 2834 let klapic_actual: LapicState = vcpu.get_lapic().unwrap(); 2835 let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0); 2836 let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1); 2837 assert_eq!(lint0_mode_expected, lint0_mode_actual); 2838 assert_eq!(lint1_mode_expected, lint1_mode_actual); 2839 } 2840 2841 #[test] 2842 fn test_setup_fpu() { 2843 let hv = hypervisor::new().unwrap(); 2844 let vm = hv.create_vm().expect("new VM fd creation failed"); 2845 let vcpu = vm.create_vcpu(0, None).unwrap(); 2846 setup_fpu(&vcpu).unwrap(); 2847 2848 let expected_fpu: FpuState = FpuState { 2849 fcw: 0x37f, 2850 mxcsr: 0x1f80, 2851 ..Default::default() 2852 }; 2853 let actual_fpu: FpuState = vcpu.get_fpu().unwrap(); 2854 // TODO: auto-generate kvm related structures with PartialEq on. 2855 assert_eq!(expected_fpu.fcw, actual_fpu.fcw); 2856 // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything. 2857 // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c. 2858 // The mxcsr will stay 0 and the assert below would fail. We need to decide whether 2859 // to remove it altogether. 2860 // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr); 2861 } 2862 2863 #[test] 2864 fn test_setup_msrs() { 2865 use hypervisor::arch::x86::{msr_index, MsrEntry}; 2866 2867 let hv = hypervisor::new().unwrap(); 2868 let vm = hv.create_vm().expect("new VM fd creation failed"); 2869 let vcpu = vm.create_vcpu(0, None).unwrap(); 2870 setup_msrs(&vcpu).unwrap(); 2871 2872 // This test will check against the last MSR entry configured (the tenth one). 2873 // See create_msr_entries for details. 2874 let mut msrs = vec![MsrEntry { 2875 index: msr_index::MSR_IA32_MISC_ENABLE, 2876 ..Default::default() 2877 }]; 2878 2879 // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read 1 2880 // in this test case. 2881 let read_msrs = vcpu.get_msrs(&mut msrs).unwrap(); 2882 assert_eq!(read_msrs, 1); 2883 2884 // Official entries that were set up when we did setup_msrs. We need to assert that the 2885 // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we 2886 // expect.
2887 let entry_vec = vcpu.boot_msr_entries(); 2888 assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]); 2889 } 2890 2891 #[test] 2892 fn test_setup_regs_for_pvh() { 2893 let hv = hypervisor::new().unwrap(); 2894 let vm = hv.create_vm().expect("new VM fd creation failed"); 2895 let vcpu = vm.create_vcpu(0, None).unwrap(); 2896 2897 let mut expected_regs: StandardRegisters = vcpu.create_standard_regs(); 2898 expected_regs.set_rflags(0x0000000000000002u64); 2899 expected_regs.set_rbx(arch::layout::PVH_INFO_START.0); 2900 expected_regs.set_rip(1); 2901 2902 setup_regs( 2903 &vcpu, 2904 arch::EntryPoint { 2905 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()), 2906 setup_header: None, 2907 }, 2908 ) 2909 .unwrap(); 2910 2911 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2912 assert_eq!(actual_regs, expected_regs); 2913 } 2914 2915 #[test] 2916 fn test_setup_regs_for_bzimage() { 2917 let hv = hypervisor::new().unwrap(); 2918 let vm = hv.create_vm().expect("new VM fd creation failed"); 2919 let vcpu = vm.create_vcpu(0, None).unwrap(); 2920 2921 let mut expected_regs: StandardRegisters = vcpu.create_standard_regs(); 2922 expected_regs.set_rflags(0x0000000000000002u64); 2923 expected_regs.set_rip(1); 2924 expected_regs.set_rsp(BOOT_STACK_POINTER.0); 2925 expected_regs.set_rsi(ZERO_PAGE_START.0); 2926 2927 setup_regs( 2928 &vcpu, 2929 arch::EntryPoint { 2930 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()), 2931 setup_header: Some(setup_header { 2932 ..Default::default() 2933 }), 2934 }, 2935 ) 2936 .unwrap(); 2937 2938 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2939 assert_eq!(actual_regs, expected_regs); 2940 } 2941 } 2942 2943 #[cfg(target_arch = "aarch64")] 2944 #[cfg(test)] 2945 mod tests { 2946 use std::mem; 2947 2948 use arch::aarch64::regs; 2949 use arch::layout; 2950 use hypervisor::kvm::aarch64::is_system_register; 2951 use hypervisor::kvm::kvm_bindings::{ 2952 kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_ARM_CORE, 2953 KVM_REG_SIZE_U64, 2954 }; 2955 use hypervisor::{arm64_core_reg_id, offset_of}; 2956 2957 #[test] 2958 fn test_setup_regs() { 2959 let hv = hypervisor::new().unwrap(); 2960 let vm = hv.create_vm().unwrap(); 2961 let vcpu = vm.create_vcpu(0, None).unwrap(); 2962 2963 let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0); 2964 // Must fail when vcpu is not initialized yet. 2965 assert!(res.is_err()); 2966 2967 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2968 vm.get_preferred_target(&mut kvi).unwrap(); 2969 vcpu.vcpu_init(&kvi).unwrap(); 2970 2971 assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok()); 2972 } 2973 2974 #[test] 2975 fn test_read_mpidr() { 2976 let hv = hypervisor::new().unwrap(); 2977 let vm = hv.create_vm().unwrap(); 2978 let vcpu = vm.create_vcpu(0, None).unwrap(); 2979 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2980 vm.get_preferred_target(&mut kvi).unwrap(); 2981 2982 // Must fail when vcpu is not initialized yet. 
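// (Once vcpu_init has run, the same read is expected to succeed and return
// 0x8000_0000: bit 31 of MPIDR_EL1 is RES1 and vCPU 0 has all affinity
// fields set to zero, as the assertion further down checks.)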
2983 assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err()); 2984 2985 vcpu.vcpu_init(&kvi).unwrap(); 2986 assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000); 2987 } 2988 2989 #[test] 2990 fn test_is_system_register() { 2991 let offset = offset_of!(user_pt_regs, pc); 2992 let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset); 2993 assert!(!is_system_register(regid)); 2994 let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64; 2995 assert!(is_system_register(regid)); 2996 } 2997 2998 #[test] 2999 fn test_save_restore_core_regs() { 3000 let hv = hypervisor::new().unwrap(); 3001 let vm = hv.create_vm().unwrap(); 3002 let vcpu = vm.create_vcpu(0, None).unwrap(); 3003 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 3004 vm.get_preferred_target(&mut kvi).unwrap(); 3005 3006 // Must fail when vcpu is not initialized yet. 3007 let res = vcpu.get_regs(); 3008 assert!(res.is_err()); 3009 assert_eq!( 3010 format!("{}", res.unwrap_err()), 3011 "Failed to get core register: Exec format error (os error 8)" 3012 ); 3013 3014 let mut state = vcpu.create_standard_regs(); 3015 let res = vcpu.set_regs(&state); 3016 assert!(res.is_err()); 3017 assert_eq!( 3018 format!("{}", res.unwrap_err()), 3019 "Failed to set core register: Exec format error (os error 8)" 3020 ); 3021 3022 vcpu.vcpu_init(&kvi).unwrap(); 3023 let res = vcpu.get_regs(); 3024 assert!(res.is_ok()); 3025 state = res.unwrap(); 3026 assert_eq!(state.get_pstate(), 0x3C5); 3027 3028 assert!(vcpu.set_regs(&state).is_ok()); 3029 } 3030 3031 #[test] 3032 fn test_get_set_mpstate() { 3033 let hv = hypervisor::new().unwrap(); 3034 let vm = hv.create_vm().unwrap(); 3035 let vcpu = vm.create_vcpu(0, None).unwrap(); 3036 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 3037 vm.get_preferred_target(&mut kvi).unwrap(); 3038 3039 let res = vcpu.get_mp_state(); 3040 assert!(res.is_ok()); 3041 assert!(vcpu.set_mp_state(res.unwrap()).is_ok()); 3042 } 3043 } 3044