1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::CpusConfig; 15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 16 use crate::coredump::{ 17 CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable, 18 GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE, 19 NT_PRSTATUS, 20 }; 21 #[cfg(feature = "guest_debug")] 22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError}; 23 #[cfg(target_arch = "x86_64")] 24 use crate::memory_manager::MemoryManager; 25 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 26 #[cfg(target_arch = "x86_64")] 27 use crate::vm::physical_bits; 28 use crate::GuestMemoryMmap; 29 use crate::CPU_MANAGER_SNAPSHOT_ID; 30 use acpi_tables::{aml, sdt::Sdt, Aml}; 31 use anyhow::anyhow; 32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 33 use arch::aarch64::regs; 34 #[cfg(target_arch = "x86_64")] 35 use arch::x86_64::get_x2apic_id; 36 use arch::EntryPoint; 37 use arch::NumaNodes; 38 #[cfg(target_arch = "aarch64")] 39 use devices::gic::Gic; 40 use devices::interrupt_controller::InterruptController; 41 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 42 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 43 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 44 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs}; 45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 46 use hypervisor::arch::x86::msr_index; 47 #[cfg(target_arch = "x86_64")] 48 use hypervisor::arch::x86::CpuIdEntry; 49 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 50 use hypervisor::arch::x86::MsrEntry; 51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 52 use hypervisor::arch::x86::SpecialRegisters; 53 #[cfg(target_arch = "aarch64")] 54 use hypervisor::kvm::kvm_bindings; 55 #[cfg(all(target_arch = "aarch64", feature = "kvm"))] 56 use hypervisor::kvm::kvm_ioctls::Cap; 57 #[cfg(feature = "tdx")] 58 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus}; 59 #[cfg(target_arch = "x86_64")] 60 use hypervisor::CpuVendor; 61 #[cfg(feature = "kvm")] 62 use hypervisor::HypervisorType; 63 #[cfg(feature = "guest_debug")] 64 use hypervisor::StandardRegisters; 65 use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps}; 66 use libc::{c_void, siginfo_t}; 67 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 68 use linux_loader::elf::Elf64_Nhdr; 69 use seccompiler::{apply_filter, SeccompAction}; 70 use std::collections::BTreeMap; 71 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 72 use std::io::Write; 73 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 74 use std::mem::size_of; 75 use std::os::unix::thread::JoinHandleExt; 76 use std::sync::atomic::{AtomicBool, Ordering}; 77 use std::sync::{Arc, Barrier, Mutex}; 78 use std::{cmp, io, result, thread}; 79 use thiserror::Error; 80 use tracer::trace_scoped; 81 use vm_device::BusDevice; 82 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 83 use vm_memory::ByteValued; 84 #[cfg(feature = "guest_debug")] 85 use vm_memory::{Bytes, 
GuestAddressSpace}; 86 use vm_memory::{GuestAddress, GuestMemoryAtomic}; 87 use vm_migration::{ 88 snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, 89 Transportable, 90 }; 91 use vmm_sys_util::eventfd::EventFd; 92 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN}; 93 use zerocopy::AsBytes; 94 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 95 /// Extract the specified bits of a 64-bit integer. 96 /// For example, to extract 2 bits from offset 1 (zero based) of `6u64`, 97 /// the following expression should return 3 (`0b11`): 98 /// `extract_bits_64!(0b0000_0110u64, 1, 2)` 99 /// 100 macro_rules! extract_bits_64 { 101 ($value: tt, $offset: tt, $length: tt) => { 102 ($value >> $offset) & (!0u64 >> (64 - $length)) 103 }; 104 } 105 106 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 107 macro_rules! extract_bits_64_without_offset { 108 ($value: tt, $length: tt) => { 109 $value & (!0u64 >> (64 - $length)) 110 }; 111 } 112 113 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc; 114 115 #[derive(Debug, Error)] 116 pub enum Error { 117 #[error("Error creating vCPU: {0}")] 118 VcpuCreate(#[source] anyhow::Error), 119 120 #[error("Error running vCPU: {0}")] 121 VcpuRun(#[source] anyhow::Error), 122 123 #[error("Error spawning vCPU thread: {0}")] 124 VcpuSpawn(#[source] io::Error), 125 126 #[error("Error generating common CPUID: {0}")] 127 CommonCpuId(#[source] arch::Error), 128 129 #[error("Error configuring vCPU: {0}")] 130 VcpuConfiguration(#[source] arch::Error), 131 132 #[error("vCPU removal still pending")] 133 VcpuPendingRemovedVcpu, 134 135 #[cfg(target_arch = "aarch64")] 136 #[error("Error fetching preferred target: {0}")] 137 VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError), 138 139 #[cfg(target_arch = "aarch64")] 140 #[error("Error initialising vCPU: {0}")] 141 VcpuArmInit(#[source] hypervisor::HypervisorCpuError), 142 143 #[cfg(target_arch = "aarch64")] 144 #[error("Error finalising vCPU: {0}")] 145 VcpuArmFinalize(#[source] hypervisor::HypervisorCpuError), 146 147 #[error("Failed to join on vCPU threads: {0:?}")] 148 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 149 150 #[error("Error adding CpuManager to MMIO bus: {0}")] 151 BusError(#[source] vm_device::BusError), 152 153 #[error("Requested vCPUs exceed maximum")] 154 DesiredVCpuCountExceedsMax, 155 156 #[error("Cannot create seccomp filter: {0}")] 157 CreateSeccompFilter(#[source] seccompiler::Error), 158 159 #[error("Cannot apply seccomp filter: {0}")] 160 ApplySeccompFilter(#[source] seccompiler::Error), 161 162 #[error("Error starting vCPU after restore: {0}")] 163 StartRestoreVcpu(#[source] anyhow::Error), 164 165 #[error("Unexpected VmExit")] 166 UnexpectedVmExit, 167 168 #[error("Failed to allocate MMIO address for CpuManager")] 169 AllocateMmmioAddress, 170 171 #[cfg(feature = "tdx")] 172 #[error("Error initializing TDX: {0}")] 173 InitializeTdx(#[source] hypervisor::HypervisorCpuError), 174 175 #[cfg(target_arch = "aarch64")] 176 #[error("Error initializing PMU: {0}")] 177 InitPmu(#[source] hypervisor::HypervisorCpuError), 178 179 #[cfg(feature = "guest_debug")] 180 #[error("Error during CPU debug: {0}")] 181 CpuDebug(#[source] hypervisor::HypervisorCpuError), 182 183 #[cfg(feature = "guest_debug")] 184 #[error("Error translating virtual address: {0}")] 185 TranslateVirtualAddress(#[source] anyhow::Error), 186 187 #[cfg(target_arch = "x86_64")] 188 #[error("Error setting up AMX: {0}")] 189 AmxEnable(#[source] 
anyhow::Error), 190 191 #[error("Maximum number of vCPUs exceeds host limit")] 192 MaximumVcpusExceeded, 193 194 #[cfg(feature = "sev_snp")] 195 #[error("Failed to set sev control register: {0}")] 196 SetSevControlRegister(#[source] hypervisor::HypervisorCpuError), 197 198 #[cfg(target_arch = "x86_64")] 199 #[error("Failed to inject NMI")] 200 NmiError(hypervisor::HypervisorCpuError), 201 } 202 pub type Result<T> = result::Result<T, Error>; 203 204 #[cfg(target_arch = "x86_64")] 205 #[allow(dead_code)] 206 #[repr(packed)] 207 #[derive(AsBytes)] 208 struct LocalX2Apic { 209 pub r#type: u8, 210 pub length: u8, 211 pub _reserved: u16, 212 pub apic_id: u32, 213 pub flags: u32, 214 pub processor_id: u32, 215 } 216 217 #[allow(dead_code)] 218 #[repr(packed)] 219 #[derive(Default, AsBytes)] 220 struct Ioapic { 221 pub r#type: u8, 222 pub length: u8, 223 pub ioapic_id: u8, 224 _reserved: u8, 225 pub apic_address: u32, 226 pub gsi_base: u32, 227 } 228 229 #[cfg(target_arch = "aarch64")] 230 #[allow(dead_code)] 231 #[repr(packed)] 232 #[derive(AsBytes)] 233 struct GicC { 234 pub r#type: u8, 235 pub length: u8, 236 pub reserved0: u16, 237 pub cpu_interface_number: u32, 238 pub uid: u32, 239 pub flags: u32, 240 pub parking_version: u32, 241 pub performance_interrupt: u32, 242 pub parked_address: u64, 243 pub base_address: u64, 244 pub gicv_base_address: u64, 245 pub gich_base_address: u64, 246 pub vgic_interrupt: u32, 247 pub gicr_base_address: u64, 248 pub mpidr: u64, 249 pub proc_power_effi_class: u8, 250 pub reserved1: u8, 251 pub spe_overflow_interrupt: u16, 252 } 253 254 #[cfg(target_arch = "aarch64")] 255 #[allow(dead_code)] 256 #[repr(packed)] 257 #[derive(AsBytes)] 258 struct GicD { 259 pub r#type: u8, 260 pub length: u8, 261 pub reserved0: u16, 262 pub gic_id: u32, 263 pub base_address: u64, 264 pub global_irq_base: u32, 265 pub version: u8, 266 pub reserved1: [u8; 3], 267 } 268 269 #[cfg(target_arch = "aarch64")] 270 #[allow(dead_code)] 271 #[repr(packed)] 272 #[derive(AsBytes)] 273 struct GicR { 274 pub r#type: u8, 275 pub length: u8, 276 pub reserved: u16, 277 pub base_address: u64, 278 pub range_length: u32, 279 } 280 281 #[cfg(target_arch = "aarch64")] 282 #[allow(dead_code)] 283 #[repr(packed)] 284 #[derive(AsBytes)] 285 struct GicIts { 286 pub r#type: u8, 287 pub length: u8, 288 pub reserved0: u16, 289 pub translation_id: u32, 290 pub base_address: u64, 291 pub reserved1: u32, 292 } 293 294 #[cfg(target_arch = "aarch64")] 295 #[allow(dead_code)] 296 #[repr(packed)] 297 #[derive(AsBytes)] 298 struct ProcessorHierarchyNode { 299 pub r#type: u8, 300 pub length: u8, 301 pub reserved: u16, 302 pub flags: u32, 303 pub parent: u32, 304 pub acpi_processor_id: u32, 305 pub num_private_resources: u32, 306 } 307 308 #[allow(dead_code)] 309 #[repr(packed)] 310 #[derive(Default, AsBytes)] 311 struct InterruptSourceOverride { 312 pub r#type: u8, 313 pub length: u8, 314 pub bus: u8, 315 pub source: u8, 316 pub gsi: u32, 317 pub flags: u16, 318 } 319 320 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 321 macro_rules! round_up { 322 ($n:expr,$d:expr) => { 323 (($n / ($d + 1)) + 1) * $d 324 }; 325 } 326 327 /// A wrapper around creating and using a kvm-based VCPU. 328 pub struct Vcpu { 329 // The hypervisor abstracted CPU. 330 vcpu: Arc<dyn hypervisor::Vcpu>, 331 id: u8, 332 #[cfg(target_arch = "aarch64")] 333 mpidr: u64, 334 saved_state: Option<CpuState>, 335 #[cfg(target_arch = "x86_64")] 336 vendor: CpuVendor, 337 } 338 339 impl Vcpu { 340 /// Constructs a new VCPU for `vm`. 
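///
/// A minimal illustrative sketch (not a doctest), assuming `vm`, `vm_ops` and
/// `hypervisor` already exist in the caller (x86_64 signature shown):
///
/// ```ignore
/// let vcpu = Vcpu::new(0, 0, &vm, Some(vm_ops.clone()), hypervisor.get_cpu_vendor())?;
/// ```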
341 /// 342 /// # Arguments 343 /// 344 /// * `id` - Represents the CPU number between [0, max vcpus). 345 /// * `vm` - The virtual machine this vcpu will get attached to. 346 /// * `vm_ops` - Optional object for exit handling. 347 /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0) 348 pub fn new( 349 id: u8, 350 apic_id: u8, 351 vm: &Arc<dyn hypervisor::Vm>, 352 vm_ops: Option<Arc<dyn VmOps>>, 353 #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor, 354 ) -> Result<Self> { 355 let vcpu = vm 356 .create_vcpu(apic_id, vm_ops) 357 .map_err(|e| Error::VcpuCreate(e.into()))?; 358 // Initially the cpuid per vCPU is the one supported by this VM. 359 Ok(Vcpu { 360 vcpu, 361 id, 362 #[cfg(target_arch = "aarch64")] 363 mpidr: 0, 364 saved_state: None, 365 #[cfg(target_arch = "x86_64")] 366 vendor: cpu_vendor, 367 }) 368 } 369 370 /// Configures a vcpu and should be called once per vcpu when created. 371 /// 372 /// # Arguments 373 /// 374 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 375 /// * `guest_memory` - Guest memory. 376 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 377 pub fn configure( 378 &mut self, 379 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 380 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 381 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 382 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 383 #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>, 384 ) -> Result<()> { 385 #[cfg(target_arch = "aarch64")] 386 { 387 self.init(vm)?; 388 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup) 389 .map_err(Error::VcpuConfiguration)?; 390 } 391 info!("Configuring vCPU: cpu_id = {}", self.id); 392 #[cfg(target_arch = "x86_64")] 393 arch::configure_vcpu( 394 &self.vcpu, 395 self.id, 396 boot_setup, 397 cpuid, 398 kvm_hyperv, 399 self.vendor, 400 topology, 401 ) 402 .map_err(Error::VcpuConfiguration)?; 403 404 Ok(()) 405 } 406 407 /// Gets the MPIDR register value. 408 #[cfg(target_arch = "aarch64")] 409 pub fn get_mpidr(&self) -> u64 { 410 self.mpidr 411 } 412 413 /// Gets the saved vCPU state. 414 #[cfg(target_arch = "aarch64")] 415 pub fn get_saved_state(&self) -> Option<CpuState> { 416 self.saved_state.clone() 417 } 418 419 /// Initializes an aarch64 specific vcpu for booting Linux. 420 #[cfg(target_arch = "aarch64")] 421 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 422 use std::arch::is_aarch64_feature_detected; 423 let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); 424 #[allow(clippy::nonminimal_bool)] 425 let sve_supported = 426 is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2"); 427 // This reads back the kernel's preferred target type. 428 vm.get_preferred_target(&mut kvi) 429 .map_err(Error::VcpuArmPreferredTarget)?; 430 // We already checked that the capability is supported. 431 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; 432 if vm 433 .as_any() 434 .downcast_ref::<hypervisor::kvm::KvmVm>() 435 .unwrap() 436 .check_extension(Cap::ArmPmuV3) 437 { 438 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3; 439 } 440 441 if sve_supported 442 && vm 443 .as_any() 444 .downcast_ref::<hypervisor::kvm::KvmVm>() 445 .unwrap() 446 .check_extension(Cap::ArmSve) 447 { 448 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_SVE; 449 } 450 451 // Non-boot cpus are powered off initially. 
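// They remain powered off until the guest brings them online via PSCI CPU_ON
// (PSCI 0.2 support is enabled in the feature bits above).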
452 if self.id > 0 { 453 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; 454 } 455 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)?; 456 if sve_supported { 457 self.vcpu 458 .vcpu_finalize(kvm_bindings::KVM_ARM_VCPU_SVE as i32) 459 .map_err(Error::VcpuArmFinalize)?; 460 } 461 Ok(()) 462 } 463 464 /// Runs the VCPU until it exits, returning the reason. 465 /// 466 /// Note that the state of the VCPU and associated VM must be setup first for this to do 467 /// anything useful. 468 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 469 self.vcpu.run() 470 } 471 472 #[cfg(feature = "sev_snp")] 473 pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> { 474 self.vcpu 475 .set_sev_control_register(vmsa_pfn) 476 .map_err(Error::SetSevControlRegister) 477 } 478 } 479 480 impl Pausable for Vcpu {} 481 impl Snapshottable for Vcpu { 482 fn id(&self) -> String { 483 self.id.to_string() 484 } 485 486 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 487 let saved_state = self 488 .vcpu 489 .state() 490 .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?; 491 492 self.saved_state = Some(saved_state.clone()); 493 494 Ok(Snapshot::from_data(SnapshotData::new_from_state( 495 &saved_state, 496 )?)) 497 } 498 } 499 500 pub struct CpuManager { 501 config: CpusConfig, 502 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 503 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 504 #[cfg(target_arch = "x86_64")] 505 cpuid: Vec<CpuIdEntry>, 506 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 507 vm: Arc<dyn hypervisor::Vm>, 508 vcpus_kill_signalled: Arc<AtomicBool>, 509 vcpus_pause_signalled: Arc<AtomicBool>, 510 vcpus_kick_signalled: Arc<AtomicBool>, 511 exit_evt: EventFd, 512 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 513 reset_evt: EventFd, 514 #[cfg(feature = "guest_debug")] 515 vm_debug_evt: EventFd, 516 vcpu_states: Vec<VcpuState>, 517 selected_cpu: u8, 518 vcpus: Vec<Arc<Mutex<Vcpu>>>, 519 seccomp_action: SeccompAction, 520 vm_ops: Arc<dyn VmOps>, 521 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 522 acpi_address: Option<GuestAddress>, 523 proximity_domain_per_cpu: BTreeMap<u8, u32>, 524 affinity: BTreeMap<u8, Vec<usize>>, 525 dynamic: bool, 526 hypervisor: Arc<dyn hypervisor::Hypervisor>, 527 #[cfg(feature = "sev_snp")] 528 sev_snp_enabled: bool, 529 } 530 531 const CPU_ENABLE_FLAG: usize = 0; 532 const CPU_INSERTING_FLAG: usize = 1; 533 const CPU_REMOVING_FLAG: usize = 2; 534 const CPU_EJECT_FLAG: usize = 3; 535 536 const CPU_STATUS_OFFSET: u64 = 4; 537 const CPU_SELECTION_OFFSET: u64 = 0; 538 539 impl BusDevice for CpuManager { 540 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 541 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
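// Zero the buffer first, then report only the bits we own.
//
// Register layout: offset 0 (CPU_SELECTION_OFFSET) holds the currently selected
// vCPU id, offset 4 (CPU_STATUS_OFFSET) exposes its enable/inserting/removing
// state (see the CPU_*_FLAG constants above).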
542 data.fill(0); 543 544 match offset { 545 CPU_SELECTION_OFFSET => { 546 data[0] = self.selected_cpu; 547 } 548 CPU_STATUS_OFFSET => { 549 if self.selected_cpu < self.max_vcpus() { 550 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 551 if state.active() { 552 data[0] |= 1 << CPU_ENABLE_FLAG; 553 } 554 if state.inserting { 555 data[0] |= 1 << CPU_INSERTING_FLAG; 556 } 557 if state.removing { 558 data[0] |= 1 << CPU_REMOVING_FLAG; 559 } 560 } else { 561 warn!("Out of range vCPU id: {}", self.selected_cpu); 562 } 563 } 564 _ => { 565 warn!( 566 "Unexpected offset for accessing CPU manager device: {:#}", 567 offset 568 ); 569 } 570 } 571 } 572 573 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 574 match offset { 575 CPU_SELECTION_OFFSET => { 576 self.selected_cpu = data[0]; 577 } 578 CPU_STATUS_OFFSET => { 579 if self.selected_cpu < self.max_vcpus() { 580 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 581 // The ACPI code writes back a 1 to acknowledge the insertion 582 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 583 && state.inserting 584 { 585 state.inserting = false; 586 } 587 // Ditto for removal 588 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 589 && state.removing 590 { 591 state.removing = false; 592 } 593 // Trigger removal of vCPU 594 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 595 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 596 error!("Error removing vCPU: {:?}", e); 597 } 598 } 599 } else { 600 warn!("Out of range vCPU id: {}", self.selected_cpu); 601 } 602 } 603 _ => { 604 warn!( 605 "Unexpected offset for accessing CPU manager device: {:#}", 606 offset 607 ); 608 } 609 } 610 None 611 } 612 } 613 614 #[derive(Default)] 615 struct VcpuState { 616 inserting: bool, 617 removing: bool, 618 pending_removal: Arc<AtomicBool>, 619 handle: Option<thread::JoinHandle<()>>, 620 kill: Arc<AtomicBool>, 621 vcpu_run_interrupted: Arc<AtomicBool>, 622 paused: Arc<AtomicBool>, 623 } 624 625 impl VcpuState { 626 fn active(&self) -> bool { 627 self.handle.is_some() 628 } 629 630 fn signal_thread(&self) { 631 if let Some(handle) = self.handle.as_ref() { 632 loop { 633 // SAFETY: FFI call with correct arguments 634 unsafe { 635 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 636 } 637 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 638 break; 639 } else { 640 // This is more effective than thread::yield_now() at 641 // avoiding a priority inversion with the vCPU thread 642 thread::sleep(std::time::Duration::from_millis(1)); 643 } 644 } 645 } 646 } 647 648 fn join_thread(&mut self) -> Result<()> { 649 if let Some(handle) = self.handle.take() { 650 handle.join().map_err(Error::ThreadCleanup)? 
651 } 652 653 Ok(()) 654 } 655 656 fn unpark_thread(&self) { 657 if let Some(handle) = self.handle.as_ref() { 658 handle.thread().unpark() 659 } 660 } 661 } 662 663 impl CpuManager { 664 #[allow(unused_variables)] 665 #[allow(clippy::too_many_arguments)] 666 pub fn new( 667 config: &CpusConfig, 668 vm: Arc<dyn hypervisor::Vm>, 669 exit_evt: EventFd, 670 reset_evt: EventFd, 671 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 672 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 673 seccomp_action: SeccompAction, 674 vm_ops: Arc<dyn VmOps>, 675 #[cfg(feature = "tdx")] tdx_enabled: bool, 676 numa_nodes: &NumaNodes, 677 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 678 ) -> Result<Arc<Mutex<CpuManager>>> { 679 if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() { 680 return Err(Error::MaximumVcpusExceeded); 681 } 682 683 let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); 684 vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); 685 let hypervisor_type = hypervisor.hypervisor_type(); 686 #[cfg(target_arch = "x86_64")] 687 let cpu_vendor = hypervisor.get_cpu_vendor(); 688 689 #[cfg(target_arch = "x86_64")] 690 if config.features.amx { 691 const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; 692 const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; 693 const XFEATURE_XTILEDATA: usize = 18; 694 const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; 695 696 // SAFETY: the syscall is only modifying kernel internal 697 // data structures that the kernel is itself expected to safeguard. 698 let amx_tile = unsafe { 699 libc::syscall( 700 libc::SYS_arch_prctl, 701 ARCH_REQ_XCOMP_GUEST_PERM, 702 XFEATURE_XTILEDATA, 703 ) 704 }; 705 706 if amx_tile != 0 { 707 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 708 } else { 709 let mask: usize = 0; 710 // SAFETY: the mask being modified (not marked mutable as it is 711 // modified in unsafe only which is permitted) isn't in use elsewhere. 
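// Read the guest XCOMP permission mask back so the check below can confirm
// that XTILEDATA was actually granted.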
712 let result = unsafe { 713 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 714 }; 715 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 716 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 717 } 718 } 719 } 720 721 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 722 let mut cpu_list = Vec::new(); 723 for (proximity_domain, numa_node) in numa_nodes.iter() { 724 for cpu in numa_node.cpus.iter() { 725 cpu_list.push((*cpu, *proximity_domain)) 726 } 727 } 728 cpu_list 729 } 730 .into_iter() 731 .collect(); 732 733 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 734 cpu_affinity 735 .iter() 736 .map(|a| (a.vcpu, a.host_cpus.clone())) 737 .collect() 738 } else { 739 BTreeMap::new() 740 }; 741 742 #[cfg(feature = "tdx")] 743 let dynamic = !tdx_enabled; 744 #[cfg(not(feature = "tdx"))] 745 let dynamic = true; 746 747 Ok(Arc::new(Mutex::new(CpuManager { 748 config: config.clone(), 749 interrupt_controller: None, 750 #[cfg(target_arch = "x86_64")] 751 cpuid: Vec::new(), 752 vm, 753 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 754 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 755 vcpus_kick_signalled: Arc::new(AtomicBool::new(false)), 756 vcpu_states, 757 exit_evt, 758 reset_evt, 759 #[cfg(feature = "guest_debug")] 760 vm_debug_evt, 761 selected_cpu: 0, 762 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 763 seccomp_action, 764 vm_ops, 765 acpi_address: None, 766 proximity_domain_per_cpu, 767 affinity, 768 dynamic, 769 hypervisor: hypervisor.clone(), 770 #[cfg(feature = "sev_snp")] 771 sev_snp_enabled, 772 }))) 773 } 774 775 #[cfg(target_arch = "x86_64")] 776 pub fn populate_cpuid( 777 &mut self, 778 memory_manager: &Arc<Mutex<MemoryManager>>, 779 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 780 #[cfg(feature = "tdx")] tdx: bool, 781 ) -> Result<()> { 782 let sgx_epc_sections = memory_manager 783 .lock() 784 .unwrap() 785 .sgx_epc_region() 786 .as_ref() 787 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 788 789 self.cpuid = { 790 let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits); 791 arch::generate_common_cpuid( 792 hypervisor, 793 &arch::CpuidConfig { 794 sgx_epc_sections, 795 phys_bits, 796 kvm_hyperv: self.config.kvm_hyperv, 797 #[cfg(feature = "tdx")] 798 tdx, 799 amx: self.config.features.amx, 800 }, 801 ) 802 .map_err(Error::CommonCpuId)? 803 }; 804 805 Ok(()) 806 } 807 808 fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> { 809 info!("Creating vCPU: cpu_id = {}", cpu_id); 810 811 #[cfg(target_arch = "x86_64")] 812 let topology = self.get_vcpu_topology(); 813 #[cfg(target_arch = "x86_64")] 814 let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology); 815 #[cfg(target_arch = "aarch64")] 816 let x2apic_id = cpu_id as u32; 817 818 let mut vcpu = Vcpu::new( 819 cpu_id, 820 x2apic_id as u8, 821 &self.vm, 822 Some(self.vm_ops.clone()), 823 #[cfg(target_arch = "x86_64")] 824 self.hypervisor.get_cpu_vendor(), 825 )?; 826 827 if let Some(snapshot) = snapshot { 828 // AArch64 vCPUs should be initialized after created. 
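// init() has to run before set_state() below restores the saved register state.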
829 #[cfg(target_arch = "aarch64")] 830 vcpu.init(&self.vm)?; 831 832 let state: CpuState = snapshot.to_state().map_err(|e| { 833 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e)) 834 })?; 835 vcpu.vcpu 836 .set_state(&state) 837 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?; 838 839 vcpu.saved_state = Some(state); 840 } 841 842 let vcpu = Arc::new(Mutex::new(vcpu)); 843 844 // Adding vCPU to the CpuManager's vCPU list. 845 self.vcpus.push(vcpu.clone()); 846 847 Ok(vcpu) 848 } 849 850 pub fn configure_vcpu( 851 &self, 852 vcpu: Arc<Mutex<Vcpu>>, 853 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 854 ) -> Result<()> { 855 let mut vcpu = vcpu.lock().unwrap(); 856 857 #[cfg(feature = "sev_snp")] 858 if self.sev_snp_enabled { 859 if let Some((kernel_entry_point, _)) = boot_setup { 860 vcpu.set_sev_control_register( 861 kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE, 862 )?; 863 } 864 865 // Traditional way to configure vcpu doesn't work for SEV-SNP guests. 866 // All the vCPU configuration for SEV-SNP guest is provided via VMSA. 867 return Ok(()); 868 } 869 870 #[cfg(target_arch = "x86_64")] 871 assert!(!self.cpuid.is_empty()); 872 873 #[cfg(target_arch = "x86_64")] 874 let topology = self.config.topology.clone().map_or_else( 875 || Some((1, self.boot_vcpus(), 1)), 876 |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)), 877 ); 878 #[cfg(target_arch = "x86_64")] 879 vcpu.configure( 880 boot_setup, 881 self.cpuid.clone(), 882 self.config.kvm_hyperv, 883 topology, 884 )?; 885 886 #[cfg(target_arch = "aarch64")] 887 vcpu.configure(&self.vm, boot_setup)?; 888 889 Ok(()) 890 } 891 892 /// Only create new vCPUs if there aren't any inactive ones to reuse 893 fn create_vcpus( 894 &mut self, 895 desired_vcpus: u8, 896 snapshot: Option<Snapshot>, 897 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 898 let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![]; 899 info!( 900 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 901 desired_vcpus, 902 self.config.max_vcpus, 903 self.vcpus.len(), 904 self.present_vcpus() 905 ); 906 907 if desired_vcpus > self.config.max_vcpus { 908 return Err(Error::DesiredVCpuCountExceedsMax); 909 } 910 911 // Only create vCPUs in excess of all the allocated vCPUs. 912 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 913 vcpus.push(self.create_vcpu( 914 cpu_id, 915 // TODO: The special format of the CPU id can be removed once 916 // ready to break live upgrade. 917 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()), 918 )?); 919 } 920 921 Ok(vcpus) 922 } 923 924 #[cfg(target_arch = "aarch64")] 925 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 926 for cpu in self.vcpus.iter() { 927 let cpu = cpu.lock().unwrap(); 928 // Check if PMU attr is available, if not, log the information. 
929 if cpu.vcpu.has_pmu_support() { 930 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 931 } else { 932 debug!( 933 "PMU attribute is not supported in vCPU{}, skip PMU init!", 934 cpu.id 935 ); 936 return Ok(false); 937 } 938 } 939 940 Ok(true) 941 } 942 943 pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> { 944 self.vcpus.clone() 945 } 946 947 fn start_vcpu( 948 &mut self, 949 vcpu: Arc<Mutex<Vcpu>>, 950 vcpu_id: u8, 951 vcpu_thread_barrier: Arc<Barrier>, 952 inserting: bool, 953 ) -> Result<()> { 954 let reset_evt = self.reset_evt.try_clone().unwrap(); 955 let exit_evt = self.exit_evt.try_clone().unwrap(); 956 #[cfg(feature = "kvm")] 957 let hypervisor_type = self.hypervisor.hypervisor_type(); 958 #[cfg(feature = "guest_debug")] 959 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 960 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 961 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 962 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 963 let vcpu_kick_signalled = self.vcpus_kick_signalled.clone(); 964 965 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 966 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 967 .vcpu_run_interrupted 968 .clone(); 969 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 970 let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone(); 971 972 // Prepare the CPU set the current vCPU is expected to run onto. 973 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 974 // SAFETY: all zeros is a valid pattern 975 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 976 // SAFETY: FFI call, trivially safe 977 unsafe { libc::CPU_ZERO(&mut cpuset) }; 978 for host_cpu in host_cpus { 979 // SAFETY: FFI call, trivially safe 980 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) }; 981 } 982 cpuset 983 }); 984 985 // Retrieve seccomp filter for vcpu thread 986 let vcpu_seccomp_filter = get_seccomp_filter( 987 &self.seccomp_action, 988 Thread::Vcpu, 989 self.hypervisor.hypervisor_type(), 990 ) 991 .map_err(Error::CreateSeccompFilter)?; 992 993 #[cfg(target_arch = "x86_64")] 994 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 995 996 info!("Starting vCPU: cpu_id = {}", vcpu_id); 997 998 let handle = Some( 999 thread::Builder::new() 1000 .name(format!("vcpu{vcpu_id}")) 1001 .spawn(move || { 1002 // Schedule the thread to run on the expected CPU set 1003 if let Some(cpuset) = cpuset.as_ref() { 1004 // SAFETY: FFI call with correct arguments 1005 let ret = unsafe { 1006 libc::sched_setaffinity( 1007 0, 1008 std::mem::size_of::<libc::cpu_set_t>(), 1009 cpuset as *const libc::cpu_set_t, 1010 ) 1011 }; 1012 1013 if ret != 0 { 1014 error!( 1015 "Failed scheduling the vCPU {} on the expected CPU set: {}", 1016 vcpu_id, 1017 io::Error::last_os_error() 1018 ); 1019 return; 1020 } 1021 } 1022 1023 // Apply seccomp filter for vcpu thread. 1024 if !vcpu_seccomp_filter.is_empty() { 1025 if let Err(e) = 1026 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 1027 { 1028 error!("Error applying seccomp filter: {:?}", e); 1029 return; 1030 } 1031 } 1032 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 1033 // This uses an async signal safe handler to kill the vcpu handles. 1034 register_signal_handler(SIGRTMIN(), handle_signal) 1035 .expect("Failed to register vcpu signal handler"); 1036 // Block until all CPUs are ready. 
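// Every vCPU thread spawned in this round, plus the caller in activate_vcpus(),
// rendezvous on this barrier before entering the run loop.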
1037 vcpu_thread_barrier.wait(); 1038 1039 std::panic::catch_unwind(move || { 1040 loop { 1041 // If we are being told to pause, we park the thread 1042 // until the pause boolean is toggled. 1043 // The resume operation is responsible for toggling 1044 // the boolean and unpark the thread. 1045 // We enter a loop because park() could spuriously 1046 // return. We will then park() again unless the 1047 // pause boolean has been toggled. 1048 1049 // Need to use Ordering::SeqCst as we have multiple 1050 // loads and stores to different atomics and we need 1051 // to see them in a consistent order in all threads 1052 1053 if vcpu_pause_signalled.load(Ordering::SeqCst) { 1054 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are 1055 // completed by returning to KVM_RUN. From the kernel docs: 1056 // 1057 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, 1058 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding 1059 // operations are complete (and guest state is consistent) only after userspace 1060 // has re-entered the kernel with KVM_RUN. The kernel side will first finish 1061 // incomplete operations and then check for pending signals. 1062 // The pending state of the operation is not preserved in state which is 1063 // visible to userspace, thus userspace should ensure that the operation is 1064 // completed before performing a live migration. Userspace can re-enter the 1065 // guest with an unmasked signal pending or with the immediate_exit field set 1066 // to complete pending operations without allowing any further instructions 1067 // to be executed. 1068 1069 #[cfg(feature = "kvm")] 1070 if matches!(hypervisor_type, HypervisorType::Kvm) { 1071 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); 1072 if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { 1073 error!("Unexpected VM exit on \"immediate_exit\" run"); 1074 break; 1075 } 1076 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); 1077 } 1078 1079 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1080 1081 vcpu_paused.store(true, Ordering::SeqCst); 1082 while vcpu_pause_signalled.load(Ordering::SeqCst) { 1083 thread::park(); 1084 } 1085 vcpu_run_interrupted.store(false, Ordering::SeqCst); 1086 } 1087 1088 if vcpu_kick_signalled.load(Ordering::SeqCst) { 1089 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1090 #[cfg(target_arch = "x86_64")] 1091 match vcpu.lock().as_ref().unwrap().vcpu.nmi() { 1092 Ok(()) => {}, 1093 Err(e) => { 1094 error!("Error when inject nmi {}", e); 1095 break; 1096 } 1097 } 1098 } 1099 1100 // We've been told to terminate 1101 if vcpu_kill_signalled.load(Ordering::SeqCst) 1102 || vcpu_kill.load(Ordering::SeqCst) 1103 { 1104 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1105 break; 1106 } 1107 1108 #[cfg(feature = "tdx")] 1109 let mut vcpu = vcpu.lock().unwrap(); 1110 #[cfg(not(feature = "tdx"))] 1111 let vcpu = vcpu.lock().unwrap(); 1112 // vcpu.run() returns false on a triple-fault so trigger a reset 1113 match vcpu.run() { 1114 Ok(run) => match run { 1115 #[cfg(feature = "kvm")] 1116 VmExit::Debug => { 1117 info!("VmExit::Debug"); 1118 #[cfg(feature = "guest_debug")] 1119 { 1120 vcpu_pause_signalled.store(true, Ordering::SeqCst); 1121 let raw_tid = get_raw_tid(vcpu_id as usize); 1122 vm_debug_evt.write(raw_tid as u64).unwrap(); 1123 } 1124 } 1125 #[cfg(target_arch = "x86_64")] 1126 VmExit::IoapicEoi(vector) => { 1127 if let Some(interrupt_controller) = 1128 &interrupt_controller_clone 
1129 { 1130 interrupt_controller 1131 .lock() 1132 .unwrap() 1133 .end_of_interrupt(vector); 1134 } 1135 } 1136 VmExit::Ignore => {} 1137 VmExit::Hyperv => {} 1138 VmExit::Reset => { 1139 info!("VmExit::Reset"); 1140 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1141 reset_evt.write(1).unwrap(); 1142 break; 1143 } 1144 VmExit::Shutdown => { 1145 info!("VmExit::Shutdown"); 1146 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1147 exit_evt.write(1).unwrap(); 1148 break; 1149 } 1150 #[cfg(feature = "tdx")] 1151 VmExit::Tdx => { 1152 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1153 match vcpu.get_tdx_exit_details() { 1154 Ok(details) => match details { 1155 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1156 TdxExitDetails::SetupEventNotifyInterrupt => { 1157 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1158 } 1159 }, 1160 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1161 } 1162 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1163 } else { 1164 // We should never reach this code as 1165 // this means the design from the code 1166 // is wrong. 1167 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1168 } 1169 } 1170 }, 1171 1172 Err(e) => { 1173 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1174 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1175 exit_evt.write(1).unwrap(); 1176 break; 1177 } 1178 } 1179 1180 // We've been told to terminate 1181 if vcpu_kill_signalled.load(Ordering::SeqCst) 1182 || vcpu_kill.load(Ordering::SeqCst) 1183 { 1184 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1185 break; 1186 } 1187 } 1188 }) 1189 .or_else(|_| { 1190 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1191 error!("vCPU thread panicked"); 1192 panic_exit_evt.write(1) 1193 }) 1194 .ok(); 1195 }) 1196 .map_err(Error::VcpuSpawn)?, 1197 ); 1198 1199 // On hot plug calls into this function entry_point is None. It is for 1200 // those hotplug CPU additions that we need to set the inserting flag. 1201 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1202 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1203 1204 Ok(()) 1205 } 1206 1207 /// Start up as many vCPUs threads as needed to reach `desired_vcpus` 1208 fn activate_vcpus( 1209 &mut self, 1210 desired_vcpus: u8, 1211 inserting: bool, 1212 paused: Option<bool>, 1213 ) -> Result<()> { 1214 if desired_vcpus > self.config.max_vcpus { 1215 return Err(Error::DesiredVCpuCountExceedsMax); 1216 } 1217 1218 let vcpu_thread_barrier = Arc::new(Barrier::new( 1219 (desired_vcpus - self.present_vcpus() + 1) as usize, 1220 )); 1221 1222 if let Some(paused) = paused { 1223 self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); 1224 } 1225 1226 info!( 1227 "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", 1228 desired_vcpus, 1229 self.vcpus.len(), 1230 self.present_vcpus(), 1231 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1232 ); 1233 1234 // This reuses any inactive vCPUs as well as any that were newly created 1235 for vcpu_id in self.present_vcpus()..desired_vcpus { 1236 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1237 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1238 } 1239 1240 // Unblock all CPU threads. 
1241 vcpu_thread_barrier.wait(); 1242 Ok(()) 1243 } 1244 1245 fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) { 1246 // Mark vCPUs for removal, actual removal happens on ejection 1247 for cpu_id in desired_vcpus..self.present_vcpus() { 1248 self.vcpu_states[usize::from(cpu_id)].removing = true; 1249 self.vcpu_states[usize::from(cpu_id)] 1250 .pending_removal 1251 .store(true, Ordering::SeqCst); 1252 } 1253 } 1254 1255 pub fn check_pending_removed_vcpu(&mut self) -> bool { 1256 for state in self.vcpu_states.iter() { 1257 if state.active() && state.pending_removal.load(Ordering::SeqCst) { 1258 return true; 1259 } 1260 } 1261 false 1262 } 1263 1264 fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> { 1265 info!("Removing vCPU: cpu_id = {}", cpu_id); 1266 let state = &mut self.vcpu_states[usize::from(cpu_id)]; 1267 state.kill.store(true, Ordering::SeqCst); 1268 state.signal_thread(); 1269 state.join_thread()?; 1270 state.handle = None; 1271 1272 // Once the thread has exited, clear the "kill" so that it can reused 1273 state.kill.store(false, Ordering::SeqCst); 1274 state.pending_removal.store(false, Ordering::SeqCst); 1275 1276 Ok(()) 1277 } 1278 1279 pub fn create_boot_vcpus( 1280 &mut self, 1281 snapshot: Option<Snapshot>, 1282 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 1283 trace_scoped!("create_boot_vcpus"); 1284 1285 self.create_vcpus(self.boot_vcpus(), snapshot) 1286 } 1287 1288 // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running. 1289 pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> { 1290 self.activate_vcpus(self.boot_vcpus(), false, Some(paused)) 1291 } 1292 1293 pub fn start_restored_vcpus(&mut self) -> Result<()> { 1294 self.activate_vcpus(self.vcpus.len() as u8, false, Some(true)) 1295 .map_err(|e| { 1296 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) 1297 })?; 1298 1299 Ok(()) 1300 } 1301 1302 pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> { 1303 if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal { 1304 return Ok(false); 1305 } 1306 1307 if !self.dynamic { 1308 return Ok(false); 1309 } 1310 1311 if self.check_pending_removed_vcpu() { 1312 return Err(Error::VcpuPendingRemovedVcpu); 1313 } 1314 1315 match desired_vcpus.cmp(&self.present_vcpus()) { 1316 cmp::Ordering::Greater => { 1317 let vcpus = self.create_vcpus(desired_vcpus, None)?; 1318 for vcpu in vcpus { 1319 self.configure_vcpu(vcpu, None)? 1320 } 1321 self.activate_vcpus(desired_vcpus, true, None)?; 1322 Ok(true) 1323 } 1324 cmp::Ordering::Less => { 1325 self.mark_vcpus_for_removal(desired_vcpus); 1326 Ok(true) 1327 } 1328 _ => Ok(false), 1329 } 1330 } 1331 1332 pub fn shutdown(&mut self) -> Result<()> { 1333 // Tell the vCPUs to stop themselves next time they go through the loop 1334 self.vcpus_kill_signalled.store(true, Ordering::SeqCst); 1335 1336 // Toggle the vCPUs pause boolean 1337 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 1338 1339 // Unpark all the VCPU threads. 1340 for state in self.vcpu_states.iter() { 1341 state.unpark_thread(); 1342 } 1343 1344 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 1345 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 1346 // above. 1347 for state in self.vcpu_states.iter() { 1348 state.signal_thread(); 1349 } 1350 1351 // Wait for all the threads to finish. This removes the state from the vector. 1352 for mut state in self.vcpu_states.drain(..) 
{ 1353 state.join_thread()?; 1354 } 1355 1356 Ok(()) 1357 } 1358 1359 #[cfg(feature = "tdx")] 1360 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1361 for vcpu in &self.vcpus { 1362 vcpu.lock() 1363 .unwrap() 1364 .vcpu 1365 .tdx_init(hob_address) 1366 .map_err(Error::InitializeTdx)?; 1367 } 1368 Ok(()) 1369 } 1370 1371 pub fn boot_vcpus(&self) -> u8 { 1372 self.config.boot_vcpus 1373 } 1374 1375 pub fn max_vcpus(&self) -> u8 { 1376 self.config.max_vcpus 1377 } 1378 1379 #[cfg(target_arch = "x86_64")] 1380 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1381 assert!(!self.cpuid.is_empty()); 1382 self.cpuid.clone() 1383 } 1384 1385 fn present_vcpus(&self) -> u8 { 1386 self.vcpu_states 1387 .iter() 1388 .fold(0, |acc, state| acc + state.active() as u8) 1389 } 1390 1391 #[cfg(target_arch = "aarch64")] 1392 pub fn get_mpidrs(&self) -> Vec<u64> { 1393 self.vcpus 1394 .iter() 1395 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1396 .collect() 1397 } 1398 1399 #[cfg(target_arch = "aarch64")] 1400 pub fn get_saved_states(&self) -> Vec<CpuState> { 1401 self.vcpus 1402 .iter() 1403 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1404 .collect() 1405 } 1406 1407 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1408 self.config 1409 .topology 1410 .clone() 1411 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1412 } 1413 1414 pub fn create_madt(&self) -> Sdt { 1415 use crate::acpi; 1416 // This is also checked in the commandline parsing. 1417 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1418 1419 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1420 #[cfg(target_arch = "x86_64")] 1421 { 1422 madt.write(36, arch::layout::APIC_START.0); 1423 1424 for cpu in 0..self.config.max_vcpus { 1425 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology()); 1426 1427 let lapic = LocalX2Apic { 1428 r#type: acpi::ACPI_X2APIC_PROCESSOR, 1429 length: 16, 1430 processor_id: cpu.into(), 1431 apic_id: x2apic_id, 1432 flags: if cpu < self.config.boot_vcpus { 1433 1 << MADT_CPU_ENABLE_FLAG 1434 } else { 1435 0 1436 } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG, 1437 _reserved: 0, 1438 }; 1439 madt.append(lapic); 1440 } 1441 1442 madt.append(Ioapic { 1443 r#type: acpi::ACPI_APIC_IO, 1444 length: 12, 1445 ioapic_id: 0, 1446 apic_address: arch::layout::IOAPIC_START.0 as u32, 1447 gsi_base: 0, 1448 ..Default::default() 1449 }); 1450 1451 madt.append(InterruptSourceOverride { 1452 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1453 length: 10, 1454 bus: 0, 1455 source: 4, 1456 gsi: 4, 1457 flags: 0, 1458 }); 1459 } 1460 1461 #[cfg(target_arch = "aarch64")] 1462 { 1463 /* Notes: 1464 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1465 */ 1466 1467 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 
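// One GICC entry is generated per boot vCPU; only the MPIDR affinity fields
// (Aff0..Aff3) are reported, using the mask applied below.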
1468 for cpu in 0..self.config.boot_vcpus { 1469 let vcpu = &self.vcpus[cpu as usize]; 1470 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1471 /* ARMv8 MPIDR format: 1472 Bits [63:40] Must be zero 1473 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1474 Bits [31:24] Must be zero 1475 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1476 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1477 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1478 */ 1479 let mpidr_mask = 0xff_00ff_ffff; 1480 let gicc = GicC { 1481 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1482 length: 80, 1483 reserved0: 0, 1484 cpu_interface_number: cpu as u32, 1485 uid: cpu as u32, 1486 flags: 1, 1487 parking_version: 0, 1488 performance_interrupt: 0, 1489 parked_address: 0, 1490 base_address: 0, 1491 gicv_base_address: 0, 1492 gich_base_address: 0, 1493 vgic_interrupt: 0, 1494 gicr_base_address: 0, 1495 mpidr: mpidr & mpidr_mask, 1496 proc_power_effi_class: 0, 1497 reserved1: 0, 1498 spe_overflow_interrupt: 0, 1499 }; 1500 1501 madt.append(gicc); 1502 } 1503 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1504 1505 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1506 let gicd = GicD { 1507 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1508 length: 24, 1509 reserved0: 0, 1510 gic_id: 0, 1511 base_address: vgic_config.dist_addr, 1512 global_irq_base: 0, 1513 version: 3, 1514 reserved1: [0; 3], 1515 }; 1516 madt.append(gicd); 1517 1518 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1519 let gicr = GicR { 1520 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1521 length: 16, 1522 reserved: 0, 1523 base_address: vgic_config.redists_addr, 1524 range_length: vgic_config.redists_size as u32, 1525 }; 1526 madt.append(gicr); 1527 1528 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 1529 let gicits = GicIts { 1530 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1531 length: 20, 1532 reserved0: 0, 1533 translation_id: 0, 1534 base_address: vgic_config.msi_addr, 1535 reserved1: 0, 1536 }; 1537 madt.append(gicits); 1538 1539 madt.update_checksum(); 1540 } 1541 1542 madt 1543 } 1544 1545 #[cfg(target_arch = "aarch64")] 1546 pub fn create_pptt(&self) -> Sdt { 1547 let pptt_start = 0; 1548 let mut cpus = 0; 1549 let mut uid = 0; 1550 // If topology is not specified, the default setting is: 1551 // 1 package, multiple cores, 1 thread per core 1552 // This is also the behavior when PPTT is missing. 
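// Worked example (illustrative): with threads_per_core = 2, cores_per_package = 2
// and packages = 1, the loops below emit one package node, two core nodes and two
// thread leaf nodes per core, i.e. 4 CPUs in total.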
1553 let (threads_per_core, cores_per_package, packages) = 1554 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1555 1556 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1557 1558 for cluster_idx in 0..packages { 1559 if cpus < self.config.boot_vcpus as usize { 1560 let cluster_offset = pptt.len() - pptt_start; 1561 let cluster_hierarchy_node = ProcessorHierarchyNode { 1562 r#type: 0, 1563 length: 20, 1564 reserved: 0, 1565 flags: 0x2, 1566 parent: 0, 1567 acpi_processor_id: cluster_idx as u32, 1568 num_private_resources: 0, 1569 }; 1570 pptt.append(cluster_hierarchy_node); 1571 1572 for core_idx in 0..cores_per_package { 1573 let core_offset = pptt.len() - pptt_start; 1574 1575 if threads_per_core > 1 { 1576 let core_hierarchy_node = ProcessorHierarchyNode { 1577 r#type: 0, 1578 length: 20, 1579 reserved: 0, 1580 flags: 0x2, 1581 parent: cluster_offset as u32, 1582 acpi_processor_id: core_idx as u32, 1583 num_private_resources: 0, 1584 }; 1585 pptt.append(core_hierarchy_node); 1586 1587 for _thread_idx in 0..threads_per_core { 1588 let thread_hierarchy_node = ProcessorHierarchyNode { 1589 r#type: 0, 1590 length: 20, 1591 reserved: 0, 1592 flags: 0xE, 1593 parent: core_offset as u32, 1594 acpi_processor_id: uid as u32, 1595 num_private_resources: 0, 1596 }; 1597 pptt.append(thread_hierarchy_node); 1598 uid += 1; 1599 } 1600 } else { 1601 let thread_hierarchy_node = ProcessorHierarchyNode { 1602 r#type: 0, 1603 length: 20, 1604 reserved: 0, 1605 flags: 0xA, 1606 parent: cluster_offset as u32, 1607 acpi_processor_id: uid as u32, 1608 num_private_resources: 0, 1609 }; 1610 pptt.append(thread_hierarchy_node); 1611 uid += 1; 1612 } 1613 } 1614 cpus += (cores_per_package * threads_per_core) as usize; 1615 } 1616 } 1617 1618 pptt.update_checksum(); 1619 pptt 1620 } 1621 1622 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1623 fn create_standard_regs(&self, cpu_id: u8) -> StandardRegisters { 1624 self.vcpus[usize::from(cpu_id)] 1625 .lock() 1626 .unwrap() 1627 .vcpu 1628 .create_standard_regs() 1629 } 1630 1631 #[cfg(feature = "guest_debug")] 1632 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1633 self.vcpus[usize::from(cpu_id)] 1634 .lock() 1635 .unwrap() 1636 .vcpu 1637 .get_regs() 1638 .map_err(Error::CpuDebug) 1639 } 1640 1641 #[cfg(feature = "guest_debug")] 1642 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1643 self.vcpus[usize::from(cpu_id)] 1644 .lock() 1645 .unwrap() 1646 .vcpu 1647 .set_regs(regs) 1648 .map_err(Error::CpuDebug) 1649 } 1650 1651 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1652 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1653 self.vcpus[usize::from(cpu_id)] 1654 .lock() 1655 .unwrap() 1656 .vcpu 1657 .get_sregs() 1658 .map_err(Error::CpuDebug) 1659 } 1660 1661 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1662 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1663 self.vcpus[usize::from(cpu_id)] 1664 .lock() 1665 .unwrap() 1666 .vcpu 1667 .set_sregs(sregs) 1668 .map_err(Error::CpuDebug) 1669 } 1670 1671 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1672 fn translate_gva( 1673 &self, 1674 _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1675 cpu_id: u8, 1676 gva: u64, 1677 ) -> Result<u64> { 1678 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1679 .lock() 1680 .unwrap() 1681 .vcpu 1682 .translate_gva(gva, /* flags: unused */ 0) 1683 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 
1684 Ok(gpa) 1685 } 1686 1687 /// 1688 /// On AArch64, the `translate_gva` API is not provided by KVM. We implement 1689 /// it in the VMM by walking the translation tables. 1690 /// 1691 /// Address translation is a big topic; here we only focus on the scenario that 1692 /// arises in the VMM while debugging the guest kernel. This `translate_gva` 1693 /// implementation is restricted to: 1694 /// - Exception Level 1 1695 /// - Translate high address range only (kernel space) 1696 /// 1697 /// This implementation supports the following Armv8-A features related to 1698 /// address translation: 1699 /// - FEAT_LPA 1700 /// - FEAT_LVA 1701 /// - FEAT_LPA2 1702 /// 1703 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 1704 fn translate_gva( 1705 &self, 1706 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1707 cpu_id: u8, 1708 gva: u64, 1709 ) -> Result<u64> { 1710 let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)] 1711 .lock() 1712 .unwrap() 1713 .vcpu 1714 .get_sys_reg(regs::TCR_EL1) 1715 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1716 let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)] 1717 .lock() 1718 .unwrap() 1719 .vcpu 1720 .get_sys_reg(regs::TTBR1_EL1) 1721 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1722 let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)] 1723 .lock() 1724 .unwrap() 1725 .vcpu 1726 .get_sys_reg(regs::ID_AA64MMFR0_EL1) 1727 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1728 1729 // Bit 55 of the VA determines the range, high (0xFFFxxx...) 1730 // or low (0x000xxx...). 1731 let high_range = extract_bits_64!(gva, 55, 1); 1732 if high_range == 0 { 1733 info!("VA (0x{:x}) range is not supported!", gva); 1734 return Ok(gva); 1735 } 1736 1737 // High range size offset 1738 let tsz = extract_bits_64!(tcr_el1, 16, 6); 1739 // Granule size 1740 let tg = extract_bits_64!(tcr_el1, 30, 2); 1741 // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2 1742 let ds = extract_bits_64!(tcr_el1, 59, 1); 1743 1744 if tsz == 0 { 1745 info!("VA translation is not ready!"); 1746 return Ok(gva); 1747 } 1748 1749 // VA size is determined by TCR_EL1.T1SZ 1750 let va_size = 64 - tsz; 1751 // Number of bits in VA consumed in each level of translation 1752 let stride = match tg { 1753 3 => 13, // 64KB granule size 1754 1 => 11, // 16KB granule size 1755 _ => 9, // 4KB, default 1756 }; 1757 // Starting level of walking 1758 let mut level = 4 - (va_size - 4) / stride; 1759 1760 // Determine the PA or IPA size 1761 let tcr_ips = extract_bits_64!(tcr_el1, 32, 3); 1762 let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4); 1763 // The IPA size in TCR_EL1 and the PA range in ID_AA64MMFR0_EL1 should match. 1764 // To be safe, we use the minimum value if they are different. 1765 let pa_range = std::cmp::min(tcr_ips, pa_range); 1766 // PA size in bits 1767 let pa_size = match pa_range { 1768 0 => 32, 1769 1 => 36, 1770 2 => 40, 1771 3 => 42, 1772 4 => 44, 1773 5 => 48, 1774 6 => 52, 1775 _ => { 1776 return Err(Error::TranslateVirtualAddress(anyhow!(format!( 1777 "PA range not supported {pa_range}" 1778 )))) 1779 } 1780 }; 1781 1782 let indexmask_grainsize = (!0u64) >> (64 - (stride + 3)); 1783 let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level)))); 1784 // If FEAT_LPA2 is present, the translation table descriptor holds 1785 // 50 bits of the table address of the next level. 1786 // Otherwise, it is 48 bits. 
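// Worked example (illustrative, 4KB granule, ds == 0): stride = 9, so
// indexmask_grainsize = 0xFFF (bits [11:0]) and descaddrmask below keeps
// bits [47:12] of each descriptor.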
1787 let descaddrmask = if ds == 1 { 1788 !0u64 >> (64 - 50) // mask with 50 least significant bits 1789 } else { 1790 !0u64 >> (64 - 48) // mask with 48 least significant bits 1791 }; 1792 let descaddrmask = descaddrmask & !indexmask_grainsize; 1793 1794 // Translation table base address 1795 let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48); 1796 // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table 1797 // address bits [48:51] comes from TTBR1_EL1 bits [2:5]. 1798 if pa_size == 52 { 1799 descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48; 1800 } 1801 1802 // Loop through tables of each level 1803 loop { 1804 // Table offset for current level 1805 let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask; 1806 descaddr |= table_offset; 1807 descaddr &= !7u64; 1808 1809 let mut buf = [0; 8]; 1810 guest_memory 1811 .memory() 1812 .read(&mut buf, GuestAddress(descaddr)) 1813 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1814 let descriptor = u64::from_le_bytes(buf); 1815 1816 descaddr = descriptor & descaddrmask; 1817 // In the case of FEAT_LPA, the next-level translation table address 1818 // bits [48:51] comes from bits [12:15] of the current descriptor. 1819 // For FEAT_LPA2, the next-level translation table address 1820 // bits [50:51] comes from bits [8:9] of the current descriptor, 1821 // bits [48:49] comes from bits [48:49] of the descriptor which was 1822 // handled previously. 1823 if pa_size == 52 { 1824 if ds == 1 { 1825 // FEAT_LPA2 1826 descaddr |= extract_bits_64!(descriptor, 8, 2) << 50; 1827 } else { 1828 // FEAT_LPA 1829 descaddr |= extract_bits_64!(descriptor, 12, 4) << 48; 1830 } 1831 } 1832 1833 if (descriptor & 2) != 0 && (level < 3) { 1834 // This is a table entry. Go down to next level. 
1835 level += 1; 1836 indexmask = indexmask_grainsize; 1837 continue; 1838 } 1839 1840 break; 1841 } 1842 1843 // We have reached either: 1844 // - a page entry at level 3 or 1845 // - a block entry at level 1 or 2 1846 let page_size = 1u64 << ((stride * (4 - level)) + 3); 1847 descaddr &= !(page_size - 1); 1848 descaddr |= gva & (page_size - 1); 1849 1850 Ok(descaddr) 1851 } 1852 1853 pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) { 1854 self.acpi_address = Some(acpi_address); 1855 } 1856 1857 pub(crate) fn set_interrupt_controller( 1858 &mut self, 1859 interrupt_controller: Arc<Mutex<dyn InterruptController>>, 1860 ) { 1861 self.interrupt_controller = Some(interrupt_controller); 1862 } 1863 1864 pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> { 1865 &self.vcpus_kill_signalled 1866 } 1867 1868 #[cfg(feature = "igvm")] 1869 pub(crate) fn get_cpuid_leaf( 1870 &self, 1871 cpu_id: u8, 1872 eax: u32, 1873 ecx: u32, 1874 xfem: u64, 1875 xss: u64, 1876 ) -> Result<[u32; 4]> { 1877 let leaf_info = self.vcpus[usize::from(cpu_id)] 1878 .lock() 1879 .unwrap() 1880 .vcpu 1881 .get_cpuid_values(eax, ecx, xfem, xss) 1882 .unwrap(); 1883 Ok(leaf_info) 1884 } 1885 1886 #[cfg(feature = "sev_snp")] 1887 pub(crate) fn sev_snp_enabled(&self) -> bool { 1888 self.sev_snp_enabled 1889 } 1890 1891 pub(crate) fn nmi(&self) -> Result<()> { 1892 self.vcpus_kick_signalled.store(true, Ordering::SeqCst); 1893 1894 for state in self.vcpu_states.iter() { 1895 state.signal_thread(); 1896 } 1897 1898 self.vcpus_kick_signalled.store(false, Ordering::SeqCst); 1899 1900 Ok(()) 1901 } 1902 } 1903 1904 struct Cpu { 1905 cpu_id: u8, 1906 proximity_domain: u32, 1907 dynamic: bool, 1908 #[cfg(target_arch = "x86_64")] 1909 topology: Option<(u8, u8, u8)>, 1910 } 1911 1912 #[cfg(target_arch = "x86_64")] 1913 const MADT_CPU_ENABLE_FLAG: usize = 0; 1914 1915 #[cfg(target_arch = "x86_64")] 1916 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; 1917 1918 impl Cpu { 1919 #[cfg(target_arch = "x86_64")] 1920 fn generate_mat(&self) -> Vec<u8> { 1921 let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology); 1922 1923 let lapic = LocalX2Apic { 1924 r#type: crate::acpi::ACPI_X2APIC_PROCESSOR, 1925 length: 16, 1926 processor_id: self.cpu_id.into(), 1927 apic_id: x2apic_id, 1928 flags: 1 << MADT_CPU_ENABLE_FLAG, 1929 _reserved: 0, 1930 }; 1931 1932 let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)]; 1933 // SAFETY: mat_data is large enough to hold lapic 1934 unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic }; 1935 1936 mat_data 1937 } 1938 } 1939 1940 impl Aml for Cpu { 1941 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1942 #[cfg(target_arch = "x86_64")] 1943 let mat_data: Vec<u8> = self.generate_mat(); 1944 #[allow(clippy::if_same_then_else)] 1945 if self.dynamic { 1946 aml::Device::new( 1947 format!("C{:03X}", self.cpu_id).as_str().into(), 1948 vec![ 1949 &aml::Name::new("_HID".into(), &"ACPI0007"), 1950 &aml::Name::new("_UID".into(), &self.cpu_id), 1951 // Currently, AArch64 cannot support following fields. 1952 /* 1953 _STA return value: 1954 Bit [0] – Set if the device is present. 1955 Bit [1] – Set if the device is enabled and decoding its resources. 1956 Bit [2] – Set if the device should be shown in the UI. 1957 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1958 Bit [4] – Set if the battery is present. 1959 Bits [31:5] – Reserved (must be cleared). 
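With the dynamic path below, _STA returns whatever the CSTA method reads back
from the CPU hotplug controller; the static path further down simply returns
0xF (present, enabled, shown in UI, functioning).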
1960 */ 1961 #[cfg(target_arch = "x86_64")] 1962 &aml::Method::new( 1963 "_STA".into(), 1964 0, 1965 false, 1966 // Call into CSTA method which will interrogate device 1967 vec![&aml::Return::new(&aml::MethodCall::new( 1968 "CSTA".into(), 1969 vec![&self.cpu_id], 1970 ))], 1971 ), 1972 &aml::Method::new( 1973 "_PXM".into(), 1974 0, 1975 false, 1976 vec![&aml::Return::new(&self.proximity_domain)], 1977 ), 1978 // The Linux kernel expects every CPU device to have a _MAT entry 1979 // containing the LAPIC for this processor with the enabled bit set 1980 // even it if is disabled in the MADT (non-boot CPU) 1981 #[cfg(target_arch = "x86_64")] 1982 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 1983 // Trigger CPU ejection 1984 #[cfg(target_arch = "x86_64")] 1985 &aml::Method::new( 1986 "_EJ0".into(), 1987 1, 1988 false, 1989 // Call into CEJ0 method which will actually eject device 1990 vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])], 1991 ), 1992 ], 1993 ) 1994 .to_aml_bytes(sink); 1995 } else { 1996 aml::Device::new( 1997 format!("C{:03X}", self.cpu_id).as_str().into(), 1998 vec![ 1999 &aml::Name::new("_HID".into(), &"ACPI0007"), 2000 &aml::Name::new("_UID".into(), &self.cpu_id), 2001 #[cfg(target_arch = "x86_64")] 2002 &aml::Method::new( 2003 "_STA".into(), 2004 0, 2005 false, 2006 // Mark CPU present see CSTA implementation 2007 vec![&aml::Return::new(&0xfu8)], 2008 ), 2009 &aml::Method::new( 2010 "_PXM".into(), 2011 0, 2012 false, 2013 vec![&aml::Return::new(&self.proximity_domain)], 2014 ), 2015 // The Linux kernel expects every CPU device to have a _MAT entry 2016 // containing the LAPIC for this processor with the enabled bit set 2017 // even it if is disabled in the MADT (non-boot CPU) 2018 #[cfg(target_arch = "x86_64")] 2019 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 2020 ], 2021 ) 2022 .to_aml_bytes(sink); 2023 } 2024 } 2025 } 2026 2027 struct CpuNotify { 2028 cpu_id: u8, 2029 } 2030 2031 impl Aml for CpuNotify { 2032 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2033 let object = aml::Path::new(&format!("C{:03X}", self.cpu_id)); 2034 aml::If::new( 2035 &aml::Equal::new(&aml::Arg(0), &self.cpu_id), 2036 vec![&aml::Notify::new(&object, &aml::Arg(1))], 2037 ) 2038 .to_aml_bytes(sink) 2039 } 2040 } 2041 2042 struct CpuMethods { 2043 max_vcpus: u8, 2044 dynamic: bool, 2045 } 2046 2047 impl Aml for CpuMethods { 2048 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2049 if self.dynamic { 2050 // CPU status method 2051 aml::Method::new( 2052 "CSTA".into(), 2053 1, 2054 true, 2055 vec![ 2056 // Take lock defined above 2057 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2058 // Write CPU number (in first argument) to I/O port via field 2059 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 2060 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2061 // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning) 2062 &aml::If::new( 2063 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE), 2064 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 2065 ), 2066 // Release lock 2067 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2068 // Return 0 or 0xf 2069 &aml::Return::new(&aml::Local(0)), 2070 ], 2071 ) 2072 .to_aml_bytes(sink); 2073 2074 let mut cpu_notifies = Vec::new(); 2075 for cpu_id in 0..self.max_vcpus { 2076 cpu_notifies.push(CpuNotify { cpu_id }); 2077 } 2078 2079 let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new(); 2080 for cpu_id in 
0..self.max_vcpus { 2081 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 2082 } 2083 2084 aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink); 2085 2086 aml::Method::new( 2087 "CEJ0".into(), 2088 1, 2089 true, 2090 vec![ 2091 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2092 // Write CPU number (in first argument) to I/O port via field 2093 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 2094 // Set CEJ0 bit 2095 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 2096 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2097 ], 2098 ) 2099 .to_aml_bytes(sink); 2100 2101 aml::Method::new( 2102 "CSCN".into(), 2103 0, 2104 true, 2105 vec![ 2106 // Take lock defined above 2107 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2108 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2109 &aml::While::new( 2110 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 2111 vec![ 2112 // Write CPU number (in first argument) to I/O port via field 2113 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 2114 // Check if CINS bit is set 2115 &aml::If::new( 2116 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 2117 // Notify device if it is 2118 vec![ 2119 &aml::MethodCall::new( 2120 "CTFY".into(), 2121 vec![&aml::Local(0), &aml::ONE], 2122 ), 2123 // Reset CINS bit 2124 &aml::Store::new( 2125 &aml::Path::new("\\_SB_.PRES.CINS"), 2126 &aml::ONE, 2127 ), 2128 ], 2129 ), 2130 // Check if CRMV bit is set 2131 &aml::If::new( 2132 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 2133 // Notify device if it is (with the eject constant 0x3) 2134 vec![ 2135 &aml::MethodCall::new( 2136 "CTFY".into(), 2137 vec![&aml::Local(0), &3u8], 2138 ), 2139 // Reset CRMV bit 2140 &aml::Store::new( 2141 &aml::Path::new("\\_SB_.PRES.CRMV"), 2142 &aml::ONE, 2143 ), 2144 ], 2145 ), 2146 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 2147 ], 2148 ), 2149 // Release lock 2150 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2151 ], 2152 ) 2153 .to_aml_bytes(sink) 2154 } else { 2155 aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink) 2156 } 2157 } 2158 } 2159 2160 impl Aml for CpuManager { 2161 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2162 #[cfg(target_arch = "x86_64")] 2163 if let Some(acpi_address) = self.acpi_address { 2164 // CPU hotplug controller 2165 aml::Device::new( 2166 "_SB_.PRES".into(), 2167 vec![ 2168 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 2169 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 2170 // Mutex to protect concurrent access as we write to choose CPU and then read back status 2171 &aml::Mutex::new("CPLK".into(), 0), 2172 &aml::Name::new( 2173 "_CRS".into(), 2174 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2175 aml::AddressSpaceCacheable::NotCacheable, 2176 true, 2177 acpi_address.0, 2178 acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 2179 None, 2180 )]), 2181 ), 2182 // OpRegion and Fields map MMIO range into individual field values 2183 &aml::OpRegion::new( 2184 "PRST".into(), 2185 aml::OpRegionSpace::SystemMemory, 2186 &(acpi_address.0 as usize), 2187 &CPU_MANAGER_ACPI_SIZE, 2188 ), 2189 &aml::Field::new( 2190 "PRST".into(), 2191 aml::FieldAccessType::Byte, 2192 aml::FieldLockRule::NoLock, 2193 aml::FieldUpdateRule::WriteAsZeroes, 2194 vec![ 2195 aml::FieldEntry::Reserved(32), 2196 aml::FieldEntry::Named(*b"CPEN", 1), 2197 aml::FieldEntry::Named(*b"CINS", 1), 2198 
aml::FieldEntry::Named(*b"CRMV", 1), 2199 aml::FieldEntry::Named(*b"CEJ0", 1), 2200 aml::FieldEntry::Reserved(4), 2201 aml::FieldEntry::Named(*b"CCMD", 8), 2202 ], 2203 ), 2204 &aml::Field::new( 2205 "PRST".into(), 2206 aml::FieldAccessType::DWord, 2207 aml::FieldLockRule::NoLock, 2208 aml::FieldUpdateRule::Preserve, 2209 vec![ 2210 aml::FieldEntry::Named(*b"CSEL", 32), 2211 aml::FieldEntry::Reserved(32), 2212 aml::FieldEntry::Named(*b"CDAT", 32), 2213 ], 2214 ), 2215 ], 2216 ) 2217 .to_aml_bytes(sink); 2218 } 2219 2220 // CPU devices 2221 let hid = aml::Name::new("_HID".into(), &"ACPI0010"); 2222 let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05")); 2223 // Bundle methods together under a common object 2224 let methods = CpuMethods { 2225 max_vcpus: self.config.max_vcpus, 2226 dynamic: self.dynamic, 2227 }; 2228 let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods]; 2229 2230 #[cfg(target_arch = "x86_64")] 2231 let topology = self.get_vcpu_topology(); 2232 let mut cpu_devices = Vec::new(); 2233 for cpu_id in 0..self.config.max_vcpus { 2234 let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0); 2235 let cpu_device = Cpu { 2236 cpu_id, 2237 proximity_domain, 2238 dynamic: self.dynamic, 2239 #[cfg(target_arch = "x86_64")] 2240 topology, 2241 }; 2242 2243 cpu_devices.push(cpu_device); 2244 } 2245 2246 for cpu_device in cpu_devices.iter() { 2247 cpu_data_inner.push(cpu_device); 2248 } 2249 2250 aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink) 2251 } 2252 } 2253 2254 impl Pausable for CpuManager { 2255 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2256 // Tell the vCPUs to pause themselves next time they exit 2257 self.vcpus_pause_signalled.store(true, Ordering::SeqCst); 2258 2259 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 2260 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 2261 // above. 2262 for state in self.vcpu_states.iter() { 2263 state.signal_thread(); 2264 } 2265 2266 for vcpu in self.vcpus.iter() { 2267 let mut vcpu = vcpu.lock().unwrap(); 2268 vcpu.pause()?; 2269 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2270 if !self.config.kvm_hyperv { 2271 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { 2272 MigratableError::Pause(anyhow!( 2273 "Could not notify guest it has been paused {:?}", 2274 e 2275 )) 2276 })?; 2277 } 2278 } 2279 2280 // The vCPU thread will change its paused state before parking, wait here for each 2281 // activated vCPU change their state to ensure they have parked. 2282 for state in self.vcpu_states.iter() { 2283 if state.active() { 2284 while !state.paused.load(Ordering::SeqCst) { 2285 // To avoid a priority inversion with the vCPU thread 2286 thread::sleep(std::time::Duration::from_millis(1)); 2287 } 2288 } 2289 } 2290 2291 Ok(()) 2292 } 2293 2294 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2295 for vcpu in self.vcpus.iter() { 2296 vcpu.lock().unwrap().resume()?; 2297 } 2298 2299 // Toggle the vCPUs pause boolean 2300 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 2301 2302 // Unpark all the VCPU threads. 2303 // Once unparked, the next thing they will do is checking for the pause 2304 // boolean. Since it'll be set to false, they will exit their pause loop 2305 // and go back to vmx root. 
2306 for state in self.vcpu_states.iter() { 2307 state.paused.store(false, Ordering::SeqCst); 2308 state.unpark_thread(); 2309 } 2310 Ok(()) 2311 } 2312 } 2313 2314 impl Snapshottable for CpuManager { 2315 fn id(&self) -> String { 2316 CPU_MANAGER_SNAPSHOT_ID.to_string() 2317 } 2318 2319 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2320 let mut cpu_manager_snapshot = Snapshot::default(); 2321 2322 // The CpuManager snapshot is a collection of all vCPUs snapshots. 2323 for vcpu in &self.vcpus { 2324 let mut vcpu = vcpu.lock().unwrap(); 2325 cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?); 2326 } 2327 2328 Ok(cpu_manager_snapshot) 2329 } 2330 } 2331 2332 impl Transportable for CpuManager {} 2333 impl Migratable for CpuManager {} 2334 2335 #[cfg(feature = "guest_debug")] 2336 impl Debuggable for CpuManager { 2337 #[cfg(feature = "kvm")] 2338 fn set_guest_debug( 2339 &self, 2340 cpu_id: usize, 2341 addrs: &[GuestAddress], 2342 singlestep: bool, 2343 ) -> std::result::Result<(), DebuggableError> { 2344 self.vcpus[cpu_id] 2345 .lock() 2346 .unwrap() 2347 .vcpu 2348 .set_guest_debug(addrs, singlestep) 2349 .map_err(DebuggableError::SetDebug) 2350 } 2351 2352 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2353 Ok(()) 2354 } 2355 2356 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2357 Ok(()) 2358 } 2359 2360 #[cfg(target_arch = "x86_64")] 2361 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2362 // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15 2363 let gregs = self 2364 .get_regs(cpu_id as u8) 2365 .map_err(DebuggableError::ReadRegs)?; 2366 let regs = [ 2367 gregs.get_rax(), 2368 gregs.get_rbx(), 2369 gregs.get_rcx(), 2370 gregs.get_rdx(), 2371 gregs.get_rsi(), 2372 gregs.get_rdi(), 2373 gregs.get_rbp(), 2374 gregs.get_rsp(), 2375 gregs.get_r8(), 2376 gregs.get_r9(), 2377 gregs.get_r10(), 2378 gregs.get_r11(), 2379 gregs.get_r12(), 2380 gregs.get_r13(), 2381 gregs.get_r14(), 2382 gregs.get_r15(), 2383 ]; 2384 2385 // GDB exposes 32-bit eflags instead of 64-bit rflags. 
        // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
        let eflags = gregs.get_rflags() as u32;
        let rip = gregs.get_rip();

        // Segment registers: CS, SS, DS, ES, FS, GS
        let sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let segments = X86SegmentRegs {
            cs: sregs.cs.selector as u32,
            ss: sregs.ss.selector as u32,
            ds: sregs.ds.selector as u32,
            es: sregs.es.selector as u32,
            fs: sregs.fs.selector as u32,
            gs: sregs.gs.selector as u32,
        };

        // TODO: Add other registers

        Ok(CoreRegs {
            regs,
            eflags,
            rip,
            segments,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "aarch64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        Ok(CoreRegs {
            x: gregs.get_regs(),
            sp: gregs.get_sp(),
            pc: gregs.get_pc(),
            ..Default::default()
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let orig_gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let mut gregs = self.create_standard_regs(cpu_id as u8);
        gregs.set_rax(regs.regs[0]);
        gregs.set_rbx(regs.regs[1]);
        gregs.set_rcx(regs.regs[2]);
        gregs.set_rdx(regs.regs[3]);
        gregs.set_rsi(regs.regs[4]);
        gregs.set_rdi(regs.regs[5]);
        gregs.set_rbp(regs.regs[6]);
        gregs.set_rsp(regs.regs[7]);
        gregs.set_r8(regs.regs[8]);
        gregs.set_r9(regs.regs[9]);
        gregs.set_r10(regs.regs[10]);
        gregs.set_r11(regs.regs[11]);
        gregs.set_r12(regs.regs[12]);
        gregs.set_r13(regs.regs[13]);
        gregs.set_r14(regs.regs[14]);
        gregs.set_r15(regs.regs[15]);
        gregs.set_rip(regs.rip);
        // Update only the lower 32 bits of rflags.
        gregs.set_rflags((orig_gregs.get_rflags() & !(u32::MAX as u64)) | (regs.eflags as u64));

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // Since GDB cares only about the selectors, we call get_sregs() first.
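        // Only the selector fields are overwritten below; the base/limit/attribute values
        // read back from the vCPU are preserved as-is.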
2462 let mut sregs = self 2463 .get_sregs(cpu_id as u8) 2464 .map_err(DebuggableError::ReadRegs)?; 2465 sregs.cs.selector = regs.segments.cs as u16; 2466 sregs.ss.selector = regs.segments.ss as u16; 2467 sregs.ds.selector = regs.segments.ds as u16; 2468 sregs.es.selector = regs.segments.es as u16; 2469 sregs.fs.selector = regs.segments.fs as u16; 2470 sregs.gs.selector = regs.segments.gs as u16; 2471 2472 self.set_sregs(cpu_id as u8, &sregs) 2473 .map_err(DebuggableError::WriteRegs)?; 2474 2475 // TODO: Add other registers 2476 2477 Ok(()) 2478 } 2479 2480 #[cfg(target_arch = "aarch64")] 2481 fn write_regs( 2482 &self, 2483 cpu_id: usize, 2484 regs: &CoreRegs, 2485 ) -> std::result::Result<(), DebuggableError> { 2486 let mut gregs = self 2487 .get_regs(cpu_id as u8) 2488 .map_err(DebuggableError::ReadRegs)?; 2489 2490 gregs.set_regs(regs.x); 2491 gregs.set_sp(regs.sp); 2492 gregs.set_pc(regs.pc); 2493 2494 self.set_regs(cpu_id as u8, &gregs) 2495 .map_err(DebuggableError::WriteRegs)?; 2496 2497 Ok(()) 2498 } 2499 2500 fn read_mem( 2501 &self, 2502 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2503 cpu_id: usize, 2504 vaddr: GuestAddress, 2505 len: usize, 2506 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2507 let mut buf = vec![0; len]; 2508 let mut total_read = 0_u64; 2509 2510 while total_read < len as u64 { 2511 let gaddr = vaddr.0 + total_read; 2512 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2513 Ok(paddr) => paddr, 2514 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 2515 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2516 }; 2517 let psize = arch::PAGE_SIZE as u64; 2518 let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1))); 2519 guest_memory 2520 .memory() 2521 .read( 2522 &mut buf[total_read as usize..total_read as usize + read_len as usize], 2523 GuestAddress(paddr), 2524 ) 2525 .map_err(DebuggableError::ReadMem)?; 2526 total_read += read_len; 2527 } 2528 Ok(buf) 2529 } 2530 2531 fn write_mem( 2532 &self, 2533 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2534 cpu_id: usize, 2535 vaddr: &GuestAddress, 2536 data: &[u8], 2537 ) -> std::result::Result<(), DebuggableError> { 2538 let mut total_written = 0_u64; 2539 2540 while total_written < data.len() as u64 { 2541 let gaddr = vaddr.0 + total_written; 2542 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2543 Ok(paddr) => paddr, 2544 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 
2545 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2546 }; 2547 let psize = arch::PAGE_SIZE as u64; 2548 let write_len = std::cmp::min( 2549 data.len() as u64 - total_written, 2550 psize - (paddr & (psize - 1)), 2551 ); 2552 guest_memory 2553 .memory() 2554 .write( 2555 &data[total_written as usize..total_written as usize + write_len as usize], 2556 GuestAddress(paddr), 2557 ) 2558 .map_err(DebuggableError::WriteMem)?; 2559 total_written += write_len; 2560 } 2561 Ok(()) 2562 } 2563 2564 fn active_vcpus(&self) -> usize { 2565 self.present_vcpus() as usize 2566 } 2567 } 2568 2569 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2570 impl Elf64Writable for CpuManager {} 2571 2572 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2573 impl CpuElf64Writable for CpuManager { 2574 fn cpu_write_elf64_note( 2575 &mut self, 2576 dump_state: &DumpState, 2577 ) -> std::result::Result<(), GuestDebuggableError> { 2578 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2579 for vcpu in &self.vcpus { 2580 let note_size = self.get_note_size(NoteDescType::Elf, 1); 2581 let mut pos: usize = 0; 2582 let mut buf = vec![0; note_size as usize]; 2583 let descsz = size_of::<X86_64ElfPrStatus>(); 2584 let vcpu_id = vcpu.lock().unwrap().id; 2585 2586 let note = Elf64_Nhdr { 2587 n_namesz: COREDUMP_NAME_SIZE, 2588 n_descsz: descsz as u32, 2589 n_type: NT_PRSTATUS, 2590 }; 2591 2592 let bytes: &[u8] = note.as_slice(); 2593 buf.splice(0.., bytes.to_vec()); 2594 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2595 buf.resize(pos + 4, 0); 2596 buf.splice(pos.., "CORE".to_string().into_bytes()); 2597 2598 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2599 buf.resize(pos + 32 + 4, 0); 2600 let pid = vcpu_id as u64; 2601 let bytes: &[u8] = pid.as_slice(); 2602 buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */ 2603 2604 pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>(); 2605 2606 let orig_rax: u64 = 0; 2607 let gregs = self.vcpus[usize::from(vcpu_id)] 2608 .lock() 2609 .unwrap() 2610 .vcpu 2611 .get_regs() 2612 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2613 2614 let regs1 = [ 2615 gregs.get_r15(), 2616 gregs.get_r14(), 2617 gregs.get_r13(), 2618 gregs.get_r12(), 2619 gregs.get_rbp(), 2620 gregs.get_rbx(), 2621 gregs.get_r11(), 2622 gregs.get_r10(), 2623 ]; 2624 let regs2 = [ 2625 gregs.get_r9(), 2626 gregs.get_r8(), 2627 gregs.get_rax(), 2628 gregs.get_rcx(), 2629 gregs.get_rdx(), 2630 gregs.get_rsi(), 2631 gregs.get_rdi(), 2632 orig_rax, 2633 ]; 2634 2635 let sregs = self.vcpus[usize::from(vcpu_id)] 2636 .lock() 2637 .unwrap() 2638 .vcpu 2639 .get_sregs() 2640 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2641 2642 debug!( 2643 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}", 2644 gregs.get_rip(), 2645 gregs.get_rsp(), 2646 sregs.gs.base, 2647 sregs.cs.selector, 2648 sregs.ss.selector, 2649 sregs.ds.selector, 2650 ); 2651 2652 let regs = X86_64UserRegs { 2653 regs1, 2654 regs2, 2655 rip: gregs.get_rip(), 2656 cs: sregs.cs.selector as u64, 2657 eflags: gregs.get_rflags(), 2658 rsp: gregs.get_rsp(), 2659 ss: sregs.ss.selector as u64, 2660 fs_base: sregs.fs.base, 2661 gs_base: sregs.gs.base, 2662 ds: sregs.ds.selector as u64, 2663 es: sregs.es.selector as u64, 2664 fs: sregs.fs.selector as u64, 2665 gs: sregs.gs.selector as u64, 2666 }; 2667 2668 // let bytes: &[u8] = unsafe { any_as_u8_slice(®s) }; 2669 let bytes: &[u8] = regs.as_slice(); 2670 buf.resize(note_size as usize, 0); 2671 
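            // Splice the serialized register block into the note body at the prstatus
            // register offset computed above, then pad the note back to its declared size.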
buf.splice(pos.., bytes.to_vec()); 2672 buf.resize(note_size as usize, 0); 2673 2674 coredump_file 2675 .write(&buf) 2676 .map_err(GuestDebuggableError::CoredumpFile)?; 2677 } 2678 2679 Ok(()) 2680 } 2681 2682 fn cpu_write_vmm_note( 2683 &mut self, 2684 dump_state: &DumpState, 2685 ) -> std::result::Result<(), GuestDebuggableError> { 2686 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2687 for vcpu in &self.vcpus { 2688 let note_size = self.get_note_size(NoteDescType::Vmm, 1); 2689 let mut pos: usize = 0; 2690 let mut buf = vec![0; note_size as usize]; 2691 let descsz = size_of::<DumpCpusState>(); 2692 let vcpu_id = vcpu.lock().unwrap().id; 2693 2694 let note = Elf64_Nhdr { 2695 n_namesz: COREDUMP_NAME_SIZE, 2696 n_descsz: descsz as u32, 2697 n_type: 0, 2698 }; 2699 2700 let bytes: &[u8] = note.as_slice(); 2701 buf.splice(0.., bytes.to_vec()); 2702 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2703 2704 buf.resize(pos + 4, 0); 2705 buf.splice(pos.., "QEMU".to_string().into_bytes()); 2706 2707 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2708 2709 let gregs = self.vcpus[usize::from(vcpu_id)] 2710 .lock() 2711 .unwrap() 2712 .vcpu 2713 .get_regs() 2714 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2715 2716 let regs1 = [ 2717 gregs.get_rax(), 2718 gregs.get_rbx(), 2719 gregs.get_rcx(), 2720 gregs.get_rdx(), 2721 gregs.get_rsi(), 2722 gregs.get_rdi(), 2723 gregs.get_rsp(), 2724 gregs.get_rbp(), 2725 ]; 2726 2727 let regs2 = [ 2728 gregs.get_r8(), 2729 gregs.get_r9(), 2730 gregs.get_r10(), 2731 gregs.get_r11(), 2732 gregs.get_r12(), 2733 gregs.get_r13(), 2734 gregs.get_r14(), 2735 gregs.get_r15(), 2736 ]; 2737 2738 let sregs = self.vcpus[usize::from(vcpu_id)] 2739 .lock() 2740 .unwrap() 2741 .vcpu 2742 .get_sregs() 2743 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2744 2745 let mut msrs = vec![MsrEntry { 2746 index: msr_index::MSR_KERNEL_GS_BASE, 2747 ..Default::default() 2748 }]; 2749 2750 self.vcpus[vcpu_id as usize] 2751 .lock() 2752 .unwrap() 2753 .vcpu 2754 .get_msrs(&mut msrs) 2755 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?; 2756 let kernel_gs_base = msrs[0].data; 2757 2758 let cs = CpuSegment::new(sregs.cs); 2759 let ds = CpuSegment::new(sregs.ds); 2760 let es = CpuSegment::new(sregs.es); 2761 let fs = CpuSegment::new(sregs.fs); 2762 let gs = CpuSegment::new(sregs.gs); 2763 let ss = CpuSegment::new(sregs.ss); 2764 let ldt = CpuSegment::new(sregs.ldt); 2765 let tr = CpuSegment::new(sregs.tr); 2766 let gdt = CpuSegment::new_from_table(sregs.gdt); 2767 let idt = CpuSegment::new_from_table(sregs.idt); 2768 let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4]; 2769 let regs = DumpCpusState { 2770 version: 1, 2771 size: size_of::<DumpCpusState>() as u32, 2772 regs1, 2773 regs2, 2774 rip: gregs.get_rip(), 2775 rflags: gregs.get_rflags(), 2776 cs, 2777 ds, 2778 es, 2779 fs, 2780 gs, 2781 ss, 2782 ldt, 2783 tr, 2784 gdt, 2785 idt, 2786 cr, 2787 kernel_gs_base, 2788 }; 2789 2790 let bytes: &[u8] = regs.as_slice(); 2791 buf.resize(note_size as usize, 0); 2792 buf.splice(pos.., bytes.to_vec()); 2793 buf.resize(note_size as usize, 0); 2794 2795 coredump_file 2796 .write(&buf) 2797 .map_err(GuestDebuggableError::CoredumpFile)?; 2798 } 2799 2800 Ok(()) 2801 } 2802 } 2803 2804 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2805 #[cfg(test)] 2806 mod tests { 2807 use arch::layout::BOOT_STACK_POINTER; 2808 use arch::layout::ZERO_PAGE_START; 2809 use arch::x86_64::interrupts::*; 
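    // Note: these tests create a real KVM VM and vCPU, so they require access to /dev/kvm.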
    use arch::x86_64::regs::*;
    use hypervisor::arch::x86::{FpuState, LapicState};
    use hypervisor::StandardRegisters;
    use linux_loader::loader::bootparam::setup_header;

    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail if there is no irqchip beforehand.
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the values that are expected to represent LVT0 and LVT1.
        let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
        let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the values that represent LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
        let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should
        // remove it at all.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test checks against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }];

        // get_msrs returns the number of MSRs that it succeeded in reading. We only want to
        // read one in this test case.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we did setup_msrs. We need to assert that the
        // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
        // expect.
2887 let entry_vec = vcpu.boot_msr_entries(); 2888 assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]); 2889 } 2890 2891 #[test] 2892 fn test_setup_regs_for_pvh() { 2893 let hv = hypervisor::new().unwrap(); 2894 let vm = hv.create_vm().expect("new VM fd creation failed"); 2895 let vcpu = vm.create_vcpu(0, None).unwrap(); 2896 2897 let mut expected_regs: StandardRegisters = vcpu.create_standard_regs(); 2898 expected_regs.set_rflags(0x0000000000000002u64); 2899 expected_regs.set_rbx(arch::layout::PVH_INFO_START.0); 2900 expected_regs.set_rip(1); 2901 2902 setup_regs( 2903 &vcpu, 2904 arch::EntryPoint { 2905 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()), 2906 setup_header: None, 2907 }, 2908 ) 2909 .unwrap(); 2910 2911 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2912 assert_eq!(actual_regs, expected_regs); 2913 } 2914 2915 #[test] 2916 fn test_setup_regs_for_bzimage() { 2917 let hv = hypervisor::new().unwrap(); 2918 let vm = hv.create_vm().expect("new VM fd creation failed"); 2919 let vcpu = vm.create_vcpu(0, None).unwrap(); 2920 2921 let mut expected_regs: StandardRegisters = vcpu.create_standard_regs(); 2922 expected_regs.set_rflags(0x0000000000000002u64); 2923 expected_regs.set_rip(1); 2924 expected_regs.set_rsp(BOOT_STACK_POINTER.0); 2925 expected_regs.set_rsi(ZERO_PAGE_START.0); 2926 2927 setup_regs( 2928 &vcpu, 2929 arch::EntryPoint { 2930 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()), 2931 setup_header: Some(setup_header { 2932 ..Default::default() 2933 }), 2934 }, 2935 ) 2936 .unwrap(); 2937 2938 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2939 assert_eq!(actual_regs, expected_regs); 2940 } 2941 } 2942 2943 #[cfg(target_arch = "aarch64")] 2944 #[cfg(test)] 2945 mod tests { 2946 use arch::{aarch64::regs, layout}; 2947 use hypervisor::kvm::aarch64::is_system_register; 2948 use hypervisor::kvm::kvm_bindings::{ 2949 kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_ARM_CORE, 2950 KVM_REG_SIZE_U64, 2951 }; 2952 use hypervisor::{arm64_core_reg_id, offset_of}; 2953 use std::mem; 2954 2955 #[test] 2956 fn test_setup_regs() { 2957 let hv = hypervisor::new().unwrap(); 2958 let vm = hv.create_vm().unwrap(); 2959 let vcpu = vm.create_vcpu(0, None).unwrap(); 2960 2961 let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0); 2962 // Must fail when vcpu is not initialized yet. 2963 assert!(res.is_err()); 2964 2965 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2966 vm.get_preferred_target(&mut kvi).unwrap(); 2967 vcpu.vcpu_init(&kvi).unwrap(); 2968 2969 assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok()); 2970 } 2971 2972 #[test] 2973 fn test_read_mpidr() { 2974 let hv = hypervisor::new().unwrap(); 2975 let vm = hv.create_vm().unwrap(); 2976 let vcpu = vm.create_vcpu(0, None).unwrap(); 2977 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2978 vm.get_preferred_target(&mut kvi).unwrap(); 2979 2980 // Must fail when vcpu is not initialized yet. 
2981 assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err()); 2982 2983 vcpu.vcpu_init(&kvi).unwrap(); 2984 assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000); 2985 } 2986 2987 #[test] 2988 fn test_is_system_register() { 2989 let offset = offset_of!(user_pt_regs, pc); 2990 let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset); 2991 assert!(!is_system_register(regid)); 2992 let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64; 2993 assert!(is_system_register(regid)); 2994 } 2995 2996 #[test] 2997 fn test_save_restore_core_regs() { 2998 let hv = hypervisor::new().unwrap(); 2999 let vm = hv.create_vm().unwrap(); 3000 let vcpu = vm.create_vcpu(0, None).unwrap(); 3001 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 3002 vm.get_preferred_target(&mut kvi).unwrap(); 3003 3004 // Must fail when vcpu is not initialized yet. 3005 let res = vcpu.get_regs(); 3006 assert!(res.is_err()); 3007 assert_eq!( 3008 format!("{}", res.unwrap_err()), 3009 "Failed to get core register: Exec format error (os error 8)" 3010 ); 3011 3012 let mut state = vcpu.create_standard_regs(); 3013 let res = vcpu.set_regs(&state); 3014 assert!(res.is_err()); 3015 assert_eq!( 3016 format!("{}", res.unwrap_err()), 3017 "Failed to set core register: Exec format error (os error 8)" 3018 ); 3019 3020 vcpu.vcpu_init(&kvi).unwrap(); 3021 let res = vcpu.get_regs(); 3022 assert!(res.is_ok()); 3023 state = res.unwrap(); 3024 assert_eq!(state.get_pstate(), 0x3C5); 3025 3026 assert!(vcpu.set_regs(&state).is_ok()); 3027 } 3028 3029 #[test] 3030 fn test_get_set_mpstate() { 3031 let hv = hypervisor::new().unwrap(); 3032 let vm = hv.create_vm().unwrap(); 3033 let vcpu = vm.create_vcpu(0, None).unwrap(); 3034 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 3035 vm.get_preferred_target(&mut kvi).unwrap(); 3036 3037 let res = vcpu.get_mp_state(); 3038 assert!(res.is_ok()); 3039 assert!(vcpu.set_mp_state(res.unwrap()).is_ok()); 3040 } 3041 } 3042