// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use crate::config::CpusConfig;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
    GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
    NT_PRSTATUS,
};
#[cfg(feature = "guest_debug")]
use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
#[cfg(target_arch = "x86_64")]
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::GuestMemoryMmap;
use crate::CPU_MANAGER_SNAPSHOT_ID;
use acpi_tables::{aml, sdt::Sdt, Aml};
use anyhow::anyhow;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use arch::aarch64::regs;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::get_x2apic_id;
use arch::EntryPoint;
use arch::NumaNodes;
#[cfg(target_arch = "aarch64")]
use devices::gic::Gic;
use devices::interrupt_controller::InterruptController;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::msr_index;
#[cfg(target_arch = "x86_64")]
use hypervisor::arch::x86::CpuIdEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::MsrEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::SpecialRegisters;
#[cfg(target_arch = "aarch64")]
use hypervisor::kvm::kvm_bindings;
#[cfg(all(target_arch = "aarch64", feature = "kvm"))]
use hypervisor::kvm::kvm_ioctls::Cap;
#[cfg(feature = "tdx")]
use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuVendor;
#[cfg(feature = "kvm")]
use hypervisor::HypervisorType;
#[cfg(feature = "guest_debug")]
use hypervisor::StandardRegisters;
use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps};
use libc::{c_void, siginfo_t};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use linux_loader::elf::Elf64_Nhdr;
use seccompiler::{apply_filter, SeccompAction};
use std::collections::BTreeMap;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::io::Write;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::mem::size_of;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};
use thiserror::Error;
use tracer::trace_scoped;
use vm_device::BusDevice;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use vm_memory::ByteValued;
#[cfg(feature = "guest_debug")]
use vm_memory::{Bytes, GuestAddressSpace};
use vm_memory::{GuestAddress, GuestMemoryAtomic};
use vm_migration::{
    snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
use zerocopy::AsBytes;

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
macro_rules! extract_bits_64_without_offset {
    ($value: tt, $length: tt) => {
        $value & (!0u64 >> (64 - $length))
    };
}

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[error("vCPU still pending removal")]
    VcpuPendingRemovedVcpu,

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error finalising vCPU: {0}")]
    VcpuArmFinalize(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source]
anyhow::Error), 190 191 #[error("Maximum number of vCPUs exceeds host limit")] 192 MaximumVcpusExceeded, 193 194 #[cfg(feature = "sev_snp")] 195 #[error("Failed to set sev control register: {0}")] 196 SetSevControlRegister(#[source] hypervisor::HypervisorCpuError), 197 198 #[cfg(target_arch = "x86_64")] 199 #[error("Failed to inject NMI")] 200 NmiError(hypervisor::HypervisorCpuError), 201 } 202 pub type Result<T> = result::Result<T, Error>; 203 204 #[cfg(target_arch = "x86_64")] 205 #[allow(dead_code)] 206 #[repr(packed)] 207 #[derive(AsBytes)] 208 struct LocalX2Apic { 209 pub r#type: u8, 210 pub length: u8, 211 pub _reserved: u16, 212 pub apic_id: u32, 213 pub flags: u32, 214 pub processor_id: u32, 215 } 216 217 #[allow(dead_code)] 218 #[repr(packed)] 219 #[derive(Default, AsBytes)] 220 struct Ioapic { 221 pub r#type: u8, 222 pub length: u8, 223 pub ioapic_id: u8, 224 _reserved: u8, 225 pub apic_address: u32, 226 pub gsi_base: u32, 227 } 228 229 #[cfg(target_arch = "aarch64")] 230 #[allow(dead_code)] 231 #[repr(packed)] 232 #[derive(AsBytes)] 233 struct GicC { 234 pub r#type: u8, 235 pub length: u8, 236 pub reserved0: u16, 237 pub cpu_interface_number: u32, 238 pub uid: u32, 239 pub flags: u32, 240 pub parking_version: u32, 241 pub performance_interrupt: u32, 242 pub parked_address: u64, 243 pub base_address: u64, 244 pub gicv_base_address: u64, 245 pub gich_base_address: u64, 246 pub vgic_interrupt: u32, 247 pub gicr_base_address: u64, 248 pub mpidr: u64, 249 pub proc_power_effi_class: u8, 250 pub reserved1: u8, 251 pub spe_overflow_interrupt: u16, 252 } 253 254 #[cfg(target_arch = "aarch64")] 255 #[allow(dead_code)] 256 #[repr(packed)] 257 #[derive(AsBytes)] 258 struct GicD { 259 pub r#type: u8, 260 pub length: u8, 261 pub reserved0: u16, 262 pub gic_id: u32, 263 pub base_address: u64, 264 pub global_irq_base: u32, 265 pub version: u8, 266 pub reserved1: [u8; 3], 267 } 268 269 #[cfg(target_arch = "aarch64")] 270 #[allow(dead_code)] 271 #[repr(packed)] 272 #[derive(AsBytes)] 273 struct GicR { 274 pub r#type: u8, 275 pub length: u8, 276 pub reserved: u16, 277 pub base_address: u64, 278 pub range_length: u32, 279 } 280 281 #[cfg(target_arch = "aarch64")] 282 #[allow(dead_code)] 283 #[repr(packed)] 284 #[derive(AsBytes)] 285 struct GicIts { 286 pub r#type: u8, 287 pub length: u8, 288 pub reserved0: u16, 289 pub translation_id: u32, 290 pub base_address: u64, 291 pub reserved1: u32, 292 } 293 294 #[cfg(target_arch = "aarch64")] 295 #[allow(dead_code)] 296 #[repr(packed)] 297 #[derive(AsBytes)] 298 struct ProcessorHierarchyNode { 299 pub r#type: u8, 300 pub length: u8, 301 pub reserved: u16, 302 pub flags: u32, 303 pub parent: u32, 304 pub acpi_processor_id: u32, 305 pub num_private_resources: u32, 306 } 307 308 #[allow(dead_code)] 309 #[repr(packed)] 310 #[derive(Default, AsBytes)] 311 struct InterruptSourceOverride { 312 pub r#type: u8, 313 pub length: u8, 314 pub bus: u8, 315 pub source: u8, 316 pub gsi: u32, 317 pub flags: u16, 318 } 319 320 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 321 macro_rules! round_up { 322 ($n:expr,$d:expr) => { 323 (($n / ($d + 1)) + 1) * $d 324 }; 325 } 326 327 /// A wrapper around creating and using a kvm-based VCPU. 328 pub struct Vcpu { 329 // The hypervisor abstracted CPU. 330 vcpu: Arc<dyn hypervisor::Vcpu>, 331 id: u8, 332 #[cfg(target_arch = "aarch64")] 333 mpidr: u64, 334 saved_state: Option<CpuState>, 335 #[cfg(target_arch = "x86_64")] 336 vendor: CpuVendor, 337 } 338 339 impl Vcpu { 340 /// Constructs a new VCPU for `vm`. 
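    /// (Illustrative only: `CpuManager::create_vcpu()` ends up calling something like
    /// `Vcpu::new(0, 0, &vm, Some(vm_ops), cpu_vendor)` for the boot vCPU on x86_64.)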
341 /// 342 /// # Arguments 343 /// 344 /// * `id` - Represents the CPU number between [0, max vcpus). 345 /// * `vm` - The virtual machine this vcpu will get attached to. 346 /// * `vm_ops` - Optional object for exit handling. 347 /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0) 348 pub fn new( 349 id: u8, 350 apic_id: u8, 351 vm: &Arc<dyn hypervisor::Vm>, 352 vm_ops: Option<Arc<dyn VmOps>>, 353 #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor, 354 ) -> Result<Self> { 355 let vcpu = vm 356 .create_vcpu(apic_id, vm_ops) 357 .map_err(|e| Error::VcpuCreate(e.into()))?; 358 // Initially the cpuid per vCPU is the one supported by this VM. 359 Ok(Vcpu { 360 vcpu, 361 id, 362 #[cfg(target_arch = "aarch64")] 363 mpidr: 0, 364 saved_state: None, 365 #[cfg(target_arch = "x86_64")] 366 vendor: cpu_vendor, 367 }) 368 } 369 370 /// Configures a vcpu and should be called once per vcpu when created. 371 /// 372 /// # Arguments 373 /// 374 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 375 /// * `guest_memory` - Guest memory. 376 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 377 pub fn configure( 378 &mut self, 379 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 380 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 381 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 382 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 383 #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>, 384 ) -> Result<()> { 385 #[cfg(target_arch = "aarch64")] 386 { 387 self.init(vm)?; 388 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup) 389 .map_err(Error::VcpuConfiguration)?; 390 } 391 info!("Configuring vCPU: cpu_id = {}", self.id); 392 #[cfg(target_arch = "x86_64")] 393 arch::configure_vcpu( 394 &self.vcpu, 395 self.id, 396 boot_setup, 397 cpuid, 398 kvm_hyperv, 399 self.vendor, 400 topology, 401 ) 402 .map_err(Error::VcpuConfiguration)?; 403 404 Ok(()) 405 } 406 407 /// Gets the MPIDR register value. 408 #[cfg(target_arch = "aarch64")] 409 pub fn get_mpidr(&self) -> u64 { 410 self.mpidr 411 } 412 413 /// Gets the saved vCPU state. 414 #[cfg(target_arch = "aarch64")] 415 pub fn get_saved_state(&self) -> Option<CpuState> { 416 self.saved_state.clone() 417 } 418 419 /// Initializes an aarch64 specific vcpu for booting Linux. 420 #[cfg(target_arch = "aarch64")] 421 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 422 use std::arch::is_aarch64_feature_detected; 423 let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); 424 let sve_supported = 425 is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2"); 426 // This reads back the kernel's preferred target type. 427 vm.get_preferred_target(&mut kvi) 428 .map_err(Error::VcpuArmPreferredTarget)?; 429 // We already checked that the capability is supported. 430 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; 431 if vm 432 .as_any() 433 .downcast_ref::<hypervisor::kvm::KvmVm>() 434 .unwrap() 435 .check_extension(Cap::ArmPmuV3) 436 { 437 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3; 438 } 439 440 if sve_supported 441 && vm 442 .as_any() 443 .downcast_ref::<hypervisor::kvm::KvmVm>() 444 .unwrap() 445 .check_extension(Cap::ArmSve) 446 { 447 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_SVE; 448 } 449 450 // Non-boot cpus are powered off initially. 
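        // (They are brought online later by the guest via PSCI CPU_ON; the
        // KVM_ARM_VCPU_POWER_OFF feature flag set below is what leaves them parked.)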
451 if self.id > 0 { 452 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; 453 } 454 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)?; 455 if sve_supported { 456 self.vcpu 457 .vcpu_finalize(kvm_bindings::KVM_ARM_VCPU_SVE as i32) 458 .map_err(Error::VcpuArmFinalize)?; 459 } 460 Ok(()) 461 } 462 463 /// Runs the VCPU until it exits, returning the reason. 464 /// 465 /// Note that the state of the VCPU and associated VM must be setup first for this to do 466 /// anything useful. 467 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 468 self.vcpu.run() 469 } 470 471 #[cfg(feature = "sev_snp")] 472 pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> { 473 self.vcpu 474 .set_sev_control_register(vmsa_pfn) 475 .map_err(Error::SetSevControlRegister) 476 } 477 } 478 479 impl Pausable for Vcpu {} 480 impl Snapshottable for Vcpu { 481 fn id(&self) -> String { 482 self.id.to_string() 483 } 484 485 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 486 let saved_state = self 487 .vcpu 488 .state() 489 .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?; 490 491 self.saved_state = Some(saved_state.clone()); 492 493 Ok(Snapshot::from_data(SnapshotData::new_from_state( 494 &saved_state, 495 )?)) 496 } 497 } 498 499 pub struct CpuManager { 500 config: CpusConfig, 501 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 502 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 503 #[cfg(target_arch = "x86_64")] 504 cpuid: Vec<CpuIdEntry>, 505 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 506 vm: Arc<dyn hypervisor::Vm>, 507 vcpus_kill_signalled: Arc<AtomicBool>, 508 vcpus_pause_signalled: Arc<AtomicBool>, 509 vcpus_kick_signalled: Arc<AtomicBool>, 510 exit_evt: EventFd, 511 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 512 reset_evt: EventFd, 513 #[cfg(feature = "guest_debug")] 514 vm_debug_evt: EventFd, 515 vcpu_states: Vec<VcpuState>, 516 selected_cpu: u8, 517 vcpus: Vec<Arc<Mutex<Vcpu>>>, 518 seccomp_action: SeccompAction, 519 vm_ops: Arc<dyn VmOps>, 520 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 521 acpi_address: Option<GuestAddress>, 522 proximity_domain_per_cpu: BTreeMap<u8, u32>, 523 affinity: BTreeMap<u8, Vec<usize>>, 524 dynamic: bool, 525 hypervisor: Arc<dyn hypervisor::Hypervisor>, 526 #[cfg(feature = "sev_snp")] 527 sev_snp_enabled: bool, 528 } 529 530 const CPU_ENABLE_FLAG: usize = 0; 531 const CPU_INSERTING_FLAG: usize = 1; 532 const CPU_REMOVING_FLAG: usize = 2; 533 const CPU_EJECT_FLAG: usize = 3; 534 535 const CPU_STATUS_OFFSET: u64 = 4; 536 const CPU_SELECTION_OFFSET: u64 = 0; 537 538 impl BusDevice for CpuManager { 539 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 540 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
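        // Clear it before reporting status. For reference (derived from the offset and
        // flag constants above), the layout of this device as seen by the guest is:
        //   0x0  CPU selection register - selects which vCPU the status register refers to
        //   0x4  status/control register - bit 0: enabled, bit 1: inserting,
        //        bit 2: removing, bit 3: eject (written by the ACPI code to remove the
        //        selected vCPU)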
541 data.fill(0); 542 543 match offset { 544 CPU_SELECTION_OFFSET => { 545 data[0] = self.selected_cpu; 546 } 547 CPU_STATUS_OFFSET => { 548 if self.selected_cpu < self.max_vcpus() { 549 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 550 if state.active() { 551 data[0] |= 1 << CPU_ENABLE_FLAG; 552 } 553 if state.inserting { 554 data[0] |= 1 << CPU_INSERTING_FLAG; 555 } 556 if state.removing { 557 data[0] |= 1 << CPU_REMOVING_FLAG; 558 } 559 } else { 560 warn!("Out of range vCPU id: {}", self.selected_cpu); 561 } 562 } 563 _ => { 564 warn!( 565 "Unexpected offset for accessing CPU manager device: {:#}", 566 offset 567 ); 568 } 569 } 570 } 571 572 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 573 match offset { 574 CPU_SELECTION_OFFSET => { 575 self.selected_cpu = data[0]; 576 } 577 CPU_STATUS_OFFSET => { 578 if self.selected_cpu < self.max_vcpus() { 579 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 580 // The ACPI code writes back a 1 to acknowledge the insertion 581 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 582 && state.inserting 583 { 584 state.inserting = false; 585 } 586 // Ditto for removal 587 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 588 && state.removing 589 { 590 state.removing = false; 591 } 592 // Trigger removal of vCPU 593 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 594 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 595 error!("Error removing vCPU: {:?}", e); 596 } 597 } 598 } else { 599 warn!("Out of range vCPU id: {}", self.selected_cpu); 600 } 601 } 602 _ => { 603 warn!( 604 "Unexpected offset for accessing CPU manager device: {:#}", 605 offset 606 ); 607 } 608 } 609 None 610 } 611 } 612 613 #[derive(Default)] 614 struct VcpuState { 615 inserting: bool, 616 removing: bool, 617 pending_removal: Arc<AtomicBool>, 618 handle: Option<thread::JoinHandle<()>>, 619 kill: Arc<AtomicBool>, 620 vcpu_run_interrupted: Arc<AtomicBool>, 621 paused: Arc<AtomicBool>, 622 } 623 624 impl VcpuState { 625 fn active(&self) -> bool { 626 self.handle.is_some() 627 } 628 629 fn signal_thread(&self) { 630 if let Some(handle) = self.handle.as_ref() { 631 loop { 632 // SAFETY: FFI call with correct arguments 633 unsafe { 634 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 635 } 636 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 637 break; 638 } else { 639 // This is more effective than thread::yield_now() at 640 // avoiding a priority inversion with the vCPU thread 641 thread::sleep(std::time::Duration::from_millis(1)); 642 } 643 } 644 } 645 } 646 647 fn join_thread(&mut self) -> Result<()> { 648 if let Some(handle) = self.handle.take() { 649 handle.join().map_err(Error::ThreadCleanup)? 
650 } 651 652 Ok(()) 653 } 654 655 fn unpark_thread(&self) { 656 if let Some(handle) = self.handle.as_ref() { 657 handle.thread().unpark() 658 } 659 } 660 } 661 662 impl CpuManager { 663 #[allow(unused_variables)] 664 #[allow(clippy::too_many_arguments)] 665 pub fn new( 666 config: &CpusConfig, 667 vm: Arc<dyn hypervisor::Vm>, 668 exit_evt: EventFd, 669 reset_evt: EventFd, 670 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 671 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 672 seccomp_action: SeccompAction, 673 vm_ops: Arc<dyn VmOps>, 674 #[cfg(feature = "tdx")] tdx_enabled: bool, 675 numa_nodes: &NumaNodes, 676 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 677 ) -> Result<Arc<Mutex<CpuManager>>> { 678 if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() { 679 return Err(Error::MaximumVcpusExceeded); 680 } 681 682 let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); 683 vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); 684 let hypervisor_type = hypervisor.hypervisor_type(); 685 #[cfg(target_arch = "x86_64")] 686 let cpu_vendor = hypervisor.get_cpu_vendor(); 687 688 #[cfg(target_arch = "x86_64")] 689 if config.features.amx { 690 const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; 691 const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; 692 const XFEATURE_XTILEDATA: usize = 18; 693 const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; 694 695 // SAFETY: the syscall is only modifying kernel internal 696 // data structures that the kernel is itself expected to safeguard. 697 let amx_tile = unsafe { 698 libc::syscall( 699 libc::SYS_arch_prctl, 700 ARCH_REQ_XCOMP_GUEST_PERM, 701 XFEATURE_XTILEDATA, 702 ) 703 }; 704 705 if amx_tile != 0 { 706 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 707 } else { 708 let mask: usize = 0; 709 // SAFETY: the mask being modified (not marked mutable as it is 710 // modified in unsafe only which is permitted) isn't in use elsewhere. 
711 let result = unsafe { 712 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 713 }; 714 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 715 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 716 } 717 } 718 } 719 720 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 721 let mut cpu_list = Vec::new(); 722 for (proximity_domain, numa_node) in numa_nodes.iter() { 723 for cpu in numa_node.cpus.iter() { 724 cpu_list.push((*cpu, *proximity_domain)) 725 } 726 } 727 cpu_list 728 } 729 .into_iter() 730 .collect(); 731 732 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 733 cpu_affinity 734 .iter() 735 .map(|a| (a.vcpu, a.host_cpus.clone())) 736 .collect() 737 } else { 738 BTreeMap::new() 739 }; 740 741 #[cfg(feature = "tdx")] 742 let dynamic = !tdx_enabled; 743 #[cfg(not(feature = "tdx"))] 744 let dynamic = true; 745 746 Ok(Arc::new(Mutex::new(CpuManager { 747 config: config.clone(), 748 interrupt_controller: None, 749 #[cfg(target_arch = "x86_64")] 750 cpuid: Vec::new(), 751 vm, 752 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 753 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 754 vcpus_kick_signalled: Arc::new(AtomicBool::new(false)), 755 vcpu_states, 756 exit_evt, 757 reset_evt, 758 #[cfg(feature = "guest_debug")] 759 vm_debug_evt, 760 selected_cpu: 0, 761 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 762 seccomp_action, 763 vm_ops, 764 acpi_address: None, 765 proximity_domain_per_cpu, 766 affinity, 767 dynamic, 768 hypervisor: hypervisor.clone(), 769 #[cfg(feature = "sev_snp")] 770 sev_snp_enabled, 771 }))) 772 } 773 774 #[cfg(target_arch = "x86_64")] 775 pub fn populate_cpuid( 776 &mut self, 777 memory_manager: &Arc<Mutex<MemoryManager>>, 778 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 779 #[cfg(feature = "tdx")] tdx: bool, 780 ) -> Result<()> { 781 let sgx_epc_sections = memory_manager 782 .lock() 783 .unwrap() 784 .sgx_epc_region() 785 .as_ref() 786 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 787 788 self.cpuid = { 789 let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits); 790 arch::generate_common_cpuid( 791 hypervisor, 792 &arch::CpuidConfig { 793 sgx_epc_sections, 794 phys_bits, 795 kvm_hyperv: self.config.kvm_hyperv, 796 #[cfg(feature = "tdx")] 797 tdx, 798 amx: self.config.features.amx, 799 }, 800 ) 801 .map_err(Error::CommonCpuId)? 802 }; 803 804 Ok(()) 805 } 806 807 fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> { 808 info!("Creating vCPU: cpu_id = {}", cpu_id); 809 810 #[cfg(target_arch = "x86_64")] 811 let topology = self.get_vcpu_topology(); 812 #[cfg(target_arch = "x86_64")] 813 let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology); 814 #[cfg(target_arch = "aarch64")] 815 let x2apic_id = cpu_id as u32; 816 817 let mut vcpu = Vcpu::new( 818 cpu_id, 819 x2apic_id as u8, 820 &self.vm, 821 Some(self.vm_ops.clone()), 822 #[cfg(target_arch = "x86_64")] 823 self.hypervisor.get_cpu_vendor(), 824 )?; 825 826 if let Some(snapshot) = snapshot { 827 // AArch64 vCPUs should be initialized after created. 
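            // (init() issues KVM_ARM_VCPU_INIT, which has to happen before set_state()
            // below can restore the saved register state from the snapshot.)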
828 #[cfg(target_arch = "aarch64")] 829 vcpu.init(&self.vm)?; 830 831 let state: CpuState = snapshot.to_state().map_err(|e| { 832 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e)) 833 })?; 834 vcpu.vcpu 835 .set_state(&state) 836 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?; 837 838 vcpu.saved_state = Some(state); 839 } 840 841 let vcpu = Arc::new(Mutex::new(vcpu)); 842 843 // Adding vCPU to the CpuManager's vCPU list. 844 self.vcpus.push(vcpu.clone()); 845 846 Ok(vcpu) 847 } 848 849 pub fn configure_vcpu( 850 &self, 851 vcpu: Arc<Mutex<Vcpu>>, 852 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 853 ) -> Result<()> { 854 let mut vcpu = vcpu.lock().unwrap(); 855 856 #[cfg(feature = "sev_snp")] 857 if self.sev_snp_enabled { 858 if let Some((kernel_entry_point, _)) = boot_setup { 859 vcpu.set_sev_control_register( 860 kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE, 861 )?; 862 } 863 864 // Traditional way to configure vcpu doesn't work for SEV-SNP guests. 865 // All the vCPU configuration for SEV-SNP guest is provided via VMSA. 866 return Ok(()); 867 } 868 869 #[cfg(target_arch = "x86_64")] 870 assert!(!self.cpuid.is_empty()); 871 872 #[cfg(target_arch = "x86_64")] 873 let topology = self.config.topology.clone().map_or_else( 874 || Some((1, self.boot_vcpus(), 1)), 875 |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)), 876 ); 877 #[cfg(target_arch = "x86_64")] 878 vcpu.configure( 879 boot_setup, 880 self.cpuid.clone(), 881 self.config.kvm_hyperv, 882 topology, 883 )?; 884 885 #[cfg(target_arch = "aarch64")] 886 vcpu.configure(&self.vm, boot_setup)?; 887 888 Ok(()) 889 } 890 891 /// Only create new vCPUs if there aren't any inactive ones to reuse 892 fn create_vcpus( 893 &mut self, 894 desired_vcpus: u8, 895 snapshot: Option<Snapshot>, 896 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 897 let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![]; 898 info!( 899 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 900 desired_vcpus, 901 self.config.max_vcpus, 902 self.vcpus.len(), 903 self.present_vcpus() 904 ); 905 906 if desired_vcpus > self.config.max_vcpus { 907 return Err(Error::DesiredVCpuCountExceedsMax); 908 } 909 910 // Only create vCPUs in excess of all the allocated vCPUs. 911 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 912 vcpus.push(self.create_vcpu( 913 cpu_id, 914 // TODO: The special format of the CPU id can be removed once 915 // ready to break live upgrade. 916 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()), 917 )?); 918 } 919 920 Ok(vcpus) 921 } 922 923 #[cfg(target_arch = "aarch64")] 924 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 925 for cpu in self.vcpus.iter() { 926 let cpu = cpu.lock().unwrap(); 927 // Check if PMU attr is available, if not, log the information. 
928 if cpu.vcpu.has_pmu_support() { 929 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 930 } else { 931 debug!( 932 "PMU attribute is not supported in vCPU{}, skip PMU init!", 933 cpu.id 934 ); 935 return Ok(false); 936 } 937 } 938 939 Ok(true) 940 } 941 942 pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> { 943 self.vcpus.clone() 944 } 945 946 fn start_vcpu( 947 &mut self, 948 vcpu: Arc<Mutex<Vcpu>>, 949 vcpu_id: u8, 950 vcpu_thread_barrier: Arc<Barrier>, 951 inserting: bool, 952 ) -> Result<()> { 953 let reset_evt = self.reset_evt.try_clone().unwrap(); 954 let exit_evt = self.exit_evt.try_clone().unwrap(); 955 #[cfg(feature = "kvm")] 956 let hypervisor_type = self.hypervisor.hypervisor_type(); 957 #[cfg(feature = "guest_debug")] 958 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 959 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 960 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 961 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 962 let vcpu_kick_signalled = self.vcpus_kick_signalled.clone(); 963 964 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 965 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 966 .vcpu_run_interrupted 967 .clone(); 968 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 969 let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone(); 970 971 // Prepare the CPU set the current vCPU is expected to run onto. 972 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 973 // SAFETY: all zeros is a valid pattern 974 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 975 // SAFETY: FFI call, trivially safe 976 unsafe { libc::CPU_ZERO(&mut cpuset) }; 977 for host_cpu in host_cpus { 978 // SAFETY: FFI call, trivially safe 979 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) }; 980 } 981 cpuset 982 }); 983 984 // Retrieve seccomp filter for vcpu thread 985 let vcpu_seccomp_filter = get_seccomp_filter( 986 &self.seccomp_action, 987 Thread::Vcpu, 988 self.hypervisor.hypervisor_type(), 989 ) 990 .map_err(Error::CreateSeccompFilter)?; 991 992 #[cfg(target_arch = "x86_64")] 993 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 994 995 info!("Starting vCPU: cpu_id = {}", vcpu_id); 996 997 let handle = Some( 998 thread::Builder::new() 999 .name(format!("vcpu{vcpu_id}")) 1000 .spawn(move || { 1001 // Schedule the thread to run on the expected CPU set 1002 if let Some(cpuset) = cpuset.as_ref() { 1003 // SAFETY: FFI call with correct arguments 1004 let ret = unsafe { 1005 libc::sched_setaffinity( 1006 0, 1007 std::mem::size_of::<libc::cpu_set_t>(), 1008 cpuset as *const libc::cpu_set_t, 1009 ) 1010 }; 1011 1012 if ret != 0 { 1013 error!( 1014 "Failed scheduling the vCPU {} on the expected CPU set: {}", 1015 vcpu_id, 1016 io::Error::last_os_error() 1017 ); 1018 return; 1019 } 1020 } 1021 1022 // Apply seccomp filter for vcpu thread. 1023 if !vcpu_seccomp_filter.is_empty() { 1024 if let Err(e) = 1025 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 1026 { 1027 error!("Error applying seccomp filter: {:?}", e); 1028 return; 1029 } 1030 } 1031 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 1032 // This uses an async signal safe handler to kill the vcpu handles. 1033 register_signal_handler(SIGRTMIN(), handle_signal) 1034 .expect("Failed to register vcpu signal handler"); 1035 // Block until all CPUs are ready. 
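                    // (Every newly spawned vCPU thread blocks here until activate_vcpus()
                    // joins the barrier, so no vCPU starts running ahead of the others.)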
                    vcpu_thread_barrier.wait();

                    std::panic::catch_unwind(move || {
                        loop {
                            // If we are being told to pause, we park the thread
                            // until the pause boolean is toggled.
                            // The resume operation is responsible for toggling
                            // the boolean and unpark the thread.
                            // We enter a loop because park() could spuriously
                            // return. We will then park() again unless the
                            // pause boolean has been toggled.

                            // Need to use Ordering::SeqCst as we have multiple
                            // loads and stores to different atomics and we need
                            // to see them in a consistent order in all threads

                            if vcpu_pause_signalled.load(Ordering::SeqCst) {
                                // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
                                // completed by returning to KVM_RUN. From the kernel docs:
                                //
                                // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
                                // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
                                // operations are complete (and guest state is consistent) only after userspace
                                // has re-entered the kernel with KVM_RUN. The kernel side will first finish
                                // incomplete operations and then check for pending signals.
                                // The pending state of the operation is not preserved in state which is
                                // visible to userspace, thus userspace should ensure that the operation is
                                // completed before performing a live migration. Userspace can re-enter the
                                // guest with an unmasked signal pending or with the immediate_exit field set
                                // to complete pending operations without allowing any further instructions
                                // to be executed.

                                #[cfg(feature = "kvm")]
                                if matches!(hypervisor_type, HypervisorType::Kvm) {
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
                                    if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
                                        error!("Unexpected VM exit on \"immediate_exit\" run");
                                        break;
                                    }
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
                                }

                                vcpu_run_interrupted.store(true, Ordering::SeqCst);

                                vcpu_paused.store(true, Ordering::SeqCst);
                                while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                    thread::park();
                                }
                                vcpu_run_interrupted.store(false, Ordering::SeqCst);
                            }

                            if vcpu_kick_signalled.load(Ordering::SeqCst) {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                #[cfg(target_arch = "x86_64")]
                                match vcpu.lock().as_ref().unwrap().vcpu.nmi() {
                                    Ok(()) => {},
                                    Err(e) => {
                                        error!("Error injecting NMI: {}", e);
                                        break;
                                    }
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }

                            #[cfg(feature = "tdx")]
                            let mut vcpu = vcpu.lock().unwrap();
                            #[cfg(not(feature = "tdx"))]
                            let vcpu = vcpu.lock().unwrap();
                            // vcpu.run() returns false on a triple-fault so trigger a reset
                            match vcpu.run() {
                                Ok(run) => match run {
                                    #[cfg(feature = "kvm")]
                                    VmExit::Debug => {
                                        info!("VmExit::Debug");
                                        #[cfg(feature = "guest_debug")]
                                        {
                                            vcpu_pause_signalled.store(true, Ordering::SeqCst);
                                            let raw_tid = get_raw_tid(vcpu_id as usize);
                                            vm_debug_evt.write(raw_tid as u64).unwrap();
                                        }
                                    }
                                    #[cfg(target_arch = "x86_64")]
                                    VmExit::IoapicEoi(vector) => {
                                        if let Some(interrupt_controller) =
                                            &interrupt_controller_clone
1128 { 1129 interrupt_controller 1130 .lock() 1131 .unwrap() 1132 .end_of_interrupt(vector); 1133 } 1134 } 1135 VmExit::Ignore => {} 1136 VmExit::Hyperv => {} 1137 VmExit::Reset => { 1138 info!("VmExit::Reset"); 1139 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1140 reset_evt.write(1).unwrap(); 1141 break; 1142 } 1143 VmExit::Shutdown => { 1144 info!("VmExit::Shutdown"); 1145 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1146 exit_evt.write(1).unwrap(); 1147 break; 1148 } 1149 #[cfg(feature = "tdx")] 1150 VmExit::Tdx => { 1151 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1152 match vcpu.get_tdx_exit_details() { 1153 Ok(details) => match details { 1154 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1155 TdxExitDetails::SetupEventNotifyInterrupt => { 1156 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1157 } 1158 }, 1159 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1160 } 1161 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1162 } else { 1163 // We should never reach this code as 1164 // this means the design from the code 1165 // is wrong. 1166 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1167 } 1168 } 1169 }, 1170 1171 Err(e) => { 1172 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1173 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1174 exit_evt.write(1).unwrap(); 1175 break; 1176 } 1177 } 1178 1179 // We've been told to terminate 1180 if vcpu_kill_signalled.load(Ordering::SeqCst) 1181 || vcpu_kill.load(Ordering::SeqCst) 1182 { 1183 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1184 break; 1185 } 1186 } 1187 }) 1188 .or_else(|_| { 1189 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1190 error!("vCPU thread panicked"); 1191 panic_exit_evt.write(1) 1192 }) 1193 .ok(); 1194 }) 1195 .map_err(Error::VcpuSpawn)?, 1196 ); 1197 1198 // On hot plug calls into this function entry_point is None. It is for 1199 // those hotplug CPU additions that we need to set the inserting flag. 1200 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1201 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1202 1203 Ok(()) 1204 } 1205 1206 /// Start up as many vCPUs threads as needed to reach `desired_vcpus` 1207 fn activate_vcpus( 1208 &mut self, 1209 desired_vcpus: u8, 1210 inserting: bool, 1211 paused: Option<bool>, 1212 ) -> Result<()> { 1213 if desired_vcpus > self.config.max_vcpus { 1214 return Err(Error::DesiredVCpuCountExceedsMax); 1215 } 1216 1217 let vcpu_thread_barrier = Arc::new(Barrier::new( 1218 (desired_vcpus - self.present_vcpus() + 1) as usize, 1219 )); 1220 1221 if let Some(paused) = paused { 1222 self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); 1223 } 1224 1225 info!( 1226 "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", 1227 desired_vcpus, 1228 self.vcpus.len(), 1229 self.present_vcpus(), 1230 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1231 ); 1232 1233 // This reuses any inactive vCPUs as well as any that were newly created 1234 for vcpu_id in self.present_vcpus()..desired_vcpus { 1235 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1236 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1237 } 1238 1239 // Unblock all CPU threads. 
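        // The extra "+ 1" party in the barrier created above is this thread itself;
        // waiting here releases all of the newly started vCPU threads at the same time.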
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal, actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
            self.vcpu_states[usize::from(cpu_id)]
                .pending_removal
                .store(true, Ordering::SeqCst);
        }
    }

    pub fn check_pending_removed_vcpu(&mut self) -> bool {
        for state in self.vcpu_states.iter() {
            if state.active() && state.pending_removal.load(Ordering::SeqCst) {
                return true;
            }
        }
        false
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" so that it can be reused
        state.kill.store(false, Ordering::SeqCst);
        state.pending_removal.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(
        &mut self,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        trace_scoped!("create_boot_vcpus");

        self.create_vcpus(self.boot_vcpus(), snapshot)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
    pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
        self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
    }

    pub fn start_restored_vcpus(&mut self) -> Result<()> {
        self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
            .map_err(|e| {
                Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
            })?;

        Ok(())
    }

    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal {
            return Ok(false);
        }

        if !self.dynamic {
            return Ok(false);
        }

        if self.check_pending_removed_vcpu() {
            return Err(Error::VcpuPendingRemovedVcpu);
        }

        match desired_vcpus.cmp(&self.present_vcpus()) {
            cmp::Ordering::Greater => {
                let vcpus = self.create_vcpus(desired_vcpus, None)?;
                for vcpu in vcpus {
                    self.configure_vcpu(vcpu, None)?
                }
                self.activate_vcpus(desired_vcpus, true, None)?;
                Ok(true)
            }
            cmp::Ordering::Less => {
                self.mark_vcpus_for_removal(desired_vcpus);
                Ok(true)
            }
            _ => Ok(false),
        }
    }

    pub fn shutdown(&mut self) -> Result<()> {
        // Tell the vCPUs to stop themselves next time they go through the loop
        self.vcpus_kill_signalled.store(true, Ordering::SeqCst);

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        // Wait for all the threads to finish. This removes the state from the vector.
        for mut state in self.vcpu_states.drain(..)
{ 1352 state.join_thread()?; 1353 } 1354 1355 Ok(()) 1356 } 1357 1358 #[cfg(feature = "tdx")] 1359 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1360 for vcpu in &self.vcpus { 1361 vcpu.lock() 1362 .unwrap() 1363 .vcpu 1364 .tdx_init(hob_address) 1365 .map_err(Error::InitializeTdx)?; 1366 } 1367 Ok(()) 1368 } 1369 1370 pub fn boot_vcpus(&self) -> u8 { 1371 self.config.boot_vcpus 1372 } 1373 1374 pub fn max_vcpus(&self) -> u8 { 1375 self.config.max_vcpus 1376 } 1377 1378 #[cfg(target_arch = "x86_64")] 1379 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1380 assert!(!self.cpuid.is_empty()); 1381 self.cpuid.clone() 1382 } 1383 1384 fn present_vcpus(&self) -> u8 { 1385 self.vcpu_states 1386 .iter() 1387 .fold(0, |acc, state| acc + state.active() as u8) 1388 } 1389 1390 #[cfg(target_arch = "aarch64")] 1391 pub fn get_mpidrs(&self) -> Vec<u64> { 1392 self.vcpus 1393 .iter() 1394 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1395 .collect() 1396 } 1397 1398 #[cfg(target_arch = "aarch64")] 1399 pub fn get_saved_states(&self) -> Vec<CpuState> { 1400 self.vcpus 1401 .iter() 1402 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1403 .collect() 1404 } 1405 1406 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1407 self.config 1408 .topology 1409 .clone() 1410 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1411 } 1412 1413 pub fn create_madt(&self) -> Sdt { 1414 use crate::acpi; 1415 // This is also checked in the commandline parsing. 1416 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1417 1418 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1419 #[cfg(target_arch = "x86_64")] 1420 { 1421 madt.write(36, arch::layout::APIC_START.0); 1422 1423 for cpu in 0..self.config.max_vcpus { 1424 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology()); 1425 1426 let lapic = LocalX2Apic { 1427 r#type: acpi::ACPI_X2APIC_PROCESSOR, 1428 length: 16, 1429 processor_id: cpu.into(), 1430 apic_id: x2apic_id, 1431 flags: if cpu < self.config.boot_vcpus { 1432 1 << MADT_CPU_ENABLE_FLAG 1433 } else { 1434 0 1435 } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG, 1436 _reserved: 0, 1437 }; 1438 madt.append(lapic); 1439 } 1440 1441 madt.append(Ioapic { 1442 r#type: acpi::ACPI_APIC_IO, 1443 length: 12, 1444 ioapic_id: 0, 1445 apic_address: arch::layout::IOAPIC_START.0 as u32, 1446 gsi_base: 0, 1447 ..Default::default() 1448 }); 1449 1450 madt.append(InterruptSourceOverride { 1451 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1452 length: 10, 1453 bus: 0, 1454 source: 4, 1455 gsi: 4, 1456 flags: 0, 1457 }); 1458 } 1459 1460 #[cfg(target_arch = "aarch64")] 1461 { 1462 /* Notes: 1463 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1464 */ 1465 1466 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 
1467 for cpu in 0..self.config.boot_vcpus { 1468 let vcpu = &self.vcpus[cpu as usize]; 1469 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1470 /* ARMv8 MPIDR format: 1471 Bits [63:40] Must be zero 1472 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1473 Bits [31:24] Must be zero 1474 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1475 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1476 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1477 */ 1478 let mpidr_mask = 0xff_00ff_ffff; 1479 let gicc = GicC { 1480 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1481 length: 80, 1482 reserved0: 0, 1483 cpu_interface_number: cpu as u32, 1484 uid: cpu as u32, 1485 flags: 1, 1486 parking_version: 0, 1487 performance_interrupt: 0, 1488 parked_address: 0, 1489 base_address: 0, 1490 gicv_base_address: 0, 1491 gich_base_address: 0, 1492 vgic_interrupt: 0, 1493 gicr_base_address: 0, 1494 mpidr: mpidr & mpidr_mask, 1495 proc_power_effi_class: 0, 1496 reserved1: 0, 1497 spe_overflow_interrupt: 0, 1498 }; 1499 1500 madt.append(gicc); 1501 } 1502 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1503 1504 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1505 let gicd = GicD { 1506 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1507 length: 24, 1508 reserved0: 0, 1509 gic_id: 0, 1510 base_address: vgic_config.dist_addr, 1511 global_irq_base: 0, 1512 version: 3, 1513 reserved1: [0; 3], 1514 }; 1515 madt.append(gicd); 1516 1517 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1518 let gicr = GicR { 1519 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1520 length: 16, 1521 reserved: 0, 1522 base_address: vgic_config.redists_addr, 1523 range_length: vgic_config.redists_size as u32, 1524 }; 1525 madt.append(gicr); 1526 1527 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 1528 let gicits = GicIts { 1529 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1530 length: 20, 1531 reserved0: 0, 1532 translation_id: 0, 1533 base_address: vgic_config.msi_addr, 1534 reserved1: 0, 1535 }; 1536 madt.append(gicits); 1537 1538 madt.update_checksum(); 1539 } 1540 1541 madt 1542 } 1543 1544 #[cfg(target_arch = "aarch64")] 1545 pub fn create_pptt(&self) -> Sdt { 1546 let pptt_start = 0; 1547 let mut cpus = 0; 1548 let mut uid = 0; 1549 // If topology is not specified, the default setting is: 1550 // 1 package, multiple cores, 1 thread per core 1551 // This is also the behavior when PPTT is missing. 
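        // For example, with max_vcpus = 4 and no topology configured this yields
        // (1, 4, 1): one package containing four single-threaded cores.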
1552 let (threads_per_core, cores_per_package, packages) = 1553 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1554 1555 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1556 1557 for cluster_idx in 0..packages { 1558 if cpus < self.config.boot_vcpus as usize { 1559 let cluster_offset = pptt.len() - pptt_start; 1560 let cluster_hierarchy_node = ProcessorHierarchyNode { 1561 r#type: 0, 1562 length: 20, 1563 reserved: 0, 1564 flags: 0x2, 1565 parent: 0, 1566 acpi_processor_id: cluster_idx as u32, 1567 num_private_resources: 0, 1568 }; 1569 pptt.append(cluster_hierarchy_node); 1570 1571 for core_idx in 0..cores_per_package { 1572 let core_offset = pptt.len() - pptt_start; 1573 1574 if threads_per_core > 1 { 1575 let core_hierarchy_node = ProcessorHierarchyNode { 1576 r#type: 0, 1577 length: 20, 1578 reserved: 0, 1579 flags: 0x2, 1580 parent: cluster_offset as u32, 1581 acpi_processor_id: core_idx as u32, 1582 num_private_resources: 0, 1583 }; 1584 pptt.append(core_hierarchy_node); 1585 1586 for _thread_idx in 0..threads_per_core { 1587 let thread_hierarchy_node = ProcessorHierarchyNode { 1588 r#type: 0, 1589 length: 20, 1590 reserved: 0, 1591 flags: 0xE, 1592 parent: core_offset as u32, 1593 acpi_processor_id: uid as u32, 1594 num_private_resources: 0, 1595 }; 1596 pptt.append(thread_hierarchy_node); 1597 uid += 1; 1598 } 1599 } else { 1600 let thread_hierarchy_node = ProcessorHierarchyNode { 1601 r#type: 0, 1602 length: 20, 1603 reserved: 0, 1604 flags: 0xA, 1605 parent: cluster_offset as u32, 1606 acpi_processor_id: uid as u32, 1607 num_private_resources: 0, 1608 }; 1609 pptt.append(thread_hierarchy_node); 1610 uid += 1; 1611 } 1612 } 1613 cpus += (cores_per_package * threads_per_core) as usize; 1614 } 1615 } 1616 1617 pptt.update_checksum(); 1618 pptt 1619 } 1620 1621 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1622 fn create_standard_regs(&self, cpu_id: u8) -> StandardRegisters { 1623 self.vcpus[usize::from(cpu_id)] 1624 .lock() 1625 .unwrap() 1626 .vcpu 1627 .create_standard_regs() 1628 } 1629 1630 #[cfg(feature = "guest_debug")] 1631 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1632 self.vcpus[usize::from(cpu_id)] 1633 .lock() 1634 .unwrap() 1635 .vcpu 1636 .get_regs() 1637 .map_err(Error::CpuDebug) 1638 } 1639 1640 #[cfg(feature = "guest_debug")] 1641 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1642 self.vcpus[usize::from(cpu_id)] 1643 .lock() 1644 .unwrap() 1645 .vcpu 1646 .set_regs(regs) 1647 .map_err(Error::CpuDebug) 1648 } 1649 1650 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1651 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1652 self.vcpus[usize::from(cpu_id)] 1653 .lock() 1654 .unwrap() 1655 .vcpu 1656 .get_sregs() 1657 .map_err(Error::CpuDebug) 1658 } 1659 1660 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1661 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1662 self.vcpus[usize::from(cpu_id)] 1663 .lock() 1664 .unwrap() 1665 .vcpu 1666 .set_sregs(sregs) 1667 .map_err(Error::CpuDebug) 1668 } 1669 1670 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1671 fn translate_gva( 1672 &self, 1673 _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1674 cpu_id: u8, 1675 gva: u64, 1676 ) -> Result<u64> { 1677 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1678 .lock() 1679 .unwrap() 1680 .vcpu 1681 .translate_gva(gva, /* flags: unused */ 0) 1682 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 
        Ok(gpa)
    }

    ///
    /// On AArch64, `translate_gva` API is not provided by KVM. We implemented
    /// it in VMM by walking through translation tables.
    ///
    /// Address translation is a big topic; here we only focus on the scenario
    /// that happens in the VMM while debugging the kernel. This `translate_gva`
    /// implementation is restricted to:
    /// - Exception Level 1
    /// - Translate high address range only (kernel space)
    ///
    /// This implementation supports the following Armv8-A features related to
    /// address translation:
    /// - FEAT_LPA
    /// - FEAT_LVA
    /// - FEAT_LPA2
    ///
    #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
    fn translate_gva(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: u8,
        gva: u64,
    ) -> Result<u64> {
        let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TCR_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TTBR1_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::ID_AA64MMFR0_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;

        // Bit 55 of the VA determines the range, high (0xFFFxxx...)
        // or low (0x000xxx...).
        let high_range = extract_bits_64!(gva, 55, 1);
        if high_range == 0 {
            info!("VA (0x{:x}) range is not supported!", gva);
            return Ok(gva);
        }

        // High range size offset
        let tsz = extract_bits_64!(tcr_el1, 16, 6);
        // Granule size
        let tg = extract_bits_64!(tcr_el1, 30, 2);
        // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
        let ds = extract_bits_64!(tcr_el1, 59, 1);

        if tsz == 0 {
            info!("VA translation is not ready!");
            return Ok(gva);
        }

        // VA size is determined by TCR_EL1.T1SZ
        let va_size = 64 - tsz;
        // Number of bits in VA consumed in each level of translation
        let stride = match tg {
            3 => 13, // 64KB granule size
            1 => 11, // 16KB granule size
            _ => 9,  // 4KB, default
        };
        // Starting level of walking
        let mut level = 4 - (va_size - 4) / stride;

        // PA or IPA size is determined
        let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
        let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
        // The IPA size in TCR_EL1 and PA Range in ID_AA64MMFR0_EL1 should match.
        // To be safe, we use the minimum value if they are different.
        let pa_range = std::cmp::min(tcr_ips, pa_range);
        // PA size in bits
        let pa_size = match pa_range {
            0 => 32,
            1 => 36,
            2 => 40,
            3 => 42,
            4 => 44,
            5 => 48,
            6 => 52,
            _ => {
                return Err(Error::TranslateVirtualAddress(anyhow!(format!(
                    "PA range not supported {pa_range}"
                ))))
            }
        };

        let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
        let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
        // If FEAT_LPA2 is present, the translation table descriptor holds
        // 50 bits of the table address of next level.
        // Otherwise, it is 48 bits.
1786 let descaddrmask = if ds == 1 { 1787 !0u64 >> (64 - 50) // mask with 50 least significant bits 1788 } else { 1789 !0u64 >> (64 - 48) // mask with 48 least significant bits 1790 }; 1791 let descaddrmask = descaddrmask & !indexmask_grainsize; 1792 1793 // Translation table base address 1794 let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48); 1795 // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table 1796 // address bits [48:51] comes from TTBR1_EL1 bits [2:5]. 1797 if pa_size == 52 { 1798 descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48; 1799 } 1800 1801 // Loop through tables of each level 1802 loop { 1803 // Table offset for current level 1804 let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask; 1805 descaddr |= table_offset; 1806 descaddr &= !7u64; 1807 1808 let mut buf = [0; 8]; 1809 guest_memory 1810 .memory() 1811 .read(&mut buf, GuestAddress(descaddr)) 1812 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1813 let descriptor = u64::from_le_bytes(buf); 1814 1815 descaddr = descriptor & descaddrmask; 1816 // In the case of FEAT_LPA, the next-level translation table address 1817 // bits [48:51] comes from bits [12:15] of the current descriptor. 1818 // For FEAT_LPA2, the next-level translation table address 1819 // bits [50:51] comes from bits [8:9] of the current descriptor, 1820 // bits [48:49] comes from bits [48:49] of the descriptor which was 1821 // handled previously. 1822 if pa_size == 52 { 1823 if ds == 1 { 1824 // FEAT_LPA2 1825 descaddr |= extract_bits_64!(descriptor, 8, 2) << 50; 1826 } else { 1827 // FEAT_LPA 1828 descaddr |= extract_bits_64!(descriptor, 12, 4) << 48; 1829 } 1830 } 1831 1832 if (descriptor & 2) != 0 && (level < 3) { 1833 // This is a table entry. Go down to next level. 
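                // (Bit 1 of a valid descriptor distinguishes table/page entries from block
                // entries; a block entry at level 1 or 2 maps a large contiguous region,
                // e.g. 1GiB/2MiB with 4KiB granules, and ends the walk early. That case is
                // handled after the loop.)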
1834 level += 1; 1835 indexmask = indexmask_grainsize; 1836 continue; 1837 } 1838 1839 break; 1840 } 1841 1842 // We have reached either: 1843 // - a page entry at level 3 or 1844 // - a block entry at level 1 or 2 1845 let page_size = 1u64 << ((stride * (4 - level)) + 3); 1846 descaddr &= !(page_size - 1); 1847 descaddr |= gva & (page_size - 1); 1848 1849 Ok(descaddr) 1850 } 1851 1852 pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) { 1853 self.acpi_address = Some(acpi_address); 1854 } 1855 1856 pub(crate) fn set_interrupt_controller( 1857 &mut self, 1858 interrupt_controller: Arc<Mutex<dyn InterruptController>>, 1859 ) { 1860 self.interrupt_controller = Some(interrupt_controller); 1861 } 1862 1863 pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> { 1864 &self.vcpus_kill_signalled 1865 } 1866 1867 #[cfg(feature = "igvm")] 1868 pub(crate) fn get_cpuid_leaf( 1869 &self, 1870 cpu_id: u8, 1871 eax: u32, 1872 ecx: u32, 1873 xfem: u64, 1874 xss: u64, 1875 ) -> Result<[u32; 4]> { 1876 let leaf_info = self.vcpus[usize::from(cpu_id)] 1877 .lock() 1878 .unwrap() 1879 .vcpu 1880 .get_cpuid_values(eax, ecx, xfem, xss) 1881 .unwrap(); 1882 Ok(leaf_info) 1883 } 1884 1885 #[cfg(feature = "sev_snp")] 1886 pub(crate) fn sev_snp_enabled(&self) -> bool { 1887 self.sev_snp_enabled 1888 } 1889 1890 pub(crate) fn nmi(&self) -> Result<()> { 1891 self.vcpus_kick_signalled.store(true, Ordering::SeqCst); 1892 1893 for state in self.vcpu_states.iter() { 1894 state.signal_thread(); 1895 } 1896 1897 self.vcpus_kick_signalled.store(false, Ordering::SeqCst); 1898 1899 Ok(()) 1900 } 1901 } 1902 1903 struct Cpu { 1904 cpu_id: u8, 1905 proximity_domain: u32, 1906 dynamic: bool, 1907 #[cfg(target_arch = "x86_64")] 1908 topology: Option<(u8, u8, u8)>, 1909 } 1910 1911 #[cfg(target_arch = "x86_64")] 1912 const MADT_CPU_ENABLE_FLAG: usize = 0; 1913 1914 #[cfg(target_arch = "x86_64")] 1915 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; 1916 1917 impl Cpu { 1918 #[cfg(target_arch = "x86_64")] 1919 fn generate_mat(&self) -> Vec<u8> { 1920 let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology); 1921 1922 let lapic = LocalX2Apic { 1923 r#type: crate::acpi::ACPI_X2APIC_PROCESSOR, 1924 length: 16, 1925 processor_id: self.cpu_id.into(), 1926 apic_id: x2apic_id, 1927 flags: 1 << MADT_CPU_ENABLE_FLAG, 1928 _reserved: 0, 1929 }; 1930 1931 let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)]; 1932 // SAFETY: mat_data is large enough to hold lapic 1933 unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic }; 1934 1935 mat_data 1936 } 1937 } 1938 1939 impl Aml for Cpu { 1940 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1941 #[cfg(target_arch = "x86_64")] 1942 let mat_data: Vec<u8> = self.generate_mat(); 1943 #[allow(clippy::if_same_then_else)] 1944 if self.dynamic { 1945 aml::Device::new( 1946 format!("C{:03X}", self.cpu_id).as_str().into(), 1947 vec![ 1948 &aml::Name::new("_HID".into(), &"ACPI0007"), 1949 &aml::Name::new("_UID".into(), &self.cpu_id), 1950 // Currently, AArch64 cannot support following fields. 1951 /* 1952 _STA return value: 1953 Bit [0] – Set if the device is present. 1954 Bit [1] – Set if the device is enabled and decoding its resources. 1955 Bit [2] – Set if the device should be shown in the UI. 1956 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1957 Bit [4] – Set if the battery is present. 1958 Bits [31:5] – Reserved (must be cleared). 
1959 */ 1960 #[cfg(target_arch = "x86_64")] 1961 &aml::Method::new( 1962 "_STA".into(), 1963 0, 1964 false, 1965 // Call into CSTA method which will interrogate device 1966 vec![&aml::Return::new(&aml::MethodCall::new( 1967 "CSTA".into(), 1968 vec![&self.cpu_id], 1969 ))], 1970 ), 1971 &aml::Method::new( 1972 "_PXM".into(), 1973 0, 1974 false, 1975 vec![&aml::Return::new(&self.proximity_domain)], 1976 ), 1977 // The Linux kernel expects every CPU device to have a _MAT entry 1978 // containing the LAPIC for this processor with the enabled bit set 1979 // even it if is disabled in the MADT (non-boot CPU) 1980 #[cfg(target_arch = "x86_64")] 1981 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 1982 // Trigger CPU ejection 1983 #[cfg(target_arch = "x86_64")] 1984 &aml::Method::new( 1985 "_EJ0".into(), 1986 1, 1987 false, 1988 // Call into CEJ0 method which will actually eject device 1989 vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])], 1990 ), 1991 ], 1992 ) 1993 .to_aml_bytes(sink); 1994 } else { 1995 aml::Device::new( 1996 format!("C{:03X}", self.cpu_id).as_str().into(), 1997 vec![ 1998 &aml::Name::new("_HID".into(), &"ACPI0007"), 1999 &aml::Name::new("_UID".into(), &self.cpu_id), 2000 #[cfg(target_arch = "x86_64")] 2001 &aml::Method::new( 2002 "_STA".into(), 2003 0, 2004 false, 2005 // Mark CPU present see CSTA implementation 2006 vec![&aml::Return::new(&0xfu8)], 2007 ), 2008 &aml::Method::new( 2009 "_PXM".into(), 2010 0, 2011 false, 2012 vec![&aml::Return::new(&self.proximity_domain)], 2013 ), 2014 // The Linux kernel expects every CPU device to have a _MAT entry 2015 // containing the LAPIC for this processor with the enabled bit set 2016 // even it if is disabled in the MADT (non-boot CPU) 2017 #[cfg(target_arch = "x86_64")] 2018 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 2019 ], 2020 ) 2021 .to_aml_bytes(sink); 2022 } 2023 } 2024 } 2025 2026 struct CpuNotify { 2027 cpu_id: u8, 2028 } 2029 2030 impl Aml for CpuNotify { 2031 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2032 let object = aml::Path::new(&format!("C{:03X}", self.cpu_id)); 2033 aml::If::new( 2034 &aml::Equal::new(&aml::Arg(0), &self.cpu_id), 2035 vec![&aml::Notify::new(&object, &aml::Arg(1))], 2036 ) 2037 .to_aml_bytes(sink) 2038 } 2039 } 2040 2041 struct CpuMethods { 2042 max_vcpus: u8, 2043 dynamic: bool, 2044 } 2045 2046 impl Aml for CpuMethods { 2047 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2048 if self.dynamic { 2049 // CPU status method 2050 aml::Method::new( 2051 "CSTA".into(), 2052 1, 2053 true, 2054 vec![ 2055 // Take lock defined above 2056 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2057 // Write CPU number (in first argument) to I/O port via field 2058 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 2059 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2060 // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning) 2061 &aml::If::new( 2062 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE), 2063 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 2064 ), 2065 // Release lock 2066 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2067 // Return 0 or 0xf 2068 &aml::Return::new(&aml::Local(0)), 2069 ], 2070 ) 2071 .to_aml_bytes(sink); 2072 2073 let mut cpu_notifies = Vec::new(); 2074 for cpu_id in 0..self.max_vcpus { 2075 cpu_notifies.push(CpuNotify { cpu_id }); 2076 } 2077 2078 let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new(); 2079 for cpu_id in 
0..self.max_vcpus { 2080 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 2081 } 2082 2083 aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink); 2084 2085 aml::Method::new( 2086 "CEJ0".into(), 2087 1, 2088 true, 2089 vec![ 2090 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2091 // Write CPU number (in first argument) to I/O port via field 2092 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 2093 // Set CEJ0 bit 2094 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 2095 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2096 ], 2097 ) 2098 .to_aml_bytes(sink); 2099 2100 aml::Method::new( 2101 "CSCN".into(), 2102 0, 2103 true, 2104 vec![ 2105 // Take lock defined above 2106 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2107 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2108 &aml::While::new( 2109 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 2110 vec![ 2111 // Write CPU number (in first argument) to I/O port via field 2112 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 2113 // Check if CINS bit is set 2114 &aml::If::new( 2115 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 2116 // Notify device if it is 2117 vec![ 2118 &aml::MethodCall::new( 2119 "CTFY".into(), 2120 vec![&aml::Local(0), &aml::ONE], 2121 ), 2122 // Reset CINS bit 2123 &aml::Store::new( 2124 &aml::Path::new("\\_SB_.PRES.CINS"), 2125 &aml::ONE, 2126 ), 2127 ], 2128 ), 2129 // Check if CRMV bit is set 2130 &aml::If::new( 2131 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 2132 // Notify device if it is (with the eject constant 0x3) 2133 vec![ 2134 &aml::MethodCall::new( 2135 "CTFY".into(), 2136 vec![&aml::Local(0), &3u8], 2137 ), 2138 // Reset CRMV bit 2139 &aml::Store::new( 2140 &aml::Path::new("\\_SB_.PRES.CRMV"), 2141 &aml::ONE, 2142 ), 2143 ], 2144 ), 2145 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 2146 ], 2147 ), 2148 // Release lock 2149 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2150 ], 2151 ) 2152 .to_aml_bytes(sink) 2153 } else { 2154 aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink) 2155 } 2156 } 2157 } 2158 2159 impl Aml for CpuManager { 2160 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2161 #[cfg(target_arch = "x86_64")] 2162 if let Some(acpi_address) = self.acpi_address { 2163 // CPU hotplug controller 2164 aml::Device::new( 2165 "_SB_.PRES".into(), 2166 vec![ 2167 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 2168 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 2169 // Mutex to protect concurrent access as we write to choose CPU and then read back status 2170 &aml::Mutex::new("CPLK".into(), 0), 2171 &aml::Name::new( 2172 "_CRS".into(), 2173 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2174 aml::AddressSpaceCacheable::NotCacheable, 2175 true, 2176 acpi_address.0, 2177 acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 2178 None, 2179 )]), 2180 ), 2181 // OpRegion and Fields map MMIO range into individual field values 2182 &aml::OpRegion::new( 2183 "PRST".into(), 2184 aml::OpRegionSpace::SystemMemory, 2185 &(acpi_address.0 as usize), 2186 &CPU_MANAGER_ACPI_SIZE, 2187 ), 2188 &aml::Field::new( 2189 "PRST".into(), 2190 aml::FieldAccessType::Byte, 2191 aml::FieldLockRule::NoLock, 2192 aml::FieldUpdateRule::WriteAsZeroes, 2193 vec![ 2194 aml::FieldEntry::Reserved(32), 2195 aml::FieldEntry::Named(*b"CPEN", 1), 2196 aml::FieldEntry::Named(*b"CINS", 1), 2197 
                            aml::FieldEntry::Named(*b"CRMV", 1),
                            aml::FieldEntry::Named(*b"CEJ0", 1),
                            aml::FieldEntry::Reserved(4),
                            aml::FieldEntry::Named(*b"CCMD", 8),
                        ],
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"CSEL", 32),
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CDAT", 32),
                        ],
                    ),
                ],
            )
            .to_aml_bytes(sink);
        }

        // CPU devices
        let hid = aml::Name::new("_HID".into(), &"ACPI0010");
        let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
        // Bundle methods together under a common object
        let methods = CpuMethods {
            max_vcpus: self.config.max_vcpus,
            dynamic: self.dynamic,
        };
        let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];

        #[cfg(target_arch = "x86_64")]
        let topology = self.get_vcpu_topology();
        let mut cpu_devices = Vec::new();
        for cpu_id in 0..self.config.max_vcpus {
            let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
            let cpu_device = Cpu {
                cpu_id,
                proximity_domain,
                dynamic: self.dynamic,
                #[cfg(target_arch = "x86_64")]
                topology,
            };

            cpu_devices.push(cpu_device);
        }

        for cpu_device in cpu_devices.iter() {
            cpu_data_inner.push(cpu_device);
        }

        aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
    }
}

impl Pausable for CpuManager {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        // Tell the vCPUs to pause themselves next time they exit
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        for vcpu in self.vcpus.iter() {
            let mut vcpu = vcpu.lock().unwrap();
            vcpu.pause()?;
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            if !self.config.kvm_hyperv {
                vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
                    MigratableError::Pause(anyhow!(
                        "Could not notify guest it has been paused {:?}",
                        e
                    ))
                })?;
            }
        }

        // The vCPU thread will change its paused state before parking, so wait here for each
        // activated vCPU to change its state, ensuring that it has parked.
        for state in self.vcpu_states.iter() {
            if state.active() {
                while !state.paused.load(Ordering::SeqCst) {
                    // To avoid a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }

        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        for vcpu in self.vcpus.iter() {
            vcpu.lock().unwrap().resume()?;
        }

        // Clear the vCPU pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the vCPU threads.
        // Once unparked, the next thing they will do is check the pause
        // boolean. Since it will now be false, they will exit their pause loop
        // and resume running guest code.
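        // (For reference, a rough sketch of the other side of this protocol:
        // each vCPU thread conceptually runs
        //
        //     while vcpus_pause_signalled.load(Ordering::SeqCst) {
        //         paused.store(true, Ordering::SeqCst);
        //         thread::park();
        //     }
        //
        // so clearing the boolean above and unparking the threads below is
        // enough for them to fall out of that loop and re-enter the guest.)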
2305 for state in self.vcpu_states.iter() { 2306 state.paused.store(false, Ordering::SeqCst); 2307 state.unpark_thread(); 2308 } 2309 Ok(()) 2310 } 2311 } 2312 2313 impl Snapshottable for CpuManager { 2314 fn id(&self) -> String { 2315 CPU_MANAGER_SNAPSHOT_ID.to_string() 2316 } 2317 2318 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2319 let mut cpu_manager_snapshot = Snapshot::default(); 2320 2321 // The CpuManager snapshot is a collection of all vCPUs snapshots. 2322 for vcpu in &self.vcpus { 2323 let mut vcpu = vcpu.lock().unwrap(); 2324 cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?); 2325 } 2326 2327 Ok(cpu_manager_snapshot) 2328 } 2329 } 2330 2331 impl Transportable for CpuManager {} 2332 impl Migratable for CpuManager {} 2333 2334 #[cfg(feature = "guest_debug")] 2335 impl Debuggable for CpuManager { 2336 #[cfg(feature = "kvm")] 2337 fn set_guest_debug( 2338 &self, 2339 cpu_id: usize, 2340 addrs: &[GuestAddress], 2341 singlestep: bool, 2342 ) -> std::result::Result<(), DebuggableError> { 2343 self.vcpus[cpu_id] 2344 .lock() 2345 .unwrap() 2346 .vcpu 2347 .set_guest_debug(addrs, singlestep) 2348 .map_err(DebuggableError::SetDebug) 2349 } 2350 2351 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2352 Ok(()) 2353 } 2354 2355 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2356 Ok(()) 2357 } 2358 2359 #[cfg(target_arch = "x86_64")] 2360 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2361 // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15 2362 let gregs = self 2363 .get_regs(cpu_id as u8) 2364 .map_err(DebuggableError::ReadRegs)?; 2365 let regs = [ 2366 gregs.get_rax(), 2367 gregs.get_rbx(), 2368 gregs.get_rcx(), 2369 gregs.get_rdx(), 2370 gregs.get_rsi(), 2371 gregs.get_rdi(), 2372 gregs.get_rbp(), 2373 gregs.get_rsp(), 2374 gregs.get_r8(), 2375 gregs.get_r9(), 2376 gregs.get_r10(), 2377 gregs.get_r11(), 2378 gregs.get_r12(), 2379 gregs.get_r13(), 2380 gregs.get_r14(), 2381 gregs.get_r15(), 2382 ]; 2383 2384 // GDB exposes 32-bit eflags instead of 64-bit rflags. 
        // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
        let eflags = gregs.get_rflags() as u32;
        let rip = gregs.get_rip();

        // Segment registers: CS, SS, DS, ES, FS, GS
        let sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let segments = X86SegmentRegs {
            cs: sregs.cs.selector as u32,
            ss: sregs.ss.selector as u32,
            ds: sregs.ds.selector as u32,
            es: sregs.es.selector as u32,
            fs: sregs.fs.selector as u32,
            gs: sregs.gs.selector as u32,
        };

        // TODO: Add other registers

        Ok(CoreRegs {
            regs,
            eflags,
            rip,
            segments,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "aarch64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        Ok(CoreRegs {
            x: gregs.get_regs(),
            sp: gregs.get_sp(),
            pc: gregs.get_pc(),
            ..Default::default()
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let orig_gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let mut gregs = self.create_standard_regs(cpu_id as u8);
        gregs.set_rax(regs.regs[0]);
        gregs.set_rbx(regs.regs[1]);
        gregs.set_rcx(regs.regs[2]);
        gregs.set_rdx(regs.regs[3]);
        gregs.set_rsi(regs.regs[4]);
        gregs.set_rdi(regs.regs[5]);
        gregs.set_rbp(regs.regs[6]);
        gregs.set_rsp(regs.regs[7]);
        gregs.set_r8(regs.regs[8]);
        gregs.set_r9(regs.regs[9]);
        gregs.set_r10(regs.regs[10]);
        gregs.set_r11(regs.regs[11]);
        gregs.set_r12(regs.regs[12]);
        gregs.set_r13(regs.regs[13]);
        gregs.set_r14(regs.regs[14]);
        gregs.set_r15(regs.regs[15]);
        gregs.set_rip(regs.rip);
        // Update only the lower 32 bits of rflags.
        gregs.set_rflags((orig_gregs.get_rflags() & !(u32::MAX as u64)) | (regs.eflags as u64));

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // Since GDB only cares about the selectors, we call get_sregs() first.
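        // Only the 16-bit selector fields are overwritten below; the hidden
        // parts (base, limit and access rights) returned by get_sregs() are
        // written back unchanged.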
2461 let mut sregs = self 2462 .get_sregs(cpu_id as u8) 2463 .map_err(DebuggableError::ReadRegs)?; 2464 sregs.cs.selector = regs.segments.cs as u16; 2465 sregs.ss.selector = regs.segments.ss as u16; 2466 sregs.ds.selector = regs.segments.ds as u16; 2467 sregs.es.selector = regs.segments.es as u16; 2468 sregs.fs.selector = regs.segments.fs as u16; 2469 sregs.gs.selector = regs.segments.gs as u16; 2470 2471 self.set_sregs(cpu_id as u8, &sregs) 2472 .map_err(DebuggableError::WriteRegs)?; 2473 2474 // TODO: Add other registers 2475 2476 Ok(()) 2477 } 2478 2479 #[cfg(target_arch = "aarch64")] 2480 fn write_regs( 2481 &self, 2482 cpu_id: usize, 2483 regs: &CoreRegs, 2484 ) -> std::result::Result<(), DebuggableError> { 2485 let mut gregs = self 2486 .get_regs(cpu_id as u8) 2487 .map_err(DebuggableError::ReadRegs)?; 2488 2489 gregs.set_regs(regs.x); 2490 gregs.set_sp(regs.sp); 2491 gregs.set_pc(regs.pc); 2492 2493 self.set_regs(cpu_id as u8, &gregs) 2494 .map_err(DebuggableError::WriteRegs)?; 2495 2496 Ok(()) 2497 } 2498 2499 fn read_mem( 2500 &self, 2501 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2502 cpu_id: usize, 2503 vaddr: GuestAddress, 2504 len: usize, 2505 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2506 let mut buf = vec![0; len]; 2507 let mut total_read = 0_u64; 2508 2509 while total_read < len as u64 { 2510 let gaddr = vaddr.0 + total_read; 2511 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2512 Ok(paddr) => paddr, 2513 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 2514 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2515 }; 2516 let psize = arch::PAGE_SIZE as u64; 2517 let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1))); 2518 guest_memory 2519 .memory() 2520 .read( 2521 &mut buf[total_read as usize..total_read as usize + read_len as usize], 2522 GuestAddress(paddr), 2523 ) 2524 .map_err(DebuggableError::ReadMem)?; 2525 total_read += read_len; 2526 } 2527 Ok(buf) 2528 } 2529 2530 fn write_mem( 2531 &self, 2532 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2533 cpu_id: usize, 2534 vaddr: &GuestAddress, 2535 data: &[u8], 2536 ) -> std::result::Result<(), DebuggableError> { 2537 let mut total_written = 0_u64; 2538 2539 while total_written < data.len() as u64 { 2540 let gaddr = vaddr.0 + total_written; 2541 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2542 Ok(paddr) => paddr, 2543 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 
2544 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2545 }; 2546 let psize = arch::PAGE_SIZE as u64; 2547 let write_len = std::cmp::min( 2548 data.len() as u64 - total_written, 2549 psize - (paddr & (psize - 1)), 2550 ); 2551 guest_memory 2552 .memory() 2553 .write( 2554 &data[total_written as usize..total_written as usize + write_len as usize], 2555 GuestAddress(paddr), 2556 ) 2557 .map_err(DebuggableError::WriteMem)?; 2558 total_written += write_len; 2559 } 2560 Ok(()) 2561 } 2562 2563 fn active_vcpus(&self) -> usize { 2564 self.present_vcpus() as usize 2565 } 2566 } 2567 2568 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2569 impl Elf64Writable for CpuManager {} 2570 2571 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2572 impl CpuElf64Writable for CpuManager { 2573 fn cpu_write_elf64_note( 2574 &mut self, 2575 dump_state: &DumpState, 2576 ) -> std::result::Result<(), GuestDebuggableError> { 2577 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2578 for vcpu in &self.vcpus { 2579 let note_size = self.get_note_size(NoteDescType::Elf, 1); 2580 let mut pos: usize = 0; 2581 let mut buf = vec![0; note_size as usize]; 2582 let descsz = size_of::<X86_64ElfPrStatus>(); 2583 let vcpu_id = vcpu.lock().unwrap().id; 2584 2585 let note = Elf64_Nhdr { 2586 n_namesz: COREDUMP_NAME_SIZE, 2587 n_descsz: descsz as u32, 2588 n_type: NT_PRSTATUS, 2589 }; 2590 2591 let bytes: &[u8] = note.as_slice(); 2592 buf.splice(0.., bytes.to_vec()); 2593 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2594 buf.resize(pos + 4, 0); 2595 buf.splice(pos.., "CORE".to_string().into_bytes()); 2596 2597 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2598 buf.resize(pos + 32 + 4, 0); 2599 let pid = vcpu_id as u64; 2600 let bytes: &[u8] = pid.as_slice(); 2601 buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */ 2602 2603 pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>(); 2604 2605 let orig_rax: u64 = 0; 2606 let gregs = self.vcpus[usize::from(vcpu_id)] 2607 .lock() 2608 .unwrap() 2609 .vcpu 2610 .get_regs() 2611 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2612 2613 let regs1 = [ 2614 gregs.get_r15(), 2615 gregs.get_r14(), 2616 gregs.get_r13(), 2617 gregs.get_r12(), 2618 gregs.get_rbp(), 2619 gregs.get_rbx(), 2620 gregs.get_r11(), 2621 gregs.get_r10(), 2622 ]; 2623 let regs2 = [ 2624 gregs.get_r9(), 2625 gregs.get_r8(), 2626 gregs.get_rax(), 2627 gregs.get_rcx(), 2628 gregs.get_rdx(), 2629 gregs.get_rsi(), 2630 gregs.get_rdi(), 2631 orig_rax, 2632 ]; 2633 2634 let sregs = self.vcpus[usize::from(vcpu_id)] 2635 .lock() 2636 .unwrap() 2637 .vcpu 2638 .get_sregs() 2639 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2640 2641 debug!( 2642 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}", 2643 gregs.get_rip(), 2644 gregs.get_rsp(), 2645 sregs.gs.base, 2646 sregs.cs.selector, 2647 sregs.ss.selector, 2648 sregs.ds.selector, 2649 ); 2650 2651 let regs = X86_64UserRegs { 2652 regs1, 2653 regs2, 2654 rip: gregs.get_rip(), 2655 cs: sregs.cs.selector as u64, 2656 eflags: gregs.get_rflags(), 2657 rsp: gregs.get_rsp(), 2658 ss: sregs.ss.selector as u64, 2659 fs_base: sregs.fs.base, 2660 gs_base: sregs.gs.base, 2661 ds: sregs.ds.selector as u64, 2662 es: sregs.es.selector as u64, 2663 fs: sregs.fs.selector as u64, 2664 gs: sregs.gs.selector as u64, 2665 }; 2666 2667 // let bytes: &[u8] = unsafe { any_as_u8_slice(®s) }; 2668 let bytes: &[u8] = regs.as_slice(); 2669 buf.resize(note_size as usize, 0); 2670 
buf.splice(pos.., bytes.to_vec()); 2671 buf.resize(note_size as usize, 0); 2672 2673 coredump_file 2674 .write(&buf) 2675 .map_err(GuestDebuggableError::CoredumpFile)?; 2676 } 2677 2678 Ok(()) 2679 } 2680 2681 fn cpu_write_vmm_note( 2682 &mut self, 2683 dump_state: &DumpState, 2684 ) -> std::result::Result<(), GuestDebuggableError> { 2685 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2686 for vcpu in &self.vcpus { 2687 let note_size = self.get_note_size(NoteDescType::Vmm, 1); 2688 let mut pos: usize = 0; 2689 let mut buf = vec![0; note_size as usize]; 2690 let descsz = size_of::<DumpCpusState>(); 2691 let vcpu_id = vcpu.lock().unwrap().id; 2692 2693 let note = Elf64_Nhdr { 2694 n_namesz: COREDUMP_NAME_SIZE, 2695 n_descsz: descsz as u32, 2696 n_type: 0, 2697 }; 2698 2699 let bytes: &[u8] = note.as_slice(); 2700 buf.splice(0.., bytes.to_vec()); 2701 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2702 2703 buf.resize(pos + 4, 0); 2704 buf.splice(pos.., "QEMU".to_string().into_bytes()); 2705 2706 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2707 2708 let gregs = self.vcpus[usize::from(vcpu_id)] 2709 .lock() 2710 .unwrap() 2711 .vcpu 2712 .get_regs() 2713 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2714 2715 let regs1 = [ 2716 gregs.get_rax(), 2717 gregs.get_rbx(), 2718 gregs.get_rcx(), 2719 gregs.get_rdx(), 2720 gregs.get_rsi(), 2721 gregs.get_rdi(), 2722 gregs.get_rsp(), 2723 gregs.get_rbp(), 2724 ]; 2725 2726 let regs2 = [ 2727 gregs.get_r8(), 2728 gregs.get_r9(), 2729 gregs.get_r10(), 2730 gregs.get_r11(), 2731 gregs.get_r12(), 2732 gregs.get_r13(), 2733 gregs.get_r14(), 2734 gregs.get_r15(), 2735 ]; 2736 2737 let sregs = self.vcpus[usize::from(vcpu_id)] 2738 .lock() 2739 .unwrap() 2740 .vcpu 2741 .get_sregs() 2742 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2743 2744 let mut msrs = vec![MsrEntry { 2745 index: msr_index::MSR_KERNEL_GS_BASE, 2746 ..Default::default() 2747 }]; 2748 2749 self.vcpus[vcpu_id as usize] 2750 .lock() 2751 .unwrap() 2752 .vcpu 2753 .get_msrs(&mut msrs) 2754 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?; 2755 let kernel_gs_base = msrs[0].data; 2756 2757 let cs = CpuSegment::new(sregs.cs); 2758 let ds = CpuSegment::new(sregs.ds); 2759 let es = CpuSegment::new(sregs.es); 2760 let fs = CpuSegment::new(sregs.fs); 2761 let gs = CpuSegment::new(sregs.gs); 2762 let ss = CpuSegment::new(sregs.ss); 2763 let ldt = CpuSegment::new(sregs.ldt); 2764 let tr = CpuSegment::new(sregs.tr); 2765 let gdt = CpuSegment::new_from_table(sregs.gdt); 2766 let idt = CpuSegment::new_from_table(sregs.idt); 2767 let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4]; 2768 let regs = DumpCpusState { 2769 version: 1, 2770 size: size_of::<DumpCpusState>() as u32, 2771 regs1, 2772 regs2, 2773 rip: gregs.get_rip(), 2774 rflags: gregs.get_rflags(), 2775 cs, 2776 ds, 2777 es, 2778 fs, 2779 gs, 2780 ss, 2781 ldt, 2782 tr, 2783 gdt, 2784 idt, 2785 cr, 2786 kernel_gs_base, 2787 }; 2788 2789 let bytes: &[u8] = regs.as_slice(); 2790 buf.resize(note_size as usize, 0); 2791 buf.splice(pos.., bytes.to_vec()); 2792 buf.resize(note_size as usize, 0); 2793 2794 coredump_file 2795 .write(&buf) 2796 .map_err(GuestDebuggableError::CoredumpFile)?; 2797 } 2798 2799 Ok(()) 2800 } 2801 } 2802 2803 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2804 #[cfg(test)] 2805 mod tests { 2806 use arch::layout::BOOT_STACK_POINTER; 2807 use arch::layout::ZERO_PAGE_START; 2808 use arch::x86_64::interrupts::*; 
    use arch::x86_64::regs::*;
    use hypervisor::arch::x86::{FpuState, LapicState};
    use hypervisor::StandardRegisters;
    use linux_loader::loader::bootparam::setup_header;

    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail if there is no irqchip beforehand.
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
        let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
        let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate KVM-related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should
        // remove it at all.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }];

        // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read 1
        // in this test case scenario.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we did setup_msrs. We need to assert that the
        // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
        // expect.
2886 let entry_vec = vcpu.boot_msr_entries(); 2887 assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]); 2888 } 2889 2890 #[test] 2891 fn test_setup_regs_for_pvh() { 2892 let hv = hypervisor::new().unwrap(); 2893 let vm = hv.create_vm().expect("new VM fd creation failed"); 2894 let vcpu = vm.create_vcpu(0, None).unwrap(); 2895 2896 let mut expected_regs: StandardRegisters = vcpu.create_standard_regs(); 2897 expected_regs.set_rflags(0x0000000000000002u64); 2898 expected_regs.set_rbx(arch::layout::PVH_INFO_START.0); 2899 expected_regs.set_rip(1); 2900 2901 setup_regs( 2902 &vcpu, 2903 arch::EntryPoint { 2904 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()), 2905 setup_header: None, 2906 }, 2907 ) 2908 .unwrap(); 2909 2910 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2911 assert_eq!(actual_regs, expected_regs); 2912 } 2913 2914 #[test] 2915 fn test_setup_regs_for_bzimage() { 2916 let hv = hypervisor::new().unwrap(); 2917 let vm = hv.create_vm().expect("new VM fd creation failed"); 2918 let vcpu = vm.create_vcpu(0, None).unwrap(); 2919 2920 let mut expected_regs: StandardRegisters = vcpu.create_standard_regs(); 2921 expected_regs.set_rflags(0x0000000000000002u64); 2922 expected_regs.set_rip(1); 2923 expected_regs.set_rsp(BOOT_STACK_POINTER.0); 2924 expected_regs.set_rsi(ZERO_PAGE_START.0); 2925 2926 setup_regs( 2927 &vcpu, 2928 arch::EntryPoint { 2929 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()), 2930 setup_header: Some(setup_header { 2931 ..Default::default() 2932 }), 2933 }, 2934 ) 2935 .unwrap(); 2936 2937 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2938 assert_eq!(actual_regs, expected_regs); 2939 } 2940 } 2941 2942 #[cfg(target_arch = "aarch64")] 2943 #[cfg(test)] 2944 mod tests { 2945 use arch::{aarch64::regs, layout}; 2946 use hypervisor::kvm::aarch64::is_system_register; 2947 use hypervisor::kvm::kvm_bindings::{ 2948 kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_ARM_CORE, 2949 KVM_REG_SIZE_U64, 2950 }; 2951 use hypervisor::{arm64_core_reg_id, offset_of}; 2952 use std::mem; 2953 2954 #[test] 2955 fn test_setup_regs() { 2956 let hv = hypervisor::new().unwrap(); 2957 let vm = hv.create_vm().unwrap(); 2958 let vcpu = vm.create_vcpu(0, None).unwrap(); 2959 2960 let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0); 2961 // Must fail when vcpu is not initialized yet. 2962 assert!(res.is_err()); 2963 2964 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2965 vm.get_preferred_target(&mut kvi).unwrap(); 2966 vcpu.vcpu_init(&kvi).unwrap(); 2967 2968 assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok()); 2969 } 2970 2971 #[test] 2972 fn test_read_mpidr() { 2973 let hv = hypervisor::new().unwrap(); 2974 let vm = hv.create_vm().unwrap(); 2975 let vcpu = vm.create_vcpu(0, None).unwrap(); 2976 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2977 vm.get_preferred_target(&mut kvi).unwrap(); 2978 2979 // Must fail when vcpu is not initialized yet. 
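        // (Before KVM_ARM_VCPU_INIT, KVM typically rejects such register
        // accesses with ENOEXEC ("Exec format error"), the same errno that
        // test_save_restore_core_regs below checks for explicitly.)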
2980 assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err()); 2981 2982 vcpu.vcpu_init(&kvi).unwrap(); 2983 assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000); 2984 } 2985 2986 #[test] 2987 fn test_is_system_register() { 2988 let offset = offset_of!(user_pt_regs, pc); 2989 let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset); 2990 assert!(!is_system_register(regid)); 2991 let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64; 2992 assert!(is_system_register(regid)); 2993 } 2994 2995 #[test] 2996 fn test_save_restore_core_regs() { 2997 let hv = hypervisor::new().unwrap(); 2998 let vm = hv.create_vm().unwrap(); 2999 let vcpu = vm.create_vcpu(0, None).unwrap(); 3000 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 3001 vm.get_preferred_target(&mut kvi).unwrap(); 3002 3003 // Must fail when vcpu is not initialized yet. 3004 let res = vcpu.get_regs(); 3005 assert!(res.is_err()); 3006 assert_eq!( 3007 format!("{}", res.unwrap_err()), 3008 "Failed to get core register: Exec format error (os error 8)" 3009 ); 3010 3011 let mut state = vcpu.create_standard_regs(); 3012 let res = vcpu.set_regs(&state); 3013 assert!(res.is_err()); 3014 assert_eq!( 3015 format!("{}", res.unwrap_err()), 3016 "Failed to set core register: Exec format error (os error 8)" 3017 ); 3018 3019 vcpu.vcpu_init(&kvi).unwrap(); 3020 let res = vcpu.get_regs(); 3021 assert!(res.is_ok()); 3022 state = res.unwrap(); 3023 assert_eq!(state.get_pstate(), 0x3C5); 3024 3025 assert!(vcpu.set_regs(&state).is_ok()); 3026 } 3027 3028 #[test] 3029 fn test_get_set_mpstate() { 3030 let hv = hypervisor::new().unwrap(); 3031 let vm = hv.create_vm().unwrap(); 3032 let vcpu = vm.create_vcpu(0, None).unwrap(); 3033 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 3034 vm.get_preferred_target(&mut kvi).unwrap(); 3035 3036 let res = vcpu.get_mp_state(); 3037 assert!(res.is_ok()); 3038 assert!(vcpu.set_mp_state(res.unwrap()).is_ok()); 3039 } 3040 } 3041