1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use std::collections::BTreeMap; 15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 16 use std::io::Write; 17 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 18 use std::mem::size_of; 19 use std::os::unix::thread::JoinHandleExt; 20 use std::sync::atomic::{AtomicBool, Ordering}; 21 use std::sync::{Arc, Barrier, Mutex}; 22 use std::{cmp, io, result, thread}; 23 24 #[cfg(not(target_arch = "riscv64"))] 25 use acpi_tables::sdt::Sdt; 26 use acpi_tables::{aml, Aml}; 27 use anyhow::anyhow; 28 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 29 use arch::aarch64::regs; 30 #[cfg(target_arch = "x86_64")] 31 use arch::x86_64::get_x2apic_id; 32 use arch::{EntryPoint, NumaNodes}; 33 #[cfg(target_arch = "aarch64")] 34 use devices::gic::Gic; 35 use devices::interrupt_controller::InterruptController; 36 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 37 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 38 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 39 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs}; 40 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 41 use hypervisor::arch::x86::msr_index; 42 #[cfg(target_arch = "x86_64")] 43 use hypervisor::arch::x86::CpuIdEntry; 44 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 45 use hypervisor::arch::x86::MsrEntry; 46 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 47 use hypervisor::arch::x86::SpecialRegisters; 48 #[cfg(feature = "tdx")] 49 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus}; 50 #[cfg(target_arch = "x86_64")] 51 use hypervisor::CpuVendor; 52 #[cfg(feature = "kvm")] 53 use hypervisor::HypervisorType; 54 #[cfg(feature = "guest_debug")] 55 use hypervisor::StandardRegisters; 56 use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps}; 57 use libc::{c_void, siginfo_t}; 58 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 59 use linux_loader::elf::Elf64_Nhdr; 60 use seccompiler::{apply_filter, SeccompAction}; 61 use thiserror::Error; 62 use tracer::trace_scoped; 63 use vm_device::BusDevice; 64 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 65 use vm_memory::ByteValued; 66 #[cfg(feature = "guest_debug")] 67 use vm_memory::{Bytes, GuestAddressSpace}; 68 use vm_memory::{GuestAddress, GuestMemoryAtomic}; 69 use vm_migration::{ 70 snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, 71 Transportable, 72 }; 73 use vmm_sys_util::eventfd::EventFd; 74 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN}; 75 use zerocopy::{FromBytes, Immutable, IntoBytes}; 76 77 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 78 use crate::coredump::{ 79 CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable, 80 GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE, 81 NT_PRSTATUS, 82 }; 83 #[cfg(feature = "guest_debug")] 84 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError}; 85 #[cfg(target_arch = "x86_64")] 86 use 
crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::vm_config::CpusConfig;
use crate::{GuestMemoryMmap, CPU_MANAGER_SNAPSHOT_ID};

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
macro_rules! extract_bits_64_without_offset {
    ($value: tt, $length: tt) => {
        $value & (!0u64 >> (64 - $length))
    };
}

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[error("vCPU removal still pending")]
    VcpuPendingRemovedVcpu,

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error setting vCPU processor features: {0}")]
    VcpuSetProcessorFeatures(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error finalising vCPU: {0}")]
    VcpuArmFinalize(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

#[cfg(target_arch = "x86_64")] 191 #[error("Error setting up AMX: {0}")] 192 AmxEnable(#[source] anyhow::Error), 193 194 #[error("Maximum number of vCPUs exceeds host limit")] 195 MaximumVcpusExceeded, 196 197 #[cfg(feature = "sev_snp")] 198 #[error("Failed to set sev control register: {0}")] 199 SetSevControlRegister(#[source] hypervisor::HypervisorCpuError), 200 201 #[cfg(target_arch = "x86_64")] 202 #[error("Failed to inject NMI")] 203 NmiError(hypervisor::HypervisorCpuError), 204 } 205 pub type Result<T> = result::Result<T, Error>; 206 207 #[cfg(target_arch = "x86_64")] 208 #[allow(dead_code)] 209 #[repr(C, packed)] 210 #[derive(IntoBytes, Immutable, FromBytes)] 211 struct LocalX2Apic { 212 pub r#type: u8, 213 pub length: u8, 214 pub _reserved: u16, 215 pub apic_id: u32, 216 pub flags: u32, 217 pub processor_id: u32, 218 } 219 220 #[allow(dead_code)] 221 #[repr(C, packed)] 222 #[derive(Default, IntoBytes, Immutable, FromBytes)] 223 struct Ioapic { 224 pub r#type: u8, 225 pub length: u8, 226 pub ioapic_id: u8, 227 _reserved: u8, 228 pub apic_address: u32, 229 pub gsi_base: u32, 230 } 231 232 #[cfg(target_arch = "aarch64")] 233 #[allow(dead_code)] 234 #[repr(C, packed)] 235 #[derive(IntoBytes, Immutable, FromBytes)] 236 struct GicC { 237 pub r#type: u8, 238 pub length: u8, 239 pub reserved0: u16, 240 pub cpu_interface_number: u32, 241 pub uid: u32, 242 pub flags: u32, 243 pub parking_version: u32, 244 pub performance_interrupt: u32, 245 pub parked_address: u64, 246 pub base_address: u64, 247 pub gicv_base_address: u64, 248 pub gich_base_address: u64, 249 pub vgic_interrupt: u32, 250 pub gicr_base_address: u64, 251 pub mpidr: u64, 252 pub proc_power_effi_class: u8, 253 pub reserved1: u8, 254 pub spe_overflow_interrupt: u16, 255 } 256 257 #[cfg(target_arch = "aarch64")] 258 #[allow(dead_code)] 259 #[repr(C, packed)] 260 #[derive(IntoBytes, Immutable, FromBytes)] 261 struct GicD { 262 pub r#type: u8, 263 pub length: u8, 264 pub reserved0: u16, 265 pub gic_id: u32, 266 pub base_address: u64, 267 pub global_irq_base: u32, 268 pub version: u8, 269 pub reserved1: [u8; 3], 270 } 271 272 #[cfg(target_arch = "aarch64")] 273 #[allow(dead_code)] 274 #[repr(C, packed)] 275 #[derive(IntoBytes, Immutable, FromBytes)] 276 struct GicR { 277 pub r#type: u8, 278 pub length: u8, 279 pub reserved: u16, 280 pub base_address: u64, 281 pub range_length: u32, 282 } 283 284 #[cfg(target_arch = "aarch64")] 285 #[allow(dead_code)] 286 #[repr(C, packed)] 287 #[derive(IntoBytes, Immutable, FromBytes)] 288 struct GicIts { 289 pub r#type: u8, 290 pub length: u8, 291 pub reserved0: u16, 292 pub translation_id: u32, 293 pub base_address: u64, 294 pub reserved1: u32, 295 } 296 297 #[cfg(target_arch = "aarch64")] 298 #[allow(dead_code)] 299 #[repr(C, packed)] 300 #[derive(IntoBytes, Immutable, FromBytes)] 301 struct ProcessorHierarchyNode { 302 pub r#type: u8, 303 pub length: u8, 304 pub reserved: u16, 305 pub flags: u32, 306 pub parent: u32, 307 pub acpi_processor_id: u32, 308 pub num_private_resources: u32, 309 } 310 311 #[allow(dead_code)] 312 #[repr(C, packed)] 313 #[derive(Default, IntoBytes, Immutable, FromBytes)] 314 struct InterruptSourceOverride { 315 pub r#type: u8, 316 pub length: u8, 317 pub bus: u8, 318 pub source: u8, 319 pub gsi: u32, 320 pub flags: u16, 321 } 322 323 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 324 macro_rules! round_up { 325 ($n:expr,$d:expr) => { 326 (($n / ($d + 1)) + 1) * $d 327 }; 328 } 329 330 /// A wrapper around creating and using a kvm-based VCPU. 
331 pub struct Vcpu { 332 // The hypervisor abstracted CPU. 333 vcpu: Arc<dyn hypervisor::Vcpu>, 334 id: u8, 335 #[cfg(target_arch = "aarch64")] 336 mpidr: u64, 337 saved_state: Option<CpuState>, 338 #[cfg(target_arch = "x86_64")] 339 vendor: CpuVendor, 340 } 341 342 impl Vcpu { 343 /// Constructs a new VCPU for `vm`. 344 /// 345 /// # Arguments 346 /// 347 /// * `id` - Represents the CPU number between [0, max vcpus). 348 /// * `vm` - The virtual machine this vcpu will get attached to. 349 /// * `vm_ops` - Optional object for exit handling. 350 /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0) 351 pub fn new( 352 id: u8, 353 apic_id: u8, 354 vm: &Arc<dyn hypervisor::Vm>, 355 vm_ops: Option<Arc<dyn VmOps>>, 356 #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor, 357 ) -> Result<Self> { 358 let vcpu = vm 359 .create_vcpu(apic_id, vm_ops) 360 .map_err(|e| Error::VcpuCreate(e.into()))?; 361 // Initially the cpuid per vCPU is the one supported by this VM. 362 Ok(Vcpu { 363 vcpu, 364 id, 365 #[cfg(target_arch = "aarch64")] 366 mpidr: 0, 367 saved_state: None, 368 #[cfg(target_arch = "x86_64")] 369 vendor: cpu_vendor, 370 }) 371 } 372 373 /// Configures a vcpu and should be called once per vcpu when created. 374 /// 375 /// # Arguments 376 /// 377 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 378 /// * `guest_memory` - Guest memory. 379 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 380 pub fn configure( 381 &mut self, 382 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 383 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 384 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 385 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 386 #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>, 387 ) -> Result<()> { 388 #[cfg(target_arch = "aarch64")] 389 { 390 self.init(vm)?; 391 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup) 392 .map_err(Error::VcpuConfiguration)?; 393 } 394 #[cfg(target_arch = "riscv64")] 395 arch::configure_vcpu(&self.vcpu, self.id, boot_setup).map_err(Error::VcpuConfiguration)?; 396 info!("Configuring vCPU: cpu_id = {}", self.id); 397 #[cfg(target_arch = "x86_64")] 398 arch::configure_vcpu( 399 &self.vcpu, 400 self.id, 401 boot_setup, 402 cpuid, 403 kvm_hyperv, 404 self.vendor, 405 topology, 406 ) 407 .map_err(Error::VcpuConfiguration)?; 408 409 Ok(()) 410 } 411 412 /// Gets the MPIDR register value. 413 #[cfg(target_arch = "aarch64")] 414 pub fn get_mpidr(&self) -> u64 { 415 self.mpidr 416 } 417 418 /// Gets the saved vCPU state. 419 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] 420 pub fn get_saved_state(&self) -> Option<CpuState> { 421 self.saved_state.clone() 422 } 423 424 /// Initializes an aarch64 specific vcpu for booting Linux. 425 #[cfg(target_arch = "aarch64")] 426 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 427 use std::arch::is_aarch64_feature_detected; 428 #[allow(clippy::nonminimal_bool)] 429 let sve_supported = 430 is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2"); 431 let mut kvi = self.vcpu.create_vcpu_init(); 432 433 // This reads back the kernel's preferred target type. 
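        // (on KVM this is backed by the KVM_ARM_PREFERRED_TARGET ioctl), so the
        // vcpu_init() call below initialises the vCPU as the CPU type the host
        // can best emulate.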
434 vm.get_preferred_target(&mut kvi) 435 .map_err(Error::VcpuArmPreferredTarget)?; 436 437 self.vcpu 438 .vcpu_set_processor_features(vm, &mut kvi, self.id) 439 .map_err(Error::VcpuSetProcessorFeatures)?; 440 441 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)?; 442 443 if sve_supported { 444 let finalized_features = self.vcpu.vcpu_get_finalized_features(); 445 self.vcpu 446 .vcpu_finalize(finalized_features) 447 .map_err(Error::VcpuArmFinalize)?; 448 } 449 Ok(()) 450 } 451 452 /// Runs the VCPU until it exits, returning the reason. 453 /// 454 /// Note that the state of the VCPU and associated VM must be setup first for this to do 455 /// anything useful. 456 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 457 self.vcpu.run() 458 } 459 460 #[cfg(feature = "sev_snp")] 461 pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> { 462 self.vcpu 463 .set_sev_control_register(vmsa_pfn) 464 .map_err(Error::SetSevControlRegister) 465 } 466 } 467 468 impl Pausable for Vcpu {} 469 impl Snapshottable for Vcpu { 470 fn id(&self) -> String { 471 self.id.to_string() 472 } 473 474 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 475 let saved_state = self 476 .vcpu 477 .state() 478 .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?; 479 480 self.saved_state = Some(saved_state.clone()); 481 482 Ok(Snapshot::from_data(SnapshotData::new_from_state( 483 &saved_state, 484 )?)) 485 } 486 } 487 488 pub struct CpuManager { 489 config: CpusConfig, 490 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 491 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 492 #[cfg(target_arch = "x86_64")] 493 cpuid: Vec<CpuIdEntry>, 494 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 495 vm: Arc<dyn hypervisor::Vm>, 496 vcpus_kill_signalled: Arc<AtomicBool>, 497 vcpus_pause_signalled: Arc<AtomicBool>, 498 vcpus_kick_signalled: Arc<AtomicBool>, 499 exit_evt: EventFd, 500 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 501 reset_evt: EventFd, 502 #[cfg(feature = "guest_debug")] 503 vm_debug_evt: EventFd, 504 vcpu_states: Vec<VcpuState>, 505 selected_cpu: u8, 506 vcpus: Vec<Arc<Mutex<Vcpu>>>, 507 seccomp_action: SeccompAction, 508 vm_ops: Arc<dyn VmOps>, 509 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 510 acpi_address: Option<GuestAddress>, 511 proximity_domain_per_cpu: BTreeMap<u8, u32>, 512 affinity: BTreeMap<u8, Vec<usize>>, 513 dynamic: bool, 514 hypervisor: Arc<dyn hypervisor::Hypervisor>, 515 #[cfg(feature = "sev_snp")] 516 sev_snp_enabled: bool, 517 } 518 519 const CPU_ENABLE_FLAG: usize = 0; 520 const CPU_INSERTING_FLAG: usize = 1; 521 const CPU_REMOVING_FLAG: usize = 2; 522 const CPU_EJECT_FLAG: usize = 3; 523 524 const CPU_STATUS_OFFSET: u64 = 4; 525 const CPU_SELECTION_OFFSET: u64 = 0; 526 527 impl BusDevice for CpuManager { 528 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 529 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
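        // Clear the buffer first: the match below only ORs status bits into
        // data[0], so any stale bytes handed to us would otherwise leak
        // through to the guest.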
530 data.fill(0); 531 532 match offset { 533 CPU_SELECTION_OFFSET => { 534 data[0] = self.selected_cpu; 535 } 536 CPU_STATUS_OFFSET => { 537 if self.selected_cpu < self.max_vcpus() { 538 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 539 if state.active() { 540 data[0] |= 1 << CPU_ENABLE_FLAG; 541 } 542 if state.inserting { 543 data[0] |= 1 << CPU_INSERTING_FLAG; 544 } 545 if state.removing { 546 data[0] |= 1 << CPU_REMOVING_FLAG; 547 } 548 } else { 549 warn!("Out of range vCPU id: {}", self.selected_cpu); 550 } 551 } 552 _ => { 553 warn!( 554 "Unexpected offset for accessing CPU manager device: {:#}", 555 offset 556 ); 557 } 558 } 559 } 560 561 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 562 match offset { 563 CPU_SELECTION_OFFSET => { 564 self.selected_cpu = data[0]; 565 } 566 CPU_STATUS_OFFSET => { 567 if self.selected_cpu < self.max_vcpus() { 568 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 569 // The ACPI code writes back a 1 to acknowledge the insertion 570 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 571 && state.inserting 572 { 573 state.inserting = false; 574 } 575 // Ditto for removal 576 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 577 && state.removing 578 { 579 state.removing = false; 580 } 581 // Trigger removal of vCPU 582 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 583 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 584 error!("Error removing vCPU: {:?}", e); 585 } 586 } 587 } else { 588 warn!("Out of range vCPU id: {}", self.selected_cpu); 589 } 590 } 591 _ => { 592 warn!( 593 "Unexpected offset for accessing CPU manager device: {:#}", 594 offset 595 ); 596 } 597 } 598 None 599 } 600 } 601 602 #[derive(Default)] 603 struct VcpuState { 604 inserting: bool, 605 removing: bool, 606 pending_removal: Arc<AtomicBool>, 607 handle: Option<thread::JoinHandle<()>>, 608 kill: Arc<AtomicBool>, 609 vcpu_run_interrupted: Arc<AtomicBool>, 610 paused: Arc<AtomicBool>, 611 } 612 613 impl VcpuState { 614 fn active(&self) -> bool { 615 self.handle.is_some() 616 } 617 618 fn signal_thread(&self) { 619 if let Some(handle) = self.handle.as_ref() { 620 loop { 621 // SAFETY: FFI call with correct arguments 622 unsafe { 623 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 624 } 625 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 626 break; 627 } else { 628 // This is more effective than thread::yield_now() at 629 // avoiding a priority inversion with the vCPU thread 630 thread::sleep(std::time::Duration::from_millis(1)); 631 } 632 } 633 } 634 } 635 636 fn join_thread(&mut self) -> Result<()> { 637 if let Some(handle) = self.handle.take() { 638 handle.join().map_err(Error::ThreadCleanup)? 
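            // join() only fails if the vCPU thread panicked; the panic payload
            // is surfaced to the caller as Error::ThreadCleanup.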
639 } 640 641 Ok(()) 642 } 643 644 fn unpark_thread(&self) { 645 if let Some(handle) = self.handle.as_ref() { 646 handle.thread().unpark() 647 } 648 } 649 } 650 651 impl CpuManager { 652 #[allow(unused_variables)] 653 #[allow(clippy::too_many_arguments)] 654 pub fn new( 655 config: &CpusConfig, 656 vm: Arc<dyn hypervisor::Vm>, 657 exit_evt: EventFd, 658 reset_evt: EventFd, 659 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 660 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 661 seccomp_action: SeccompAction, 662 vm_ops: Arc<dyn VmOps>, 663 #[cfg(feature = "tdx")] tdx_enabled: bool, 664 numa_nodes: &NumaNodes, 665 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 666 ) -> Result<Arc<Mutex<CpuManager>>> { 667 if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() { 668 return Err(Error::MaximumVcpusExceeded); 669 } 670 671 let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); 672 vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); 673 let hypervisor_type = hypervisor.hypervisor_type(); 674 #[cfg(target_arch = "x86_64")] 675 let cpu_vendor = hypervisor.get_cpu_vendor(); 676 677 #[cfg(target_arch = "x86_64")] 678 if config.features.amx { 679 const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; 680 const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; 681 const XFEATURE_XTILEDATA: usize = 18; 682 const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; 683 684 // SAFETY: the syscall is only modifying kernel internal 685 // data structures that the kernel is itself expected to safeguard. 686 let amx_tile = unsafe { 687 libc::syscall( 688 libc::SYS_arch_prctl, 689 ARCH_REQ_XCOMP_GUEST_PERM, 690 XFEATURE_XTILEDATA, 691 ) 692 }; 693 694 if amx_tile != 0 { 695 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 696 } else { 697 let mask: usize = 0; 698 // SAFETY: the mask being modified (not marked mutable as it is 699 // modified in unsafe only which is permitted) isn't in use elsewhere. 
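            // ARCH_GET_XCOMP_GUEST_PERM writes the permitted xfeature bitmask
            // into `mask`; the check below verifies that the XTILEDATA bit
            // requested above was actually granted.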
700 let result = unsafe { 701 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 702 }; 703 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 704 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 705 } 706 } 707 } 708 709 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 710 let mut cpu_list = Vec::new(); 711 for (proximity_domain, numa_node) in numa_nodes.iter() { 712 for cpu in numa_node.cpus.iter() { 713 cpu_list.push((*cpu, *proximity_domain)) 714 } 715 } 716 cpu_list 717 } 718 .into_iter() 719 .collect(); 720 721 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 722 cpu_affinity 723 .iter() 724 .map(|a| (a.vcpu, a.host_cpus.clone())) 725 .collect() 726 } else { 727 BTreeMap::new() 728 }; 729 730 #[cfg(feature = "tdx")] 731 let dynamic = !tdx_enabled; 732 #[cfg(not(feature = "tdx"))] 733 let dynamic = true; 734 735 Ok(Arc::new(Mutex::new(CpuManager { 736 config: config.clone(), 737 interrupt_controller: None, 738 #[cfg(target_arch = "x86_64")] 739 cpuid: Vec::new(), 740 vm, 741 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 742 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 743 vcpus_kick_signalled: Arc::new(AtomicBool::new(false)), 744 vcpu_states, 745 exit_evt, 746 reset_evt, 747 #[cfg(feature = "guest_debug")] 748 vm_debug_evt, 749 selected_cpu: 0, 750 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 751 seccomp_action, 752 vm_ops, 753 acpi_address: None, 754 proximity_domain_per_cpu, 755 affinity, 756 dynamic, 757 hypervisor: hypervisor.clone(), 758 #[cfg(feature = "sev_snp")] 759 sev_snp_enabled, 760 }))) 761 } 762 763 #[cfg(target_arch = "x86_64")] 764 pub fn populate_cpuid( 765 &mut self, 766 memory_manager: &Arc<Mutex<MemoryManager>>, 767 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 768 #[cfg(feature = "tdx")] tdx: bool, 769 ) -> Result<()> { 770 let sgx_epc_sections = memory_manager 771 .lock() 772 .unwrap() 773 .sgx_epc_region() 774 .as_ref() 775 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 776 777 self.cpuid = { 778 let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits); 779 arch::generate_common_cpuid( 780 hypervisor, 781 &arch::CpuidConfig { 782 sgx_epc_sections, 783 phys_bits, 784 kvm_hyperv: self.config.kvm_hyperv, 785 #[cfg(feature = "tdx")] 786 tdx, 787 amx: self.config.features.amx, 788 }, 789 ) 790 .map_err(Error::CommonCpuId)? 791 }; 792 793 Ok(()) 794 } 795 796 fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> { 797 info!("Creating vCPU: cpu_id = {}", cpu_id); 798 799 #[cfg(target_arch = "x86_64")] 800 let topology = self.get_vcpu_topology(); 801 #[cfg(target_arch = "x86_64")] 802 let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology); 803 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] 804 let x2apic_id = cpu_id as u32; 805 806 let mut vcpu = Vcpu::new( 807 cpu_id, 808 x2apic_id as u8, 809 &self.vm, 810 Some(self.vm_ops.clone()), 811 #[cfg(target_arch = "x86_64")] 812 self.hypervisor.get_cpu_vendor(), 813 )?; 814 815 if let Some(snapshot) = snapshot { 816 // AArch64 vCPUs should be initialized after created. 
817 #[cfg(target_arch = "aarch64")] 818 vcpu.init(&self.vm)?; 819 820 let state: CpuState = snapshot.to_state().map_err(|e| { 821 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e)) 822 })?; 823 vcpu.vcpu 824 .set_state(&state) 825 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?; 826 827 vcpu.saved_state = Some(state); 828 } 829 830 let vcpu = Arc::new(Mutex::new(vcpu)); 831 832 // Adding vCPU to the CpuManager's vCPU list. 833 self.vcpus.push(vcpu.clone()); 834 835 Ok(vcpu) 836 } 837 838 pub fn configure_vcpu( 839 &self, 840 vcpu: Arc<Mutex<Vcpu>>, 841 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 842 ) -> Result<()> { 843 let mut vcpu = vcpu.lock().unwrap(); 844 845 #[cfg(feature = "sev_snp")] 846 if self.sev_snp_enabled { 847 if let Some((kernel_entry_point, _)) = boot_setup { 848 vcpu.set_sev_control_register( 849 kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE, 850 )?; 851 } 852 853 // Traditional way to configure vcpu doesn't work for SEV-SNP guests. 854 // All the vCPU configuration for SEV-SNP guest is provided via VMSA. 855 return Ok(()); 856 } 857 858 #[cfg(target_arch = "x86_64")] 859 assert!(!self.cpuid.is_empty()); 860 861 #[cfg(target_arch = "x86_64")] 862 let topology = self.config.topology.clone().map_or_else( 863 || Some((1, self.boot_vcpus(), 1)), 864 |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)), 865 ); 866 #[cfg(target_arch = "x86_64")] 867 vcpu.configure( 868 boot_setup, 869 self.cpuid.clone(), 870 self.config.kvm_hyperv, 871 topology, 872 )?; 873 874 #[cfg(target_arch = "aarch64")] 875 vcpu.configure(&self.vm, boot_setup)?; 876 877 #[cfg(target_arch = "riscv64")] 878 vcpu.configure(boot_setup)?; 879 880 Ok(()) 881 } 882 883 /// Only create new vCPUs if there aren't any inactive ones to reuse 884 fn create_vcpus( 885 &mut self, 886 desired_vcpus: u8, 887 snapshot: Option<Snapshot>, 888 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 889 let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![]; 890 info!( 891 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 892 desired_vcpus, 893 self.config.max_vcpus, 894 self.vcpus.len(), 895 self.present_vcpus() 896 ); 897 898 if desired_vcpus > self.config.max_vcpus { 899 return Err(Error::DesiredVCpuCountExceedsMax); 900 } 901 902 // Only create vCPUs in excess of all the allocated vCPUs. 903 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 904 vcpus.push(self.create_vcpu( 905 cpu_id, 906 // TODO: The special format of the CPU id can be removed once 907 // ready to break live upgrade. 908 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()), 909 )?); 910 } 911 912 Ok(vcpus) 913 } 914 915 #[cfg(target_arch = "aarch64")] 916 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 917 for cpu in self.vcpus.iter() { 918 let cpu = cpu.lock().unwrap(); 919 // Check if PMU attr is available, if not, log the information. 
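            // Returning Ok(false) as soon as one vCPU lacks PMU support lets
            // the caller skip PMU setup for the whole VM instead of leaving it
            // partially initialised.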
920 if cpu.vcpu.has_pmu_support() { 921 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 922 } else { 923 debug!( 924 "PMU attribute is not supported in vCPU{}, skip PMU init!", 925 cpu.id 926 ); 927 return Ok(false); 928 } 929 } 930 931 Ok(true) 932 } 933 934 pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> { 935 self.vcpus.clone() 936 } 937 938 fn start_vcpu( 939 &mut self, 940 vcpu: Arc<Mutex<Vcpu>>, 941 vcpu_id: u8, 942 vcpu_thread_barrier: Arc<Barrier>, 943 inserting: bool, 944 ) -> Result<()> { 945 let reset_evt = self.reset_evt.try_clone().unwrap(); 946 let exit_evt = self.exit_evt.try_clone().unwrap(); 947 #[cfg(feature = "kvm")] 948 let hypervisor_type = self.hypervisor.hypervisor_type(); 949 #[cfg(feature = "guest_debug")] 950 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 951 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 952 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 953 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 954 let vcpu_kick_signalled = self.vcpus_kick_signalled.clone(); 955 956 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 957 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 958 .vcpu_run_interrupted 959 .clone(); 960 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 961 let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone(); 962 963 // Prepare the CPU set the current vCPU is expected to run onto. 964 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 965 // SAFETY: all zeros is a valid pattern 966 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 967 // SAFETY: FFI call, trivially safe 968 unsafe { libc::CPU_ZERO(&mut cpuset) }; 969 for host_cpu in host_cpus { 970 // SAFETY: FFI call, trivially safe 971 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) }; 972 } 973 cpuset 974 }); 975 976 // Retrieve seccomp filter for vcpu thread 977 let vcpu_seccomp_filter = get_seccomp_filter( 978 &self.seccomp_action, 979 Thread::Vcpu, 980 self.hypervisor.hypervisor_type(), 981 ) 982 .map_err(Error::CreateSeccompFilter)?; 983 984 #[cfg(target_arch = "x86_64")] 985 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 986 987 info!("Starting vCPU: cpu_id = {}", vcpu_id); 988 989 let handle = Some( 990 thread::Builder::new() 991 .name(format!("vcpu{vcpu_id}")) 992 .spawn(move || { 993 // Schedule the thread to run on the expected CPU set 994 if let Some(cpuset) = cpuset.as_ref() { 995 // SAFETY: FFI call with correct arguments 996 let ret = unsafe { 997 libc::sched_setaffinity( 998 0, 999 std::mem::size_of::<libc::cpu_set_t>(), 1000 cpuset as *const libc::cpu_set_t, 1001 ) 1002 }; 1003 1004 if ret != 0 { 1005 error!( 1006 "Failed scheduling the vCPU {} on the expected CPU set: {}", 1007 vcpu_id, 1008 io::Error::last_os_error() 1009 ); 1010 return; 1011 } 1012 } 1013 1014 // Apply seccomp filter for vcpu thread. 1015 if !vcpu_seccomp_filter.is_empty() { 1016 if let Err(e) = 1017 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 1018 { 1019 error!("Error applying seccomp filter: {:?}", e); 1020 return; 1021 } 1022 } 1023 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 1024 // This uses an async signal safe handler to kill the vcpu handles. 1025 register_signal_handler(SIGRTMIN(), handle_signal) 1026 .expect("Failed to register vcpu signal handler"); 1027 // Block until all CPUs are ready. 
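                // The barrier is created in activate_vcpus() with a count of
                // (new vCPU threads + 1), so every thread, plus the caller,
                // lines up here before any guest code runs.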
                vcpu_thread_barrier.wait();

                std::panic::catch_unwind(move || {
                    loop {
                        // If we are being told to pause, we park the thread
                        // until the pause boolean is toggled.
                        // The resume operation is responsible for toggling
                        // the boolean and unpark the thread.
                        // We enter a loop because park() could spuriously
                        // return. We will then park() again unless the
                        // pause boolean has been toggled.

                        // Need to use Ordering::SeqCst as we have multiple
                        // loads and stores to different atomics and we need
                        // to see them in a consistent order in all threads

                        if vcpu_pause_signalled.load(Ordering::SeqCst) {
                            // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
                            // completed by returning to KVM_RUN. From the kernel docs:
                            //
                            // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
                            // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
                            // operations are complete (and guest state is consistent) only after userspace
                            // has re-entered the kernel with KVM_RUN. The kernel side will first finish
                            // incomplete operations and then check for pending signals.
                            // The pending state of the operation is not preserved in state which is
                            // visible to userspace, thus userspace should ensure that the operation is
                            // completed before performing a live migration. Userspace can re-enter the
                            // guest with an unmasked signal pending or with the immediate_exit field set
                            // to complete pending operations without allowing any further instructions
                            // to be executed.

                            #[cfg(feature = "kvm")]
                            if matches!(hypervisor_type, HypervisorType::Kvm) {
                                vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
                                if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
                                    error!("Unexpected VM exit on \"immediate_exit\" run");
                                    break;
                                }
                                vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
                            }

                            vcpu_run_interrupted.store(true, Ordering::SeqCst);

                            vcpu_paused.store(true, Ordering::SeqCst);
                            while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                thread::park();
                            }
                            vcpu_run_interrupted.store(false, Ordering::SeqCst);
                        }

                        if vcpu_kick_signalled.load(Ordering::SeqCst) {
                            vcpu_run_interrupted.store(true, Ordering::SeqCst);
                            #[cfg(target_arch = "x86_64")]
                            match vcpu.lock().as_ref().unwrap().vcpu.nmi() {
                                Ok(()) => {},
                                Err(e) => {
                                    error!("Error injecting NMI: {}", e);
                                    break;
                                }
                            }
                        }

                        // We've been told to terminate
                        if vcpu_kill_signalled.load(Ordering::SeqCst)
                            || vcpu_kill.load(Ordering::SeqCst)
                        {
                            vcpu_run_interrupted.store(true, Ordering::SeqCst);
                            break;
                        }

                        #[cfg(feature = "tdx")]
                        let mut vcpu = vcpu.lock().unwrap();
                        #[cfg(not(feature = "tdx"))]
                        let vcpu = vcpu.lock().unwrap();
                        // A triple fault from vcpu.run() surfaces as a reset exit, so trigger a reset
                        match vcpu.run() {
                            Ok(run) => match run {
                                #[cfg(feature = "kvm")]
                                VmExit::Debug => {
                                    info!("VmExit::Debug");
                                    #[cfg(feature = "guest_debug")]
                                    {
                                        vcpu_pause_signalled.store(true, Ordering::SeqCst);
                                        let raw_tid = get_raw_tid(vcpu_id as usize);
                                        vm_debug_evt.write(raw_tid as u64).unwrap();
                                    }
                                }
                                #[cfg(target_arch = "x86_64")]
                                VmExit::IoapicEoi(vector) => {
                                    if let Some(interrupt_controller) =
                                        &interrupt_controller_clone
1120 { 1121 interrupt_controller 1122 .lock() 1123 .unwrap() 1124 .end_of_interrupt(vector); 1125 } 1126 } 1127 VmExit::Ignore => {} 1128 VmExit::Hyperv => {} 1129 VmExit::Reset => { 1130 info!("VmExit::Reset"); 1131 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1132 reset_evt.write(1).unwrap(); 1133 break; 1134 } 1135 VmExit::Shutdown => { 1136 info!("VmExit::Shutdown"); 1137 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1138 exit_evt.write(1).unwrap(); 1139 break; 1140 } 1141 #[cfg(feature = "tdx")] 1142 VmExit::Tdx => { 1143 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1144 match vcpu.get_tdx_exit_details() { 1145 Ok(details) => match details { 1146 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1147 TdxExitDetails::SetupEventNotifyInterrupt => { 1148 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1149 } 1150 }, 1151 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1152 } 1153 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1154 } else { 1155 // We should never reach this code as 1156 // this means the design from the code 1157 // is wrong. 1158 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1159 } 1160 } 1161 }, 1162 1163 Err(e) => { 1164 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1165 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1166 exit_evt.write(1).unwrap(); 1167 break; 1168 } 1169 } 1170 1171 // We've been told to terminate 1172 if vcpu_kill_signalled.load(Ordering::SeqCst) 1173 || vcpu_kill.load(Ordering::SeqCst) 1174 { 1175 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1176 break; 1177 } 1178 } 1179 }) 1180 .or_else(|_| { 1181 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1182 error!("vCPU thread panicked"); 1183 panic_exit_evt.write(1) 1184 }) 1185 .ok(); 1186 }) 1187 .map_err(Error::VcpuSpawn)?, 1188 ); 1189 1190 // On hot plug calls into this function entry_point is None. It is for 1191 // those hotplug CPU additions that we need to set the inserting flag. 1192 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1193 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1194 1195 Ok(()) 1196 } 1197 1198 /// Start up as many vCPUs threads as needed to reach `desired_vcpus` 1199 fn activate_vcpus( 1200 &mut self, 1201 desired_vcpus: u8, 1202 inserting: bool, 1203 paused: Option<bool>, 1204 ) -> Result<()> { 1205 if desired_vcpus > self.config.max_vcpus { 1206 return Err(Error::DesiredVCpuCountExceedsMax); 1207 } 1208 1209 let vcpu_thread_barrier = Arc::new(Barrier::new( 1210 (desired_vcpus - self.present_vcpus() + 1) as usize, 1211 )); 1212 1213 if let Some(paused) = paused { 1214 self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); 1215 } 1216 1217 info!( 1218 "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", 1219 desired_vcpus, 1220 self.vcpus.len(), 1221 self.present_vcpus(), 1222 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1223 ); 1224 1225 // This reuses any inactive vCPUs as well as any that were newly created 1226 for vcpu_id in self.present_vcpus()..desired_vcpus { 1227 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1228 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1229 } 1230 1231 // Unblock all CPU threads. 
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal, actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
            self.vcpu_states[usize::from(cpu_id)]
                .pending_removal
                .store(true, Ordering::SeqCst);
        }
    }

    pub fn check_pending_removed_vcpu(&mut self) -> bool {
        for state in self.vcpu_states.iter() {
            if state.active() && state.pending_removal.load(Ordering::SeqCst) {
                return true;
            }
        }
        false
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" so that it can be reused
        state.kill.store(false, Ordering::SeqCst);
        state.pending_removal.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(
        &mut self,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        trace_scoped!("create_boot_vcpus");

        self.create_vcpus(self.boot_vcpus(), snapshot)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
    pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
        self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
    }

    pub fn start_restored_vcpus(&mut self) -> Result<()> {
        self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
            .map_err(|e| {
                Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
            })?;

        Ok(())
    }

    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal {
            return Ok(false);
        }

        if !self.dynamic {
            return Ok(false);
        }

        if self.check_pending_removed_vcpu() {
            return Err(Error::VcpuPendingRemovedVcpu);
        }

        match desired_vcpus.cmp(&self.present_vcpus()) {
            cmp::Ordering::Greater => {
                let vcpus = self.create_vcpus(desired_vcpus, None)?;
                for vcpu in vcpus {
                    self.configure_vcpu(vcpu, None)?
                }
                self.activate_vcpus(desired_vcpus, true, None)?;
                Ok(true)
            }
            cmp::Ordering::Less => {
                self.mark_vcpus_for_removal(desired_vcpus);
                Ok(true)
            }
            _ => Ok(false),
        }
    }

    pub fn shutdown(&mut self) -> Result<()> {
        // Tell the vCPUs to stop themselves next time they go through the loop
        self.vcpus_kill_signalled.store(true, Ordering::SeqCst);

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        // Wait for all the threads to finish. This removes the state from the vector.
        for mut state in self.vcpu_states.drain(..)
{ 1344 state.join_thread()?; 1345 } 1346 1347 Ok(()) 1348 } 1349 1350 #[cfg(feature = "tdx")] 1351 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1352 for vcpu in &self.vcpus { 1353 vcpu.lock() 1354 .unwrap() 1355 .vcpu 1356 .tdx_init(hob_address) 1357 .map_err(Error::InitializeTdx)?; 1358 } 1359 Ok(()) 1360 } 1361 1362 pub fn boot_vcpus(&self) -> u8 { 1363 self.config.boot_vcpus 1364 } 1365 1366 pub fn max_vcpus(&self) -> u8 { 1367 self.config.max_vcpus 1368 } 1369 1370 #[cfg(target_arch = "x86_64")] 1371 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1372 assert!(!self.cpuid.is_empty()); 1373 self.cpuid.clone() 1374 } 1375 1376 fn present_vcpus(&self) -> u8 { 1377 self.vcpu_states 1378 .iter() 1379 .fold(0, |acc, state| acc + state.active() as u8) 1380 } 1381 1382 #[cfg(target_arch = "aarch64")] 1383 pub fn get_mpidrs(&self) -> Vec<u64> { 1384 self.vcpus 1385 .iter() 1386 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1387 .collect() 1388 } 1389 1390 #[cfg(target_arch = "aarch64")] 1391 pub fn get_saved_states(&self) -> Vec<CpuState> { 1392 self.vcpus 1393 .iter() 1394 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1395 .collect() 1396 } 1397 1398 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1399 self.config 1400 .topology 1401 .clone() 1402 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1403 } 1404 1405 #[cfg(not(target_arch = "riscv64"))] 1406 pub fn create_madt(&self) -> Sdt { 1407 use crate::acpi; 1408 // This is also checked in the commandline parsing. 1409 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1410 1411 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1412 #[cfg(target_arch = "x86_64")] 1413 { 1414 madt.write(36, arch::layout::APIC_START.0); 1415 1416 for cpu in 0..self.config.max_vcpus { 1417 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology()); 1418 1419 let lapic = LocalX2Apic { 1420 r#type: acpi::ACPI_X2APIC_PROCESSOR, 1421 length: 16, 1422 processor_id: cpu.into(), 1423 apic_id: x2apic_id, 1424 flags: if cpu < self.config.boot_vcpus { 1425 1 << MADT_CPU_ENABLE_FLAG 1426 } else { 1427 0 1428 } | (1 << MADT_CPU_ONLINE_CAPABLE_FLAG), 1429 _reserved: 0, 1430 }; 1431 madt.append(lapic); 1432 } 1433 1434 madt.append(Ioapic { 1435 r#type: acpi::ACPI_APIC_IO, 1436 length: 12, 1437 ioapic_id: 0, 1438 apic_address: arch::layout::IOAPIC_START.0 as u32, 1439 gsi_base: 0, 1440 ..Default::default() 1441 }); 1442 1443 madt.append(InterruptSourceOverride { 1444 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1445 length: 10, 1446 bus: 0, 1447 source: 4, 1448 gsi: 4, 1449 flags: 0, 1450 }); 1451 } 1452 1453 #[cfg(target_arch = "aarch64")] 1454 { 1455 /* Notes: 1456 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1457 */ 1458 1459 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 
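            // One GICC entry is generated per boot vCPU; the MPIDR affinity
            // fields (masked below) let the guest match each GICC entry to its
            // processing element.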
1460 for cpu in 0..self.config.boot_vcpus { 1461 let vcpu = &self.vcpus[cpu as usize]; 1462 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1463 /* ARMv8 MPIDR format: 1464 Bits [63:40] Must be zero 1465 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1466 Bits [31:24] Must be zero 1467 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1468 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1469 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1470 */ 1471 let mpidr_mask = 0xff_00ff_ffff; 1472 let gicc = GicC { 1473 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1474 length: 80, 1475 reserved0: 0, 1476 cpu_interface_number: cpu as u32, 1477 uid: cpu as u32, 1478 flags: 1, 1479 parking_version: 0, 1480 performance_interrupt: 0, 1481 parked_address: 0, 1482 base_address: 0, 1483 gicv_base_address: 0, 1484 gich_base_address: 0, 1485 vgic_interrupt: 0, 1486 gicr_base_address: 0, 1487 mpidr: mpidr & mpidr_mask, 1488 proc_power_effi_class: 0, 1489 reserved1: 0, 1490 spe_overflow_interrupt: 0, 1491 }; 1492 1493 madt.append(gicc); 1494 } 1495 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1496 1497 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1498 let gicd = GicD { 1499 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1500 length: 24, 1501 reserved0: 0, 1502 gic_id: 0, 1503 base_address: vgic_config.dist_addr, 1504 global_irq_base: 0, 1505 version: 3, 1506 reserved1: [0; 3], 1507 }; 1508 madt.append(gicd); 1509 1510 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1511 let gicr = GicR { 1512 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1513 length: 16, 1514 reserved: 0, 1515 base_address: vgic_config.redists_addr, 1516 range_length: vgic_config.redists_size as u32, 1517 }; 1518 madt.append(gicr); 1519 1520 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 1521 let gicits = GicIts { 1522 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1523 length: 20, 1524 reserved0: 0, 1525 translation_id: 0, 1526 base_address: vgic_config.msi_addr, 1527 reserved1: 0, 1528 }; 1529 madt.append(gicits); 1530 1531 madt.update_checksum(); 1532 } 1533 1534 madt 1535 } 1536 1537 #[cfg(target_arch = "aarch64")] 1538 pub fn create_pptt(&self) -> Sdt { 1539 let pptt_start = 0; 1540 let mut cpus = 0; 1541 let mut uid = 0; 1542 // If topology is not specified, the default setting is: 1543 // 1 package, multiple cores, 1 thread per core 1544 // This is also the behavior when PPTT is missing. 
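        // For example, a topology of (threads_per_core = 2, cores_per_package = 2,
        // packages = 1) with at least four boot vCPUs produces one package node,
        // two core nodes and four leaf thread nodes (flags 0xE) in the loop below.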
1545 let (threads_per_core, cores_per_package, packages) = 1546 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1547 1548 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1549 1550 for cluster_idx in 0..packages { 1551 if cpus < self.config.boot_vcpus as usize { 1552 let cluster_offset = pptt.len() - pptt_start; 1553 let cluster_hierarchy_node = ProcessorHierarchyNode { 1554 r#type: 0, 1555 length: 20, 1556 reserved: 0, 1557 flags: 0x2, 1558 parent: 0, 1559 acpi_processor_id: cluster_idx as u32, 1560 num_private_resources: 0, 1561 }; 1562 pptt.append(cluster_hierarchy_node); 1563 1564 for core_idx in 0..cores_per_package { 1565 let core_offset = pptt.len() - pptt_start; 1566 1567 if threads_per_core > 1 { 1568 let core_hierarchy_node = ProcessorHierarchyNode { 1569 r#type: 0, 1570 length: 20, 1571 reserved: 0, 1572 flags: 0x2, 1573 parent: cluster_offset as u32, 1574 acpi_processor_id: core_idx as u32, 1575 num_private_resources: 0, 1576 }; 1577 pptt.append(core_hierarchy_node); 1578 1579 for _thread_idx in 0..threads_per_core { 1580 let thread_hierarchy_node = ProcessorHierarchyNode { 1581 r#type: 0, 1582 length: 20, 1583 reserved: 0, 1584 flags: 0xE, 1585 parent: core_offset as u32, 1586 acpi_processor_id: uid as u32, 1587 num_private_resources: 0, 1588 }; 1589 pptt.append(thread_hierarchy_node); 1590 uid += 1; 1591 } 1592 } else { 1593 let thread_hierarchy_node = ProcessorHierarchyNode { 1594 r#type: 0, 1595 length: 20, 1596 reserved: 0, 1597 flags: 0xA, 1598 parent: cluster_offset as u32, 1599 acpi_processor_id: uid as u32, 1600 num_private_resources: 0, 1601 }; 1602 pptt.append(thread_hierarchy_node); 1603 uid += 1; 1604 } 1605 } 1606 cpus += (cores_per_package * threads_per_core) as usize; 1607 } 1608 } 1609 1610 pptt.update_checksum(); 1611 pptt 1612 } 1613 1614 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1615 fn create_standard_regs(&self, cpu_id: u8) -> StandardRegisters { 1616 self.vcpus[usize::from(cpu_id)] 1617 .lock() 1618 .unwrap() 1619 .vcpu 1620 .create_standard_regs() 1621 } 1622 1623 #[cfg(feature = "guest_debug")] 1624 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1625 self.vcpus[usize::from(cpu_id)] 1626 .lock() 1627 .unwrap() 1628 .vcpu 1629 .get_regs() 1630 .map_err(Error::CpuDebug) 1631 } 1632 1633 #[cfg(feature = "guest_debug")] 1634 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1635 self.vcpus[usize::from(cpu_id)] 1636 .lock() 1637 .unwrap() 1638 .vcpu 1639 .set_regs(regs) 1640 .map_err(Error::CpuDebug) 1641 } 1642 1643 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1644 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1645 self.vcpus[usize::from(cpu_id)] 1646 .lock() 1647 .unwrap() 1648 .vcpu 1649 .get_sregs() 1650 .map_err(Error::CpuDebug) 1651 } 1652 1653 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1654 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1655 self.vcpus[usize::from(cpu_id)] 1656 .lock() 1657 .unwrap() 1658 .vcpu 1659 .set_sregs(sregs) 1660 .map_err(Error::CpuDebug) 1661 } 1662 1663 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1664 fn translate_gva( 1665 &self, 1666 _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1667 cpu_id: u8, 1668 gva: u64, 1669 ) -> Result<u64> { 1670 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1671 .lock() 1672 .unwrap() 1673 .vcpu 1674 .translate_gva(gva, /* flags: unused */ 0) 1675 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 
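        // The call above delegates the walk to the hypervisor (for KVM this is
        // backed by the KVM_TRANSLATE ioctl), so no software page-table walk is
        // needed here, unlike the aarch64 implementation below.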
        Ok(gpa)
    }

    ///
    /// On AArch64, the `translate_gva` API is not provided by KVM. We
    /// implement it in the VMM by walking through the translation tables.
    ///
    /// Address translation is a big topic; here we only focus on the scenario
    /// that happens in the VMM while debugging the kernel. This
    /// `translate_gva` implementation is restricted to:
    /// - Exception Level 1
    /// - Translate high address range only (kernel space)
    ///
    /// This implementation supports the following Armv8-A features related to
    /// address translation:
    /// - FEAT_LPA
    /// - FEAT_LVA
    /// - FEAT_LPA2
    ///
    #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
    fn translate_gva(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: u8,
        gva: u64,
    ) -> Result<u64> {
        let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TCR_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TTBR1_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::ID_AA64MMFR0_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;

        // Bit 55 of the VA determines the range, high (0xFFFxxx...)
        // or low (0x000xxx...).
        let high_range = extract_bits_64!(gva, 55, 1);
        if high_range == 0 {
            info!("VA (0x{:x}) range is not supported!", gva);
            return Ok(gva);
        }

        // High range size offset
        let tsz = extract_bits_64!(tcr_el1, 16, 6);
        // Granule size
        let tg = extract_bits_64!(tcr_el1, 30, 2);
        // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
        let ds = extract_bits_64!(tcr_el1, 59, 1);

        if tsz == 0 {
            info!("VA translation is not ready!");
            return Ok(gva);
        }

        // VA size is determined by TCR_EL1.T1SZ
        let va_size = 64 - tsz;
        // Number of bits in VA consumed in each level of translation
        let stride = match tg {
            3 => 13, // 64KB granule size
            1 => 11, // 16KB granule size
            _ => 9,  // 4KB, default
        };
        // Starting level of walking
        let mut level = 4 - (va_size - 4) / stride;

        // PA or IPA size is determined
        let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
        let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
        // The IPA size in TCR_EL1 and PA Range in ID_AA64MMFR0_EL1 should match.
        // To be safe, we use the minimum value if they are different.
        let pa_range = std::cmp::min(tcr_ips, pa_range);
        // PA size in bits
        let pa_size = match pa_range {
            0 => 32,
            1 => 36,
            2 => 40,
            3 => 42,
            4 => 44,
            5 => 48,
            6 => 52,
            _ => {
                return Err(Error::TranslateVirtualAddress(anyhow!(format!(
                    "PA range not supported {pa_range}"
                ))))
            }
        };

        let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
        let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
        // If FEAT_LPA2 is present, the translation table descriptor holds
        // 50 bits of the table address of next level.
        // Otherwise, it is 48 bits.
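        // Worked example: with a 4KB granule (stride = 9) and T1SZ = 16 the
        // virtual address space is 48 bits, so the walk starts at level
        // 4 - (48 - 4) / 9 = 0 and consumes VA bits [47:39], [38:30], [29:21]
        // and [20:12] on the way down.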
1779 let descaddrmask = if ds == 1 { 1780 !0u64 >> (64 - 50) // mask with 50 least significant bits 1781 } else { 1782 !0u64 >> (64 - 48) // mask with 48 least significant bits 1783 }; 1784 let descaddrmask = descaddrmask & !indexmask_grainsize; 1785 1786 // Translation table base address 1787 let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48); 1788 // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table 1789 // address bits [48:51] comes from TTBR1_EL1 bits [2:5]. 1790 if pa_size == 52 { 1791 descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48; 1792 } 1793 1794 // Loop through tables of each level 1795 loop { 1796 // Table offset for current level 1797 let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask; 1798 descaddr |= table_offset; 1799 descaddr &= !7u64; 1800 1801 let mut buf = [0; 8]; 1802 guest_memory 1803 .memory() 1804 .read(&mut buf, GuestAddress(descaddr)) 1805 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1806 let descriptor = u64::from_le_bytes(buf); 1807 1808 descaddr = descriptor & descaddrmask; 1809 // In the case of FEAT_LPA, the next-level translation table address 1810 // bits [48:51] comes from bits [12:15] of the current descriptor. 1811 // For FEAT_LPA2, the next-level translation table address 1812 // bits [50:51] comes from bits [8:9] of the current descriptor, 1813 // bits [48:49] comes from bits [48:49] of the descriptor which was 1814 // handled previously. 1815 if pa_size == 52 { 1816 if ds == 1 { 1817 // FEAT_LPA2 1818 descaddr |= extract_bits_64!(descriptor, 8, 2) << 50; 1819 } else { 1820 // FEAT_LPA 1821 descaddr |= extract_bits_64!(descriptor, 12, 4) << 48; 1822 } 1823 } 1824 1825 if (descriptor & 2) != 0 && (level < 3) { 1826 // This is a table entry. Go down to next level. 
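                // (Bit 1 of a valid descriptor distinguishes a table/page entry
                // (1) from a block entry (0); at level 3 a set bit 1 means a
                // page descriptor, which is why the walk also stops there via
                // the `level < 3` check above.)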
                level += 1;
                indexmask = indexmask_grainsize;
                continue;
            }

            break;
        }

        // We have reached either:
        // - a page entry at level 3 or
        // - a block entry at level 1 or 2
        let page_size = 1u64 << ((stride * (4 - level)) + 3);
        descaddr &= !(page_size - 1);
        descaddr |= gva & (page_size - 1);

        Ok(descaddr)
    }

    pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
        self.acpi_address = Some(acpi_address);
    }

    pub(crate) fn set_interrupt_controller(
        &mut self,
        interrupt_controller: Arc<Mutex<dyn InterruptController>>,
    ) {
        self.interrupt_controller = Some(interrupt_controller);
    }

    pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
        &self.vcpus_kill_signalled
    }

    #[cfg(feature = "igvm")]
    pub(crate) fn get_cpuid_leaf(
        &self,
        cpu_id: u8,
        eax: u32,
        ecx: u32,
        xfem: u64,
        xss: u64,
    ) -> Result<[u32; 4]> {
        let leaf_info = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_cpuid_values(eax, ecx, xfem, xss)
            .unwrap();
        Ok(leaf_info)
    }

    #[cfg(feature = "sev_snp")]
    pub(crate) fn sev_snp_enabled(&self) -> bool {
        self.sev_snp_enabled
    }

    pub(crate) fn nmi(&self) -> Result<()> {
        self.vcpus_kick_signalled.store(true, Ordering::SeqCst);

        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        self.vcpus_kick_signalled.store(false, Ordering::SeqCst);

        Ok(())
    }
}

struct Cpu {
    cpu_id: u8,
    proximity_domain: u32,
    dynamic: bool,
    #[cfg(target_arch = "x86_64")]
    topology: Option<(u8, u8, u8)>,
}

#[cfg(target_arch = "x86_64")]
const MADT_CPU_ENABLE_FLAG: usize = 0;

#[cfg(target_arch = "x86_64")]
const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;

impl Cpu {
    #[cfg(target_arch = "x86_64")]
    fn generate_mat(&self) -> Vec<u8> {
        let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);

        let lapic = LocalX2Apic {
            r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
            length: 16,
            processor_id: self.cpu_id.into(),
            apic_id: x2apic_id,
            flags: 1 << MADT_CPU_ENABLE_FLAG,
            _reserved: 0,
        };

        let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
        // SAFETY: mat_data is large enough to hold lapic
        unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };

        mat_data
    }
}

impl Aml for Cpu {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        #[cfg(target_arch = "x86_64")]
        let mat_data: Vec<u8> = self.generate_mat();
        #[allow(clippy::if_same_then_else)]
        if self.dynamic {
            aml::Device::new(
                format!("C{:03X}", self.cpu_id).as_str().into(),
                vec![
                    &aml::Name::new("_HID".into(), &"ACPI0007"),
                    &aml::Name::new("_UID".into(), &self.cpu_id),
                    // Currently, AArch64 does not support the following fields.
                    /*
                    _STA return value:
                    Bit [0] – Set if the device is present.
                    Bit [1] – Set if the device is enabled and decoding its resources.
                    Bit [2] – Set if the device should be shown in the UI.
                    Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
                    Bit [4] – Set if the battery is present.
                    Bits [31:5] – Reserved (must be cleared).
                    */
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_STA".into(),
                        0,
                        false,
                        // Call into CSTA method which will interrogate device
                        vec![&aml::Return::new(&aml::MethodCall::new(
                            "CSTA".into(),
                            vec![&self.cpu_id],
                        ))],
                    ),
                    &aml::Method::new(
                        "_PXM".into(),
                        0,
                        false,
                        vec![&aml::Return::new(&self.proximity_domain)],
                    ),
                    // The Linux kernel expects every CPU device to have a _MAT entry
                    // containing the LAPIC for this processor with the enabled bit set
                    // even if it is disabled in the MADT (non-boot CPU)
                    #[cfg(target_arch = "x86_64")]
                    &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
                    // Trigger CPU ejection
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_EJ0".into(),
                        1,
                        false,
                        // Call into CEJ0 method which will actually eject device
                        vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
                    ),
                ],
            )
            .to_aml_bytes(sink);
        } else {
            aml::Device::new(
                format!("C{:03X}", self.cpu_id).as_str().into(),
                vec![
                    &aml::Name::new("_HID".into(), &"ACPI0007"),
                    &aml::Name::new("_UID".into(), &self.cpu_id),
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_STA".into(),
                        0,
                        false,
                        // Mark CPU present, see CSTA implementation
                        vec![&aml::Return::new(&0xfu8)],
                    ),
                    &aml::Method::new(
                        "_PXM".into(),
                        0,
                        false,
                        vec![&aml::Return::new(&self.proximity_domain)],
                    ),
                    // The Linux kernel expects every CPU device to have a _MAT entry
                    // containing the LAPIC for this processor with the enabled bit set
                    // even if it is disabled in the MADT (non-boot CPU)
                    #[cfg(target_arch = "x86_64")]
                    &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
                ],
            )
            .to_aml_bytes(sink);
        }
    }
}

struct CpuNotify {
    cpu_id: u8,
}

impl Aml for CpuNotify {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .to_aml_bytes(sink)
    }
}

struct CpuMethods {
    max_vcpus: u8,
    dynamic: bool,
}

impl Aml for CpuMethods {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        if self.dynamic {
            // CPU status method
            aml::Method::new(
                "CSTA".into(),
                1,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                    &aml::If::new(
                        &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
                        vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                    // Return 0 or 0xf
                    &aml::Return::new(&aml::Local(0)),
                ],
            )
            .to_aml_bytes(sink);

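            // One CpuNotify object is emitted per possible vCPU: each one compares
            // Arg0 against its CPU index and, on a match, issues Notify(Cxxx, Arg1).
            // Concatenated, they form the body of the CTFY method used by CSCN below.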
            let mut cpu_notifies = Vec::new();
            for cpu_id in 0..self.max_vcpus {
                cpu_notifies.push(CpuNotify { cpu_id });
            }

            let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
            for cpu_id in 0..self.max_vcpus {
                cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
            }

            aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);

            aml::Method::new(
                "CEJ0".into(),
                1,
                true,
                vec![
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    // Set CEJ0 bit
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(sink);

            aml::Method::new(
                "CSCN".into(),
                0,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    &aml::While::new(
                        &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
                        vec![
                            // Write CPU number (in first argument) to I/O port via field
                            &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
                            // Check if CINS bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
                                // Notify device if it is
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &aml::ONE],
                                    ),
                                    // Reset CINS bit
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CINS"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            // Check if CRMV bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
                                // Notify device if it is (with the eject constant 0x3)
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &3u8],
                                    ),
                                    // Reset CRMV bit
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CRMV"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                        ],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(sink)
        } else {
            aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
        }
    }
}

impl Aml for CpuManager {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        #[cfg(target_arch = "x86_64")]
        if let Some(acpi_address) = self.acpi_address {
            // CPU hotplug controller
            aml::Device::new(
                "_SB_.PRES".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose CPU and then read back status
                    &aml::Mutex::new("CPLK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCacheable::NotCacheable,
                            true,
                            acpi_address.0,
                            acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
                            None,
                        )]),
                    ),
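                    // Register layout within the CPU_MANAGER_ACPI_SIZE (12 byte) window:
                    //   bytes 0-3 : CSEL - selects the vCPU the per-CPU bits below refer to
                    //   byte  4   : CPEN/CINS/CRMV/CEJ0 - enabled/inserting/removing/eject bits
                    //   byte  5   : CCMD - command register
                    //   bytes 8-11: CDAT - command data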
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "PRST".into(),
                        aml::OpRegionSpace::SystemMemory,
                        &(acpi_address.0 as usize),
                        &CPU_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CPEN", 1),
                            aml::FieldEntry::Named(*b"CINS", 1),
                            aml::FieldEntry::Named(*b"CRMV", 1),
                            aml::FieldEntry::Named(*b"CEJ0", 1),
                            aml::FieldEntry::Reserved(4),
                            aml::FieldEntry::Named(*b"CCMD", 8),
                        ],
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"CSEL", 32),
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CDAT", 32),
                        ],
                    ),
                ],
            )
            .to_aml_bytes(sink);
        }

        // CPU devices
        let hid = aml::Name::new("_HID".into(), &"ACPI0010");
        let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
        // Bundle methods together under a common object
        let methods = CpuMethods {
            max_vcpus: self.config.max_vcpus,
            dynamic: self.dynamic,
        };
        let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];

        #[cfg(target_arch = "x86_64")]
        let topology = self.get_vcpu_topology();
        let mut cpu_devices = Vec::new();
        for cpu_id in 0..self.config.max_vcpus {
            let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
            let cpu_device = Cpu {
                cpu_id,
                proximity_domain,
                dynamic: self.dynamic,
                #[cfg(target_arch = "x86_64")]
                topology,
            };

            cpu_devices.push(cpu_device);
        }

        for cpu_device in cpu_devices.iter() {
            cpu_data_inner.push(cpu_device);
        }

        aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
    }
}

impl Pausable for CpuManager {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        // Tell the vCPUs to pause themselves next time they exit
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        // Signal the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl(), allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        for vcpu in self.vcpus.iter() {
            let mut vcpu = vcpu.lock().unwrap();
            vcpu.pause()?;
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            if !self.config.kvm_hyperv {
                vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
                    MigratableError::Pause(anyhow!(
                        "Could not notify guest it has been paused {:?}",
                        e
                    ))
                })?;
            }
        }

        // The vCPU thread will change its paused state before parking. Wait here for each
        // activated vCPU to change its state, to ensure they have all parked.
        for state in self.vcpu_states.iter() {
            if state.active() {
                while !state.paused.load(Ordering::SeqCst) {
                    // To avoid a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }

        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        for vcpu in self.vcpus.iter() {
            vcpu.lock().unwrap().resume()?;
        }

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the vCPU threads.
        // Once unparked, the next thing they will do is check the pause
        // boolean. Since it will be set to false, they will exit their pause loop
        // and go back to VMX root.
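        // Clear each vCPU's `paused` flag before unparking it so the woken thread
        // immediately observes the resumed state.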
        for state in self.vcpu_states.iter() {
            state.paused.store(false, Ordering::SeqCst);
            state.unpark_thread();
        }
        Ok(())
    }
}

impl Snapshottable for CpuManager {
    fn id(&self) -> String {
        CPU_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut cpu_manager_snapshot = Snapshot::default();

        // The CpuManager snapshot is a collection of all vCPU snapshots.
        for vcpu in &self.vcpus {
            let mut vcpu = vcpu.lock().unwrap();
            cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
        }

        Ok(cpu_manager_snapshot)
    }
}

impl Transportable for CpuManager {}
impl Migratable for CpuManager {}

#[cfg(feature = "guest_debug")]
impl Debuggable for CpuManager {
    #[cfg(feature = "kvm")]
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.vcpus[cpu_id]
            .lock()
            .unwrap()
            .vcpu
            .set_guest_debug(addrs, singlestep)
            .map_err(DebuggableError::SetDebug)
    }

    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        Ok(())
    }

    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let regs = [
            gregs.get_rax(),
            gregs.get_rbx(),
            gregs.get_rcx(),
            gregs.get_rdx(),
            gregs.get_rsi(),
            gregs.get_rdi(),
            gregs.get_rbp(),
            gregs.get_rsp(),
            gregs.get_r8(),
            gregs.get_r9(),
            gregs.get_r10(),
            gregs.get_r11(),
            gregs.get_r12(),
            gregs.get_r13(),
            gregs.get_r14(),
            gregs.get_r15(),
        ];

        // GDB exposes 32-bit eflags instead of 64-bit rflags.
        // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
        let eflags = gregs.get_rflags() as u32;
        let rip = gregs.get_rip();

        // Segment registers: CS, SS, DS, ES, FS, GS
        let sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let segments = X86SegmentRegs {
            cs: sregs.cs.selector as u32,
            ss: sregs.ss.selector as u32,
            ds: sregs.ds.selector as u32,
            es: sregs.es.selector as u32,
            fs: sregs.fs.selector as u32,
            gs: sregs.gs.selector as u32,
        };

        // TODO: Add other registers

        Ok(CoreRegs {
            regs,
            eflags,
            rip,
            segments,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "aarch64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        Ok(CoreRegs {
            x: gregs.get_regs(),
            sp: gregs.get_sp(),
            pc: gregs.get_pc(),
            ..Default::default()
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let orig_gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let mut gregs = self.create_standard_regs(cpu_id as u8);
        gregs.set_rax(regs.regs[0]);
        gregs.set_rbx(regs.regs[1]);
        gregs.set_rcx(regs.regs[2]);
        gregs.set_rdx(regs.regs[3]);
        gregs.set_rsi(regs.regs[4]);
        gregs.set_rdi(regs.regs[5]);
        gregs.set_rbp(regs.regs[6]);
        gregs.set_rsp(regs.regs[7]);
        gregs.set_r8(regs.regs[8]);
        gregs.set_r9(regs.regs[9]);
        gregs.set_r10(regs.regs[10]);
        gregs.set_r11(regs.regs[11]);
        gregs.set_r12(regs.regs[12]);
        gregs.set_r13(regs.regs[13]);
        gregs.set_r14(regs.regs[14]);
        gregs.set_r15(regs.regs[15]);
        gregs.set_rip(regs.rip);
        // Update the lower 32 bits of rflags.
        gregs.set_rflags((orig_gregs.get_rflags() & !(u32::MAX as u64)) | (regs.eflags as u64));

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // Since GDB cares only about the selectors, we call get_sregs() first.
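        // Only the visible selector of each segment register is overwritten below;
        // the cached base/limit/attribute fields returned by get_sregs() are preserved.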
        let mut sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        sregs.cs.selector = regs.segments.cs as u16;
        sregs.ss.selector = regs.segments.ss as u16;
        sregs.ds.selector = regs.segments.ds as u16;
        sregs.es.selector = regs.segments.es as u16;
        sregs.fs.selector = regs.segments.fs as u16;
        sregs.gs.selector = regs.segments.gs as u16;

        self.set_sregs(cpu_id as u8, &sregs)
            .map_err(DebuggableError::WriteRegs)?;

        // TODO: Add other registers

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let mut gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;

        gregs.set_regs(regs.x);
        gregs.set_sp(regs.sp);
        gregs.set_pc(regs.pc);

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        Ok(())
    }

    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        let mut buf = vec![0; len];
        let mut total_read = 0_u64;

        while total_read < len as u64 {
            let gaddr = vaddr.0 + total_read;
            let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
            guest_memory
                .memory()
                .read(
                    &mut buf[total_read as usize..total_read as usize + read_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::ReadMem)?;
            total_read += read_len;
        }
        Ok(buf)
    }

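    // write_mem mirrors read_mem: the GVA range is translated one page at a time
    // and every access is clipped so it never crosses a page boundary.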
    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        let mut total_written = 0_u64;

        while total_written < data.len() as u64 {
            let gaddr = vaddr.0 + total_written;
            let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let write_len = std::cmp::min(
                data.len() as u64 - total_written,
                psize - (paddr & (psize - 1)),
            );
            guest_memory
                .memory()
                .write(
                    &data[total_written as usize..total_written as usize + write_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::WriteMem)?;
            total_written += write_len;
        }
        Ok(())
    }

    fn active_vcpus(&self) -> usize {
        self.present_vcpus() as usize
    }
}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for CpuManager {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl CpuElf64Writable for CpuManager {
    fn cpu_write_elf64_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Elf, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<X86_64ElfPrStatus>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: NT_PRSTATUS,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
            buf.resize(pos + 4, 0);
            buf.splice(pos.., "CORE".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
            buf.resize(pos + 32 + 4, 0);
            let pid = vcpu_id as u64;
            let bytes: &[u8] = pid.as_slice();
            buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */

            pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();

            let orig_rax: u64 = 0;
            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.get_r15(),
                gregs.get_r14(),
                gregs.get_r13(),
                gregs.get_r12(),
                gregs.get_rbp(),
                gregs.get_rbx(),
                gregs.get_r11(),
                gregs.get_r10(),
            ];
            let regs2 = [
                gregs.get_r9(),
                gregs.get_r8(),
                gregs.get_rax(),
                gregs.get_rcx(),
                gregs.get_rdx(),
                gregs.get_rsi(),
                gregs.get_rdi(),
                orig_rax,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            debug!(
                "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
                gregs.get_rip(),
                gregs.get_rsp(),
                sregs.gs.base,
                sregs.cs.selector,
                sregs.ss.selector,
                sregs.ds.selector,
            );

            let regs = X86_64UserRegs {
                regs1,
                regs2,
                rip: gregs.get_rip(),
                cs: sregs.cs.selector as u64,
                eflags: gregs.get_rflags(),
                rsp: gregs.get_rsp(),
                ss: sregs.ss.selector as u64,
                fs_base: sregs.fs.base,
                gs_base: sregs.gs.base,
                ds: sregs.ds.selector as u64,
                es: sregs.es.selector as u64,
                fs: sregs.fs.selector as u64,
                gs: sregs.gs.selector as u64,
            };

            // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) };
            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
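            // Copy the serialized registers into the note buffer at the pr_reg offset
            // computed above, then pad the note back to its full size.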
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }

    fn cpu_write_vmm_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Vmm, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<DumpCpusState>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: 0,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);

            buf.resize(pos + 4, 0);
            buf.splice(pos.., "QEMU".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);

            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.get_rax(),
                gregs.get_rbx(),
                gregs.get_rcx(),
                gregs.get_rdx(),
                gregs.get_rsi(),
                gregs.get_rdi(),
                gregs.get_rsp(),
                gregs.get_rbp(),
            ];

            let regs2 = [
                gregs.get_r8(),
                gregs.get_r9(),
                gregs.get_r10(),
                gregs.get_r11(),
                gregs.get_r12(),
                gregs.get_r13(),
                gregs.get_r14(),
                gregs.get_r15(),
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            let mut msrs = vec![MsrEntry {
                index: msr_index::MSR_KERNEL_GS_BASE,
                ..Default::default()
            }];

            self.vcpus[vcpu_id as usize]
                .lock()
                .unwrap()
                .vcpu
                .get_msrs(&mut msrs)
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
            let kernel_gs_base = msrs[0].data;

            let cs = CpuSegment::new(sregs.cs);
            let ds = CpuSegment::new(sregs.ds);
            let es = CpuSegment::new(sregs.es);
            let fs = CpuSegment::new(sregs.fs);
            let gs = CpuSegment::new(sregs.gs);
            let ss = CpuSegment::new(sregs.ss);
            let ldt = CpuSegment::new(sregs.ldt);
            let tr = CpuSegment::new(sregs.tr);
            let gdt = CpuSegment::new_from_table(sregs.gdt);
            let idt = CpuSegment::new_from_table(sregs.idt);
            let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
            let regs = DumpCpusState {
                version: 1,
                size: size_of::<DumpCpusState>() as u32,
                regs1,
                regs2,
                rip: gregs.get_rip(),
                rflags: gregs.get_rflags(),
                cs,
                ds,
                es,
                fs,
                gs,
                ss,
                ldt,
                tr,
                gdt,
                idt,
                cr,
                kernel_gs_base,
            };

            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }
}

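// These tests exercise the x86_64 vCPU setup helpers against a real hypervisor VM,
// so they can only run on a host where KVM is available.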
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use arch::layout::{BOOT_STACK_POINTER, ZERO_PAGE_START};
    use arch::x86_64::interrupts::*;
    use arch::x86_64::regs::*;
    use hypervisor::arch::x86::{FpuState, LapicState};
    use hypervisor::StandardRegisters;
    use linux_loader::loader::bootparam::setup_header;

    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        hv.check_required_extensions().unwrap();
        // Calling get_lapic will fail if there is no irqchip beforehand.
        vm.create_irq_chip().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
        let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
        let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should
        // remove it at all.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }];

        // get_msrs returns the number of MSRs that it succeeded in reading. We only want to
        // read one in this test case scenario.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we did setup_msrs. We need to assert that the
        // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
        // expect.
        let entry_vec = vcpu.boot_msr_entries();
        assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
    }

    #[test]
    fn test_setup_regs_for_pvh() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
        expected_regs.set_rflags(0x0000000000000002u64);
        expected_regs.set_rbx(arch::layout::PVH_INFO_START.0);
        expected_regs.set_rip(1);

        setup_regs(
            &vcpu,
            arch::EntryPoint {
                entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
                setup_header: None,
            },
        )
        .unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }

    #[test]
    fn test_setup_regs_for_bzimage() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
        expected_regs.set_rflags(0x0000000000000002u64);
        expected_regs.set_rip(1);
        expected_regs.set_rsp(BOOT_STACK_POINTER.0);
        expected_regs.set_rsi(ZERO_PAGE_START.0);

        setup_regs(
            &vcpu,
            arch::EntryPoint {
                entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
                setup_header: Some(setup_header {
                    ..Default::default()
                }),
            },
        )
        .unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    #[cfg(feature = "kvm")]
    use std::mem;

    use arch::aarch64::regs;
    use arch::layout;
    #[cfg(feature = "kvm")]
    use hypervisor::kvm::aarch64::is_system_register;
    #[cfg(feature = "kvm")]
    use hypervisor::kvm::kvm_bindings::{
        user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
    };
    #[cfg(feature = "kvm")]
    use hypervisor::{arm64_core_reg_id, offset_of};

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();

        // Must fail when vcpu is not initialized yet.
        vcpu.setup_regs(0, 0x0, layout::FDT_START.0).unwrap_err();

        let mut kvi = vcpu.create_vcpu_init();
        vm.get_preferred_target(&mut kvi).unwrap();
        vcpu.vcpu_init(&kvi).unwrap();

        vcpu.setup_regs(0, 0x0, layout::FDT_START.0).unwrap();
    }

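    // MPIDR_EL1 for vCPU 0 is expected to read back as 0x8000_0000: bit 31 is RES1
    // and all affinity fields are zero.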
    #[test]
    fn test_read_mpidr() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi = vcpu.create_vcpu_init();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap_err();

        vcpu.vcpu_init(&kvi).unwrap();
        assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
    }

    #[cfg(feature = "kvm")]
    #[test]
    fn test_is_system_register() {
        let offset = offset_of!(user_pt_regs, pc);
        let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
        assert!(!is_system_register(regid));
        let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
        assert!(is_system_register(regid));
    }

    #[test]
    fn test_save_restore_core_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi = vcpu.create_vcpu_init();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        assert_eq!(
            format!("{}", vcpu.get_regs().unwrap_err()),
            "Failed to get aarch64 core register: Exec format error (os error 8)"
        );

        let mut state = vcpu.create_standard_regs();
        assert_eq!(
            format!("{}", vcpu.set_regs(&state).unwrap_err()),
            "Failed to set aarch64 core register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        state = vcpu.get_regs().unwrap();
        assert_eq!(state.get_pstate(), 0x3C5);

        vcpu.set_regs(&state).unwrap();
    }

    #[test]
    fn test_get_set_mpstate() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi = vcpu.create_vcpu_init();
        vm.get_preferred_target(&mut kvi).unwrap();

        let state = vcpu.get_mp_state().unwrap();
        vcpu.set_mp_state(state).unwrap();
    }
}