1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::CpusConfig; 15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 16 use crate::coredump::{ 17 CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable, 18 GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE, 19 NT_PRSTATUS, 20 }; 21 #[cfg(feature = "guest_debug")] 22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError}; 23 #[cfg(target_arch = "x86_64")] 24 use crate::memory_manager::MemoryManager; 25 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 26 #[cfg(target_arch = "x86_64")] 27 use crate::vm::physical_bits; 28 use crate::GuestMemoryMmap; 29 use crate::CPU_MANAGER_SNAPSHOT_ID; 30 use acpi_tables::{aml, sdt::Sdt, Aml}; 31 use anyhow::anyhow; 32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 33 use arch::aarch64::regs; 34 #[cfg(target_arch = "x86_64")] 35 use arch::x86_64::get_x2apic_id; 36 use arch::EntryPoint; 37 use arch::NumaNodes; 38 #[cfg(target_arch = "aarch64")] 39 use devices::gic::Gic; 40 use devices::interrupt_controller::InterruptController; 41 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 42 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 43 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 44 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs}; 45 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 46 use hypervisor::aarch64::StandardRegisters; 47 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 48 use hypervisor::arch::x86::msr_index; 49 #[cfg(target_arch = "x86_64")] 50 use hypervisor::arch::x86::CpuIdEntry; 51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 52 use hypervisor::arch::x86::MsrEntry; 53 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 54 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters}; 55 #[cfg(target_arch = "aarch64")] 56 use hypervisor::kvm::kvm_bindings; 57 #[cfg(all(target_arch = "aarch64", feature = "kvm"))] 58 use hypervisor::kvm::kvm_ioctls::Cap; 59 #[cfg(feature = "tdx")] 60 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus}; 61 #[cfg(target_arch = "x86_64")] 62 use hypervisor::CpuVendor; 63 #[cfg(feature = "kvm")] 64 use hypervisor::HypervisorType; 65 use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps}; 66 use libc::{c_void, siginfo_t}; 67 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 68 use linux_loader::elf::Elf64_Nhdr; 69 use seccompiler::{apply_filter, SeccompAction}; 70 use std::collections::BTreeMap; 71 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 72 use std::io::Write; 73 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 74 use std::mem::size_of; 75 use std::os::unix::thread::JoinHandleExt; 76 use std::sync::atomic::{AtomicBool, Ordering}; 77 use std::sync::{Arc, Barrier, Mutex}; 78 use std::{cmp, io, result, thread}; 79 use thiserror::Error; 80 use tracer::trace_scoped; 81 use vm_device::BusDevice; 82 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 83 use vm_memory::ByteValued; 84 
#[cfg(feature = "guest_debug")] 85 use vm_memory::{Bytes, GuestAddressSpace}; 86 use vm_memory::{GuestAddress, GuestMemoryAtomic}; 87 use vm_migration::{ 88 snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, 89 Transportable, 90 }; 91 use vmm_sys_util::eventfd::EventFd; 92 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN}; 93 use zerocopy::AsBytes; 94 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 95 /// Extract the specified bits of a 64-bit integer. 96 /// For example, to extrace 2 bits from offset 1 (zero based) of `6u64`, 97 /// following expression should return 3 (`0b11`): 98 /// `extract_bits_64!(0b0000_0110u64, 1, 2)` 99 /// 100 macro_rules! extract_bits_64 { 101 ($value: tt, $offset: tt, $length: tt) => { 102 ($value >> $offset) & (!0u64 >> (64 - $length)) 103 }; 104 } 105 106 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 107 macro_rules! extract_bits_64_without_offset { 108 ($value: tt, $length: tt) => { 109 $value & (!0u64 >> (64 - $length)) 110 }; 111 } 112 113 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc; 114 115 #[derive(Debug, Error)] 116 pub enum Error { 117 #[error("Error creating vCPU: {0}")] 118 VcpuCreate(#[source] anyhow::Error), 119 120 #[error("Error running bCPU: {0}")] 121 VcpuRun(#[source] anyhow::Error), 122 123 #[error("Error spawning vCPU thread: {0}")] 124 VcpuSpawn(#[source] io::Error), 125 126 #[error("Error generating common CPUID: {0}")] 127 CommonCpuId(#[source] arch::Error), 128 129 #[error("Error configuring vCPU: {0}")] 130 VcpuConfiguration(#[source] arch::Error), 131 132 #[error("Still pending removed vcpu")] 133 VcpuPendingRemovedVcpu, 134 135 #[cfg(target_arch = "aarch64")] 136 #[error("Error fetching preferred target: {0}")] 137 VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError), 138 139 #[cfg(target_arch = "aarch64")] 140 #[error("Error initialising vCPU: {0}")] 141 VcpuArmInit(#[source] hypervisor::HypervisorCpuError), 142 143 #[error("Failed to join on vCPU threads: {0:?}")] 144 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 145 146 #[error("Error adding CpuManager to MMIO bus: {0}")] 147 BusError(#[source] vm_device::BusError), 148 149 #[error("Requested vCPUs exceed maximum")] 150 DesiredVCpuCountExceedsMax, 151 152 #[error("Cannot create seccomp filter: {0}")] 153 CreateSeccompFilter(#[source] seccompiler::Error), 154 155 #[error("Cannot apply seccomp filter: {0}")] 156 ApplySeccompFilter(#[source] seccompiler::Error), 157 158 #[error("Error starting vCPU after restore: {0}")] 159 StartRestoreVcpu(#[source] anyhow::Error), 160 161 #[error("Unexpected VmExit")] 162 UnexpectedVmExit, 163 164 #[error("Failed to allocate MMIO address for CpuManager")] 165 AllocateMmmioAddress, 166 167 #[cfg(feature = "tdx")] 168 #[error("Error initializing TDX: {0}")] 169 InitializeTdx(#[source] hypervisor::HypervisorCpuError), 170 171 #[cfg(target_arch = "aarch64")] 172 #[error("Error initializing PMU: {0}")] 173 InitPmu(#[source] hypervisor::HypervisorCpuError), 174 175 #[cfg(feature = "guest_debug")] 176 #[error("Error during CPU debug: {0}")] 177 CpuDebug(#[source] hypervisor::HypervisorCpuError), 178 179 #[cfg(feature = "guest_debug")] 180 #[error("Error translating virtual address: {0}")] 181 TranslateVirtualAddress(#[source] anyhow::Error), 182 183 #[cfg(target_arch = "x86_64")] 184 #[error("Error setting up AMX: {0}")] 185 AmxEnable(#[source] anyhow::Error), 186 187 #[error("Maximum number of vCPUs exceeds host limit")] 188 
MaximumVcpusExceeded, 189 190 #[cfg(feature = "sev_snp")] 191 #[error("Failed to set sev control register: {0}")] 192 SetSevControlRegister(#[source] hypervisor::HypervisorCpuError), 193 194 #[cfg(target_arch = "x86_64")] 195 #[error("Failed to inject NMI")] 196 NmiError(hypervisor::HypervisorCpuError), 197 } 198 pub type Result<T> = result::Result<T, Error>; 199 200 #[cfg(target_arch = "x86_64")] 201 #[allow(dead_code)] 202 #[repr(packed)] 203 #[derive(AsBytes)] 204 struct LocalX2Apic { 205 pub r#type: u8, 206 pub length: u8, 207 pub _reserved: u16, 208 pub apic_id: u32, 209 pub flags: u32, 210 pub processor_id: u32, 211 } 212 213 #[allow(dead_code)] 214 #[repr(packed)] 215 #[derive(Default, AsBytes)] 216 struct Ioapic { 217 pub r#type: u8, 218 pub length: u8, 219 pub ioapic_id: u8, 220 _reserved: u8, 221 pub apic_address: u32, 222 pub gsi_base: u32, 223 } 224 225 #[cfg(target_arch = "aarch64")] 226 #[allow(dead_code)] 227 #[repr(packed)] 228 #[derive(AsBytes)] 229 struct GicC { 230 pub r#type: u8, 231 pub length: u8, 232 pub reserved0: u16, 233 pub cpu_interface_number: u32, 234 pub uid: u32, 235 pub flags: u32, 236 pub parking_version: u32, 237 pub performance_interrupt: u32, 238 pub parked_address: u64, 239 pub base_address: u64, 240 pub gicv_base_address: u64, 241 pub gich_base_address: u64, 242 pub vgic_interrupt: u32, 243 pub gicr_base_address: u64, 244 pub mpidr: u64, 245 pub proc_power_effi_class: u8, 246 pub reserved1: u8, 247 pub spe_overflow_interrupt: u16, 248 } 249 250 #[cfg(target_arch = "aarch64")] 251 #[allow(dead_code)] 252 #[repr(packed)] 253 #[derive(AsBytes)] 254 struct GicD { 255 pub r#type: u8, 256 pub length: u8, 257 pub reserved0: u16, 258 pub gic_id: u32, 259 pub base_address: u64, 260 pub global_irq_base: u32, 261 pub version: u8, 262 pub reserved1: [u8; 3], 263 } 264 265 #[cfg(target_arch = "aarch64")] 266 #[allow(dead_code)] 267 #[repr(packed)] 268 #[derive(AsBytes)] 269 struct GicR { 270 pub r#type: u8, 271 pub length: u8, 272 pub reserved: u16, 273 pub base_address: u64, 274 pub range_length: u32, 275 } 276 277 #[cfg(target_arch = "aarch64")] 278 #[allow(dead_code)] 279 #[repr(packed)] 280 #[derive(AsBytes)] 281 struct GicIts { 282 pub r#type: u8, 283 pub length: u8, 284 pub reserved0: u16, 285 pub translation_id: u32, 286 pub base_address: u64, 287 pub reserved1: u32, 288 } 289 290 #[cfg(target_arch = "aarch64")] 291 #[allow(dead_code)] 292 #[repr(packed)] 293 #[derive(AsBytes)] 294 struct ProcessorHierarchyNode { 295 pub r#type: u8, 296 pub length: u8, 297 pub reserved: u16, 298 pub flags: u32, 299 pub parent: u32, 300 pub acpi_processor_id: u32, 301 pub num_private_resources: u32, 302 } 303 304 #[allow(dead_code)] 305 #[repr(packed)] 306 #[derive(Default, AsBytes)] 307 struct InterruptSourceOverride { 308 pub r#type: u8, 309 pub length: u8, 310 pub bus: u8, 311 pub source: u8, 312 pub gsi: u32, 313 pub flags: u16, 314 } 315 316 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 317 macro_rules! round_up { 318 ($n:expr,$d:expr) => { 319 (($n / ($d + 1)) + 1) * $d 320 }; 321 } 322 323 /// A wrapper around creating and using a kvm-based VCPU. 324 pub struct Vcpu { 325 // The hypervisor abstracted CPU. 326 vcpu: Arc<dyn hypervisor::Vcpu>, 327 id: u8, 328 #[cfg(target_arch = "aarch64")] 329 mpidr: u64, 330 saved_state: Option<CpuState>, 331 #[cfg(target_arch = "x86_64")] 332 vendor: CpuVendor, 333 } 334 335 impl Vcpu { 336 /// Constructs a new VCPU for `vm`. 
337 /// 338 /// # Arguments 339 /// 340 /// * `id` - Represents the CPU number between [0, max vcpus). 341 /// * `vm` - The virtual machine this vcpu will get attached to. 342 /// * `vm_ops` - Optional object for exit handling. 343 /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0) 344 pub fn new( 345 id: u8, 346 apic_id: u8, 347 vm: &Arc<dyn hypervisor::Vm>, 348 vm_ops: Option<Arc<dyn VmOps>>, 349 #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor, 350 ) -> Result<Self> { 351 let vcpu = vm 352 .create_vcpu(apic_id, vm_ops) 353 .map_err(|e| Error::VcpuCreate(e.into()))?; 354 // Initially the cpuid per vCPU is the one supported by this VM. 355 Ok(Vcpu { 356 vcpu, 357 id, 358 #[cfg(target_arch = "aarch64")] 359 mpidr: 0, 360 saved_state: None, 361 #[cfg(target_arch = "x86_64")] 362 vendor: cpu_vendor, 363 }) 364 } 365 366 /// Configures a vcpu and should be called once per vcpu when created. 367 /// 368 /// # Arguments 369 /// 370 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 371 /// * `guest_memory` - Guest memory. 372 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 373 pub fn configure( 374 &mut self, 375 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 376 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 377 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 378 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 379 #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>, 380 ) -> Result<()> { 381 #[cfg(target_arch = "aarch64")] 382 { 383 self.init(vm)?; 384 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup) 385 .map_err(Error::VcpuConfiguration)?; 386 } 387 info!("Configuring vCPU: cpu_id = {}", self.id); 388 #[cfg(target_arch = "x86_64")] 389 arch::configure_vcpu( 390 &self.vcpu, 391 self.id, 392 boot_setup, 393 cpuid, 394 kvm_hyperv, 395 self.vendor, 396 topology, 397 ) 398 .map_err(Error::VcpuConfiguration)?; 399 400 Ok(()) 401 } 402 403 /// Gets the MPIDR register value. 404 #[cfg(target_arch = "aarch64")] 405 pub fn get_mpidr(&self) -> u64 { 406 self.mpidr 407 } 408 409 /// Gets the saved vCPU state. 410 #[cfg(target_arch = "aarch64")] 411 pub fn get_saved_state(&self) -> Option<CpuState> { 412 self.saved_state.clone() 413 } 414 415 /// Initializes an aarch64 specific vcpu for booting Linux. 416 #[cfg(target_arch = "aarch64")] 417 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 418 let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); 419 420 // This reads back the kernel's preferred target type. 421 vm.get_preferred_target(&mut kvi) 422 .map_err(Error::VcpuArmPreferredTarget)?; 423 // We already checked that the capability is supported. 424 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; 425 if vm 426 .as_any() 427 .downcast_ref::<hypervisor::kvm::KvmVm>() 428 .unwrap() 429 .check_extension(Cap::ArmPmuV3) 430 { 431 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3; 432 } 433 // Non-boot cpus are powered off initially. 434 if self.id > 0 { 435 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; 436 } 437 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit) 438 } 439 440 /// Runs the VCPU until it exits, returning the reason. 441 /// 442 /// Note that the state of the VCPU and associated VM must be setup first for this to do 443 /// anything useful. 
444 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 445 self.vcpu.run() 446 } 447 448 #[cfg(feature = "sev_snp")] 449 pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> { 450 self.vcpu 451 .set_sev_control_register(vmsa_pfn) 452 .map_err(Error::SetSevControlRegister) 453 } 454 } 455 456 impl Pausable for Vcpu {} 457 impl Snapshottable for Vcpu { 458 fn id(&self) -> String { 459 self.id.to_string() 460 } 461 462 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 463 let saved_state = self 464 .vcpu 465 .state() 466 .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?; 467 468 self.saved_state = Some(saved_state.clone()); 469 470 Ok(Snapshot::from_data(SnapshotData::new_from_state( 471 &saved_state, 472 )?)) 473 } 474 } 475 476 pub struct CpuManager { 477 config: CpusConfig, 478 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 479 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 480 #[cfg(target_arch = "x86_64")] 481 cpuid: Vec<CpuIdEntry>, 482 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 483 vm: Arc<dyn hypervisor::Vm>, 484 vcpus_kill_signalled: Arc<AtomicBool>, 485 vcpus_pause_signalled: Arc<AtomicBool>, 486 vcpus_kick_signalled: Arc<AtomicBool>, 487 exit_evt: EventFd, 488 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 489 reset_evt: EventFd, 490 #[cfg(feature = "guest_debug")] 491 vm_debug_evt: EventFd, 492 vcpu_states: Vec<VcpuState>, 493 selected_cpu: u8, 494 vcpus: Vec<Arc<Mutex<Vcpu>>>, 495 seccomp_action: SeccompAction, 496 vm_ops: Arc<dyn VmOps>, 497 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 498 acpi_address: Option<GuestAddress>, 499 proximity_domain_per_cpu: BTreeMap<u8, u32>, 500 affinity: BTreeMap<u8, Vec<usize>>, 501 dynamic: bool, 502 hypervisor: Arc<dyn hypervisor::Hypervisor>, 503 #[cfg(feature = "sev_snp")] 504 sev_snp_enabled: bool, 505 } 506 507 const CPU_ENABLE_FLAG: usize = 0; 508 const CPU_INSERTING_FLAG: usize = 1; 509 const CPU_REMOVING_FLAG: usize = 2; 510 const CPU_EJECT_FLAG: usize = 3; 511 512 const CPU_STATUS_OFFSET: u64 = 4; 513 const CPU_SELECTION_OFFSET: u64 = 0; 514 515 impl BusDevice for CpuManager { 516 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 517 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
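        // Register layout of this device (also exposed to the guest through the
        // "PRST" operation region in the AML emitted at the end of this file):
        // offset 0 holds the currently selected vCPU index (CSEL) and offset 4
        // holds the status/control bits for that vCPU: bit 0 = enabled (CPEN),
        // bit 1 = inserting (CINS), bit 2 = removing (CRMV), bit 3 = eject (CEJ0).
        // For example, the guest's CSCN method selects each vCPU in turn by
        // writing its index to offset 0, then reads offset 4 and checks bit 1
        // to detect a freshly hot-plugged vCPU.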
518 data.fill(0); 519 520 match offset { 521 CPU_SELECTION_OFFSET => { 522 data[0] = self.selected_cpu; 523 } 524 CPU_STATUS_OFFSET => { 525 if self.selected_cpu < self.max_vcpus() { 526 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 527 if state.active() { 528 data[0] |= 1 << CPU_ENABLE_FLAG; 529 } 530 if state.inserting { 531 data[0] |= 1 << CPU_INSERTING_FLAG; 532 } 533 if state.removing { 534 data[0] |= 1 << CPU_REMOVING_FLAG; 535 } 536 } else { 537 warn!("Out of range vCPU id: {}", self.selected_cpu); 538 } 539 } 540 _ => { 541 warn!( 542 "Unexpected offset for accessing CPU manager device: {:#}", 543 offset 544 ); 545 } 546 } 547 } 548 549 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 550 match offset { 551 CPU_SELECTION_OFFSET => { 552 self.selected_cpu = data[0]; 553 } 554 CPU_STATUS_OFFSET => { 555 if self.selected_cpu < self.max_vcpus() { 556 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 557 // The ACPI code writes back a 1 to acknowledge the insertion 558 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 559 && state.inserting 560 { 561 state.inserting = false; 562 } 563 // Ditto for removal 564 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 565 && state.removing 566 { 567 state.removing = false; 568 } 569 // Trigger removal of vCPU 570 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 571 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 572 error!("Error removing vCPU: {:?}", e); 573 } 574 } 575 } else { 576 warn!("Out of range vCPU id: {}", self.selected_cpu); 577 } 578 } 579 _ => { 580 warn!( 581 "Unexpected offset for accessing CPU manager device: {:#}", 582 offset 583 ); 584 } 585 } 586 None 587 } 588 } 589 590 #[derive(Default)] 591 struct VcpuState { 592 inserting: bool, 593 removing: bool, 594 pending_removal: Arc<AtomicBool>, 595 handle: Option<thread::JoinHandle<()>>, 596 kill: Arc<AtomicBool>, 597 vcpu_run_interrupted: Arc<AtomicBool>, 598 paused: Arc<AtomicBool>, 599 } 600 601 impl VcpuState { 602 fn active(&self) -> bool { 603 self.handle.is_some() 604 } 605 606 fn signal_thread(&self) { 607 if let Some(handle) = self.handle.as_ref() { 608 loop { 609 // SAFETY: FFI call with correct arguments 610 unsafe { 611 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 612 } 613 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 614 break; 615 } else { 616 // This is more effective than thread::yield_now() at 617 // avoiding a priority inversion with the vCPU thread 618 thread::sleep(std::time::Duration::from_millis(1)); 619 } 620 } 621 } 622 } 623 624 fn join_thread(&mut self) -> Result<()> { 625 if let Some(handle) = self.handle.take() { 626 handle.join().map_err(Error::ThreadCleanup)? 
627 } 628 629 Ok(()) 630 } 631 632 fn unpark_thread(&self) { 633 if let Some(handle) = self.handle.as_ref() { 634 handle.thread().unpark() 635 } 636 } 637 } 638 639 impl CpuManager { 640 #[allow(unused_variables)] 641 #[allow(clippy::too_many_arguments)] 642 pub fn new( 643 config: &CpusConfig, 644 vm: Arc<dyn hypervisor::Vm>, 645 exit_evt: EventFd, 646 reset_evt: EventFd, 647 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 648 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 649 seccomp_action: SeccompAction, 650 vm_ops: Arc<dyn VmOps>, 651 #[cfg(feature = "tdx")] tdx_enabled: bool, 652 numa_nodes: &NumaNodes, 653 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 654 ) -> Result<Arc<Mutex<CpuManager>>> { 655 if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() { 656 return Err(Error::MaximumVcpusExceeded); 657 } 658 659 let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); 660 vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); 661 let hypervisor_type = hypervisor.hypervisor_type(); 662 #[cfg(target_arch = "x86_64")] 663 let cpu_vendor = hypervisor.get_cpu_vendor(); 664 665 #[cfg(target_arch = "x86_64")] 666 if config.features.amx { 667 const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; 668 const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; 669 const XFEATURE_XTILEDATA: usize = 18; 670 const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; 671 672 // SAFETY: the syscall is only modifying kernel internal 673 // data structures that the kernel is itself expected to safeguard. 674 let amx_tile = unsafe { 675 libc::syscall( 676 libc::SYS_arch_prctl, 677 ARCH_REQ_XCOMP_GUEST_PERM, 678 XFEATURE_XTILEDATA, 679 ) 680 }; 681 682 if amx_tile != 0 { 683 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 684 } else { 685 let mask: usize = 0; 686 // SAFETY: the mask being modified (not marked mutable as it is 687 // modified in unsafe only which is permitted) isn't in use elsewhere. 
688 let result = unsafe { 689 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 690 }; 691 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 692 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 693 } 694 } 695 } 696 697 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 698 let mut cpu_list = Vec::new(); 699 for (proximity_domain, numa_node) in numa_nodes.iter() { 700 for cpu in numa_node.cpus.iter() { 701 cpu_list.push((*cpu, *proximity_domain)) 702 } 703 } 704 cpu_list 705 } 706 .into_iter() 707 .collect(); 708 709 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 710 cpu_affinity 711 .iter() 712 .map(|a| (a.vcpu, a.host_cpus.clone())) 713 .collect() 714 } else { 715 BTreeMap::new() 716 }; 717 718 #[cfg(feature = "tdx")] 719 let dynamic = !tdx_enabled; 720 #[cfg(not(feature = "tdx"))] 721 let dynamic = true; 722 723 Ok(Arc::new(Mutex::new(CpuManager { 724 config: config.clone(), 725 interrupt_controller: None, 726 #[cfg(target_arch = "x86_64")] 727 cpuid: Vec::new(), 728 vm, 729 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 730 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 731 vcpus_kick_signalled: Arc::new(AtomicBool::new(false)), 732 vcpu_states, 733 exit_evt, 734 reset_evt, 735 #[cfg(feature = "guest_debug")] 736 vm_debug_evt, 737 selected_cpu: 0, 738 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 739 seccomp_action, 740 vm_ops, 741 acpi_address: None, 742 proximity_domain_per_cpu, 743 affinity, 744 dynamic, 745 hypervisor: hypervisor.clone(), 746 #[cfg(feature = "sev_snp")] 747 sev_snp_enabled, 748 }))) 749 } 750 751 #[cfg(target_arch = "x86_64")] 752 pub fn populate_cpuid( 753 &mut self, 754 memory_manager: &Arc<Mutex<MemoryManager>>, 755 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 756 #[cfg(feature = "tdx")] tdx: bool, 757 ) -> Result<()> { 758 let sgx_epc_sections = memory_manager 759 .lock() 760 .unwrap() 761 .sgx_epc_region() 762 .as_ref() 763 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 764 765 self.cpuid = { 766 let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits); 767 arch::generate_common_cpuid( 768 hypervisor, 769 &arch::CpuidConfig { 770 sgx_epc_sections, 771 phys_bits, 772 kvm_hyperv: self.config.kvm_hyperv, 773 #[cfg(feature = "tdx")] 774 tdx, 775 amx: self.config.features.amx, 776 }, 777 ) 778 .map_err(Error::CommonCpuId)? 779 }; 780 781 Ok(()) 782 } 783 784 fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> { 785 info!("Creating vCPU: cpu_id = {}", cpu_id); 786 787 #[cfg(target_arch = "x86_64")] 788 let topology = self.get_vcpu_topology(); 789 #[cfg(target_arch = "x86_64")] 790 let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology); 791 #[cfg(target_arch = "aarch64")] 792 let x2apic_id = cpu_id as u32; 793 794 let mut vcpu = Vcpu::new( 795 cpu_id, 796 x2apic_id as u8, 797 &self.vm, 798 Some(self.vm_ops.clone()), 799 #[cfg(target_arch = "x86_64")] 800 self.hypervisor.get_cpu_vendor(), 801 )?; 802 803 if let Some(snapshot) = snapshot { 804 // AArch64 vCPUs should be initialized after created. 
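            // (On KVM, setting vCPU register state generally requires the vCPU to
            // have been initialized with KVM_ARM_VCPU_INIT first, hence the explicit
            // init() call below before the snapshotted state is restored with
            // set_state().)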
805 #[cfg(target_arch = "aarch64")] 806 vcpu.init(&self.vm)?; 807 808 let state: CpuState = snapshot.to_state().map_err(|e| { 809 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e)) 810 })?; 811 vcpu.vcpu 812 .set_state(&state) 813 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?; 814 815 vcpu.saved_state = Some(state); 816 } 817 818 let vcpu = Arc::new(Mutex::new(vcpu)); 819 820 // Adding vCPU to the CpuManager's vCPU list. 821 self.vcpus.push(vcpu.clone()); 822 823 Ok(vcpu) 824 } 825 826 pub fn configure_vcpu( 827 &self, 828 vcpu: Arc<Mutex<Vcpu>>, 829 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 830 ) -> Result<()> { 831 let mut vcpu = vcpu.lock().unwrap(); 832 833 #[cfg(feature = "sev_snp")] 834 if self.sev_snp_enabled { 835 if let Some((kernel_entry_point, _)) = boot_setup { 836 vcpu.set_sev_control_register( 837 kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE, 838 )?; 839 } 840 841 // Traditional way to configure vcpu doesn't work for SEV-SNP guests. 842 // All the vCPU configuration for SEV-SNP guest is provided via VMSA. 843 return Ok(()); 844 } 845 846 #[cfg(target_arch = "x86_64")] 847 assert!(!self.cpuid.is_empty()); 848 849 #[cfg(target_arch = "x86_64")] 850 let topology = self.config.topology.clone().map_or_else( 851 || Some((1, self.boot_vcpus(), 1)), 852 |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)), 853 ); 854 #[cfg(target_arch = "x86_64")] 855 vcpu.configure( 856 boot_setup, 857 self.cpuid.clone(), 858 self.config.kvm_hyperv, 859 topology, 860 )?; 861 862 #[cfg(target_arch = "aarch64")] 863 vcpu.configure(&self.vm, boot_setup)?; 864 865 Ok(()) 866 } 867 868 /// Only create new vCPUs if there aren't any inactive ones to reuse 869 fn create_vcpus( 870 &mut self, 871 desired_vcpus: u8, 872 snapshot: Option<Snapshot>, 873 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 874 let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![]; 875 info!( 876 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 877 desired_vcpus, 878 self.config.max_vcpus, 879 self.vcpus.len(), 880 self.present_vcpus() 881 ); 882 883 if desired_vcpus > self.config.max_vcpus { 884 return Err(Error::DesiredVCpuCountExceedsMax); 885 } 886 887 // Only create vCPUs in excess of all the allocated vCPUs. 888 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 889 vcpus.push(self.create_vcpu( 890 cpu_id, 891 // TODO: The special format of the CPU id can be removed once 892 // ready to break live upgrade. 893 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()), 894 )?); 895 } 896 897 Ok(vcpus) 898 } 899 900 #[cfg(target_arch = "aarch64")] 901 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 902 for cpu in self.vcpus.iter() { 903 let cpu = cpu.lock().unwrap(); 904 // Check if PMU attr is available, if not, log the information. 
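            // If any vCPU lacks the PMU attribute we return Ok(false) rather than
            // failing, so the caller can simply avoid advertising a PMU to the guest.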
905 if cpu.vcpu.has_pmu_support() { 906 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 907 } else { 908 debug!( 909 "PMU attribute is not supported in vCPU{}, skip PMU init!", 910 cpu.id 911 ); 912 return Ok(false); 913 } 914 } 915 916 Ok(true) 917 } 918 919 pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> { 920 self.vcpus.clone() 921 } 922 923 fn start_vcpu( 924 &mut self, 925 vcpu: Arc<Mutex<Vcpu>>, 926 vcpu_id: u8, 927 vcpu_thread_barrier: Arc<Barrier>, 928 inserting: bool, 929 ) -> Result<()> { 930 let reset_evt = self.reset_evt.try_clone().unwrap(); 931 let exit_evt = self.exit_evt.try_clone().unwrap(); 932 #[cfg(feature = "kvm")] 933 let hypervisor_type = self.hypervisor.hypervisor_type(); 934 #[cfg(feature = "guest_debug")] 935 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 936 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 937 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 938 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 939 let vcpu_kick_signalled = self.vcpus_kick_signalled.clone(); 940 941 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 942 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 943 .vcpu_run_interrupted 944 .clone(); 945 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 946 let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone(); 947 948 // Prepare the CPU set the current vCPU is expected to run onto. 949 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 950 // SAFETY: all zeros is a valid pattern 951 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 952 // SAFETY: FFI call, trivially safe 953 unsafe { libc::CPU_ZERO(&mut cpuset) }; 954 for host_cpu in host_cpus { 955 // SAFETY: FFI call, trivially safe 956 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) }; 957 } 958 cpuset 959 }); 960 961 // Retrieve seccomp filter for vcpu thread 962 let vcpu_seccomp_filter = get_seccomp_filter( 963 &self.seccomp_action, 964 Thread::Vcpu, 965 self.hypervisor.hypervisor_type(), 966 ) 967 .map_err(Error::CreateSeccompFilter)?; 968 969 #[cfg(target_arch = "x86_64")] 970 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 971 972 info!("Starting vCPU: cpu_id = {}", vcpu_id); 973 974 let handle = Some( 975 thread::Builder::new() 976 .name(format!("vcpu{vcpu_id}")) 977 .spawn(move || { 978 // Schedule the thread to run on the expected CPU set 979 if let Some(cpuset) = cpuset.as_ref() { 980 // SAFETY: FFI call with correct arguments 981 let ret = unsafe { 982 libc::sched_setaffinity( 983 0, 984 std::mem::size_of::<libc::cpu_set_t>(), 985 cpuset as *const libc::cpu_set_t, 986 ) 987 }; 988 989 if ret != 0 { 990 error!( 991 "Failed scheduling the vCPU {} on the expected CPU set: {}", 992 vcpu_id, 993 io::Error::last_os_error() 994 ); 995 return; 996 } 997 } 998 999 // Apply seccomp filter for vcpu thread. 1000 if !vcpu_seccomp_filter.is_empty() { 1001 if let Err(e) = 1002 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 1003 { 1004 error!("Error applying seccomp filter: {:?}", e); 1005 return; 1006 } 1007 } 1008 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 1009 // This uses an async signal safe handler to kill the vcpu handles. 1010 register_signal_handler(SIGRTMIN(), handle_signal) 1011 .expect("Failed to register vcpu signal handler"); 1012 // Block until all CPUs are ready. 
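                    // (The signal handler registered above is intentionally a no-op:
                    // its only job is to let the pthread_kill(SIGRTMIN) issued by
                    // VcpuState::signal_thread() interrupt the blocking ioctl inside
                    // vcpu.run() with EINTR, so the run loop can notice the
                    // pause/kick/kill flags.)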
1013 vcpu_thread_barrier.wait(); 1014 1015 std::panic::catch_unwind(move || { 1016 loop { 1017 // If we are being told to pause, we park the thread 1018 // until the pause boolean is toggled. 1019 // The resume operation is responsible for toggling 1020 // the boolean and unpark the thread. 1021 // We enter a loop because park() could spuriously 1022 // return. We will then park() again unless the 1023 // pause boolean has been toggled. 1024 1025 // Need to use Ordering::SeqCst as we have multiple 1026 // loads and stores to different atomics and we need 1027 // to see them in a consistent order in all threads 1028 1029 if vcpu_pause_signalled.load(Ordering::SeqCst) { 1030 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are 1031 // completed by returning to KVM_RUN. From the kernel docs: 1032 // 1033 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, 1034 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding 1035 // operations are complete (and guest state is consistent) only after userspace 1036 // has re-entered the kernel with KVM_RUN. The kernel side will first finish 1037 // incomplete operations and then check for pending signals. 1038 // The pending state of the operation is not preserved in state which is 1039 // visible to userspace, thus userspace should ensure that the operation is 1040 // completed before performing a live migration. Userspace can re-enter the 1041 // guest with an unmasked signal pending or with the immediate_exit field set 1042 // to complete pending operations without allowing any further instructions 1043 // to be executed. 1044 1045 #[cfg(feature = "kvm")] 1046 if matches!(hypervisor_type, HypervisorType::Kvm) { 1047 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); 1048 if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { 1049 error!("Unexpected VM exit on \"immediate_exit\" run"); 1050 break; 1051 } 1052 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); 1053 } 1054 1055 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1056 1057 vcpu_paused.store(true, Ordering::SeqCst); 1058 while vcpu_pause_signalled.load(Ordering::SeqCst) { 1059 thread::park(); 1060 } 1061 vcpu_run_interrupted.store(false, Ordering::SeqCst); 1062 } 1063 1064 if vcpu_kick_signalled.load(Ordering::SeqCst) { 1065 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1066 #[cfg(target_arch = "x86_64")] 1067 match vcpu.lock().as_ref().unwrap().vcpu.nmi() { 1068 Ok(()) => {}, 1069 Err(e) => { 1070 error!("Error when inject nmi {}", e); 1071 break; 1072 } 1073 } 1074 } 1075 1076 // We've been told to terminate 1077 if vcpu_kill_signalled.load(Ordering::SeqCst) 1078 || vcpu_kill.load(Ordering::SeqCst) 1079 { 1080 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1081 break; 1082 } 1083 1084 #[cfg(feature = "tdx")] 1085 let mut vcpu = vcpu.lock().unwrap(); 1086 #[cfg(not(feature = "tdx"))] 1087 let vcpu = vcpu.lock().unwrap(); 1088 // vcpu.run() returns false on a triple-fault so trigger a reset 1089 match vcpu.run() { 1090 Ok(run) => match run { 1091 #[cfg(feature = "kvm")] 1092 VmExit::Debug => { 1093 info!("VmExit::Debug"); 1094 #[cfg(feature = "guest_debug")] 1095 { 1096 vcpu_pause_signalled.store(true, Ordering::SeqCst); 1097 let raw_tid = get_raw_tid(vcpu_id as usize); 1098 vm_debug_evt.write(raw_tid as u64).unwrap(); 1099 } 1100 } 1101 #[cfg(target_arch = "x86_64")] 1102 VmExit::IoapicEoi(vector) => { 1103 if let Some(interrupt_controller) = 1104 &interrupt_controller_clone 
1105 { 1106 interrupt_controller 1107 .lock() 1108 .unwrap() 1109 .end_of_interrupt(vector); 1110 } 1111 } 1112 VmExit::Ignore => {} 1113 VmExit::Hyperv => {} 1114 VmExit::Reset => { 1115 info!("VmExit::Reset"); 1116 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1117 reset_evt.write(1).unwrap(); 1118 break; 1119 } 1120 VmExit::Shutdown => { 1121 info!("VmExit::Shutdown"); 1122 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1123 exit_evt.write(1).unwrap(); 1124 break; 1125 } 1126 #[cfg(feature = "tdx")] 1127 VmExit::Tdx => { 1128 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1129 match vcpu.get_tdx_exit_details() { 1130 Ok(details) => match details { 1131 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1132 TdxExitDetails::SetupEventNotifyInterrupt => { 1133 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1134 } 1135 }, 1136 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1137 } 1138 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1139 } else { 1140 // We should never reach this code as 1141 // this means the design from the code 1142 // is wrong. 1143 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1144 } 1145 } 1146 }, 1147 1148 Err(e) => { 1149 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1150 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1151 exit_evt.write(1).unwrap(); 1152 break; 1153 } 1154 } 1155 1156 // We've been told to terminate 1157 if vcpu_kill_signalled.load(Ordering::SeqCst) 1158 || vcpu_kill.load(Ordering::SeqCst) 1159 { 1160 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1161 break; 1162 } 1163 } 1164 }) 1165 .or_else(|_| { 1166 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1167 error!("vCPU thread panicked"); 1168 panic_exit_evt.write(1) 1169 }) 1170 .ok(); 1171 }) 1172 .map_err(Error::VcpuSpawn)?, 1173 ); 1174 1175 // On hot plug calls into this function entry_point is None. It is for 1176 // those hotplug CPU additions that we need to set the inserting flag. 1177 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1178 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1179 1180 Ok(()) 1181 } 1182 1183 /// Start up as many vCPUs threads as needed to reach `desired_vcpus` 1184 fn activate_vcpus( 1185 &mut self, 1186 desired_vcpus: u8, 1187 inserting: bool, 1188 paused: Option<bool>, 1189 ) -> Result<()> { 1190 if desired_vcpus > self.config.max_vcpus { 1191 return Err(Error::DesiredVCpuCountExceedsMax); 1192 } 1193 1194 let vcpu_thread_barrier = Arc::new(Barrier::new( 1195 (desired_vcpus - self.present_vcpus() + 1) as usize, 1196 )); 1197 1198 if let Some(paused) = paused { 1199 self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); 1200 } 1201 1202 info!( 1203 "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", 1204 desired_vcpus, 1205 self.vcpus.len(), 1206 self.present_vcpus(), 1207 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1208 ); 1209 1210 // This reuses any inactive vCPUs as well as any that were newly created 1211 for vcpu_id in self.present_vcpus()..desired_vcpus { 1212 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1213 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1214 } 1215 1216 // Unblock all CPU threads. 
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal, actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
            self.vcpu_states[usize::from(cpu_id)]
                .pending_removal
                .store(true, Ordering::SeqCst);
        }
    }

    pub fn check_pending_removed_vcpu(&mut self) -> bool {
        for state in self.vcpu_states.iter() {
            if state.active() && state.pending_removal.load(Ordering::SeqCst) {
                return true;
            }
        }
        false
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" so that it can be reused
        state.kill.store(false, Ordering::SeqCst);
        state.pending_removal.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(
        &mut self,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        trace_scoped!("create_boot_vcpus");

        self.create_vcpus(self.boot_vcpus(), snapshot)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
    pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
        self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
    }

    pub fn start_restored_vcpus(&mut self) -> Result<()> {
        self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
            .map_err(|e| {
                Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
            })?;

        Ok(())
    }

    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal {
            return Ok(false);
        }

        if !self.dynamic {
            return Ok(false);
        }

        if self.check_pending_removed_vcpu() {
            return Err(Error::VcpuPendingRemovedVcpu);
        }

        match desired_vcpus.cmp(&self.present_vcpus()) {
            cmp::Ordering::Greater => {
                let vcpus = self.create_vcpus(desired_vcpus, None)?;
                for vcpu in vcpus {
                    self.configure_vcpu(vcpu, None)?
                }
                self.activate_vcpus(desired_vcpus, true, None)?;
                Ok(true)
            }
            cmp::Ordering::Less => {
                self.mark_vcpus_for_removal(desired_vcpus);
                Ok(true)
            }
            _ => Ok(false),
        }
    }

    pub fn shutdown(&mut self) -> Result<()> {
        // Tell the vCPUs to stop themselves next time they go through the loop
        self.vcpus_kill_signalled.store(true, Ordering::SeqCst);

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        // Wait for all the threads to finish. This removes the state from the vector.
        for mut state in self.vcpu_states.drain(..)
{ 1329 state.join_thread()?; 1330 } 1331 1332 Ok(()) 1333 } 1334 1335 #[cfg(feature = "tdx")] 1336 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1337 for vcpu in &self.vcpus { 1338 vcpu.lock() 1339 .unwrap() 1340 .vcpu 1341 .tdx_init(hob_address) 1342 .map_err(Error::InitializeTdx)?; 1343 } 1344 Ok(()) 1345 } 1346 1347 pub fn boot_vcpus(&self) -> u8 { 1348 self.config.boot_vcpus 1349 } 1350 1351 pub fn max_vcpus(&self) -> u8 { 1352 self.config.max_vcpus 1353 } 1354 1355 #[cfg(target_arch = "x86_64")] 1356 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1357 assert!(!self.cpuid.is_empty()); 1358 self.cpuid.clone() 1359 } 1360 1361 fn present_vcpus(&self) -> u8 { 1362 self.vcpu_states 1363 .iter() 1364 .fold(0, |acc, state| acc + state.active() as u8) 1365 } 1366 1367 #[cfg(target_arch = "aarch64")] 1368 pub fn get_mpidrs(&self) -> Vec<u64> { 1369 self.vcpus 1370 .iter() 1371 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1372 .collect() 1373 } 1374 1375 #[cfg(target_arch = "aarch64")] 1376 pub fn get_saved_states(&self) -> Vec<CpuState> { 1377 self.vcpus 1378 .iter() 1379 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1380 .collect() 1381 } 1382 1383 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1384 self.config 1385 .topology 1386 .clone() 1387 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1388 } 1389 1390 pub fn create_madt(&self) -> Sdt { 1391 use crate::acpi; 1392 // This is also checked in the commandline parsing. 1393 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1394 1395 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1396 #[cfg(target_arch = "x86_64")] 1397 { 1398 madt.write(36, arch::layout::APIC_START.0); 1399 1400 for cpu in 0..self.config.max_vcpus { 1401 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology()); 1402 1403 let lapic = LocalX2Apic { 1404 r#type: acpi::ACPI_X2APIC_PROCESSOR, 1405 length: 16, 1406 processor_id: cpu.into(), 1407 apic_id: x2apic_id, 1408 flags: if cpu < self.config.boot_vcpus { 1409 1 << MADT_CPU_ENABLE_FLAG 1410 } else { 1411 0 1412 } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG, 1413 _reserved: 0, 1414 }; 1415 madt.append(lapic); 1416 } 1417 1418 madt.append(Ioapic { 1419 r#type: acpi::ACPI_APIC_IO, 1420 length: 12, 1421 ioapic_id: 0, 1422 apic_address: arch::layout::IOAPIC_START.0 as u32, 1423 gsi_base: 0, 1424 ..Default::default() 1425 }); 1426 1427 madt.append(InterruptSourceOverride { 1428 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1429 length: 10, 1430 bus: 0, 1431 source: 4, 1432 gsi: 4, 1433 flags: 0, 1434 }); 1435 } 1436 1437 #[cfg(target_arch = "aarch64")] 1438 { 1439 /* Notes: 1440 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1441 */ 1442 1443 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 
1444 for cpu in 0..self.config.boot_vcpus { 1445 let vcpu = &self.vcpus[cpu as usize]; 1446 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1447 /* ARMv8 MPIDR format: 1448 Bits [63:40] Must be zero 1449 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1450 Bits [31:24] Must be zero 1451 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1452 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1453 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1454 */ 1455 let mpidr_mask = 0xff_00ff_ffff; 1456 let gicc = GicC { 1457 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1458 length: 80, 1459 reserved0: 0, 1460 cpu_interface_number: cpu as u32, 1461 uid: cpu as u32, 1462 flags: 1, 1463 parking_version: 0, 1464 performance_interrupt: 0, 1465 parked_address: 0, 1466 base_address: 0, 1467 gicv_base_address: 0, 1468 gich_base_address: 0, 1469 vgic_interrupt: 0, 1470 gicr_base_address: 0, 1471 mpidr: mpidr & mpidr_mask, 1472 proc_power_effi_class: 0, 1473 reserved1: 0, 1474 spe_overflow_interrupt: 0, 1475 }; 1476 1477 madt.append(gicc); 1478 } 1479 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1480 1481 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1482 let gicd = GicD { 1483 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1484 length: 24, 1485 reserved0: 0, 1486 gic_id: 0, 1487 base_address: vgic_config.dist_addr, 1488 global_irq_base: 0, 1489 version: 3, 1490 reserved1: [0; 3], 1491 }; 1492 madt.append(gicd); 1493 1494 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1495 let gicr = GicR { 1496 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1497 length: 16, 1498 reserved: 0, 1499 base_address: vgic_config.redists_addr, 1500 range_length: vgic_config.redists_size as u32, 1501 }; 1502 madt.append(gicr); 1503 1504 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 1505 let gicits = GicIts { 1506 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1507 length: 20, 1508 reserved0: 0, 1509 translation_id: 0, 1510 base_address: vgic_config.msi_addr, 1511 reserved1: 0, 1512 }; 1513 madt.append(gicits); 1514 1515 madt.update_checksum(); 1516 } 1517 1518 madt 1519 } 1520 1521 #[cfg(target_arch = "aarch64")] 1522 pub fn create_pptt(&self) -> Sdt { 1523 let pptt_start = 0; 1524 let mut cpus = 0; 1525 let mut uid = 0; 1526 // If topology is not specified, the default setting is: 1527 // 1 package, multiple cores, 1 thread per core 1528 // This is also the behavior when PPTT is missing. 
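        // For example (hypothetical numbers): with packages = 1, cores_per_package = 2
        // and threads_per_core = 2, the table built below contains one package node
        // (flags 0x2: ACPI Processor ID valid), two core nodes underneath it (also 0x2)
        // and four thread leaves (flags 0xE: ID valid | thread | leaf). Without SMT
        // (threads_per_core = 1) the cores themselves become the leaves, with
        // flags 0xA (ID valid | leaf).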
1529 let (threads_per_core, cores_per_package, packages) = 1530 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1531 1532 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1533 1534 for cluster_idx in 0..packages { 1535 if cpus < self.config.boot_vcpus as usize { 1536 let cluster_offset = pptt.len() - pptt_start; 1537 let cluster_hierarchy_node = ProcessorHierarchyNode { 1538 r#type: 0, 1539 length: 20, 1540 reserved: 0, 1541 flags: 0x2, 1542 parent: 0, 1543 acpi_processor_id: cluster_idx as u32, 1544 num_private_resources: 0, 1545 }; 1546 pptt.append(cluster_hierarchy_node); 1547 1548 for core_idx in 0..cores_per_package { 1549 let core_offset = pptt.len() - pptt_start; 1550 1551 if threads_per_core > 1 { 1552 let core_hierarchy_node = ProcessorHierarchyNode { 1553 r#type: 0, 1554 length: 20, 1555 reserved: 0, 1556 flags: 0x2, 1557 parent: cluster_offset as u32, 1558 acpi_processor_id: core_idx as u32, 1559 num_private_resources: 0, 1560 }; 1561 pptt.append(core_hierarchy_node); 1562 1563 for _thread_idx in 0..threads_per_core { 1564 let thread_hierarchy_node = ProcessorHierarchyNode { 1565 r#type: 0, 1566 length: 20, 1567 reserved: 0, 1568 flags: 0xE, 1569 parent: core_offset as u32, 1570 acpi_processor_id: uid as u32, 1571 num_private_resources: 0, 1572 }; 1573 pptt.append(thread_hierarchy_node); 1574 uid += 1; 1575 } 1576 } else { 1577 let thread_hierarchy_node = ProcessorHierarchyNode { 1578 r#type: 0, 1579 length: 20, 1580 reserved: 0, 1581 flags: 0xA, 1582 parent: cluster_offset as u32, 1583 acpi_processor_id: uid as u32, 1584 num_private_resources: 0, 1585 }; 1586 pptt.append(thread_hierarchy_node); 1587 uid += 1; 1588 } 1589 } 1590 cpus += (cores_per_package * threads_per_core) as usize; 1591 } 1592 } 1593 1594 pptt.update_checksum(); 1595 pptt 1596 } 1597 1598 #[cfg(feature = "guest_debug")] 1599 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1600 self.vcpus[usize::from(cpu_id)] 1601 .lock() 1602 .unwrap() 1603 .vcpu 1604 .get_regs() 1605 .map_err(Error::CpuDebug) 1606 } 1607 1608 #[cfg(feature = "guest_debug")] 1609 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1610 self.vcpus[usize::from(cpu_id)] 1611 .lock() 1612 .unwrap() 1613 .vcpu 1614 .set_regs(regs) 1615 .map_err(Error::CpuDebug) 1616 } 1617 1618 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1619 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1620 self.vcpus[usize::from(cpu_id)] 1621 .lock() 1622 .unwrap() 1623 .vcpu 1624 .get_sregs() 1625 .map_err(Error::CpuDebug) 1626 } 1627 1628 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1629 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1630 self.vcpus[usize::from(cpu_id)] 1631 .lock() 1632 .unwrap() 1633 .vcpu 1634 .set_sregs(sregs) 1635 .map_err(Error::CpuDebug) 1636 } 1637 1638 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1639 fn translate_gva( 1640 &self, 1641 _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1642 cpu_id: u8, 1643 gva: u64, 1644 ) -> Result<u64> { 1645 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1646 .lock() 1647 .unwrap() 1648 .vcpu 1649 .translate_gva(gva, /* flags: unused */ 0) 1650 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1651 Ok(gpa) 1652 } 1653 1654 /// 1655 /// On AArch64, `translate_gva` API is not provided by KVM. We implemented 1656 /// it in VMM by walking through translation tables. 
1657 /// 1658 /// Address translation is big topic, here we only focus the scenario that 1659 /// happens in VMM while debugging kernel. This `translate_gva` 1660 /// implementation is restricted to: 1661 /// - Exception Level 1 1662 /// - Translate high address range only (kernel space) 1663 /// 1664 /// This implementation supports following Arm-v8a features related to 1665 /// address translation: 1666 /// - FEAT_LPA 1667 /// - FEAT_LVA 1668 /// - FEAT_LPA2 1669 /// 1670 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 1671 fn translate_gva( 1672 &self, 1673 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1674 cpu_id: u8, 1675 gva: u64, 1676 ) -> Result<u64> { 1677 let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)] 1678 .lock() 1679 .unwrap() 1680 .vcpu 1681 .get_sys_reg(regs::TCR_EL1) 1682 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1683 let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)] 1684 .lock() 1685 .unwrap() 1686 .vcpu 1687 .get_sys_reg(regs::TTBR1_EL1) 1688 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1689 let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)] 1690 .lock() 1691 .unwrap() 1692 .vcpu 1693 .get_sys_reg(regs::ID_AA64MMFR0_EL1) 1694 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1695 1696 // Bit 55 of the VA determines the range, high (0xFFFxxx...) 1697 // or low (0x000xxx...). 1698 let high_range = extract_bits_64!(gva, 55, 1); 1699 if high_range == 0 { 1700 info!("VA (0x{:x}) range is not supported!", gva); 1701 return Ok(gva); 1702 } 1703 1704 // High range size offset 1705 let tsz = extract_bits_64!(tcr_el1, 16, 6); 1706 // Granule size 1707 let tg = extract_bits_64!(tcr_el1, 30, 2); 1708 // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2 1709 let ds = extract_bits_64!(tcr_el1, 59, 1); 1710 1711 if tsz == 0 { 1712 info!("VA translation is not ready!"); 1713 return Ok(gva); 1714 } 1715 1716 // VA size is determined by TCR_BL1.T1SZ 1717 let va_size = 64 - tsz; 1718 // Number of bits in VA consumed in each level of translation 1719 let stride = match tg { 1720 3 => 13, // 64KB granule size 1721 1 => 11, // 16KB granule size 1722 _ => 9, // 4KB, default 1723 }; 1724 // Starting level of walking 1725 let mut level = 4 - (va_size - 4) / stride; 1726 1727 // PA or IPA size is determined 1728 let tcr_ips = extract_bits_64!(tcr_el1, 32, 3); 1729 let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4); 1730 // The IPA size in TCR_BL1 and PA Range in ID_AA64MMFR0_EL1 should match. 1731 // To be safe, we use the minimum value if they are different. 1732 let pa_range = std::cmp::min(tcr_ips, pa_range); 1733 // PA size in bits 1734 let pa_size = match pa_range { 1735 0 => 32, 1736 1 => 36, 1737 2 => 40, 1738 3 => 42, 1739 4 => 44, 1740 5 => 48, 1741 6 => 52, 1742 _ => { 1743 return Err(Error::TranslateVirtualAddress(anyhow!(format!( 1744 "PA range not supported {pa_range}" 1745 )))) 1746 } 1747 }; 1748 1749 let indexmask_grainsize = (!0u64) >> (64 - (stride + 3)); 1750 let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level)))); 1751 // If FEAT_LPA2 is present, the translation table descriptor holds 1752 // 50 bits of the table address of next level. 1753 // Otherwise, it is 48 bits. 
1754 let descaddrmask = if ds == 1 { 1755 !0u64 >> (64 - 50) // mask with 50 least significant bits 1756 } else { 1757 !0u64 >> (64 - 48) // mask with 48 least significant bits 1758 }; 1759 let descaddrmask = descaddrmask & !indexmask_grainsize; 1760 1761 // Translation table base address 1762 let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48); 1763 // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table 1764 // address bits [48:51] comes from TTBR1_EL1 bits [2:5]. 1765 if pa_size == 52 { 1766 descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48; 1767 } 1768 1769 // Loop through tables of each level 1770 loop { 1771 // Table offset for current level 1772 let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask; 1773 descaddr |= table_offset; 1774 descaddr &= !7u64; 1775 1776 let mut buf = [0; 8]; 1777 guest_memory 1778 .memory() 1779 .read(&mut buf, GuestAddress(descaddr)) 1780 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1781 let descriptor = u64::from_le_bytes(buf); 1782 1783 descaddr = descriptor & descaddrmask; 1784 // In the case of FEAT_LPA, the next-level translation table address 1785 // bits [48:51] comes from bits [12:15] of the current descriptor. 1786 // For FEAT_LPA2, the next-level translation table address 1787 // bits [50:51] comes from bits [8:9] of the current descriptor, 1788 // bits [48:49] comes from bits [48:49] of the descriptor which was 1789 // handled previously. 1790 if pa_size == 52 { 1791 if ds == 1 { 1792 // FEAT_LPA2 1793 descaddr |= extract_bits_64!(descriptor, 8, 2) << 50; 1794 } else { 1795 // FEAT_LPA 1796 descaddr |= extract_bits_64!(descriptor, 12, 4) << 48; 1797 } 1798 } 1799 1800 if (descriptor & 2) != 0 && (level < 3) { 1801 // This is a table entry. Go down to next level. 
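                // (Descriptor bits [1:0]: 0b01 is a block mapping at level 1 or 2,
                // while 0b11 is a table descriptor at levels 0-2 and a page
                // descriptor at level 3, which is why bit 1 only means "descend"
                // while level < 3.)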
1802 level += 1; 1803 indexmask = indexmask_grainsize; 1804 continue; 1805 } 1806 1807 break; 1808 } 1809 1810 // We have reached either: 1811 // - a page entry at level 3 or 1812 // - a block entry at level 1 or 2 1813 let page_size = 1u64 << ((stride * (4 - level)) + 3); 1814 descaddr &= !(page_size - 1); 1815 descaddr |= gva & (page_size - 1); 1816 1817 Ok(descaddr) 1818 } 1819 1820 pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) { 1821 self.acpi_address = Some(acpi_address); 1822 } 1823 1824 pub(crate) fn set_interrupt_controller( 1825 &mut self, 1826 interrupt_controller: Arc<Mutex<dyn InterruptController>>, 1827 ) { 1828 self.interrupt_controller = Some(interrupt_controller); 1829 } 1830 1831 pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> { 1832 &self.vcpus_kill_signalled 1833 } 1834 1835 #[cfg(feature = "igvm")] 1836 pub(crate) fn get_cpuid_leaf( 1837 &self, 1838 cpu_id: u8, 1839 eax: u32, 1840 ecx: u32, 1841 xfem: u64, 1842 xss: u64, 1843 ) -> Result<[u32; 4]> { 1844 let leaf_info = self.vcpus[usize::from(cpu_id)] 1845 .lock() 1846 .unwrap() 1847 .vcpu 1848 .get_cpuid_values(eax, ecx, xfem, xss) 1849 .unwrap(); 1850 Ok(leaf_info) 1851 } 1852 1853 #[cfg(feature = "sev_snp")] 1854 pub(crate) fn sev_snp_enabled(&self) -> bool { 1855 self.sev_snp_enabled 1856 } 1857 1858 pub(crate) fn nmi(&self) -> Result<()> { 1859 self.vcpus_kick_signalled.store(true, Ordering::SeqCst); 1860 1861 for state in self.vcpu_states.iter() { 1862 state.signal_thread(); 1863 } 1864 1865 self.vcpus_kick_signalled.store(false, Ordering::SeqCst); 1866 1867 Ok(()) 1868 } 1869 } 1870 1871 struct Cpu { 1872 cpu_id: u8, 1873 proximity_domain: u32, 1874 dynamic: bool, 1875 #[cfg(target_arch = "x86_64")] 1876 topology: Option<(u8, u8, u8)>, 1877 } 1878 1879 #[cfg(target_arch = "x86_64")] 1880 const MADT_CPU_ENABLE_FLAG: usize = 0; 1881 1882 #[cfg(target_arch = "x86_64")] 1883 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; 1884 1885 impl Cpu { 1886 #[cfg(target_arch = "x86_64")] 1887 fn generate_mat(&self) -> Vec<u8> { 1888 let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology); 1889 1890 let lapic = LocalX2Apic { 1891 r#type: crate::acpi::ACPI_X2APIC_PROCESSOR, 1892 length: 16, 1893 processor_id: self.cpu_id.into(), 1894 apic_id: x2apic_id, 1895 flags: 1 << MADT_CPU_ENABLE_FLAG, 1896 _reserved: 0, 1897 }; 1898 1899 let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)]; 1900 // SAFETY: mat_data is large enough to hold lapic 1901 unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic }; 1902 1903 mat_data 1904 } 1905 } 1906 1907 impl Aml for Cpu { 1908 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1909 #[cfg(target_arch = "x86_64")] 1910 let mat_data: Vec<u8> = self.generate_mat(); 1911 #[allow(clippy::if_same_then_else)] 1912 if self.dynamic { 1913 aml::Device::new( 1914 format!("C{:03X}", self.cpu_id).as_str().into(), 1915 vec![ 1916 &aml::Name::new("_HID".into(), &"ACPI0007"), 1917 &aml::Name::new("_UID".into(), &self.cpu_id), 1918 // Currently, AArch64 cannot support following fields. 1919 /* 1920 _STA return value: 1921 Bit [0] – Set if the device is present. 1922 Bit [1] – Set if the device is enabled and decoding its resources. 1923 Bit [2] – Set if the device should be shown in the UI. 1924 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1925 Bit [4] – Set if the battery is present. 1926 Bits [31:5] – Reserved (must be cleared). 
                    */
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_STA".into(),
                        0,
                        false,
                        // Call into CSTA method which will interrogate device
                        vec![&aml::Return::new(&aml::MethodCall::new(
                            "CSTA".into(),
                            vec![&self.cpu_id],
                        ))],
                    ),
                    &aml::Method::new(
                        "_PXM".into(),
                        0,
                        false,
                        vec![&aml::Return::new(&self.proximity_domain)],
                    ),
                    // The Linux kernel expects every CPU device to have a _MAT entry
                    // containing the LAPIC for this processor with the enabled bit set
                    // even if it is disabled in the MADT (non-boot CPU)
                    #[cfg(target_arch = "x86_64")]
                    &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
                    // Trigger CPU ejection
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_EJ0".into(),
                        1,
                        false,
                        // Call into CEJ0 method which will actually eject device
                        vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
                    ),
                ],
            )
            .to_aml_bytes(sink);
        } else {
            aml::Device::new(
                format!("C{:03X}", self.cpu_id).as_str().into(),
                vec![
                    &aml::Name::new("_HID".into(), &"ACPI0007"),
                    &aml::Name::new("_UID".into(), &self.cpu_id),
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_STA".into(),
                        0,
                        false,
                        // Mark the CPU as present; see the CSTA implementation
                        vec![&aml::Return::new(&0xfu8)],
                    ),
                    &aml::Method::new(
                        "_PXM".into(),
                        0,
                        false,
                        vec![&aml::Return::new(&self.proximity_domain)],
                    ),
                    // The Linux kernel expects every CPU device to have a _MAT entry
                    // containing the LAPIC for this processor with the enabled bit set
                    // even if it is disabled in the MADT (non-boot CPU)
                    #[cfg(target_arch = "x86_64")]
                    &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
                ],
            )
            .to_aml_bytes(sink);
        }
    }
}

struct CpuNotify {
    cpu_id: u8,
}

impl Aml for CpuNotify {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .to_aml_bytes(sink)
    }
}

struct CpuMethods {
    max_vcpus: u8,
    dynamic: bool,
}

impl Aml for CpuMethods {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        if self.dynamic {
            // CPU status method
            aml::Method::new(
                "CSTA".into(),
                1,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                    &aml::If::new(
                        &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
                        vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                    // Return 0 or 0xf
                    &aml::Return::new(&aml::Local(0)),
                ],
            )
            .to_aml_bytes(sink);

            let mut cpu_notifies = Vec::new();
            for cpu_id in 0..self.max_vcpus {
                cpu_notifies.push(CpuNotify { cpu_id });
            }

            let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
            for cpu_id in
0..self.max_vcpus { 2048 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 2049 } 2050 2051 aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink); 2052 2053 aml::Method::new( 2054 "CEJ0".into(), 2055 1, 2056 true, 2057 vec![ 2058 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2059 // Write CPU number (in first argument) to I/O port via field 2060 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 2061 // Set CEJ0 bit 2062 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 2063 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2064 ], 2065 ) 2066 .to_aml_bytes(sink); 2067 2068 aml::Method::new( 2069 "CSCN".into(), 2070 0, 2071 true, 2072 vec![ 2073 // Take lock defined above 2074 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2075 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2076 &aml::While::new( 2077 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 2078 vec![ 2079 // Write CPU number (in first argument) to I/O port via field 2080 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 2081 // Check if CINS bit is set 2082 &aml::If::new( 2083 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 2084 // Notify device if it is 2085 vec![ 2086 &aml::MethodCall::new( 2087 "CTFY".into(), 2088 vec![&aml::Local(0), &aml::ONE], 2089 ), 2090 // Reset CINS bit 2091 &aml::Store::new( 2092 &aml::Path::new("\\_SB_.PRES.CINS"), 2093 &aml::ONE, 2094 ), 2095 ], 2096 ), 2097 // Check if CRMV bit is set 2098 &aml::If::new( 2099 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 2100 // Notify device if it is (with the eject constant 0x3) 2101 vec![ 2102 &aml::MethodCall::new( 2103 "CTFY".into(), 2104 vec![&aml::Local(0), &3u8], 2105 ), 2106 // Reset CRMV bit 2107 &aml::Store::new( 2108 &aml::Path::new("\\_SB_.PRES.CRMV"), 2109 &aml::ONE, 2110 ), 2111 ], 2112 ), 2113 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 2114 ], 2115 ), 2116 // Release lock 2117 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2118 ], 2119 ) 2120 .to_aml_bytes(sink) 2121 } else { 2122 aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink) 2123 } 2124 } 2125 } 2126 2127 impl Aml for CpuManager { 2128 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2129 #[cfg(target_arch = "x86_64")] 2130 if let Some(acpi_address) = self.acpi_address { 2131 // CPU hotplug controller 2132 aml::Device::new( 2133 "_SB_.PRES".into(), 2134 vec![ 2135 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 2136 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 2137 // Mutex to protect concurrent access as we write to choose CPU and then read back status 2138 &aml::Mutex::new("CPLK".into(), 0), 2139 &aml::Name::new( 2140 "_CRS".into(), 2141 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2142 aml::AddressSpaceCacheable::NotCacheable, 2143 true, 2144 acpi_address.0, 2145 acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 2146 None, 2147 )]), 2148 ), 2149 // OpRegion and Fields map MMIO range into individual field values 2150 &aml::OpRegion::new( 2151 "PRST".into(), 2152 aml::OpRegionSpace::SystemMemory, 2153 &(acpi_address.0 as usize), 2154 &CPU_MANAGER_ACPI_SIZE, 2155 ), 2156 &aml::Field::new( 2157 "PRST".into(), 2158 aml::FieldAccessType::Byte, 2159 aml::FieldLockRule::NoLock, 2160 aml::FieldUpdateRule::WriteAsZeroes, 2161 vec![ 2162 aml::FieldEntry::Reserved(32), 2163 aml::FieldEntry::Named(*b"CPEN", 1), 2164 aml::FieldEntry::Named(*b"CINS", 1), 2165 
aml::FieldEntry::Named(*b"CRMV", 1), 2166 aml::FieldEntry::Named(*b"CEJ0", 1), 2167 aml::FieldEntry::Reserved(4), 2168 aml::FieldEntry::Named(*b"CCMD", 8), 2169 ], 2170 ), 2171 &aml::Field::new( 2172 "PRST".into(), 2173 aml::FieldAccessType::DWord, 2174 aml::FieldLockRule::NoLock, 2175 aml::FieldUpdateRule::Preserve, 2176 vec![ 2177 aml::FieldEntry::Named(*b"CSEL", 32), 2178 aml::FieldEntry::Reserved(32), 2179 aml::FieldEntry::Named(*b"CDAT", 32), 2180 ], 2181 ), 2182 ], 2183 ) 2184 .to_aml_bytes(sink); 2185 } 2186 2187 // CPU devices 2188 let hid = aml::Name::new("_HID".into(), &"ACPI0010"); 2189 let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05")); 2190 // Bundle methods together under a common object 2191 let methods = CpuMethods { 2192 max_vcpus: self.config.max_vcpus, 2193 dynamic: self.dynamic, 2194 }; 2195 let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods]; 2196 2197 #[cfg(target_arch = "x86_64")] 2198 let topology = self.get_vcpu_topology(); 2199 let mut cpu_devices = Vec::new(); 2200 for cpu_id in 0..self.config.max_vcpus { 2201 let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0); 2202 let cpu_device = Cpu { 2203 cpu_id, 2204 proximity_domain, 2205 dynamic: self.dynamic, 2206 #[cfg(target_arch = "x86_64")] 2207 topology, 2208 }; 2209 2210 cpu_devices.push(cpu_device); 2211 } 2212 2213 for cpu_device in cpu_devices.iter() { 2214 cpu_data_inner.push(cpu_device); 2215 } 2216 2217 aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink) 2218 } 2219 } 2220 2221 impl Pausable for CpuManager { 2222 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2223 // Tell the vCPUs to pause themselves next time they exit 2224 self.vcpus_pause_signalled.store(true, Ordering::SeqCst); 2225 2226 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 2227 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 2228 // above. 2229 for state in self.vcpu_states.iter() { 2230 state.signal_thread(); 2231 } 2232 2233 for vcpu in self.vcpus.iter() { 2234 let mut vcpu = vcpu.lock().unwrap(); 2235 vcpu.pause()?; 2236 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2237 if !self.config.kvm_hyperv { 2238 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { 2239 MigratableError::Pause(anyhow!( 2240 "Could not notify guest it has been paused {:?}", 2241 e 2242 )) 2243 })?; 2244 } 2245 } 2246 2247 // The vCPU thread will change its paused state before parking, wait here for each 2248 // activated vCPU change their state to ensure they have parked. 2249 for state in self.vcpu_states.iter() { 2250 if state.active() { 2251 while !state.paused.load(Ordering::SeqCst) { 2252 // To avoid a priority inversion with the vCPU thread 2253 thread::sleep(std::time::Duration::from_millis(1)); 2254 } 2255 } 2256 } 2257 2258 Ok(()) 2259 } 2260 2261 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2262 for vcpu in self.vcpus.iter() { 2263 vcpu.lock().unwrap().resume()?; 2264 } 2265 2266 // Toggle the vCPUs pause boolean 2267 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 2268 2269 // Unpark all the VCPU threads. 2270 // Once unparked, the next thing they will do is checking for the pause 2271 // boolean. Since it'll be set to false, they will exit their pause loop 2272 // and go back to vmx root. 
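        // Note: each vCPU's `paused` flag is cleared before the thread is unparked below,
        // so the state is already consistent by the time the woken thread re-checks the
        // pause booleans.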
2273 for state in self.vcpu_states.iter() { 2274 state.paused.store(false, Ordering::SeqCst); 2275 state.unpark_thread(); 2276 } 2277 Ok(()) 2278 } 2279 } 2280 2281 impl Snapshottable for CpuManager { 2282 fn id(&self) -> String { 2283 CPU_MANAGER_SNAPSHOT_ID.to_string() 2284 } 2285 2286 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2287 let mut cpu_manager_snapshot = Snapshot::default(); 2288 2289 // The CpuManager snapshot is a collection of all vCPUs snapshots. 2290 for vcpu in &self.vcpus { 2291 let mut vcpu = vcpu.lock().unwrap(); 2292 cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?); 2293 } 2294 2295 Ok(cpu_manager_snapshot) 2296 } 2297 } 2298 2299 impl Transportable for CpuManager {} 2300 impl Migratable for CpuManager {} 2301 2302 #[cfg(feature = "guest_debug")] 2303 impl Debuggable for CpuManager { 2304 #[cfg(feature = "kvm")] 2305 fn set_guest_debug( 2306 &self, 2307 cpu_id: usize, 2308 addrs: &[GuestAddress], 2309 singlestep: bool, 2310 ) -> std::result::Result<(), DebuggableError> { 2311 self.vcpus[cpu_id] 2312 .lock() 2313 .unwrap() 2314 .vcpu 2315 .set_guest_debug(addrs, singlestep) 2316 .map_err(DebuggableError::SetDebug) 2317 } 2318 2319 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2320 Ok(()) 2321 } 2322 2323 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2324 Ok(()) 2325 } 2326 2327 #[cfg(target_arch = "x86_64")] 2328 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2329 // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15 2330 let gregs = self 2331 .get_regs(cpu_id as u8) 2332 .map_err(DebuggableError::ReadRegs)?; 2333 let regs = [ 2334 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp, 2335 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15, 2336 ]; 2337 2338 // GDB exposes 32-bit eflags instead of 64-bit rflags. 
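        // The upper 32 bits of rflags are therefore dropped when reporting registers to
        // GDB; write_regs() below keeps the guest's upper bits and only replaces the lower
        // 32 bits with the eflags value supplied by GDB.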
        // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
        let eflags = gregs.rflags as u32;
        let rip = gregs.rip;

        // Segment registers: CS, SS, DS, ES, FS, GS
        let sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let segments = X86SegmentRegs {
            cs: sregs.cs.selector as u32,
            ss: sregs.ss.selector as u32,
            ds: sregs.ds.selector as u32,
            es: sregs.es.selector as u32,
            fs: sregs.fs.selector as u32,
            gs: sregs.gs.selector as u32,
        };

        // TODO: Add other registers

        Ok(CoreRegs {
            regs,
            eflags,
            rip,
            segments,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "aarch64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        Ok(CoreRegs {
            x: gregs.regs.regs,
            sp: gregs.regs.sp,
            pc: gregs.regs.pc,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let orig_gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let gregs = StandardRegisters {
            rax: regs.regs[0],
            rbx: regs.regs[1],
            rcx: regs.regs[2],
            rdx: regs.regs[3],
            rsi: regs.regs[4],
            rdi: regs.regs[5],
            rbp: regs.regs[6],
            rsp: regs.regs[7],
            r8: regs.regs[8],
            r9: regs.regs[9],
            r10: regs.regs[10],
            r11: regs.regs[11],
            r12: regs.regs[12],
            r13: regs.regs[13],
            r14: regs.regs[14],
            r15: regs.regs[15],
            rip: regs.rip,
            // Update the lower 32 bits of rflags.
            rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
        };

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // Since GDB cares only about the selectors, we call get_sregs() first.
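        // Note: only the 16-bit selector of each segment register is updated below; the
        // base, limit and attribute fields read back via get_sregs() are preserved, since
        // the GDB core register set does not carry them.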
2416 let mut sregs = self 2417 .get_sregs(cpu_id as u8) 2418 .map_err(DebuggableError::ReadRegs)?; 2419 sregs.cs.selector = regs.segments.cs as u16; 2420 sregs.ss.selector = regs.segments.ss as u16; 2421 sregs.ds.selector = regs.segments.ds as u16; 2422 sregs.es.selector = regs.segments.es as u16; 2423 sregs.fs.selector = regs.segments.fs as u16; 2424 sregs.gs.selector = regs.segments.gs as u16; 2425 2426 self.set_sregs(cpu_id as u8, &sregs) 2427 .map_err(DebuggableError::WriteRegs)?; 2428 2429 // TODO: Add other registers 2430 2431 Ok(()) 2432 } 2433 2434 #[cfg(target_arch = "aarch64")] 2435 fn write_regs( 2436 &self, 2437 cpu_id: usize, 2438 regs: &CoreRegs, 2439 ) -> std::result::Result<(), DebuggableError> { 2440 let mut gregs = self 2441 .get_regs(cpu_id as u8) 2442 .map_err(DebuggableError::ReadRegs)?; 2443 2444 gregs.regs.regs = regs.x; 2445 gregs.regs.sp = regs.sp; 2446 gregs.regs.pc = regs.pc; 2447 2448 self.set_regs(cpu_id as u8, &gregs) 2449 .map_err(DebuggableError::WriteRegs)?; 2450 2451 Ok(()) 2452 } 2453 2454 fn read_mem( 2455 &self, 2456 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2457 cpu_id: usize, 2458 vaddr: GuestAddress, 2459 len: usize, 2460 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2461 let mut buf = vec![0; len]; 2462 let mut total_read = 0_u64; 2463 2464 while total_read < len as u64 { 2465 let gaddr = vaddr.0 + total_read; 2466 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2467 Ok(paddr) => paddr, 2468 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 2469 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2470 }; 2471 let psize = arch::PAGE_SIZE as u64; 2472 let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1))); 2473 guest_memory 2474 .memory() 2475 .read( 2476 &mut buf[total_read as usize..total_read as usize + read_len as usize], 2477 GuestAddress(paddr), 2478 ) 2479 .map_err(DebuggableError::ReadMem)?; 2480 total_read += read_len; 2481 } 2482 Ok(buf) 2483 } 2484 2485 fn write_mem( 2486 &self, 2487 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2488 cpu_id: usize, 2489 vaddr: &GuestAddress, 2490 data: &[u8], 2491 ) -> std::result::Result<(), DebuggableError> { 2492 let mut total_written = 0_u64; 2493 2494 while total_written < data.len() as u64 { 2495 let gaddr = vaddr.0 + total_written; 2496 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2497 Ok(paddr) => paddr, 2498 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 
2499 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2500 }; 2501 let psize = arch::PAGE_SIZE as u64; 2502 let write_len = std::cmp::min( 2503 data.len() as u64 - total_written, 2504 psize - (paddr & (psize - 1)), 2505 ); 2506 guest_memory 2507 .memory() 2508 .write( 2509 &data[total_written as usize..total_written as usize + write_len as usize], 2510 GuestAddress(paddr), 2511 ) 2512 .map_err(DebuggableError::WriteMem)?; 2513 total_written += write_len; 2514 } 2515 Ok(()) 2516 } 2517 2518 fn active_vcpus(&self) -> usize { 2519 self.present_vcpus() as usize 2520 } 2521 } 2522 2523 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2524 impl Elf64Writable for CpuManager {} 2525 2526 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2527 impl CpuElf64Writable for CpuManager { 2528 fn cpu_write_elf64_note( 2529 &mut self, 2530 dump_state: &DumpState, 2531 ) -> std::result::Result<(), GuestDebuggableError> { 2532 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2533 for vcpu in &self.vcpus { 2534 let note_size = self.get_note_size(NoteDescType::Elf, 1); 2535 let mut pos: usize = 0; 2536 let mut buf = vec![0; note_size as usize]; 2537 let descsz = size_of::<X86_64ElfPrStatus>(); 2538 let vcpu_id = vcpu.lock().unwrap().id; 2539 2540 let note = Elf64_Nhdr { 2541 n_namesz: COREDUMP_NAME_SIZE, 2542 n_descsz: descsz as u32, 2543 n_type: NT_PRSTATUS, 2544 }; 2545 2546 let bytes: &[u8] = note.as_slice(); 2547 buf.splice(0.., bytes.to_vec()); 2548 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2549 buf.resize(pos + 4, 0); 2550 buf.splice(pos.., "CORE".to_string().into_bytes()); 2551 2552 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2553 buf.resize(pos + 32 + 4, 0); 2554 let pid = vcpu_id as u64; 2555 let bytes: &[u8] = pid.as_slice(); 2556 buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */ 2557 2558 pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>(); 2559 2560 let orig_rax: u64 = 0; 2561 let gregs = self.vcpus[usize::from(vcpu_id)] 2562 .lock() 2563 .unwrap() 2564 .vcpu 2565 .get_regs() 2566 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2567 2568 let regs1 = [ 2569 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11, 2570 gregs.r10, 2571 ]; 2572 let regs2 = [ 2573 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax, 2574 ]; 2575 2576 let sregs = self.vcpus[usize::from(vcpu_id)] 2577 .lock() 2578 .unwrap() 2579 .vcpu 2580 .get_sregs() 2581 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2582 2583 debug!( 2584 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}", 2585 gregs.rip, 2586 gregs.rsp, 2587 sregs.gs.base, 2588 sregs.cs.selector, 2589 sregs.ss.selector, 2590 sregs.ds.selector, 2591 ); 2592 2593 let regs = X86_64UserRegs { 2594 regs1, 2595 regs2, 2596 rip: gregs.rip, 2597 cs: sregs.cs.selector as u64, 2598 eflags: gregs.rflags, 2599 rsp: gregs.rsp, 2600 ss: sregs.ss.selector as u64, 2601 fs_base: sregs.fs.base, 2602 gs_base: sregs.gs.base, 2603 ds: sregs.ds.selector as u64, 2604 es: sregs.es.selector as u64, 2605 fs: sregs.fs.selector as u64, 2606 gs: sregs.gs.selector as u64, 2607 }; 2608 2609 // let bytes: &[u8] = unsafe { any_as_u8_slice(®s) }; 2610 let bytes: &[u8] = regs.as_slice(); 2611 buf.resize(note_size as usize, 0); 2612 buf.splice(pos.., bytes.to_vec()); 2613 buf.resize(note_size as usize, 0); 2614 2615 coredump_file 2616 .write(&buf) 2617 .map_err(GuestDebuggableError::CoredumpFile)?; 2618 } 2619 2620 
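        // Each per-vCPU buffer written above forms one ELF note: an Elf64_Nhdr, the
        // 4-byte-padded "CORE" name, then an NT_PRSTATUS payload whose pr_pid field
        // carries the vCPU id followed by the general-purpose register state.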
Ok(()) 2621 } 2622 2623 fn cpu_write_vmm_note( 2624 &mut self, 2625 dump_state: &DumpState, 2626 ) -> std::result::Result<(), GuestDebuggableError> { 2627 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2628 for vcpu in &self.vcpus { 2629 let note_size = self.get_note_size(NoteDescType::Vmm, 1); 2630 let mut pos: usize = 0; 2631 let mut buf = vec![0; note_size as usize]; 2632 let descsz = size_of::<DumpCpusState>(); 2633 let vcpu_id = vcpu.lock().unwrap().id; 2634 2635 let note = Elf64_Nhdr { 2636 n_namesz: COREDUMP_NAME_SIZE, 2637 n_descsz: descsz as u32, 2638 n_type: 0, 2639 }; 2640 2641 let bytes: &[u8] = note.as_slice(); 2642 buf.splice(0.., bytes.to_vec()); 2643 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2644 2645 buf.resize(pos + 4, 0); 2646 buf.splice(pos.., "QEMU".to_string().into_bytes()); 2647 2648 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2649 2650 let gregs = self.vcpus[usize::from(vcpu_id)] 2651 .lock() 2652 .unwrap() 2653 .vcpu 2654 .get_regs() 2655 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2656 2657 let regs1 = [ 2658 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp, 2659 gregs.rbp, 2660 ]; 2661 2662 let regs2 = [ 2663 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, 2664 gregs.r15, 2665 ]; 2666 2667 let sregs = self.vcpus[usize::from(vcpu_id)] 2668 .lock() 2669 .unwrap() 2670 .vcpu 2671 .get_sregs() 2672 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2673 2674 let mut msrs = vec![MsrEntry { 2675 index: msr_index::MSR_KERNEL_GS_BASE, 2676 ..Default::default() 2677 }]; 2678 2679 self.vcpus[vcpu_id as usize] 2680 .lock() 2681 .unwrap() 2682 .vcpu 2683 .get_msrs(&mut msrs) 2684 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?; 2685 let kernel_gs_base = msrs[0].data; 2686 2687 let cs = CpuSegment::new(sregs.cs); 2688 let ds = CpuSegment::new(sregs.ds); 2689 let es = CpuSegment::new(sregs.es); 2690 let fs = CpuSegment::new(sregs.fs); 2691 let gs = CpuSegment::new(sregs.gs); 2692 let ss = CpuSegment::new(sregs.ss); 2693 let ldt = CpuSegment::new(sregs.ldt); 2694 let tr = CpuSegment::new(sregs.tr); 2695 let gdt = CpuSegment::new_from_table(sregs.gdt); 2696 let idt = CpuSegment::new_from_table(sregs.idt); 2697 let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4]; 2698 let regs = DumpCpusState { 2699 version: 1, 2700 size: size_of::<DumpCpusState>() as u32, 2701 regs1, 2702 regs2, 2703 rip: gregs.rip, 2704 rflags: gregs.rflags, 2705 cs, 2706 ds, 2707 es, 2708 fs, 2709 gs, 2710 ss, 2711 ldt, 2712 tr, 2713 gdt, 2714 idt, 2715 cr, 2716 kernel_gs_base, 2717 }; 2718 2719 let bytes: &[u8] = regs.as_slice(); 2720 buf.resize(note_size as usize, 0); 2721 buf.splice(pos.., bytes.to_vec()); 2722 buf.resize(note_size as usize, 0); 2723 2724 coredump_file 2725 .write(&buf) 2726 .map_err(GuestDebuggableError::CoredumpFile)?; 2727 } 2728 2729 Ok(()) 2730 } 2731 } 2732 2733 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2734 #[cfg(test)] 2735 mod tests { 2736 use arch::layout::BOOT_STACK_POINTER; 2737 use arch::layout::ZERO_PAGE_START; 2738 use arch::x86_64::interrupts::*; 2739 use arch::x86_64::regs::*; 2740 use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters}; 2741 use linux_loader::loader::bootparam::setup_header; 2742 2743 #[test] 2744 fn test_setlint() { 2745 let hv = hypervisor::new().unwrap(); 2746 let vm = hv.create_vm().expect("new VM fd creation failed"); 2747 
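        // This test checks that set_lint() programs the local APIC LVT0 entry for ExtINT
        // delivery and LVT1 for NMI delivery, by comparing the LAPIC state read back after
        // the call with the expected register values computed below.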
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail if there is no irqchip beforehand.
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the values that are expected to represent LVT0 and LVT1.
        let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
        let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the values that represent LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
        let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should
        // remove it altogether.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }];

        // get_msrs returns the number of MSRs that it succeeded in reading. We only want
        // to read 1 in this test case scenario.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we did setup_msrs. We need to assert that
        // the tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the
        // data we expect.
2815 let entry_vec = vcpu.boot_msr_entries(); 2816 assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]); 2817 } 2818 2819 #[test] 2820 fn test_setup_regs_for_pvh() { 2821 let hv = hypervisor::new().unwrap(); 2822 let vm = hv.create_vm().expect("new VM fd creation failed"); 2823 let vcpu = vm.create_vcpu(0, None).unwrap(); 2824 2825 let expected_regs: StandardRegisters = StandardRegisters { 2826 rflags: 0x0000000000000002u64, 2827 rbx: arch::layout::PVH_INFO_START.0, 2828 rip: 1, 2829 ..Default::default() 2830 }; 2831 2832 setup_regs( 2833 &vcpu, 2834 arch::EntryPoint { 2835 entry_addr: vm_memory::GuestAddress(expected_regs.rip), 2836 setup_header: None, 2837 }, 2838 ) 2839 .unwrap(); 2840 2841 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2842 assert_eq!(actual_regs, expected_regs); 2843 } 2844 2845 #[test] 2846 fn test_setup_regs_for_bzimage() { 2847 let hv = hypervisor::new().unwrap(); 2848 let vm = hv.create_vm().expect("new VM fd creation failed"); 2849 let vcpu = vm.create_vcpu(0, None).unwrap(); 2850 2851 let expected_regs: StandardRegisters = StandardRegisters { 2852 rflags: 0x0000000000000002u64, 2853 rip: 1, 2854 rsp: BOOT_STACK_POINTER.0, 2855 rsi: ZERO_PAGE_START.0, 2856 ..Default::default() 2857 }; 2858 2859 setup_regs( 2860 &vcpu, 2861 arch::EntryPoint { 2862 entry_addr: vm_memory::GuestAddress(expected_regs.rip), 2863 setup_header: Some(setup_header { 2864 ..Default::default() 2865 }), 2866 }, 2867 ) 2868 .unwrap(); 2869 2870 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2871 assert_eq!(actual_regs, expected_regs); 2872 } 2873 } 2874 2875 #[cfg(target_arch = "aarch64")] 2876 #[cfg(test)] 2877 mod tests { 2878 use arch::{aarch64::regs, layout}; 2879 use hypervisor::kvm::aarch64::is_system_register; 2880 use hypervisor::kvm::kvm_bindings::{ 2881 kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, 2882 KVM_REG_ARM_CORE, KVM_REG_SIZE_U64, 2883 }; 2884 use hypervisor::{arm64_core_reg_id, offset_of}; 2885 use std::mem; 2886 2887 #[test] 2888 fn test_setup_regs() { 2889 let hv = hypervisor::new().unwrap(); 2890 let vm = hv.create_vm().unwrap(); 2891 let vcpu = vm.create_vcpu(0, None).unwrap(); 2892 2893 let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0); 2894 // Must fail when vcpu is not initialized yet. 2895 assert!(res.is_err()); 2896 2897 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2898 vm.get_preferred_target(&mut kvi).unwrap(); 2899 vcpu.vcpu_init(&kvi).unwrap(); 2900 2901 assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok()); 2902 } 2903 2904 #[test] 2905 fn test_read_mpidr() { 2906 let hv = hypervisor::new().unwrap(); 2907 let vm = hv.create_vm().unwrap(); 2908 let vcpu = vm.create_vcpu(0, None).unwrap(); 2909 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2910 vm.get_preferred_target(&mut kvi).unwrap(); 2911 2912 // Must fail when vcpu is not initialized yet. 
2913 assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err()); 2914 2915 vcpu.vcpu_init(&kvi).unwrap(); 2916 assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000); 2917 } 2918 2919 #[test] 2920 fn test_is_system_register() { 2921 let offset = offset_of!(user_pt_regs, pc); 2922 let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset); 2923 assert!(!is_system_register(regid)); 2924 let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64; 2925 assert!(is_system_register(regid)); 2926 } 2927 2928 #[test] 2929 fn test_save_restore_core_regs() { 2930 let hv = hypervisor::new().unwrap(); 2931 let vm = hv.create_vm().unwrap(); 2932 let vcpu = vm.create_vcpu(0, None).unwrap(); 2933 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2934 vm.get_preferred_target(&mut kvi).unwrap(); 2935 2936 // Must fail when vcpu is not initialized yet. 2937 let res = vcpu.get_regs(); 2938 assert!(res.is_err()); 2939 assert_eq!( 2940 format!("{}", res.unwrap_err()), 2941 "Failed to get core register: Exec format error (os error 8)" 2942 ); 2943 2944 let mut state = kvm_regs::default(); 2945 let res = vcpu.set_regs(&state); 2946 assert!(res.is_err()); 2947 assert_eq!( 2948 format!("{}", res.unwrap_err()), 2949 "Failed to set core register: Exec format error (os error 8)" 2950 ); 2951 2952 vcpu.vcpu_init(&kvi).unwrap(); 2953 let res = vcpu.get_regs(); 2954 assert!(res.is_ok()); 2955 state = res.unwrap(); 2956 assert_eq!(state.regs.pstate, 0x3C5); 2957 2958 assert!(vcpu.set_regs(&state).is_ok()); 2959 } 2960 2961 #[test] 2962 fn test_get_set_mpstate() { 2963 let hv = hypervisor::new().unwrap(); 2964 let vm = hv.create_vm().unwrap(); 2965 let vcpu = vm.create_vcpu(0, None).unwrap(); 2966 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2967 vm.get_preferred_target(&mut kvi).unwrap(); 2968 2969 let res = vcpu.get_mp_state(); 2970 assert!(res.is_ok()); 2971 assert!(vcpu.set_mp_state(res.unwrap()).is_ok()); 2972 } 2973 } 2974