// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use crate::config::CpusConfig;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
    GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
    NT_PRSTATUS,
};
#[cfg(feature = "guest_debug")]
use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
#[cfg(target_arch = "x86_64")]
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::GuestMemoryMmap;
use crate::CPU_MANAGER_SNAPSHOT_ID;
use acpi_tables::{aml, sdt::Sdt, Aml};
use anyhow::anyhow;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use arch::aarch64::regs;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::get_x2apic_id;
use arch::EntryPoint;
use arch::NumaNodes;
#[cfg(target_arch = "aarch64")]
use devices::gic::Gic;
use devices::interrupt_controller::InterruptController;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use hypervisor::aarch64::StandardRegisters;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::msr_index;
#[cfg(target_arch = "x86_64")]
use hypervisor::arch::x86::CpuIdEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::MsrEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
#[cfg(target_arch = "aarch64")]
use hypervisor::kvm::kvm_bindings;
#[cfg(all(target_arch = "aarch64", feature = "kvm"))]
use hypervisor::kvm::kvm_ioctls::Cap;
#[cfg(feature = "tdx")]
use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuVendor;
use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps};
use libc::{c_void, siginfo_t};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use linux_loader::elf::Elf64_Nhdr;
use seccompiler::{apply_filter, SeccompAction};
use std::collections::BTreeMap;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::io::Write;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::mem::size_of;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};
use thiserror::Error;
use tracer::trace_scoped;
use vm_device::BusDevice;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use vm_memory::ByteValued;
#[cfg(feature = "guest_debug")]
use vm_memory::{Bytes, GuestAddressSpace};
use vm_memory::{GuestAddress, GuestMemoryAtomic};
use vm_migration::{
    snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
use zerocopy::AsBytes;

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`, the
/// following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
macro_rules! extract_bits_64_without_offset {
    ($value: tt, $length: tt) => {
        $value & (!0u64 >> (64 - $length))
    };
}

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[error("vCPU still pending removal")]
    VcpuPendingRemovedVcpu,

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),

    #[error("Maximum number of vCPUs exceeds host limit")]
    MaximumVcpusExceeded,

    #[cfg(feature =
"sev_snp")] 189 #[error("Failed to set sev control register: {0}")] 190 SetSevControlRegister(#[source] hypervisor::HypervisorCpuError), 191 192 #[cfg(target_arch = "x86_64")] 193 #[error("Failed to inject NMI")] 194 NmiError(hypervisor::HypervisorCpuError), 195 } 196 pub type Result<T> = result::Result<T, Error>; 197 198 #[cfg(target_arch = "x86_64")] 199 #[allow(dead_code)] 200 #[repr(packed)] 201 #[derive(AsBytes)] 202 struct LocalX2Apic { 203 pub r#type: u8, 204 pub length: u8, 205 pub _reserved: u16, 206 pub apic_id: u32, 207 pub flags: u32, 208 pub processor_id: u32, 209 } 210 211 #[allow(dead_code)] 212 #[repr(packed)] 213 #[derive(Default, AsBytes)] 214 struct Ioapic { 215 pub r#type: u8, 216 pub length: u8, 217 pub ioapic_id: u8, 218 _reserved: u8, 219 pub apic_address: u32, 220 pub gsi_base: u32, 221 } 222 223 #[cfg(target_arch = "aarch64")] 224 #[allow(dead_code)] 225 #[repr(packed)] 226 #[derive(AsBytes)] 227 struct GicC { 228 pub r#type: u8, 229 pub length: u8, 230 pub reserved0: u16, 231 pub cpu_interface_number: u32, 232 pub uid: u32, 233 pub flags: u32, 234 pub parking_version: u32, 235 pub performance_interrupt: u32, 236 pub parked_address: u64, 237 pub base_address: u64, 238 pub gicv_base_address: u64, 239 pub gich_base_address: u64, 240 pub vgic_interrupt: u32, 241 pub gicr_base_address: u64, 242 pub mpidr: u64, 243 pub proc_power_effi_class: u8, 244 pub reserved1: u8, 245 pub spe_overflow_interrupt: u16, 246 } 247 248 #[cfg(target_arch = "aarch64")] 249 #[allow(dead_code)] 250 #[repr(packed)] 251 #[derive(AsBytes)] 252 struct GicD { 253 pub r#type: u8, 254 pub length: u8, 255 pub reserved0: u16, 256 pub gic_id: u32, 257 pub base_address: u64, 258 pub global_irq_base: u32, 259 pub version: u8, 260 pub reserved1: [u8; 3], 261 } 262 263 #[cfg(target_arch = "aarch64")] 264 #[allow(dead_code)] 265 #[repr(packed)] 266 #[derive(AsBytes)] 267 struct GicR { 268 pub r#type: u8, 269 pub length: u8, 270 pub reserved: u16, 271 pub base_address: u64, 272 pub range_length: u32, 273 } 274 275 #[cfg(target_arch = "aarch64")] 276 #[allow(dead_code)] 277 #[repr(packed)] 278 #[derive(AsBytes)] 279 struct GicIts { 280 pub r#type: u8, 281 pub length: u8, 282 pub reserved0: u16, 283 pub translation_id: u32, 284 pub base_address: u64, 285 pub reserved1: u32, 286 } 287 288 #[cfg(target_arch = "aarch64")] 289 #[allow(dead_code)] 290 #[repr(packed)] 291 #[derive(AsBytes)] 292 struct ProcessorHierarchyNode { 293 pub r#type: u8, 294 pub length: u8, 295 pub reserved: u16, 296 pub flags: u32, 297 pub parent: u32, 298 pub acpi_processor_id: u32, 299 pub num_private_resources: u32, 300 } 301 302 #[allow(dead_code)] 303 #[repr(packed)] 304 #[derive(Default, AsBytes)] 305 struct InterruptSourceOverride { 306 pub r#type: u8, 307 pub length: u8, 308 pub bus: u8, 309 pub source: u8, 310 pub gsi: u32, 311 pub flags: u16, 312 } 313 314 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 315 macro_rules! round_up { 316 ($n:expr,$d:expr) => { 317 (($n / ($d + 1)) + 1) * $d 318 }; 319 } 320 321 /// A wrapper around creating and using a kvm-based VCPU. 322 pub struct Vcpu { 323 // The hypervisor abstracted CPU. 324 vcpu: Arc<dyn hypervisor::Vcpu>, 325 id: u8, 326 #[cfg(target_arch = "aarch64")] 327 mpidr: u64, 328 saved_state: Option<CpuState>, 329 #[cfg(target_arch = "x86_64")] 330 vendor: CpuVendor, 331 } 332 333 impl Vcpu { 334 /// Constructs a new VCPU for `vm`. 335 /// 336 /// # Arguments 337 /// 338 /// * `id` - Represents the CPU number between [0, max vcpus). 
339 /// * `vm` - The virtual machine this vcpu will get attached to. 340 /// * `vm_ops` - Optional object for exit handling. 341 /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0) 342 pub fn new( 343 id: u8, 344 apic_id: u8, 345 vm: &Arc<dyn hypervisor::Vm>, 346 vm_ops: Option<Arc<dyn VmOps>>, 347 #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor, 348 ) -> Result<Self> { 349 let vcpu = vm 350 .create_vcpu(apic_id, vm_ops) 351 .map_err(|e| Error::VcpuCreate(e.into()))?; 352 // Initially the cpuid per vCPU is the one supported by this VM. 353 Ok(Vcpu { 354 vcpu, 355 id, 356 #[cfg(target_arch = "aarch64")] 357 mpidr: 0, 358 saved_state: None, 359 #[cfg(target_arch = "x86_64")] 360 vendor: cpu_vendor, 361 }) 362 } 363 364 /// Configures a vcpu and should be called once per vcpu when created. 365 /// 366 /// # Arguments 367 /// 368 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 369 /// * `guest_memory` - Guest memory. 370 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 371 pub fn configure( 372 &mut self, 373 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 374 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 375 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 376 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 377 #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>, 378 ) -> Result<()> { 379 #[cfg(target_arch = "aarch64")] 380 { 381 self.init(vm)?; 382 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup) 383 .map_err(Error::VcpuConfiguration)?; 384 } 385 info!("Configuring vCPU: cpu_id = {}", self.id); 386 #[cfg(target_arch = "x86_64")] 387 arch::configure_vcpu( 388 &self.vcpu, 389 self.id, 390 boot_setup, 391 cpuid, 392 kvm_hyperv, 393 self.vendor, 394 topology, 395 ) 396 .map_err(Error::VcpuConfiguration)?; 397 398 Ok(()) 399 } 400 401 /// Gets the MPIDR register value. 402 #[cfg(target_arch = "aarch64")] 403 pub fn get_mpidr(&self) -> u64 { 404 self.mpidr 405 } 406 407 /// Gets the saved vCPU state. 408 #[cfg(target_arch = "aarch64")] 409 pub fn get_saved_state(&self) -> Option<CpuState> { 410 self.saved_state.clone() 411 } 412 413 /// Initializes an aarch64 specific vcpu for booting Linux. 414 #[cfg(target_arch = "aarch64")] 415 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 416 let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); 417 418 // This reads back the kernel's preferred target type. 419 vm.get_preferred_target(&mut kvi) 420 .map_err(Error::VcpuArmPreferredTarget)?; 421 // We already checked that the capability is supported. 422 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; 423 if vm 424 .as_any() 425 .downcast_ref::<hypervisor::kvm::KvmVm>() 426 .unwrap() 427 .check_extension(Cap::ArmPmuV3) 428 { 429 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3; 430 } 431 // Non-boot cpus are powered off initially. 432 if self.id > 0 { 433 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; 434 } 435 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit) 436 } 437 438 /// Runs the VCPU until it exits, returning the reason. 439 /// 440 /// Note that the state of the VCPU and associated VM must be setup first for this to do 441 /// anything useful. 
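    ///
    /// A minimal sketch (not the actual loop) of how a vCPU thread can drive
    /// this method; the real loop lives in `CpuManager::start_vcpu()` and also
    /// handles pause, kick and kill signalling:
    ///
    /// ```ignore
    /// loop {
    ///     match vcpu.run() {
    ///         Ok(VmExit::Reset) | Ok(VmExit::Shutdown) => break,
    ///         Ok(_) => continue,
    ///         Err(e) => {
    ///             error!("vCPU run failed: {e}");
    ///             break;
    ///         }
    ///     }
    /// }
    /// ```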
442 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 443 self.vcpu.run() 444 } 445 446 #[cfg(feature = "sev_snp")] 447 pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> { 448 self.vcpu 449 .set_sev_control_register(vmsa_pfn) 450 .map_err(Error::SetSevControlRegister) 451 } 452 } 453 454 impl Pausable for Vcpu {} 455 impl Snapshottable for Vcpu { 456 fn id(&self) -> String { 457 self.id.to_string() 458 } 459 460 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 461 let saved_state = self 462 .vcpu 463 .state() 464 .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?; 465 466 self.saved_state = Some(saved_state.clone()); 467 468 Ok(Snapshot::from_data(SnapshotData::new_from_state( 469 &saved_state, 470 )?)) 471 } 472 } 473 474 pub struct CpuManager { 475 config: CpusConfig, 476 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 477 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 478 #[cfg(target_arch = "x86_64")] 479 cpuid: Vec<CpuIdEntry>, 480 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 481 vm: Arc<dyn hypervisor::Vm>, 482 vcpus_kill_signalled: Arc<AtomicBool>, 483 vcpus_pause_signalled: Arc<AtomicBool>, 484 vcpus_kick_signalled: Arc<AtomicBool>, 485 exit_evt: EventFd, 486 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 487 reset_evt: EventFd, 488 #[cfg(feature = "guest_debug")] 489 vm_debug_evt: EventFd, 490 vcpu_states: Vec<VcpuState>, 491 selected_cpu: u8, 492 vcpus: Vec<Arc<Mutex<Vcpu>>>, 493 seccomp_action: SeccompAction, 494 vm_ops: Arc<dyn VmOps>, 495 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 496 acpi_address: Option<GuestAddress>, 497 proximity_domain_per_cpu: BTreeMap<u8, u32>, 498 affinity: BTreeMap<u8, Vec<usize>>, 499 dynamic: bool, 500 hypervisor: Arc<dyn hypervisor::Hypervisor>, 501 #[cfg(feature = "sev_snp")] 502 sev_snp_enabled: bool, 503 } 504 505 const CPU_ENABLE_FLAG: usize = 0; 506 const CPU_INSERTING_FLAG: usize = 1; 507 const CPU_REMOVING_FLAG: usize = 2; 508 const CPU_EJECT_FLAG: usize = 3; 509 510 const CPU_STATUS_OFFSET: u64 = 4; 511 const CPU_SELECTION_OFFSET: u64 = 0; 512 513 impl BusDevice for CpuManager { 514 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 515 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
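        // Zero the buffer up front so that only the status bits explicitly set
        // below end up being reported back to the guest.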
516 data.fill(0); 517 518 match offset { 519 CPU_SELECTION_OFFSET => { 520 data[0] = self.selected_cpu; 521 } 522 CPU_STATUS_OFFSET => { 523 if self.selected_cpu < self.max_vcpus() { 524 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 525 if state.active() { 526 data[0] |= 1 << CPU_ENABLE_FLAG; 527 } 528 if state.inserting { 529 data[0] |= 1 << CPU_INSERTING_FLAG; 530 } 531 if state.removing { 532 data[0] |= 1 << CPU_REMOVING_FLAG; 533 } 534 } else { 535 warn!("Out of range vCPU id: {}", self.selected_cpu); 536 } 537 } 538 _ => { 539 warn!( 540 "Unexpected offset for accessing CPU manager device: {:#}", 541 offset 542 ); 543 } 544 } 545 } 546 547 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 548 match offset { 549 CPU_SELECTION_OFFSET => { 550 self.selected_cpu = data[0]; 551 } 552 CPU_STATUS_OFFSET => { 553 if self.selected_cpu < self.max_vcpus() { 554 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 555 // The ACPI code writes back a 1 to acknowledge the insertion 556 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 557 && state.inserting 558 { 559 state.inserting = false; 560 } 561 // Ditto for removal 562 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 563 && state.removing 564 { 565 state.removing = false; 566 } 567 // Trigger removal of vCPU 568 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 569 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 570 error!("Error removing vCPU: {:?}", e); 571 } 572 } 573 } else { 574 warn!("Out of range vCPU id: {}", self.selected_cpu); 575 } 576 } 577 _ => { 578 warn!( 579 "Unexpected offset for accessing CPU manager device: {:#}", 580 offset 581 ); 582 } 583 } 584 None 585 } 586 } 587 588 #[derive(Default)] 589 struct VcpuState { 590 inserting: bool, 591 removing: bool, 592 pending_removal: Arc<AtomicBool>, 593 handle: Option<thread::JoinHandle<()>>, 594 kill: Arc<AtomicBool>, 595 vcpu_run_interrupted: Arc<AtomicBool>, 596 paused: Arc<AtomicBool>, 597 } 598 599 impl VcpuState { 600 fn active(&self) -> bool { 601 self.handle.is_some() 602 } 603 604 fn signal_thread(&self) { 605 if let Some(handle) = self.handle.as_ref() { 606 loop { 607 // SAFETY: FFI call with correct arguments 608 unsafe { 609 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 610 } 611 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 612 break; 613 } else { 614 // This is more effective than thread::yield_now() at 615 // avoiding a priority inversion with the vCPU thread 616 thread::sleep(std::time::Duration::from_millis(1)); 617 } 618 } 619 } 620 } 621 622 fn join_thread(&mut self) -> Result<()> { 623 if let Some(handle) = self.handle.take() { 624 handle.join().map_err(Error::ThreadCleanup)? 
625 } 626 627 Ok(()) 628 } 629 630 fn unpark_thread(&self) { 631 if let Some(handle) = self.handle.as_ref() { 632 handle.thread().unpark() 633 } 634 } 635 } 636 637 impl CpuManager { 638 #[allow(unused_variables)] 639 #[allow(clippy::too_many_arguments)] 640 pub fn new( 641 config: &CpusConfig, 642 vm: Arc<dyn hypervisor::Vm>, 643 exit_evt: EventFd, 644 reset_evt: EventFd, 645 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 646 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 647 seccomp_action: SeccompAction, 648 vm_ops: Arc<dyn VmOps>, 649 #[cfg(feature = "tdx")] tdx_enabled: bool, 650 numa_nodes: &NumaNodes, 651 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 652 ) -> Result<Arc<Mutex<CpuManager>>> { 653 if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() { 654 return Err(Error::MaximumVcpusExceeded); 655 } 656 657 let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); 658 vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); 659 let hypervisor_type = hypervisor.hypervisor_type(); 660 #[cfg(target_arch = "x86_64")] 661 let cpu_vendor = hypervisor.get_cpu_vendor(); 662 663 #[cfg(target_arch = "x86_64")] 664 if config.features.amx { 665 const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; 666 const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; 667 const XFEATURE_XTILEDATA: usize = 18; 668 const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; 669 670 // SAFETY: the syscall is only modifying kernel internal 671 // data structures that the kernel is itself expected to safeguard. 672 let amx_tile = unsafe { 673 libc::syscall( 674 libc::SYS_arch_prctl, 675 ARCH_REQ_XCOMP_GUEST_PERM, 676 XFEATURE_XTILEDATA, 677 ) 678 }; 679 680 if amx_tile != 0 { 681 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 682 } else { 683 let mask: usize = 0; 684 // SAFETY: the mask being modified (not marked mutable as it is 685 // modified in unsafe only which is permitted) isn't in use elsewhere. 
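                // ARCH_GET_XCOMP_GUEST_PERM writes the bitmask of permitted guest
                // xfeatures into `mask`; the XTILEDATA bit is checked below to make
                // sure the request above actually took effect.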
686 let result = unsafe { 687 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 688 }; 689 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 690 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 691 } 692 } 693 } 694 695 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 696 let mut cpu_list = Vec::new(); 697 for (proximity_domain, numa_node) in numa_nodes.iter() { 698 for cpu in numa_node.cpus.iter() { 699 cpu_list.push((*cpu, *proximity_domain)) 700 } 701 } 702 cpu_list 703 } 704 .into_iter() 705 .collect(); 706 707 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 708 cpu_affinity 709 .iter() 710 .map(|a| (a.vcpu, a.host_cpus.clone())) 711 .collect() 712 } else { 713 BTreeMap::new() 714 }; 715 716 #[cfg(feature = "tdx")] 717 let dynamic = !tdx_enabled; 718 #[cfg(not(feature = "tdx"))] 719 let dynamic = true; 720 721 Ok(Arc::new(Mutex::new(CpuManager { 722 config: config.clone(), 723 interrupt_controller: None, 724 #[cfg(target_arch = "x86_64")] 725 cpuid: Vec::new(), 726 vm, 727 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 728 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 729 vcpus_kick_signalled: Arc::new(AtomicBool::new(false)), 730 vcpu_states, 731 exit_evt, 732 reset_evt, 733 #[cfg(feature = "guest_debug")] 734 vm_debug_evt, 735 selected_cpu: 0, 736 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 737 seccomp_action, 738 vm_ops, 739 acpi_address: None, 740 proximity_domain_per_cpu, 741 affinity, 742 dynamic, 743 hypervisor: hypervisor.clone(), 744 #[cfg(feature = "sev_snp")] 745 sev_snp_enabled, 746 }))) 747 } 748 749 #[cfg(target_arch = "x86_64")] 750 pub fn populate_cpuid( 751 &mut self, 752 memory_manager: &Arc<Mutex<MemoryManager>>, 753 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 754 #[cfg(feature = "tdx")] tdx: bool, 755 ) -> Result<()> { 756 let sgx_epc_sections = memory_manager 757 .lock() 758 .unwrap() 759 .sgx_epc_region() 760 .as_ref() 761 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 762 763 self.cpuid = { 764 let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits); 765 arch::generate_common_cpuid( 766 hypervisor, 767 &arch::CpuidConfig { 768 sgx_epc_sections, 769 phys_bits, 770 kvm_hyperv: self.config.kvm_hyperv, 771 #[cfg(feature = "tdx")] 772 tdx, 773 amx: self.config.features.amx, 774 }, 775 ) 776 .map_err(Error::CommonCpuId)? 777 }; 778 779 Ok(()) 780 } 781 782 fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> { 783 info!("Creating vCPU: cpu_id = {}", cpu_id); 784 785 #[cfg(target_arch = "x86_64")] 786 let topology = self.get_vcpu_topology(); 787 #[cfg(target_arch = "x86_64")] 788 let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology); 789 #[cfg(target_arch = "aarch64")] 790 let x2apic_id = cpu_id as u32; 791 792 let mut vcpu = Vcpu::new( 793 cpu_id, 794 x2apic_id as u8, 795 &self.vm, 796 Some(self.vm_ops.clone()), 797 #[cfg(target_arch = "x86_64")] 798 self.hypervisor.get_cpu_vendor(), 799 )?; 800 801 if let Some(snapshot) = snapshot { 802 // AArch64 vCPUs should be initialized after created. 
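            // (KVM requires KVM_ARM_VCPU_INIT before vCPU register state can be
            // restored through set_state() below.)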
803 #[cfg(target_arch = "aarch64")] 804 vcpu.init(&self.vm)?; 805 806 let state: CpuState = snapshot.to_state().map_err(|e| { 807 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e)) 808 })?; 809 vcpu.vcpu 810 .set_state(&state) 811 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?; 812 813 vcpu.saved_state = Some(state); 814 } 815 816 let vcpu = Arc::new(Mutex::new(vcpu)); 817 818 // Adding vCPU to the CpuManager's vCPU list. 819 self.vcpus.push(vcpu.clone()); 820 821 Ok(vcpu) 822 } 823 824 pub fn configure_vcpu( 825 &self, 826 vcpu: Arc<Mutex<Vcpu>>, 827 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 828 ) -> Result<()> { 829 let mut vcpu = vcpu.lock().unwrap(); 830 831 #[cfg(feature = "sev_snp")] 832 if self.sev_snp_enabled { 833 if let Some((kernel_entry_point, _)) = boot_setup { 834 vcpu.set_sev_control_register( 835 kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE, 836 )?; 837 } 838 839 // Traditional way to configure vcpu doesn't work for SEV-SNP guests. 840 // All the vCPU configuration for SEV-SNP guest is provided via VMSA. 841 return Ok(()); 842 } 843 844 #[cfg(target_arch = "x86_64")] 845 assert!(!self.cpuid.is_empty()); 846 847 #[cfg(target_arch = "x86_64")] 848 let topology = self.config.topology.clone().map_or_else( 849 || { 850 #[cfg(feature = "mshv")] 851 if matches!(self.hypervisor.hypervisor_type(), HypervisorType::Mshv) { 852 return Some((1, self.boot_vcpus(), 1)); 853 } 854 None 855 }, 856 |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)), 857 ); 858 #[cfg(target_arch = "x86_64")] 859 vcpu.configure( 860 boot_setup, 861 self.cpuid.clone(), 862 self.config.kvm_hyperv, 863 topology, 864 )?; 865 866 #[cfg(target_arch = "aarch64")] 867 vcpu.configure(&self.vm, boot_setup)?; 868 869 Ok(()) 870 } 871 872 /// Only create new vCPUs if there aren't any inactive ones to reuse 873 fn create_vcpus( 874 &mut self, 875 desired_vcpus: u8, 876 snapshot: Option<Snapshot>, 877 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 878 let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![]; 879 info!( 880 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 881 desired_vcpus, 882 self.config.max_vcpus, 883 self.vcpus.len(), 884 self.present_vcpus() 885 ); 886 887 if desired_vcpus > self.config.max_vcpus { 888 return Err(Error::DesiredVCpuCountExceedsMax); 889 } 890 891 // Only create vCPUs in excess of all the allocated vCPUs. 892 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 893 vcpus.push(self.create_vcpu( 894 cpu_id, 895 // TODO: The special format of the CPU id can be removed once 896 // ready to break live upgrade. 897 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()), 898 )?); 899 } 900 901 Ok(vcpus) 902 } 903 904 #[cfg(target_arch = "aarch64")] 905 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 906 for cpu in self.vcpus.iter() { 907 let cpu = cpu.lock().unwrap(); 908 // Check if PMU attr is available, if not, log the information. 
909 if cpu.vcpu.has_pmu_support() { 910 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 911 } else { 912 debug!( 913 "PMU attribute is not supported in vCPU{}, skip PMU init!", 914 cpu.id 915 ); 916 return Ok(false); 917 } 918 } 919 920 Ok(true) 921 } 922 923 pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> { 924 self.vcpus.clone() 925 } 926 927 fn start_vcpu( 928 &mut self, 929 vcpu: Arc<Mutex<Vcpu>>, 930 vcpu_id: u8, 931 vcpu_thread_barrier: Arc<Barrier>, 932 inserting: bool, 933 ) -> Result<()> { 934 let reset_evt = self.reset_evt.try_clone().unwrap(); 935 let exit_evt = self.exit_evt.try_clone().unwrap(); 936 #[cfg(feature = "kvm")] 937 let hypervisor_type = self.hypervisor.hypervisor_type(); 938 #[cfg(feature = "guest_debug")] 939 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 940 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 941 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 942 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 943 let vcpu_kick_signalled = self.vcpus_kick_signalled.clone(); 944 945 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 946 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 947 .vcpu_run_interrupted 948 .clone(); 949 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 950 let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone(); 951 952 // Prepare the CPU set the current vCPU is expected to run onto. 953 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 954 // SAFETY: all zeros is a valid pattern 955 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 956 // SAFETY: FFI call, trivially safe 957 unsafe { libc::CPU_ZERO(&mut cpuset) }; 958 for host_cpu in host_cpus { 959 // SAFETY: FFI call, trivially safe 960 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) }; 961 } 962 cpuset 963 }); 964 965 // Retrieve seccomp filter for vcpu thread 966 let vcpu_seccomp_filter = get_seccomp_filter( 967 &self.seccomp_action, 968 Thread::Vcpu, 969 self.hypervisor.hypervisor_type(), 970 ) 971 .map_err(Error::CreateSeccompFilter)?; 972 973 #[cfg(target_arch = "x86_64")] 974 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 975 976 info!("Starting vCPU: cpu_id = {}", vcpu_id); 977 978 let handle = Some( 979 thread::Builder::new() 980 .name(format!("vcpu{vcpu_id}")) 981 .spawn(move || { 982 // Schedule the thread to run on the expected CPU set 983 if let Some(cpuset) = cpuset.as_ref() { 984 // SAFETY: FFI call with correct arguments 985 let ret = unsafe { 986 libc::sched_setaffinity( 987 0, 988 std::mem::size_of::<libc::cpu_set_t>(), 989 cpuset as *const libc::cpu_set_t, 990 ) 991 }; 992 993 if ret != 0 { 994 error!( 995 "Failed scheduling the vCPU {} on the expected CPU set: {}", 996 vcpu_id, 997 io::Error::last_os_error() 998 ); 999 return; 1000 } 1001 } 1002 1003 // Apply seccomp filter for vcpu thread. 1004 if !vcpu_seccomp_filter.is_empty() { 1005 if let Err(e) = 1006 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 1007 { 1008 error!("Error applying seccomp filter: {:?}", e); 1009 return; 1010 } 1011 } 1012 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 1013 // This uses an async signal safe handler to kill the vcpu handles. 1014 register_signal_handler(SIGRTMIN(), handle_signal) 1015 .expect("Failed to register vcpu signal handler"); 1016 // Block until all CPUs are ready. 
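                    // (The barrier is created in activate_vcpus() with a count of the
                    // newly started vCPU threads plus the caller, which waits last.)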
1017 vcpu_thread_barrier.wait(); 1018 1019 std::panic::catch_unwind(move || { 1020 loop { 1021 // If we are being told to pause, we park the thread 1022 // until the pause boolean is toggled. 1023 // The resume operation is responsible for toggling 1024 // the boolean and unpark the thread. 1025 // We enter a loop because park() could spuriously 1026 // return. We will then park() again unless the 1027 // pause boolean has been toggled. 1028 1029 // Need to use Ordering::SeqCst as we have multiple 1030 // loads and stores to different atomics and we need 1031 // to see them in a consistent order in all threads 1032 1033 if vcpu_pause_signalled.load(Ordering::SeqCst) { 1034 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are 1035 // completed by returning to KVM_RUN. From the kernel docs: 1036 // 1037 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, 1038 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding 1039 // operations are complete (and guest state is consistent) only after userspace 1040 // has re-entered the kernel with KVM_RUN. The kernel side will first finish 1041 // incomplete operations and then check for pending signals. 1042 // The pending state of the operation is not preserved in state which is 1043 // visible to userspace, thus userspace should ensure that the operation is 1044 // completed before performing a live migration. Userspace can re-enter the 1045 // guest with an unmasked signal pending or with the immediate_exit field set 1046 // to complete pending operations without allowing any further instructions 1047 // to be executed. 1048 1049 #[cfg(feature = "kvm")] 1050 if matches!(hypervisor_type, HypervisorType::Kvm) { 1051 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); 1052 if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { 1053 error!("Unexpected VM exit on \"immediate_exit\" run"); 1054 break; 1055 } 1056 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); 1057 } 1058 1059 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1060 1061 vcpu_paused.store(true, Ordering::SeqCst); 1062 while vcpu_pause_signalled.load(Ordering::SeqCst) { 1063 thread::park(); 1064 } 1065 vcpu_run_interrupted.store(false, Ordering::SeqCst); 1066 } 1067 1068 if vcpu_kick_signalled.load(Ordering::SeqCst) { 1069 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1070 #[cfg(target_arch = "x86_64")] 1071 match vcpu.lock().as_ref().unwrap().vcpu.nmi() { 1072 Ok(()) => {}, 1073 Err(e) => { 1074 error!("Error when inject nmi {}", e); 1075 break; 1076 } 1077 } 1078 } 1079 1080 // We've been told to terminate 1081 if vcpu_kill_signalled.load(Ordering::SeqCst) 1082 || vcpu_kill.load(Ordering::SeqCst) 1083 { 1084 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1085 break; 1086 } 1087 1088 #[cfg(feature = "tdx")] 1089 let mut vcpu = vcpu.lock().unwrap(); 1090 #[cfg(not(feature = "tdx"))] 1091 let vcpu = vcpu.lock().unwrap(); 1092 // vcpu.run() returns false on a triple-fault so trigger a reset 1093 match vcpu.run() { 1094 Ok(run) => match run { 1095 #[cfg(feature = "kvm")] 1096 VmExit::Debug => { 1097 info!("VmExit::Debug"); 1098 #[cfg(feature = "guest_debug")] 1099 { 1100 vcpu_pause_signalled.store(true, Ordering::SeqCst); 1101 let raw_tid = get_raw_tid(vcpu_id as usize); 1102 vm_debug_evt.write(raw_tid as u64).unwrap(); 1103 } 1104 } 1105 #[cfg(target_arch = "x86_64")] 1106 VmExit::IoapicEoi(vector) => { 1107 if let Some(interrupt_controller) = 1108 &interrupt_controller_clone 
1109 { 1110 interrupt_controller 1111 .lock() 1112 .unwrap() 1113 .end_of_interrupt(vector); 1114 } 1115 } 1116 VmExit::Ignore => {} 1117 VmExit::Hyperv => {} 1118 VmExit::Reset => { 1119 info!("VmExit::Reset"); 1120 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1121 reset_evt.write(1).unwrap(); 1122 break; 1123 } 1124 VmExit::Shutdown => { 1125 info!("VmExit::Shutdown"); 1126 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1127 exit_evt.write(1).unwrap(); 1128 break; 1129 } 1130 #[cfg(feature = "tdx")] 1131 VmExit::Tdx => { 1132 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1133 match vcpu.get_tdx_exit_details() { 1134 Ok(details) => match details { 1135 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1136 TdxExitDetails::SetupEventNotifyInterrupt => { 1137 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1138 } 1139 }, 1140 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1141 } 1142 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1143 } else { 1144 // We should never reach this code as 1145 // this means the design from the code 1146 // is wrong. 1147 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1148 } 1149 } 1150 _ => { 1151 error!( 1152 "VCPU generated error: {:?}", 1153 Error::UnexpectedVmExit 1154 ); 1155 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1156 exit_evt.write(1).unwrap(); 1157 break; 1158 } 1159 }, 1160 1161 Err(e) => { 1162 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1163 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1164 exit_evt.write(1).unwrap(); 1165 break; 1166 } 1167 } 1168 1169 // We've been told to terminate 1170 if vcpu_kill_signalled.load(Ordering::SeqCst) 1171 || vcpu_kill.load(Ordering::SeqCst) 1172 { 1173 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1174 break; 1175 } 1176 } 1177 }) 1178 .or_else(|_| { 1179 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1180 error!("vCPU thread panicked"); 1181 panic_exit_evt.write(1) 1182 }) 1183 .ok(); 1184 }) 1185 .map_err(Error::VcpuSpawn)?, 1186 ); 1187 1188 // On hot plug calls into this function entry_point is None. It is for 1189 // those hotplug CPU additions that we need to set the inserting flag. 1190 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1191 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1192 1193 Ok(()) 1194 } 1195 1196 /// Start up as many vCPUs threads as needed to reach `desired_vcpus` 1197 fn activate_vcpus( 1198 &mut self, 1199 desired_vcpus: u8, 1200 inserting: bool, 1201 paused: Option<bool>, 1202 ) -> Result<()> { 1203 if desired_vcpus > self.config.max_vcpus { 1204 return Err(Error::DesiredVCpuCountExceedsMax); 1205 } 1206 1207 let vcpu_thread_barrier = Arc::new(Barrier::new( 1208 (desired_vcpus - self.present_vcpus() + 1) as usize, 1209 )); 1210 1211 if let Some(paused) = paused { 1212 self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); 1213 } 1214 1215 info!( 1216 "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", 1217 desired_vcpus, 1218 self.vcpus.len(), 1219 self.present_vcpus(), 1220 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1221 ); 1222 1223 // This reuses any inactive vCPUs as well as any that were newly created 1224 for vcpu_id in self.present_vcpus()..desired_vcpus { 1225 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1226 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1227 } 1228 1229 // Unblock all CPU threads. 
1230 vcpu_thread_barrier.wait(); 1231 Ok(()) 1232 } 1233 1234 fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) { 1235 // Mark vCPUs for removal, actual removal happens on ejection 1236 for cpu_id in desired_vcpus..self.present_vcpus() { 1237 self.vcpu_states[usize::from(cpu_id)].removing = true; 1238 self.vcpu_states[usize::from(cpu_id)] 1239 .pending_removal 1240 .store(true, Ordering::SeqCst); 1241 } 1242 } 1243 1244 pub fn check_pending_removed_vcpu(&mut self) -> bool { 1245 for state in self.vcpu_states.iter() { 1246 if state.active() && state.pending_removal.load(Ordering::SeqCst) { 1247 return true; 1248 } 1249 } 1250 false 1251 } 1252 1253 fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> { 1254 info!("Removing vCPU: cpu_id = {}", cpu_id); 1255 let state = &mut self.vcpu_states[usize::from(cpu_id)]; 1256 state.kill.store(true, Ordering::SeqCst); 1257 state.signal_thread(); 1258 state.join_thread()?; 1259 state.handle = None; 1260 1261 // Once the thread has exited, clear the "kill" so that it can reused 1262 state.kill.store(false, Ordering::SeqCst); 1263 state.pending_removal.store(false, Ordering::SeqCst); 1264 1265 Ok(()) 1266 } 1267 1268 pub fn create_boot_vcpus( 1269 &mut self, 1270 snapshot: Option<Snapshot>, 1271 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 1272 trace_scoped!("create_boot_vcpus"); 1273 1274 self.create_vcpus(self.boot_vcpus(), snapshot) 1275 } 1276 1277 // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running. 1278 pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> { 1279 self.activate_vcpus(self.boot_vcpus(), false, Some(paused)) 1280 } 1281 1282 pub fn start_restored_vcpus(&mut self) -> Result<()> { 1283 self.activate_vcpus(self.vcpus.len() as u8, false, Some(true)) 1284 .map_err(|e| { 1285 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) 1286 })?; 1287 1288 Ok(()) 1289 } 1290 1291 pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> { 1292 if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal { 1293 return Ok(false); 1294 } 1295 1296 if !self.dynamic { 1297 return Ok(false); 1298 } 1299 1300 if self.check_pending_removed_vcpu() { 1301 return Err(Error::VcpuPendingRemovedVcpu); 1302 } 1303 1304 match desired_vcpus.cmp(&self.present_vcpus()) { 1305 cmp::Ordering::Greater => { 1306 let vcpus = self.create_vcpus(desired_vcpus, None)?; 1307 for vcpu in vcpus { 1308 self.configure_vcpu(vcpu, None)? 1309 } 1310 self.activate_vcpus(desired_vcpus, true, None)?; 1311 Ok(true) 1312 } 1313 cmp::Ordering::Less => { 1314 self.mark_vcpus_for_removal(desired_vcpus); 1315 Ok(true) 1316 } 1317 _ => Ok(false), 1318 } 1319 } 1320 1321 pub fn shutdown(&mut self) -> Result<()> { 1322 // Tell the vCPUs to stop themselves next time they go through the loop 1323 self.vcpus_kill_signalled.store(true, Ordering::SeqCst); 1324 1325 // Toggle the vCPUs pause boolean 1326 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 1327 1328 // Unpark all the VCPU threads. 1329 for state in self.vcpu_states.iter() { 1330 state.unpark_thread(); 1331 } 1332 1333 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 1334 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 1335 // above. 1336 for state in self.vcpu_states.iter() { 1337 state.signal_thread(); 1338 } 1339 1340 // Wait for all the threads to finish. This removes the state from the vector. 1341 for mut state in self.vcpu_states.drain(..) 
{ 1342 state.join_thread()?; 1343 } 1344 1345 Ok(()) 1346 } 1347 1348 #[cfg(feature = "tdx")] 1349 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1350 for vcpu in &self.vcpus { 1351 vcpu.lock() 1352 .unwrap() 1353 .vcpu 1354 .tdx_init(hob_address) 1355 .map_err(Error::InitializeTdx)?; 1356 } 1357 Ok(()) 1358 } 1359 1360 pub fn boot_vcpus(&self) -> u8 { 1361 self.config.boot_vcpus 1362 } 1363 1364 pub fn max_vcpus(&self) -> u8 { 1365 self.config.max_vcpus 1366 } 1367 1368 #[cfg(target_arch = "x86_64")] 1369 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1370 assert!(!self.cpuid.is_empty()); 1371 self.cpuid.clone() 1372 } 1373 1374 fn present_vcpus(&self) -> u8 { 1375 self.vcpu_states 1376 .iter() 1377 .fold(0, |acc, state| acc + state.active() as u8) 1378 } 1379 1380 #[cfg(target_arch = "aarch64")] 1381 pub fn get_mpidrs(&self) -> Vec<u64> { 1382 self.vcpus 1383 .iter() 1384 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1385 .collect() 1386 } 1387 1388 #[cfg(target_arch = "aarch64")] 1389 pub fn get_saved_states(&self) -> Vec<CpuState> { 1390 self.vcpus 1391 .iter() 1392 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1393 .collect() 1394 } 1395 1396 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1397 self.config 1398 .topology 1399 .clone() 1400 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1401 } 1402 1403 pub fn create_madt(&self) -> Sdt { 1404 use crate::acpi; 1405 // This is also checked in the commandline parsing. 1406 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1407 1408 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1409 #[cfg(target_arch = "x86_64")] 1410 { 1411 madt.write(36, arch::layout::APIC_START.0); 1412 1413 for cpu in 0..self.config.max_vcpus { 1414 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology()); 1415 1416 let lapic = LocalX2Apic { 1417 r#type: acpi::ACPI_X2APIC_PROCESSOR, 1418 length: 16, 1419 processor_id: cpu.into(), 1420 apic_id: x2apic_id, 1421 flags: if cpu < self.config.boot_vcpus { 1422 1 << MADT_CPU_ENABLE_FLAG 1423 } else { 1424 0 1425 } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG, 1426 _reserved: 0, 1427 }; 1428 madt.append(lapic); 1429 } 1430 1431 madt.append(Ioapic { 1432 r#type: acpi::ACPI_APIC_IO, 1433 length: 12, 1434 ioapic_id: 0, 1435 apic_address: arch::layout::IOAPIC_START.0 as u32, 1436 gsi_base: 0, 1437 ..Default::default() 1438 }); 1439 1440 madt.append(InterruptSourceOverride { 1441 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1442 length: 10, 1443 bus: 0, 1444 source: 4, 1445 gsi: 4, 1446 flags: 0, 1447 }); 1448 } 1449 1450 #[cfg(target_arch = "aarch64")] 1451 { 1452 /* Notes: 1453 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1454 */ 1455 1456 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 
1457 for cpu in 0..self.config.boot_vcpus { 1458 let vcpu = &self.vcpus[cpu as usize]; 1459 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1460 /* ARMv8 MPIDR format: 1461 Bits [63:40] Must be zero 1462 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1463 Bits [31:24] Must be zero 1464 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1465 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1466 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1467 */ 1468 let mpidr_mask = 0xff_00ff_ffff; 1469 let gicc = GicC { 1470 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1471 length: 80, 1472 reserved0: 0, 1473 cpu_interface_number: cpu as u32, 1474 uid: cpu as u32, 1475 flags: 1, 1476 parking_version: 0, 1477 performance_interrupt: 0, 1478 parked_address: 0, 1479 base_address: 0, 1480 gicv_base_address: 0, 1481 gich_base_address: 0, 1482 vgic_interrupt: 0, 1483 gicr_base_address: 0, 1484 mpidr: mpidr & mpidr_mask, 1485 proc_power_effi_class: 0, 1486 reserved1: 0, 1487 spe_overflow_interrupt: 0, 1488 }; 1489 1490 madt.append(gicc); 1491 } 1492 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1493 1494 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1495 let gicd = GicD { 1496 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1497 length: 24, 1498 reserved0: 0, 1499 gic_id: 0, 1500 base_address: vgic_config.dist_addr, 1501 global_irq_base: 0, 1502 version: 3, 1503 reserved1: [0; 3], 1504 }; 1505 madt.append(gicd); 1506 1507 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1508 let gicr = GicR { 1509 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1510 length: 16, 1511 reserved: 0, 1512 base_address: vgic_config.redists_addr, 1513 range_length: vgic_config.redists_size as u32, 1514 }; 1515 madt.append(gicr); 1516 1517 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 1518 let gicits = GicIts { 1519 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1520 length: 20, 1521 reserved0: 0, 1522 translation_id: 0, 1523 base_address: vgic_config.msi_addr, 1524 reserved1: 0, 1525 }; 1526 madt.append(gicits); 1527 1528 madt.update_checksum(); 1529 } 1530 1531 madt 1532 } 1533 1534 #[cfg(target_arch = "aarch64")] 1535 pub fn create_pptt(&self) -> Sdt { 1536 let pptt_start = 0; 1537 let mut cpus = 0; 1538 let mut uid = 0; 1539 // If topology is not specified, the default setting is: 1540 // 1 package, multiple cores, 1 thread per core 1541 // This is also the behavior when PPTT is missing. 
1542 let (threads_per_core, cores_per_package, packages) = 1543 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1544 1545 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1546 1547 for cluster_idx in 0..packages { 1548 if cpus < self.config.boot_vcpus as usize { 1549 let cluster_offset = pptt.len() - pptt_start; 1550 let cluster_hierarchy_node = ProcessorHierarchyNode { 1551 r#type: 0, 1552 length: 20, 1553 reserved: 0, 1554 flags: 0x2, 1555 parent: 0, 1556 acpi_processor_id: cluster_idx as u32, 1557 num_private_resources: 0, 1558 }; 1559 pptt.append(cluster_hierarchy_node); 1560 1561 for core_idx in 0..cores_per_package { 1562 let core_offset = pptt.len() - pptt_start; 1563 1564 if threads_per_core > 1 { 1565 let core_hierarchy_node = ProcessorHierarchyNode { 1566 r#type: 0, 1567 length: 20, 1568 reserved: 0, 1569 flags: 0x2, 1570 parent: cluster_offset as u32, 1571 acpi_processor_id: core_idx as u32, 1572 num_private_resources: 0, 1573 }; 1574 pptt.append(core_hierarchy_node); 1575 1576 for _thread_idx in 0..threads_per_core { 1577 let thread_hierarchy_node = ProcessorHierarchyNode { 1578 r#type: 0, 1579 length: 20, 1580 reserved: 0, 1581 flags: 0xE, 1582 parent: core_offset as u32, 1583 acpi_processor_id: uid as u32, 1584 num_private_resources: 0, 1585 }; 1586 pptt.append(thread_hierarchy_node); 1587 uid += 1; 1588 } 1589 } else { 1590 let thread_hierarchy_node = ProcessorHierarchyNode { 1591 r#type: 0, 1592 length: 20, 1593 reserved: 0, 1594 flags: 0xA, 1595 parent: cluster_offset as u32, 1596 acpi_processor_id: uid as u32, 1597 num_private_resources: 0, 1598 }; 1599 pptt.append(thread_hierarchy_node); 1600 uid += 1; 1601 } 1602 } 1603 cpus += (cores_per_package * threads_per_core) as usize; 1604 } 1605 } 1606 1607 pptt.update_checksum(); 1608 pptt 1609 } 1610 1611 #[cfg(feature = "guest_debug")] 1612 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1613 self.vcpus[usize::from(cpu_id)] 1614 .lock() 1615 .unwrap() 1616 .vcpu 1617 .get_regs() 1618 .map_err(Error::CpuDebug) 1619 } 1620 1621 #[cfg(feature = "guest_debug")] 1622 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1623 self.vcpus[usize::from(cpu_id)] 1624 .lock() 1625 .unwrap() 1626 .vcpu 1627 .set_regs(regs) 1628 .map_err(Error::CpuDebug) 1629 } 1630 1631 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1632 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1633 self.vcpus[usize::from(cpu_id)] 1634 .lock() 1635 .unwrap() 1636 .vcpu 1637 .get_sregs() 1638 .map_err(Error::CpuDebug) 1639 } 1640 1641 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1642 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1643 self.vcpus[usize::from(cpu_id)] 1644 .lock() 1645 .unwrap() 1646 .vcpu 1647 .set_sregs(sregs) 1648 .map_err(Error::CpuDebug) 1649 } 1650 1651 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1652 fn translate_gva( 1653 &self, 1654 _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1655 cpu_id: u8, 1656 gva: u64, 1657 ) -> Result<u64> { 1658 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1659 .lock() 1660 .unwrap() 1661 .vcpu 1662 .translate_gva(gva, /* flags: unused */ 0) 1663 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1664 Ok(gpa) 1665 } 1666 1667 /// 1668 /// On AArch64, `translate_gva` API is not provided by KVM. We implemented 1669 /// it in VMM by walking through translation tables. 
1670 /// 1671 /// Address translation is big topic, here we only focus the scenario that 1672 /// happens in VMM while debugging kernel. This `translate_gva` 1673 /// implementation is restricted to: 1674 /// - Exception Level 1 1675 /// - Translate high address range only (kernel space) 1676 /// 1677 /// This implementation supports following Arm-v8a features related to 1678 /// address translation: 1679 /// - FEAT_LPA 1680 /// - FEAT_LVA 1681 /// - FEAT_LPA2 1682 /// 1683 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 1684 fn translate_gva( 1685 &self, 1686 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1687 cpu_id: u8, 1688 gva: u64, 1689 ) -> Result<u64> { 1690 let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)] 1691 .lock() 1692 .unwrap() 1693 .vcpu 1694 .get_sys_reg(regs::TCR_EL1) 1695 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1696 let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)] 1697 .lock() 1698 .unwrap() 1699 .vcpu 1700 .get_sys_reg(regs::TTBR1_EL1) 1701 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1702 let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)] 1703 .lock() 1704 .unwrap() 1705 .vcpu 1706 .get_sys_reg(regs::ID_AA64MMFR0_EL1) 1707 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1708 1709 // Bit 55 of the VA determines the range, high (0xFFFxxx...) 1710 // or low (0x000xxx...). 1711 let high_range = extract_bits_64!(gva, 55, 1); 1712 if high_range == 0 { 1713 info!("VA (0x{:x}) range is not supported!", gva); 1714 return Ok(gva); 1715 } 1716 1717 // High range size offset 1718 let tsz = extract_bits_64!(tcr_el1, 16, 6); 1719 // Granule size 1720 let tg = extract_bits_64!(tcr_el1, 30, 2); 1721 // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2 1722 let ds = extract_bits_64!(tcr_el1, 59, 1); 1723 1724 if tsz == 0 { 1725 info!("VA translation is not ready!"); 1726 return Ok(gva); 1727 } 1728 1729 // VA size is determined by TCR_BL1.T1SZ 1730 let va_size = 64 - tsz; 1731 // Number of bits in VA consumed in each level of translation 1732 let stride = match tg { 1733 3 => 13, // 64KB granule size 1734 1 => 11, // 16KB granule size 1735 _ => 9, // 4KB, default 1736 }; 1737 // Starting level of walking 1738 let mut level = 4 - (va_size - 4) / stride; 1739 1740 // PA or IPA size is determined 1741 let tcr_ips = extract_bits_64!(tcr_el1, 32, 3); 1742 let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4); 1743 // The IPA size in TCR_BL1 and PA Range in ID_AA64MMFR0_EL1 should match. 1744 // To be safe, we use the minimum value if they are different. 1745 let pa_range = std::cmp::min(tcr_ips, pa_range); 1746 // PA size in bits 1747 let pa_size = match pa_range { 1748 0 => 32, 1749 1 => 36, 1750 2 => 40, 1751 3 => 42, 1752 4 => 44, 1753 5 => 48, 1754 6 => 52, 1755 _ => { 1756 return Err(Error::TranslateVirtualAddress(anyhow!(format!( 1757 "PA range not supported {pa_range}" 1758 )))) 1759 } 1760 }; 1761 1762 let indexmask_grainsize = (!0u64) >> (64 - (stride + 3)); 1763 let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level)))); 1764 // If FEAT_LPA2 is present, the translation table descriptor holds 1765 // 50 bits of the table address of next level. 1766 // Otherwise, it is 48 bits. 
        let descaddrmask = if ds == 1 {
            !0u64 >> (64 - 50) // mask with 50 least significant bits
        } else {
            !0u64 >> (64 - 48) // mask with 48 least significant bits
        };
        let descaddrmask = descaddrmask & !indexmask_grainsize;

        // Translation table base address
        let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
        // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
        // address bits [48:51] come from TTBR1_EL1 bits [2:5].
        if pa_size == 52 {
            descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
        }

        // Loop through the tables of each level
        loop {
            // Table offset for the current level
            let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
            descaddr |= table_offset;
            descaddr &= !7u64;

            let mut buf = [0; 8];
            guest_memory
                .memory()
                .read(&mut buf, GuestAddress(descaddr))
                .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
            let descriptor = u64::from_le_bytes(buf);

            descaddr = descriptor & descaddrmask;
            // In the case of FEAT_LPA, the next-level translation table address
            // bits [48:51] come from bits [12:15] of the current descriptor.
            // For FEAT_LPA2, the next-level translation table address
            // bits [50:51] come from bits [8:9] of the current descriptor, and
            // bits [48:49] come from bits [48:49] of the descriptor, which were
            // handled previously.
            if pa_size == 52 {
                if ds == 1 {
                    // FEAT_LPA2
                    descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
                } else {
                    // FEAT_LPA
                    descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
                }
            }

            if (descriptor & 2) != 0 && (level < 3) {
                // This is a table entry. Go down to the next level.
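                // (Descriptor bit 0 is the valid bit; with bit 1 also set this is a
                // table descriptor at levels 0-2, as opposed to a block descriptor.)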
1815 level += 1; 1816 indexmask = indexmask_grainsize; 1817 continue; 1818 } 1819 1820 break; 1821 } 1822 1823 // We have reached either: 1824 // - a page entry at level 3 or 1825 // - a block entry at level 1 or 2 1826 let page_size = 1u64 << ((stride * (4 - level)) + 3); 1827 descaddr &= !(page_size - 1); 1828 descaddr |= gva & (page_size - 1); 1829 1830 Ok(descaddr) 1831 } 1832 1833 pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) { 1834 self.acpi_address = Some(acpi_address); 1835 } 1836 1837 pub(crate) fn set_interrupt_controller( 1838 &mut self, 1839 interrupt_controller: Arc<Mutex<dyn InterruptController>>, 1840 ) { 1841 self.interrupt_controller = Some(interrupt_controller); 1842 } 1843 1844 pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> { 1845 &self.vcpus_kill_signalled 1846 } 1847 1848 #[cfg(feature = "igvm")] 1849 pub(crate) fn get_cpuid_leaf( 1850 &self, 1851 cpu_id: u8, 1852 eax: u32, 1853 ecx: u32, 1854 xfem: u64, 1855 xss: u64, 1856 ) -> Result<[u32; 4]> { 1857 let leaf_info = self.vcpus[usize::from(cpu_id)] 1858 .lock() 1859 .unwrap() 1860 .vcpu 1861 .get_cpuid_values(eax, ecx, xfem, xss) 1862 .unwrap(); 1863 Ok(leaf_info) 1864 } 1865 1866 #[cfg(feature = "sev_snp")] 1867 pub(crate) fn sev_snp_enabled(&self) -> bool { 1868 self.sev_snp_enabled 1869 } 1870 1871 pub(crate) fn nmi(&self) -> Result<()> { 1872 self.vcpus_kick_signalled.store(true, Ordering::SeqCst); 1873 1874 for state in self.vcpu_states.iter() { 1875 state.signal_thread(); 1876 } 1877 1878 self.vcpus_kick_signalled.store(false, Ordering::SeqCst); 1879 1880 Ok(()) 1881 } 1882 } 1883 1884 struct Cpu { 1885 cpu_id: u8, 1886 proximity_domain: u32, 1887 dynamic: bool, 1888 #[cfg(target_arch = "x86_64")] 1889 topology: Option<(u8, u8, u8)>, 1890 } 1891 1892 #[cfg(target_arch = "x86_64")] 1893 const MADT_CPU_ENABLE_FLAG: usize = 0; 1894 1895 #[cfg(target_arch = "x86_64")] 1896 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; 1897 1898 impl Cpu { 1899 #[cfg(target_arch = "x86_64")] 1900 fn generate_mat(&self) -> Vec<u8> { 1901 let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology); 1902 1903 let lapic = LocalX2Apic { 1904 r#type: crate::acpi::ACPI_X2APIC_PROCESSOR, 1905 length: 16, 1906 processor_id: self.cpu_id.into(), 1907 apic_id: x2apic_id, 1908 flags: 1 << MADT_CPU_ENABLE_FLAG, 1909 _reserved: 0, 1910 }; 1911 1912 let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)]; 1913 // SAFETY: mat_data is large enough to hold lapic 1914 unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic }; 1915 1916 mat_data 1917 } 1918 } 1919 1920 impl Aml for Cpu { 1921 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1922 #[cfg(target_arch = "x86_64")] 1923 let mat_data: Vec<u8> = self.generate_mat(); 1924 #[allow(clippy::if_same_then_else)] 1925 if self.dynamic { 1926 aml::Device::new( 1927 format!("C{:03X}", self.cpu_id).as_str().into(), 1928 vec![ 1929 &aml::Name::new("_HID".into(), &"ACPI0007"), 1930 &aml::Name::new("_UID".into(), &self.cpu_id), 1931 // Currently, AArch64 cannot support following fields. 1932 /* 1933 _STA return value: 1934 Bit [0] – Set if the device is present. 1935 Bit [1] – Set if the device is enabled and decoding its resources. 1936 Bit [2] – Set if the device should be shown in the UI. 1937 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1938 Bit [4] – Set if the battery is present. 1939 Bits [31:5] – Reserved (must be cleared). 
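                A _STA/CSTA return value of 0xF (bits [3:0] set) therefore reports a
                CPU that is present, enabled, shown in the UI and functioning.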
                    */
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_STA".into(),
                        0,
                        false,
                        // Call into the CSTA method which will interrogate the device
                        vec![&aml::Return::new(&aml::MethodCall::new(
                            "CSTA".into(),
                            vec![&self.cpu_id],
                        ))],
                    ),
                    &aml::Method::new(
                        "_PXM".into(),
                        0,
                        false,
                        vec![&aml::Return::new(&self.proximity_domain)],
                    ),
                    // The Linux kernel expects every CPU device to have a _MAT entry
                    // containing the LAPIC for this processor with the enabled bit set
                    // even if it is disabled in the MADT (non-boot CPU)
                    #[cfg(target_arch = "x86_64")]
                    &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
                    // Trigger CPU ejection
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_EJ0".into(),
                        1,
                        false,
                        // Call into the CEJ0 method which will actually eject the device
                        vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
                    ),
                ],
            )
            .to_aml_bytes(sink);
        } else {
            aml::Device::new(
                format!("C{:03X}", self.cpu_id).as_str().into(),
                vec![
                    &aml::Name::new("_HID".into(), &"ACPI0007"),
                    &aml::Name::new("_UID".into(), &self.cpu_id),
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_STA".into(),
                        0,
                        false,
                        // Mark the CPU present, see the CSTA implementation
                        vec![&aml::Return::new(&0xfu8)],
                    ),
                    &aml::Method::new(
                        "_PXM".into(),
                        0,
                        false,
                        vec![&aml::Return::new(&self.proximity_domain)],
                    ),
                    // The Linux kernel expects every CPU device to have a _MAT entry
                    // containing the LAPIC for this processor with the enabled bit set
                    // even if it is disabled in the MADT (non-boot CPU)
                    #[cfg(target_arch = "x86_64")]
                    &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
                ],
            )
            .to_aml_bytes(sink);
        }
    }
}

struct CpuNotify {
    cpu_id: u8,
}

impl Aml for CpuNotify {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .to_aml_bytes(sink)
    }
}

struct CpuMethods {
    max_vcpus: u8,
    dynamic: bool,
}

impl Aml for CpuMethods {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        if self.dynamic {
            // CPU status method
            aml::Method::new(
                "CSTA".into(),
                1,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                    &aml::If::new(
                        &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
                        vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                    // Return 0 or 0xf
                    &aml::Return::new(&aml::Local(0)),
                ],
            )
            .to_aml_bytes(sink);

            let mut cpu_notifies = Vec::new();
            for cpu_id in 0..self.max_vcpus {
                cpu_notifies.push(CpuNotify { cpu_id });
            }

            let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
            for cpu_id in 0..self.max_vcpus {
                cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
            }

            aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);

            aml::Method::new(
                "CEJ0".into(),
                1,
                true,
                vec![
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    // Set CEJ0 bit
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(sink);

            aml::Method::new(
                "CSCN".into(),
                0,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    &aml::While::new(
                        &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
                        vec![
                            // Write CPU number (in first argument) to I/O port via field
                            &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
                            // Check if CINS bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
                                // Notify device if it is
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &aml::ONE],
                                    ),
                                    // Reset CINS bit
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CINS"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            // Check if CRMV bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
                                // Notify device if it is (with the eject constant 0x3)
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &3u8],
                                    ),
                                    // Reset CRMV bit
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CRMV"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                        ],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(sink)
        } else {
            aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
        }
    }
}

impl Aml for CpuManager {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        #[cfg(target_arch = "x86_64")]
        if let Some(acpi_address) = self.acpi_address {
            // CPU hotplug controller
            aml::Device::new(
                "_SB_.PRES".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose CPU and then read back status
                    &aml::Mutex::new("CPLK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCacheable::NotCacheable,
                            true,
                            acpi_address.0,
                            acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
                            None,
                        )]),
                    ),
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "PRST".into(),
                        aml::OpRegionSpace::SystemMemory,
                        &(acpi_address.0 as usize),
                        &CPU_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CPEN", 1),
                            aml::FieldEntry::Named(*b"CINS", 1),
                            aml::FieldEntry::Named(*b"CRMV", 1),
                            aml::FieldEntry::Named(*b"CEJ0", 1),
                            aml::FieldEntry::Reserved(4),
                            aml::FieldEntry::Named(*b"CCMD", 8),
                        ],
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"CSEL", 32),
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CDAT", 32),
                        ],
                    ),
                ],
            )
            .to_aml_bytes(sink);
        }

        // CPU devices
        let hid = aml::Name::new("_HID".into(), &"ACPI0010");
        let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
        // Bundle methods together under a common object
        let methods = CpuMethods {
            max_vcpus: self.config.max_vcpus,
            dynamic: self.dynamic,
        };
        let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];

        #[cfg(target_arch = "x86_64")]
        let topology = self.get_vcpu_topology();
        let mut cpu_devices = Vec::new();
        for cpu_id in 0..self.config.max_vcpus {
            let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
            let cpu_device = Cpu {
                cpu_id,
                proximity_domain,
                dynamic: self.dynamic,
                #[cfg(target_arch = "x86_64")]
                topology,
            };

            cpu_devices.push(cpu_device);
        }

        for cpu_device in cpu_devices.iter() {
            cpu_data_inner.push(cpu_device);
        }

        aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
    }
}

impl Pausable for CpuManager {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        // Tell the vCPUs to pause themselves next time they exit
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        for vcpu in self.vcpus.iter() {
            let mut vcpu = vcpu.lock().unwrap();
            vcpu.pause()?;
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            if !self.config.kvm_hyperv {
                vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
                    MigratableError::Pause(anyhow!(
                        "Could not notify guest it has been paused {:?}",
                        e
                    ))
                })?;
            }
        }

        // The vCPU threads will change their paused state before parking; wait here for each
        // activated vCPU to change its state, to ensure they have all parked.
        for state in self.vcpu_states.iter() {
            if state.active() {
                while !state.paused.load(Ordering::SeqCst) {
                    // To avoid a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }

        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        for vcpu in self.vcpus.iter() {
            vcpu.lock().unwrap().resume()?;
        }

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        // Once unparked, the next thing they will do is check the pause
        // boolean. Since it'll be set to false, they will exit their pause loop
        // and go back to vmx root.
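        // (std::thread::unpark() stores a wake-up token, so even a vCPU thread
        // that has not reached park() yet will return from it immediately;
        // there is no lost-wakeup race here.)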
        for state in self.vcpu_states.iter() {
            state.paused.store(false, Ordering::SeqCst);
            state.unpark_thread();
        }
        Ok(())
    }
}

impl Snapshottable for CpuManager {
    fn id(&self) -> String {
        CPU_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut cpu_manager_snapshot = Snapshot::default();

        // The CpuManager snapshot is a collection of all vCPU snapshots.
        for vcpu in &self.vcpus {
            let mut vcpu = vcpu.lock().unwrap();
            cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
        }

        Ok(cpu_manager_snapshot)
    }
}

impl Transportable for CpuManager {}
impl Migratable for CpuManager {}

#[cfg(feature = "guest_debug")]
impl Debuggable for CpuManager {
    #[cfg(feature = "kvm")]
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.vcpus[cpu_id]
            .lock()
            .unwrap()
            .vcpu
            .set_guest_debug(addrs, singlestep)
            .map_err(DebuggableError::SetDebug)
    }

    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        Ok(())
    }

    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let regs = [
            gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
            gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
        ];

        // GDB exposes 32-bit eflags instead of 64-bit rflags.
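        // (RFLAGS bits 63:32 are reserved on x86-64, so the 32-bit cast below
        // loses no architectural state.)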
        // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
        let eflags = gregs.rflags as u32;
        let rip = gregs.rip;

        // Segment registers: CS, SS, DS, ES, FS, GS
        let sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let segments = X86SegmentRegs {
            cs: sregs.cs.selector as u32,
            ss: sregs.ss.selector as u32,
            ds: sregs.ds.selector as u32,
            es: sregs.es.selector as u32,
            fs: sregs.fs.selector as u32,
            gs: sregs.gs.selector as u32,
        };

        // TODO: Add other registers

        Ok(CoreRegs {
            regs,
            eflags,
            rip,
            segments,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "aarch64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        Ok(CoreRegs {
            x: gregs.regs.regs,
            sp: gregs.regs.sp,
            pc: gregs.regs.pc,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let orig_gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let gregs = StandardRegisters {
            rax: regs.regs[0],
            rbx: regs.regs[1],
            rcx: regs.regs[2],
            rdx: regs.regs[3],
            rsi: regs.regs[4],
            rdi: regs.regs[5],
            rbp: regs.regs[6],
            rsp: regs.regs[7],
            r8: regs.regs[8],
            r9: regs.regs[9],
            r10: regs.regs[10],
            r11: regs.regs[11],
            r12: regs.regs[12],
            r13: regs.regs[13],
            r14: regs.regs[14],
            r15: regs.regs[15],
            rip: regs.rip,
            // Update the lower 32 bits of rflags.
            rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
        };

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // Since GDB cares only about the selectors, we call get_sregs() first.
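        // (Only the selector fields are overwritten below; the cached base,
        // limit and attribute values returned by get_sregs() are preserved.)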
        let mut sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        sregs.cs.selector = regs.segments.cs as u16;
        sregs.ss.selector = regs.segments.ss as u16;
        sregs.ds.selector = regs.segments.ds as u16;
        sregs.es.selector = regs.segments.es as u16;
        sregs.fs.selector = regs.segments.fs as u16;
        sregs.gs.selector = regs.segments.gs as u16;

        self.set_sregs(cpu_id as u8, &sregs)
            .map_err(DebuggableError::WriteRegs)?;

        // TODO: Add other registers

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let mut gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;

        gregs.regs.regs = regs.x;
        gregs.regs.sp = regs.sp;
        gregs.regs.pc = regs.pc;

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        Ok(())
    }

    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        let mut buf = vec![0; len];
        let mut total_read = 0_u64;

        while total_read < len as u64 {
            let gaddr = vaddr.0 + total_read;
            let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
            guest_memory
                .memory()
                .read(
                    &mut buf[total_read as usize..total_read as usize + read_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::ReadMem)?;
            total_read += read_len;
        }
        Ok(buf)
    }

    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        let mut total_written = 0_u64;

        while total_written < data.len() as u64 {
            let gaddr = vaddr.0 + total_written;
            let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let write_len = std::cmp::min(
                data.len() as u64 - total_written,
                psize - (paddr & (psize - 1)),
            );
            guest_memory
                .memory()
                .write(
                    &data[total_written as usize..total_written as usize + write_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::WriteMem)?;
            total_written += write_len;
        }
        Ok(())
    }

    fn active_vcpus(&self) -> usize {
        self.present_vcpus() as usize
    }
}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for CpuManager {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl CpuElf64Writable for CpuManager {
    fn cpu_write_elf64_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Elf, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<X86_64ElfPrStatus>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: NT_PRSTATUS,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
            buf.resize(pos + 4, 0);
            buf.splice(pos.., "CORE".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
            buf.resize(pos + 32 + 4, 0);
            let pid = vcpu_id as u64;
            let bytes: &[u8] = pid.as_slice();
            buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */

            pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();

            let orig_rax: u64 = 0;
            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
                gregs.r10,
            ];
            let regs2 = [
                gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            debug!(
                "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
                gregs.rip,
                gregs.rsp,
                sregs.gs.base,
                sregs.cs.selector,
                sregs.ss.selector,
                sregs.ds.selector,
            );

            let regs = X86_64UserRegs {
                regs1,
                regs2,
                rip: gregs.rip,
                cs: sregs.cs.selector as u64,
                eflags: gregs.rflags,
                rsp: gregs.rsp,
                ss: sregs.ss.selector as u64,
                fs_base: sregs.fs.base,
                gs_base: sregs.gs.base,
                ds: sregs.ds.selector as u64,
                es: sregs.es.selector as u64,
                fs: sregs.fs.selector as u64,
                gs: sregs.gs.selector as u64,
            };

            // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) };
            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

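        // At this point one NT_PRSTATUS note (name "CORE") has been appended
        // to the core dump file for every vCPU.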
        Ok(())
    }

    fn cpu_write_vmm_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Vmm, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<DumpCpusState>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: 0,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);

            buf.resize(pos + 4, 0);
            buf.splice(pos.., "QEMU".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);

            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
                gregs.rbp,
            ];

            let regs2 = [
                gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
                gregs.r15,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            let mut msrs = vec![MsrEntry {
                index: msr_index::MSR_KERNEL_GS_BASE,
                ..Default::default()
            }];

            self.vcpus[vcpu_id as usize]
                .lock()
                .unwrap()
                .vcpu
                .get_msrs(&mut msrs)
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
            let kernel_gs_base = msrs[0].data;

            let cs = CpuSegment::new(sregs.cs);
            let ds = CpuSegment::new(sregs.ds);
            let es = CpuSegment::new(sregs.es);
            let fs = CpuSegment::new(sregs.fs);
            let gs = CpuSegment::new(sregs.gs);
            let ss = CpuSegment::new(sregs.ss);
            let ldt = CpuSegment::new(sregs.ldt);
            let tr = CpuSegment::new(sregs.tr);
            let gdt = CpuSegment::new_from_table(sregs.gdt);
            let idt = CpuSegment::new_from_table(sregs.idt);
            let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
            let regs = DumpCpusState {
                version: 1,
                size: size_of::<DumpCpusState>() as u32,
                regs1,
                regs2,
                rip: gregs.rip,
                rflags: gregs.rflags,
                cs,
                ds,
                es,
                fs,
                gs,
                ss,
                ldt,
                tr,
                gdt,
                idt,
                cr,
                kernel_gs_base,
            };

            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use arch::layout::BOOT_STACK_POINTER;
    use arch::layout::ZERO_PAGE_START;
    use arch::x86_64::interrupts::*;
    use arch::x86_64::regs::*;
    use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};
    use linux_loader::loader::bootparam::setup_header;

    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
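        // Sanity-check that the host exposes the KVM capabilities this VMM
        // relies on before exercising the irqchip below.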
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail if there is no irqchip created beforehand.
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
        let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
        let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should
        // remove it at all.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }];

        // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read
        // one in this test case scenario.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we did setup_msrs. We need to assert that the
        // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
        // expect.
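        // boot_msr_entries() mirrors the list programmed by setup_msrs(), so
        // its tenth entry should match the value just read back from the vCPU.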
        let entry_vec = vcpu.boot_msr_entries();
        assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
    }

    #[test]
    fn test_setup_regs_for_pvh() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let expected_regs: StandardRegisters = StandardRegisters {
            rflags: 0x0000000000000002u64,
            rbx: arch::layout::PVH_INFO_START.0,
            rip: 1,
            ..Default::default()
        };

        setup_regs(
            &vcpu,
            arch::EntryPoint {
                entry_addr: vm_memory::GuestAddress(expected_regs.rip),
                setup_header: None,
            },
        )
        .unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }

    #[test]
    fn test_setup_regs_for_bzimage() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let expected_regs: StandardRegisters = StandardRegisters {
            rflags: 0x0000000000000002u64,
            rip: 1,
            rsp: BOOT_STACK_POINTER.0,
            rsi: ZERO_PAGE_START.0,
            ..Default::default()
        };

        setup_regs(
            &vcpu,
            arch::EntryPoint {
                entry_addr: vm_memory::GuestAddress(expected_regs.rip),
                setup_header: Some(setup_header {
                    ..Default::default()
                }),
            },
        )
        .unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use arch::{aarch64::regs, layout};
    use hypervisor::kvm::aarch64::is_system_register;
    use hypervisor::kvm::kvm_bindings::{
        kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
        KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
    };
    use hypervisor::{arm64_core_reg_id, offset_of};
    use std::mem;

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
        // Must fail when vcpu is not initialized yet.
        assert!(res.is_err());

        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();
        vcpu.vcpu_init(&kvi).unwrap();

        assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
    }

    #[test]
    fn test_read_mpidr() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
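        // (MPIDR_EL1 is a system register; reading it only works once the vCPU
        // has been initialised via vcpu_init(), i.e. KVM_ARM_VCPU_INIT.)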
        assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());

        vcpu.vcpu_init(&kvi).unwrap();
        assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
    }

    #[test]
    fn test_is_system_register() {
        let offset = offset_of!(user_pt_regs, pc);
        let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
        assert!(!is_system_register(regid));
        let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
        assert!(is_system_register(regid));
    }

    #[test]
    fn test_save_restore_core_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        let res = vcpu.get_regs();
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to get core register: Exec format error (os error 8)"
        );

        let mut state = kvm_regs::default();
        let res = vcpu.set_regs(&state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to set core register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        let res = vcpu.get_regs();
        assert!(res.is_ok());
        state = res.unwrap();
        assert_eq!(state.regs.pstate, 0x3C5);

        assert!(vcpu.set_regs(&state).is_ok());
    }

    #[test]
    fn test_get_set_mpstate() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        let res = vcpu.get_mp_state();
        assert!(res.is_ok());
        assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
    }
}
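
// Illustrative sketch (not part of the upstream test suite): a minimal check of
// the bit-extraction macros used by the AArch64 stage-1 walk in translate_gva().
// The module name `extract_bits_tests` is made up for this example.
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
#[cfg(test)]
mod extract_bits_tests {
    #[test]
    fn test_extract_bits_64() {
        // 2 bits starting at offset 1 of 0b0110 are 0b11.
        assert_eq!(extract_bits_64!(0b0000_0110u64, 1, 2), 0b11);
        // Without an offset, only the low `length` bits are kept.
        assert_eq!(
            extract_bits_64_without_offset!(0xffff_ffff_ffff_ffffu64, 48),
            0x0000_ffff_ffff_ffff
        );
    }
}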