// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use crate::config::CpusConfig;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
    GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
    NT_PRSTATUS,
};
#[cfg(feature = "guest_debug")]
use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
#[cfg(target_arch = "x86_64")]
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::GuestMemoryMmap;
use crate::CPU_MANAGER_SNAPSHOT_ID;
use acpi_tables::{aml, sdt::Sdt, Aml};
use anyhow::anyhow;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use arch::aarch64::regs;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::get_x2apic_id;
use arch::EntryPoint;
use arch::NumaNodes;
#[cfg(target_arch = "aarch64")]
use devices::gic::Gic;
use devices::interrupt_controller::InterruptController;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use hypervisor::aarch64::StandardRegisters;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::msr_index;
#[cfg(target_arch = "x86_64")]
use hypervisor::arch::x86::CpuIdEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::MsrEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::SpecialRegisters;
#[cfg(target_arch = "aarch64")]
use hypervisor::kvm::kvm_bindings;
#[cfg(all(target_arch = "aarch64", feature = "kvm"))]
use hypervisor::kvm::kvm_ioctls::Cap;
#[cfg(feature = "tdx")]
use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuVendor;
#[cfg(feature = "kvm")]
use hypervisor::HypervisorType;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::StandardRegisters;
use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps};
use libc::{c_void, siginfo_t};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use linux_loader::elf::Elf64_Nhdr;
use seccompiler::{apply_filter, SeccompAction};
use std::collections::BTreeMap;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::io::Write;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::mem::size_of;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};
use thiserror::Error;
use tracer::trace_scoped;
use vm_device::BusDevice;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use vm_memory::ByteValued;
#[cfg(feature = "guest_debug")]
use vm_memory::{Bytes, GuestAddressSpace};
use vm_memory::{GuestAddress, GuestMemoryAtomic};
use vm_migration::{
    snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
use zerocopy::AsBytes;

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
macro_rules! extract_bits_64_without_offset {
    ($value: tt, $length: tt) => {
        $value & (!0u64 >> (64 - $length))
    };
}

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[error("Vcpu is still pending removal")]
    VcpuPendingRemovedVcpu,

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),

    #[error("Maximum number of vCPUs exceeds host limit")]
    MaximumVcpusExceeded,

    #[cfg(feature = "sev_snp")]
    #[error("Failed to set sev control register: {0}")]
    SetSevControlRegister(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "x86_64")]
    #[error("Failed to inject NMI")]
    NmiError(hypervisor::HypervisorCpuError),
}
pub type Result<T> = result::Result<T, Error>;

#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct LocalX2Apic {
    pub r#type: u8,
    pub length: u8,
    pub _reserved: u16,
    pub apic_id: u32,
    pub flags: u32,
    pub processor_id: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default, AsBytes)]
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
    pub ioapic_id: u8,
    _reserved: u8,
    pub apic_address: u32,
    pub gsi_base: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicC {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub cpu_interface_number: u32,
    pub uid: u32,
    pub flags: u32,
    pub parking_version: u32,
    pub performance_interrupt: u32,
    pub parked_address: u64,
    pub base_address: u64,
    pub gicv_base_address: u64,
    pub gich_base_address: u64,
    pub vgic_interrupt: u32,
    pub gicr_base_address: u64,
    pub mpidr: u64,
    pub proc_power_effi_class: u8,
    pub reserved1: u8,
    pub spe_overflow_interrupt: u16,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicD {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub gic_id: u32,
    pub base_address: u64,
    pub global_irq_base: u32,
    pub version: u8,
    pub reserved1: [u8; 3],
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicR {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub base_address: u64,
    pub range_length: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicIts {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub translation_id: u32,
    pub base_address: u64,
    pub reserved1: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct ProcessorHierarchyNode {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub flags: u32,
    pub parent: u32,
    pub acpi_processor_id: u32,
    pub num_private_resources: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default, AsBytes)]
struct InterruptSourceOverride {
    pub r#type: u8,
    pub length: u8,
    pub bus: u8,
    pub source: u8,
    pub gsi: u32,
    pub flags: u16,
}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
macro_rules! round_up {
    ($n:expr,$d:expr) => {
        (($n / ($d + 1)) + 1) * $d
    };
}

/// A wrapper around creating and using a kvm-based VCPU.
pub struct Vcpu {
    // The hypervisor abstracted CPU.
    vcpu: Arc<dyn hypervisor::Vcpu>,
    id: u8,
    #[cfg(target_arch = "aarch64")]
    mpidr: u64,
    saved_state: Option<CpuState>,
    #[cfg(target_arch = "x86_64")]
    vendor: CpuVendor,
}

impl Vcpu {
    /// Constructs a new VCPU for `vm`.
    ///
    /// # Arguments
    ///
    /// * `id` - Represents the CPU number between [0, max vcpus).
    /// * `vm` - The virtual machine this vcpu will get attached to.
    /// * `vm_ops` - Optional object for exit handling.
    /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0)
    pub fn new(
        id: u8,
        apic_id: u8,
        vm: &Arc<dyn hypervisor::Vm>,
        vm_ops: Option<Arc<dyn VmOps>>,
        #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor,
    ) -> Result<Self> {
        let vcpu = vm
            .create_vcpu(apic_id, vm_ops)
            .map_err(|e| Error::VcpuCreate(e.into()))?;
        // Initially the cpuid per vCPU is the one supported by this VM.
        Ok(Vcpu {
            vcpu,
            id,
            #[cfg(target_arch = "aarch64")]
            mpidr: 0,
            saved_state: None,
            #[cfg(target_arch = "x86_64")]
            vendor: cpu_vendor,
        })
    }

    /// Configures a vCPU. This should be called once per vCPU, right after it is created.
    ///
    /// # Arguments
    ///
    /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
    /// * `guest_memory` - Guest memory.
    /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
    pub fn configure(
        &mut self,
        #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
        boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
        #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
        #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
        #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
    ) -> Result<()> {
        #[cfg(target_arch = "aarch64")]
        {
            self.init(vm)?;
            self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
                .map_err(Error::VcpuConfiguration)?;
        }
        info!("Configuring vCPU: cpu_id = {}", self.id);
        #[cfg(target_arch = "x86_64")]
        arch::configure_vcpu(
            &self.vcpu,
            self.id,
            boot_setup,
            cpuid,
            kvm_hyperv,
            self.vendor,
            topology,
        )
        .map_err(Error::VcpuConfiguration)?;

        Ok(())
    }

    /// Gets the MPIDR register value.
    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidr(&self) -> u64 {
        self.mpidr
    }

    /// Gets the saved vCPU state.
    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_state(&self) -> Option<CpuState> {
        self.saved_state.clone()
    }

    /// Initializes an aarch64-specific vCPU for booting Linux.
    #[cfg(target_arch = "aarch64")]
    pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
        let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();

        // This reads back the kernel's preferred target type.
        vm.get_preferred_target(&mut kvi)
            .map_err(Error::VcpuArmPreferredTarget)?;
        // We already checked that the capability is supported.
        kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
        if vm
            .as_any()
            .downcast_ref::<hypervisor::kvm::KvmVm>()
            .unwrap()
            .check_extension(Cap::ArmPmuV3)
        {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
        }
        // Non-boot cpus are powered off initially.
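        // (With KVM this relies on the KVM_ARM_VCPU_POWER_OFF feature bit: the vCPU starts in a
        // PSCI "off" state and is only brought online later by the guest, e.g. via PSCI CPU_ON.)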
        if self.id > 0 {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
        }
        self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
    }

    /// Runs the VCPU until it exits, returning the reason.
    ///
    /// Note that the state of the VCPU and associated VM must be set up first for this to do
    /// anything useful.
    pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
        self.vcpu.run()
    }

    #[cfg(feature = "sev_snp")]
    pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> {
        self.vcpu
            .set_sev_control_register(vmsa_pfn)
            .map_err(Error::SetSevControlRegister)
    }
}

impl Pausable for Vcpu {}
impl Snapshottable for Vcpu {
    fn id(&self) -> String {
        self.id.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let saved_state = self
            .vcpu
            .state()
            .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?;

        self.saved_state = Some(saved_state.clone());

        Ok(Snapshot::from_data(SnapshotData::new_from_state(
            &saved_state,
        )?))
    }
}

pub struct CpuManager {
    config: CpusConfig,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
    #[cfg(target_arch = "x86_64")]
    cpuid: Vec<CpuIdEntry>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm: Arc<dyn hypervisor::Vm>,
    vcpus_kill_signalled: Arc<AtomicBool>,
    vcpus_pause_signalled: Arc<AtomicBool>,
    vcpus_kick_signalled: Arc<AtomicBool>,
    exit_evt: EventFd,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    reset_evt: EventFd,
    #[cfg(feature = "guest_debug")]
    vm_debug_evt: EventFd,
    vcpu_states: Vec<VcpuState>,
    selected_cpu: u8,
    vcpus: Vec<Arc<Mutex<Vcpu>>>,
    seccomp_action: SeccompAction,
    vm_ops: Arc<dyn VmOps>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    acpi_address: Option<GuestAddress>,
    proximity_domain_per_cpu: BTreeMap<u8, u32>,
    affinity: BTreeMap<u8, Vec<usize>>,
    dynamic: bool,
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    #[cfg(feature = "sev_snp")]
    sev_snp_enabled: bool,
}

const CPU_ENABLE_FLAG: usize = 0;
const CPU_INSERTING_FLAG: usize = 1;
const CPU_REMOVING_FLAG: usize = 2;
const CPU_EJECT_FLAG: usize = 3;

const CPU_STATUS_OFFSET: u64 = 4;
const CPU_SELECTION_OFFSET: u64 = 0;

impl BusDevice for CpuManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
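        // Clear the whole buffer first so that only the status bits explicitly set below are
        // reported back to the guest.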
        data.fill(0);

        match offset {
            CPU_SELECTION_OFFSET => {
                data[0] = self.selected_cpu;
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &self.vcpu_states[usize::from(self.selected_cpu)];
                    if state.active() {
                        data[0] |= 1 << CPU_ENABLE_FLAG;
                    }
                    if state.inserting {
                        data[0] |= 1 << CPU_INSERTING_FLAG;
                    }
                    if state.removing {
                        data[0] |= 1 << CPU_REMOVING_FLAG;
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            CPU_SELECTION_OFFSET => {
                self.selected_cpu = data[0];
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
                    // The ACPI code writes back a 1 to acknowledge the insertion
                    if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
                        && state.inserting
                    {
                        state.inserting = false;
                    }
                    // Ditto for removal
                    if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
                        && state.removing
                    {
                        state.removing = false;
                    }
                    // Trigger removal of vCPU
                    if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
                        if let Err(e) = self.remove_vcpu(self.selected_cpu) {
                            error!("Error removing vCPU: {:?}", e);
                        }
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
        None
    }
}

#[derive(Default)]
struct VcpuState {
    inserting: bool,
    removing: bool,
    pending_removal: Arc<AtomicBool>,
    handle: Option<thread::JoinHandle<()>>,
    kill: Arc<AtomicBool>,
    vcpu_run_interrupted: Arc<AtomicBool>,
    paused: Arc<AtomicBool>,
}

impl VcpuState {
    fn active(&self) -> bool {
        self.handle.is_some()
    }

    fn signal_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            loop {
                // SAFETY: FFI call with correct arguments
                unsafe {
                    libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
                }
                if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
                    break;
                } else {
                    // This is more effective than thread::yield_now() at
                    // avoiding a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }
    }

    fn join_thread(&mut self) -> Result<()> {
        if let Some(handle) = self.handle.take() {
            handle.join().map_err(Error::ThreadCleanup)?
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}

impl CpuManager {
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vm_ops: Arc<dyn VmOps>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        numa_nodes: &NumaNodes,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
            return Err(Error::MaximumVcpusExceeded);
        }

        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
        let hypervisor_type = hypervisor.hypervisor_type();
        #[cfg(target_arch = "x86_64")]
        let cpu_vendor = hypervisor.get_cpu_vendor();

        #[cfg(target_arch = "x86_64")]
        if config.features.amx {
            const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
            const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
            const XFEATURE_XTILEDATA: usize = 18;
            const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;

            // SAFETY: the syscall is only modifying kernel internal
            // data structures that the kernel is itself expected to safeguard.
            let amx_tile = unsafe {
                libc::syscall(
                    libc::SYS_arch_prctl,
                    ARCH_REQ_XCOMP_GUEST_PERM,
                    XFEATURE_XTILEDATA,
                )
            };

            if amx_tile != 0 {
                return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
            } else {
                let mask: usize = 0;
                // SAFETY: the mask being modified (not marked mutable as it is
                // modified in unsafe only which is permitted) isn't in use elsewhere.
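                // (Note: the ARCH_GET_XCOMP_GUEST_PERM call below is a read-back check; the mask
                // it returns should include XTILEDATA if the request above actually took effect.)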
                let result = unsafe {
                    libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
                };
                if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
                    return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
                }
            }
        }

        let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
            let mut cpu_list = Vec::new();
            for (proximity_domain, numa_node) in numa_nodes.iter() {
                for cpu in numa_node.cpus.iter() {
                    cpu_list.push((*cpu, *proximity_domain))
                }
            }
            cpu_list
        }
        .into_iter()
        .collect();

        let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
            cpu_affinity
                .iter()
                .map(|a| (a.vcpu, a.host_cpus.clone()))
                .collect()
        } else {
            BTreeMap::new()
        };

        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        Ok(Arc::new(Mutex::new(CpuManager {
            config: config.clone(),
            interrupt_controller: None,
            #[cfg(target_arch = "x86_64")]
            cpuid: Vec::new(),
            vm,
            vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_kick_signalled: Arc::new(AtomicBool::new(false)),
            vcpu_states,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            selected_cpu: 0,
            vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
            seccomp_action,
            vm_ops,
            acpi_address: None,
            proximity_domain_per_cpu,
            affinity,
            dynamic,
            hypervisor: hypervisor.clone(),
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
        })))
    }

    #[cfg(target_arch = "x86_64")]
    pub fn populate_cpuid(
        &mut self,
        memory_manager: &Arc<Mutex<MemoryManager>>,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx: bool,
    ) -> Result<()> {
        let sgx_epc_sections = memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());

        self.cpuid = {
            let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
            arch::generate_common_cpuid(
                hypervisor,
                &arch::CpuidConfig {
                    sgx_epc_sections,
                    phys_bits,
                    kvm_hyperv: self.config.kvm_hyperv,
                    #[cfg(feature = "tdx")]
                    tdx,
                    amx: self.config.features.amx,
                },
            )
            .map_err(Error::CommonCpuId)?
        };

        Ok(())
    }

    fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
        info!("Creating vCPU: cpu_id = {}", cpu_id);

        #[cfg(target_arch = "x86_64")]
        let topology = self.get_vcpu_topology();
        #[cfg(target_arch = "x86_64")]
        let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology);
        #[cfg(target_arch = "aarch64")]
        let x2apic_id = cpu_id as u32;

        let mut vcpu = Vcpu::new(
            cpu_id,
            x2apic_id as u8,
            &self.vm,
            Some(self.vm_ops.clone()),
            #[cfg(target_arch = "x86_64")]
            self.hypervisor.get_cpu_vendor(),
        )?;

        if let Some(snapshot) = snapshot {
            // AArch64 vCPUs should be initialized after they are created.
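            // (Assumption: restoring saved register state via set_state() requires the vCPU to
            // have gone through KVM_ARM_VCPU_INIT first, hence the explicit init() call below.)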
            #[cfg(target_arch = "aarch64")]
            vcpu.init(&self.vm)?;

            let state: CpuState = snapshot.to_state().map_err(|e| {
                Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
            })?;
            vcpu.vcpu
                .set_state(&state)
                .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;

            vcpu.saved_state = Some(state);
        }

        let vcpu = Arc::new(Mutex::new(vcpu));

        // Adding vCPU to the CpuManager's vCPU list.
        self.vcpus.push(vcpu.clone());

        Ok(vcpu)
    }

    pub fn configure_vcpu(
        &self,
        vcpu: Arc<Mutex<Vcpu>>,
        boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
    ) -> Result<()> {
        let mut vcpu = vcpu.lock().unwrap();

        #[cfg(feature = "sev_snp")]
        if self.sev_snp_enabled {
            if let Some((kernel_entry_point, _)) = boot_setup {
                vcpu.set_sev_control_register(
                    kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE,
                )?;
            }

            // The traditional way of configuring a vCPU doesn't work for SEV-SNP guests.
            // All the vCPU configuration for a SEV-SNP guest is provided via the VMSA.
            return Ok(());
        }

        #[cfg(target_arch = "x86_64")]
        assert!(!self.cpuid.is_empty());

        #[cfg(target_arch = "x86_64")]
        let topology = self.config.topology.clone().map_or_else(
            || Some((1, self.boot_vcpus(), 1)),
            |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
        );
        #[cfg(target_arch = "x86_64")]
        vcpu.configure(
            boot_setup,
            self.cpuid.clone(),
            self.config.kvm_hyperv,
            topology,
        )?;

        #[cfg(target_arch = "aarch64")]
        vcpu.configure(&self.vm, boot_setup)?;

        Ok(())
    }

    /// Only create new vCPUs if there aren't any inactive ones to reuse
    fn create_vcpus(
        &mut self,
        desired_vcpus: u8,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
        info!(
            "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.config.max_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        // Only create vCPUs in excess of all the allocated vCPUs.
        for cpu_id in self.vcpus.len() as u8..desired_vcpus {
            vcpus.push(self.create_vcpu(
                cpu_id,
                // TODO: The special format of the CPU id can be removed once
                // ready to break live upgrade.
                snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
            )?);
        }

        Ok(vcpus)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn init_pmu(&self, irq: u32) -> Result<bool> {
        for cpu in self.vcpus.iter() {
            let cpu = cpu.lock().unwrap();
            // Check if the PMU attribute is available; if not, log it and skip PMU init.
            if cpu.vcpu.has_pmu_support() {
                cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
            } else {
                debug!(
                    "PMU attribute is not supported in vCPU{}, skip PMU init!",
                    cpu.id
                );
                return Ok(false);
            }
        }

        Ok(true)
    }

    pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
        self.vcpus.clone()
    }

    fn start_vcpu(
        &mut self,
        vcpu: Arc<Mutex<Vcpu>>,
        vcpu_id: u8,
        vcpu_thread_barrier: Arc<Barrier>,
        inserting: bool,
    ) -> Result<()> {
        let reset_evt = self.reset_evt.try_clone().unwrap();
        let exit_evt = self.exit_evt.try_clone().unwrap();
        #[cfg(feature = "kvm")]
        let hypervisor_type = self.hypervisor.hypervisor_type();
        #[cfg(feature = "guest_debug")]
        let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
        let panic_exit_evt = self.exit_evt.try_clone().unwrap();
        let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
        let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
        let vcpu_kick_signalled = self.vcpus_kick_signalled.clone();

        let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
        let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
            .vcpu_run_interrupted
            .clone();
        let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
        let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();

        // Prepare the CPU set the current vCPU is expected to run on.
        let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
            // SAFETY: all zeros is a valid pattern
            let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
            // SAFETY: FFI call, trivially safe
            unsafe { libc::CPU_ZERO(&mut cpuset) };
            for host_cpu in host_cpus {
                // SAFETY: FFI call, trivially safe
                unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) };
            }
            cpuset
        });

        // Retrieve seccomp filter for vcpu thread
        let vcpu_seccomp_filter = get_seccomp_filter(
            &self.seccomp_action,
            Thread::Vcpu,
            self.hypervisor.hypervisor_type(),
        )
        .map_err(Error::CreateSeccompFilter)?;

        #[cfg(target_arch = "x86_64")]
        let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();

        info!("Starting vCPU: cpu_id = {}", vcpu_id);

        let handle = Some(
            thread::Builder::new()
                .name(format!("vcpu{vcpu_id}"))
                .spawn(move || {
                    // Schedule the thread to run on the expected CPU set
                    if let Some(cpuset) = cpuset.as_ref() {
                        // SAFETY: FFI call with correct arguments
                        let ret = unsafe {
                            libc::sched_setaffinity(
                                0,
                                std::mem::size_of::<libc::cpu_set_t>(),
                                cpuset as *const libc::cpu_set_t,
                            )
                        };

                        if ret != 0 {
                            error!(
                                "Failed scheduling the vCPU {} on the expected CPU set: {}",
                                vcpu_id,
                                io::Error::last_os_error()
                            );
                            return;
                        }
                    }

                    // Apply seccomp filter for vcpu thread.
                    if !vcpu_seccomp_filter.is_empty() {
                        if let Err(e) =
                            apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
                        {
                            error!("Error applying seccomp filter: {:?}", e);
                            return;
                        }
                    }
                    extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
                    // This uses an async signal safe handler to kill the vcpu handles.
                    register_signal_handler(SIGRTMIN(), handle_signal)
                        .expect("Failed to register vcpu signal handler");
                    // Block until all CPUs are ready.
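                    // (The barrier is sized in activate_vcpus() as `desired_vcpus -
                    // present_vcpus() + 1`, i.e. one slot per new vCPU thread plus the caller,
                    // so all freshly started vCPUs begin running at roughly the same time.)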
                    vcpu_thread_barrier.wait();

                    std::panic::catch_unwind(move || {
                        loop {
                            // If we are being told to pause, we park the thread
                            // until the pause boolean is toggled.
                            // The resume operation is responsible for toggling
                            // the boolean and unparking the thread.
                            // We enter a loop because park() could spuriously
                            // return. We will then park() again unless the
                            // pause boolean has been toggled.

                            // Need to use Ordering::SeqCst as we have multiple
                            // loads and stores to different atomics and we need
                            // to see them in a consistent order in all threads

                            if vcpu_pause_signalled.load(Ordering::SeqCst) {
                                // As a pause can be caused by PIO & MMIO exits, we need to ensure they are
                                // completed by returning to KVM_RUN. From the kernel docs:
                                //
                                // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
                                // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
                                // operations are complete (and guest state is consistent) only after userspace
                                // has re-entered the kernel with KVM_RUN. The kernel side will first finish
                                // incomplete operations and then check for pending signals.
                                // The pending state of the operation is not preserved in state which is
                                // visible to userspace, thus userspace should ensure that the operation is
                                // completed before performing a live migration. Userspace can re-enter the
                                // guest with an unmasked signal pending or with the immediate_exit field set
                                // to complete pending operations without allowing any further instructions
                                // to be executed.

                                #[cfg(feature = "kvm")]
                                if matches!(hypervisor_type, HypervisorType::Kvm) {
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
                                    if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
                                        error!("Unexpected VM exit on \"immediate_exit\" run");
                                        break;
                                    }
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
                                }

                                vcpu_run_interrupted.store(true, Ordering::SeqCst);

                                vcpu_paused.store(true, Ordering::SeqCst);
                                while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                    thread::park();
                                }
                                vcpu_run_interrupted.store(false, Ordering::SeqCst);
                            }

                            if vcpu_kick_signalled.load(Ordering::SeqCst) {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                #[cfg(target_arch = "x86_64")]
                                match vcpu.lock().as_ref().unwrap().vcpu.nmi() {
                                    Ok(()) => {},
                                    Err(e) => {
                                        error!("Error when injecting NMI: {}", e);
                                        break;
                                    }
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }

                            #[cfg(feature = "tdx")]
                            let mut vcpu = vcpu.lock().unwrap();
                            #[cfg(not(feature = "tdx"))]
                            let vcpu = vcpu.lock().unwrap();
                            // A triple fault shows up as a reset exit, which triggers a VM reset below.
                            match vcpu.run() {
                                Ok(run) => match run {
                                    #[cfg(feature = "kvm")]
                                    VmExit::Debug => {
                                        info!("VmExit::Debug");
                                        #[cfg(feature = "guest_debug")]
                                        {
                                            vcpu_pause_signalled.store(true, Ordering::SeqCst);
                                            let raw_tid = get_raw_tid(vcpu_id as usize);
                                            vm_debug_evt.write(raw_tid as u64).unwrap();
                                        }
                                    }
                                    #[cfg(target_arch = "x86_64")]
                                    VmExit::IoapicEoi(vector) => {
                                        if let Some(interrupt_controller) =
                                            &interrupt_controller_clone
1107 { 1108 interrupt_controller 1109 .lock() 1110 .unwrap() 1111 .end_of_interrupt(vector); 1112 } 1113 } 1114 VmExit::Ignore => {} 1115 VmExit::Hyperv => {} 1116 VmExit::Reset => { 1117 info!("VmExit::Reset"); 1118 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1119 reset_evt.write(1).unwrap(); 1120 break; 1121 } 1122 VmExit::Shutdown => { 1123 info!("VmExit::Shutdown"); 1124 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1125 exit_evt.write(1).unwrap(); 1126 break; 1127 } 1128 #[cfg(feature = "tdx")] 1129 VmExit::Tdx => { 1130 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1131 match vcpu.get_tdx_exit_details() { 1132 Ok(details) => match details { 1133 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1134 TdxExitDetails::SetupEventNotifyInterrupt => { 1135 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1136 } 1137 }, 1138 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1139 } 1140 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1141 } else { 1142 // We should never reach this code as 1143 // this means the design from the code 1144 // is wrong. 1145 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1146 } 1147 } 1148 }, 1149 1150 Err(e) => { 1151 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1152 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1153 exit_evt.write(1).unwrap(); 1154 break; 1155 } 1156 } 1157 1158 // We've been told to terminate 1159 if vcpu_kill_signalled.load(Ordering::SeqCst) 1160 || vcpu_kill.load(Ordering::SeqCst) 1161 { 1162 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1163 break; 1164 } 1165 } 1166 }) 1167 .or_else(|_| { 1168 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1169 error!("vCPU thread panicked"); 1170 panic_exit_evt.write(1) 1171 }) 1172 .ok(); 1173 }) 1174 .map_err(Error::VcpuSpawn)?, 1175 ); 1176 1177 // On hot plug calls into this function entry_point is None. It is for 1178 // those hotplug CPU additions that we need to set the inserting flag. 1179 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1180 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1181 1182 Ok(()) 1183 } 1184 1185 /// Start up as many vCPUs threads as needed to reach `desired_vcpus` 1186 fn activate_vcpus( 1187 &mut self, 1188 desired_vcpus: u8, 1189 inserting: bool, 1190 paused: Option<bool>, 1191 ) -> Result<()> { 1192 if desired_vcpus > self.config.max_vcpus { 1193 return Err(Error::DesiredVCpuCountExceedsMax); 1194 } 1195 1196 let vcpu_thread_barrier = Arc::new(Barrier::new( 1197 (desired_vcpus - self.present_vcpus() + 1) as usize, 1198 )); 1199 1200 if let Some(paused) = paused { 1201 self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); 1202 } 1203 1204 info!( 1205 "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", 1206 desired_vcpus, 1207 self.vcpus.len(), 1208 self.present_vcpus(), 1209 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1210 ); 1211 1212 // This reuses any inactive vCPUs as well as any that were newly created 1213 for vcpu_id in self.present_vcpus()..desired_vcpus { 1214 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1215 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1216 } 1217 1218 // Unblock all CPU threads. 
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal, actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
            self.vcpu_states[usize::from(cpu_id)]
                .pending_removal
                .store(true, Ordering::SeqCst);
        }
    }

    pub fn check_pending_removed_vcpu(&mut self) -> bool {
        for state in self.vcpu_states.iter() {
            if state.active() && state.pending_removal.load(Ordering::SeqCst) {
                return true;
            }
        }
        false
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" so that it can be reused
        state.kill.store(false, Ordering::SeqCst);
        state.pending_removal.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(
        &mut self,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        trace_scoped!("create_boot_vcpus");

        self.create_vcpus(self.boot_vcpus(), snapshot)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
    pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
        self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
    }

    pub fn start_restored_vcpus(&mut self) -> Result<()> {
        self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
            .map_err(|e| {
                Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
            })?;

        Ok(())
    }

    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal {
            return Ok(false);
        }

        if !self.dynamic {
            return Ok(false);
        }

        if self.check_pending_removed_vcpu() {
            return Err(Error::VcpuPendingRemovedVcpu);
        }

        match desired_vcpus.cmp(&self.present_vcpus()) {
            cmp::Ordering::Greater => {
                let vcpus = self.create_vcpus(desired_vcpus, None)?;
                for vcpu in vcpus {
                    self.configure_vcpu(vcpu, None)?
                }
                self.activate_vcpus(desired_vcpus, true, None)?;
                Ok(true)
            }
            cmp::Ordering::Less => {
                self.mark_vcpus_for_removal(desired_vcpus);
                Ok(true)
            }
            _ => Ok(false),
        }
    }

    pub fn shutdown(&mut self) -> Result<()> {
        // Tell the vCPUs to stop themselves next time they go through the loop
        self.vcpus_kill_signalled.store(true, Ordering::SeqCst);

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        // Wait for all the threads to finish. This removes the state from the vector.
        for mut state in self.vcpu_states.drain(..)
        {
            state.join_thread()?;
        }

        Ok(())
    }

    #[cfg(feature = "tdx")]
    pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
        for vcpu in &self.vcpus {
            vcpu.lock()
                .unwrap()
                .vcpu
                .tdx_init(hob_address)
                .map_err(Error::InitializeTdx)?;
        }
        Ok(())
    }

    pub fn boot_vcpus(&self) -> u8 {
        self.config.boot_vcpus
    }

    pub fn max_vcpus(&self) -> u8 {
        self.config.max_vcpus
    }

    #[cfg(target_arch = "x86_64")]
    pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
        assert!(!self.cpuid.is_empty());
        self.cpuid.clone()
    }

    fn present_vcpus(&self) -> u8 {
        self.vcpu_states
            .iter()
            .fold(0, |acc, state| acc + state.active() as u8)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidrs(&self) -> Vec<u64> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_mpidr())
            .collect()
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_states(&self) -> Vec<CpuState> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
            .collect()
    }

    pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
        self.config
            .topology
            .clone()
            .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
    }

    pub fn create_madt(&self) -> Sdt {
        use crate::acpi;
        // This is also checked in the commandline parsing.
        assert!(self.config.boot_vcpus <= self.config.max_vcpus);

        let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
        #[cfg(target_arch = "x86_64")]
        {
            madt.write(36, arch::layout::APIC_START.0);

            for cpu in 0..self.config.max_vcpus {
                let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology());

                let lapic = LocalX2Apic {
                    r#type: acpi::ACPI_X2APIC_PROCESSOR,
                    length: 16,
                    processor_id: cpu.into(),
                    apic_id: x2apic_id,
                    flags: if cpu < self.config.boot_vcpus {
                        1 << MADT_CPU_ENABLE_FLAG
                    } else {
                        0
                    } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
                    _reserved: 0,
                };
                madt.append(lapic);
            }

            madt.append(Ioapic {
                r#type: acpi::ACPI_APIC_IO,
                length: 12,
                ioapic_id: 0,
                apic_address: arch::layout::IOAPIC_START.0 as u32,
                gsi_base: 0,
                ..Default::default()
            });

            madt.append(InterruptSourceOverride {
                r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
                length: 10,
                bus: 0,
                source: 4,
                gsi: 4,
                flags: 0,
            });
        }

        #[cfg(target_arch = "aarch64")]
        {
            /* Notes:
             * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
             */

            // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
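            // One GICC entry is emitted per boot vCPU below; flags = 1 marks the CPU interface
            // as enabled.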
            for cpu in 0..self.config.boot_vcpus {
                let vcpu = &self.vcpus[cpu as usize];
                let mpidr = vcpu.lock().unwrap().get_mpidr();
                /* ARMv8 MPIDR format:
                     Bits [63:40] Must be zero
                     Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
                     Bits [31:24] Must be zero
                     Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
                     Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
                     Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
                */
                let mpidr_mask = 0xff_00ff_ffff;
                let gicc = GicC {
                    r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
                    length: 80,
                    reserved0: 0,
                    cpu_interface_number: cpu as u32,
                    uid: cpu as u32,
                    flags: 1,
                    parking_version: 0,
                    performance_interrupt: 0,
                    parked_address: 0,
                    base_address: 0,
                    gicv_base_address: 0,
                    gich_base_address: 0,
                    vgic_interrupt: 0,
                    gicr_base_address: 0,
                    mpidr: mpidr & mpidr_mask,
                    proc_power_effi_class: 0,
                    reserved1: 0,
                    spe_overflow_interrupt: 0,
                };

                madt.append(gicc);
            }
            let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());

            // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
            let gicd = GicD {
                r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
                length: 24,
                reserved0: 0,
                gic_id: 0,
                base_address: vgic_config.dist_addr,
                global_irq_base: 0,
                version: 3,
                reserved1: [0; 3],
            };
            madt.append(gicd);

            // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
            let gicr = GicR {
                r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
                length: 16,
                reserved: 0,
                base_address: vgic_config.redists_addr,
                range_length: vgic_config.redists_size as u32,
            };
            madt.append(gicr);

            // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
            let gicits = GicIts {
                r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
                length: 20,
                reserved0: 0,
                translation_id: 0,
                base_address: vgic_config.msi_addr,
                reserved1: 0,
            };
            madt.append(gicits);

            madt.update_checksum();
        }

        madt
    }

    #[cfg(target_arch = "aarch64")]
    pub fn create_pptt(&self) -> Sdt {
        let pptt_start = 0;
        let mut cpus = 0;
        let mut uid = 0;
        // If topology is not specified, the default setting is:
        // 1 package, multiple cores, 1 thread per core
        // This is also the behavior when PPTT is missing.
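        // For example, with no explicit topology and max_vcpus() == 8, the tuple below is
        // (threads_per_core, cores_per_package, packages) = (1, 8, 1).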
        let (threads_per_core, cores_per_package, packages) =
            self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));

        let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);

        for cluster_idx in 0..packages {
            if cpus < self.config.boot_vcpus as usize {
                let cluster_offset = pptt.len() - pptt_start;
                let cluster_hierarchy_node = ProcessorHierarchyNode {
                    r#type: 0,
                    length: 20,
                    reserved: 0,
                    flags: 0x2,
                    parent: 0,
                    acpi_processor_id: cluster_idx as u32,
                    num_private_resources: 0,
                };
                pptt.append(cluster_hierarchy_node);

                for core_idx in 0..cores_per_package {
                    let core_offset = pptt.len() - pptt_start;

                    if threads_per_core > 1 {
                        let core_hierarchy_node = ProcessorHierarchyNode {
                            r#type: 0,
                            length: 20,
                            reserved: 0,
                            flags: 0x2,
                            parent: cluster_offset as u32,
                            acpi_processor_id: core_idx as u32,
                            num_private_resources: 0,
                        };
                        pptt.append(core_hierarchy_node);

                        for _thread_idx in 0..threads_per_core {
                            let thread_hierarchy_node = ProcessorHierarchyNode {
                                r#type: 0,
                                length: 20,
                                reserved: 0,
                                flags: 0xE,
                                parent: core_offset as u32,
                                acpi_processor_id: uid as u32,
                                num_private_resources: 0,
                            };
                            pptt.append(thread_hierarchy_node);
                            uid += 1;
                        }
                    } else {
                        let thread_hierarchy_node = ProcessorHierarchyNode {
                            r#type: 0,
                            length: 20,
                            reserved: 0,
                            flags: 0xA,
                            parent: cluster_offset as u32,
                            acpi_processor_id: uid as u32,
                            num_private_resources: 0,
                        };
                        pptt.append(thread_hierarchy_node);
                        uid += 1;
                    }
                }
                cpus += (cores_per_package * threads_per_core) as usize;
            }
        }

        pptt.update_checksum();
        pptt
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn create_standard_regs(&self, cpu_id: u8) -> StandardRegisters {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .create_standard_regs()
    }

    #[cfg(feature = "guest_debug")]
    fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_regs()
            .map_err(Error::CpuDebug)
    }

    #[cfg(feature = "guest_debug")]
    fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .set_regs(regs)
            .map_err(Error::CpuDebug)
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sregs()
            .map_err(Error::CpuDebug)
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .set_sregs(sregs)
            .map_err(Error::CpuDebug)
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn translate_gva(
        &self,
        _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: u8,
        gva: u64,
    ) -> Result<u64> {
        let (gpa, _) = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .translate_gva(gva, /* flags: unused */ 0)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        Ok(gpa)
    }

    ///
    /// On AArch64, the `translate_gva` API is not provided by KVM. We implement
    /// it in the VMM by walking through the translation tables.
    ///
    /// Address translation is a big topic; here we only focus on the scenario
    /// that happens in the VMM while debugging the kernel. This `translate_gva`
    /// implementation is restricted to:
    /// - Exception Level 1
    /// - Translating the high address range only (kernel space)
    ///
    /// This implementation supports the following Armv8-A features related to
    /// address translation:
    /// - FEAT_LPA
    /// - FEAT_LVA
    /// - FEAT_LPA2
    ///
    #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
    fn translate_gva(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: u8,
        gva: u64,
    ) -> Result<u64> {
        let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TCR_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TTBR1_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::ID_AA64MMFR0_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;

        // Bit 55 of the VA determines the range, high (0xFFFxxx...)
        // or low (0x000xxx...).
        let high_range = extract_bits_64!(gva, 55, 1);
        if high_range == 0 {
            info!("VA (0x{:x}) range is not supported!", gva);
            return Ok(gva);
        }

        // High range size offset
        let tsz = extract_bits_64!(tcr_el1, 16, 6);
        // Granule size
        let tg = extract_bits_64!(tcr_el1, 30, 2);
        // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
        let ds = extract_bits_64!(tcr_el1, 59, 1);

        if tsz == 0 {
            info!("VA translation is not ready!");
            return Ok(gva);
        }

        // VA size is determined by TCR_EL1.T1SZ
        let va_size = 64 - tsz;
        // Number of bits in VA consumed in each level of translation
        let stride = match tg {
            3 => 13, // 64KB granule size
            1 => 11, // 16KB granule size
            _ => 9,  // 4KB, default
        };
        // Starting level of walking
        let mut level = 4 - (va_size - 4) / stride;

        // PA or IPA size is determined
        let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
        let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
        // The IPA size in TCR_EL1 and the PA Range in ID_AA64MMFR0_EL1 should match.
        // To be safe, we use the minimum value if they are different.
        let pa_range = std::cmp::min(tcr_ips, pa_range);
        // PA size in bits
        let pa_size = match pa_range {
            0 => 32,
            1 => 36,
            2 => 40,
            3 => 42,
            4 => 44,
            5 => 48,
            6 => 52,
            _ => {
                return Err(Error::TranslateVirtualAddress(anyhow!(format!(
                    "PA range not supported {pa_range}"
                ))))
            }
        };

        let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
        let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
        // If FEAT_LPA2 is present, the translation table descriptor holds
        // 50 bits of the table address of the next level.
        // Otherwise, it is 48 bits.
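        // Worked example: with a 4KB granule (stride = 9), indexmask_grainsize keeps the low
        // stride + 3 = 12 bits, so the descriptor address mask computed below clears the
        // in-page offset bits while keeping the 48- or 50-bit table address.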
        let descaddrmask = if ds == 1 {
            !0u64 >> (64 - 50) // mask with 50 least significant bits
        } else {
            !0u64 >> (64 - 48) // mask with 48 least significant bits
        };
        let descaddrmask = descaddrmask & !indexmask_grainsize;

        // Translation table base address
        let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
        // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
        // address bits [48:51] come from TTBR1_EL1 bits [2:5].
        if pa_size == 52 {
            descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
        }

        // Loop through the tables of each level
        loop {
            // Table offset for current level
            let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
            descaddr |= table_offset;
            descaddr &= !7u64;

            let mut buf = [0; 8];
            guest_memory
                .memory()
                .read(&mut buf, GuestAddress(descaddr))
                .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
            let descriptor = u64::from_le_bytes(buf);

            descaddr = descriptor & descaddrmask;
            // In the case of FEAT_LPA, the next-level translation table address
            // bits [48:51] come from bits [12:15] of the current descriptor.
            // For FEAT_LPA2, the next-level translation table address
            // bits [50:51] come from bits [8:9] of the current descriptor, and
            // bits [48:49] come from bits [48:49] of the descriptor, which were
            // handled previously.
            if pa_size == 52 {
                if ds == 1 {
                    // FEAT_LPA2
                    descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
                } else {
                    // FEAT_LPA
                    descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
                }
            }

            if (descriptor & 2) != 0 && (level < 3) {
                // This is a table entry. Go down to the next level.
                level += 1;
                indexmask = indexmask_grainsize;
                continue;
            }

            break;
        }

        // We have reached either:
        // - a page entry at level 3 or
        // - a block entry at level 1 or 2
        let page_size = 1u64 << ((stride * (4 - level)) + 3);
        descaddr &= !(page_size - 1);
        descaddr |= gva & (page_size - 1);

        Ok(descaddr)
    }

    pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
        self.acpi_address = Some(acpi_address);
    }

    pub(crate) fn set_interrupt_controller(
        &mut self,
        interrupt_controller: Arc<Mutex<dyn InterruptController>>,
    ) {
        self.interrupt_controller = Some(interrupt_controller);
    }

    pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
        &self.vcpus_kill_signalled
    }

    #[cfg(feature = "igvm")]
    pub(crate) fn get_cpuid_leaf(
        &self,
        cpu_id: u8,
        eax: u32,
        ecx: u32,
        xfem: u64,
        xss: u64,
    ) -> Result<[u32; 4]> {
        let leaf_info = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_cpuid_values(eax, ecx, xfem, xss)
            .unwrap();
        Ok(leaf_info)
    }

    #[cfg(feature = "sev_snp")]
    pub(crate) fn sev_snp_enabled(&self) -> bool {
        self.sev_snp_enabled
    }

    pub(crate) fn nmi(&self) -> Result<()> {
        self.vcpus_kick_signalled.store(true, Ordering::SeqCst);

        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        self.vcpus_kick_signalled.store(false, Ordering::SeqCst);

        Ok(())
    }
}

struct Cpu {
    cpu_id: u8,
    proximity_domain: u32,
    dynamic: bool,
    #[cfg(target_arch = "x86_64")]
    topology: Option<(u8, u8, u8)>,
}

#[cfg(target_arch = "x86_64")]
const MADT_CPU_ENABLE_FLAG: usize = 0;

#[cfg(target_arch = "x86_64")]
const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;

impl Cpu {
    #[cfg(target_arch = "x86_64")]
    fn generate_mat(&self) -> Vec<u8> {
        let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);

        let lapic = LocalX2Apic {
            r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
            length: 16,
            processor_id: self.cpu_id.into(),
            apic_id: x2apic_id,
            flags: 1 << MADT_CPU_ENABLE_FLAG,
            _reserved: 0,
        };

        let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
        // SAFETY: mat_data is large enough to hold lapic
        unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };

        mat_data
    }
}

impl Aml for Cpu {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        #[cfg(target_arch = "x86_64")]
        let mat_data: Vec<u8> = self.generate_mat();
        #[allow(clippy::if_same_then_else)]
        if self.dynamic {
            aml::Device::new(
                format!("C{:03X}", self.cpu_id).as_str().into(),
                vec![
                    &aml::Name::new("_HID".into(), &"ACPI0007"),
                    &aml::Name::new("_UID".into(), &self.cpu_id),
                    // Currently, AArch64 cannot support the following fields.
                    /*
                    _STA return value:
                    Bit [0] – Set if the device is present.
                    Bit [1] – Set if the device is enabled and decoding its resources.
                    Bit [2] – Set if the device should be shown in the UI.
                    Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
                    Bit [4] – Set if the battery is present.
                    Bits [31:5] – Reserved (must be cleared).
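                    A return value of 0xF therefore reports: present, enabled, shown in UI, and functioning.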
                    */
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_STA".into(),
                        0,
                        false,
                        // Call into CSTA method which will interrogate device
                        vec![&aml::Return::new(&aml::MethodCall::new(
                            "CSTA".into(),
                            vec![&self.cpu_id],
                        ))],
                    ),
                    &aml::Method::new(
                        "_PXM".into(),
                        0,
                        false,
                        vec![&aml::Return::new(&self.proximity_domain)],
                    ),
                    // The Linux kernel expects every CPU device to have a _MAT entry
                    // containing the LAPIC for this processor with the enabled bit set
                    // even if it is disabled in the MADT (non-boot CPU)
                    #[cfg(target_arch = "x86_64")]
                    &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
                    // Trigger CPU ejection
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_EJ0".into(),
                        1,
                        false,
                        // Call into CEJ0 method which will actually eject device
                        vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
                    ),
                ],
            )
            .to_aml_bytes(sink);
        } else {
            aml::Device::new(
                format!("C{:03X}", self.cpu_id).as_str().into(),
                vec![
                    &aml::Name::new("_HID".into(), &"ACPI0007"),
                    &aml::Name::new("_UID".into(), &self.cpu_id),
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_STA".into(),
                        0,
                        false,
                        // Mark CPU present, see CSTA implementation
                        vec![&aml::Return::new(&0xfu8)],
                    ),
                    &aml::Method::new(
                        "_PXM".into(),
                        0,
                        false,
                        vec![&aml::Return::new(&self.proximity_domain)],
                    ),
                    // The Linux kernel expects every CPU device to have a _MAT entry
                    // containing the LAPIC for this processor with the enabled bit set
                    // even if it is disabled in the MADT (non-boot CPU)
                    #[cfg(target_arch = "x86_64")]
                    &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
                ],
            )
            .to_aml_bytes(sink);
        }
    }
}

struct CpuNotify {
    cpu_id: u8,
}

impl Aml for CpuNotify {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .to_aml_bytes(sink)
    }
}

struct CpuMethods {
    max_vcpus: u8,
    dynamic: bool,
}

impl Aml for CpuMethods {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        if self.dynamic {
            // CPU status method
            aml::Method::new(
                "CSTA".into(),
                1,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                    &aml::If::new(
                        &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
                        vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                    // Return 0 or 0xf
                    &aml::Return::new(&aml::Local(0)),
                ],
            )
            .to_aml_bytes(sink);

            let mut cpu_notifies = Vec::new();
            for cpu_id in 0..self.max_vcpus {
                cpu_notifies.push(CpuNotify { cpu_id });
            }
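            // The CTFY method below takes its children as a Vec<&dyn Aml>, so
            // collect borrows of the CpuNotify values in a second pass; the owning
            // vector above must stay alive while these references are in use.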
            let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
            for cpu_id in 0..self.max_vcpus {
                cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
            }

            aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);

            aml::Method::new(
                "CEJ0".into(),
                1,
                true,
                vec![
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    // Set CEJ0 bit
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(sink);

            aml::Method::new(
                "CSCN".into(),
                0,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    &aml::While::new(
                        &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
                        vec![
                            // Write CPU number (in first argument) to I/O port via field
                            &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
                            // Check if CINS bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
                                // Notify device if it is
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &aml::ONE],
                                    ),
                                    // Reset CINS bit
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CINS"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            // Check if CRMV bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
                                // Notify device if it is (with the eject constant 0x3)
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &3u8],
                                    ),
                                    // Reset CRMV bit
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CRMV"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                        ],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(sink)
        } else {
            aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
        }
    }
}

impl Aml for CpuManager {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        #[cfg(target_arch = "x86_64")]
        if let Some(acpi_address) = self.acpi_address {
            // CPU hotplug controller
            aml::Device::new(
                "_SB_.PRES".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose CPU and then read back status
                    &aml::Mutex::new("CPLK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCacheable::NotCacheable,
                            true,
                            acpi_address.0,
                            acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
                            None,
                        )]),
                    ),
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "PRST".into(),
                        aml::OpRegionSpace::SystemMemory,
                        &(acpi_address.0 as usize),
                        &CPU_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CPEN", 1),
                            aml::FieldEntry::Named(*b"CINS", 1),
                            aml::FieldEntry::Named(*b"CRMV", 1),
                            aml::FieldEntry::Named(*b"CEJ0", 1),
                            aml::FieldEntry::Reserved(4),
                            aml::FieldEntry::Named(*b"CCMD", 8),
                        ],
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"CSEL", 32),
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CDAT", 32),
                        ],
                    ),
                ],
            )
            .to_aml_bytes(sink);
        }

        // CPU devices
        let hid = aml::Name::new("_HID".into(), &"ACPI0010");
        let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
        // Bundle methods together under a common object
        let methods = CpuMethods {
            max_vcpus: self.config.max_vcpus,
            dynamic: self.dynamic,
        };
        let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];

        #[cfg(target_arch = "x86_64")]
        let topology = self.get_vcpu_topology();
        let mut cpu_devices = Vec::new();
        for cpu_id in 0..self.config.max_vcpus {
            let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
            let cpu_device = Cpu {
                cpu_id,
                proximity_domain,
                dynamic: self.dynamic,
                #[cfg(target_arch = "x86_64")]
                topology,
            };

            cpu_devices.push(cpu_device);
        }

        for cpu_device in cpu_devices.iter() {
            cpu_data_inner.push(cpu_device);
        }

        aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
    }
}

impl Pausable for CpuManager {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        // Tell the vCPUs to pause themselves next time they exit
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        for vcpu in self.vcpus.iter() {
            let mut vcpu = vcpu.lock().unwrap();
            vcpu.pause()?;
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            if !self.config.kvm_hyperv {
                vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
                    MigratableError::Pause(anyhow!(
                        "Could not notify guest it has been paused {:?}",
                        e
                    ))
                })?;
            }
        }

        // The vCPU thread will change its paused state before parking; wait here for each
        // activated vCPU to change its state, to ensure they have all parked.
        for state in self.vcpu_states.iter() {
            if state.active() {
                while !state.paused.load(Ordering::SeqCst) {
                    // To avoid a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }

        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        for vcpu in self.vcpus.iter() {
            vcpu.lock().unwrap().resume()?;
        }

        // Toggle the vCPUs' pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the vCPU threads.
        // Once unparked, the next thing they will do is check the pause boolean.
        // Since it is now set to false, they will exit their pause loop and go
        // back to running guest code.
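        // Clear each per-vCPU `paused` flag (set by the thread as it parked) before
        // unparking, so the state observed after wake-up matches the cleared pause
        // boolean above.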
        for state in self.vcpu_states.iter() {
            state.paused.store(false, Ordering::SeqCst);
            state.unpark_thread();
        }
        Ok(())
    }
}

impl Snapshottable for CpuManager {
    fn id(&self) -> String {
        CPU_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut cpu_manager_snapshot = Snapshot::default();

        // The CpuManager snapshot is a collection of all vCPU snapshots.
        for vcpu in &self.vcpus {
            let mut vcpu = vcpu.lock().unwrap();
            cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
        }

        Ok(cpu_manager_snapshot)
    }
}

impl Transportable for CpuManager {}
impl Migratable for CpuManager {}

#[cfg(feature = "guest_debug")]
impl Debuggable for CpuManager {
    #[cfg(feature = "kvm")]
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.vcpus[cpu_id]
            .lock()
            .unwrap()
            .vcpu
            .set_guest_debug(addrs, singlestep)
            .map_err(DebuggableError::SetDebug)
    }

    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        Ok(())
    }

    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let regs = [
            gregs.get_rax(),
            gregs.get_rbx(),
            gregs.get_rcx(),
            gregs.get_rdx(),
            gregs.get_rsi(),
            gregs.get_rdi(),
            gregs.get_rbp(),
            gregs.get_rsp(),
            gregs.get_r8(),
            gregs.get_r9(),
            gregs.get_r10(),
            gregs.get_r11(),
            gregs.get_r12(),
            gregs.get_r13(),
            gregs.get_r14(),
            gregs.get_r15(),
        ];

        // GDB exposes 32-bit eflags instead of 64-bit rflags.
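        // The upper 32 bits of RFLAGS are reserved on x86-64, so truncating to u32
        // loses no architecturally defined flag state.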
        // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
        let eflags = gregs.get_rflags() as u32;
        let rip = gregs.get_rip();

        // Segment registers: CS, SS, DS, ES, FS, GS
        let sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let segments = X86SegmentRegs {
            cs: sregs.cs.selector as u32,
            ss: sregs.ss.selector as u32,
            ds: sregs.ds.selector as u32,
            es: sregs.es.selector as u32,
            fs: sregs.fs.selector as u32,
            gs: sregs.gs.selector as u32,
        };

        // TODO: Add other registers

        Ok(CoreRegs {
            regs,
            eflags,
            rip,
            segments,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "aarch64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        Ok(CoreRegs {
            x: gregs.regs.regs,
            sp: gregs.regs.sp,
            pc: gregs.regs.pc,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let orig_gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let mut gregs = self.create_standard_regs(cpu_id as u8);
        gregs.set_rax(regs.regs[0]);
        gregs.set_rbx(regs.regs[1]);
        gregs.set_rcx(regs.regs[2]);
        gregs.set_rdx(regs.regs[3]);
        gregs.set_rsi(regs.regs[4]);
        gregs.set_rdi(regs.regs[5]);
        gregs.set_rbp(regs.regs[6]);
        gregs.set_rsp(regs.regs[7]);
        gregs.set_r8(regs.regs[8]);
        gregs.set_r9(regs.regs[9]);
        gregs.set_r10(regs.regs[10]);
        gregs.set_r11(regs.regs[11]);
        gregs.set_r12(regs.regs[12]);
        gregs.set_r13(regs.regs[13]);
        gregs.set_r14(regs.regs[14]);
        gregs.set_r15(regs.regs[15]);
        gregs.set_rip(regs.rip);
        // Update only the lower 32 bits of rflags; keep the original upper half.
        gregs.set_rflags((orig_gregs.get_rflags() & !(u32::MAX as u64)) | (regs.eflags as u64));

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // Since GDB cares only about the selectors, we call get_sregs() first.
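        // Reading the current sregs preserves the base/limit/attribute fields;
        // only the selector of each segment register is overwritten below.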
        let mut sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        sregs.cs.selector = regs.segments.cs as u16;
        sregs.ss.selector = regs.segments.ss as u16;
        sregs.ds.selector = regs.segments.ds as u16;
        sregs.es.selector = regs.segments.es as u16;
        sregs.fs.selector = regs.segments.fs as u16;
        sregs.gs.selector = regs.segments.gs as u16;

        self.set_sregs(cpu_id as u8, &sregs)
            .map_err(DebuggableError::WriteRegs)?;

        // TODO: Add other registers

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let mut gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;

        gregs.regs.regs = regs.x;
        gregs.regs.sp = regs.sp;
        gregs.regs.pc = regs.pc;

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        Ok(())
    }

    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        let mut buf = vec![0; len];
        let mut total_read = 0_u64;

        while total_read < len as u64 {
            let gaddr = vaddr.0 + total_read;
            let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
            guest_memory
                .memory()
                .read(
                    &mut buf[total_read as usize..total_read as usize + read_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::ReadMem)?;
            total_read += read_len;
        }
        Ok(buf)
    }

    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        let mut total_written = 0_u64;

        while total_written < data.len() as u64 {
            let gaddr = vaddr.0 + total_written;
            let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let write_len = std::cmp::min(
                data.len() as u64 - total_written,
                psize - (paddr & (psize - 1)),
            );
            guest_memory
                .memory()
                .write(
                    &data[total_written as usize..total_written as usize + write_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::WriteMem)?;
            total_written += write_len;
        }
        Ok(())
    }

    fn active_vcpus(&self) -> usize {
        self.present_vcpus() as usize
    }
}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for CpuManager {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl CpuElf64Writable for CpuManager {
    fn cpu_write_elf64_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Elf, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<X86_64ElfPrStatus>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: NT_PRSTATUS,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
            buf.resize(pos + 4, 0);
            buf.splice(pos.., "CORE".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
            buf.resize(pos + 32 + 4, 0);
            let pid = vcpu_id as u64;
            let bytes: &[u8] = pid.as_slice();
            buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */

            pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();

            let orig_rax: u64 = 0;
            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.get_r15(),
                gregs.get_r14(),
                gregs.get_r13(),
                gregs.get_r12(),
                gregs.get_rbp(),
                gregs.get_rbx(),
                gregs.get_r11(),
                gregs.get_r10(),
            ];
            let regs2 = [
                gregs.get_r9(),
                gregs.get_r8(),
                gregs.get_rax(),
                gregs.get_rcx(),
                gregs.get_rdx(),
                gregs.get_rsi(),
                gregs.get_rdi(),
                orig_rax,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            debug!(
                "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
                gregs.get_rip(),
                gregs.get_rsp(),
                sregs.gs.base,
                sregs.cs.selector,
                sregs.ss.selector,
                sregs.ds.selector,
            );

            let regs = X86_64UserRegs {
                regs1,
                regs2,
                rip: gregs.get_rip(),
                cs: sregs.cs.selector as u64,
                eflags: gregs.get_rflags(),
                rsp: gregs.get_rsp(),
                ss: sregs.ss.selector as u64,
                fs_base: sregs.fs.base,
                gs_base: sregs.gs.base,
                ds: sregs.ds.selector as u64,
                es: sregs.es.selector as u64,
                fs: sregs.fs.selector as u64,
                gs: sregs.gs.selector as u64,
            };

            // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) };
            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }

    fn cpu_write_vmm_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Vmm, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<DumpCpusState>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: 0,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);

            buf.resize(pos + 4, 0);
            buf.splice(pos.., "QEMU".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);

            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.get_rax(),
                gregs.get_rbx(),
                gregs.get_rcx(),
                gregs.get_rdx(),
                gregs.get_rsi(),
                gregs.get_rdi(),
                gregs.get_rsp(),
                gregs.get_rbp(),
            ];

            let regs2 = [
                gregs.get_r8(),
                gregs.get_r9(),
                gregs.get_r10(),
                gregs.get_r11(),
                gregs.get_r12(),
                gregs.get_r13(),
                gregs.get_r14(),
                gregs.get_r15(),
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            let mut msrs = vec![MsrEntry {
                index: msr_index::MSR_KERNEL_GS_BASE,
                ..Default::default()
            }];

            self.vcpus[vcpu_id as usize]
                .lock()
                .unwrap()
                .vcpu
                .get_msrs(&mut msrs)
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
            let kernel_gs_base = msrs[0].data;

            let cs = CpuSegment::new(sregs.cs);
            let ds = CpuSegment::new(sregs.ds);
            let es = CpuSegment::new(sregs.es);
            let fs = CpuSegment::new(sregs.fs);
            let gs = CpuSegment::new(sregs.gs);
            let ss = CpuSegment::new(sregs.ss);
            let ldt = CpuSegment::new(sregs.ldt);
            let tr = CpuSegment::new(sregs.tr);
            let gdt = CpuSegment::new_from_table(sregs.gdt);
            let idt = CpuSegment::new_from_table(sregs.idt);
            let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
            let regs = DumpCpusState {
                version: 1,
                size: size_of::<DumpCpusState>() as u32,
                regs1,
                regs2,
                rip: gregs.get_rip(),
                rflags: gregs.get_rflags(),
                cs,
                ds,
                es,
                fs,
                gs,
                ss,
                ldt,
                tr,
                gdt,
                idt,
                cr,
                kernel_gs_base,
            };

            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use arch::layout::BOOT_STACK_POINTER;
    use arch::layout::ZERO_PAGE_START;
    use arch::x86_64::interrupts::*;
    use arch::x86_64::regs::*;
    use hypervisor::arch::x86::{FpuState, LapicState};
    use hypervisor::StandardRegisters;
    use linux_loader::loader::bootparam::setup_header;

    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail if there is no irqchip beforehand.
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
        let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
        let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should
        // remove it at all.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }];

        // get_msrs returns the number of MSRs that it succeeded in reading. We only want to
        // read one in this test case scenario.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we did setup_msrs. We need to assert that the
        // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
        // expect.
        let entry_vec = vcpu.boot_msr_entries();
        assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
    }

    #[test]
    fn test_setup_regs_for_pvh() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
        expected_regs.set_rflags(0x0000000000000002u64);
        expected_regs.set_rbx(arch::layout::PVH_INFO_START.0);
        expected_regs.set_rip(1);

        setup_regs(
            &vcpu,
            arch::EntryPoint {
                entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
                setup_header: None,
            },
        )
        .unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }

    #[test]
    fn test_setup_regs_for_bzimage() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
        expected_regs.set_rflags(0x0000000000000002u64);
        expected_regs.set_rip(1);
        expected_regs.set_rsp(BOOT_STACK_POINTER.0);
        expected_regs.set_rsi(ZERO_PAGE_START.0);

        setup_regs(
            &vcpu,
            arch::EntryPoint {
                entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
                setup_header: Some(setup_header {
                    ..Default::default()
                }),
            },
        )
        .unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use arch::{aarch64::regs, layout};
    use hypervisor::kvm::aarch64::is_system_register;
    use hypervisor::kvm::kvm_bindings::{
        kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
        KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
    };
    use hypervisor::{arm64_core_reg_id, offset_of};
    use std::mem;

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
        // Must fail when vcpu is not initialized yet.
        assert!(res.is_err());

        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();
        vcpu.vcpu_init(&kvi).unwrap();

        assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
    }

    #[test]
    fn test_read_mpidr() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());

        vcpu.vcpu_init(&kvi).unwrap();
        assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
    }

    #[test]
    fn test_is_system_register() {
        let offset = offset_of!(user_pt_regs, pc);
        let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
        assert!(!is_system_register(regid));
        let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
        assert!(is_system_register(regid));
    }

    #[test]
    fn test_save_restore_core_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        let res = vcpu.get_regs();
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to get core register: Exec format error (os error 8)"
        );

        let mut state = kvm_regs::default();
        let res = vcpu.set_regs(&state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to set core register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        let res = vcpu.get_regs();
        assert!(res.is_ok());
        state = res.unwrap();
        assert_eq!(state.regs.pstate, 0x3C5);

        assert!(vcpu.set_regs(&state).is_ok());
    }

    #[test]
    fn test_get_set_mpstate() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        let res = vcpu.get_mp_state();
        assert!(res.is_ok());
        assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
    }
}