// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use crate::config::CpusConfig;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
    GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
    NT_PRSTATUS,
};
#[cfg(feature = "guest_debug")]
use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
#[cfg(target_arch = "x86_64")]
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::GuestMemoryMmap;
use crate::CPU_MANAGER_SNAPSHOT_ID;
use acpi_tables::{aml, sdt::Sdt, Aml};
use anyhow::anyhow;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use arch::aarch64::regs;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::get_x2apic_id;
use arch::EntryPoint;
use arch::NumaNodes;
#[cfg(target_arch = "aarch64")]
use devices::gic::Gic;
use devices::interrupt_controller::InterruptController;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use hypervisor::aarch64::StandardRegisters;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::msr_index;
#[cfg(target_arch = "x86_64")]
use hypervisor::arch::x86::CpuIdEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::MsrEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
#[cfg(target_arch = "aarch64")]
use hypervisor::kvm::kvm_bindings;
#[cfg(all(target_arch = "aarch64", feature = "kvm"))]
use hypervisor::kvm::kvm_ioctls::Cap;
#[cfg(feature = "tdx")]
use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuVendor;
use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps};
use libc::{c_void, siginfo_t};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use linux_loader::elf::Elf64_Nhdr;
use seccompiler::{apply_filter, SeccompAction};
use std::collections::BTreeMap;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::io::Write;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::mem::size_of;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};
use thiserror::Error;
use tracer::trace_scoped;
use vm_device::BusDevice;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use vm_memory::ByteValued;
#[cfg(feature = "guest_debug")]
use vm_memory::{Bytes, GuestAddressSpace};
use vm_memory::{GuestAddress, GuestMemoryAtomic};
use vm_migration::{
    snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
use zerocopy::AsBytes;

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
macro_rules! extract_bits_64_without_offset {
    ($value: tt, $length: tt) => {
        $value & (!0u64 >> (64 - $length))
    };
}

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[error("Still pending removed vcpu")]
    VcpuPendingRemovedVcpu,

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),

    #[error("Maximum number of vCPUs exceeds host limit")]
    MaximumVcpusExceeded,

    #[cfg(feature = "sev_snp")]
    #[error("Failed to set sev control register: {0}")]
    SetSevControlRegister(#[source] hypervisor::HypervisorCpuError),
}
pub type Result<T> = result::Result<T, Error>;

#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct LocalX2Apic {
    pub r#type: u8,
    pub length: u8,
    pub _reserved: u16,
    pub apic_id: u32,
    pub flags: u32,
    pub processor_id: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default, AsBytes)]
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
    pub ioapic_id: u8,
    _reserved: u8,
    pub apic_address: u32,
    pub gsi_base: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicC {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub cpu_interface_number: u32,
    pub uid: u32,
    pub flags: u32,
    pub parking_version: u32,
    pub performance_interrupt: u32,
    pub parked_address: u64,
    pub base_address: u64,
    pub gicv_base_address: u64,
    pub gich_base_address: u64,
    pub vgic_interrupt: u32,
    pub gicr_base_address: u64,
    pub mpidr: u64,
    pub proc_power_effi_class: u8,
    pub reserved1: u8,
    pub spe_overflow_interrupt: u16,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicD {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub gic_id: u32,
    pub base_address: u64,
    pub global_irq_base: u32,
    pub version: u8,
    pub reserved1: [u8; 3],
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicR {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub base_address: u64,
    pub range_length: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicIts {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub translation_id: u32,
    pub base_address: u64,
    pub reserved1: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct ProcessorHierarchyNode {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub flags: u32,
    pub parent: u32,
    pub acpi_processor_id: u32,
    pub num_private_resources: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default, AsBytes)]
struct InterruptSourceOverride {
    pub r#type: u8,
    pub length: u8,
    pub bus: u8,
    pub source: u8,
    pub gsi: u32,
    pub flags: u16,
}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
macro_rules! round_up {
    ($n:expr,$d:expr) => {
        (($n / ($d + 1)) + 1) * $d
    };
}

/// A wrapper around creating and using a kvm-based VCPU.
pub struct Vcpu {
    // The hypervisor abstracted CPU.
    vcpu: Arc<dyn hypervisor::Vcpu>,
    id: u8,
    #[cfg(target_arch = "aarch64")]
    mpidr: u64,
    saved_state: Option<CpuState>,
    #[cfg(target_arch = "x86_64")]
    vendor: CpuVendor,
}

impl Vcpu {
    /// Constructs a new VCPU for `vm`.
    ///
    /// # Arguments
    ///
    /// * `id` - Represents the CPU number between [0, max vcpus).
    /// * `vm` - The virtual machine this vcpu will get attached to.
    /// * `vm_ops` - Optional object for exit handling.
    /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0)
    pub fn new(
        id: u8,
        apic_id: u8,
        vm: &Arc<dyn hypervisor::Vm>,
        vm_ops: Option<Arc<dyn VmOps>>,
        #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor,
    ) -> Result<Self> {
        let vcpu = vm
            .create_vcpu(apic_id, vm_ops)
            .map_err(|e| Error::VcpuCreate(e.into()))?;
        // Initially the cpuid per vCPU is the one supported by this VM.
        Ok(Vcpu {
            vcpu,
            id,
            #[cfg(target_arch = "aarch64")]
            mpidr: 0,
            saved_state: None,
            #[cfg(target_arch = "x86_64")]
            vendor: cpu_vendor,
        })
    }

    /// Configures a vCPU; should be called once per vCPU when it is created.
    ///
    /// # Arguments
    ///
    /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
    /// * `guest_memory` - Guest memory.
    /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
    pub fn configure(
        &mut self,
        #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
        boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
        #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
        #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
        #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
    ) -> Result<()> {
        #[cfg(target_arch = "aarch64")]
        {
            self.init(vm)?;
            self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
                .map_err(Error::VcpuConfiguration)?;
        }
        info!("Configuring vCPU: cpu_id = {}", self.id);
        #[cfg(target_arch = "x86_64")]
        arch::configure_vcpu(
            &self.vcpu,
            self.id,
            boot_setup,
            cpuid,
            kvm_hyperv,
            self.vendor,
            topology,
        )
        .map_err(Error::VcpuConfiguration)?;

        Ok(())
    }

    /// Gets the MPIDR register value.
    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidr(&self) -> u64 {
        self.mpidr
    }

    /// Gets the saved vCPU state.
    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_state(&self) -> Option<CpuState> {
        self.saved_state.clone()
    }

    /// Initializes an aarch64 specific vcpu for booting Linux.
    #[cfg(target_arch = "aarch64")]
    pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
        let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();

        // This reads back the kernel's preferred target type.
        vm.get_preferred_target(&mut kvi)
            .map_err(Error::VcpuArmPreferredTarget)?;
        // We already checked that the capability is supported.
        kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
        if vm
            .as_any()
            .downcast_ref::<hypervisor::kvm::KvmVm>()
            .unwrap()
            .check_extension(Cap::ArmPmuV3)
        {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
        }
        // Non-boot cpus are powered off initially.
        if self.id > 0 {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
        }
        self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
    }

    /// Runs the VCPU until it exits, returning the reason.
    ///
    /// Note that the state of the VCPU and associated VM must be set up first for this to do
    /// anything useful.
    pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
        self.vcpu.run()
    }

    #[cfg(feature = "sev_snp")]
    pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> {
        self.vcpu
            .set_sev_control_register(vmsa_pfn)
            .map_err(Error::SetSevControlRegister)
    }
}

impl Pausable for Vcpu {}
impl Snapshottable for Vcpu {
    fn id(&self) -> String {
        self.id.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let saved_state = self
            .vcpu
            .state()
            .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?;

        self.saved_state = Some(saved_state.clone());

        Ok(Snapshot::from_data(SnapshotData::new_from_state(
            &saved_state,
        )?))
    }
}

pub struct CpuManager {
    config: CpusConfig,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
    #[cfg(target_arch = "x86_64")]
    cpuid: Vec<CpuIdEntry>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm: Arc<dyn hypervisor::Vm>,
    vcpus_kill_signalled: Arc<AtomicBool>,
    vcpus_pause_signalled: Arc<AtomicBool>,
    exit_evt: EventFd,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    reset_evt: EventFd,
    #[cfg(feature = "guest_debug")]
    vm_debug_evt: EventFd,
    vcpu_states: Vec<VcpuState>,
    selected_cpu: u8,
    vcpus: Vec<Arc<Mutex<Vcpu>>>,
    seccomp_action: SeccompAction,
    vm_ops: Arc<dyn VmOps>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    acpi_address: Option<GuestAddress>,
    proximity_domain_per_cpu: BTreeMap<u8, u32>,
    affinity: BTreeMap<u8, Vec<usize>>,
    dynamic: bool,
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    #[cfg(feature = "sev_snp")]
    sev_snp_enabled: bool,
}

const CPU_ENABLE_FLAG: usize = 0;
const CPU_INSERTING_FLAG: usize = 1;
const CPU_REMOVING_FLAG: usize = 2;
const CPU_EJECT_FLAG: usize = 3;

const CPU_STATUS_OFFSET: u64 = 4;
const CPU_SELECTION_OFFSET: u64 = 0;

impl BusDevice for CpuManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
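        // Register layout, matching the constants above: offset 0 (CPU_SELECTION_OFFSET)
        // selects a vCPU, offset 4 (CPU_STATUS_OFFSET) exposes that vCPU's state as a
        // bitmask of the CPU_*_FLAG bits (enable, inserting, removing, eject).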
        data.fill(0);

        match offset {
            CPU_SELECTION_OFFSET => {
                data[0] = self.selected_cpu;
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &self.vcpu_states[usize::from(self.selected_cpu)];
                    if state.active() {
                        data[0] |= 1 << CPU_ENABLE_FLAG;
                    }
                    if state.inserting {
                        data[0] |= 1 << CPU_INSERTING_FLAG;
                    }
                    if state.removing {
                        data[0] |= 1 << CPU_REMOVING_FLAG;
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            CPU_SELECTION_OFFSET => {
                self.selected_cpu = data[0];
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
                    // The ACPI code writes back a 1 to acknowledge the insertion
                    if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
                        && state.inserting
                    {
                        state.inserting = false;
                    }
                    // Ditto for removal
                    if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
                        && state.removing
                    {
                        state.removing = false;
                    }
                    // Trigger removal of vCPU
                    if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
                        if let Err(e) = self.remove_vcpu(self.selected_cpu) {
                            error!("Error removing vCPU: {:?}", e);
                        }
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
        None
    }
}

#[derive(Default)]
struct VcpuState {
    inserting: bool,
    removing: bool,
    pending_removal: Arc<AtomicBool>,
    handle: Option<thread::JoinHandle<()>>,
    kill: Arc<AtomicBool>,
    vcpu_run_interrupted: Arc<AtomicBool>,
    paused: Arc<AtomicBool>,
}

impl VcpuState {
    fn active(&self) -> bool {
        self.handle.is_some()
    }

    fn signal_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            loop {
                // SAFETY: FFI call with correct arguments
                unsafe {
                    libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
                }
                if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
                    break;
                } else {
                    // This is more effective than thread::yield_now() at
                    // avoiding a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }
    }

    fn join_thread(&mut self) -> Result<()> {
        if let Some(handle) = self.handle.take() {
            handle.join().map_err(Error::ThreadCleanup)?
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}

impl CpuManager {
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vm_ops: Arc<dyn VmOps>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        numa_nodes: &NumaNodes,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
            return Err(Error::MaximumVcpusExceeded);
        }

        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
        let hypervisor_type = hypervisor.hypervisor_type();
        #[cfg(target_arch = "x86_64")]
        let cpu_vendor = hypervisor.get_cpu_vendor();

        #[cfg(target_arch = "x86_64")]
        if config.features.amx {
            const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
            const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
            const XFEATURE_XTILEDATA: usize = 18;
            const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;

            // SAFETY: the syscall is only modifying kernel internal
            // data structures that the kernel is itself expected to safeguard.
            let amx_tile = unsafe {
                libc::syscall(
                    libc::SYS_arch_prctl,
                    ARCH_REQ_XCOMP_GUEST_PERM,
                    XFEATURE_XTILEDATA,
                )
            };

            if amx_tile != 0 {
                return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
            } else {
                let mask: usize = 0;
                // SAFETY: the mask being modified (not marked mutable as it is
                // modified in unsafe only which is permitted) isn't in use elsewhere.
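                // Read the guest xfeature permission mask back so the check below can
                // verify that the XTILEDATA permission was actually granted.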
                let result = unsafe {
                    libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
                };
                if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
                    return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
                }
            }
        }

        let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
            let mut cpu_list = Vec::new();
            for (proximity_domain, numa_node) in numa_nodes.iter() {
                for cpu in numa_node.cpus.iter() {
                    cpu_list.push((*cpu, *proximity_domain))
                }
            }
            cpu_list
        }
        .into_iter()
        .collect();

        let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
            cpu_affinity
                .iter()
                .map(|a| (a.vcpu, a.host_cpus.clone()))
                .collect()
        } else {
            BTreeMap::new()
        };

        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        Ok(Arc::new(Mutex::new(CpuManager {
            config: config.clone(),
            interrupt_controller: None,
            #[cfg(target_arch = "x86_64")]
            cpuid: Vec::new(),
            vm,
            vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
            vcpu_states,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            selected_cpu: 0,
            vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
            seccomp_action,
            vm_ops,
            acpi_address: None,
            proximity_domain_per_cpu,
            affinity,
            dynamic,
            hypervisor: hypervisor.clone(),
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
        })))
    }

    #[cfg(target_arch = "x86_64")]
    pub fn populate_cpuid(
        &mut self,
        memory_manager: &Arc<Mutex<MemoryManager>>,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx: bool,
    ) -> Result<()> {
        let sgx_epc_sections = memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());

        self.cpuid = {
            let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
            arch::generate_common_cpuid(
                hypervisor,
                &arch::CpuidConfig {
                    sgx_epc_sections,
                    phys_bits,
                    kvm_hyperv: self.config.kvm_hyperv,
                    #[cfg(feature = "tdx")]
                    tdx,
                    amx: self.config.features.amx,
                },
            )
            .map_err(Error::CommonCpuId)?
        };

        Ok(())
    }

    fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
        info!("Creating vCPU: cpu_id = {}", cpu_id);

        #[cfg(target_arch = "x86_64")]
        let topology = self.get_vcpu_topology();
        #[cfg(target_arch = "x86_64")]
        let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology);
        #[cfg(target_arch = "aarch64")]
        let x2apic_id = cpu_id as u32;

        let mut vcpu = Vcpu::new(
            cpu_id,
            x2apic_id as u8,
            &self.vm,
            Some(self.vm_ops.clone()),
            #[cfg(target_arch = "x86_64")]
            self.hypervisor.get_cpu_vendor(),
        )?;

        if let Some(snapshot) = snapshot {
            // AArch64 vCPUs should be initialized after creation.
            #[cfg(target_arch = "aarch64")]
            vcpu.init(&self.vm)?;

            let state: CpuState = snapshot.to_state().map_err(|e| {
                Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
            })?;
            vcpu.vcpu
                .set_state(&state)
                .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;

            vcpu.saved_state = Some(state);
        }

        let vcpu = Arc::new(Mutex::new(vcpu));

        // Adding vCPU to the CpuManager's vCPU list.
        self.vcpus.push(vcpu.clone());

        Ok(vcpu)
    }

    pub fn configure_vcpu(
        &self,
        vcpu: Arc<Mutex<Vcpu>>,
        boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
    ) -> Result<()> {
        let mut vcpu = vcpu.lock().unwrap();

        #[cfg(feature = "sev_snp")]
        if self.sev_snp_enabled {
            if let Some((kernel_entry_point, _)) = boot_setup {
                vcpu.set_sev_control_register(
                    kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE,
                )?;
            }
        }
        #[cfg(target_arch = "x86_64")]
        assert!(!self.cpuid.is_empty());

        #[cfg(target_arch = "x86_64")]
        let topology = self.config.topology.clone().map_or_else(
            || {
                #[cfg(feature = "mshv")]
                if matches!(self.hypervisor.hypervisor_type(), HypervisorType::Mshv) {
                    return Some((1, self.boot_vcpus(), 1));
                }
                None
            },
            |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
        );
        #[cfg(target_arch = "x86_64")]
        vcpu.configure(
            boot_setup,
            self.cpuid.clone(),
            self.config.kvm_hyperv,
            topology,
        )?;

        #[cfg(target_arch = "aarch64")]
        vcpu.configure(&self.vm, boot_setup)?;

        Ok(())
    }

    /// Only create new vCPUs if there aren't any inactive ones to reuse
    fn create_vcpus(
        &mut self,
        desired_vcpus: u8,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
        info!(
            "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.config.max_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        // Only create vCPUs in excess of all the allocated vCPUs.
        for cpu_id in self.vcpus.len() as u8..desired_vcpus {
            vcpus.push(self.create_vcpu(
                cpu_id,
                // TODO: The special format of the CPU id can be removed once
                // ready to break live upgrade.
                snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
            )?);
        }

        Ok(vcpus)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn init_pmu(&self, irq: u32) -> Result<bool> {
        for cpu in self.vcpus.iter() {
            let cpu = cpu.lock().unwrap();
            // Check if PMU attr is available, if not, log the information.
            if cpu.vcpu.has_pmu_support() {
                cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
            } else {
                debug!(
                    "PMU attribute is not supported in vCPU{}, skip PMU init!",
                    cpu.id
                );
                return Ok(false);
            }
        }

        Ok(true)
    }

    pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
        self.vcpus.clone()
    }

    fn start_vcpu(
        &mut self,
        vcpu: Arc<Mutex<Vcpu>>,
        vcpu_id: u8,
        vcpu_thread_barrier: Arc<Barrier>,
        inserting: bool,
    ) -> Result<()> {
        let reset_evt = self.reset_evt.try_clone().unwrap();
        let exit_evt = self.exit_evt.try_clone().unwrap();
        #[cfg(feature = "kvm")]
        let hypervisor_type = self.hypervisor.hypervisor_type();
        #[cfg(feature = "guest_debug")]
        let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
        let panic_exit_evt = self.exit_evt.try_clone().unwrap();
        let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
        let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();

        let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
        let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
            .vcpu_run_interrupted
            .clone();
        let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
        let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();

        // Prepare the CPU set the current vCPU is expected to run onto.
        let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
            // SAFETY: all zeros is a valid pattern
            let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
            // SAFETY: FFI call, trivially safe
            unsafe { libc::CPU_ZERO(&mut cpuset) };
            for host_cpu in host_cpus {
                // SAFETY: FFI call, trivially safe
                unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) };
            }
            cpuset
        });

        // Retrieve seccomp filter for vcpu thread
        let vcpu_seccomp_filter = get_seccomp_filter(
            &self.seccomp_action,
            Thread::Vcpu,
            self.hypervisor.hypervisor_type(),
        )
        .map_err(Error::CreateSeccompFilter)?;

        #[cfg(target_arch = "x86_64")]
        let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();

        info!("Starting vCPU: cpu_id = {}", vcpu_id);

        let handle = Some(
            thread::Builder::new()
                .name(format!("vcpu{vcpu_id}"))
                .spawn(move || {
                    // Schedule the thread to run on the expected CPU set
                    if let Some(cpuset) = cpuset.as_ref() {
                        // SAFETY: FFI call with correct arguments
                        let ret = unsafe {
                            libc::sched_setaffinity(
                                0,
                                std::mem::size_of::<libc::cpu_set_t>(),
                                cpuset as *const libc::cpu_set_t,
                            )
                        };

                        if ret != 0 {
                            error!(
                                "Failed scheduling the vCPU {} on the expected CPU set: {}",
                                vcpu_id,
                                io::Error::last_os_error()
                            );
                            return;
                        }
                    }

                    // Apply seccomp filter for vcpu thread.
                    if !vcpu_seccomp_filter.is_empty() {
                        if let Err(e) =
                            apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
                        {
                            error!("Error applying seccomp filter: {:?}", e);
                            return;
                        }
                    }
                    extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
                    // This uses an async signal safe handler to kill the vcpu handles.
                    register_signal_handler(SIGRTMIN(), handle_signal)
                        .expect("Failed to register vcpu signal handler");
                    // Block until all CPUs are ready.
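                    // The barrier is created in activate_vcpus() with a count of
                    // (new vCPUs + 1), so every vCPU thread and the main thread
                    // reach this point before any vCPU starts running.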
                    vcpu_thread_barrier.wait();

                    std::panic::catch_unwind(move || {
                        loop {
                            // If we are being told to pause, we park the thread
                            // until the pause boolean is toggled.
                            // The resume operation is responsible for toggling
                            // the boolean and unpark the thread.
                            // We enter a loop because park() could spuriously
                            // return. We will then park() again unless the
                            // pause boolean has been toggled.

                            // Need to use Ordering::SeqCst as we have multiple
                            // loads and stores to different atomics and we need
                            // to see them in a consistent order in all threads

                            if vcpu_pause_signalled.load(Ordering::SeqCst) {
                                // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
                                // completed by returning to KVM_RUN. From the kernel docs:
                                //
                                // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
                                // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
                                // operations are complete (and guest state is consistent) only after userspace
                                // has re-entered the kernel with KVM_RUN. The kernel side will first finish
                                // incomplete operations and then check for pending signals.
                                // The pending state of the operation is not preserved in state which is
                                // visible to userspace, thus userspace should ensure that the operation is
                                // completed before performing a live migration. Userspace can re-enter the
                                // guest with an unmasked signal pending or with the immediate_exit field set
                                // to complete pending operations without allowing any further instructions
                                // to be executed.

                                #[cfg(feature = "kvm")]
                                if matches!(hypervisor_type, HypervisorType::Kvm) {
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
                                    if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
                                        error!("Unexpected VM exit on \"immediate_exit\" run");
                                        break;
                                    }
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
                                }

                                vcpu_run_interrupted.store(true, Ordering::SeqCst);

                                vcpu_paused.store(true, Ordering::SeqCst);
                                while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                    thread::park();
                                }
                                vcpu_run_interrupted.store(false, Ordering::SeqCst);
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }

                            #[cfg(feature = "tdx")]
                            let mut vcpu = vcpu.lock().unwrap();
                            #[cfg(not(feature = "tdx"))]
                            let vcpu = vcpu.lock().unwrap();
                            // vcpu.run() returns false on a triple-fault so trigger a reset
                            match vcpu.run() {
                                Ok(run) => match run {
                                    #[cfg(feature = "kvm")]
                                    VmExit::Debug => {
                                        info!("VmExit::Debug");
                                        #[cfg(feature = "guest_debug")]
                                        {
                                            vcpu_pause_signalled.store(true, Ordering::SeqCst);
                                            let raw_tid = get_raw_tid(vcpu_id as usize);
                                            vm_debug_evt.write(raw_tid as u64).unwrap();
                                        }
                                    }
                                    #[cfg(target_arch = "x86_64")]
                                    VmExit::IoapicEoi(vector) => {
                                        if let Some(interrupt_controller) =
                                            &interrupt_controller_clone
                                        {
                                            interrupt_controller
                                                .lock()
                                                .unwrap()
                                                .end_of_interrupt(vector);
                                        }
                                    }
                                    VmExit::Ignore => {}
                                    VmExit::Hyperv => {}
                                    VmExit::Reset => {
                                        info!("VmExit::Reset");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        reset_evt.write(1).unwrap();
                                        break;
                                    }
                                    VmExit::Shutdown => {
                                        info!("VmExit::Shutdown");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        exit_evt.write(1).unwrap();
                                        break;
                                    }
                                    #[cfg(feature = "tdx")]
                                    VmExit::Tdx => {
                                        if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
                                            match vcpu.get_tdx_exit_details() {
                                                Ok(details) => match details {
                                                    TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
                                                    TdxExitDetails::SetupEventNotifyInterrupt => {
                                                        warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
                                                    }
                                                },
                                                Err(e) => error!("Unexpected TDX VMCALL: {}", e),
                                            }
                                            vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
                                        } else {
                                            // We should never reach this code as
                                            // this means the design from the code
                                            // is wrong.
                                            unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
                                        }
                                    }
                                    _ => {
                                        error!(
                                            "VCPU generated error: {:?}",
                                            Error::UnexpectedVmExit
                                        );
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        exit_evt.write(1).unwrap();
                                        break;
                                    }
                                },

                                Err(e) => {
                                    error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
                                    vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                    exit_evt.write(1).unwrap();
                                    break;
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }
                        }
                    })
                    .or_else(|_| {
                        panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
                        error!("vCPU thread panicked");
                        panic_exit_evt.write(1)
                    })
                    .ok();
                })
                .map_err(Error::VcpuSpawn)?,
        );

        // On hot plug, calls into this function have no entry point. It is for
        // those hotplug CPU additions that we need to set the inserting flag.
        self.vcpu_states[usize::from(vcpu_id)].handle = handle;
        self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;

        Ok(())
    }

    /// Start up as many vCPU threads as needed to reach `desired_vcpus`
    fn activate_vcpus(
        &mut self,
        desired_vcpus: u8,
        inserting: bool,
        paused: Option<bool>,
    ) -> Result<()> {
        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        let vcpu_thread_barrier = Arc::new(Barrier::new(
            (desired_vcpus - self.present_vcpus() + 1) as usize,
        ));

        if let Some(paused) = paused {
            self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
        }

        info!(
            "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
            desired_vcpus,
            self.vcpus.len(),
            self.present_vcpus(),
            self.vcpus_pause_signalled.load(Ordering::SeqCst)
        );

        // This reuses any inactive vCPUs as well as any that were newly created
        for vcpu_id in self.present_vcpus()..desired_vcpus {
            let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
            self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
        }

        // Unblock all CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal, actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
            self.vcpu_states[usize::from(cpu_id)]
                .pending_removal
                .store(true, Ordering::SeqCst);
        }
    }

    pub fn check_pending_removed_vcpu(&mut self) -> bool {
        for state in self.vcpu_states.iter() {
            if state.active() && state.pending_removal.load(Ordering::SeqCst) {
                return true;
            }
        }
        false
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" so that it can be reused
        state.kill.store(false, Ordering::SeqCst);
        state.pending_removal.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(
        &mut self,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        trace_scoped!("create_boot_vcpus");

        self.create_vcpus(self.boot_vcpus(), snapshot)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
    pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
        self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
    }

    pub fn start_restored_vcpus(&mut self) -> Result<()> {
        self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
            .map_err(|e| {
                Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
            })?;

        Ok(())
    }

    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal {
            return Ok(false);
        }

        if !self.dynamic {
            return Ok(false);
        }

        if self.check_pending_removed_vcpu() {
            return Err(Error::VcpuPendingRemovedVcpu);
        }

        match desired_vcpus.cmp(&self.present_vcpus()) {
            cmp::Ordering::Greater => {
                let vcpus = self.create_vcpus(desired_vcpus, None)?;
                for vcpu in vcpus {
                    self.configure_vcpu(vcpu, None)?
                }
                self.activate_vcpus(desired_vcpus, true, None)?;
                Ok(true)
            }
            cmp::Ordering::Less => {
                self.mark_vcpus_for_removal(desired_vcpus);
                Ok(true)
            }
            _ => Ok(false),
        }
    }

    pub fn shutdown(&mut self) -> Result<()> {
        // Tell the vCPUs to stop themselves next time they go through the loop
        self.vcpus_kill_signalled.store(true, Ordering::SeqCst);

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        // Wait for all the threads to finish. This removes the state from the vector.
        for mut state in self.vcpu_states.drain(..) {
            state.join_thread()?;
        }

        Ok(())
    }

    #[cfg(feature = "tdx")]
    pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
        for vcpu in &self.vcpus {
            vcpu.lock()
                .unwrap()
                .vcpu
                .tdx_init(hob_address)
                .map_err(Error::InitializeTdx)?;
        }
        Ok(())
    }

    pub fn boot_vcpus(&self) -> u8 {
        self.config.boot_vcpus
    }

    pub fn max_vcpus(&self) -> u8 {
        self.config.max_vcpus
    }

    #[cfg(target_arch = "x86_64")]
    pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
        assert!(!self.cpuid.is_empty());
        self.cpuid.clone()
    }

    fn present_vcpus(&self) -> u8 {
        self.vcpu_states
            .iter()
            .fold(0, |acc, state| acc + state.active() as u8)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidrs(&self) -> Vec<u64> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_mpidr())
            .collect()
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_states(&self) -> Vec<CpuState> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
            .collect()
    }

    pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
        self.config
            .topology
            .clone()
            .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
    }

    pub fn create_madt(&self) -> Sdt {
        use crate::acpi;
        // This is also checked in the commandline parsing.
        assert!(self.config.boot_vcpus <= self.config.max_vcpus);

        let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
        #[cfg(target_arch = "x86_64")]
        {
            madt.write(36, arch::layout::APIC_START.0);

            for cpu in 0..self.config.max_vcpus {
                let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology());

                let lapic = LocalX2Apic {
                    r#type: acpi::ACPI_X2APIC_PROCESSOR,
                    length: 16,
                    processor_id: cpu.into(),
                    apic_id: x2apic_id,
                    flags: if cpu < self.config.boot_vcpus {
                        1 << MADT_CPU_ENABLE_FLAG
                    } else {
                        0
                    } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
                    _reserved: 0,
                };
                madt.append(lapic);
            }

            madt.append(Ioapic {
                r#type: acpi::ACPI_APIC_IO,
                length: 12,
                ioapic_id: 0,
                apic_address: arch::layout::IOAPIC_START.0 as u32,
                gsi_base: 0,
                ..Default::default()
            });

            madt.append(InterruptSourceOverride {
                r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
                length: 10,
                bus: 0,
                source: 4,
                gsi: 4,
                flags: 0,
            });
        }

        #[cfg(target_arch = "aarch64")]
        {
            /* Notes:
             * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
             */

            // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
            for cpu in 0..self.config.boot_vcpus {
                let vcpu = &self.vcpus[cpu as usize];
                let mpidr = vcpu.lock().unwrap().get_mpidr();
                /* ARMv8 MPIDR format:
                     Bits [63:40] Must be zero
                     Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
                     Bits [31:24] Must be zero
                     Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
                     Bits [15:8]  Aff1 : Match Aff1 of target processor MPIDR
                     Bits [7:0]   Aff0 : Match Aff0 of target processor MPIDR
                */
                let mpidr_mask = 0xff_00ff_ffff;
                let gicc = GicC {
                    r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
                    length: 80,
                    reserved0: 0,
                    cpu_interface_number: cpu as u32,
                    uid: cpu as u32,
                    flags: 1,
                    parking_version: 0,
                    performance_interrupt: 0,
                    parked_address: 0,
                    base_address: 0,
                    gicv_base_address: 0,
                    gich_base_address: 0,
                    vgic_interrupt: 0,
                    gicr_base_address: 0,
                    mpidr: mpidr & mpidr_mask,
                    proc_power_effi_class: 0,
                    reserved1: 0,
                    spe_overflow_interrupt: 0,
                };

                madt.append(gicc);
            }
            let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());

            // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
            let gicd = GicD {
                r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
                length: 24,
                reserved0: 0,
                gic_id: 0,
                base_address: vgic_config.dist_addr,
                global_irq_base: 0,
                version: 3,
                reserved1: [0; 3],
            };
            madt.append(gicd);

            // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
            let gicr = GicR {
                r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
                length: 16,
                reserved: 0,
                base_address: vgic_config.redists_addr,
                range_length: vgic_config.redists_size as u32,
            };
            madt.append(gicr);

            // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
            let gicits = GicIts {
                r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
                length: 20,
                reserved0: 0,
                translation_id: 0,
                base_address: vgic_config.msi_addr,
                reserved1: 0,
            };
            madt.append(gicits);

            madt.update_checksum();
        }

        madt
    }

    #[cfg(target_arch = "aarch64")]
    pub fn create_pptt(&self) -> Sdt {
        let pptt_start = 0;
        let mut cpus = 0;
        let mut uid = 0;
        // If topology is not specified, the default setting is:
        // 1 package, multiple cores, 1 thread per core
        // This is also the behavior when PPTT is missing.
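        // The hierarchy node `flags` values used below follow the ACPI PPTT flag bits:
        // 0x2 = ACPI processor ID valid (package/core nodes), 0xA additionally marks
        // the node as a leaf, and 0xE marks a leaf that is a thread.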
        let (threads_per_core, cores_per_package, packages) =
            self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));

        let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);

        for cluster_idx in 0..packages {
            if cpus < self.config.boot_vcpus as usize {
                let cluster_offset = pptt.len() - pptt_start;
                let cluster_hierarchy_node = ProcessorHierarchyNode {
                    r#type: 0,
                    length: 20,
                    reserved: 0,
                    flags: 0x2,
                    parent: 0,
                    acpi_processor_id: cluster_idx as u32,
                    num_private_resources: 0,
                };
                pptt.append(cluster_hierarchy_node);

                for core_idx in 0..cores_per_package {
                    let core_offset = pptt.len() - pptt_start;

                    if threads_per_core > 1 {
                        let core_hierarchy_node = ProcessorHierarchyNode {
                            r#type: 0,
                            length: 20,
                            reserved: 0,
                            flags: 0x2,
                            parent: cluster_offset as u32,
                            acpi_processor_id: core_idx as u32,
                            num_private_resources: 0,
                        };
                        pptt.append(core_hierarchy_node);

                        for _thread_idx in 0..threads_per_core {
                            let thread_hierarchy_node = ProcessorHierarchyNode {
                                r#type: 0,
                                length: 20,
                                reserved: 0,
                                flags: 0xE,
                                parent: core_offset as u32,
                                acpi_processor_id: uid as u32,
                                num_private_resources: 0,
                            };
                            pptt.append(thread_hierarchy_node);
                            uid += 1;
                        }
                    } else {
                        let thread_hierarchy_node = ProcessorHierarchyNode {
                            r#type: 0,
                            length: 20,
                            reserved: 0,
                            flags: 0xA,
                            parent: cluster_offset as u32,
                            acpi_processor_id: uid as u32,
                            num_private_resources: 0,
                        };
                        pptt.append(thread_hierarchy_node);
                        uid += 1;
                    }
                }
                cpus += (cores_per_package * threads_per_core) as usize;
            }
        }

        pptt.update_checksum();
        pptt
    }

    #[cfg(feature = "guest_debug")]
    fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_regs()
            .map_err(Error::CpuDebug)
    }

    #[cfg(feature = "guest_debug")]
    fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .set_regs(regs)
            .map_err(Error::CpuDebug)
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sregs()
            .map_err(Error::CpuDebug)
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .set_sregs(sregs)
            .map_err(Error::CpuDebug)
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn translate_gva(
        &self,
        _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: u8,
        gva: u64,
    ) -> Result<u64> {
        let (gpa, _) = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .translate_gva(gva, /* flags: unused */ 0)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        Ok(gpa)
    }

    ///
    /// On AArch64, the `translate_gva` API is not provided by KVM. We implement
    /// it in the VMM by walking through the translation tables.
    ///
    /// Address translation is a big topic; here we only focus on the scenario
    /// that happens in the VMM while debugging the kernel. This `translate_gva`
    /// implementation is restricted to:
    /// - Exception Level 1
    /// - Translate high address range only (kernel space)
    ///
    /// This implementation supports the following Arm-v8a features related to
    /// address translation:
    /// - FEAT_LPA
    /// - FEAT_LVA
    /// - FEAT_LPA2
    ///
    #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
    fn translate_gva(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: u8,
        gva: u64,
    ) -> Result<u64> {
        let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TCR_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TTBR1_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::ID_AA64MMFR0_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;

        // Bit 55 of the VA determines the range, high (0xFFFxxx...)
        // or low (0x000xxx...).
        let high_range = extract_bits_64!(gva, 55, 1);
        if high_range == 0 {
            info!("VA (0x{:x}) range is not supported!", gva);
            return Ok(gva);
        }

        // High range size offset
        let tsz = extract_bits_64!(tcr_el1, 16, 6);
        // Granule size
        let tg = extract_bits_64!(tcr_el1, 30, 2);
        // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
        let ds = extract_bits_64!(tcr_el1, 59, 1);

        if tsz == 0 {
            info!("VA translation is not ready!");
            return Ok(gva);
        }

        // VA size is determined by TCR_EL1.T1SZ
        let va_size = 64 - tsz;
        // Number of bits in VA consumed in each level of translation
        let stride = match tg {
            3 => 13, // 64KB granule size
            1 => 11, // 16KB granule size
            _ => 9,  // 4KB, default
        };
        // Starting level of walking
        let mut level = 4 - (va_size - 4) / stride;

        // PA or IPA size is determined
        let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
        let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
        // The IPA size in TCR_EL1 and PA Range in ID_AA64MMFR0_EL1 should match.
        // To be safe, we use the minimum value if they are different.
        let pa_range = std::cmp::min(tcr_ips, pa_range);
        // PA size in bits
        let pa_size = match pa_range {
            0 => 32,
            1 => 36,
            2 => 40,
            3 => 42,
            4 => 44,
            5 => 48,
            6 => 52,
            _ => {
                return Err(Error::TranslateVirtualAddress(anyhow!(format!(
                    "PA range not supported {pa_range}"
                ))))
            }
        };

        let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
        let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
        // If FEAT_LPA2 is present, the translation table descriptor holds
        // 50 bits of the table address of the next level.
        // Otherwise, it is 48 bits.
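        // For example, with a 4KB granule (stride = 9), indexmask_grainsize covers
        // bits [11:0], so the mask computed below keeps descriptor address bits
        // [47:12] (or [49:12] when FEAT_LPA2 is enabled).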
        let descaddrmask = if ds == 1 {
            !0u64 >> (64 - 50) // mask with 50 least significant bits
        } else {
            !0u64 >> (64 - 48) // mask with 48 least significant bits
        };
        let descaddrmask = descaddrmask & !indexmask_grainsize;

        // Translation table base address
        let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
        // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
        // address bits [48:51] come from TTBR1_EL1 bits [2:5].
        if pa_size == 52 {
            descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
        }

        // Loop through tables of each level
        loop {
            // Table offset for current level
            let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
            descaddr |= table_offset;
            descaddr &= !7u64;

            let mut buf = [0; 8];
            guest_memory
                .memory()
                .read(&mut buf, GuestAddress(descaddr))
                .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
            let descriptor = u64::from_le_bytes(buf);

            descaddr = descriptor & descaddrmask;
            // In the case of FEAT_LPA, the next-level translation table address
            // bits [48:51] come from bits [12:15] of the current descriptor.
            // For FEAT_LPA2, the next-level translation table address
            // bits [50:51] come from bits [8:9] of the current descriptor,
            // while bits [48:49] come from bits [48:49] of the descriptor, which
            // were handled previously.
            if pa_size == 52 {
                if ds == 1 {
                    // FEAT_LPA2
                    descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
                } else {
                    // FEAT_LPA
                    descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
                }
            }

            if (descriptor & 2) != 0 && (level < 3) {
                // This is a table entry. Go down to next level.
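                // (Descriptor bit 1 distinguishes a table/page descriptor from a
                // block descriptor; a block descriptor ends the walk early.)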
                level += 1;
                indexmask = indexmask_grainsize;
                continue;
            }

            break;
        }

        // We have reached either:
        // - a page entry at level 3 or
        // - a block entry at level 1 or 2
        let page_size = 1u64 << ((stride * (4 - level)) + 3);
        descaddr &= !(page_size - 1);
        descaddr |= gva & (page_size - 1);

        Ok(descaddr)
    }

    pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
        self.acpi_address = Some(acpi_address);
    }

    pub(crate) fn set_interrupt_controller(
        &mut self,
        interrupt_controller: Arc<Mutex<dyn InterruptController>>,
    ) {
        self.interrupt_controller = Some(interrupt_controller);
    }

    pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
        &self.vcpus_kill_signalled
    }

    #[cfg(feature = "igvm")]
    pub(crate) fn get_cpuid_leaf(
        &self,
        cpu_id: u8,
        eax: u32,
        ecx: u32,
        xfem: u64,
        xss: u64,
    ) -> Result<[u32; 4]> {
        let leaf_info = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_cpuid_values(eax, ecx, xfem, xss)
            .unwrap();
        Ok(leaf_info)
    }

    #[cfg(feature = "sev_snp")]
    pub(crate) fn sev_snp_enabled(&self) -> bool {
        self.sev_snp_enabled
    }
}

struct Cpu {
    cpu_id: u8,
    proximity_domain: u32,
    dynamic: bool,
    #[cfg(target_arch = "x86_64")]
    topology: Option<(u8, u8, u8)>,
}

#[cfg(target_arch = "x86_64")]
const MADT_CPU_ENABLE_FLAG: usize = 0;

#[cfg(target_arch = "x86_64")]
const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;

impl Cpu {
    #[cfg(target_arch = "x86_64")]
    fn generate_mat(&self) -> Vec<u8> {
        let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);

        let lapic = LocalX2Apic {
            r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
            length: 16,
            processor_id: self.cpu_id.into(),
            apic_id: x2apic_id,
            flags: 1 << MADT_CPU_ENABLE_FLAG,
            _reserved: 0,
        };

        let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
        // SAFETY: mat_data is large enough to hold lapic
        unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };

        mat_data
    }
}

impl Aml for Cpu {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        #[cfg(target_arch = "x86_64")]
        let mat_data: Vec<u8> = self.generate_mat();
        #[allow(clippy::if_same_then_else)]
        if self.dynamic {
            aml::Device::new(
                format!("C{:03X}", self.cpu_id).as_str().into(),
                vec![
                    &aml::Name::new("_HID".into(), &"ACPI0007"),
                    &aml::Name::new("_UID".into(), &self.cpu_id),
                    // Currently, AArch64 cannot support the following fields.
                    /*
                    _STA return value:
                    Bit [0] – Set if the device is present.
                    Bit [1] – Set if the device is enabled and decoding its resources.
                    Bit [2] – Set if the device should be shown in the UI.
                    Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
                    Bit [4] – Set if the battery is present.
                    Bits [31:5] – Reserved (must be cleared).
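                    (In this implementation the dynamic variant returns whatever CSTA
                    computes: 0xF when the selected CPU is enabled, 0x0 otherwise. The
                    static variant further below always returns 0xF.)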
1904 */ 1905 #[cfg(target_arch = "x86_64")] 1906 &aml::Method::new( 1907 "_STA".into(), 1908 0, 1909 false, 1910 // Call into CSTA method which will interrogate device 1911 vec![&aml::Return::new(&aml::MethodCall::new( 1912 "CSTA".into(), 1913 vec![&self.cpu_id], 1914 ))], 1915 ), 1916 &aml::Method::new( 1917 "_PXM".into(), 1918 0, 1919 false, 1920 vec![&aml::Return::new(&self.proximity_domain)], 1921 ), 1922 // The Linux kernel expects every CPU device to have a _MAT entry 1923 // containing the LAPIC for this processor with the enabled bit set 1924 // even if it is disabled in the MADT (non-boot CPU) 1925 #[cfg(target_arch = "x86_64")] 1926 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 1927 // Trigger CPU ejection 1928 #[cfg(target_arch = "x86_64")] 1929 &aml::Method::new( 1930 "_EJ0".into(), 1931 1, 1932 false, 1933 // Call into CEJ0 method which will actually eject device 1934 vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])], 1935 ), 1936 ], 1937 ) 1938 .to_aml_bytes(sink); 1939 } else { 1940 aml::Device::new( 1941 format!("C{:03X}", self.cpu_id).as_str().into(), 1942 vec![ 1943 &aml::Name::new("_HID".into(), &"ACPI0007"), 1944 &aml::Name::new("_UID".into(), &self.cpu_id), 1945 #[cfg(target_arch = "x86_64")] 1946 &aml::Method::new( 1947 "_STA".into(), 1948 0, 1949 false, 1950 // Mark CPU present, see the CSTA implementation 1951 vec![&aml::Return::new(&0xfu8)], 1952 ), 1953 &aml::Method::new( 1954 "_PXM".into(), 1955 0, 1956 false, 1957 vec![&aml::Return::new(&self.proximity_domain)], 1958 ), 1959 // The Linux kernel expects every CPU device to have a _MAT entry 1960 // containing the LAPIC for this processor with the enabled bit set 1961 // even if it is disabled in the MADT (non-boot CPU) 1962 #[cfg(target_arch = "x86_64")] 1963 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 1964 ], 1965 ) 1966 .to_aml_bytes(sink); 1967 } 1968 } 1969 } 1970 1971 struct CpuNotify { 1972 cpu_id: u8, 1973 } 1974 1975 impl Aml for CpuNotify { 1976 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1977 let object = aml::Path::new(&format!("C{:03X}", self.cpu_id)); 1978 aml::If::new( 1979 &aml::Equal::new(&aml::Arg(0), &self.cpu_id), 1980 vec![&aml::Notify::new(&object, &aml::Arg(1))], 1981 ) 1982 .to_aml_bytes(sink) 1983 } 1984 } 1985 1986 struct CpuMethods { 1987 max_vcpus: u8, 1988 dynamic: bool, 1989 } 1990 1991 impl Aml for CpuMethods { 1992 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1993 if self.dynamic { 1994 // CPU status method 1995 aml::Method::new( 1996 "CSTA".into(), 1997 1, 1998 true, 1999 vec![ 2000 // Take lock defined above 2001 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2002 // Write CPU number (in first argument) to I/O port via field 2003 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 2004 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2005 // Check if the CPEN bit is set; if so, set the local variable to 0xf (see _STA for the meaning) 2006 &aml::If::new( 2007 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE), 2008 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 2009 ), 2010 // Release lock 2011 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2012 // Return 0 or 0xf 2013 &aml::Return::new(&aml::Local(0)), 2014 ], 2015 ) 2016 .to_aml_bytes(sink); 2017 2018 let mut cpu_notifies = Vec::new(); 2019 for cpu_id in 0..self.max_vcpus { 2020 cpu_notifies.push(CpuNotify { cpu_id }); 2021 } 2022 2023 let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new(); 2024 for cpu_id in
0..self.max_vcpus { 2025 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 2026 } 2027 2028 aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink); 2029 2030 aml::Method::new( 2031 "CEJ0".into(), 2032 1, 2033 true, 2034 vec![ 2035 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2036 // Write CPU number (in first argument) to I/O port via field 2037 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 2038 // Set CEJ0 bit 2039 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 2040 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2041 ], 2042 ) 2043 .to_aml_bytes(sink); 2044 2045 aml::Method::new( 2046 "CSCN".into(), 2047 0, 2048 true, 2049 vec![ 2050 // Take lock defined above 2051 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2052 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2053 &aml::While::new( 2054 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 2055 vec![ 2056 // Write CPU number (in first argument) to I/O port via field 2057 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 2058 // Check if CINS bit is set 2059 &aml::If::new( 2060 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 2061 // Notify device if it is 2062 vec![ 2063 &aml::MethodCall::new( 2064 "CTFY".into(), 2065 vec![&aml::Local(0), &aml::ONE], 2066 ), 2067 // Reset CINS bit 2068 &aml::Store::new( 2069 &aml::Path::new("\\_SB_.PRES.CINS"), 2070 &aml::ONE, 2071 ), 2072 ], 2073 ), 2074 // Check if CRMV bit is set 2075 &aml::If::new( 2076 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 2077 // Notify device if it is (with the eject constant 0x3) 2078 vec![ 2079 &aml::MethodCall::new( 2080 "CTFY".into(), 2081 vec![&aml::Local(0), &3u8], 2082 ), 2083 // Reset CRMV bit 2084 &aml::Store::new( 2085 &aml::Path::new("\\_SB_.PRES.CRMV"), 2086 &aml::ONE, 2087 ), 2088 ], 2089 ), 2090 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 2091 ], 2092 ), 2093 // Release lock 2094 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2095 ], 2096 ) 2097 .to_aml_bytes(sink) 2098 } else { 2099 aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink) 2100 } 2101 } 2102 } 2103 2104 impl Aml for CpuManager { 2105 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2106 #[cfg(target_arch = "x86_64")] 2107 if let Some(acpi_address) = self.acpi_address { 2108 // CPU hotplug controller 2109 aml::Device::new( 2110 "_SB_.PRES".into(), 2111 vec![ 2112 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 2113 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 2114 // Mutex to protect concurrent access as we write to choose CPU and then read back status 2115 &aml::Mutex::new("CPLK".into(), 0), 2116 &aml::Name::new( 2117 "_CRS".into(), 2118 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2119 aml::AddressSpaceCacheable::NotCacheable, 2120 true, 2121 acpi_address.0, 2122 acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 2123 None, 2124 )]), 2125 ), 2126 // OpRegion and Fields map MMIO range into individual field values 2127 &aml::OpRegion::new( 2128 "PRST".into(), 2129 aml::OpRegionSpace::SystemMemory, 2130 &(acpi_address.0 as usize), 2131 &CPU_MANAGER_ACPI_SIZE, 2132 ), 2133 &aml::Field::new( 2134 "PRST".into(), 2135 aml::FieldAccessType::Byte, 2136 aml::FieldLockRule::NoLock, 2137 aml::FieldUpdateRule::WriteAsZeroes, 2138 vec![ 2139 aml::FieldEntry::Reserved(32), 2140 aml::FieldEntry::Named(*b"CPEN", 1), 2141 aml::FieldEntry::Named(*b"CINS", 1), 2142 
aml::FieldEntry::Named(*b"CRMV", 1), 2143 aml::FieldEntry::Named(*b"CEJ0", 1), 2144 aml::FieldEntry::Reserved(4), 2145 aml::FieldEntry::Named(*b"CCMD", 8), 2146 ], 2147 ), 2148 &aml::Field::new( 2149 "PRST".into(), 2150 aml::FieldAccessType::DWord, 2151 aml::FieldLockRule::NoLock, 2152 aml::FieldUpdateRule::Preserve, 2153 vec![ 2154 aml::FieldEntry::Named(*b"CSEL", 32), 2155 aml::FieldEntry::Reserved(32), 2156 aml::FieldEntry::Named(*b"CDAT", 32), 2157 ], 2158 ), 2159 ], 2160 ) 2161 .to_aml_bytes(sink); 2162 } 2163 2164 // CPU devices 2165 let hid = aml::Name::new("_HID".into(), &"ACPI0010"); 2166 let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05")); 2167 // Bundle methods together under a common object 2168 let methods = CpuMethods { 2169 max_vcpus: self.config.max_vcpus, 2170 dynamic: self.dynamic, 2171 }; 2172 let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods]; 2173 2174 #[cfg(target_arch = "x86_64")] 2175 let topology = self.get_vcpu_topology(); 2176 let mut cpu_devices = Vec::new(); 2177 for cpu_id in 0..self.config.max_vcpus { 2178 let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0); 2179 let cpu_device = Cpu { 2180 cpu_id, 2181 proximity_domain, 2182 dynamic: self.dynamic, 2183 #[cfg(target_arch = "x86_64")] 2184 topology, 2185 }; 2186 2187 cpu_devices.push(cpu_device); 2188 } 2189 2190 for cpu_device in cpu_devices.iter() { 2191 cpu_data_inner.push(cpu_device); 2192 } 2193 2194 aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink) 2195 } 2196 } 2197 2198 impl Pausable for CpuManager { 2199 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2200 // Tell the vCPUs to pause themselves next time they exit 2201 self.vcpus_pause_signalled.store(true, Ordering::SeqCst); 2202 2203 // Signal the spawned threads (vCPUs and console signal handler). For the vCPU threads 2204 // this will interrupt the KVM_RUN ioctl(), allowing the loop to check the boolean set 2205 // above. 2206 for state in self.vcpu_states.iter() { 2207 state.signal_thread(); 2208 } 2209 2210 for vcpu in self.vcpus.iter() { 2211 let mut vcpu = vcpu.lock().unwrap(); 2212 vcpu.pause()?; 2213 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2214 if !self.config.kvm_hyperv { 2215 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { 2216 MigratableError::Pause(anyhow!( 2217 "Could not notify guest it has been paused {:?}", 2218 e 2219 )) 2220 })?; 2221 } 2222 } 2223 2224 // The vCPU thread will change its paused state before parking, so wait here for each 2225 // activated vCPU to change its state, ensuring it has parked. 2226 for state in self.vcpu_states.iter() { 2227 if state.active() { 2228 while !state.paused.load(Ordering::SeqCst) { 2229 // To avoid a priority inversion with the vCPU thread 2230 thread::sleep(std::time::Duration::from_millis(1)); 2231 } 2232 } 2233 } 2234 2235 Ok(()) 2236 } 2237 2238 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2239 for vcpu in self.vcpus.iter() { 2240 vcpu.lock().unwrap().resume()?; 2241 } 2242 2243 // Clear the vCPU pause boolean 2244 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 2245 2246 // Unpark all the vCPU threads. 2247 // Once unparked, the next thing they will do is check the pause 2248 // boolean. Since it is now false, they will exit their pause loop 2249 // and resume running the guest.
2250 for state in self.vcpu_states.iter() { 2251 state.paused.store(false, Ordering::SeqCst); 2252 state.unpark_thread(); 2253 } 2254 Ok(()) 2255 } 2256 } 2257 2258 impl Snapshottable for CpuManager { 2259 fn id(&self) -> String { 2260 CPU_MANAGER_SNAPSHOT_ID.to_string() 2261 } 2262 2263 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2264 let mut cpu_manager_snapshot = Snapshot::default(); 2265 2266 // The CpuManager snapshot is a collection of all vCPUs snapshots. 2267 for vcpu in &self.vcpus { 2268 let mut vcpu = vcpu.lock().unwrap(); 2269 cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?); 2270 } 2271 2272 Ok(cpu_manager_snapshot) 2273 } 2274 } 2275 2276 impl Transportable for CpuManager {} 2277 impl Migratable for CpuManager {} 2278 2279 #[cfg(feature = "guest_debug")] 2280 impl Debuggable for CpuManager { 2281 #[cfg(feature = "kvm")] 2282 fn set_guest_debug( 2283 &self, 2284 cpu_id: usize, 2285 addrs: &[GuestAddress], 2286 singlestep: bool, 2287 ) -> std::result::Result<(), DebuggableError> { 2288 self.vcpus[cpu_id] 2289 .lock() 2290 .unwrap() 2291 .vcpu 2292 .set_guest_debug(addrs, singlestep) 2293 .map_err(DebuggableError::SetDebug) 2294 } 2295 2296 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2297 Ok(()) 2298 } 2299 2300 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2301 Ok(()) 2302 } 2303 2304 #[cfg(target_arch = "x86_64")] 2305 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2306 // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15 2307 let gregs = self 2308 .get_regs(cpu_id as u8) 2309 .map_err(DebuggableError::ReadRegs)?; 2310 let regs = [ 2311 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp, 2312 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15, 2313 ]; 2314 2315 // GDB exposes 32-bit eflags instead of 64-bit rflags. 
2316 // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml 2317 let eflags = gregs.rflags as u32; 2318 let rip = gregs.rip; 2319 2320 // Segment registers: CS, SS, DS, ES, FS, GS 2321 let sregs = self 2322 .get_sregs(cpu_id as u8) 2323 .map_err(DebuggableError::ReadRegs)?; 2324 let segments = X86SegmentRegs { 2325 cs: sregs.cs.selector as u32, 2326 ss: sregs.ss.selector as u32, 2327 ds: sregs.ds.selector as u32, 2328 es: sregs.es.selector as u32, 2329 fs: sregs.fs.selector as u32, 2330 gs: sregs.gs.selector as u32, 2331 }; 2332 2333 // TODO: Add other registers 2334 2335 Ok(CoreRegs { 2336 regs, 2337 eflags, 2338 rip, 2339 segments, 2340 ..Default::default() 2341 }) 2342 } 2343 2344 #[cfg(target_arch = "aarch64")] 2345 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2346 let gregs = self 2347 .get_regs(cpu_id as u8) 2348 .map_err(DebuggableError::ReadRegs)?; 2349 Ok(CoreRegs { 2350 x: gregs.regs.regs, 2351 sp: gregs.regs.sp, 2352 pc: gregs.regs.pc, 2353 ..Default::default() 2354 }) 2355 } 2356 2357 #[cfg(target_arch = "x86_64")] 2358 fn write_regs( 2359 &self, 2360 cpu_id: usize, 2361 regs: &CoreRegs, 2362 ) -> std::result::Result<(), DebuggableError> { 2363 let orig_gregs = self 2364 .get_regs(cpu_id as u8) 2365 .map_err(DebuggableError::ReadRegs)?; 2366 let gregs = StandardRegisters { 2367 rax: regs.regs[0], 2368 rbx: regs.regs[1], 2369 rcx: regs.regs[2], 2370 rdx: regs.regs[3], 2371 rsi: regs.regs[4], 2372 rdi: regs.regs[5], 2373 rbp: regs.regs[6], 2374 rsp: regs.regs[7], 2375 r8: regs.regs[8], 2376 r9: regs.regs[9], 2377 r10: regs.regs[10], 2378 r11: regs.regs[11], 2379 r12: regs.regs[12], 2380 r13: regs.regs[13], 2381 r14: regs.regs[14], 2382 r15: regs.regs[15], 2383 rip: regs.rip, 2384 // Update the lower 32 bits of rflags. 2385 rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64), 2386 }; 2387 2388 self.set_regs(cpu_id as u8, &gregs) 2389 .map_err(DebuggableError::WriteRegs)?; 2390 2391 // Segment registers: CS, SS, DS, ES, FS, GS 2392 // Since GDB cares only about the selectors, we call get_sregs() first.
2393 let mut sregs = self 2394 .get_sregs(cpu_id as u8) 2395 .map_err(DebuggableError::ReadRegs)?; 2396 sregs.cs.selector = regs.segments.cs as u16; 2397 sregs.ss.selector = regs.segments.ss as u16; 2398 sregs.ds.selector = regs.segments.ds as u16; 2399 sregs.es.selector = regs.segments.es as u16; 2400 sregs.fs.selector = regs.segments.fs as u16; 2401 sregs.gs.selector = regs.segments.gs as u16; 2402 2403 self.set_sregs(cpu_id as u8, &sregs) 2404 .map_err(DebuggableError::WriteRegs)?; 2405 2406 // TODO: Add other registers 2407 2408 Ok(()) 2409 } 2410 2411 #[cfg(target_arch = "aarch64")] 2412 fn write_regs( 2413 &self, 2414 cpu_id: usize, 2415 regs: &CoreRegs, 2416 ) -> std::result::Result<(), DebuggableError> { 2417 let mut gregs = self 2418 .get_regs(cpu_id as u8) 2419 .map_err(DebuggableError::ReadRegs)?; 2420 2421 gregs.regs.regs = regs.x; 2422 gregs.regs.sp = regs.sp; 2423 gregs.regs.pc = regs.pc; 2424 2425 self.set_regs(cpu_id as u8, &gregs) 2426 .map_err(DebuggableError::WriteRegs)?; 2427 2428 Ok(()) 2429 } 2430 2431 fn read_mem( 2432 &self, 2433 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2434 cpu_id: usize, 2435 vaddr: GuestAddress, 2436 len: usize, 2437 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2438 let mut buf = vec![0; len]; 2439 let mut total_read = 0_u64; 2440 2441 while total_read < len as u64 { 2442 let gaddr = vaddr.0 + total_read; 2443 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2444 Ok(paddr) => paddr, 2445 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 2446 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2447 }; 2448 let psize = arch::PAGE_SIZE as u64; 2449 let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1))); 2450 guest_memory 2451 .memory() 2452 .read( 2453 &mut buf[total_read as usize..total_read as usize + read_len as usize], 2454 GuestAddress(paddr), 2455 ) 2456 .map_err(DebuggableError::ReadMem)?; 2457 total_read += read_len; 2458 } 2459 Ok(buf) 2460 } 2461 2462 fn write_mem( 2463 &self, 2464 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2465 cpu_id: usize, 2466 vaddr: &GuestAddress, 2467 data: &[u8], 2468 ) -> std::result::Result<(), DebuggableError> { 2469 let mut total_written = 0_u64; 2470 2471 while total_written < data.len() as u64 { 2472 let gaddr = vaddr.0 + total_written; 2473 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2474 Ok(paddr) => paddr, 2475 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 
2476 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2477 }; 2478 let psize = arch::PAGE_SIZE as u64; 2479 let write_len = std::cmp::min( 2480 data.len() as u64 - total_written, 2481 psize - (paddr & (psize - 1)), 2482 ); 2483 guest_memory 2484 .memory() 2485 .write( 2486 &data[total_written as usize..total_written as usize + write_len as usize], 2487 GuestAddress(paddr), 2488 ) 2489 .map_err(DebuggableError::WriteMem)?; 2490 total_written += write_len; 2491 } 2492 Ok(()) 2493 } 2494 2495 fn active_vcpus(&self) -> usize { 2496 self.present_vcpus() as usize 2497 } 2498 } 2499 2500 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2501 impl Elf64Writable for CpuManager {} 2502 2503 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2504 impl CpuElf64Writable for CpuManager { 2505 fn cpu_write_elf64_note( 2506 &mut self, 2507 dump_state: &DumpState, 2508 ) -> std::result::Result<(), GuestDebuggableError> { 2509 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2510 for vcpu in &self.vcpus { 2511 let note_size = self.get_note_size(NoteDescType::Elf, 1); 2512 let mut pos: usize = 0; 2513 let mut buf = vec![0; note_size as usize]; 2514 let descsz = size_of::<X86_64ElfPrStatus>(); 2515 let vcpu_id = vcpu.lock().unwrap().id; 2516 2517 let note = Elf64_Nhdr { 2518 n_namesz: COREDUMP_NAME_SIZE, 2519 n_descsz: descsz as u32, 2520 n_type: NT_PRSTATUS, 2521 }; 2522 2523 let bytes: &[u8] = note.as_slice(); 2524 buf.splice(0.., bytes.to_vec()); 2525 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2526 buf.resize(pos + 4, 0); 2527 buf.splice(pos.., "CORE".to_string().into_bytes()); 2528 2529 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2530 buf.resize(pos + 32 + 4, 0); 2531 let pid = vcpu_id as u64; 2532 let bytes: &[u8] = pid.as_slice(); 2533 buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */ 2534 2535 pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>(); 2536 2537 let orig_rax: u64 = 0; 2538 let gregs = self.vcpus[usize::from(vcpu_id)] 2539 .lock() 2540 .unwrap() 2541 .vcpu 2542 .get_regs() 2543 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2544 2545 let regs1 = [ 2546 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11, 2547 gregs.r10, 2548 ]; 2549 let regs2 = [ 2550 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax, 2551 ]; 2552 2553 let sregs = self.vcpus[usize::from(vcpu_id)] 2554 .lock() 2555 .unwrap() 2556 .vcpu 2557 .get_sregs() 2558 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2559 2560 debug!( 2561 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}", 2562 gregs.rip, 2563 gregs.rsp, 2564 sregs.gs.base, 2565 sregs.cs.selector, 2566 sregs.ss.selector, 2567 sregs.ds.selector, 2568 ); 2569 2570 let regs = X86_64UserRegs { 2571 regs1, 2572 regs2, 2573 rip: gregs.rip, 2574 cs: sregs.cs.selector as u64, 2575 eflags: gregs.rflags, 2576 rsp: gregs.rsp, 2577 ss: sregs.ss.selector as u64, 2578 fs_base: sregs.fs.base, 2579 gs_base: sregs.gs.base, 2580 ds: sregs.ds.selector as u64, 2581 es: sregs.es.selector as u64, 2582 fs: sregs.fs.selector as u64, 2583 gs: sregs.gs.selector as u64, 2584 }; 2585 2586 // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) }; 2587 let bytes: &[u8] = regs.as_slice(); 2588 buf.resize(note_size as usize, 0); 2589 buf.splice(pos.., bytes.to_vec()); 2590 buf.resize(note_size as usize, 0); 2591 2592 coredump_file 2593 .write(&buf) 2594 .map_err(GuestDebuggableError::CoredumpFile)?; 2595 } 2596 2597
Ok(()) 2598 } 2599 2600 fn cpu_write_vmm_note( 2601 &mut self, 2602 dump_state: &DumpState, 2603 ) -> std::result::Result<(), GuestDebuggableError> { 2604 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2605 for vcpu in &self.vcpus { 2606 let note_size = self.get_note_size(NoteDescType::Vmm, 1); 2607 let mut pos: usize = 0; 2608 let mut buf = vec![0; note_size as usize]; 2609 let descsz = size_of::<DumpCpusState>(); 2610 let vcpu_id = vcpu.lock().unwrap().id; 2611 2612 let note = Elf64_Nhdr { 2613 n_namesz: COREDUMP_NAME_SIZE, 2614 n_descsz: descsz as u32, 2615 n_type: 0, 2616 }; 2617 2618 let bytes: &[u8] = note.as_slice(); 2619 buf.splice(0.., bytes.to_vec()); 2620 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2621 2622 buf.resize(pos + 4, 0); 2623 buf.splice(pos.., "QEMU".to_string().into_bytes()); 2624 2625 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2626 2627 let gregs = self.vcpus[usize::from(vcpu_id)] 2628 .lock() 2629 .unwrap() 2630 .vcpu 2631 .get_regs() 2632 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2633 2634 let regs1 = [ 2635 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp, 2636 gregs.rbp, 2637 ]; 2638 2639 let regs2 = [ 2640 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, 2641 gregs.r15, 2642 ]; 2643 2644 let sregs = self.vcpus[usize::from(vcpu_id)] 2645 .lock() 2646 .unwrap() 2647 .vcpu 2648 .get_sregs() 2649 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2650 2651 let mut msrs = vec![MsrEntry { 2652 index: msr_index::MSR_KERNEL_GS_BASE, 2653 ..Default::default() 2654 }]; 2655 2656 self.vcpus[vcpu_id as usize] 2657 .lock() 2658 .unwrap() 2659 .vcpu 2660 .get_msrs(&mut msrs) 2661 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?; 2662 let kernel_gs_base = msrs[0].data; 2663 2664 let cs = CpuSegment::new(sregs.cs); 2665 let ds = CpuSegment::new(sregs.ds); 2666 let es = CpuSegment::new(sregs.es); 2667 let fs = CpuSegment::new(sregs.fs); 2668 let gs = CpuSegment::new(sregs.gs); 2669 let ss = CpuSegment::new(sregs.ss); 2670 let ldt = CpuSegment::new(sregs.ldt); 2671 let tr = CpuSegment::new(sregs.tr); 2672 let gdt = CpuSegment::new_from_table(sregs.gdt); 2673 let idt = CpuSegment::new_from_table(sregs.idt); 2674 let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4]; 2675 let regs = DumpCpusState { 2676 version: 1, 2677 size: size_of::<DumpCpusState>() as u32, 2678 regs1, 2679 regs2, 2680 rip: gregs.rip, 2681 rflags: gregs.rflags, 2682 cs, 2683 ds, 2684 es, 2685 fs, 2686 gs, 2687 ss, 2688 ldt, 2689 tr, 2690 gdt, 2691 idt, 2692 cr, 2693 kernel_gs_base, 2694 }; 2695 2696 let bytes: &[u8] = regs.as_slice(); 2697 buf.resize(note_size as usize, 0); 2698 buf.splice(pos.., bytes.to_vec()); 2699 buf.resize(note_size as usize, 0); 2700 2701 coredump_file 2702 .write(&buf) 2703 .map_err(GuestDebuggableError::CoredumpFile)?; 2704 } 2705 2706 Ok(()) 2707 } 2708 } 2709 2710 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2711 #[cfg(test)] 2712 mod tests { 2713 use arch::x86_64::interrupts::*; 2714 use arch::x86_64::regs::*; 2715 use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters}; 2716 2717 #[test] 2718 fn test_setlint() { 2719 let hv = hypervisor::new().unwrap(); 2720 let vm = hv.create_vm().expect("new VM fd creation failed"); 2721 assert!(hv.check_required_extensions().is_ok()); 2722 // Calling get_lapic will fail if there is no irqchip before hand. 
2723 assert!(vm.create_irq_chip().is_ok()); 2724 let vcpu = vm.create_vcpu(0, None).unwrap(); 2725 let klapic_before: LapicState = vcpu.get_lapic().unwrap(); 2726 2727 // Compute the value that is expected to represent LVT0 and LVT1. 2728 let lint0 = klapic_before.get_klapic_reg(APIC_LVT0); 2729 let lint1 = klapic_before.get_klapic_reg(APIC_LVT1); 2730 let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT); 2731 let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI); 2732 2733 set_lint(&vcpu).unwrap(); 2734 2735 // Compute the value that represents LVT0 and LVT1 after set_lint. 2736 let klapic_actual: LapicState = vcpu.get_lapic().unwrap(); 2737 let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0); 2738 let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1); 2739 assert_eq!(lint0_mode_expected, lint0_mode_actual); 2740 assert_eq!(lint1_mode_expected, lint1_mode_actual); 2741 } 2742 2743 #[test] 2744 fn test_setup_fpu() { 2745 let hv = hypervisor::new().unwrap(); 2746 let vm = hv.create_vm().expect("new VM fd creation failed"); 2747 let vcpu = vm.create_vcpu(0, None).unwrap(); 2748 setup_fpu(&vcpu).unwrap(); 2749 2750 let expected_fpu: FpuState = FpuState { 2751 fcw: 0x37f, 2752 mxcsr: 0x1f80, 2753 ..Default::default() 2754 }; 2755 let actual_fpu: FpuState = vcpu.get_fpu().unwrap(); 2756 // TODO: auto-generate kvm related structures with PartialEq on. 2757 assert_eq!(expected_fpu.fcw, actual_fpu.fcw); 2758 // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything. 2759 // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c. 2760 // The mxcsr will stay 0 and the assert below would fail. We need to decide whether we should 2761 // remove it altogether. 2762 // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr); 2763 } 2764 2765 #[test] 2766 fn test_setup_msrs() { 2767 use hypervisor::arch::x86::{msr_index, MsrEntry}; 2768 2769 let hv = hypervisor::new().unwrap(); 2770 let vm = hv.create_vm().expect("new VM fd creation failed"); 2771 let vcpu = vm.create_vcpu(0, None).unwrap(); 2772 setup_msrs(&vcpu).unwrap(); 2773 2774 // This test will check against the last MSR entry configured (the tenth one). 2775 // See create_msr_entries for details. 2776 let mut msrs = vec![MsrEntry { 2777 index: msr_index::MSR_IA32_MISC_ENABLE, 2778 ..Default::default() 2779 }]; 2780 2781 // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read 1 2782 // in this test scenario. 2783 let read_msrs = vcpu.get_msrs(&mut msrs).unwrap(); 2784 assert_eq!(read_msrs, 1); 2785 2786 // Official entries that were set up when we did setup_msrs. We need to assert that the 2787 // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we 2788 // expect.
2789 let entry_vec = vcpu.boot_msr_entries(); 2790 assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]); 2791 } 2792 2793 #[test] 2794 fn test_setup_regs() { 2795 let hv = hypervisor::new().unwrap(); 2796 let vm = hv.create_vm().expect("new VM fd creation failed"); 2797 let vcpu = vm.create_vcpu(0, None).unwrap(); 2798 2799 let expected_regs: StandardRegisters = StandardRegisters { 2800 rflags: 0x0000000000000002u64, 2801 rbx: arch::layout::PVH_INFO_START.0, 2802 rip: 1, 2803 ..Default::default() 2804 }; 2805 2806 setup_regs(&vcpu, expected_regs.rip).unwrap(); 2807 2808 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2809 assert_eq!(actual_regs, expected_regs); 2810 } 2811 } 2812 2813 #[cfg(target_arch = "aarch64")] 2814 #[cfg(test)] 2815 mod tests { 2816 use arch::{aarch64::regs, layout}; 2817 use hypervisor::kvm::aarch64::is_system_register; 2818 use hypervisor::kvm::kvm_bindings::{ 2819 kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, 2820 KVM_REG_ARM_CORE, KVM_REG_SIZE_U64, 2821 }; 2822 use hypervisor::{arm64_core_reg_id, offset_of}; 2823 use std::mem; 2824 2825 #[test] 2826 fn test_setup_regs() { 2827 let hv = hypervisor::new().unwrap(); 2828 let vm = hv.create_vm().unwrap(); 2829 let vcpu = vm.create_vcpu(0, None).unwrap(); 2830 2831 let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0); 2832 // Must fail when vcpu is not initialized yet. 2833 assert!(res.is_err()); 2834 2835 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2836 vm.get_preferred_target(&mut kvi).unwrap(); 2837 vcpu.vcpu_init(&kvi).unwrap(); 2838 2839 assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok()); 2840 } 2841 2842 #[test] 2843 fn test_read_mpidr() { 2844 let hv = hypervisor::new().unwrap(); 2845 let vm = hv.create_vm().unwrap(); 2846 let vcpu = vm.create_vcpu(0, None).unwrap(); 2847 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2848 vm.get_preferred_target(&mut kvi).unwrap(); 2849 2850 // Must fail when vcpu is not initialized yet. 2851 assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err()); 2852 2853 vcpu.vcpu_init(&kvi).unwrap(); 2854 assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000); 2855 } 2856 2857 #[test] 2858 fn test_is_system_register() { 2859 let offset = offset_of!(user_pt_regs, pc); 2860 let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset); 2861 assert!(!is_system_register(regid)); 2862 let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64; 2863 assert!(is_system_register(regid)); 2864 } 2865 2866 #[test] 2867 fn test_save_restore_core_regs() { 2868 let hv = hypervisor::new().unwrap(); 2869 let vm = hv.create_vm().unwrap(); 2870 let vcpu = vm.create_vcpu(0, None).unwrap(); 2871 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2872 vm.get_preferred_target(&mut kvi).unwrap(); 2873 2874 // Must fail when vcpu is not initialized yet. 
2875 let res = vcpu.get_regs(); 2876 assert!(res.is_err()); 2877 assert_eq!( 2878 format!("{}", res.unwrap_err()), 2879 "Failed to get core register: Exec format error (os error 8)" 2880 ); 2881 2882 let mut state = kvm_regs::default(); 2883 let res = vcpu.set_regs(&state); 2884 assert!(res.is_err()); 2885 assert_eq!( 2886 format!("{}", res.unwrap_err()), 2887 "Failed to set core register: Exec format error (os error 8)" 2888 ); 2889 2890 vcpu.vcpu_init(&kvi).unwrap(); 2891 let res = vcpu.get_regs(); 2892 assert!(res.is_ok()); 2893 state = res.unwrap(); 2894 assert_eq!(state.regs.pstate, 0x3C5); 2895 2896 assert!(vcpu.set_regs(&state).is_ok()); 2897 } 2898 2899 #[test] 2900 fn test_get_set_mpstate() { 2901 let hv = hypervisor::new().unwrap(); 2902 let vm = hv.create_vm().unwrap(); 2903 let vcpu = vm.create_vcpu(0, None).unwrap(); 2904 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2905 vm.get_preferred_target(&mut kvi).unwrap(); 2906 2907 let res = vcpu.get_mp_state(); 2908 assert!(res.is_ok()); 2909 assert!(vcpu.set_mp_state(res.unwrap()).is_ok()); 2910 } 2911 } 2912
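// The test module below is NOT part of the original file: it is a minimal,
// illustrative sketch of the bit manipulation performed by translate_gva()
// above. The module and test names are hypothetical, and the cfg gating
// assumes the extract_bits_64!/extract_bits_64_without_offset! macros keep
// the aarch64 + "guest_debug" scoping they have elsewhere in this file.
#[cfg(all(test, target_arch = "aarch64", feature = "guest_debug"))]
mod translate_gva_bit_examples {
    #[test]
    fn test_extract_bits_macros() {
        // Taking 2 bits starting at offset 1 of 0b0110 yields 0b11.
        assert_eq!(extract_bits_64!(0b0000_0110u64, 1, 2), 0b11);

        // With a 52-bit PA (FEAT_LPA/FEAT_LPA2), bits [2:5] of TTBR1_EL1
        // supply translation table address bits [48:51].
        let ttbr1_el1: u64 = 0b11_1100;
        assert_eq!(extract_bits_64!(ttbr1_el1, 2, 4), 0b1111);

        // Keeping only the 48 least significant bits drops the high bits of
        // the register.
        assert_eq!(
            extract_bits_64_without_offset!(0xffff_0000_1234_5678u64, 48),
            0x0000_0000_1234_5678
        );
    }

    #[test]
    fn test_final_page_offset_arithmetic() {
        // With a 4 KiB granule the stride is 9, so a level 3 entry maps a
        // 1 << ((9 * (4 - 3)) + 3) = 4 KiB page.
        let stride = 9u64;
        let level = 3u64;
        let page_size = 1u64 << ((stride * (4 - level)) + 3);
        assert_eq!(page_size, 4096);

        // The translated address keeps the page frame from the descriptor
        // and takes the page offset from the guest virtual address.
        let mut descaddr: u64 = 0x8000_0123;
        let gva: u64 = 0xffff_0000_0000_0abc;
        descaddr &= !(page_size - 1);
        descaddr |= gva & (page_size - 1);
        assert_eq!(descaddr, 0x8000_0abc);
    }
}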