1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::CpusConfig; 15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 16 use crate::coredump::{ 17 CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable, 18 GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE, 19 NT_PRSTATUS, 20 }; 21 #[cfg(feature = "guest_debug")] 22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError}; 23 #[cfg(target_arch = "x86_64")] 24 use crate::memory_manager::MemoryManager; 25 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 26 #[cfg(target_arch = "x86_64")] 27 use crate::vm::physical_bits; 28 use crate::GuestMemoryMmap; 29 use crate::CPU_MANAGER_SNAPSHOT_ID; 30 use acpi_tables::{aml, sdt::Sdt, Aml}; 31 use anyhow::anyhow; 32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 33 use arch::aarch64::regs; 34 #[cfg(target_arch = "x86_64")] 35 use arch::x86_64::get_x2apic_id; 36 use arch::EntryPoint; 37 use arch::NumaNodes; 38 #[cfg(target_arch = "aarch64")] 39 use devices::gic::Gic; 40 use devices::interrupt_controller::InterruptController; 41 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 42 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 43 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 44 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs}; 45 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 46 use hypervisor::aarch64::StandardRegisters; 47 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 48 use hypervisor::arch::x86::msr_index; 49 #[cfg(target_arch = "x86_64")] 50 use hypervisor::arch::x86::CpuIdEntry; 51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 52 use hypervisor::arch::x86::MsrEntry; 53 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 54 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters}; 55 #[cfg(target_arch = "aarch64")] 56 use hypervisor::kvm::kvm_bindings; 57 #[cfg(all(target_arch = "aarch64", feature = "kvm"))] 58 use hypervisor::kvm::kvm_ioctls::Cap; 59 #[cfg(feature = "tdx")] 60 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus}; 61 #[cfg(target_arch = "x86_64")] 62 use hypervisor::CpuVendor; 63 use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps}; 64 use libc::{c_void, siginfo_t}; 65 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 66 use linux_loader::elf::Elf64_Nhdr; 67 use seccompiler::{apply_filter, SeccompAction}; 68 use std::collections::BTreeMap; 69 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 70 use std::io::Write; 71 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 72 use std::mem::size_of; 73 use std::os::unix::thread::JoinHandleExt; 74 use std::sync::atomic::{AtomicBool, Ordering}; 75 use std::sync::{Arc, Barrier, Mutex}; 76 use std::{cmp, io, result, thread}; 77 use thiserror::Error; 78 use tracer::trace_scoped; 79 use vm_device::BusDevice; 80 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 81 use vm_memory::ByteValued; 82 #[cfg(feature = "guest_debug")] 83 use 
vm_memory::{Bytes, GuestAddressSpace};
use vm_memory::{GuestAddress, GuestMemoryAtomic};
use vm_migration::{
    snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
use zerocopy::AsBytes;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
macro_rules! extract_bits_64_without_offset {
    ($value: tt, $length: tt) => {
        $value & (!0u64 >> (64 - $length))
    };
}

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[error("vCPU still pending removal")]
    VcpuPendingRemovedVcpu,

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),

    #[error("Maximum number of vCPUs exceeds host limit")]
    MaximumVcpusExceeded,

    #[cfg(feature =
"sev_snp")] 189 #[error("Failed to set sev control register: {0}")] 190 SetSevControlRegister(#[source] hypervisor::HypervisorCpuError), 191 } 192 pub type Result<T> = result::Result<T, Error>; 193 194 #[cfg(target_arch = "x86_64")] 195 #[allow(dead_code)] 196 #[repr(packed)] 197 #[derive(AsBytes)] 198 struct LocalX2Apic { 199 pub r#type: u8, 200 pub length: u8, 201 pub _reserved: u16, 202 pub apic_id: u32, 203 pub flags: u32, 204 pub processor_id: u32, 205 } 206 207 #[allow(dead_code)] 208 #[repr(packed)] 209 #[derive(Default, AsBytes)] 210 struct Ioapic { 211 pub r#type: u8, 212 pub length: u8, 213 pub ioapic_id: u8, 214 _reserved: u8, 215 pub apic_address: u32, 216 pub gsi_base: u32, 217 } 218 219 #[cfg(target_arch = "aarch64")] 220 #[allow(dead_code)] 221 #[repr(packed)] 222 #[derive(AsBytes)] 223 struct GicC { 224 pub r#type: u8, 225 pub length: u8, 226 pub reserved0: u16, 227 pub cpu_interface_number: u32, 228 pub uid: u32, 229 pub flags: u32, 230 pub parking_version: u32, 231 pub performance_interrupt: u32, 232 pub parked_address: u64, 233 pub base_address: u64, 234 pub gicv_base_address: u64, 235 pub gich_base_address: u64, 236 pub vgic_interrupt: u32, 237 pub gicr_base_address: u64, 238 pub mpidr: u64, 239 pub proc_power_effi_class: u8, 240 pub reserved1: u8, 241 pub spe_overflow_interrupt: u16, 242 } 243 244 #[cfg(target_arch = "aarch64")] 245 #[allow(dead_code)] 246 #[repr(packed)] 247 #[derive(AsBytes)] 248 struct GicD { 249 pub r#type: u8, 250 pub length: u8, 251 pub reserved0: u16, 252 pub gic_id: u32, 253 pub base_address: u64, 254 pub global_irq_base: u32, 255 pub version: u8, 256 pub reserved1: [u8; 3], 257 } 258 259 #[cfg(target_arch = "aarch64")] 260 #[allow(dead_code)] 261 #[repr(packed)] 262 #[derive(AsBytes)] 263 struct GicR { 264 pub r#type: u8, 265 pub length: u8, 266 pub reserved: u16, 267 pub base_address: u64, 268 pub range_length: u32, 269 } 270 271 #[cfg(target_arch = "aarch64")] 272 #[allow(dead_code)] 273 #[repr(packed)] 274 #[derive(AsBytes)] 275 struct GicIts { 276 pub r#type: u8, 277 pub length: u8, 278 pub reserved0: u16, 279 pub translation_id: u32, 280 pub base_address: u64, 281 pub reserved1: u32, 282 } 283 284 #[cfg(target_arch = "aarch64")] 285 #[allow(dead_code)] 286 #[repr(packed)] 287 #[derive(AsBytes)] 288 struct ProcessorHierarchyNode { 289 pub r#type: u8, 290 pub length: u8, 291 pub reserved: u16, 292 pub flags: u32, 293 pub parent: u32, 294 pub acpi_processor_id: u32, 295 pub num_private_resources: u32, 296 } 297 298 #[allow(dead_code)] 299 #[repr(packed)] 300 #[derive(Default, AsBytes)] 301 struct InterruptSourceOverride { 302 pub r#type: u8, 303 pub length: u8, 304 pub bus: u8, 305 pub source: u8, 306 pub gsi: u32, 307 pub flags: u16, 308 } 309 310 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 311 macro_rules! round_up { 312 ($n:expr,$d:expr) => { 313 (($n / ($d + 1)) + 1) * $d 314 }; 315 } 316 317 /// A wrapper around creating and using a kvm-based VCPU. 318 pub struct Vcpu { 319 // The hypervisor abstracted CPU. 320 vcpu: Arc<dyn hypervisor::Vcpu>, 321 id: u8, 322 #[cfg(target_arch = "aarch64")] 323 mpidr: u64, 324 saved_state: Option<CpuState>, 325 #[cfg(target_arch = "x86_64")] 326 vendor: CpuVendor, 327 } 328 329 impl Vcpu { 330 /// Constructs a new VCPU for `vm`. 331 /// 332 /// # Arguments 333 /// 334 /// * `id` - Represents the CPU number between [0, max vcpus). 335 /// * `vm` - The virtual machine this vcpu will get attached to. 336 /// * `vm_ops` - Optional object for exit handling. 
337 /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0) 338 pub fn new( 339 id: u8, 340 apic_id: u8, 341 vm: &Arc<dyn hypervisor::Vm>, 342 vm_ops: Option<Arc<dyn VmOps>>, 343 #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor, 344 ) -> Result<Self> { 345 let vcpu = vm 346 .create_vcpu(apic_id, vm_ops) 347 .map_err(|e| Error::VcpuCreate(e.into()))?; 348 // Initially the cpuid per vCPU is the one supported by this VM. 349 Ok(Vcpu { 350 vcpu, 351 id, 352 #[cfg(target_arch = "aarch64")] 353 mpidr: 0, 354 saved_state: None, 355 #[cfg(target_arch = "x86_64")] 356 vendor: cpu_vendor, 357 }) 358 } 359 360 /// Configures a vcpu and should be called once per vcpu when created. 361 /// 362 /// # Arguments 363 /// 364 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 365 /// * `guest_memory` - Guest memory. 366 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 367 pub fn configure( 368 &mut self, 369 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 370 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 371 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 372 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 373 #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>, 374 ) -> Result<()> { 375 #[cfg(target_arch = "aarch64")] 376 { 377 self.init(vm)?; 378 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup) 379 .map_err(Error::VcpuConfiguration)?; 380 } 381 info!("Configuring vCPU: cpu_id = {}", self.id); 382 #[cfg(target_arch = "x86_64")] 383 arch::configure_vcpu( 384 &self.vcpu, 385 self.id, 386 boot_setup, 387 cpuid, 388 kvm_hyperv, 389 self.vendor, 390 topology, 391 ) 392 .map_err(Error::VcpuConfiguration)?; 393 394 Ok(()) 395 } 396 397 /// Gets the MPIDR register value. 398 #[cfg(target_arch = "aarch64")] 399 pub fn get_mpidr(&self) -> u64 { 400 self.mpidr 401 } 402 403 /// Gets the saved vCPU state. 404 #[cfg(target_arch = "aarch64")] 405 pub fn get_saved_state(&self) -> Option<CpuState> { 406 self.saved_state.clone() 407 } 408 409 /// Initializes an aarch64 specific vcpu for booting Linux. 410 #[cfg(target_arch = "aarch64")] 411 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 412 let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); 413 414 // This reads back the kernel's preferred target type. 415 vm.get_preferred_target(&mut kvi) 416 .map_err(Error::VcpuArmPreferredTarget)?; 417 // We already checked that the capability is supported. 418 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; 419 if vm 420 .as_any() 421 .downcast_ref::<hypervisor::kvm::KvmVm>() 422 .unwrap() 423 .check_extension(Cap::ArmPmuV3) 424 { 425 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3; 426 } 427 // Non-boot cpus are powered off initially. 428 if self.id > 0 { 429 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; 430 } 431 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit) 432 } 433 434 /// Runs the VCPU until it exits, returning the reason. 435 /// 436 /// Note that the state of the VCPU and associated VM must be setup first for this to do 437 /// anything useful. 
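    /// The caller decides how to react to the returned `VmExit`; the vCPU
    /// thread loop in `CpuManager::start_vcpu()`, for example, maps
    /// `VmExit::Reset` and `VmExit::Shutdown` onto the reset and exit event fds.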
438 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 439 self.vcpu.run() 440 } 441 442 #[cfg(feature = "sev_snp")] 443 pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> { 444 self.vcpu 445 .set_sev_control_register(vmsa_pfn) 446 .map_err(Error::SetSevControlRegister) 447 } 448 } 449 450 impl Pausable for Vcpu {} 451 impl Snapshottable for Vcpu { 452 fn id(&self) -> String { 453 self.id.to_string() 454 } 455 456 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 457 let saved_state = self 458 .vcpu 459 .state() 460 .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?; 461 462 self.saved_state = Some(saved_state.clone()); 463 464 Ok(Snapshot::from_data(SnapshotData::new_from_state( 465 &saved_state, 466 )?)) 467 } 468 } 469 470 pub struct CpuManager { 471 config: CpusConfig, 472 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 473 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 474 #[cfg(target_arch = "x86_64")] 475 cpuid: Vec<CpuIdEntry>, 476 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 477 vm: Arc<dyn hypervisor::Vm>, 478 vcpus_kill_signalled: Arc<AtomicBool>, 479 vcpus_pause_signalled: Arc<AtomicBool>, 480 exit_evt: EventFd, 481 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 482 reset_evt: EventFd, 483 #[cfg(feature = "guest_debug")] 484 vm_debug_evt: EventFd, 485 vcpu_states: Vec<VcpuState>, 486 selected_cpu: u8, 487 vcpus: Vec<Arc<Mutex<Vcpu>>>, 488 seccomp_action: SeccompAction, 489 vm_ops: Arc<dyn VmOps>, 490 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 491 acpi_address: Option<GuestAddress>, 492 proximity_domain_per_cpu: BTreeMap<u8, u32>, 493 affinity: BTreeMap<u8, Vec<usize>>, 494 dynamic: bool, 495 hypervisor: Arc<dyn hypervisor::Hypervisor>, 496 #[cfg(feature = "sev_snp")] 497 sev_snp_enabled: bool, 498 } 499 500 const CPU_ENABLE_FLAG: usize = 0; 501 const CPU_INSERTING_FLAG: usize = 1; 502 const CPU_REMOVING_FLAG: usize = 2; 503 const CPU_EJECT_FLAG: usize = 3; 504 505 const CPU_STATUS_OFFSET: u64 = 4; 506 const CPU_SELECTION_OFFSET: u64 = 0; 507 508 impl BusDevice for CpuManager { 509 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 510 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
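        // Zero the whole buffer up front so any status bit not explicitly set
        // below reads back as 0.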
511 data.fill(0); 512 513 match offset { 514 CPU_SELECTION_OFFSET => { 515 data[0] = self.selected_cpu; 516 } 517 CPU_STATUS_OFFSET => { 518 if self.selected_cpu < self.max_vcpus() { 519 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 520 if state.active() { 521 data[0] |= 1 << CPU_ENABLE_FLAG; 522 } 523 if state.inserting { 524 data[0] |= 1 << CPU_INSERTING_FLAG; 525 } 526 if state.removing { 527 data[0] |= 1 << CPU_REMOVING_FLAG; 528 } 529 } else { 530 warn!("Out of range vCPU id: {}", self.selected_cpu); 531 } 532 } 533 _ => { 534 warn!( 535 "Unexpected offset for accessing CPU manager device: {:#}", 536 offset 537 ); 538 } 539 } 540 } 541 542 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 543 match offset { 544 CPU_SELECTION_OFFSET => { 545 self.selected_cpu = data[0]; 546 } 547 CPU_STATUS_OFFSET => { 548 if self.selected_cpu < self.max_vcpus() { 549 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 550 // The ACPI code writes back a 1 to acknowledge the insertion 551 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 552 && state.inserting 553 { 554 state.inserting = false; 555 } 556 // Ditto for removal 557 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 558 && state.removing 559 { 560 state.removing = false; 561 } 562 // Trigger removal of vCPU 563 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 564 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 565 error!("Error removing vCPU: {:?}", e); 566 } 567 } 568 } else { 569 warn!("Out of range vCPU id: {}", self.selected_cpu); 570 } 571 } 572 _ => { 573 warn!( 574 "Unexpected offset for accessing CPU manager device: {:#}", 575 offset 576 ); 577 } 578 } 579 None 580 } 581 } 582 583 #[derive(Default)] 584 struct VcpuState { 585 inserting: bool, 586 removing: bool, 587 pending_removal: Arc<AtomicBool>, 588 handle: Option<thread::JoinHandle<()>>, 589 kill: Arc<AtomicBool>, 590 vcpu_run_interrupted: Arc<AtomicBool>, 591 paused: Arc<AtomicBool>, 592 } 593 594 impl VcpuState { 595 fn active(&self) -> bool { 596 self.handle.is_some() 597 } 598 599 fn signal_thread(&self) { 600 if let Some(handle) = self.handle.as_ref() { 601 loop { 602 // SAFETY: FFI call with correct arguments 603 unsafe { 604 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 605 } 606 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 607 break; 608 } else { 609 // This is more effective than thread::yield_now() at 610 // avoiding a priority inversion with the vCPU thread 611 thread::sleep(std::time::Duration::from_millis(1)); 612 } 613 } 614 } 615 } 616 617 fn join_thread(&mut self) -> Result<()> { 618 if let Some(handle) = self.handle.take() { 619 handle.join().map_err(Error::ThreadCleanup)? 
620 } 621 622 Ok(()) 623 } 624 625 fn unpark_thread(&self) { 626 if let Some(handle) = self.handle.as_ref() { 627 handle.thread().unpark() 628 } 629 } 630 } 631 632 impl CpuManager { 633 #[allow(unused_variables)] 634 #[allow(clippy::too_many_arguments)] 635 pub fn new( 636 config: &CpusConfig, 637 vm: Arc<dyn hypervisor::Vm>, 638 exit_evt: EventFd, 639 reset_evt: EventFd, 640 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 641 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 642 seccomp_action: SeccompAction, 643 vm_ops: Arc<dyn VmOps>, 644 #[cfg(feature = "tdx")] tdx_enabled: bool, 645 numa_nodes: &NumaNodes, 646 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 647 ) -> Result<Arc<Mutex<CpuManager>>> { 648 if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() { 649 return Err(Error::MaximumVcpusExceeded); 650 } 651 652 let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); 653 vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); 654 let hypervisor_type = hypervisor.hypervisor_type(); 655 #[cfg(target_arch = "x86_64")] 656 let cpu_vendor = hypervisor.get_cpu_vendor(); 657 658 #[cfg(target_arch = "x86_64")] 659 if config.features.amx { 660 const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; 661 const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; 662 const XFEATURE_XTILEDATA: usize = 18; 663 const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; 664 665 // SAFETY: the syscall is only modifying kernel internal 666 // data structures that the kernel is itself expected to safeguard. 667 let amx_tile = unsafe { 668 libc::syscall( 669 libc::SYS_arch_prctl, 670 ARCH_REQ_XCOMP_GUEST_PERM, 671 XFEATURE_XTILEDATA, 672 ) 673 }; 674 675 if amx_tile != 0 { 676 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 677 } else { 678 let mask: usize = 0; 679 // SAFETY: the mask being modified (not marked mutable as it is 680 // modified in unsafe only which is permitted) isn't in use elsewhere. 
681 let result = unsafe { 682 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 683 }; 684 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 685 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 686 } 687 } 688 } 689 690 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 691 let mut cpu_list = Vec::new(); 692 for (proximity_domain, numa_node) in numa_nodes.iter() { 693 for cpu in numa_node.cpus.iter() { 694 cpu_list.push((*cpu, *proximity_domain)) 695 } 696 } 697 cpu_list 698 } 699 .into_iter() 700 .collect(); 701 702 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 703 cpu_affinity 704 .iter() 705 .map(|a| (a.vcpu, a.host_cpus.clone())) 706 .collect() 707 } else { 708 BTreeMap::new() 709 }; 710 711 #[cfg(feature = "tdx")] 712 let dynamic = !tdx_enabled; 713 #[cfg(not(feature = "tdx"))] 714 let dynamic = true; 715 716 Ok(Arc::new(Mutex::new(CpuManager { 717 config: config.clone(), 718 interrupt_controller: None, 719 #[cfg(target_arch = "x86_64")] 720 cpuid: Vec::new(), 721 vm, 722 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 723 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 724 vcpu_states, 725 exit_evt, 726 reset_evt, 727 #[cfg(feature = "guest_debug")] 728 vm_debug_evt, 729 selected_cpu: 0, 730 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 731 seccomp_action, 732 vm_ops, 733 acpi_address: None, 734 proximity_domain_per_cpu, 735 affinity, 736 dynamic, 737 hypervisor: hypervisor.clone(), 738 #[cfg(feature = "sev_snp")] 739 sev_snp_enabled, 740 }))) 741 } 742 743 #[cfg(target_arch = "x86_64")] 744 pub fn populate_cpuid( 745 &mut self, 746 memory_manager: &Arc<Mutex<MemoryManager>>, 747 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 748 #[cfg(feature = "tdx")] tdx: bool, 749 ) -> Result<()> { 750 let sgx_epc_sections = memory_manager 751 .lock() 752 .unwrap() 753 .sgx_epc_region() 754 .as_ref() 755 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 756 757 self.cpuid = { 758 let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits); 759 arch::generate_common_cpuid( 760 hypervisor, 761 &arch::CpuidConfig { 762 sgx_epc_sections, 763 phys_bits, 764 kvm_hyperv: self.config.kvm_hyperv, 765 #[cfg(feature = "tdx")] 766 tdx, 767 amx: self.config.features.amx, 768 }, 769 ) 770 .map_err(Error::CommonCpuId)? 771 }; 772 773 Ok(()) 774 } 775 776 fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> { 777 info!("Creating vCPU: cpu_id = {}", cpu_id); 778 779 #[cfg(target_arch = "x86_64")] 780 let topology = self.get_vcpu_topology(); 781 #[cfg(target_arch = "x86_64")] 782 let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology); 783 #[cfg(target_arch = "aarch64")] 784 let x2apic_id = cpu_id as u32; 785 786 let mut vcpu = Vcpu::new( 787 cpu_id, 788 x2apic_id as u8, 789 &self.vm, 790 Some(self.vm_ops.clone()), 791 #[cfg(target_arch = "x86_64")] 792 self.hypervisor.get_cpu_vendor(), 793 )?; 794 795 if let Some(snapshot) = snapshot { 796 // AArch64 vCPUs should be initialized after created. 
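            // (On KVM, KVM_ARM_VCPU_INIT must be issued before vCPU registers
            // can be accessed, hence init comes before restoring the snapshot
            // state below.)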
797 #[cfg(target_arch = "aarch64")] 798 vcpu.init(&self.vm)?; 799 800 let state: CpuState = snapshot.to_state().map_err(|e| { 801 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e)) 802 })?; 803 vcpu.vcpu 804 .set_state(&state) 805 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?; 806 807 vcpu.saved_state = Some(state); 808 } 809 810 let vcpu = Arc::new(Mutex::new(vcpu)); 811 812 // Adding vCPU to the CpuManager's vCPU list. 813 self.vcpus.push(vcpu.clone()); 814 815 Ok(vcpu) 816 } 817 818 pub fn configure_vcpu( 819 &self, 820 vcpu: Arc<Mutex<Vcpu>>, 821 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 822 ) -> Result<()> { 823 let mut vcpu = vcpu.lock().unwrap(); 824 825 #[cfg(feature = "sev_snp")] 826 if self.sev_snp_enabled { 827 if let Some((kernel_entry_point, _)) = boot_setup { 828 vcpu.set_sev_control_register( 829 kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE, 830 )?; 831 } 832 833 // Traditional way to configure vcpu doesn't work for SEV-SNP guests. 834 // All the vCPU configuration for SEV-SNP guest is provided via VMSA. 835 return Ok(()); 836 } 837 838 #[cfg(target_arch = "x86_64")] 839 assert!(!self.cpuid.is_empty()); 840 841 #[cfg(target_arch = "x86_64")] 842 let topology = self.config.topology.clone().map_or_else( 843 || { 844 #[cfg(feature = "mshv")] 845 if matches!(self.hypervisor.hypervisor_type(), HypervisorType::Mshv) { 846 return Some((1, self.boot_vcpus(), 1)); 847 } 848 None 849 }, 850 |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)), 851 ); 852 #[cfg(target_arch = "x86_64")] 853 vcpu.configure( 854 boot_setup, 855 self.cpuid.clone(), 856 self.config.kvm_hyperv, 857 topology, 858 )?; 859 860 #[cfg(target_arch = "aarch64")] 861 vcpu.configure(&self.vm, boot_setup)?; 862 863 Ok(()) 864 } 865 866 /// Only create new vCPUs if there aren't any inactive ones to reuse 867 fn create_vcpus( 868 &mut self, 869 desired_vcpus: u8, 870 snapshot: Option<Snapshot>, 871 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 872 let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![]; 873 info!( 874 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 875 desired_vcpus, 876 self.config.max_vcpus, 877 self.vcpus.len(), 878 self.present_vcpus() 879 ); 880 881 if desired_vcpus > self.config.max_vcpus { 882 return Err(Error::DesiredVCpuCountExceedsMax); 883 } 884 885 // Only create vCPUs in excess of all the allocated vCPUs. 886 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 887 vcpus.push(self.create_vcpu( 888 cpu_id, 889 // TODO: The special format of the CPU id can be removed once 890 // ready to break live upgrade. 891 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()), 892 )?); 893 } 894 895 Ok(vcpus) 896 } 897 898 #[cfg(target_arch = "aarch64")] 899 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 900 for cpu in self.vcpus.iter() { 901 let cpu = cpu.lock().unwrap(); 902 // Check if PMU attr is available, if not, log the information. 
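            // Returning false lets the caller know that at least one vCPU has
            // no PMU, so it can decide not to advertise a PMU to the guest.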
903 if cpu.vcpu.has_pmu_support() { 904 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 905 } else { 906 debug!( 907 "PMU attribute is not supported in vCPU{}, skip PMU init!", 908 cpu.id 909 ); 910 return Ok(false); 911 } 912 } 913 914 Ok(true) 915 } 916 917 pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> { 918 self.vcpus.clone() 919 } 920 921 fn start_vcpu( 922 &mut self, 923 vcpu: Arc<Mutex<Vcpu>>, 924 vcpu_id: u8, 925 vcpu_thread_barrier: Arc<Barrier>, 926 inserting: bool, 927 ) -> Result<()> { 928 let reset_evt = self.reset_evt.try_clone().unwrap(); 929 let exit_evt = self.exit_evt.try_clone().unwrap(); 930 #[cfg(feature = "kvm")] 931 let hypervisor_type = self.hypervisor.hypervisor_type(); 932 #[cfg(feature = "guest_debug")] 933 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 934 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 935 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 936 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 937 938 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 939 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 940 .vcpu_run_interrupted 941 .clone(); 942 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 943 let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone(); 944 945 // Prepare the CPU set the current vCPU is expected to run onto. 946 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 947 // SAFETY: all zeros is a valid pattern 948 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 949 // SAFETY: FFI call, trivially safe 950 unsafe { libc::CPU_ZERO(&mut cpuset) }; 951 for host_cpu in host_cpus { 952 // SAFETY: FFI call, trivially safe 953 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) }; 954 } 955 cpuset 956 }); 957 958 // Retrieve seccomp filter for vcpu thread 959 let vcpu_seccomp_filter = get_seccomp_filter( 960 &self.seccomp_action, 961 Thread::Vcpu, 962 self.hypervisor.hypervisor_type(), 963 ) 964 .map_err(Error::CreateSeccompFilter)?; 965 966 #[cfg(target_arch = "x86_64")] 967 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 968 969 info!("Starting vCPU: cpu_id = {}", vcpu_id); 970 971 let handle = Some( 972 thread::Builder::new() 973 .name(format!("vcpu{vcpu_id}")) 974 .spawn(move || { 975 // Schedule the thread to run on the expected CPU set 976 if let Some(cpuset) = cpuset.as_ref() { 977 // SAFETY: FFI call with correct arguments 978 let ret = unsafe { 979 libc::sched_setaffinity( 980 0, 981 std::mem::size_of::<libc::cpu_set_t>(), 982 cpuset as *const libc::cpu_set_t, 983 ) 984 }; 985 986 if ret != 0 { 987 error!( 988 "Failed scheduling the vCPU {} on the expected CPU set: {}", 989 vcpu_id, 990 io::Error::last_os_error() 991 ); 992 return; 993 } 994 } 995 996 // Apply seccomp filter for vcpu thread. 997 if !vcpu_seccomp_filter.is_empty() { 998 if let Err(e) = 999 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 1000 { 1001 error!("Error applying seccomp filter: {:?}", e); 1002 return; 1003 } 1004 } 1005 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 1006 // This uses an async signal safe handler to kill the vcpu handles. 1007 register_signal_handler(SIGRTMIN(), handle_signal) 1008 .expect("Failed to register vcpu signal handler"); 1009 // Block until all CPUs are ready. 
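                // The barrier was sized in activate_vcpus() for every newly
                // started vCPU thread plus the VMM thread, which performs the
                // final wait() that releases them all together.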
1010 vcpu_thread_barrier.wait(); 1011 1012 std::panic::catch_unwind(move || { 1013 loop { 1014 // If we are being told to pause, we park the thread 1015 // until the pause boolean is toggled. 1016 // The resume operation is responsible for toggling 1017 // the boolean and unpark the thread. 1018 // We enter a loop because park() could spuriously 1019 // return. We will then park() again unless the 1020 // pause boolean has been toggled. 1021 1022 // Need to use Ordering::SeqCst as we have multiple 1023 // loads and stores to different atomics and we need 1024 // to see them in a consistent order in all threads 1025 1026 if vcpu_pause_signalled.load(Ordering::SeqCst) { 1027 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are 1028 // completed by returning to KVM_RUN. From the kernel docs: 1029 // 1030 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, 1031 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding 1032 // operations are complete (and guest state is consistent) only after userspace 1033 // has re-entered the kernel with KVM_RUN. The kernel side will first finish 1034 // incomplete operations and then check for pending signals. 1035 // The pending state of the operation is not preserved in state which is 1036 // visible to userspace, thus userspace should ensure that the operation is 1037 // completed before performing a live migration. Userspace can re-enter the 1038 // guest with an unmasked signal pending or with the immediate_exit field set 1039 // to complete pending operations without allowing any further instructions 1040 // to be executed. 1041 1042 #[cfg(feature = "kvm")] 1043 if matches!(hypervisor_type, HypervisorType::Kvm) { 1044 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); 1045 if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { 1046 error!("Unexpected VM exit on \"immediate_exit\" run"); 1047 break; 1048 } 1049 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); 1050 } 1051 1052 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1053 1054 vcpu_paused.store(true, Ordering::SeqCst); 1055 while vcpu_pause_signalled.load(Ordering::SeqCst) { 1056 thread::park(); 1057 } 1058 vcpu_run_interrupted.store(false, Ordering::SeqCst); 1059 } 1060 1061 // We've been told to terminate 1062 if vcpu_kill_signalled.load(Ordering::SeqCst) 1063 || vcpu_kill.load(Ordering::SeqCst) 1064 { 1065 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1066 break; 1067 } 1068 1069 #[cfg(feature = "tdx")] 1070 let mut vcpu = vcpu.lock().unwrap(); 1071 #[cfg(not(feature = "tdx"))] 1072 let vcpu = vcpu.lock().unwrap(); 1073 // vcpu.run() returns false on a triple-fault so trigger a reset 1074 match vcpu.run() { 1075 Ok(run) => match run { 1076 #[cfg(feature = "kvm")] 1077 VmExit::Debug => { 1078 info!("VmExit::Debug"); 1079 #[cfg(feature = "guest_debug")] 1080 { 1081 vcpu_pause_signalled.store(true, Ordering::SeqCst); 1082 let raw_tid = get_raw_tid(vcpu_id as usize); 1083 vm_debug_evt.write(raw_tid as u64).unwrap(); 1084 } 1085 } 1086 #[cfg(target_arch = "x86_64")] 1087 VmExit::IoapicEoi(vector) => { 1088 if let Some(interrupt_controller) = 1089 &interrupt_controller_clone 1090 { 1091 interrupt_controller 1092 .lock() 1093 .unwrap() 1094 .end_of_interrupt(vector); 1095 } 1096 } 1097 VmExit::Ignore => {} 1098 VmExit::Hyperv => {} 1099 VmExit::Reset => { 1100 info!("VmExit::Reset"); 1101 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1102 reset_evt.write(1).unwrap(); 1103 break; 1104 
} 1105 VmExit::Shutdown => { 1106 info!("VmExit::Shutdown"); 1107 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1108 exit_evt.write(1).unwrap(); 1109 break; 1110 } 1111 #[cfg(feature = "tdx")] 1112 VmExit::Tdx => { 1113 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1114 match vcpu.get_tdx_exit_details() { 1115 Ok(details) => match details { 1116 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1117 TdxExitDetails::SetupEventNotifyInterrupt => { 1118 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1119 } 1120 }, 1121 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1122 } 1123 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1124 } else { 1125 // We should never reach this code as 1126 // this means the design from the code 1127 // is wrong. 1128 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1129 } 1130 } 1131 _ => { 1132 error!( 1133 "VCPU generated error: {:?}", 1134 Error::UnexpectedVmExit 1135 ); 1136 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1137 exit_evt.write(1).unwrap(); 1138 break; 1139 } 1140 }, 1141 1142 Err(e) => { 1143 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1144 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1145 exit_evt.write(1).unwrap(); 1146 break; 1147 } 1148 } 1149 1150 // We've been told to terminate 1151 if vcpu_kill_signalled.load(Ordering::SeqCst) 1152 || vcpu_kill.load(Ordering::SeqCst) 1153 { 1154 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1155 break; 1156 } 1157 } 1158 }) 1159 .or_else(|_| { 1160 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1161 error!("vCPU thread panicked"); 1162 panic_exit_evt.write(1) 1163 }) 1164 .ok(); 1165 }) 1166 .map_err(Error::VcpuSpawn)?, 1167 ); 1168 1169 // On hot plug calls into this function entry_point is None. It is for 1170 // those hotplug CPU additions that we need to set the inserting flag. 1171 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1172 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1173 1174 Ok(()) 1175 } 1176 1177 /// Start up as many vCPUs threads as needed to reach `desired_vcpus` 1178 fn activate_vcpus( 1179 &mut self, 1180 desired_vcpus: u8, 1181 inserting: bool, 1182 paused: Option<bool>, 1183 ) -> Result<()> { 1184 if desired_vcpus > self.config.max_vcpus { 1185 return Err(Error::DesiredVCpuCountExceedsMax); 1186 } 1187 1188 let vcpu_thread_barrier = Arc::new(Barrier::new( 1189 (desired_vcpus - self.present_vcpus() + 1) as usize, 1190 )); 1191 1192 if let Some(paused) = paused { 1193 self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); 1194 } 1195 1196 info!( 1197 "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", 1198 desired_vcpus, 1199 self.vcpus.len(), 1200 self.present_vcpus(), 1201 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1202 ); 1203 1204 // This reuses any inactive vCPUs as well as any that were newly created 1205 for vcpu_id in self.present_vcpus()..desired_vcpus { 1206 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1207 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1208 } 1209 1210 // Unblock all CPU threads. 
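        // This is the extra "+ 1" participant accounted for in the barrier
        // size: once the VMM thread arrives here, every vCPU thread spawned
        // above leaves its own wait() and starts running.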
1211 vcpu_thread_barrier.wait(); 1212 Ok(()) 1213 } 1214 1215 fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) { 1216 // Mark vCPUs for removal, actual removal happens on ejection 1217 for cpu_id in desired_vcpus..self.present_vcpus() { 1218 self.vcpu_states[usize::from(cpu_id)].removing = true; 1219 self.vcpu_states[usize::from(cpu_id)] 1220 .pending_removal 1221 .store(true, Ordering::SeqCst); 1222 } 1223 } 1224 1225 pub fn check_pending_removed_vcpu(&mut self) -> bool { 1226 for state in self.vcpu_states.iter() { 1227 if state.active() && state.pending_removal.load(Ordering::SeqCst) { 1228 return true; 1229 } 1230 } 1231 false 1232 } 1233 1234 fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> { 1235 info!("Removing vCPU: cpu_id = {}", cpu_id); 1236 let state = &mut self.vcpu_states[usize::from(cpu_id)]; 1237 state.kill.store(true, Ordering::SeqCst); 1238 state.signal_thread(); 1239 state.join_thread()?; 1240 state.handle = None; 1241 1242 // Once the thread has exited, clear the "kill" so that it can reused 1243 state.kill.store(false, Ordering::SeqCst); 1244 state.pending_removal.store(false, Ordering::SeqCst); 1245 1246 Ok(()) 1247 } 1248 1249 pub fn create_boot_vcpus( 1250 &mut self, 1251 snapshot: Option<Snapshot>, 1252 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 1253 trace_scoped!("create_boot_vcpus"); 1254 1255 self.create_vcpus(self.boot_vcpus(), snapshot) 1256 } 1257 1258 // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running. 1259 pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> { 1260 self.activate_vcpus(self.boot_vcpus(), false, Some(paused)) 1261 } 1262 1263 pub fn start_restored_vcpus(&mut self) -> Result<()> { 1264 self.activate_vcpus(self.vcpus.len() as u8, false, Some(true)) 1265 .map_err(|e| { 1266 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) 1267 })?; 1268 1269 Ok(()) 1270 } 1271 1272 pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> { 1273 if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal { 1274 return Ok(false); 1275 } 1276 1277 if !self.dynamic { 1278 return Ok(false); 1279 } 1280 1281 if self.check_pending_removed_vcpu() { 1282 return Err(Error::VcpuPendingRemovedVcpu); 1283 } 1284 1285 match desired_vcpus.cmp(&self.present_vcpus()) { 1286 cmp::Ordering::Greater => { 1287 let vcpus = self.create_vcpus(desired_vcpus, None)?; 1288 for vcpu in vcpus { 1289 self.configure_vcpu(vcpu, None)? 1290 } 1291 self.activate_vcpus(desired_vcpus, true, None)?; 1292 Ok(true) 1293 } 1294 cmp::Ordering::Less => { 1295 self.mark_vcpus_for_removal(desired_vcpus); 1296 Ok(true) 1297 } 1298 _ => Ok(false), 1299 } 1300 } 1301 1302 pub fn shutdown(&mut self) -> Result<()> { 1303 // Tell the vCPUs to stop themselves next time they go through the loop 1304 self.vcpus_kill_signalled.store(true, Ordering::SeqCst); 1305 1306 // Toggle the vCPUs pause boolean 1307 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 1308 1309 // Unpark all the VCPU threads. 1310 for state in self.vcpu_states.iter() { 1311 state.unpark_thread(); 1312 } 1313 1314 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 1315 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 1316 // above. 1317 for state in self.vcpu_states.iter() { 1318 state.signal_thread(); 1319 } 1320 1321 // Wait for all the threads to finish. This removes the state from the vector. 1322 for mut state in self.vcpu_states.drain(..) 
{ 1323 state.join_thread()?; 1324 } 1325 1326 Ok(()) 1327 } 1328 1329 #[cfg(feature = "tdx")] 1330 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1331 for vcpu in &self.vcpus { 1332 vcpu.lock() 1333 .unwrap() 1334 .vcpu 1335 .tdx_init(hob_address) 1336 .map_err(Error::InitializeTdx)?; 1337 } 1338 Ok(()) 1339 } 1340 1341 pub fn boot_vcpus(&self) -> u8 { 1342 self.config.boot_vcpus 1343 } 1344 1345 pub fn max_vcpus(&self) -> u8 { 1346 self.config.max_vcpus 1347 } 1348 1349 #[cfg(target_arch = "x86_64")] 1350 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1351 assert!(!self.cpuid.is_empty()); 1352 self.cpuid.clone() 1353 } 1354 1355 fn present_vcpus(&self) -> u8 { 1356 self.vcpu_states 1357 .iter() 1358 .fold(0, |acc, state| acc + state.active() as u8) 1359 } 1360 1361 #[cfg(target_arch = "aarch64")] 1362 pub fn get_mpidrs(&self) -> Vec<u64> { 1363 self.vcpus 1364 .iter() 1365 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1366 .collect() 1367 } 1368 1369 #[cfg(target_arch = "aarch64")] 1370 pub fn get_saved_states(&self) -> Vec<CpuState> { 1371 self.vcpus 1372 .iter() 1373 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1374 .collect() 1375 } 1376 1377 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1378 self.config 1379 .topology 1380 .clone() 1381 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1382 } 1383 1384 pub fn create_madt(&self) -> Sdt { 1385 use crate::acpi; 1386 // This is also checked in the commandline parsing. 1387 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1388 1389 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1390 #[cfg(target_arch = "x86_64")] 1391 { 1392 madt.write(36, arch::layout::APIC_START.0); 1393 1394 for cpu in 0..self.config.max_vcpus { 1395 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology()); 1396 1397 let lapic = LocalX2Apic { 1398 r#type: acpi::ACPI_X2APIC_PROCESSOR, 1399 length: 16, 1400 processor_id: cpu.into(), 1401 apic_id: x2apic_id, 1402 flags: if cpu < self.config.boot_vcpus { 1403 1 << MADT_CPU_ENABLE_FLAG 1404 } else { 1405 0 1406 } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG, 1407 _reserved: 0, 1408 }; 1409 madt.append(lapic); 1410 } 1411 1412 madt.append(Ioapic { 1413 r#type: acpi::ACPI_APIC_IO, 1414 length: 12, 1415 ioapic_id: 0, 1416 apic_address: arch::layout::IOAPIC_START.0 as u32, 1417 gsi_base: 0, 1418 ..Default::default() 1419 }); 1420 1421 madt.append(InterruptSourceOverride { 1422 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1423 length: 10, 1424 bus: 0, 1425 source: 4, 1426 gsi: 4, 1427 flags: 0, 1428 }); 1429 } 1430 1431 #[cfg(target_arch = "aarch64")] 1432 { 1433 /* Notes: 1434 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1435 */ 1436 1437 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 
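            // One GICC entry is emitted per boot vCPU; the `flags: 1` below
            // marks the CPU interface as enabled.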
1438 for cpu in 0..self.config.boot_vcpus { 1439 let vcpu = &self.vcpus[cpu as usize]; 1440 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1441 /* ARMv8 MPIDR format: 1442 Bits [63:40] Must be zero 1443 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1444 Bits [31:24] Must be zero 1445 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1446 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1447 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1448 */ 1449 let mpidr_mask = 0xff_00ff_ffff; 1450 let gicc = GicC { 1451 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1452 length: 80, 1453 reserved0: 0, 1454 cpu_interface_number: cpu as u32, 1455 uid: cpu as u32, 1456 flags: 1, 1457 parking_version: 0, 1458 performance_interrupt: 0, 1459 parked_address: 0, 1460 base_address: 0, 1461 gicv_base_address: 0, 1462 gich_base_address: 0, 1463 vgic_interrupt: 0, 1464 gicr_base_address: 0, 1465 mpidr: mpidr & mpidr_mask, 1466 proc_power_effi_class: 0, 1467 reserved1: 0, 1468 spe_overflow_interrupt: 0, 1469 }; 1470 1471 madt.append(gicc); 1472 } 1473 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1474 1475 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1476 let gicd = GicD { 1477 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1478 length: 24, 1479 reserved0: 0, 1480 gic_id: 0, 1481 base_address: vgic_config.dist_addr, 1482 global_irq_base: 0, 1483 version: 3, 1484 reserved1: [0; 3], 1485 }; 1486 madt.append(gicd); 1487 1488 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1489 let gicr = GicR { 1490 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1491 length: 16, 1492 reserved: 0, 1493 base_address: vgic_config.redists_addr, 1494 range_length: vgic_config.redists_size as u32, 1495 }; 1496 madt.append(gicr); 1497 1498 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 1499 let gicits = GicIts { 1500 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1501 length: 20, 1502 reserved0: 0, 1503 translation_id: 0, 1504 base_address: vgic_config.msi_addr, 1505 reserved1: 0, 1506 }; 1507 madt.append(gicits); 1508 1509 madt.update_checksum(); 1510 } 1511 1512 madt 1513 } 1514 1515 #[cfg(target_arch = "aarch64")] 1516 pub fn create_pptt(&self) -> Sdt { 1517 let pptt_start = 0; 1518 let mut cpus = 0; 1519 let mut uid = 0; 1520 // If topology is not specified, the default setting is: 1521 // 1 package, multiple cores, 1 thread per core 1522 // This is also the behavior when PPTT is missing. 
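        // For example (illustrative values only), `max_vcpus = 8` with no
        // topology specified yields (1, 8, 1): one package of eight
        // single-threaded cores.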
1523 let (threads_per_core, cores_per_package, packages) = 1524 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1525 1526 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1527 1528 for cluster_idx in 0..packages { 1529 if cpus < self.config.boot_vcpus as usize { 1530 let cluster_offset = pptt.len() - pptt_start; 1531 let cluster_hierarchy_node = ProcessorHierarchyNode { 1532 r#type: 0, 1533 length: 20, 1534 reserved: 0, 1535 flags: 0x2, 1536 parent: 0, 1537 acpi_processor_id: cluster_idx as u32, 1538 num_private_resources: 0, 1539 }; 1540 pptt.append(cluster_hierarchy_node); 1541 1542 for core_idx in 0..cores_per_package { 1543 let core_offset = pptt.len() - pptt_start; 1544 1545 if threads_per_core > 1 { 1546 let core_hierarchy_node = ProcessorHierarchyNode { 1547 r#type: 0, 1548 length: 20, 1549 reserved: 0, 1550 flags: 0x2, 1551 parent: cluster_offset as u32, 1552 acpi_processor_id: core_idx as u32, 1553 num_private_resources: 0, 1554 }; 1555 pptt.append(core_hierarchy_node); 1556 1557 for _thread_idx in 0..threads_per_core { 1558 let thread_hierarchy_node = ProcessorHierarchyNode { 1559 r#type: 0, 1560 length: 20, 1561 reserved: 0, 1562 flags: 0xE, 1563 parent: core_offset as u32, 1564 acpi_processor_id: uid as u32, 1565 num_private_resources: 0, 1566 }; 1567 pptt.append(thread_hierarchy_node); 1568 uid += 1; 1569 } 1570 } else { 1571 let thread_hierarchy_node = ProcessorHierarchyNode { 1572 r#type: 0, 1573 length: 20, 1574 reserved: 0, 1575 flags: 0xA, 1576 parent: cluster_offset as u32, 1577 acpi_processor_id: uid as u32, 1578 num_private_resources: 0, 1579 }; 1580 pptt.append(thread_hierarchy_node); 1581 uid += 1; 1582 } 1583 } 1584 cpus += (cores_per_package * threads_per_core) as usize; 1585 } 1586 } 1587 1588 pptt.update_checksum(); 1589 pptt 1590 } 1591 1592 #[cfg(feature = "guest_debug")] 1593 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1594 self.vcpus[usize::from(cpu_id)] 1595 .lock() 1596 .unwrap() 1597 .vcpu 1598 .get_regs() 1599 .map_err(Error::CpuDebug) 1600 } 1601 1602 #[cfg(feature = "guest_debug")] 1603 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1604 self.vcpus[usize::from(cpu_id)] 1605 .lock() 1606 .unwrap() 1607 .vcpu 1608 .set_regs(regs) 1609 .map_err(Error::CpuDebug) 1610 } 1611 1612 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1613 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1614 self.vcpus[usize::from(cpu_id)] 1615 .lock() 1616 .unwrap() 1617 .vcpu 1618 .get_sregs() 1619 .map_err(Error::CpuDebug) 1620 } 1621 1622 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1623 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1624 self.vcpus[usize::from(cpu_id)] 1625 .lock() 1626 .unwrap() 1627 .vcpu 1628 .set_sregs(sregs) 1629 .map_err(Error::CpuDebug) 1630 } 1631 1632 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1633 fn translate_gva( 1634 &self, 1635 _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1636 cpu_id: u8, 1637 gva: u64, 1638 ) -> Result<u64> { 1639 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1640 .lock() 1641 .unwrap() 1642 .vcpu 1643 .translate_gva(gva, /* flags: unused */ 0) 1644 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1645 Ok(gpa) 1646 } 1647 1648 /// 1649 /// On AArch64, `translate_gva` API is not provided by KVM. We implemented 1650 /// it in VMM by walking through translation tables. 
    ///
    /// Address translation is a big topic; here we only focus on the scenario
    /// that happens in the VMM while debugging the kernel. This `translate_gva`
    /// implementation is restricted to:
    /// - Exception Level 1
    /// - Translate high address range only (kernel space)
    ///
    /// This implementation supports the following Armv8-A features related to
    /// address translation:
    /// - FEAT_LPA
    /// - FEAT_LVA
    /// - FEAT_LPA2
    ///
    #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
    fn translate_gva(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: u8,
        gva: u64,
    ) -> Result<u64> {
        let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TCR_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TTBR1_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::ID_AA64MMFR0_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;

        // Bit 55 of the VA determines the range, high (0xFFFxxx...)
        // or low (0x000xxx...).
        let high_range = extract_bits_64!(gva, 55, 1);
        if high_range == 0 {
            info!("VA (0x{:x}) range is not supported!", gva);
            return Ok(gva);
        }

        // High range size offset
        let tsz = extract_bits_64!(tcr_el1, 16, 6);
        // Granule size
        let tg = extract_bits_64!(tcr_el1, 30, 2);
        // Indication of 48 bits (0) or 52 bits (1) for FEAT_LPA2
        let ds = extract_bits_64!(tcr_el1, 59, 1);

        if tsz == 0 {
            info!("VA translation is not ready!");
            return Ok(gva);
        }

        // VA size is determined by TCR_EL1.T1SZ
        let va_size = 64 - tsz;
        // Number of bits in VA consumed in each level of translation
        let stride = match tg {
            3 => 13, // 64KB granule size
            1 => 11, // 16KB granule size
            _ => 9,  // 4KB, default
        };
        // Starting level of walking
        let mut level = 4 - (va_size - 4) / stride;

        // Determine the PA or IPA size
        let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
        let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
        // The IPA size in TCR_EL1 and the PA Range in ID_AA64MMFR0_EL1 should match.
        // To be safe, we use the minimum value if they are different.
        let pa_range = std::cmp::min(tcr_ips, pa_range);
        // PA size in bits
        let pa_size = match pa_range {
            0 => 32,
            1 => 36,
            2 => 40,
            3 => 42,
            4 => 44,
            5 => 48,
            6 => 52,
            _ => {
                return Err(Error::TranslateVirtualAddress(anyhow!(format!(
                    "PA range not supported {pa_range}"
                ))))
            }
        };

        let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
        let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
        // If FEAT_LPA2 is present, the translation table descriptor holds
        // 50 bits of the table address of the next level.
        // Otherwise, it is 48 bits.
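        // Concretely: 0x0003_FFFF_FFFF_FFFF (50 bits) vs 0x0000_FFFF_FFFF_FFFF
        // (48 bits), before the grain-size index bits are cleared below.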
1748 let descaddrmask = if ds == 1 { 1749 !0u64 >> (64 - 50) // mask with 50 least significant bits 1750 } else { 1751 !0u64 >> (64 - 48) // mask with 48 least significant bits 1752 }; 1753 let descaddrmask = descaddrmask & !indexmask_grainsize; 1754 1755 // Translation table base address 1756 let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48); 1757 // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table 1758 // address bits [48:51] comes from TTBR1_EL1 bits [2:5]. 1759 if pa_size == 52 { 1760 descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48; 1761 } 1762 1763 // Loop through tables of each level 1764 loop { 1765 // Table offset for current level 1766 let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask; 1767 descaddr |= table_offset; 1768 descaddr &= !7u64; 1769 1770 let mut buf = [0; 8]; 1771 guest_memory 1772 .memory() 1773 .read(&mut buf, GuestAddress(descaddr)) 1774 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1775 let descriptor = u64::from_le_bytes(buf); 1776 1777 descaddr = descriptor & descaddrmask; 1778 // In the case of FEAT_LPA, the next-level translation table address 1779 // bits [48:51] comes from bits [12:15] of the current descriptor. 1780 // For FEAT_LPA2, the next-level translation table address 1781 // bits [50:51] comes from bits [8:9] of the current descriptor, 1782 // bits [48:49] comes from bits [48:49] of the descriptor which was 1783 // handled previously. 1784 if pa_size == 52 { 1785 if ds == 1 { 1786 // FEAT_LPA2 1787 descaddr |= extract_bits_64!(descriptor, 8, 2) << 50; 1788 } else { 1789 // FEAT_LPA 1790 descaddr |= extract_bits_64!(descriptor, 12, 4) << 48; 1791 } 1792 } 1793 1794 if (descriptor & 2) != 0 && (level < 3) { 1795 // This is a table entry. Go down to next level. 
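                // Levels below the starting one always index with the full
                // grain-size mask; only the first lookup uses the (possibly
                // shorter) top-level index mask computed above.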
                level += 1;
                indexmask = indexmask_grainsize;
                continue;
            }

            break;
        }

        // We have reached either:
        // - a page entry at level 3 or
        // - a block entry at level 1 or 2
        let page_size = 1u64 << ((stride * (4 - level)) + 3);
        descaddr &= !(page_size - 1);
        descaddr |= gva & (page_size - 1);

        Ok(descaddr)
    }

    pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
        self.acpi_address = Some(acpi_address);
    }

    pub(crate) fn set_interrupt_controller(
        &mut self,
        interrupt_controller: Arc<Mutex<dyn InterruptController>>,
    ) {
        self.interrupt_controller = Some(interrupt_controller);
    }

    pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
        &self.vcpus_kill_signalled
    }

    #[cfg(feature = "igvm")]
    pub(crate) fn get_cpuid_leaf(
        &self,
        cpu_id: u8,
        eax: u32,
        ecx: u32,
        xfem: u64,
        xss: u64,
    ) -> Result<[u32; 4]> {
        let leaf_info = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_cpuid_values(eax, ecx, xfem, xss)
            .unwrap();
        Ok(leaf_info)
    }

    #[cfg(feature = "sev_snp")]
    pub(crate) fn sev_snp_enabled(&self) -> bool {
        self.sev_snp_enabled
    }
}

struct Cpu {
    cpu_id: u8,
    proximity_domain: u32,
    dynamic: bool,
    #[cfg(target_arch = "x86_64")]
    topology: Option<(u8, u8, u8)>,
}

#[cfg(target_arch = "x86_64")]
const MADT_CPU_ENABLE_FLAG: usize = 0;

#[cfg(target_arch = "x86_64")]
const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;

impl Cpu {
    #[cfg(target_arch = "x86_64")]
    fn generate_mat(&self) -> Vec<u8> {
        let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);

        let lapic = LocalX2Apic {
            r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
            length: 16,
            processor_id: self.cpu_id.into(),
            apic_id: x2apic_id,
            flags: 1 << MADT_CPU_ENABLE_FLAG,
            _reserved: 0,
        };

        let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
        // SAFETY: mat_data is large enough to hold lapic
        unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };

        mat_data
    }
}

impl Aml for Cpu {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        #[cfg(target_arch = "x86_64")]
        let mat_data: Vec<u8> = self.generate_mat();
        #[allow(clippy::if_same_then_else)]
        if self.dynamic {
            aml::Device::new(
                format!("C{:03X}", self.cpu_id).as_str().into(),
                vec![
                    &aml::Name::new("_HID".into(), &"ACPI0007"),
                    &aml::Name::new("_UID".into(), &self.cpu_id),
                    // Currently, AArch64 cannot support the following fields.
                    /*
                    _STA return value:
                    Bit [0] – Set if the device is present.
                    Bit [1] – Set if the device is enabled and decoding its resources.
                    Bit [2] – Set if the device should be shown in the UI.
                    Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
                    Bit [4] – Set if the battery is present.
                    Bits [31:5] – Reserved (must be cleared).
                    */
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_STA".into(),
                        0,
                        false,
                        // Call into CSTA method which will interrogate device
                        vec![&aml::Return::new(&aml::MethodCall::new(
                            "CSTA".into(),
                            vec![&self.cpu_id],
                        ))],
                    ),
                    &aml::Method::new(
                        "_PXM".into(),
                        0,
                        false,
                        vec![&aml::Return::new(&self.proximity_domain)],
                    ),
                    // The Linux kernel expects every CPU device to have a _MAT entry
                    // containing the LAPIC for this processor with the enabled bit set
                    // even if it is disabled in the MADT (non-boot CPU)
                    #[cfg(target_arch = "x86_64")]
                    &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
                    // Trigger CPU ejection
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_EJ0".into(),
                        1,
                        false,
                        // Call into CEJ0 method which will actually eject device
                        vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
                    ),
                ],
            )
            .to_aml_bytes(sink);
        } else {
            aml::Device::new(
                format!("C{:03X}", self.cpu_id).as_str().into(),
                vec![
                    &aml::Name::new("_HID".into(), &"ACPI0007"),
                    &aml::Name::new("_UID".into(), &self.cpu_id),
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_STA".into(),
                        0,
                        false,
                        // Mark CPU present; see the CSTA implementation
                        vec![&aml::Return::new(&0xfu8)],
                    ),
                    &aml::Method::new(
                        "_PXM".into(),
                        0,
                        false,
                        vec![&aml::Return::new(&self.proximity_domain)],
                    ),
                    // The Linux kernel expects every CPU device to have a _MAT entry
                    // containing the LAPIC for this processor with the enabled bit set
                    // even if it is disabled in the MADT (non-boot CPU)
                    #[cfg(target_arch = "x86_64")]
                    &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
                ],
            )
            .to_aml_bytes(sink);
        }
    }
}

struct CpuNotify {
    cpu_id: u8,
}

impl Aml for CpuNotify {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .to_aml_bytes(sink)
    }
}

struct CpuMethods {
    max_vcpus: u8,
    dynamic: bool,
}

impl Aml for CpuMethods {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        if self.dynamic {
            // CPU status method
            aml::Method::new(
                "CSTA".into(),
                1,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                    &aml::If::new(
                        &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
                        vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                    // Return 0 or 0xf
                    &aml::Return::new(&aml::Local(0)),
                ],
            )
            .to_aml_bytes(sink);

            let mut cpu_notifies = Vec::new();
            for cpu_id in 0..self.max_vcpus {
                cpu_notifies.push(CpuNotify { cpu_id });
            }

            let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
            for cpu_id in
0..self.max_vcpus { 2030 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 2031 } 2032 2033 aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink); 2034 2035 aml::Method::new( 2036 "CEJ0".into(), 2037 1, 2038 true, 2039 vec![ 2040 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2041 // Write CPU number (in first argument) to I/O port via field 2042 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 2043 // Set CEJ0 bit 2044 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 2045 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2046 ], 2047 ) 2048 .to_aml_bytes(sink); 2049 2050 aml::Method::new( 2051 "CSCN".into(), 2052 0, 2053 true, 2054 vec![ 2055 // Take lock defined above 2056 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2057 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2058 &aml::While::new( 2059 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 2060 vec![ 2061 // Write CPU number (in first argument) to I/O port via field 2062 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 2063 // Check if CINS bit is set 2064 &aml::If::new( 2065 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 2066 // Notify device if it is 2067 vec![ 2068 &aml::MethodCall::new( 2069 "CTFY".into(), 2070 vec![&aml::Local(0), &aml::ONE], 2071 ), 2072 // Reset CINS bit 2073 &aml::Store::new( 2074 &aml::Path::new("\\_SB_.PRES.CINS"), 2075 &aml::ONE, 2076 ), 2077 ], 2078 ), 2079 // Check if CRMV bit is set 2080 &aml::If::new( 2081 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 2082 // Notify device if it is (with the eject constant 0x3) 2083 vec![ 2084 &aml::MethodCall::new( 2085 "CTFY".into(), 2086 vec![&aml::Local(0), &3u8], 2087 ), 2088 // Reset CRMV bit 2089 &aml::Store::new( 2090 &aml::Path::new("\\_SB_.PRES.CRMV"), 2091 &aml::ONE, 2092 ), 2093 ], 2094 ), 2095 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 2096 ], 2097 ), 2098 // Release lock 2099 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2100 ], 2101 ) 2102 .to_aml_bytes(sink) 2103 } else { 2104 aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink) 2105 } 2106 } 2107 } 2108 2109 impl Aml for CpuManager { 2110 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2111 #[cfg(target_arch = "x86_64")] 2112 if let Some(acpi_address) = self.acpi_address { 2113 // CPU hotplug controller 2114 aml::Device::new( 2115 "_SB_.PRES".into(), 2116 vec![ 2117 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 2118 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 2119 // Mutex to protect concurrent access as we write to choose CPU and then read back status 2120 &aml::Mutex::new("CPLK".into(), 0), 2121 &aml::Name::new( 2122 "_CRS".into(), 2123 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2124 aml::AddressSpaceCacheable::NotCacheable, 2125 true, 2126 acpi_address.0, 2127 acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 2128 None, 2129 )]), 2130 ), 2131 // OpRegion and Fields map MMIO range into individual field values 2132 &aml::OpRegion::new( 2133 "PRST".into(), 2134 aml::OpRegionSpace::SystemMemory, 2135 &(acpi_address.0 as usize), 2136 &CPU_MANAGER_ACPI_SIZE, 2137 ), 2138 &aml::Field::new( 2139 "PRST".into(), 2140 aml::FieldAccessType::Byte, 2141 aml::FieldLockRule::NoLock, 2142 aml::FieldUpdateRule::WriteAsZeroes, 2143 vec![ 2144 aml::FieldEntry::Reserved(32), 2145 aml::FieldEntry::Named(*b"CPEN", 1), 2146 aml::FieldEntry::Named(*b"CINS", 1), 2147 
                        aml::FieldEntry::Named(*b"CRMV", 1),
                        aml::FieldEntry::Named(*b"CEJ0", 1),
                        aml::FieldEntry::Reserved(4),
                        aml::FieldEntry::Named(*b"CCMD", 8),
                    ],
                ),
                &aml::Field::new(
                    "PRST".into(),
                    aml::FieldAccessType::DWord,
                    aml::FieldLockRule::NoLock,
                    aml::FieldUpdateRule::Preserve,
                    vec![
                        aml::FieldEntry::Named(*b"CSEL", 32),
                        aml::FieldEntry::Reserved(32),
                        aml::FieldEntry::Named(*b"CDAT", 32),
                    ],
                ),
            ],
        )
        .to_aml_bytes(sink);
        }

        // CPU devices
        let hid = aml::Name::new("_HID".into(), &"ACPI0010");
        let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
        // Bundle methods together under a common object
        let methods = CpuMethods {
            max_vcpus: self.config.max_vcpus,
            dynamic: self.dynamic,
        };
        let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];

        #[cfg(target_arch = "x86_64")]
        let topology = self.get_vcpu_topology();
        let mut cpu_devices = Vec::new();
        for cpu_id in 0..self.config.max_vcpus {
            let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
            let cpu_device = Cpu {
                cpu_id,
                proximity_domain,
                dynamic: self.dynamic,
                #[cfg(target_arch = "x86_64")]
                topology,
            };

            cpu_devices.push(cpu_device);
        }

        for cpu_device in cpu_devices.iter() {
            cpu_data_inner.push(cpu_device);
        }

        aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
    }
}

impl Pausable for CpuManager {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        // Tell the vCPUs to pause themselves next time they exit
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        for vcpu in self.vcpus.iter() {
            let mut vcpu = vcpu.lock().unwrap();
            vcpu.pause()?;
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            if !self.config.kvm_hyperv {
                vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
                    MigratableError::Pause(anyhow!(
                        "Could not notify guest it has been paused {:?}",
                        e
                    ))
                })?;
            }
        }

        // Each vCPU thread changes its paused state before parking; wait here for every
        // activated vCPU to change its state, to ensure they have all parked.
        for state in self.vcpu_states.iter() {
            if state.active() {
                while !state.paused.load(Ordering::SeqCst) {
                    // To avoid a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }

        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        for vcpu in self.vcpus.iter() {
            vcpu.lock().unwrap().resume()?;
        }

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the vCPU threads.
        // Once unparked, the next thing they will do is check the pause
        // boolean. Since it will be set to false, they will exit their pause loop
        // and go back to vmx root.
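        // The per-thread `paused` flag is cleared before the thread is unparked so
        // that a woken vCPU does not observe a stale value and park itself again.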
2255 for state in self.vcpu_states.iter() { 2256 state.paused.store(false, Ordering::SeqCst); 2257 state.unpark_thread(); 2258 } 2259 Ok(()) 2260 } 2261 } 2262 2263 impl Snapshottable for CpuManager { 2264 fn id(&self) -> String { 2265 CPU_MANAGER_SNAPSHOT_ID.to_string() 2266 } 2267 2268 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2269 let mut cpu_manager_snapshot = Snapshot::default(); 2270 2271 // The CpuManager snapshot is a collection of all vCPUs snapshots. 2272 for vcpu in &self.vcpus { 2273 let mut vcpu = vcpu.lock().unwrap(); 2274 cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?); 2275 } 2276 2277 Ok(cpu_manager_snapshot) 2278 } 2279 } 2280 2281 impl Transportable for CpuManager {} 2282 impl Migratable for CpuManager {} 2283 2284 #[cfg(feature = "guest_debug")] 2285 impl Debuggable for CpuManager { 2286 #[cfg(feature = "kvm")] 2287 fn set_guest_debug( 2288 &self, 2289 cpu_id: usize, 2290 addrs: &[GuestAddress], 2291 singlestep: bool, 2292 ) -> std::result::Result<(), DebuggableError> { 2293 self.vcpus[cpu_id] 2294 .lock() 2295 .unwrap() 2296 .vcpu 2297 .set_guest_debug(addrs, singlestep) 2298 .map_err(DebuggableError::SetDebug) 2299 } 2300 2301 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2302 Ok(()) 2303 } 2304 2305 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2306 Ok(()) 2307 } 2308 2309 #[cfg(target_arch = "x86_64")] 2310 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2311 // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15 2312 let gregs = self 2313 .get_regs(cpu_id as u8) 2314 .map_err(DebuggableError::ReadRegs)?; 2315 let regs = [ 2316 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp, 2317 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15, 2318 ]; 2319 2320 // GDB exposes 32-bit eflags instead of 64-bit rflags. 
        // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
        let eflags = gregs.rflags as u32;
        let rip = gregs.rip;

        // Segment registers: CS, SS, DS, ES, FS, GS
        let sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let segments = X86SegmentRegs {
            cs: sregs.cs.selector as u32,
            ss: sregs.ss.selector as u32,
            ds: sregs.ds.selector as u32,
            es: sregs.es.selector as u32,
            fs: sregs.fs.selector as u32,
            gs: sregs.gs.selector as u32,
        };

        // TODO: Add other registers

        Ok(CoreRegs {
            regs,
            eflags,
            rip,
            segments,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "aarch64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        Ok(CoreRegs {
            x: gregs.regs.regs,
            sp: gregs.regs.sp,
            pc: gregs.regs.pc,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let orig_gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let gregs = StandardRegisters {
            rax: regs.regs[0],
            rbx: regs.regs[1],
            rcx: regs.regs[2],
            rdx: regs.regs[3],
            rsi: regs.regs[4],
            rdi: regs.regs[5],
            rbp: regs.regs[6],
            rsp: regs.regs[7],
            r8: regs.regs[8],
            r9: regs.regs[9],
            r10: regs.regs[10],
            r11: regs.regs[11],
            r12: regs.regs[12],
            r13: regs.regs[13],
            r14: regs.regs[14],
            r15: regs.regs[15],
            rip: regs.rip,
            // Update only the lower 32 bits of rflags.
            rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
        };

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // Since GDB cares only about the selectors, we call get_sregs() first.
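        // Only the selector fields are overwritten below; every other segment
        // attribute keeps the value just read back from get_sregs().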
2398 let mut sregs = self 2399 .get_sregs(cpu_id as u8) 2400 .map_err(DebuggableError::ReadRegs)?; 2401 sregs.cs.selector = regs.segments.cs as u16; 2402 sregs.ss.selector = regs.segments.ss as u16; 2403 sregs.ds.selector = regs.segments.ds as u16; 2404 sregs.es.selector = regs.segments.es as u16; 2405 sregs.fs.selector = regs.segments.fs as u16; 2406 sregs.gs.selector = regs.segments.gs as u16; 2407 2408 self.set_sregs(cpu_id as u8, &sregs) 2409 .map_err(DebuggableError::WriteRegs)?; 2410 2411 // TODO: Add other registers 2412 2413 Ok(()) 2414 } 2415 2416 #[cfg(target_arch = "aarch64")] 2417 fn write_regs( 2418 &self, 2419 cpu_id: usize, 2420 regs: &CoreRegs, 2421 ) -> std::result::Result<(), DebuggableError> { 2422 let mut gregs = self 2423 .get_regs(cpu_id as u8) 2424 .map_err(DebuggableError::ReadRegs)?; 2425 2426 gregs.regs.regs = regs.x; 2427 gregs.regs.sp = regs.sp; 2428 gregs.regs.pc = regs.pc; 2429 2430 self.set_regs(cpu_id as u8, &gregs) 2431 .map_err(DebuggableError::WriteRegs)?; 2432 2433 Ok(()) 2434 } 2435 2436 fn read_mem( 2437 &self, 2438 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2439 cpu_id: usize, 2440 vaddr: GuestAddress, 2441 len: usize, 2442 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2443 let mut buf = vec![0; len]; 2444 let mut total_read = 0_u64; 2445 2446 while total_read < len as u64 { 2447 let gaddr = vaddr.0 + total_read; 2448 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2449 Ok(paddr) => paddr, 2450 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 2451 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2452 }; 2453 let psize = arch::PAGE_SIZE as u64; 2454 let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1))); 2455 guest_memory 2456 .memory() 2457 .read( 2458 &mut buf[total_read as usize..total_read as usize + read_len as usize], 2459 GuestAddress(paddr), 2460 ) 2461 .map_err(DebuggableError::ReadMem)?; 2462 total_read += read_len; 2463 } 2464 Ok(buf) 2465 } 2466 2467 fn write_mem( 2468 &self, 2469 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2470 cpu_id: usize, 2471 vaddr: &GuestAddress, 2472 data: &[u8], 2473 ) -> std::result::Result<(), DebuggableError> { 2474 let mut total_written = 0_u64; 2475 2476 while total_written < data.len() as u64 { 2477 let gaddr = vaddr.0 + total_written; 2478 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2479 Ok(paddr) => paddr, 2480 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let write_len = std::cmp::min(
                data.len() as u64 - total_written,
                psize - (paddr & (psize - 1)),
            );
            guest_memory
                .memory()
                .write(
                    &data[total_written as usize..total_written as usize + write_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::WriteMem)?;
            total_written += write_len;
        }
        Ok(())
    }

    fn active_vcpus(&self) -> usize {
        self.present_vcpus() as usize
    }
}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for CpuManager {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl CpuElf64Writable for CpuManager {
    fn cpu_write_elf64_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Elf, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<X86_64ElfPrStatus>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: NT_PRSTATUS,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
            buf.resize(pos + 4, 0);
            buf.splice(pos.., "CORE".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
            buf.resize(pos + 32 + 4, 0);
            let pid = vcpu_id as u64;
            let bytes: &[u8] = pid.as_slice();
            buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */

            pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();

            let orig_rax: u64 = 0;
            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
                gregs.r10,
            ];
            let regs2 = [
                gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            debug!(
                "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
                gregs.rip,
                gregs.rsp,
                sregs.gs.base,
                sregs.cs.selector,
                sregs.ss.selector,
                sregs.ds.selector,
            );

            let regs = X86_64UserRegs {
                regs1,
                regs2,
                rip: gregs.rip,
                cs: sregs.cs.selector as u64,
                eflags: gregs.rflags,
                rsp: gregs.rsp,
                ss: sregs.ss.selector as u64,
                fs_base: sregs.fs.base,
                gs_base: sregs.gs.base,
                ds: sregs.ds.selector as u64,
                es: sregs.es.selector as u64,
                fs: sregs.fs.selector as u64,
                gs: sregs.gs.selector as u64,
            };

            // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) };
            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

Ok(()) 2603 } 2604 2605 fn cpu_write_vmm_note( 2606 &mut self, 2607 dump_state: &DumpState, 2608 ) -> std::result::Result<(), GuestDebuggableError> { 2609 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2610 for vcpu in &self.vcpus { 2611 let note_size = self.get_note_size(NoteDescType::Vmm, 1); 2612 let mut pos: usize = 0; 2613 let mut buf = vec![0; note_size as usize]; 2614 let descsz = size_of::<DumpCpusState>(); 2615 let vcpu_id = vcpu.lock().unwrap().id; 2616 2617 let note = Elf64_Nhdr { 2618 n_namesz: COREDUMP_NAME_SIZE, 2619 n_descsz: descsz as u32, 2620 n_type: 0, 2621 }; 2622 2623 let bytes: &[u8] = note.as_slice(); 2624 buf.splice(0.., bytes.to_vec()); 2625 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2626 2627 buf.resize(pos + 4, 0); 2628 buf.splice(pos.., "QEMU".to_string().into_bytes()); 2629 2630 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2631 2632 let gregs = self.vcpus[usize::from(vcpu_id)] 2633 .lock() 2634 .unwrap() 2635 .vcpu 2636 .get_regs() 2637 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2638 2639 let regs1 = [ 2640 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp, 2641 gregs.rbp, 2642 ]; 2643 2644 let regs2 = [ 2645 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, 2646 gregs.r15, 2647 ]; 2648 2649 let sregs = self.vcpus[usize::from(vcpu_id)] 2650 .lock() 2651 .unwrap() 2652 .vcpu 2653 .get_sregs() 2654 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2655 2656 let mut msrs = vec![MsrEntry { 2657 index: msr_index::MSR_KERNEL_GS_BASE, 2658 ..Default::default() 2659 }]; 2660 2661 self.vcpus[vcpu_id as usize] 2662 .lock() 2663 .unwrap() 2664 .vcpu 2665 .get_msrs(&mut msrs) 2666 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?; 2667 let kernel_gs_base = msrs[0].data; 2668 2669 let cs = CpuSegment::new(sregs.cs); 2670 let ds = CpuSegment::new(sregs.ds); 2671 let es = CpuSegment::new(sregs.es); 2672 let fs = CpuSegment::new(sregs.fs); 2673 let gs = CpuSegment::new(sregs.gs); 2674 let ss = CpuSegment::new(sregs.ss); 2675 let ldt = CpuSegment::new(sregs.ldt); 2676 let tr = CpuSegment::new(sregs.tr); 2677 let gdt = CpuSegment::new_from_table(sregs.gdt); 2678 let idt = CpuSegment::new_from_table(sregs.idt); 2679 let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4]; 2680 let regs = DumpCpusState { 2681 version: 1, 2682 size: size_of::<DumpCpusState>() as u32, 2683 regs1, 2684 regs2, 2685 rip: gregs.rip, 2686 rflags: gregs.rflags, 2687 cs, 2688 ds, 2689 es, 2690 fs, 2691 gs, 2692 ss, 2693 ldt, 2694 tr, 2695 gdt, 2696 idt, 2697 cr, 2698 kernel_gs_base, 2699 }; 2700 2701 let bytes: &[u8] = regs.as_slice(); 2702 buf.resize(note_size as usize, 0); 2703 buf.splice(pos.., bytes.to_vec()); 2704 buf.resize(note_size as usize, 0); 2705 2706 coredump_file 2707 .write(&buf) 2708 .map_err(GuestDebuggableError::CoredumpFile)?; 2709 } 2710 2711 Ok(()) 2712 } 2713 } 2714 2715 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2716 #[cfg(test)] 2717 mod tests { 2718 use arch::layout::BOOT_STACK_POINTER; 2719 use arch::layout::ZERO_PAGE_START; 2720 use arch::x86_64::interrupts::*; 2721 use arch::x86_64::regs::*; 2722 use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters}; 2723 use linux_loader::loader::bootparam::setup_header; 2724 2725 #[test] 2726 fn test_setlint() { 2727 let hv = hypervisor::new().unwrap(); 2728 let vm = hv.create_vm().expect("new VM fd creation failed"); 2729 
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail if there is no irqchip created beforehand.
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
        let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
        let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should
        // remove it at all.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }];

        // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read 1
        // in this test case scenario.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we did setup_msrs. We need to assert that the
        // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
        // expect.
2797 let entry_vec = vcpu.boot_msr_entries(); 2798 assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]); 2799 } 2800 2801 #[test] 2802 fn test_setup_regs_for_pvh() { 2803 let hv = hypervisor::new().unwrap(); 2804 let vm = hv.create_vm().expect("new VM fd creation failed"); 2805 let vcpu = vm.create_vcpu(0, None).unwrap(); 2806 2807 let expected_regs: StandardRegisters = StandardRegisters { 2808 rflags: 0x0000000000000002u64, 2809 rbx: arch::layout::PVH_INFO_START.0, 2810 rip: 1, 2811 ..Default::default() 2812 }; 2813 2814 setup_regs( 2815 &vcpu, 2816 arch::EntryPoint { 2817 entry_addr: vm_memory::GuestAddress(expected_regs.rip), 2818 setup_header: None, 2819 }, 2820 ) 2821 .unwrap(); 2822 2823 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2824 assert_eq!(actual_regs, expected_regs); 2825 } 2826 2827 #[test] 2828 fn test_setup_regs_for_bzimage() { 2829 let hv = hypervisor::new().unwrap(); 2830 let vm = hv.create_vm().expect("new VM fd creation failed"); 2831 let vcpu = vm.create_vcpu(0, None).unwrap(); 2832 2833 let expected_regs: StandardRegisters = StandardRegisters { 2834 rflags: 0x0000000000000002u64, 2835 rip: 1, 2836 rsp: BOOT_STACK_POINTER.0, 2837 rsi: ZERO_PAGE_START.0, 2838 ..Default::default() 2839 }; 2840 2841 setup_regs( 2842 &vcpu, 2843 arch::EntryPoint { 2844 entry_addr: vm_memory::GuestAddress(expected_regs.rip), 2845 setup_header: Some(setup_header { 2846 ..Default::default() 2847 }), 2848 }, 2849 ) 2850 .unwrap(); 2851 2852 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2853 assert_eq!(actual_regs, expected_regs); 2854 } 2855 } 2856 2857 #[cfg(target_arch = "aarch64")] 2858 #[cfg(test)] 2859 mod tests { 2860 use arch::{aarch64::regs, layout}; 2861 use hypervisor::kvm::aarch64::is_system_register; 2862 use hypervisor::kvm::kvm_bindings::{ 2863 kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, 2864 KVM_REG_ARM_CORE, KVM_REG_SIZE_U64, 2865 }; 2866 use hypervisor::{arm64_core_reg_id, offset_of}; 2867 use std::mem; 2868 2869 #[test] 2870 fn test_setup_regs() { 2871 let hv = hypervisor::new().unwrap(); 2872 let vm = hv.create_vm().unwrap(); 2873 let vcpu = vm.create_vcpu(0, None).unwrap(); 2874 2875 let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0); 2876 // Must fail when vcpu is not initialized yet. 2877 assert!(res.is_err()); 2878 2879 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2880 vm.get_preferred_target(&mut kvi).unwrap(); 2881 vcpu.vcpu_init(&kvi).unwrap(); 2882 2883 assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok()); 2884 } 2885 2886 #[test] 2887 fn test_read_mpidr() { 2888 let hv = hypervisor::new().unwrap(); 2889 let vm = hv.create_vm().unwrap(); 2890 let vcpu = vm.create_vcpu(0, None).unwrap(); 2891 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2892 vm.get_preferred_target(&mut kvi).unwrap(); 2893 2894 // Must fail when vcpu is not initialized yet. 
2895 assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err()); 2896 2897 vcpu.vcpu_init(&kvi).unwrap(); 2898 assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000); 2899 } 2900 2901 #[test] 2902 fn test_is_system_register() { 2903 let offset = offset_of!(user_pt_regs, pc); 2904 let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset); 2905 assert!(!is_system_register(regid)); 2906 let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64; 2907 assert!(is_system_register(regid)); 2908 } 2909 2910 #[test] 2911 fn test_save_restore_core_regs() { 2912 let hv = hypervisor::new().unwrap(); 2913 let vm = hv.create_vm().unwrap(); 2914 let vcpu = vm.create_vcpu(0, None).unwrap(); 2915 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2916 vm.get_preferred_target(&mut kvi).unwrap(); 2917 2918 // Must fail when vcpu is not initialized yet. 2919 let res = vcpu.get_regs(); 2920 assert!(res.is_err()); 2921 assert_eq!( 2922 format!("{}", res.unwrap_err()), 2923 "Failed to get core register: Exec format error (os error 8)" 2924 ); 2925 2926 let mut state = kvm_regs::default(); 2927 let res = vcpu.set_regs(&state); 2928 assert!(res.is_err()); 2929 assert_eq!( 2930 format!("{}", res.unwrap_err()), 2931 "Failed to set core register: Exec format error (os error 8)" 2932 ); 2933 2934 vcpu.vcpu_init(&kvi).unwrap(); 2935 let res = vcpu.get_regs(); 2936 assert!(res.is_ok()); 2937 state = res.unwrap(); 2938 assert_eq!(state.regs.pstate, 0x3C5); 2939 2940 assert!(vcpu.set_regs(&state).is_ok()); 2941 } 2942 2943 #[test] 2944 fn test_get_set_mpstate() { 2945 let hv = hypervisor::new().unwrap(); 2946 let vm = hv.create_vm().unwrap(); 2947 let vcpu = vm.create_vcpu(0, None).unwrap(); 2948 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2949 vm.get_preferred_target(&mut kvi).unwrap(); 2950 2951 let res = vcpu.get_mp_state(); 2952 assert!(res.is_ok()); 2953 assert!(vcpu.set_mp_state(res.unwrap()).is_ok()); 2954 } 2955 } 2956
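
// A minimal illustrative test (added as a sketch, not part of the original test
// suite) exercising the `extract_bits_64!` and `extract_bits_64_without_offset!`
// helpers used by `translate_gva`; it is gated the same way as the macros so it
// only builds where they are defined.
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
#[cfg(test)]
mod bit_extract_tests {
    #[test]
    fn test_extract_bits_64() {
        // Taking 3 bits starting at bit 4 of 0b1101_0000 keeps 0b101.
        assert_eq!(extract_bits_64!(0b1101_0000u64, 4, 3), 0b101);
        // Masking 0b1111_0101 down to its 3 least significant bits keeps 0b101.
        assert_eq!(extract_bits_64_without_offset!(0b1111_0101u64, 3), 0b101);
    }
}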