1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::CpusConfig; 15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 16 use crate::coredump::{ 17 CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable, 18 GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE, 19 NT_PRSTATUS, 20 }; 21 #[cfg(feature = "guest_debug")] 22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError}; 23 #[cfg(target_arch = "x86_64")] 24 use crate::memory_manager::MemoryManager; 25 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 26 #[cfg(target_arch = "x86_64")] 27 use crate::vm::physical_bits; 28 use crate::GuestMemoryMmap; 29 use crate::CPU_MANAGER_SNAPSHOT_ID; 30 use acpi_tables::{aml, sdt::Sdt, Aml}; 31 use anyhow::anyhow; 32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 33 use arch::aarch64::regs; 34 use arch::EntryPoint; 35 use arch::NumaNodes; 36 #[cfg(target_arch = "aarch64")] 37 use devices::gic::Gic; 38 use devices::interrupt_controller::InterruptController; 39 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 40 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 41 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 42 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs}; 43 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 44 use hypervisor::aarch64::StandardRegisters; 45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 46 use hypervisor::arch::x86::msr_index; 47 #[cfg(target_arch = "x86_64")] 48 use hypervisor::arch::x86::CpuIdEntry; 49 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 50 use hypervisor::arch::x86::MsrEntry; 51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 52 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters}; 53 #[cfg(target_arch = "aarch64")] 54 use hypervisor::kvm::kvm_bindings; 55 #[cfg(all(target_arch = "aarch64", feature = "kvm"))] 56 use hypervisor::kvm::kvm_ioctls::Cap; 57 #[cfg(feature = "tdx")] 58 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus}; 59 use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps}; 60 use libc::{c_void, siginfo_t}; 61 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 62 use linux_loader::elf::Elf64_Nhdr; 63 use seccompiler::{apply_filter, SeccompAction}; 64 use std::collections::BTreeMap; 65 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 66 use std::io::Write; 67 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 68 use std::mem::size_of; 69 use std::os::unix::thread::JoinHandleExt; 70 use std::sync::atomic::{AtomicBool, Ordering}; 71 use std::sync::{Arc, Barrier, Mutex}; 72 use std::{cmp, io, result, thread}; 73 use thiserror::Error; 74 use tracer::trace_scoped; 75 use vm_device::BusDevice; 76 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 77 use vm_memory::ByteValued; 78 #[cfg(feature = "guest_debug")] 79 use vm_memory::{Bytes, GuestAddressSpace}; 80 use vm_memory::{GuestAddress, GuestMemoryAtomic}; 81 use vm_migration::{ 82 snapshot_from_id, 
Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
use zerocopy::AsBytes;

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
macro_rules! extract_bits_64_without_offset {
    ($value: tt, $length: tt) => {
        $value & (!0u64 >> (64 - $length))
    };
}

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[error("A vCPU removal is still pending")]
    VcpuPendingRemovedVcpu,

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),

    #[error("Maximum number of vCPUs exceeds host limit")]
    MaximumVcpusExceeded,
}
pub type Result<T> = result::Result<T, Error>;

#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
191 struct LocalX2Apic { 192 pub r#type: u8, 193 pub length: u8, 194 pub _reserved: u16, 195 pub apic_id: u32, 196 pub flags: u32, 197 pub processor_id: u32, 198 } 199 200 #[allow(dead_code)] 201 #[repr(packed)] 202 #[derive(Default, AsBytes)] 203 struct Ioapic { 204 pub r#type: u8, 205 pub length: u8, 206 pub ioapic_id: u8, 207 _reserved: u8, 208 pub apic_address: u32, 209 pub gsi_base: u32, 210 } 211 212 #[cfg(target_arch = "aarch64")] 213 #[allow(dead_code)] 214 #[repr(packed)] 215 #[derive(AsBytes)] 216 struct GicC { 217 pub r#type: u8, 218 pub length: u8, 219 pub reserved0: u16, 220 pub cpu_interface_number: u32, 221 pub uid: u32, 222 pub flags: u32, 223 pub parking_version: u32, 224 pub performance_interrupt: u32, 225 pub parked_address: u64, 226 pub base_address: u64, 227 pub gicv_base_address: u64, 228 pub gich_base_address: u64, 229 pub vgic_interrupt: u32, 230 pub gicr_base_address: u64, 231 pub mpidr: u64, 232 pub proc_power_effi_class: u8, 233 pub reserved1: u8, 234 pub spe_overflow_interrupt: u16, 235 } 236 237 #[cfg(target_arch = "aarch64")] 238 #[allow(dead_code)] 239 #[repr(packed)] 240 #[derive(AsBytes)] 241 struct GicD { 242 pub r#type: u8, 243 pub length: u8, 244 pub reserved0: u16, 245 pub gic_id: u32, 246 pub base_address: u64, 247 pub global_irq_base: u32, 248 pub version: u8, 249 pub reserved1: [u8; 3], 250 } 251 252 #[cfg(target_arch = "aarch64")] 253 #[allow(dead_code)] 254 #[repr(packed)] 255 #[derive(AsBytes)] 256 struct GicR { 257 pub r#type: u8, 258 pub length: u8, 259 pub reserved: u16, 260 pub base_address: u64, 261 pub range_length: u32, 262 } 263 264 #[cfg(target_arch = "aarch64")] 265 #[allow(dead_code)] 266 #[repr(packed)] 267 #[derive(AsBytes)] 268 struct GicIts { 269 pub r#type: u8, 270 pub length: u8, 271 pub reserved0: u16, 272 pub translation_id: u32, 273 pub base_address: u64, 274 pub reserved1: u32, 275 } 276 277 #[cfg(target_arch = "aarch64")] 278 #[allow(dead_code)] 279 #[repr(packed)] 280 #[derive(AsBytes)] 281 struct ProcessorHierarchyNode { 282 pub r#type: u8, 283 pub length: u8, 284 pub reserved: u16, 285 pub flags: u32, 286 pub parent: u32, 287 pub acpi_processor_id: u32, 288 pub num_private_resources: u32, 289 } 290 291 #[allow(dead_code)] 292 #[repr(packed)] 293 #[derive(Default, AsBytes)] 294 struct InterruptSourceOverride { 295 pub r#type: u8, 296 pub length: u8, 297 pub bus: u8, 298 pub source: u8, 299 pub gsi: u32, 300 pub flags: u16, 301 } 302 303 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 304 macro_rules! round_up { 305 ($n:expr,$d:expr) => { 306 (($n / ($d + 1)) + 1) * $d 307 }; 308 } 309 310 /// A wrapper around creating and using a kvm-based VCPU. 311 pub struct Vcpu { 312 // The hypervisor abstracted CPU. 313 vcpu: Arc<dyn hypervisor::Vcpu>, 314 id: u8, 315 #[cfg(target_arch = "aarch64")] 316 mpidr: u64, 317 saved_state: Option<CpuState>, 318 } 319 320 impl Vcpu { 321 /// Constructs a new VCPU for `vm`. 322 /// 323 /// # Arguments 324 /// 325 /// * `id` - Represents the CPU number between [0, max vcpus). 326 /// * `vm` - The virtual machine this vcpu will get attached to. 327 /// * `vm_ops` - Optional object for exit handling. 328 pub fn new( 329 id: u8, 330 vm: &Arc<dyn hypervisor::Vm>, 331 vm_ops: Option<Arc<dyn VmOps>>, 332 ) -> Result<Self> { 333 let vcpu = vm 334 .create_vcpu(id, vm_ops) 335 .map_err(|e| Error::VcpuCreate(e.into()))?; 336 // Initially the cpuid per vCPU is the one supported by this VM. 
        Ok(Vcpu {
            vcpu,
            id,
            #[cfg(target_arch = "aarch64")]
            mpidr: 0,
            saved_state: None,
        })
    }

    /// Configures a vCPU. Should be called once per vCPU, right after it has been created.
    ///
    /// # Arguments
    ///
    /// * `vm` - (aarch64) The virtual machine this vCPU belongs to, used for initialization.
    /// * `boot_setup` - Optional kernel entry point (with boot protocol) and the guest memory it lives in.
    /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
    /// * `kvm_hyperv` - (x86_64) Whether KVM Hyper-V emulation is enabled.
    pub fn configure(
        &mut self,
        #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
        boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
        #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
        #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
    ) -> Result<()> {
        #[cfg(target_arch = "aarch64")]
        {
            self.init(vm)?;
            self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
                .map_err(Error::VcpuConfiguration)?;
        }
        info!("Configuring vCPU: cpu_id = {}", self.id);
        #[cfg(target_arch = "x86_64")]
        arch::configure_vcpu(&self.vcpu, self.id, boot_setup, cpuid, kvm_hyperv)
            .map_err(Error::VcpuConfiguration)?;

        Ok(())
    }

    /// Gets the MPIDR register value.
    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidr(&self) -> u64 {
        self.mpidr
    }

    /// Gets the saved vCPU state.
    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_state(&self) -> Option<CpuState> {
        self.saved_state.clone()
    }

    /// Initializes an aarch64 specific vcpu for booting Linux.
    #[cfg(target_arch = "aarch64")]
    pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
        let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();

        // This reads back the kernel's preferred target type.
        vm.get_preferred_target(&mut kvi)
            .map_err(Error::VcpuArmPreferredTarget)?;
        // We already checked that the capability is supported.
        kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
        if vm
            .as_any()
            .downcast_ref::<hypervisor::kvm::KvmVm>()
            .unwrap()
            .check_extension(Cap::ArmPmuV3)
        {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
        }
        // Non-boot cpus are powered off initially.
        if self.id > 0 {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
        }
        self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
    }

    /// Runs the VCPU until it exits, returning the reason.
    ///
    /// Note that the state of the VCPU and associated VM must be set up first for this to do
    /// anything useful.
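    ///
    /// A minimal, illustrative sketch of how a caller might consume the result
    /// (hypothetical usage, not the actual vCPU thread loop defined later in
    /// this module):
    ///
    /// ```ignore
    /// match vcpu.run() {
    ///     Ok(VmExit::Reset) => { /* guest asked for a reboot */ }
    ///     Ok(VmExit::Shutdown) => { /* guest powered off */ }
    ///     Ok(_) => { /* nothing to do, re-enter the guest */ }
    ///     Err(e) => error!("vCPU run error: {e}"),
    /// }
    /// ```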
415 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 416 self.vcpu.run() 417 } 418 } 419 420 impl Pausable for Vcpu {} 421 impl Snapshottable for Vcpu { 422 fn id(&self) -> String { 423 self.id.to_string() 424 } 425 426 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 427 let saved_state = self 428 .vcpu 429 .state() 430 .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?; 431 432 self.saved_state = Some(saved_state.clone()); 433 434 Ok(Snapshot::from_data(SnapshotData::new_from_state( 435 &saved_state, 436 )?)) 437 } 438 } 439 440 pub struct CpuManager { 441 hypervisor_type: HypervisorType, 442 config: CpusConfig, 443 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 444 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 445 #[cfg(target_arch = "x86_64")] 446 cpuid: Vec<CpuIdEntry>, 447 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 448 vm: Arc<dyn hypervisor::Vm>, 449 vcpus_kill_signalled: Arc<AtomicBool>, 450 vcpus_pause_signalled: Arc<AtomicBool>, 451 exit_evt: EventFd, 452 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 453 reset_evt: EventFd, 454 #[cfg(feature = "guest_debug")] 455 vm_debug_evt: EventFd, 456 vcpu_states: Vec<VcpuState>, 457 selected_cpu: u8, 458 vcpus: Vec<Arc<Mutex<Vcpu>>>, 459 seccomp_action: SeccompAction, 460 vm_ops: Arc<dyn VmOps>, 461 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 462 acpi_address: Option<GuestAddress>, 463 proximity_domain_per_cpu: BTreeMap<u8, u32>, 464 affinity: BTreeMap<u8, Vec<u8>>, 465 dynamic: bool, 466 } 467 468 const CPU_ENABLE_FLAG: usize = 0; 469 const CPU_INSERTING_FLAG: usize = 1; 470 const CPU_REMOVING_FLAG: usize = 2; 471 const CPU_EJECT_FLAG: usize = 3; 472 473 const CPU_STATUS_OFFSET: u64 = 4; 474 const CPU_SELECTION_OFFSET: u64 = 0; 475 476 impl BusDevice for CpuManager { 477 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 478 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
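        // Zero it first, then only set the bits that apply.
        //
        // Register layout of this device (as defined by the offset and flag
        // constants above, and consumed by the ACPI CSTA/CSCN/CEJ0 methods
        // generated further down):
        //   offset 0 (CPU_SELECTION_OFFSET): id of the currently selected vCPU
        //   offset 4 (CPU_STATUS_OFFSET):    status bits for the selected vCPU
        //     (bit 0 enabled, bit 1 inserting, bit 2 removing, bit 3 eject)
        // For example, to check whether vCPU 2 is present the guest writes 2 to
        // offset 0, reads offset 4 and tests bit 0.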
479 data.fill(0); 480 481 match offset { 482 CPU_SELECTION_OFFSET => { 483 data[0] = self.selected_cpu; 484 } 485 CPU_STATUS_OFFSET => { 486 if self.selected_cpu < self.max_vcpus() { 487 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 488 if state.active() { 489 data[0] |= 1 << CPU_ENABLE_FLAG; 490 } 491 if state.inserting { 492 data[0] |= 1 << CPU_INSERTING_FLAG; 493 } 494 if state.removing { 495 data[0] |= 1 << CPU_REMOVING_FLAG; 496 } 497 } else { 498 warn!("Out of range vCPU id: {}", self.selected_cpu); 499 } 500 } 501 _ => { 502 warn!( 503 "Unexpected offset for accessing CPU manager device: {:#}", 504 offset 505 ); 506 } 507 } 508 } 509 510 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 511 match offset { 512 CPU_SELECTION_OFFSET => { 513 self.selected_cpu = data[0]; 514 } 515 CPU_STATUS_OFFSET => { 516 if self.selected_cpu < self.max_vcpus() { 517 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 518 // The ACPI code writes back a 1 to acknowledge the insertion 519 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 520 && state.inserting 521 { 522 state.inserting = false; 523 } 524 // Ditto for removal 525 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 526 && state.removing 527 { 528 state.removing = false; 529 } 530 // Trigger removal of vCPU 531 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 532 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 533 error!("Error removing vCPU: {:?}", e); 534 } 535 } 536 } else { 537 warn!("Out of range vCPU id: {}", self.selected_cpu); 538 } 539 } 540 _ => { 541 warn!( 542 "Unexpected offset for accessing CPU manager device: {:#}", 543 offset 544 ); 545 } 546 } 547 None 548 } 549 } 550 551 #[derive(Default)] 552 struct VcpuState { 553 inserting: bool, 554 removing: bool, 555 pending_removal: Arc<AtomicBool>, 556 handle: Option<thread::JoinHandle<()>>, 557 kill: Arc<AtomicBool>, 558 vcpu_run_interrupted: Arc<AtomicBool>, 559 paused: Arc<AtomicBool>, 560 } 561 562 impl VcpuState { 563 fn active(&self) -> bool { 564 self.handle.is_some() 565 } 566 567 fn signal_thread(&self) { 568 if let Some(handle) = self.handle.as_ref() { 569 loop { 570 // SAFETY: FFI call with correct arguments 571 unsafe { 572 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 573 } 574 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 575 break; 576 } else { 577 // This is more effective than thread::yield_now() at 578 // avoiding a priority inversion with the vCPU thread 579 thread::sleep(std::time::Duration::from_millis(1)); 580 } 581 } 582 } 583 } 584 585 fn join_thread(&mut self) -> Result<()> { 586 if let Some(handle) = self.handle.take() { 587 handle.join().map_err(Error::ThreadCleanup)? 
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}

impl CpuManager {
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vm_ops: Arc<dyn VmOps>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        numa_nodes: &NumaNodes,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
            return Err(Error::MaximumVcpusExceeded);
        }

        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
        let hypervisor_type = hypervisor.hypervisor_type();

        #[cfg(target_arch = "x86_64")]
        if config.features.amx {
            const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
            const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
            const XFEATURE_XTILEDATA: usize = 18;
            const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;

            // SAFETY: the syscall is only modifying kernel internal
            // data structures that the kernel is itself expected to safeguard.
            let amx_tile = unsafe {
                libc::syscall(
                    libc::SYS_arch_prctl,
                    ARCH_REQ_XCOMP_GUEST_PERM,
                    XFEATURE_XTILEDATA,
                )
            };

            if amx_tile != 0 {
                return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
            } else {
                let mask: usize = 0;
                // SAFETY: the mask is only modified from inside the unsafe block
                // (which is permitted without marking it mutable) and isn't in use elsewhere.
646 let result = unsafe { 647 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 648 }; 649 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 650 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 651 } 652 } 653 } 654 655 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 656 let mut cpu_list = Vec::new(); 657 for (proximity_domain, numa_node) in numa_nodes.iter() { 658 for cpu in numa_node.cpus.iter() { 659 cpu_list.push((*cpu, *proximity_domain)) 660 } 661 } 662 cpu_list 663 } 664 .into_iter() 665 .collect(); 666 667 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 668 cpu_affinity 669 .iter() 670 .map(|a| (a.vcpu, a.host_cpus.clone())) 671 .collect() 672 } else { 673 BTreeMap::new() 674 }; 675 676 #[cfg(feature = "tdx")] 677 let dynamic = !tdx_enabled; 678 #[cfg(not(feature = "tdx"))] 679 let dynamic = true; 680 681 Ok(Arc::new(Mutex::new(CpuManager { 682 hypervisor_type, 683 config: config.clone(), 684 interrupt_controller: None, 685 #[cfg(target_arch = "x86_64")] 686 cpuid: Vec::new(), 687 vm, 688 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 689 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 690 vcpu_states, 691 exit_evt, 692 reset_evt, 693 #[cfg(feature = "guest_debug")] 694 vm_debug_evt, 695 selected_cpu: 0, 696 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 697 seccomp_action, 698 vm_ops, 699 acpi_address: None, 700 proximity_domain_per_cpu, 701 affinity, 702 dynamic, 703 }))) 704 } 705 706 #[cfg(target_arch = "x86_64")] 707 pub fn populate_cpuid( 708 &mut self, 709 memory_manager: &Arc<Mutex<MemoryManager>>, 710 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 711 #[cfg(feature = "tdx")] tdx_enabled: bool, 712 ) -> Result<()> { 713 let sgx_epc_sections = memory_manager 714 .lock() 715 .unwrap() 716 .sgx_epc_region() 717 .as_ref() 718 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 719 720 let topology = self.config.topology.clone().map_or_else( 721 || { 722 #[cfg(feature = "mshv")] 723 if matches!(hypervisor.hypervisor_type(), HypervisorType::Mshv) { 724 return Some((1, self.boot_vcpus(), 1)); 725 } 726 None 727 }, 728 |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)), 729 ); 730 731 self.cpuid = { 732 let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits); 733 arch::generate_common_cpuid( 734 hypervisor, 735 topology, 736 sgx_epc_sections, 737 phys_bits, 738 self.config.kvm_hyperv, 739 #[cfg(feature = "tdx")] 740 tdx_enabled, 741 ) 742 .map_err(Error::CommonCpuId)? 743 }; 744 745 Ok(()) 746 } 747 748 fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> { 749 info!("Creating vCPU: cpu_id = {}", cpu_id); 750 751 let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vm_ops.clone()))?; 752 753 if let Some(snapshot) = snapshot { 754 // AArch64 vCPUs should be initialized after created. 755 #[cfg(target_arch = "aarch64")] 756 vcpu.init(&self.vm)?; 757 758 let state: CpuState = snapshot.to_state().map_err(|e| { 759 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e)) 760 })?; 761 vcpu.vcpu 762 .set_state(&state) 763 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?; 764 765 vcpu.saved_state = Some(state); 766 } 767 768 let vcpu = Arc::new(Mutex::new(vcpu)); 769 770 // Adding vCPU to the CpuManager's vCPU list. 
771 self.vcpus.push(vcpu.clone()); 772 773 Ok(vcpu) 774 } 775 776 pub fn configure_vcpu( 777 &self, 778 vcpu: Arc<Mutex<Vcpu>>, 779 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 780 ) -> Result<()> { 781 let mut vcpu = vcpu.lock().unwrap(); 782 783 #[cfg(target_arch = "x86_64")] 784 assert!(!self.cpuid.is_empty()); 785 786 #[cfg(target_arch = "x86_64")] 787 vcpu.configure(boot_setup, self.cpuid.clone(), self.config.kvm_hyperv)?; 788 789 #[cfg(target_arch = "aarch64")] 790 vcpu.configure(&self.vm, boot_setup)?; 791 792 Ok(()) 793 } 794 795 /// Only create new vCPUs if there aren't any inactive ones to reuse 796 fn create_vcpus( 797 &mut self, 798 desired_vcpus: u8, 799 snapshot: Option<Snapshot>, 800 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 801 let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![]; 802 info!( 803 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 804 desired_vcpus, 805 self.config.max_vcpus, 806 self.vcpus.len(), 807 self.present_vcpus() 808 ); 809 810 if desired_vcpus > self.config.max_vcpus { 811 return Err(Error::DesiredVCpuCountExceedsMax); 812 } 813 814 // Only create vCPUs in excess of all the allocated vCPUs. 815 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 816 vcpus.push(self.create_vcpu( 817 cpu_id, 818 // TODO: The special format of the CPU id can be removed once 819 // ready to break live upgrade. 820 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()), 821 )?); 822 } 823 824 Ok(vcpus) 825 } 826 827 #[cfg(target_arch = "aarch64")] 828 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 829 for cpu in self.vcpus.iter() { 830 let cpu = cpu.lock().unwrap(); 831 // Check if PMU attr is available, if not, log the information. 832 if cpu.vcpu.has_pmu_support() { 833 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 834 } else { 835 debug!( 836 "PMU attribute is not supported in vCPU{}, skip PMU init!", 837 cpu.id 838 ); 839 return Ok(false); 840 } 841 } 842 843 Ok(true) 844 } 845 846 pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> { 847 self.vcpus.clone() 848 } 849 850 fn start_vcpu( 851 &mut self, 852 vcpu: Arc<Mutex<Vcpu>>, 853 vcpu_id: u8, 854 vcpu_thread_barrier: Arc<Barrier>, 855 inserting: bool, 856 ) -> Result<()> { 857 let reset_evt = self.reset_evt.try_clone().unwrap(); 858 let exit_evt = self.exit_evt.try_clone().unwrap(); 859 #[cfg(feature = "kvm")] 860 let hypervisor_type = self.hypervisor_type; 861 #[cfg(feature = "guest_debug")] 862 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 863 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 864 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 865 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 866 867 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 868 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 869 .vcpu_run_interrupted 870 .clone(); 871 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 872 let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone(); 873 874 // Prepare the CPU set the current vCPU is expected to run onto. 
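        // For example, an affinity entry mapping this vCPU to host CPUs [1, 3]
        // produces a cpu_set_t with only bits 1 and 3 set; it is applied below
        // from the vCPU thread itself via sched_setaffinity().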
875 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 876 // SAFETY: all zeros is a valid pattern 877 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 878 // SAFETY: FFI call, trivially safe 879 unsafe { libc::CPU_ZERO(&mut cpuset) }; 880 for host_cpu in host_cpus { 881 // SAFETY: FFI call, trivially safe 882 unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) }; 883 } 884 cpuset 885 }); 886 887 // Retrieve seccomp filter for vcpu thread 888 let vcpu_seccomp_filter = 889 get_seccomp_filter(&self.seccomp_action, Thread::Vcpu, self.hypervisor_type) 890 .map_err(Error::CreateSeccompFilter)?; 891 892 #[cfg(target_arch = "x86_64")] 893 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 894 895 info!("Starting vCPU: cpu_id = {}", vcpu_id); 896 897 let handle = Some( 898 thread::Builder::new() 899 .name(format!("vcpu{vcpu_id}")) 900 .spawn(move || { 901 // Schedule the thread to run on the expected CPU set 902 if let Some(cpuset) = cpuset.as_ref() { 903 // SAFETY: FFI call with correct arguments 904 let ret = unsafe { 905 libc::sched_setaffinity( 906 0, 907 std::mem::size_of::<libc::cpu_set_t>(), 908 cpuset as *const libc::cpu_set_t, 909 ) 910 }; 911 912 if ret != 0 { 913 error!( 914 "Failed scheduling the vCPU {} on the expected CPU set: {}", 915 vcpu_id, 916 io::Error::last_os_error() 917 ); 918 return; 919 } 920 } 921 922 // Apply seccomp filter for vcpu thread. 923 if !vcpu_seccomp_filter.is_empty() { 924 if let Err(e) = 925 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 926 { 927 error!("Error applying seccomp filter: {:?}", e); 928 return; 929 } 930 } 931 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 932 // This uses an async signal safe handler to kill the vcpu handles. 933 register_signal_handler(SIGRTMIN(), handle_signal) 934 .expect("Failed to register vcpu signal handler"); 935 // Block until all CPUs are ready. 936 vcpu_thread_barrier.wait(); 937 938 std::panic::catch_unwind(move || { 939 loop { 940 // If we are being told to pause, we park the thread 941 // until the pause boolean is toggled. 942 // The resume operation is responsible for toggling 943 // the boolean and unpark the thread. 944 // We enter a loop because park() could spuriously 945 // return. We will then park() again unless the 946 // pause boolean has been toggled. 947 948 // Need to use Ordering::SeqCst as we have multiple 949 // loads and stores to different atomics and we need 950 // to see them in a consistent order in all threads 951 952 if vcpu_pause_signalled.load(Ordering::SeqCst) { 953 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are 954 // completed by returning to KVM_RUN. From the kernel docs: 955 // 956 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, 957 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding 958 // operations are complete (and guest state is consistent) only after userspace 959 // has re-entered the kernel with KVM_RUN. The kernel side will first finish 960 // incomplete operations and then check for pending signals. 961 // The pending state of the operation is not preserved in state which is 962 // visible to userspace, thus userspace should ensure that the operation is 963 // completed before performing a live migration. 
Userspace can re-enter the 964 // guest with an unmasked signal pending or with the immediate_exit field set 965 // to complete pending operations without allowing any further instructions 966 // to be executed. 967 968 #[cfg(feature = "kvm")] 969 if matches!(hypervisor_type, HypervisorType::Kvm) { 970 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); 971 if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { 972 error!("Unexpected VM exit on \"immediate_exit\" run"); 973 break; 974 } 975 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); 976 } 977 978 vcpu_run_interrupted.store(true, Ordering::SeqCst); 979 980 vcpu_paused.store(true, Ordering::SeqCst); 981 while vcpu_pause_signalled.load(Ordering::SeqCst) { 982 thread::park(); 983 } 984 vcpu_run_interrupted.store(false, Ordering::SeqCst); 985 } 986 987 // We've been told to terminate 988 if vcpu_kill_signalled.load(Ordering::SeqCst) 989 || vcpu_kill.load(Ordering::SeqCst) 990 { 991 vcpu_run_interrupted.store(true, Ordering::SeqCst); 992 break; 993 } 994 995 #[cfg(feature = "tdx")] 996 let mut vcpu = vcpu.lock().unwrap(); 997 #[cfg(not(feature = "tdx"))] 998 let vcpu = vcpu.lock().unwrap(); 999 // vcpu.run() returns false on a triple-fault so trigger a reset 1000 match vcpu.run() { 1001 Ok(run) => match run { 1002 #[cfg(feature = "kvm")] 1003 VmExit::Debug => { 1004 info!("VmExit::Debug"); 1005 #[cfg(feature = "guest_debug")] 1006 { 1007 vcpu_pause_signalled.store(true, Ordering::SeqCst); 1008 let raw_tid = get_raw_tid(vcpu_id as usize); 1009 vm_debug_evt.write(raw_tid as u64).unwrap(); 1010 } 1011 } 1012 #[cfg(target_arch = "x86_64")] 1013 VmExit::IoapicEoi(vector) => { 1014 if let Some(interrupt_controller) = 1015 &interrupt_controller_clone 1016 { 1017 interrupt_controller 1018 .lock() 1019 .unwrap() 1020 .end_of_interrupt(vector); 1021 } 1022 } 1023 VmExit::Ignore => {} 1024 VmExit::Hyperv => {} 1025 VmExit::Reset => { 1026 info!("VmExit::Reset"); 1027 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1028 reset_evt.write(1).unwrap(); 1029 break; 1030 } 1031 VmExit::Shutdown => { 1032 info!("VmExit::Shutdown"); 1033 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1034 exit_evt.write(1).unwrap(); 1035 break; 1036 } 1037 #[cfg(feature = "tdx")] 1038 VmExit::Tdx => { 1039 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1040 match vcpu.get_tdx_exit_details() { 1041 Ok(details) => match details { 1042 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1043 TdxExitDetails::SetupEventNotifyInterrupt => { 1044 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1045 } 1046 }, 1047 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1048 } 1049 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1050 } else { 1051 // We should never reach this code as 1052 // this means the design from the code 1053 // is wrong. 
1054 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1055 } 1056 } 1057 _ => { 1058 error!( 1059 "VCPU generated error: {:?}", 1060 Error::UnexpectedVmExit 1061 ); 1062 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1063 exit_evt.write(1).unwrap(); 1064 break; 1065 } 1066 }, 1067 1068 Err(e) => { 1069 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1070 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1071 exit_evt.write(1).unwrap(); 1072 break; 1073 } 1074 } 1075 1076 // We've been told to terminate 1077 if vcpu_kill_signalled.load(Ordering::SeqCst) 1078 || vcpu_kill.load(Ordering::SeqCst) 1079 { 1080 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1081 break; 1082 } 1083 } 1084 }) 1085 .or_else(|_| { 1086 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1087 error!("vCPU thread panicked"); 1088 panic_exit_evt.write(1) 1089 }) 1090 .ok(); 1091 }) 1092 .map_err(Error::VcpuSpawn)?, 1093 ); 1094 1095 // On hot plug calls into this function entry_point is None. It is for 1096 // those hotplug CPU additions that we need to set the inserting flag. 1097 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1098 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1099 1100 Ok(()) 1101 } 1102 1103 /// Start up as many vCPUs threads as needed to reach `desired_vcpus` 1104 fn activate_vcpus( 1105 &mut self, 1106 desired_vcpus: u8, 1107 inserting: bool, 1108 paused: Option<bool>, 1109 ) -> Result<()> { 1110 if desired_vcpus > self.config.max_vcpus { 1111 return Err(Error::DesiredVCpuCountExceedsMax); 1112 } 1113 1114 let vcpu_thread_barrier = Arc::new(Barrier::new( 1115 (desired_vcpus - self.present_vcpus() + 1) as usize, 1116 )); 1117 1118 if let Some(paused) = paused { 1119 self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); 1120 } 1121 1122 info!( 1123 "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", 1124 desired_vcpus, 1125 self.vcpus.len(), 1126 self.present_vcpus(), 1127 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1128 ); 1129 1130 // This reuses any inactive vCPUs as well as any that were newly created 1131 for vcpu_id in self.present_vcpus()..desired_vcpus { 1132 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1133 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1134 } 1135 1136 // Unblock all CPU threads. 
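        // The barrier was created above with one party per newly started vCPU
        // thread plus one for this caller, so this single wait() releases all
        // the freshly spawned vCPU threads at the same time.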
1137 vcpu_thread_barrier.wait(); 1138 Ok(()) 1139 } 1140 1141 fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) { 1142 // Mark vCPUs for removal, actual removal happens on ejection 1143 for cpu_id in desired_vcpus..self.present_vcpus() { 1144 self.vcpu_states[usize::from(cpu_id)].removing = true; 1145 self.vcpu_states[usize::from(cpu_id)] 1146 .pending_removal 1147 .store(true, Ordering::SeqCst); 1148 } 1149 } 1150 1151 pub fn check_pending_removed_vcpu(&mut self) -> bool { 1152 for state in self.vcpu_states.iter() { 1153 if state.active() && state.pending_removal.load(Ordering::SeqCst) { 1154 return true; 1155 } 1156 } 1157 false 1158 } 1159 1160 fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> { 1161 info!("Removing vCPU: cpu_id = {}", cpu_id); 1162 let state = &mut self.vcpu_states[usize::from(cpu_id)]; 1163 state.kill.store(true, Ordering::SeqCst); 1164 state.signal_thread(); 1165 state.join_thread()?; 1166 state.handle = None; 1167 1168 // Once the thread has exited, clear the "kill" so that it can reused 1169 state.kill.store(false, Ordering::SeqCst); 1170 state.pending_removal.store(false, Ordering::SeqCst); 1171 1172 Ok(()) 1173 } 1174 1175 pub fn create_boot_vcpus( 1176 &mut self, 1177 snapshot: Option<Snapshot>, 1178 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 1179 trace_scoped!("create_boot_vcpus"); 1180 1181 self.create_vcpus(self.boot_vcpus(), snapshot) 1182 } 1183 1184 // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running. 1185 pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> { 1186 self.activate_vcpus(self.boot_vcpus(), false, Some(paused)) 1187 } 1188 1189 pub fn start_restored_vcpus(&mut self) -> Result<()> { 1190 self.activate_vcpus(self.vcpus.len() as u8, false, Some(true)) 1191 .map_err(|e| { 1192 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) 1193 })?; 1194 1195 Ok(()) 1196 } 1197 1198 pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> { 1199 if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal { 1200 return Ok(false); 1201 } 1202 1203 if !self.dynamic { 1204 return Ok(false); 1205 } 1206 1207 if self.check_pending_removed_vcpu() { 1208 return Err(Error::VcpuPendingRemovedVcpu); 1209 } 1210 1211 match desired_vcpus.cmp(&self.present_vcpus()) { 1212 cmp::Ordering::Greater => { 1213 let vcpus = self.create_vcpus(desired_vcpus, None)?; 1214 for vcpu in vcpus { 1215 self.configure_vcpu(vcpu, None)? 1216 } 1217 self.activate_vcpus(desired_vcpus, true, None)?; 1218 Ok(true) 1219 } 1220 cmp::Ordering::Less => { 1221 self.mark_vcpus_for_removal(desired_vcpus); 1222 Ok(true) 1223 } 1224 _ => Ok(false), 1225 } 1226 } 1227 1228 pub fn shutdown(&mut self) -> Result<()> { 1229 // Tell the vCPUs to stop themselves next time they go through the loop 1230 self.vcpus_kill_signalled.store(true, Ordering::SeqCst); 1231 1232 // Toggle the vCPUs pause boolean 1233 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 1234 1235 // Unpark all the VCPU threads. 1236 for state in self.vcpu_states.iter() { 1237 state.unpark_thread(); 1238 } 1239 1240 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 1241 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 1242 // above. 1243 for state in self.vcpu_states.iter() { 1244 state.signal_thread(); 1245 } 1246 1247 // Wait for all the threads to finish. This removes the state from the vector. 1248 for mut state in self.vcpu_states.drain(..) 
{ 1249 state.join_thread()?; 1250 } 1251 1252 Ok(()) 1253 } 1254 1255 #[cfg(feature = "tdx")] 1256 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1257 for vcpu in &self.vcpus { 1258 vcpu.lock() 1259 .unwrap() 1260 .vcpu 1261 .tdx_init(hob_address) 1262 .map_err(Error::InitializeTdx)?; 1263 } 1264 Ok(()) 1265 } 1266 1267 pub fn boot_vcpus(&self) -> u8 { 1268 self.config.boot_vcpus 1269 } 1270 1271 pub fn max_vcpus(&self) -> u8 { 1272 self.config.max_vcpus 1273 } 1274 1275 #[cfg(target_arch = "x86_64")] 1276 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1277 assert!(!self.cpuid.is_empty()); 1278 self.cpuid.clone() 1279 } 1280 1281 fn present_vcpus(&self) -> u8 { 1282 self.vcpu_states 1283 .iter() 1284 .fold(0, |acc, state| acc + state.active() as u8) 1285 } 1286 1287 #[cfg(target_arch = "aarch64")] 1288 pub fn get_mpidrs(&self) -> Vec<u64> { 1289 self.vcpus 1290 .iter() 1291 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1292 .collect() 1293 } 1294 1295 #[cfg(target_arch = "aarch64")] 1296 pub fn get_saved_states(&self) -> Vec<CpuState> { 1297 self.vcpus 1298 .iter() 1299 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1300 .collect() 1301 } 1302 1303 #[cfg(target_arch = "aarch64")] 1304 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1305 self.config 1306 .topology 1307 .clone() 1308 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1309 } 1310 1311 pub fn create_madt(&self) -> Sdt { 1312 use crate::acpi; 1313 // This is also checked in the commandline parsing. 1314 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1315 1316 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1317 #[cfg(target_arch = "x86_64")] 1318 { 1319 madt.write(36, arch::layout::APIC_START.0); 1320 1321 for cpu in 0..self.config.max_vcpus { 1322 let lapic = LocalX2Apic { 1323 r#type: acpi::ACPI_X2APIC_PROCESSOR, 1324 length: 16, 1325 processor_id: cpu.into(), 1326 apic_id: cpu.into(), 1327 flags: if cpu < self.config.boot_vcpus { 1328 1 << MADT_CPU_ENABLE_FLAG 1329 } else { 1330 0 1331 } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG, 1332 _reserved: 0, 1333 }; 1334 madt.append(lapic); 1335 } 1336 1337 madt.append(Ioapic { 1338 r#type: acpi::ACPI_APIC_IO, 1339 length: 12, 1340 ioapic_id: 0, 1341 apic_address: arch::layout::IOAPIC_START.0 as u32, 1342 gsi_base: 0, 1343 ..Default::default() 1344 }); 1345 1346 madt.append(InterruptSourceOverride { 1347 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1348 length: 10, 1349 bus: 0, 1350 source: 4, 1351 gsi: 4, 1352 flags: 0, 1353 }); 1354 } 1355 1356 #[cfg(target_arch = "aarch64")] 1357 { 1358 /* Notes: 1359 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1360 */ 1361 1362 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 
1363 for cpu in 0..self.config.boot_vcpus { 1364 let vcpu = &self.vcpus[cpu as usize]; 1365 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1366 /* ARMv8 MPIDR format: 1367 Bits [63:40] Must be zero 1368 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1369 Bits [31:24] Must be zero 1370 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1371 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1372 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1373 */ 1374 let mpidr_mask = 0xff_00ff_ffff; 1375 let gicc = GicC { 1376 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1377 length: 80, 1378 reserved0: 0, 1379 cpu_interface_number: cpu as u32, 1380 uid: cpu as u32, 1381 flags: 1, 1382 parking_version: 0, 1383 performance_interrupt: 0, 1384 parked_address: 0, 1385 base_address: 0, 1386 gicv_base_address: 0, 1387 gich_base_address: 0, 1388 vgic_interrupt: 0, 1389 gicr_base_address: 0, 1390 mpidr: mpidr & mpidr_mask, 1391 proc_power_effi_class: 0, 1392 reserved1: 0, 1393 spe_overflow_interrupt: 0, 1394 }; 1395 1396 madt.append(gicc); 1397 } 1398 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1399 1400 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1401 let gicd = GicD { 1402 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1403 length: 24, 1404 reserved0: 0, 1405 gic_id: 0, 1406 base_address: vgic_config.dist_addr, 1407 global_irq_base: 0, 1408 version: 3, 1409 reserved1: [0; 3], 1410 }; 1411 madt.append(gicd); 1412 1413 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1414 let gicr = GicR { 1415 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1416 length: 16, 1417 reserved: 0, 1418 base_address: vgic_config.redists_addr, 1419 range_length: vgic_config.redists_size as u32, 1420 }; 1421 madt.append(gicr); 1422 1423 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 1424 let gicits = GicIts { 1425 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1426 length: 20, 1427 reserved0: 0, 1428 translation_id: 0, 1429 base_address: vgic_config.msi_addr, 1430 reserved1: 0, 1431 }; 1432 madt.append(gicits); 1433 1434 madt.update_checksum(); 1435 } 1436 1437 madt 1438 } 1439 1440 #[cfg(target_arch = "aarch64")] 1441 pub fn create_pptt(&self) -> Sdt { 1442 let pptt_start = 0; 1443 let mut cpus = 0; 1444 let mut uid = 0; 1445 // If topology is not specified, the default setting is: 1446 // 1 package, multiple cores, 1 thread per core 1447 // This is also the behavior when PPTT is missing. 
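        // Illustrative example: with a topology of 2 packages, 2 cores per
        // package and 2 threads per core (8 boot vCPUs), the loop below emits
        // 2 package nodes, 4 core nodes and 8 thread nodes, i.e. 14
        // ProcessorHierarchyNode entries in total.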
1448 let (threads_per_core, cores_per_package, packages) = 1449 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1450 1451 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1452 1453 for cluster_idx in 0..packages { 1454 if cpus < self.config.boot_vcpus as usize { 1455 let cluster_offset = pptt.len() - pptt_start; 1456 let cluster_hierarchy_node = ProcessorHierarchyNode { 1457 r#type: 0, 1458 length: 20, 1459 reserved: 0, 1460 flags: 0x2, 1461 parent: 0, 1462 acpi_processor_id: cluster_idx as u32, 1463 num_private_resources: 0, 1464 }; 1465 pptt.append(cluster_hierarchy_node); 1466 1467 for core_idx in 0..cores_per_package { 1468 let core_offset = pptt.len() - pptt_start; 1469 1470 if threads_per_core > 1 { 1471 let core_hierarchy_node = ProcessorHierarchyNode { 1472 r#type: 0, 1473 length: 20, 1474 reserved: 0, 1475 flags: 0x2, 1476 parent: cluster_offset as u32, 1477 acpi_processor_id: core_idx as u32, 1478 num_private_resources: 0, 1479 }; 1480 pptt.append(core_hierarchy_node); 1481 1482 for _thread_idx in 0..threads_per_core { 1483 let thread_hierarchy_node = ProcessorHierarchyNode { 1484 r#type: 0, 1485 length: 20, 1486 reserved: 0, 1487 flags: 0xE, 1488 parent: core_offset as u32, 1489 acpi_processor_id: uid as u32, 1490 num_private_resources: 0, 1491 }; 1492 pptt.append(thread_hierarchy_node); 1493 uid += 1; 1494 } 1495 } else { 1496 let thread_hierarchy_node = ProcessorHierarchyNode { 1497 r#type: 0, 1498 length: 20, 1499 reserved: 0, 1500 flags: 0xA, 1501 parent: cluster_offset as u32, 1502 acpi_processor_id: uid as u32, 1503 num_private_resources: 0, 1504 }; 1505 pptt.append(thread_hierarchy_node); 1506 uid += 1; 1507 } 1508 } 1509 cpus += (cores_per_package * threads_per_core) as usize; 1510 } 1511 } 1512 1513 pptt.update_checksum(); 1514 pptt 1515 } 1516 1517 #[cfg(feature = "guest_debug")] 1518 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1519 self.vcpus[usize::from(cpu_id)] 1520 .lock() 1521 .unwrap() 1522 .vcpu 1523 .get_regs() 1524 .map_err(Error::CpuDebug) 1525 } 1526 1527 #[cfg(feature = "guest_debug")] 1528 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1529 self.vcpus[usize::from(cpu_id)] 1530 .lock() 1531 .unwrap() 1532 .vcpu 1533 .set_regs(regs) 1534 .map_err(Error::CpuDebug) 1535 } 1536 1537 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1538 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1539 self.vcpus[usize::from(cpu_id)] 1540 .lock() 1541 .unwrap() 1542 .vcpu 1543 .get_sregs() 1544 .map_err(Error::CpuDebug) 1545 } 1546 1547 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1548 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1549 self.vcpus[usize::from(cpu_id)] 1550 .lock() 1551 .unwrap() 1552 .vcpu 1553 .set_sregs(sregs) 1554 .map_err(Error::CpuDebug) 1555 } 1556 1557 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1558 fn translate_gva( 1559 &self, 1560 _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1561 cpu_id: u8, 1562 gva: u64, 1563 ) -> Result<u64> { 1564 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1565 .lock() 1566 .unwrap() 1567 .vcpu 1568 .translate_gva(gva, /* flags: unused */ 0) 1569 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1570 Ok(gpa) 1571 } 1572 1573 /// 1574 /// On AArch64, `translate_gva` API is not provided by KVM. We implemented 1575 /// it in VMM by walking through translation tables. 
    ///
    /// Address translation is a big topic; here we only focus on the scenario
    /// that happens in the VMM while debugging the kernel. This `translate_gva`
    /// implementation is restricted to:
    /// - Exception Level 1
    /// - Translate high address range only (kernel space)
    ///
    /// This implementation supports the following Armv8-A features related to
    /// address translation:
    /// - FEAT_LPA
    /// - FEAT_LVA
    /// - FEAT_LPA2
    ///
    #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
    fn translate_gva(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: u8,
        gva: u64,
    ) -> Result<u64> {
        let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TCR_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TTBR1_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::ID_AA64MMFR0_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;

        // Bit 55 of the VA determines the range, high (0xFFFxxx...)
        // or low (0x000xxx...).
        let high_range = extract_bits_64!(gva, 55, 1);
        if high_range == 0 {
            info!("VA (0x{:x}) range is not supported!", gva);
            return Ok(gva);
        }

        // High range size offset
        let tsz = extract_bits_64!(tcr_el1, 16, 6);
        // Granule size
        let tg = extract_bits_64!(tcr_el1, 30, 2);
        // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
        let ds = extract_bits_64!(tcr_el1, 59, 1);

        if tsz == 0 {
            info!("VA translation is not ready!");
            return Ok(gva);
        }

        // VA size is determined by TCR_EL1.T1SZ
        let va_size = 64 - tsz;
        // Number of bits in VA consumed in each level of translation
        let stride = match tg {
            3 => 13, // 64KB granule size
            1 => 11, // 16KB granule size
            _ => 9,  // 4KB, default
        };
        // Starting level of walking
        let mut level = 4 - (va_size - 4) / stride;

        // PA or IPA size is determined
        let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
        let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
        // The IPA size in TCR_EL1 and the PA Range in ID_AA64MMFR0_EL1 should match.
        // To be safe, we use the minimum value if they are different.
        let pa_range = std::cmp::min(tcr_ips, pa_range);
        // PA size in bits
        let pa_size = match pa_range {
            0 => 32,
            1 => 36,
            2 => 40,
            3 => 42,
            4 => 44,
            5 => 48,
            6 => 52,
            _ => {
                return Err(Error::TranslateVirtualAddress(anyhow!(format!(
                    "PA range not supported {pa_range}"
                ))))
            }
        };

        let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
        let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
        // If FEAT_LPA2 is present, the translation table descriptor holds
        // 50 bits of the table address of next level.
        // Otherwise, it is 48 bits.
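        // Worked example for the default 4KB granule (stride = 9) without
        // FEAT_LPA2 (ds == 0): indexmask_grainsize covers the low 12 bits, so
        // descaddrmask below ends up selecting bits [47:12], i.e. a 4KB-aligned
        // 48-bit table address.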
        let descaddrmask = if ds == 1 {
            !0u64 >> (64 - 50) // mask with 50 least significant bits
        } else {
            !0u64 >> (64 - 48) // mask with 48 least significant bits
        };
        let descaddrmask = descaddrmask & !indexmask_grainsize;

        // Translation table base address
        let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
        // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
        // address bits [48:51] come from TTBR1_EL1 bits [2:5].
        if pa_size == 52 {
            descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
        }

        // Loop through tables of each level
        loop {
            // Table offset for current level
            let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
            descaddr |= table_offset;
            descaddr &= !7u64;

            let mut buf = [0; 8];
            guest_memory
                .memory()
                .read(&mut buf, GuestAddress(descaddr))
                .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
            let descriptor = u64::from_le_bytes(buf);

            descaddr = descriptor & descaddrmask;
            // In the case of FEAT_LPA, the next-level translation table address
            // bits [48:51] come from bits [12:15] of the current descriptor.
            // For FEAT_LPA2, the next-level translation table address
            // bits [50:51] come from bits [8:9] of the current descriptor, and
            // bits [48:49] come from bits [48:49] of the descriptor, which was
            // handled previously.
            if pa_size == 52 {
                if ds == 1 {
                    // FEAT_LPA2
                    descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
                } else {
                    // FEAT_LPA
                    descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
                }
            }

            if (descriptor & 2) != 0 && (level < 3) {
                // This is a table entry. Go down to next level.
1721 level += 1; 1722 indexmask = indexmask_grainsize; 1723 continue; 1724 } 1725 1726 break; 1727 } 1728 1729 // We have reached either: 1730 // - a page entry at level 3 or 1731 // - a block entry at level 1 or 2 1732 let page_size = 1u64 << ((stride * (4 - level)) + 3); 1733 descaddr &= !(page_size - 1); 1734 descaddr |= gva & (page_size - 1); 1735 1736 Ok(descaddr) 1737 } 1738 1739 pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) { 1740 self.acpi_address = Some(acpi_address); 1741 } 1742 1743 pub(crate) fn set_interrupt_controller( 1744 &mut self, 1745 interrupt_controller: Arc<Mutex<dyn InterruptController>>, 1746 ) { 1747 self.interrupt_controller = Some(interrupt_controller); 1748 } 1749 1750 pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> { 1751 &self.vcpus_kill_signalled 1752 } 1753 } 1754 1755 struct Cpu { 1756 cpu_id: u8, 1757 proximity_domain: u32, 1758 dynamic: bool, 1759 } 1760 1761 #[cfg(target_arch = "x86_64")] 1762 const MADT_CPU_ENABLE_FLAG: usize = 0; 1763 1764 #[cfg(target_arch = "x86_64")] 1765 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; 1766 1767 impl Cpu { 1768 #[cfg(target_arch = "x86_64")] 1769 fn generate_mat(&self) -> Vec<u8> { 1770 let lapic = LocalX2Apic { 1771 r#type: crate::acpi::ACPI_X2APIC_PROCESSOR, 1772 length: 16, 1773 processor_id: self.cpu_id.into(), 1774 apic_id: self.cpu_id.into(), 1775 flags: 1 << MADT_CPU_ENABLE_FLAG, 1776 _reserved: 0, 1777 }; 1778 1779 let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)]; 1780 // SAFETY: mat_data is large enough to hold lapic 1781 unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic }; 1782 1783 mat_data 1784 } 1785 } 1786 1787 impl Aml for Cpu { 1788 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1789 #[cfg(target_arch = "x86_64")] 1790 let mat_data: Vec<u8> = self.generate_mat(); 1791 #[allow(clippy::if_same_then_else)] 1792 if self.dynamic { 1793 aml::Device::new( 1794 format!("C{:03X}", self.cpu_id).as_str().into(), 1795 vec![ 1796 &aml::Name::new("_HID".into(), &"ACPI0007"), 1797 &aml::Name::new("_UID".into(), &self.cpu_id), 1798 // Currently, AArch64 cannot support following fields. 1799 /* 1800 _STA return value: 1801 Bit [0] – Set if the device is present. 1802 Bit [1] – Set if the device is enabled and decoding its resources. 1803 Bit [2] – Set if the device should be shown in the UI. 1804 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1805 Bit [4] – Set if the battery is present. 1806 Bits [31:5] – Reserved (must be cleared). 
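                In this module: a dynamic CPU object's _STA calls CSTA, which
                returns 0xF (present | enabled | shown | functioning) when the
                CPEN bit for the selected vCPU is set, and 0x0 otherwise.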
1807 */ 1808 #[cfg(target_arch = "x86_64")] 1809 &aml::Method::new( 1810 "_STA".into(), 1811 0, 1812 false, 1813 // Call into CSTA method which will interrogate device 1814 vec![&aml::Return::new(&aml::MethodCall::new( 1815 "CSTA".into(), 1816 vec![&self.cpu_id], 1817 ))], 1818 ), 1819 &aml::Method::new( 1820 "_PXM".into(), 1821 0, 1822 false, 1823 vec![&aml::Return::new(&self.proximity_domain)], 1824 ), 1825 // The Linux kernel expects every CPU device to have a _MAT entry 1826 // containing the LAPIC for this processor with the enabled bit set 1827 // even it if is disabled in the MADT (non-boot CPU) 1828 #[cfg(target_arch = "x86_64")] 1829 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 1830 // Trigger CPU ejection 1831 #[cfg(target_arch = "x86_64")] 1832 &aml::Method::new( 1833 "_EJ0".into(), 1834 1, 1835 false, 1836 // Call into CEJ0 method which will actually eject device 1837 vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])], 1838 ), 1839 ], 1840 ) 1841 .to_aml_bytes(sink); 1842 } else { 1843 aml::Device::new( 1844 format!("C{:03X}", self.cpu_id).as_str().into(), 1845 vec![ 1846 &aml::Name::new("_HID".into(), &"ACPI0007"), 1847 &aml::Name::new("_UID".into(), &self.cpu_id), 1848 #[cfg(target_arch = "x86_64")] 1849 &aml::Method::new( 1850 "_STA".into(), 1851 0, 1852 false, 1853 // Mark CPU present see CSTA implementation 1854 vec![&aml::Return::new(&0xfu8)], 1855 ), 1856 &aml::Method::new( 1857 "_PXM".into(), 1858 0, 1859 false, 1860 vec![&aml::Return::new(&self.proximity_domain)], 1861 ), 1862 // The Linux kernel expects every CPU device to have a _MAT entry 1863 // containing the LAPIC for this processor with the enabled bit set 1864 // even it if is disabled in the MADT (non-boot CPU) 1865 #[cfg(target_arch = "x86_64")] 1866 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 1867 ], 1868 ) 1869 .to_aml_bytes(sink); 1870 } 1871 } 1872 } 1873 1874 struct CpuNotify { 1875 cpu_id: u8, 1876 } 1877 1878 impl Aml for CpuNotify { 1879 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1880 let object = aml::Path::new(&format!("C{:03X}", self.cpu_id)); 1881 aml::If::new( 1882 &aml::Equal::new(&aml::Arg(0), &self.cpu_id), 1883 vec![&aml::Notify::new(&object, &aml::Arg(1))], 1884 ) 1885 .to_aml_bytes(sink) 1886 } 1887 } 1888 1889 struct CpuMethods { 1890 max_vcpus: u8, 1891 dynamic: bool, 1892 } 1893 1894 impl Aml for CpuMethods { 1895 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1896 if self.dynamic { 1897 // CPU status method 1898 aml::Method::new( 1899 "CSTA".into(), 1900 1, 1901 true, 1902 vec![ 1903 // Take lock defined above 1904 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1905 // Write CPU number (in first argument) to I/O port via field 1906 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 1907 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1908 // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning) 1909 &aml::If::new( 1910 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE), 1911 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 1912 ), 1913 // Release lock 1914 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1915 // Return 0 or 0xf 1916 &aml::Return::new(&aml::Local(0)), 1917 ], 1918 ) 1919 .to_aml_bytes(sink); 1920 1921 let mut cpu_notifies = Vec::new(); 1922 for cpu_id in 0..self.max_vcpus { 1923 cpu_notifies.push(CpuNotify { cpu_id }); 1924 } 1925 1926 let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new(); 1927 for cpu_id in 
0..self.max_vcpus { 1928 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 1929 } 1930 1931 aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink); 1932 1933 aml::Method::new( 1934 "CEJ0".into(), 1935 1, 1936 true, 1937 vec![ 1938 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1939 // Write CPU number (in first argument) to I/O port via field 1940 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 1941 // Set CEJ0 bit 1942 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 1943 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1944 ], 1945 ) 1946 .to_aml_bytes(sink); 1947 1948 aml::Method::new( 1949 "CSCN".into(), 1950 0, 1951 true, 1952 vec![ 1953 // Take lock defined above 1954 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1955 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1956 &aml::While::new( 1957 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 1958 vec![ 1959 // Write CPU number (in first argument) to I/O port via field 1960 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 1961 // Check if CINS bit is set 1962 &aml::If::new( 1963 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 1964 // Notify device if it is 1965 vec![ 1966 &aml::MethodCall::new( 1967 "CTFY".into(), 1968 vec![&aml::Local(0), &aml::ONE], 1969 ), 1970 // Reset CINS bit 1971 &aml::Store::new( 1972 &aml::Path::new("\\_SB_.PRES.CINS"), 1973 &aml::ONE, 1974 ), 1975 ], 1976 ), 1977 // Check if CRMV bit is set 1978 &aml::If::new( 1979 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 1980 // Notify device if it is (with the eject constant 0x3) 1981 vec![ 1982 &aml::MethodCall::new( 1983 "CTFY".into(), 1984 vec![&aml::Local(0), &3u8], 1985 ), 1986 // Reset CRMV bit 1987 &aml::Store::new( 1988 &aml::Path::new("\\_SB_.PRES.CRMV"), 1989 &aml::ONE, 1990 ), 1991 ], 1992 ), 1993 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 1994 ], 1995 ), 1996 // Release lock 1997 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1998 ], 1999 ) 2000 .to_aml_bytes(sink) 2001 } else { 2002 aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink) 2003 } 2004 } 2005 } 2006 2007 impl Aml for CpuManager { 2008 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2009 #[cfg(target_arch = "x86_64")] 2010 if let Some(acpi_address) = self.acpi_address { 2011 // CPU hotplug controller 2012 aml::Device::new( 2013 "_SB_.PRES".into(), 2014 vec![ 2015 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 2016 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 2017 // Mutex to protect concurrent access as we write to choose CPU and then read back status 2018 &aml::Mutex::new("CPLK".into(), 0), 2019 &aml::Name::new( 2020 "_CRS".into(), 2021 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2022 aml::AddressSpaceCacheable::NotCacheable, 2023 true, 2024 acpi_address.0, 2025 acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 2026 None, 2027 )]), 2028 ), 2029 // OpRegion and Fields map MMIO range into individual field values 2030 &aml::OpRegion::new( 2031 "PRST".into(), 2032 aml::OpRegionSpace::SystemMemory, 2033 &(acpi_address.0 as usize), 2034 &CPU_MANAGER_ACPI_SIZE, 2035 ), 2036 &aml::Field::new( 2037 "PRST".into(), 2038 aml::FieldAccessType::Byte, 2039 aml::FieldLockRule::NoLock, 2040 aml::FieldUpdateRule::WriteAsZeroes, 2041 vec![ 2042 aml::FieldEntry::Reserved(32), 2043 aml::FieldEntry::Named(*b"CPEN", 1), 2044 aml::FieldEntry::Named(*b"CINS", 1), 2045 
aml::FieldEntry::Named(*b"CRMV", 1), 2046 aml::FieldEntry::Named(*b"CEJ0", 1), 2047 aml::FieldEntry::Reserved(4), 2048 aml::FieldEntry::Named(*b"CCMD", 8), 2049 ], 2050 ), 2051 &aml::Field::new( 2052 "PRST".into(), 2053 aml::FieldAccessType::DWord, 2054 aml::FieldLockRule::NoLock, 2055 aml::FieldUpdateRule::Preserve, 2056 vec![ 2057 aml::FieldEntry::Named(*b"CSEL", 32), 2058 aml::FieldEntry::Reserved(32), 2059 aml::FieldEntry::Named(*b"CDAT", 32), 2060 ], 2061 ), 2062 ], 2063 ) 2064 .to_aml_bytes(sink); 2065 } 2066 2067 // CPU devices 2068 let hid = aml::Name::new("_HID".into(), &"ACPI0010"); 2069 let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05")); 2070 // Bundle methods together under a common object 2071 let methods = CpuMethods { 2072 max_vcpus: self.config.max_vcpus, 2073 dynamic: self.dynamic, 2074 }; 2075 let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods]; 2076 2077 let mut cpu_devices = Vec::new(); 2078 for cpu_id in 0..self.config.max_vcpus { 2079 let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0); 2080 let cpu_device = Cpu { 2081 cpu_id, 2082 proximity_domain, 2083 dynamic: self.dynamic, 2084 }; 2085 2086 cpu_devices.push(cpu_device); 2087 } 2088 2089 for cpu_device in cpu_devices.iter() { 2090 cpu_data_inner.push(cpu_device); 2091 } 2092 2093 aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink) 2094 } 2095 } 2096 2097 impl Pausable for CpuManager { 2098 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2099 // Tell the vCPUs to pause themselves next time they exit 2100 self.vcpus_pause_signalled.store(true, Ordering::SeqCst); 2101 2102 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 2103 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 2104 // above. 2105 for state in self.vcpu_states.iter() { 2106 state.signal_thread(); 2107 } 2108 2109 for vcpu in self.vcpus.iter() { 2110 let mut vcpu = vcpu.lock().unwrap(); 2111 vcpu.pause()?; 2112 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2113 if !self.config.kvm_hyperv { 2114 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { 2115 MigratableError::Pause(anyhow!( 2116 "Could not notify guest it has been paused {:?}", 2117 e 2118 )) 2119 })?; 2120 } 2121 } 2122 2123 // The vCPU thread will change its paused state before parking, so wait here for each 2124 // active vCPU to change its state, ensuring it has parked. 2125 for state in self.vcpu_states.iter() { 2126 if state.active() { 2127 while !state.paused.load(Ordering::SeqCst) { 2128 // To avoid a priority inversion with the vCPU thread 2129 thread::sleep(std::time::Duration::from_millis(1)); 2130 } 2131 } 2132 } 2133 2134 Ok(()) 2135 } 2136 2137 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2138 for vcpu in self.vcpus.iter() { 2139 vcpu.lock().unwrap().resume()?; 2140 } 2141 2142 // Clear the vCPUs' pause boolean 2143 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 2144 2145 // Unpark all the vCPU threads. 2146 // Once unparked, the next thing they will do is check the pause 2147 // boolean. Since it is now false, they will exit their pause loop 2148 // and go back to running the guest.
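// Clearing each vCPU's `paused` flag before unparking is what allows a subsequent
// pause() to once again wait for the flag to become true after the thread has parked.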
2149 for state in self.vcpu_states.iter() { 2150 state.paused.store(false, Ordering::SeqCst); 2151 state.unpark_thread(); 2152 } 2153 Ok(()) 2154 } 2155 } 2156 2157 impl Snapshottable for CpuManager { 2158 fn id(&self) -> String { 2159 CPU_MANAGER_SNAPSHOT_ID.to_string() 2160 } 2161 2162 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2163 let mut cpu_manager_snapshot = Snapshot::default(); 2164 2165 // The CpuManager snapshot is a collection of all vCPUs snapshots. 2166 for vcpu in &self.vcpus { 2167 let mut vcpu = vcpu.lock().unwrap(); 2168 cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?); 2169 } 2170 2171 Ok(cpu_manager_snapshot) 2172 } 2173 } 2174 2175 impl Transportable for CpuManager {} 2176 impl Migratable for CpuManager {} 2177 2178 #[cfg(feature = "guest_debug")] 2179 impl Debuggable for CpuManager { 2180 #[cfg(feature = "kvm")] 2181 fn set_guest_debug( 2182 &self, 2183 cpu_id: usize, 2184 addrs: &[GuestAddress], 2185 singlestep: bool, 2186 ) -> std::result::Result<(), DebuggableError> { 2187 self.vcpus[cpu_id] 2188 .lock() 2189 .unwrap() 2190 .vcpu 2191 .set_guest_debug(addrs, singlestep) 2192 .map_err(DebuggableError::SetDebug) 2193 } 2194 2195 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2196 Ok(()) 2197 } 2198 2199 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2200 Ok(()) 2201 } 2202 2203 #[cfg(target_arch = "x86_64")] 2204 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2205 // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15 2206 let gregs = self 2207 .get_regs(cpu_id as u8) 2208 .map_err(DebuggableError::ReadRegs)?; 2209 let regs = [ 2210 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp, 2211 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15, 2212 ]; 2213 2214 // GDB exposes 32-bit eflags instead of 64-bit rflags. 
2215 // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml 2216 let eflags = gregs.rflags as u32; 2217 let rip = gregs.rip; 2218 2219 // Segment registers: CS, SS, DS, ES, FS, GS 2220 let sregs = self 2221 .get_sregs(cpu_id as u8) 2222 .map_err(DebuggableError::ReadRegs)?; 2223 let segments = X86SegmentRegs { 2224 cs: sregs.cs.selector as u32, 2225 ss: sregs.ss.selector as u32, 2226 ds: sregs.ds.selector as u32, 2227 es: sregs.es.selector as u32, 2228 fs: sregs.fs.selector as u32, 2229 gs: sregs.gs.selector as u32, 2230 }; 2231 2232 // TODO: Add other registers 2233 2234 Ok(CoreRegs { 2235 regs, 2236 eflags, 2237 rip, 2238 segments, 2239 ..Default::default() 2240 }) 2241 } 2242 2243 #[cfg(target_arch = "aarch64")] 2244 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2245 let gregs = self 2246 .get_regs(cpu_id as u8) 2247 .map_err(DebuggableError::ReadRegs)?; 2248 Ok(CoreRegs { 2249 x: gregs.regs.regs, 2250 sp: gregs.regs.sp, 2251 pc: gregs.regs.pc, 2252 ..Default::default() 2253 }) 2254 } 2255 2256 #[cfg(target_arch = "x86_64")] 2257 fn write_regs( 2258 &self, 2259 cpu_id: usize, 2260 regs: &CoreRegs, 2261 ) -> std::result::Result<(), DebuggableError> { 2262 let orig_gregs = self 2263 .get_regs(cpu_id as u8) 2264 .map_err(DebuggableError::ReadRegs)?; 2265 let gregs = StandardRegisters { 2266 rax: regs.regs[0], 2267 rbx: regs.regs[1], 2268 rcx: regs.regs[2], 2269 rdx: regs.regs[3], 2270 rsi: regs.regs[4], 2271 rdi: regs.regs[5], 2272 rbp: regs.regs[6], 2273 rsp: regs.regs[7], 2274 r8: regs.regs[8], 2275 r9: regs.regs[9], 2276 r10: regs.regs[10], 2277 r11: regs.regs[11], 2278 r12: regs.regs[12], 2279 r13: regs.regs[13], 2280 r14: regs.regs[14], 2281 r15: regs.regs[15], 2282 rip: regs.rip, 2283 // Update only the lower 32 bits of rflags. 2284 rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64), 2285 }; 2286 2287 self.set_regs(cpu_id as u8, &gregs) 2288 .map_err(DebuggableError::WriteRegs)?; 2289 2290 // Segment registers: CS, SS, DS, ES, FS, GS 2291 // Since GDB only cares about the selectors, we call get_sregs() first.
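// Read-modify-write: fetch the current special registers so that only the visible
// selector values are replaced below, while the cached segment bases, limits and
// access rights reported by the hypervisor are written back unchanged.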
2292 let mut sregs = self 2293 .get_sregs(cpu_id as u8) 2294 .map_err(DebuggableError::ReadRegs)?; 2295 sregs.cs.selector = regs.segments.cs as u16; 2296 sregs.ss.selector = regs.segments.ss as u16; 2297 sregs.ds.selector = regs.segments.ds as u16; 2298 sregs.es.selector = regs.segments.es as u16; 2299 sregs.fs.selector = regs.segments.fs as u16; 2300 sregs.gs.selector = regs.segments.gs as u16; 2301 2302 self.set_sregs(cpu_id as u8, &sregs) 2303 .map_err(DebuggableError::WriteRegs)?; 2304 2305 // TODO: Add other registers 2306 2307 Ok(()) 2308 } 2309 2310 #[cfg(target_arch = "aarch64")] 2311 fn write_regs( 2312 &self, 2313 cpu_id: usize, 2314 regs: &CoreRegs, 2315 ) -> std::result::Result<(), DebuggableError> { 2316 let mut gregs = self 2317 .get_regs(cpu_id as u8) 2318 .map_err(DebuggableError::ReadRegs)?; 2319 2320 gregs.regs.regs = regs.x; 2321 gregs.regs.sp = regs.sp; 2322 gregs.regs.pc = regs.pc; 2323 2324 self.set_regs(cpu_id as u8, &gregs) 2325 .map_err(DebuggableError::WriteRegs)?; 2326 2327 Ok(()) 2328 } 2329 2330 fn read_mem( 2331 &self, 2332 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2333 cpu_id: usize, 2334 vaddr: GuestAddress, 2335 len: usize, 2336 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2337 let mut buf = vec![0; len]; 2338 let mut total_read = 0_u64; 2339 2340 while total_read < len as u64 { 2341 let gaddr = vaddr.0 + total_read; 2342 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2343 Ok(paddr) => paddr, 2344 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 2345 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2346 }; 2347 let psize = arch::PAGE_SIZE as u64; 2348 let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1))); 2349 guest_memory 2350 .memory() 2351 .read( 2352 &mut buf[total_read as usize..total_read as usize + read_len as usize], 2353 GuestAddress(paddr), 2354 ) 2355 .map_err(DebuggableError::ReadMem)?; 2356 total_read += read_len; 2357 } 2358 Ok(buf) 2359 } 2360 2361 fn write_mem( 2362 &self, 2363 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2364 cpu_id: usize, 2365 vaddr: &GuestAddress, 2366 data: &[u8], 2367 ) -> std::result::Result<(), DebuggableError> { 2368 let mut total_written = 0_u64; 2369 2370 while total_written < data.len() as u64 { 2371 let gaddr = vaddr.0 + total_written; 2372 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2373 Ok(paddr) => paddr, 2374 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 
2375 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2376 }; 2377 let psize = arch::PAGE_SIZE as u64; 2378 let write_len = std::cmp::min( 2379 data.len() as u64 - total_written, 2380 psize - (paddr & (psize - 1)), 2381 ); 2382 guest_memory 2383 .memory() 2384 .write( 2385 &data[total_written as usize..total_written as usize + write_len as usize], 2386 GuestAddress(paddr), 2387 ) 2388 .map_err(DebuggableError::WriteMem)?; 2389 total_written += write_len; 2390 } 2391 Ok(()) 2392 } 2393 2394 fn active_vcpus(&self) -> usize { 2395 self.present_vcpus() as usize 2396 } 2397 } 2398 2399 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2400 impl Elf64Writable for CpuManager {} 2401 2402 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2403 impl CpuElf64Writable for CpuManager { 2404 fn cpu_write_elf64_note( 2405 &mut self, 2406 dump_state: &DumpState, 2407 ) -> std::result::Result<(), GuestDebuggableError> { 2408 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2409 for vcpu in &self.vcpus { 2410 let note_size = self.get_note_size(NoteDescType::Elf, 1); 2411 let mut pos: usize = 0; 2412 let mut buf = vec![0; note_size as usize]; 2413 let descsz = size_of::<X86_64ElfPrStatus>(); 2414 let vcpu_id = vcpu.lock().unwrap().id; 2415 2416 let note = Elf64_Nhdr { 2417 n_namesz: COREDUMP_NAME_SIZE, 2418 n_descsz: descsz as u32, 2419 n_type: NT_PRSTATUS, 2420 }; 2421 2422 let bytes: &[u8] = note.as_slice(); 2423 buf.splice(0.., bytes.to_vec()); 2424 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2425 buf.resize(pos + 4, 0); 2426 buf.splice(pos.., "CORE".to_string().into_bytes()); 2427 2428 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2429 buf.resize(pos + 32 + 4, 0); 2430 let pid = vcpu_id as u64; 2431 let bytes: &[u8] = pid.as_slice(); 2432 buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */ 2433 2434 pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>(); 2435 2436 let orig_rax: u64 = 0; 2437 let gregs = self.vcpus[usize::from(vcpu_id)] 2438 .lock() 2439 .unwrap() 2440 .vcpu 2441 .get_regs() 2442 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2443 2444 let regs1 = [ 2445 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11, 2446 gregs.r10, 2447 ]; 2448 let regs2 = [ 2449 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax, 2450 ]; 2451 2452 let sregs = self.vcpus[usize::from(vcpu_id)] 2453 .lock() 2454 .unwrap() 2455 .vcpu 2456 .get_sregs() 2457 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2458 2459 debug!( 2460 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}", 2461 gregs.rip, 2462 gregs.rsp, 2463 sregs.gs.base, 2464 sregs.cs.selector, 2465 sregs.ss.selector, 2466 sregs.ds.selector, 2467 ); 2468 2469 let regs = X86_64UserRegs { 2470 regs1, 2471 regs2, 2472 rip: gregs.rip, 2473 cs: sregs.cs.selector as u64, 2474 eflags: gregs.rflags, 2475 rsp: gregs.rsp, 2476 ss: sregs.ss.selector as u64, 2477 fs_base: sregs.fs.base, 2478 gs_base: sregs.gs.base, 2479 ds: sregs.ds.selector as u64, 2480 es: sregs.es.selector as u64, 2481 fs: sregs.fs.selector as u64, 2482 gs: sregs.gs.selector as u64, 2483 }; 2484 2485 // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) }; 2486 let bytes: &[u8] = regs.as_slice(); 2487 buf.resize(note_size as usize, 0); 2488 buf.splice(pos.., bytes.to_vec()); 2489 buf.resize(note_size as usize, 0); 2490 2491 coredump_file 2492 .write(&buf) 2493 .map_err(GuestDebuggableError::CoredumpFile)?; 2494 } 2495 2496
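// At this point one NT_PRSTATUS ("CORE") note carrying an X86_64UserRegs payload has
// been written per vCPU; the VMM-specific ("QEMU") notes are emitted separately by
// cpu_write_vmm_note() below.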
Ok(()) 2497 } 2498 2499 fn cpu_write_vmm_note( 2500 &mut self, 2501 dump_state: &DumpState, 2502 ) -> std::result::Result<(), GuestDebuggableError> { 2503 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2504 for vcpu in &self.vcpus { 2505 let note_size = self.get_note_size(NoteDescType::Vmm, 1); 2506 let mut pos: usize = 0; 2507 let mut buf = vec![0; note_size as usize]; 2508 let descsz = size_of::<DumpCpusState>(); 2509 let vcpu_id = vcpu.lock().unwrap().id; 2510 2511 let note = Elf64_Nhdr { 2512 n_namesz: COREDUMP_NAME_SIZE, 2513 n_descsz: descsz as u32, 2514 n_type: 0, 2515 }; 2516 2517 let bytes: &[u8] = note.as_slice(); 2518 buf.splice(0.., bytes.to_vec()); 2519 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2520 2521 buf.resize(pos + 4, 0); 2522 buf.splice(pos.., "QEMU".to_string().into_bytes()); 2523 2524 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2525 2526 let gregs = self.vcpus[usize::from(vcpu_id)] 2527 .lock() 2528 .unwrap() 2529 .vcpu 2530 .get_regs() 2531 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2532 2533 let regs1 = [ 2534 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp, 2535 gregs.rbp, 2536 ]; 2537 2538 let regs2 = [ 2539 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, 2540 gregs.r15, 2541 ]; 2542 2543 let sregs = self.vcpus[usize::from(vcpu_id)] 2544 .lock() 2545 .unwrap() 2546 .vcpu 2547 .get_sregs() 2548 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2549 2550 let mut msrs = vec![MsrEntry { 2551 index: msr_index::MSR_KERNEL_GS_BASE, 2552 ..Default::default() 2553 }]; 2554 2555 self.vcpus[vcpu_id as usize] 2556 .lock() 2557 .unwrap() 2558 .vcpu 2559 .get_msrs(&mut msrs) 2560 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?; 2561 let kernel_gs_base = msrs[0].data; 2562 2563 let cs = CpuSegment::new(sregs.cs); 2564 let ds = CpuSegment::new(sregs.ds); 2565 let es = CpuSegment::new(sregs.es); 2566 let fs = CpuSegment::new(sregs.fs); 2567 let gs = CpuSegment::new(sregs.gs); 2568 let ss = CpuSegment::new(sregs.ss); 2569 let ldt = CpuSegment::new(sregs.ldt); 2570 let tr = CpuSegment::new(sregs.tr); 2571 let gdt = CpuSegment::new_from_table(sregs.gdt); 2572 let idt = CpuSegment::new_from_table(sregs.idt); 2573 let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4]; 2574 let regs = DumpCpusState { 2575 version: 1, 2576 size: size_of::<DumpCpusState>() as u32, 2577 regs1, 2578 regs2, 2579 rip: gregs.rip, 2580 rflags: gregs.rflags, 2581 cs, 2582 ds, 2583 es, 2584 fs, 2585 gs, 2586 ss, 2587 ldt, 2588 tr, 2589 gdt, 2590 idt, 2591 cr, 2592 kernel_gs_base, 2593 }; 2594 2595 let bytes: &[u8] = regs.as_slice(); 2596 buf.resize(note_size as usize, 0); 2597 buf.splice(pos.., bytes.to_vec()); 2598 buf.resize(note_size as usize, 0); 2599 2600 coredump_file 2601 .write(&buf) 2602 .map_err(GuestDebuggableError::CoredumpFile)?; 2603 } 2604 2605 Ok(()) 2606 } 2607 } 2608 2609 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2610 #[cfg(test)] 2611 mod tests { 2612 use arch::x86_64::interrupts::*; 2613 use arch::x86_64::regs::*; 2614 use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters}; 2615 2616 #[test] 2617 fn test_setlint() { 2618 let hv = hypervisor::new().unwrap(); 2619 let vm = hv.create_vm().expect("new VM fd creation failed"); 2620 assert!(hv.check_required_extensions().is_ok()); 2621 // Calling get_lapic will fail if there is no irqchip before hand. 
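// Create the in-kernel irqchip first so that the LAPIC reads below succeed.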
2622 assert!(vm.create_irq_chip().is_ok()); 2623 let vcpu = vm.create_vcpu(0, None).unwrap(); 2624 let klapic_before: LapicState = vcpu.get_lapic().unwrap(); 2625 2626 // Compute the value that is expected to represent LVT0 and LVT1. 2627 let lint0 = klapic_before.get_klapic_reg(APIC_LVT0); 2628 let lint1 = klapic_before.get_klapic_reg(APIC_LVT1); 2629 let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT); 2630 let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI); 2631 2632 set_lint(&vcpu).unwrap(); 2633 2634 // Compute the value that represents LVT0 and LVT1 after set_lint. 2635 let klapic_actual: LapicState = vcpu.get_lapic().unwrap(); 2636 let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0); 2637 let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1); 2638 assert_eq!(lint0_mode_expected, lint0_mode_actual); 2639 assert_eq!(lint1_mode_expected, lint1_mode_actual); 2640 } 2641 2642 #[test] 2643 fn test_setup_fpu() { 2644 let hv = hypervisor::new().unwrap(); 2645 let vm = hv.create_vm().expect("new VM fd creation failed"); 2646 let vcpu = vm.create_vcpu(0, None).unwrap(); 2647 setup_fpu(&vcpu).unwrap(); 2648 2649 let expected_fpu: FpuState = FpuState { 2650 fcw: 0x37f, 2651 mxcsr: 0x1f80, 2652 ..Default::default() 2653 }; 2654 let actual_fpu: FpuState = vcpu.get_fpu().unwrap(); 2655 // TODO: auto-generate kvm related structures with PartialEq on. 2656 assert_eq!(expected_fpu.fcw, actual_fpu.fcw); 2657 // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything. 2658 // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c. 2659 // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should 2660 // remove it altogether. 2661 // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr); 2662 } 2663 2664 #[test] 2665 fn test_setup_msrs() { 2666 use hypervisor::arch::x86::{msr_index, MsrEntry}; 2667 2668 let hv = hypervisor::new().unwrap(); 2669 let vm = hv.create_vm().expect("new VM fd creation failed"); 2670 let vcpu = vm.create_vcpu(0, None).unwrap(); 2671 setup_msrs(&vcpu).unwrap(); 2672 2673 // This test will check against the last MSR entry configured (the tenth one). 2674 // See create_msr_entries for details. 2675 let mut msrs = vec![MsrEntry { 2676 index: msr_index::MSR_IA32_MISC_ENABLE, 2677 ..Default::default() 2678 }]; 2679 2680 // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read one 2681 // in this test scenario. 2682 let read_msrs = vcpu.get_msrs(&mut msrs).unwrap(); 2683 assert_eq!(read_msrs, 1); 2684 2685 // Official entries that were set up when we called setup_msrs. We need to assert that the 2686 // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we 2687 // expect.
2688 let entry_vec = vcpu.boot_msr_entries(); 2689 assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]); 2690 } 2691 2692 #[test] 2693 fn test_setup_regs() { 2694 let hv = hypervisor::new().unwrap(); 2695 let vm = hv.create_vm().expect("new VM fd creation failed"); 2696 let vcpu = vm.create_vcpu(0, None).unwrap(); 2697 2698 let expected_regs: StandardRegisters = StandardRegisters { 2699 rflags: 0x0000000000000002u64, 2700 rbx: arch::layout::PVH_INFO_START.0, 2701 rip: 1, 2702 ..Default::default() 2703 }; 2704 2705 setup_regs(&vcpu, expected_regs.rip).unwrap(); 2706 2707 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2708 assert_eq!(actual_regs, expected_regs); 2709 } 2710 } 2711 2712 #[cfg(target_arch = "aarch64")] 2713 #[cfg(test)] 2714 mod tests { 2715 use arch::{aarch64::regs, layout}; 2716 use hypervisor::kvm::aarch64::is_system_register; 2717 use hypervisor::kvm::kvm_bindings::{ 2718 kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, 2719 KVM_REG_ARM_CORE, KVM_REG_SIZE_U64, 2720 }; 2721 use hypervisor::{arm64_core_reg_id, offset_of}; 2722 use std::mem; 2723 2724 #[test] 2725 fn test_setup_regs() { 2726 let hv = hypervisor::new().unwrap(); 2727 let vm = hv.create_vm().unwrap(); 2728 let vcpu = vm.create_vcpu(0, None).unwrap(); 2729 2730 let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0); 2731 // Must fail when vcpu is not initialized yet. 2732 assert!(res.is_err()); 2733 2734 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2735 vm.get_preferred_target(&mut kvi).unwrap(); 2736 vcpu.vcpu_init(&kvi).unwrap(); 2737 2738 assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok()); 2739 } 2740 2741 #[test] 2742 fn test_read_mpidr() { 2743 let hv = hypervisor::new().unwrap(); 2744 let vm = hv.create_vm().unwrap(); 2745 let vcpu = vm.create_vcpu(0, None).unwrap(); 2746 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2747 vm.get_preferred_target(&mut kvi).unwrap(); 2748 2749 // Must fail when vcpu is not initialized yet. 2750 assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err()); 2751 2752 vcpu.vcpu_init(&kvi).unwrap(); 2753 assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000); 2754 } 2755 2756 #[test] 2757 fn test_is_system_register() { 2758 let offset = offset_of!(user_pt_regs, pc); 2759 let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset); 2760 assert!(!is_system_register(regid)); 2761 let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64; 2762 assert!(is_system_register(regid)); 2763 } 2764 2765 #[test] 2766 fn test_save_restore_core_regs() { 2767 let hv = hypervisor::new().unwrap(); 2768 let vm = hv.create_vm().unwrap(); 2769 let vcpu = vm.create_vcpu(0, None).unwrap(); 2770 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2771 vm.get_preferred_target(&mut kvi).unwrap(); 2772 2773 // Must fail when vcpu is not initialized yet. 
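// (On KVM, core-register accesses on an uninitialised vCPU fail with ENOEXEC,
// "Exec format error", which is exactly what the error strings below check for.)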
2774 let res = vcpu.get_regs(); 2775 assert!(res.is_err()); 2776 assert_eq!( 2777 format!("{}", res.unwrap_err()), 2778 "Failed to get core register: Exec format error (os error 8)" 2779 ); 2780 2781 let mut state = kvm_regs::default(); 2782 let res = vcpu.set_regs(&state); 2783 assert!(res.is_err()); 2784 assert_eq!( 2785 format!("{}", res.unwrap_err()), 2786 "Failed to set core register: Exec format error (os error 8)" 2787 ); 2788 2789 vcpu.vcpu_init(&kvi).unwrap(); 2790 let res = vcpu.get_regs(); 2791 assert!(res.is_ok()); 2792 state = res.unwrap(); 2793 assert_eq!(state.regs.pstate, 0x3C5); 2794 2795 assert!(vcpu.set_regs(&state).is_ok()); 2796 } 2797 2798 #[test] 2799 fn test_get_set_mpstate() { 2800 let hv = hypervisor::new().unwrap(); 2801 let vm = hv.create_vm().unwrap(); 2802 let vcpu = vm.create_vcpu(0, None).unwrap(); 2803 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2804 vm.get_preferred_target(&mut kvi).unwrap(); 2805 2806 let res = vcpu.get_mp_state(); 2807 assert!(res.is_ok()); 2808 assert!(vcpu.set_mp_state(res.unwrap()).is_ok()); 2809 } 2810 } 2811