1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::CpusConfig; 15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 16 use crate::coredump::{ 17 CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable, 18 GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE, 19 NT_PRSTATUS, 20 }; 21 #[cfg(feature = "guest_debug")] 22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError}; 23 #[cfg(target_arch = "x86_64")] 24 use crate::memory_manager::MemoryManager; 25 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 26 #[cfg(target_arch = "x86_64")] 27 use crate::vm::physical_bits; 28 use crate::GuestMemoryMmap; 29 use crate::CPU_MANAGER_SNAPSHOT_ID; 30 use acpi_tables::{aml, sdt::Sdt, Aml}; 31 use anyhow::anyhow; 32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 33 use arch::aarch64::regs; 34 use arch::EntryPoint; 35 use arch::NumaNodes; 36 #[cfg(target_arch = "aarch64")] 37 use devices::gic::Gic; 38 use devices::interrupt_controller::InterruptController; 39 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 40 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 41 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 42 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs}; 43 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 44 use hypervisor::aarch64::StandardRegisters; 45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 46 use hypervisor::arch::x86::msr_index; 47 #[cfg(target_arch = "x86_64")] 48 use hypervisor::arch::x86::CpuIdEntry; 49 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 50 use hypervisor::arch::x86::MsrEntry; 51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 52 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters}; 53 #[cfg(target_arch = "aarch64")] 54 use hypervisor::kvm::kvm_bindings; 55 #[cfg(all(target_arch = "aarch64", feature = "kvm"))] 56 use hypervisor::kvm::kvm_ioctls::Cap; 57 #[cfg(feature = "tdx")] 58 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus}; 59 use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps}; 60 use libc::{c_void, siginfo_t}; 61 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 62 use linux_loader::elf::Elf64_Nhdr; 63 use seccompiler::{apply_filter, SeccompAction}; 64 use std::collections::BTreeMap; 65 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 66 use std::io::Write; 67 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 68 use std::mem::size_of; 69 use std::os::unix::thread::JoinHandleExt; 70 use std::sync::atomic::{AtomicBool, Ordering}; 71 use std::sync::{Arc, Barrier, Mutex}; 72 use std::{cmp, io, result, thread}; 73 use thiserror::Error; 74 use tracer::trace_scoped; 75 use vm_device::BusDevice; 76 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 77 use vm_memory::ByteValued; 78 #[cfg(feature = "guest_debug")] 79 use vm_memory::{Bytes, GuestAddressSpace}; 80 use vm_memory::{GuestAddress, GuestMemoryAtomic}; 81 use vm_migration::{ 82 snapshot_from_id, 
Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, 83 Transportable, 84 }; 85 use vmm_sys_util::eventfd::EventFd; 86 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN}; 87 use zerocopy::AsBytes; 88 89 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 90 /// Extract the specified bits of a 64-bit integer. 91 /// For example, to extract 2 bits from offset 1 (zero based) of `6u64`, 92 /// the following expression should return 3 (`0b11`): 93 /// `extract_bits_64!(0b0000_0110u64, 1, 2)` 94 /// 95 macro_rules! extract_bits_64 { 96 ($value: tt, $offset: tt, $length: tt) => { 97 ($value >> $offset) & (!0u64 >> (64 - $length)) 98 }; 99 } 100 101 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 102 macro_rules! extract_bits_64_without_offset { 103 ($value: tt, $length: tt) => { 104 $value & (!0u64 >> (64 - $length)) 105 }; 106 } 107 108 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc; 109 110 #[derive(Debug, Error)] 111 pub enum Error { 112 #[error("Error creating vCPU: {0}")] 113 VcpuCreate(#[source] anyhow::Error), 114 115 #[error("Error running vCPU: {0}")] 116 VcpuRun(#[source] anyhow::Error), 117 118 #[error("Error spawning vCPU thread: {0}")] 119 VcpuSpawn(#[source] io::Error), 120 121 #[error("Error generating common CPUID: {0}")] 122 CommonCpuId(#[source] arch::Error), 123 124 #[error("Error configuring vCPU: {0}")] 125 VcpuConfiguration(#[source] arch::Error), 126 127 #[cfg(target_arch = "aarch64")] 128 #[error("Error fetching preferred target: {0}")] 129 VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError), 130 131 #[cfg(target_arch = "aarch64")] 132 #[error("Error initialising vCPU: {0}")] 133 VcpuArmInit(#[source] hypervisor::HypervisorCpuError), 134 135 #[error("Failed to join on vCPU threads: {0:?}")] 136 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 137 138 #[error("Error adding CpuManager to MMIO bus: {0}")] 139 BusError(#[source] vm_device::BusError), 140 141 #[error("Requested vCPUs exceed maximum")] 142 DesiredVCpuCountExceedsMax, 143 144 #[error("Cannot create seccomp filter: {0}")] 145 CreateSeccompFilter(#[source] seccompiler::Error), 146 147 #[error("Cannot apply seccomp filter: {0}")] 148 ApplySeccompFilter(#[source] seccompiler::Error), 149 150 #[error("Error starting vCPU after restore: {0}")] 151 StartRestoreVcpu(#[source] anyhow::Error), 152 153 #[error("Unexpected VmExit")] 154 UnexpectedVmExit, 155 156 #[error("Failed to allocate MMIO address for CpuManager")] 157 AllocateMmmioAddress, 158 159 #[cfg(feature = "tdx")] 160 #[error("Error initializing TDX: {0}")] 161 InitializeTdx(#[source] hypervisor::HypervisorCpuError), 162 163 #[cfg(target_arch = "aarch64")] 164 #[error("Error initializing PMU: {0}")] 165 InitPmu(#[source] hypervisor::HypervisorCpuError), 166 167 #[cfg(feature = "guest_debug")] 168 #[error("Error during CPU debug: {0}")] 169 CpuDebug(#[source] hypervisor::HypervisorCpuError), 170 171 #[cfg(feature = "guest_debug")] 172 #[error("Error translating virtual address: {0}")] 173 TranslateVirtualAddress(#[source] anyhow::Error), 174 175 #[cfg(target_arch = "x86_64")] 176 #[error("Error setting up AMX: {0}")] 177 AmxEnable(#[source] anyhow::Error), 178 179 #[error("Maximum number of vCPUs exceeds host limit")] 180 MaximumVcpusExceeded, 181 } 182 pub type Result<T> = result::Result<T, Error>; 183 184 #[cfg(target_arch = "x86_64")] 185 #[allow(dead_code)] 186 #[repr(packed)] 187 #[derive(AsBytes)] 188 struct LocalApic { 189 pub r#type: u8, 190 pub length: u8, 191 pub
processor_id: u8, 192 pub apic_id: u8, 193 pub flags: u32, 194 } 195 196 #[allow(dead_code)] 197 #[repr(packed)] 198 #[derive(Default, AsBytes)] 199 struct Ioapic { 200 pub r#type: u8, 201 pub length: u8, 202 pub ioapic_id: u8, 203 _reserved: u8, 204 pub apic_address: u32, 205 pub gsi_base: u32, 206 } 207 208 #[cfg(target_arch = "aarch64")] 209 #[allow(dead_code)] 210 #[repr(packed)] 211 #[derive(AsBytes)] 212 struct GicC { 213 pub r#type: u8, 214 pub length: u8, 215 pub reserved0: u16, 216 pub cpu_interface_number: u32, 217 pub uid: u32, 218 pub flags: u32, 219 pub parking_version: u32, 220 pub performance_interrupt: u32, 221 pub parked_address: u64, 222 pub base_address: u64, 223 pub gicv_base_address: u64, 224 pub gich_base_address: u64, 225 pub vgic_interrupt: u32, 226 pub gicr_base_address: u64, 227 pub mpidr: u64, 228 pub proc_power_effi_class: u8, 229 pub reserved1: u8, 230 pub spe_overflow_interrupt: u16, 231 } 232 233 #[cfg(target_arch = "aarch64")] 234 #[allow(dead_code)] 235 #[repr(packed)] 236 #[derive(AsBytes)] 237 struct GicD { 238 pub r#type: u8, 239 pub length: u8, 240 pub reserved0: u16, 241 pub gic_id: u32, 242 pub base_address: u64, 243 pub global_irq_base: u32, 244 pub version: u8, 245 pub reserved1: [u8; 3], 246 } 247 248 #[cfg(target_arch = "aarch64")] 249 #[allow(dead_code)] 250 #[repr(packed)] 251 #[derive(AsBytes)] 252 struct GicR { 253 pub r#type: u8, 254 pub length: u8, 255 pub reserved: u16, 256 pub base_address: u64, 257 pub range_length: u32, 258 } 259 260 #[cfg(target_arch = "aarch64")] 261 #[allow(dead_code)] 262 #[repr(packed)] 263 #[derive(AsBytes)] 264 struct GicIts { 265 pub r#type: u8, 266 pub length: u8, 267 pub reserved0: u16, 268 pub translation_id: u32, 269 pub base_address: u64, 270 pub reserved1: u32, 271 } 272 273 #[cfg(target_arch = "aarch64")] 274 #[allow(dead_code)] 275 #[repr(packed)] 276 #[derive(AsBytes)] 277 struct ProcessorHierarchyNode { 278 pub r#type: u8, 279 pub length: u8, 280 pub reserved: u16, 281 pub flags: u32, 282 pub parent: u32, 283 pub acpi_processor_id: u32, 284 pub num_private_resources: u32, 285 } 286 287 #[allow(dead_code)] 288 #[repr(packed)] 289 #[derive(Default, AsBytes)] 290 struct InterruptSourceOverride { 291 pub r#type: u8, 292 pub length: u8, 293 pub bus: u8, 294 pub source: u8, 295 pub gsi: u32, 296 pub flags: u16, 297 } 298 299 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 300 macro_rules! round_up { 301 ($n:expr,$d:expr) => { 302 (($n / ($d + 1)) + 1) * $d 303 }; 304 } 305 306 /// A wrapper around creating and using a kvm-based VCPU. 307 pub struct Vcpu { 308 // The hypervisor abstracted CPU. 309 vcpu: Arc<dyn hypervisor::Vcpu>, 310 id: u8, 311 #[cfg(target_arch = "aarch64")] 312 mpidr: u64, 313 saved_state: Option<CpuState>, 314 } 315 316 impl Vcpu { 317 /// Constructs a new VCPU for `vm`. 318 /// 319 /// # Arguments 320 /// 321 /// * `id` - Represents the CPU number between [0, max vcpus). 322 /// * `vm` - The virtual machine this vcpu will get attached to. 323 /// * `vm_ops` - Optional object for exit handling. 324 pub fn new( 325 id: u8, 326 vm: &Arc<dyn hypervisor::Vm>, 327 vm_ops: Option<Arc<dyn VmOps>>, 328 ) -> Result<Self> { 329 let vcpu = vm 330 .create_vcpu(id, vm_ops) 331 .map_err(|e| Error::VcpuCreate(e.into()))?; 332 // Initially the cpuid per vCPU is the one supported by this VM. 333 Ok(Vcpu { 334 vcpu, 335 id, 336 #[cfg(target_arch = "aarch64")] 337 mpidr: 0, 338 saved_state: None, 339 }) 340 } 341 342 /// Configures a vcpu and should be called once per vcpu when created. 
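/// On aarch64 this also performs the KVM-level init (see `init` below) and
/// records the vCPU's MPIDR; on x86_64 it applies the VM-wide CPUID entries
/// and the KVM Hyper-V setting.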
343 /// 344 /// # Arguments 345 /// 346 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 347 /// * `guest_memory` - Guest memory. 348 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 349 pub fn configure( 350 &mut self, 351 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 352 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 353 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 354 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 355 ) -> Result<()> { 356 #[cfg(target_arch = "aarch64")] 357 { 358 self.init(vm)?; 359 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup) 360 .map_err(Error::VcpuConfiguration)?; 361 } 362 info!("Configuring vCPU: cpu_id = {}", self.id); 363 #[cfg(target_arch = "x86_64")] 364 arch::configure_vcpu(&self.vcpu, self.id, boot_setup, cpuid, kvm_hyperv) 365 .map_err(Error::VcpuConfiguration)?; 366 367 Ok(()) 368 } 369 370 /// Gets the MPIDR register value. 371 #[cfg(target_arch = "aarch64")] 372 pub fn get_mpidr(&self) -> u64 { 373 self.mpidr 374 } 375 376 /// Gets the saved vCPU state. 377 #[cfg(target_arch = "aarch64")] 378 pub fn get_saved_state(&self) -> Option<CpuState> { 379 self.saved_state.clone() 380 } 381 382 /// Initializes an aarch64 specific vcpu for booting Linux. 383 #[cfg(target_arch = "aarch64")] 384 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 385 let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); 386 387 // This reads back the kernel's preferred target type. 388 vm.get_preferred_target(&mut kvi) 389 .map_err(Error::VcpuArmPreferredTarget)?; 390 // We already checked that the capability is supported. 391 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; 392 if vm 393 .as_any() 394 .downcast_ref::<hypervisor::kvm::KvmVm>() 395 .unwrap() 396 .check_extension(Cap::ArmPmuV3) 397 { 398 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3; 399 } 400 // Non-boot cpus are powered off initially. 401 if self.id > 0 { 402 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; 403 } 404 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit) 405 } 406 407 /// Runs the VCPU until it exits, returning the reason. 408 /// 409 /// Note that the state of the VCPU and associated VM must be setup first for this to do 410 /// anything useful. 
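/// A rough sketch (simplified, error handling elided) of how the vCPU thread
/// spawned in `CpuManager::start_vcpu` reacts to the returned exit reason:
///
/// ```ignore
/// match vcpu.run() {
///     Ok(VmExit::Reset) => { /* notify the reset EventFd and stop */ }
///     Ok(VmExit::Shutdown) => { /* notify the exit EventFd and stop */ }
///     Ok(VmExit::Ignore) | Ok(VmExit::Hyperv) => { /* keep running */ }
///     _ => { /* unexpected exit or error: signal the exit EventFd and stop */ }
/// }
/// ```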
411 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 412 self.vcpu.run() 413 } 414 } 415 416 impl Pausable for Vcpu {} 417 impl Snapshottable for Vcpu { 418 fn id(&self) -> String { 419 self.id.to_string() 420 } 421 422 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 423 let saved_state = self 424 .vcpu 425 .state() 426 .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?; 427 428 self.saved_state = Some(saved_state.clone()); 429 430 Ok(Snapshot::from_data(SnapshotData::new_from_state( 431 &saved_state, 432 )?)) 433 } 434 } 435 436 pub struct CpuManager { 437 hypervisor_type: HypervisorType, 438 config: CpusConfig, 439 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 440 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 441 #[cfg(target_arch = "x86_64")] 442 cpuid: Vec<CpuIdEntry>, 443 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 444 vm: Arc<dyn hypervisor::Vm>, 445 vcpus_kill_signalled: Arc<AtomicBool>, 446 vcpus_pause_signalled: Arc<AtomicBool>, 447 exit_evt: EventFd, 448 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 449 reset_evt: EventFd, 450 #[cfg(feature = "guest_debug")] 451 vm_debug_evt: EventFd, 452 vcpu_states: Vec<VcpuState>, 453 selected_cpu: u8, 454 vcpus: Vec<Arc<Mutex<Vcpu>>>, 455 seccomp_action: SeccompAction, 456 vm_ops: Arc<dyn VmOps>, 457 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 458 acpi_address: Option<GuestAddress>, 459 proximity_domain_per_cpu: BTreeMap<u8, u32>, 460 affinity: BTreeMap<u8, Vec<u8>>, 461 dynamic: bool, 462 } 463 464 const CPU_ENABLE_FLAG: usize = 0; 465 const CPU_INSERTING_FLAG: usize = 1; 466 const CPU_REMOVING_FLAG: usize = 2; 467 const CPU_EJECT_FLAG: usize = 3; 468 469 const CPU_STATUS_OFFSET: u64 = 4; 470 const CPU_SELECTION_OFFSET: u64 = 0; 471 472 impl BusDevice for CpuManager { 473 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 474 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
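// Register layout of this device (see the CPU_* constants above and the
// matching CPEN/CINS/CRMV/CEJ0 ACPI fields further down): offset 0 holds the
// currently selected vCPU id, and offset 4 is a status/control byte where
// bit 0 means "enabled", bit 1 "inserting", bit 2 "removing" and bit 3
// triggers ejection.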
475 data.fill(0); 476 477 match offset { 478 CPU_SELECTION_OFFSET => { 479 data[0] = self.selected_cpu; 480 } 481 CPU_STATUS_OFFSET => { 482 if self.selected_cpu < self.max_vcpus() { 483 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 484 if state.active() { 485 data[0] |= 1 << CPU_ENABLE_FLAG; 486 } 487 if state.inserting { 488 data[0] |= 1 << CPU_INSERTING_FLAG; 489 } 490 if state.removing { 491 data[0] |= 1 << CPU_REMOVING_FLAG; 492 } 493 } else { 494 warn!("Out of range vCPU id: {}", self.selected_cpu); 495 } 496 } 497 _ => { 498 warn!( 499 "Unexpected offset for accessing CPU manager device: {:#}", 500 offset 501 ); 502 } 503 } 504 } 505 506 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 507 match offset { 508 CPU_SELECTION_OFFSET => { 509 self.selected_cpu = data[0]; 510 } 511 CPU_STATUS_OFFSET => { 512 if self.selected_cpu < self.max_vcpus() { 513 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 514 // The ACPI code writes back a 1 to acknowledge the insertion 515 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 516 && state.inserting 517 { 518 state.inserting = false; 519 } 520 // Ditto for removal 521 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 522 && state.removing 523 { 524 state.removing = false; 525 } 526 // Trigger removal of vCPU 527 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 528 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 529 error!("Error removing vCPU: {:?}", e); 530 } 531 } 532 } else { 533 warn!("Out of range vCPU id: {}", self.selected_cpu); 534 } 535 } 536 _ => { 537 warn!( 538 "Unexpected offset for accessing CPU manager device: {:#}", 539 offset 540 ); 541 } 542 } 543 None 544 } 545 } 546 547 #[derive(Default)] 548 struct VcpuState { 549 inserting: bool, 550 removing: bool, 551 handle: Option<thread::JoinHandle<()>>, 552 kill: Arc<AtomicBool>, 553 vcpu_run_interrupted: Arc<AtomicBool>, 554 } 555 556 impl VcpuState { 557 fn active(&self) -> bool { 558 self.handle.is_some() 559 } 560 561 fn signal_thread(&self) { 562 if let Some(handle) = self.handle.as_ref() { 563 loop { 564 // SAFETY: FFI call with correct arguments 565 unsafe { 566 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 567 } 568 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 569 break; 570 } else { 571 // This is more effective than thread::yield_now() at 572 // avoiding a priority inversion with the vCPU thread 573 thread::sleep(std::time::Duration::from_millis(1)); 574 } 575 } 576 } 577 } 578 579 fn join_thread(&mut self) -> Result<()> { 580 if let Some(handle) = self.handle.take() { 581 handle.join().map_err(Error::ThreadCleanup)? 
582 } 583 584 Ok(()) 585 } 586 587 fn unpark_thread(&self) { 588 if let Some(handle) = self.handle.as_ref() { 589 handle.thread().unpark() 590 } 591 } 592 } 593 594 impl CpuManager { 595 #[allow(unused_variables)] 596 #[allow(clippy::too_many_arguments)] 597 pub fn new( 598 config: &CpusConfig, 599 vm: Arc<dyn hypervisor::Vm>, 600 exit_evt: EventFd, 601 reset_evt: EventFd, 602 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 603 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 604 seccomp_action: SeccompAction, 605 vm_ops: Arc<dyn VmOps>, 606 #[cfg(feature = "tdx")] tdx_enabled: bool, 607 numa_nodes: &NumaNodes, 608 ) -> Result<Arc<Mutex<CpuManager>>> { 609 if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() { 610 return Err(Error::MaximumVcpusExceeded); 611 } 612 613 let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); 614 vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); 615 let hypervisor_type = hypervisor.hypervisor_type(); 616 617 #[cfg(target_arch = "x86_64")] 618 if config.features.amx { 619 const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; 620 const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; 621 const XFEATURE_XTILEDATA: usize = 18; 622 const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; 623 624 // SAFETY: the syscall is only modifying kernel internal 625 // data structures that the kernel is itself expected to safeguard. 626 let amx_tile = unsafe { 627 libc::syscall( 628 libc::SYS_arch_prctl, 629 ARCH_REQ_XCOMP_GUEST_PERM, 630 XFEATURE_XTILEDATA, 631 ) 632 }; 633 634 if amx_tile != 0 { 635 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 636 } else { 637 let mask: usize = 0; 638 // SAFETY: the mask being modified (not marked mutable as it is 639 // only modified inside the unsafe block, which is permitted) isn't in use elsewhere.
640 let result = unsafe { 641 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 642 }; 643 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 644 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 645 } 646 } 647 } 648 649 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 650 let mut cpu_list = Vec::new(); 651 for (proximity_domain, numa_node) in numa_nodes.iter() { 652 for cpu in numa_node.cpus.iter() { 653 cpu_list.push((*cpu, *proximity_domain)) 654 } 655 } 656 cpu_list 657 } 658 .into_iter() 659 .collect(); 660 661 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 662 cpu_affinity 663 .iter() 664 .map(|a| (a.vcpu, a.host_cpus.clone())) 665 .collect() 666 } else { 667 BTreeMap::new() 668 }; 669 670 #[cfg(feature = "tdx")] 671 let dynamic = !tdx_enabled; 672 #[cfg(not(feature = "tdx"))] 673 let dynamic = true; 674 675 Ok(Arc::new(Mutex::new(CpuManager { 676 hypervisor_type, 677 config: config.clone(), 678 interrupt_controller: None, 679 #[cfg(target_arch = "x86_64")] 680 cpuid: Vec::new(), 681 vm, 682 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 683 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 684 vcpu_states, 685 exit_evt, 686 reset_evt, 687 #[cfg(feature = "guest_debug")] 688 vm_debug_evt, 689 selected_cpu: 0, 690 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 691 seccomp_action, 692 vm_ops, 693 acpi_address: None, 694 proximity_domain_per_cpu, 695 affinity, 696 dynamic, 697 }))) 698 } 699 700 #[cfg(target_arch = "x86_64")] 701 pub fn populate_cpuid( 702 &mut self, 703 memory_manager: &Arc<Mutex<MemoryManager>>, 704 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 705 #[cfg(feature = "tdx")] tdx_enabled: bool, 706 ) -> Result<()> { 707 let sgx_epc_sections = memory_manager 708 .lock() 709 .unwrap() 710 .sgx_epc_region() 711 .as_ref() 712 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 713 714 let topology = self.config.topology.clone().map_or_else( 715 || { 716 #[cfg(feature = "mshv")] 717 if matches!(hypervisor.hypervisor_type(), HypervisorType::Mshv) { 718 return Some((1, self.boot_vcpus(), 1)); 719 } 720 None 721 }, 722 |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)), 723 ); 724 725 self.cpuid = { 726 let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits); 727 arch::generate_common_cpuid( 728 hypervisor, 729 topology, 730 sgx_epc_sections, 731 phys_bits, 732 self.config.kvm_hyperv, 733 #[cfg(feature = "tdx")] 734 tdx_enabled, 735 ) 736 .map_err(Error::CommonCpuId)? 737 }; 738 739 Ok(()) 740 } 741 742 fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> { 743 info!("Creating vCPU: cpu_id = {}", cpu_id); 744 745 let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vm_ops.clone()))?; 746 747 if let Some(snapshot) = snapshot { 748 // AArch64 vCPUs should be initialized after created. 749 #[cfg(target_arch = "aarch64")] 750 vcpu.init(&self.vm)?; 751 752 let state: CpuState = snapshot.to_state().map_err(|e| { 753 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e)) 754 })?; 755 vcpu.vcpu 756 .set_state(&state) 757 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?; 758 759 vcpu.saved_state = Some(state); 760 } 761 762 let vcpu = Arc::new(Mutex::new(vcpu)); 763 764 // Adding vCPU to the CpuManager's vCPU list. 
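// Note: vCPUs are created in ascending cpu_id order (see create_vcpus
// below), so the index in this vector always matches the vCPU id.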
765 self.vcpus.push(vcpu.clone()); 766 767 Ok(vcpu) 768 } 769 770 pub fn configure_vcpu( 771 &self, 772 vcpu: Arc<Mutex<Vcpu>>, 773 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 774 ) -> Result<()> { 775 let mut vcpu = vcpu.lock().unwrap(); 776 777 #[cfg(target_arch = "x86_64")] 778 assert!(!self.cpuid.is_empty()); 779 780 #[cfg(target_arch = "x86_64")] 781 vcpu.configure(boot_setup, self.cpuid.clone(), self.config.kvm_hyperv)?; 782 783 #[cfg(target_arch = "aarch64")] 784 vcpu.configure(&self.vm, boot_setup)?; 785 786 Ok(()) 787 } 788 789 /// Only create new vCPUs if there aren't any inactive ones to reuse 790 fn create_vcpus( 791 &mut self, 792 desired_vcpus: u8, 793 snapshot: Option<Snapshot>, 794 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 795 let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![]; 796 info!( 797 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 798 desired_vcpus, 799 self.config.max_vcpus, 800 self.vcpus.len(), 801 self.present_vcpus() 802 ); 803 804 if desired_vcpus > self.config.max_vcpus { 805 return Err(Error::DesiredVCpuCountExceedsMax); 806 } 807 808 // Only create vCPUs in excess of all the allocated vCPUs. 809 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 810 vcpus.push(self.create_vcpu( 811 cpu_id, 812 // TODO: The special format of the CPU id can be removed once 813 // ready to break live upgrade. 814 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()), 815 )?); 816 } 817 818 Ok(vcpus) 819 } 820 821 #[cfg(target_arch = "aarch64")] 822 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 823 for cpu in self.vcpus.iter() { 824 let cpu = cpu.lock().unwrap(); 825 // Check if PMU attr is available, if not, log the information. 826 if cpu.vcpu.has_pmu_support() { 827 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 828 } else { 829 debug!( 830 "PMU attribute is not supported in vCPU{}, skip PMU init!", 831 cpu.id 832 ); 833 return Ok(false); 834 } 835 } 836 837 Ok(true) 838 } 839 840 pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> { 841 self.vcpus.clone() 842 } 843 844 fn start_vcpu( 845 &mut self, 846 vcpu: Arc<Mutex<Vcpu>>, 847 vcpu_id: u8, 848 vcpu_thread_barrier: Arc<Barrier>, 849 inserting: bool, 850 ) -> Result<()> { 851 let reset_evt = self.reset_evt.try_clone().unwrap(); 852 let exit_evt = self.exit_evt.try_clone().unwrap(); 853 #[cfg(feature = "kvm")] 854 let hypervisor_type = self.hypervisor_type; 855 #[cfg(feature = "guest_debug")] 856 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 857 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 858 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 859 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 860 861 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 862 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 863 .vcpu_run_interrupted 864 .clone(); 865 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 866 867 // Prepare the CPU set the current vCPU is expected to run onto. 
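// The set is built from the optional `affinity` entries of the CPU config
// using CPU_ZERO/CPU_SET and is applied with sched_setaffinity() inside the
// spawned thread below, so the pinning only affects that vCPU thread.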
868 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 869 // SAFETY: all zeros is a valid pattern 870 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 871 // SAFETY: FFI call, trivially safe 872 unsafe { libc::CPU_ZERO(&mut cpuset) }; 873 for host_cpu in host_cpus { 874 // SAFETY: FFI call, trivially safe 875 unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) }; 876 } 877 cpuset 878 }); 879 880 // Retrieve seccomp filter for vcpu thread 881 let vcpu_seccomp_filter = 882 get_seccomp_filter(&self.seccomp_action, Thread::Vcpu, self.hypervisor_type) 883 .map_err(Error::CreateSeccompFilter)?; 884 885 #[cfg(target_arch = "x86_64")] 886 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 887 888 info!("Starting vCPU: cpu_id = {}", vcpu_id); 889 890 let handle = Some( 891 thread::Builder::new() 892 .name(format!("vcpu{vcpu_id}")) 893 .spawn(move || { 894 // Schedule the thread to run on the expected CPU set 895 if let Some(cpuset) = cpuset.as_ref() { 896 // SAFETY: FFI call with correct arguments 897 let ret = unsafe { 898 libc::sched_setaffinity( 899 0, 900 std::mem::size_of::<libc::cpu_set_t>(), 901 cpuset as *const libc::cpu_set_t, 902 ) 903 }; 904 905 if ret != 0 { 906 error!( 907 "Failed scheduling the vCPU {} on the expected CPU set: {}", 908 vcpu_id, 909 io::Error::last_os_error() 910 ); 911 return; 912 } 913 } 914 915 // Apply seccomp filter for vcpu thread. 916 if !vcpu_seccomp_filter.is_empty() { 917 if let Err(e) = 918 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 919 { 920 error!("Error applying seccomp filter: {:?}", e); 921 return; 922 } 923 } 924 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 925 // This uses an async signal safe handler to kill the vcpu handles. 926 register_signal_handler(SIGRTMIN(), handle_signal) 927 .expect("Failed to register vcpu signal handler"); 928 // Block until all CPUs are ready. 929 vcpu_thread_barrier.wait(); 930 931 std::panic::catch_unwind(move || { 932 loop { 933 // If we are being told to pause, we park the thread 934 // until the pause boolean is toggled. 935 // The resume operation is responsible for toggling 936 // the boolean and unpark the thread. 937 // We enter a loop because park() could spuriously 938 // return. We will then park() again unless the 939 // pause boolean has been toggled. 940 941 // Need to use Ordering::SeqCst as we have multiple 942 // loads and stores to different atomics and we need 943 // to see them in a consistent order in all threads 944 945 if vcpu_pause_signalled.load(Ordering::SeqCst) { 946 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are 947 // completed by returning to KVM_RUN. From the kernel docs: 948 // 949 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, 950 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding 951 // operations are complete (and guest state is consistent) only after userspace 952 // has re-entered the kernel with KVM_RUN. The kernel side will first finish 953 // incomplete operations and then check for pending signals. 954 // The pending state of the operation is not preserved in state which is 955 // visible to userspace, thus userspace should ensure that the operation is 956 // completed before performing a live migration. 
Userspace can re-enter the 957 // guest with an unmasked signal pending or with the immediate_exit field set 958 // to complete pending operations without allowing any further instructions 959 // to be executed. 960 961 #[cfg(feature = "kvm")] 962 if matches!(hypervisor_type, HypervisorType::Kvm) { 963 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); 964 if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { 965 error!("Unexpected VM exit on \"immediate_exit\" run"); 966 break; 967 } 968 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); 969 } 970 971 vcpu_run_interrupted.store(true, Ordering::SeqCst); 972 while vcpu_pause_signalled.load(Ordering::SeqCst) { 973 thread::park(); 974 } 975 vcpu_run_interrupted.store(false, Ordering::SeqCst); 976 } 977 978 // We've been told to terminate 979 if vcpu_kill_signalled.load(Ordering::SeqCst) 980 || vcpu_kill.load(Ordering::SeqCst) 981 { 982 vcpu_run_interrupted.store(true, Ordering::SeqCst); 983 break; 984 } 985 986 #[cfg(feature = "tdx")] 987 let mut vcpu = vcpu.lock().unwrap(); 988 #[cfg(not(feature = "tdx"))] 989 let vcpu = vcpu.lock().unwrap(); 990 // vcpu.run() returns false on a triple-fault so trigger a reset 991 match vcpu.run() { 992 Ok(run) => match run { 993 #[cfg(feature = "kvm")] 994 VmExit::Debug => { 995 info!("VmExit::Debug"); 996 #[cfg(feature = "guest_debug")] 997 { 998 vcpu_pause_signalled.store(true, Ordering::SeqCst); 999 let raw_tid = get_raw_tid(vcpu_id as usize); 1000 vm_debug_evt.write(raw_tid as u64).unwrap(); 1001 } 1002 } 1003 #[cfg(target_arch = "x86_64")] 1004 VmExit::IoapicEoi(vector) => { 1005 if let Some(interrupt_controller) = 1006 &interrupt_controller_clone 1007 { 1008 interrupt_controller 1009 .lock() 1010 .unwrap() 1011 .end_of_interrupt(vector); 1012 } 1013 } 1014 VmExit::Ignore => {} 1015 VmExit::Hyperv => {} 1016 VmExit::Reset => { 1017 info!("VmExit::Reset"); 1018 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1019 reset_evt.write(1).unwrap(); 1020 break; 1021 } 1022 VmExit::Shutdown => { 1023 info!("VmExit::Shutdown"); 1024 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1025 exit_evt.write(1).unwrap(); 1026 break; 1027 } 1028 #[cfg(feature = "tdx")] 1029 VmExit::Tdx => { 1030 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1031 match vcpu.get_tdx_exit_details() { 1032 Ok(details) => match details { 1033 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1034 TdxExitDetails::SetupEventNotifyInterrupt => { 1035 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1036 } 1037 }, 1038 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1039 } 1040 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1041 } else { 1042 // We should never reach this code as 1043 // this means the design from the code 1044 // is wrong. 
1045 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1046 } 1047 } 1048 _ => { 1049 error!( 1050 "VCPU generated error: {:?}", 1051 Error::UnexpectedVmExit 1052 ); 1053 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1054 exit_evt.write(1).unwrap(); 1055 break; 1056 } 1057 }, 1058 1059 Err(e) => { 1060 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1061 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1062 exit_evt.write(1).unwrap(); 1063 break; 1064 } 1065 } 1066 1067 // We've been told to terminate 1068 if vcpu_kill_signalled.load(Ordering::SeqCst) 1069 || vcpu_kill.load(Ordering::SeqCst) 1070 { 1071 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1072 break; 1073 } 1074 } 1075 }) 1076 .or_else(|_| { 1077 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1078 error!("vCPU thread panicked"); 1079 panic_exit_evt.write(1) 1080 }) 1081 .ok(); 1082 }) 1083 .map_err(Error::VcpuSpawn)?, 1084 ); 1085 1086 // On hot plug, calls into this function have entry_point set to None. It is for 1087 // those hotplug CPU additions that we need to set the inserting flag. 1088 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1089 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1090 1091 Ok(()) 1092 } 1093 1094 /// Start up as many vCPU threads as needed to reach `desired_vcpus` 1095 fn activate_vcpus( 1096 &mut self, 1097 desired_vcpus: u8, 1098 inserting: bool, 1099 paused: Option<bool>, 1100 ) -> Result<()> { 1101 if desired_vcpus > self.config.max_vcpus { 1102 return Err(Error::DesiredVCpuCountExceedsMax); 1103 } 1104 1105 let vcpu_thread_barrier = Arc::new(Barrier::new( 1106 (desired_vcpus - self.present_vcpus() + 1) as usize, 1107 )); 1108 1109 if let Some(paused) = paused { 1110 self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); 1111 } 1112 1113 info!( 1114 "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", 1115 desired_vcpus, 1116 self.vcpus.len(), 1117 self.present_vcpus(), 1118 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1119 ); 1120 1121 // This reuses any inactive vCPUs as well as any that were newly created 1122 for vcpu_id in self.present_vcpus()..desired_vcpus { 1123 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1124 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1125 } 1126 1127 // Unblock all CPU threads. 1128 vcpu_thread_barrier.wait(); 1129 Ok(()) 1130 } 1131 1132 fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) { 1133 // Mark vCPUs for removal, actual removal happens on ejection 1134 for cpu_id in desired_vcpus..self.present_vcpus() { 1135 self.vcpu_states[usize::from(cpu_id)].removing = true; 1136 } 1137 } 1138 1139 fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> { 1140 info!("Removing vCPU: cpu_id = {}", cpu_id); 1141 let state = &mut self.vcpu_states[usize::from(cpu_id)]; 1142 state.kill.store(true, Ordering::SeqCst); 1143 state.signal_thread(); 1144 state.join_thread()?; 1145 state.handle = None; 1146 1147 // Once the thread has exited, clear the "kill" so that it can be reused 1148 state.kill.store(false, Ordering::SeqCst); 1149 1150 Ok(()) 1151 } 1152 1153 pub fn create_boot_vcpus( 1154 &mut self, 1155 snapshot: Option<Snapshot>, 1156 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 1157 trace_scoped!("create_boot_vcpus"); 1158 1159 self.create_vcpus(self.boot_vcpus(), snapshot) 1160 } 1161 1162 // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
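// A plausible calling sequence at boot time (a simplified sketch; the actual
// caller lives in the VM setup code outside this file):
//
//   let vcpus = cpu_manager.lock().unwrap().create_boot_vcpus(None)?;
//   for vcpu in vcpus {
//       cpu_manager.lock().unwrap().configure_vcpu(vcpu, boot_setup)?;
//   }
//   cpu_manager.lock().unwrap().start_boot_vcpus(false)?;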
1163 pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> { 1164 self.activate_vcpus(self.boot_vcpus(), false, Some(paused)) 1165 } 1166 1167 pub fn start_restored_vcpus(&mut self) -> Result<()> { 1168 self.activate_vcpus(self.vcpus.len() as u8, false, Some(true)) 1169 .map_err(|e| { 1170 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) 1171 })?; 1172 1173 Ok(()) 1174 } 1175 1176 pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> { 1177 if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal { 1178 return Ok(false); 1179 } 1180 1181 if !self.dynamic { 1182 return Ok(false); 1183 } 1184 1185 match desired_vcpus.cmp(&self.present_vcpus()) { 1186 cmp::Ordering::Greater => { 1187 let vcpus = self.create_vcpus(desired_vcpus, None)?; 1188 for vcpu in vcpus { 1189 self.configure_vcpu(vcpu, None)? 1190 } 1191 self.activate_vcpus(desired_vcpus, true, None)?; 1192 Ok(true) 1193 } 1194 cmp::Ordering::Less => { 1195 self.mark_vcpus_for_removal(desired_vcpus); 1196 Ok(true) 1197 } 1198 _ => Ok(false), 1199 } 1200 } 1201 1202 pub fn shutdown(&mut self) -> Result<()> { 1203 // Tell the vCPUs to stop themselves next time they go through the loop 1204 self.vcpus_kill_signalled.store(true, Ordering::SeqCst); 1205 1206 // Toggle the vCPUs pause boolean 1207 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 1208 1209 // Unpark all the VCPU threads. 1210 for state in self.vcpu_states.iter() { 1211 state.unpark_thread(); 1212 } 1213 1214 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 1215 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 1216 // above. 1217 for state in self.vcpu_states.iter() { 1218 state.signal_thread(); 1219 } 1220 1221 // Wait for all the threads to finish. This removes the state from the vector. 1222 for mut state in self.vcpu_states.drain(..) { 1223 state.join_thread()?; 1224 } 1225 1226 Ok(()) 1227 } 1228 1229 #[cfg(feature = "tdx")] 1230 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1231 for vcpu in &self.vcpus { 1232 vcpu.lock() 1233 .unwrap() 1234 .vcpu 1235 .tdx_init(hob_address) 1236 .map_err(Error::InitializeTdx)?; 1237 } 1238 Ok(()) 1239 } 1240 1241 pub fn boot_vcpus(&self) -> u8 { 1242 self.config.boot_vcpus 1243 } 1244 1245 pub fn max_vcpus(&self) -> u8 { 1246 self.config.max_vcpus 1247 } 1248 1249 #[cfg(target_arch = "x86_64")] 1250 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1251 assert!(!self.cpuid.is_empty()); 1252 self.cpuid.clone() 1253 } 1254 1255 fn present_vcpus(&self) -> u8 { 1256 self.vcpu_states 1257 .iter() 1258 .fold(0, |acc, state| acc + state.active() as u8) 1259 } 1260 1261 #[cfg(target_arch = "aarch64")] 1262 pub fn get_mpidrs(&self) -> Vec<u64> { 1263 self.vcpus 1264 .iter() 1265 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1266 .collect() 1267 } 1268 1269 #[cfg(target_arch = "aarch64")] 1270 pub fn get_saved_states(&self) -> Vec<CpuState> { 1271 self.vcpus 1272 .iter() 1273 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1274 .collect() 1275 } 1276 1277 #[cfg(target_arch = "aarch64")] 1278 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1279 self.config 1280 .topology 1281 .clone() 1282 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1283 } 1284 1285 pub fn create_madt(&self) -> Sdt { 1286 use crate::acpi; 1287 // This is also checked in the commandline parsing. 
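// In the MADT built below (x86_64), boot vCPUs are flagged as enabled while
// every entry also carries the online-capable bit, which is what allows the
// guest to bring up hotplugged CPUs later.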
1288 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1289 1290 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1291 #[cfg(target_arch = "x86_64")] 1292 { 1293 madt.write(36, arch::layout::APIC_START.0); 1294 1295 for cpu in 0..self.config.max_vcpus { 1296 let lapic = LocalApic { 1297 r#type: acpi::ACPI_APIC_PROCESSOR, 1298 length: 8, 1299 processor_id: cpu, 1300 apic_id: cpu, 1301 flags: if cpu < self.config.boot_vcpus { 1302 1 << MADT_CPU_ENABLE_FLAG 1303 } else { 1304 0 1305 } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG, 1306 }; 1307 madt.append(lapic); 1308 } 1309 1310 madt.append(Ioapic { 1311 r#type: acpi::ACPI_APIC_IO, 1312 length: 12, 1313 ioapic_id: 0, 1314 apic_address: arch::layout::IOAPIC_START.0 as u32, 1315 gsi_base: 0, 1316 ..Default::default() 1317 }); 1318 1319 madt.append(InterruptSourceOverride { 1320 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1321 length: 10, 1322 bus: 0, 1323 source: 4, 1324 gsi: 4, 1325 flags: 0, 1326 }); 1327 } 1328 1329 #[cfg(target_arch = "aarch64")] 1330 { 1331 /* Notes: 1332 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1333 */ 1334 1335 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 1336 for cpu in 0..self.config.boot_vcpus { 1337 let vcpu = &self.vcpus[cpu as usize]; 1338 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1339 /* ARMv8 MPIDR format: 1340 Bits [63:40] Must be zero 1341 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1342 Bits [31:24] Must be zero 1343 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1344 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1345 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1346 */ 1347 let mpidr_mask = 0xff_00ff_ffff; 1348 let gicc = GicC { 1349 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1350 length: 80, 1351 reserved0: 0, 1352 cpu_interface_number: cpu as u32, 1353 uid: cpu as u32, 1354 flags: 1, 1355 parking_version: 0, 1356 performance_interrupt: 0, 1357 parked_address: 0, 1358 base_address: 0, 1359 gicv_base_address: 0, 1360 gich_base_address: 0, 1361 vgic_interrupt: 0, 1362 gicr_base_address: 0, 1363 mpidr: mpidr & mpidr_mask, 1364 proc_power_effi_class: 0, 1365 reserved1: 0, 1366 spe_overflow_interrupt: 0, 1367 }; 1368 1369 madt.append(gicc); 1370 } 1371 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1372 1373 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1374 let gicd = GicD { 1375 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1376 length: 24, 1377 reserved0: 0, 1378 gic_id: 0, 1379 base_address: vgic_config.dist_addr, 1380 global_irq_base: 0, 1381 version: 3, 1382 reserved1: [0; 3], 1383 }; 1384 madt.append(gicd); 1385 1386 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1387 let gicr = GicR { 1388 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1389 length: 16, 1390 reserved: 0, 1391 base_address: vgic_config.redists_addr, 1392 range_length: vgic_config.redists_size as u32, 1393 }; 1394 madt.append(gicr); 1395 1396 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 
1397 let gicits = GicIts { 1398 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1399 length: 20, 1400 reserved0: 0, 1401 translation_id: 0, 1402 base_address: vgic_config.msi_addr, 1403 reserved1: 0, 1404 }; 1405 madt.append(gicits); 1406 1407 madt.update_checksum(); 1408 } 1409 1410 madt 1411 } 1412 1413 #[cfg(target_arch = "aarch64")] 1414 pub fn create_pptt(&self) -> Sdt { 1415 let pptt_start = 0; 1416 let mut cpus = 0; 1417 let mut uid = 0; 1418 // If topology is not specified, the default setting is: 1419 // 1 package, multiple cores, 1 thread per core 1420 // This is also the behavior when PPTT is missing. 1421 let (threads_per_core, cores_per_package, packages) = 1422 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1423 1424 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1425 1426 for cluster_idx in 0..packages { 1427 if cpus < self.config.boot_vcpus as usize { 1428 let cluster_offset = pptt.len() - pptt_start; 1429 let cluster_hierarchy_node = ProcessorHierarchyNode { 1430 r#type: 0, 1431 length: 20, 1432 reserved: 0, 1433 flags: 0x2, 1434 parent: 0, 1435 acpi_processor_id: cluster_idx as u32, 1436 num_private_resources: 0, 1437 }; 1438 pptt.append(cluster_hierarchy_node); 1439 1440 for core_idx in 0..cores_per_package { 1441 let core_offset = pptt.len() - pptt_start; 1442 1443 if threads_per_core > 1 { 1444 let core_hierarchy_node = ProcessorHierarchyNode { 1445 r#type: 0, 1446 length: 20, 1447 reserved: 0, 1448 flags: 0x2, 1449 parent: cluster_offset as u32, 1450 acpi_processor_id: core_idx as u32, 1451 num_private_resources: 0, 1452 }; 1453 pptt.append(core_hierarchy_node); 1454 1455 for _thread_idx in 0..threads_per_core { 1456 let thread_hierarchy_node = ProcessorHierarchyNode { 1457 r#type: 0, 1458 length: 20, 1459 reserved: 0, 1460 flags: 0xE, 1461 parent: core_offset as u32, 1462 acpi_processor_id: uid as u32, 1463 num_private_resources: 0, 1464 }; 1465 pptt.append(thread_hierarchy_node); 1466 uid += 1; 1467 } 1468 } else { 1469 let thread_hierarchy_node = ProcessorHierarchyNode { 1470 r#type: 0, 1471 length: 20, 1472 reserved: 0, 1473 flags: 0xA, 1474 parent: cluster_offset as u32, 1475 acpi_processor_id: uid as u32, 1476 num_private_resources: 0, 1477 }; 1478 pptt.append(thread_hierarchy_node); 1479 uid += 1; 1480 } 1481 } 1482 cpus += (cores_per_package * threads_per_core) as usize; 1483 } 1484 } 1485 1486 pptt.update_checksum(); 1487 pptt 1488 } 1489 1490 #[cfg(feature = "guest_debug")] 1491 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1492 self.vcpus[usize::from(cpu_id)] 1493 .lock() 1494 .unwrap() 1495 .vcpu 1496 .get_regs() 1497 .map_err(Error::CpuDebug) 1498 } 1499 1500 #[cfg(feature = "guest_debug")] 1501 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1502 self.vcpus[usize::from(cpu_id)] 1503 .lock() 1504 .unwrap() 1505 .vcpu 1506 .set_regs(regs) 1507 .map_err(Error::CpuDebug) 1508 } 1509 1510 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1511 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1512 self.vcpus[usize::from(cpu_id)] 1513 .lock() 1514 .unwrap() 1515 .vcpu 1516 .get_sregs() 1517 .map_err(Error::CpuDebug) 1518 } 1519 1520 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1521 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1522 self.vcpus[usize::from(cpu_id)] 1523 .lock() 1524 .unwrap() 1525 .vcpu 1526 .set_sregs(sregs) 1527 .map_err(Error::CpuDebug) 1528 } 1529 1530 #[cfg(all(target_arch = "x86_64", feature = 
"guest_debug"))] 1531 fn translate_gva( 1532 &self, 1533 _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1534 cpu_id: u8, 1535 gva: u64, 1536 ) -> Result<u64> { 1537 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1538 .lock() 1539 .unwrap() 1540 .vcpu 1541 .translate_gva(gva, /* flags: unused */ 0) 1542 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1543 Ok(gpa) 1544 } 1545 1546 /// 1547 /// On AArch64, `translate_gva` API is not provided by KVM. We implemented 1548 /// it in VMM by walking through translation tables. 1549 /// 1550 /// Address translation is big topic, here we only focus the scenario that 1551 /// happens in VMM while debugging kernel. This `translate_gva` 1552 /// implementation is restricted to: 1553 /// - Exception Level 1 1554 /// - Translate high address range only (kernel space) 1555 /// 1556 /// This implementation supports following Arm-v8a features related to 1557 /// address translation: 1558 /// - FEAT_LPA 1559 /// - FEAT_LVA 1560 /// - FEAT_LPA2 1561 /// 1562 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 1563 fn translate_gva( 1564 &self, 1565 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1566 cpu_id: u8, 1567 gva: u64, 1568 ) -> Result<u64> { 1569 let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)] 1570 .lock() 1571 .unwrap() 1572 .vcpu 1573 .get_sys_reg(regs::TCR_EL1) 1574 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1575 let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)] 1576 .lock() 1577 .unwrap() 1578 .vcpu 1579 .get_sys_reg(regs::TTBR1_EL1) 1580 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1581 let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)] 1582 .lock() 1583 .unwrap() 1584 .vcpu 1585 .get_sys_reg(regs::ID_AA64MMFR0_EL1) 1586 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1587 1588 // Bit 55 of the VA determines the range, high (0xFFFxxx...) 1589 // or low (0x000xxx...). 1590 let high_range = extract_bits_64!(gva, 55, 1); 1591 if high_range == 0 { 1592 info!("VA (0x{:x}) range is not supported!", gva); 1593 return Ok(gva); 1594 } 1595 1596 // High range size offset 1597 let tsz = extract_bits_64!(tcr_el1, 16, 6); 1598 // Granule size 1599 let tg = extract_bits_64!(tcr_el1, 30, 2); 1600 // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2 1601 let ds = extract_bits_64!(tcr_el1, 59, 1); 1602 1603 if tsz == 0 { 1604 info!("VA translation is not ready!"); 1605 return Ok(gva); 1606 } 1607 1608 // VA size is determined by TCR_BL1.T1SZ 1609 let va_size = 64 - tsz; 1610 // Number of bits in VA consumed in each level of translation 1611 let stride = match tg { 1612 3 => 13, // 64KB granule size 1613 1 => 11, // 16KB granule size 1614 _ => 9, // 4KB, default 1615 }; 1616 // Starting level of walking 1617 let mut level = 4 - (va_size - 4) / stride; 1618 1619 // PA or IPA size is determined 1620 let tcr_ips = extract_bits_64!(tcr_el1, 32, 3); 1621 let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4); 1622 // The IPA size in TCR_BL1 and PA Range in ID_AA64MMFR0_EL1 should match. 1623 // To be safe, we use the minimum value if they are different. 
1624 let pa_range = std::cmp::min(tcr_ips, pa_range); 1625 // PA size in bits 1626 let pa_size = match pa_range { 1627 0 => 32, 1628 1 => 36, 1629 2 => 40, 1630 3 => 42, 1631 4 => 44, 1632 5 => 48, 1633 6 => 52, 1634 _ => { 1635 return Err(Error::TranslateVirtualAddress(anyhow!(format!( 1636 "PA range not supported {pa_range}" 1637 )))) 1638 } 1639 }; 1640 1641 let indexmask_grainsize = (!0u64) >> (64 - (stride + 3)); 1642 let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level)))); 1643 // If FEAT_LPA2 is present, the translation table descriptor holds 1644 // 50 bits of the table address of the next level. 1645 // Otherwise, it is 48 bits. 1646 let descaddrmask = if ds == 1 { 1647 !0u64 >> (64 - 50) // mask with 50 least significant bits 1648 } else { 1649 !0u64 >> (64 - 48) // mask with 48 least significant bits 1650 }; 1651 let descaddrmask = descaddrmask & !indexmask_grainsize; 1652 1653 // Translation table base address 1654 let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48); 1655 // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table 1656 // address bits [48:51] come from TTBR1_EL1 bits [2:5]. 1657 if pa_size == 52 { 1658 descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48; 1659 } 1660 1661 // Loop through tables of each level 1662 loop { 1663 // Table offset for current level 1664 let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask; 1665 descaddr |= table_offset; 1666 descaddr &= !7u64; 1667 1668 let mut buf = [0; 8]; 1669 guest_memory 1670 .memory() 1671 .read(&mut buf, GuestAddress(descaddr)) 1672 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1673 let descriptor = u64::from_le_bytes(buf); 1674 1675 descaddr = descriptor & descaddrmask; 1676 // In the case of FEAT_LPA, the next-level translation table address 1677 // bits [48:51] come from bits [12:15] of the current descriptor. 1678 // For FEAT_LPA2, the next-level translation table address 1679 // bits [50:51] come from bits [8:9] of the current descriptor, 1680 // bits [48:49] come from bits [48:49] of the descriptor which was 1681 // handled previously. 1682 if pa_size == 52 { 1683 if ds == 1 { 1684 // FEAT_LPA2 1685 descaddr |= extract_bits_64!(descriptor, 8, 2) << 50; 1686 } else { 1687 // FEAT_LPA 1688 descaddr |= extract_bits_64!(descriptor, 12, 4) << 48; 1689 } 1690 } 1691 1692 if (descriptor & 2) != 0 && (level < 3) { 1693 // This is a table entry. Go down to the next level.
1694 level += 1; 1695 indexmask = indexmask_grainsize; 1696 continue; 1697 } 1698 1699 break; 1700 } 1701 1702 // We have reached either: 1703 // - a page entry at level 3 or 1704 // - a block entry at level 1 or 2 1705 let page_size = 1u64 << ((stride * (4 - level)) + 3); 1706 descaddr &= !(page_size - 1); 1707 descaddr |= gva & (page_size - 1); 1708 1709 Ok(descaddr) 1710 } 1711 1712 pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) { 1713 self.acpi_address = Some(acpi_address); 1714 } 1715 1716 pub(crate) fn set_interrupt_controller( 1717 &mut self, 1718 interrupt_controller: Arc<Mutex<dyn InterruptController>>, 1719 ) { 1720 self.interrupt_controller = Some(interrupt_controller); 1721 } 1722 } 1723 1724 struct Cpu { 1725 cpu_id: u8, 1726 proximity_domain: u32, 1727 dynamic: bool, 1728 } 1729 1730 #[cfg(target_arch = "x86_64")] 1731 const MADT_CPU_ENABLE_FLAG: usize = 0; 1732 1733 #[cfg(target_arch = "x86_64")] 1734 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; 1735 1736 impl Cpu { 1737 #[cfg(target_arch = "x86_64")] 1738 fn generate_mat(&self) -> Vec<u8> { 1739 let lapic = LocalApic { 1740 r#type: 0, 1741 length: 8, 1742 processor_id: self.cpu_id, 1743 apic_id: self.cpu_id, 1744 flags: 1 << MADT_CPU_ENABLE_FLAG, 1745 }; 1746 1747 let mut mat_data: Vec<u8> = Vec::new(); 1748 mat_data.resize(std::mem::size_of_val(&lapic), 0); 1749 // SAFETY: mat_data is large enough to hold lapic 1750 unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic }; 1751 1752 mat_data 1753 } 1754 } 1755 1756 impl Aml for Cpu { 1757 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1758 #[cfg(target_arch = "x86_64")] 1759 let mat_data: Vec<u8> = self.generate_mat(); 1760 #[allow(clippy::if_same_then_else)] 1761 if self.dynamic { 1762 aml::Device::new( 1763 format!("C{:03}", self.cpu_id).as_str().into(), 1764 vec![ 1765 &aml::Name::new("_HID".into(), &"ACPI0007"), 1766 &aml::Name::new("_UID".into(), &self.cpu_id), 1767 // Currently, AArch64 cannot support following fields. 1768 /* 1769 _STA return value: 1770 Bit [0] – Set if the device is present. 1771 Bit [1] – Set if the device is enabled and decoding its resources. 1772 Bit [2] – Set if the device should be shown in the UI. 1773 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1774 Bit [4] – Set if the battery is present. 1775 Bits [31:5] – Reserved (must be cleared). 
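For this device, the dynamic (hotplug) path returns either 0x0 or 0xF via
the CSTA helper defined in CpuMethods, while the non-dynamic path simply
hardcodes 0xF.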
1776 */ 1777 #[cfg(target_arch = "x86_64")] 1778 &aml::Method::new( 1779 "_STA".into(), 1780 0, 1781 false, 1782 // Call into CSTA method which will interrogate device 1783 vec![&aml::Return::new(&aml::MethodCall::new( 1784 "CSTA".into(), 1785 vec![&self.cpu_id], 1786 ))], 1787 ), 1788 &aml::Method::new( 1789 "_PXM".into(), 1790 0, 1791 false, 1792 vec![&aml::Return::new(&self.proximity_domain)], 1793 ), 1794 // The Linux kernel expects every CPU device to have a _MAT entry 1795 // containing the LAPIC for this processor with the enabled bit set 1796 // even if it is disabled in the MADT (non-boot CPU) 1797 #[cfg(target_arch = "x86_64")] 1798 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 1799 // Trigger CPU ejection 1800 #[cfg(target_arch = "x86_64")] 1801 &aml::Method::new( 1802 "_EJ0".into(), 1803 1, 1804 false, 1805 // Call into CEJ0 method which will actually eject device 1806 vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])], 1807 ), 1808 ], 1809 ) 1810 .to_aml_bytes(sink); 1811 } else { 1812 aml::Device::new( 1813 format!("C{:03}", self.cpu_id).as_str().into(), 1814 vec![ 1815 &aml::Name::new("_HID".into(), &"ACPI0007"), 1816 &aml::Name::new("_UID".into(), &self.cpu_id), 1817 #[cfg(target_arch = "x86_64")] 1818 &aml::Method::new( 1819 "_STA".into(), 1820 0, 1821 false, 1822 // Mark CPU present, see CSTA implementation 1823 vec![&aml::Return::new(&0xfu8)], 1824 ), 1825 &aml::Method::new( 1826 "_PXM".into(), 1827 0, 1828 false, 1829 vec![&aml::Return::new(&self.proximity_domain)], 1830 ), 1831 // The Linux kernel expects every CPU device to have a _MAT entry 1832 // containing the LAPIC for this processor with the enabled bit set 1833 // even if it is disabled in the MADT (non-boot CPU) 1834 #[cfg(target_arch = "x86_64")] 1835 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 1836 ], 1837 ) 1838 .to_aml_bytes(sink); 1839 } 1840 } 1841 } 1842 1843 struct CpuNotify { 1844 cpu_id: u8, 1845 } 1846 1847 impl Aml for CpuNotify { 1848 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1849 let object = aml::Path::new(&format!("C{:03}", self.cpu_id)); 1850 aml::If::new( 1851 &aml::Equal::new(&aml::Arg(0), &self.cpu_id), 1852 vec![&aml::Notify::new(&object, &aml::Arg(1))], 1853 ) 1854 .to_aml_bytes(sink) 1855 } 1856 } 1857 1858 struct CpuMethods { 1859 max_vcpus: u8, 1860 dynamic: bool, 1861 } 1862 1863 impl Aml for CpuMethods { 1864 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1865 if self.dynamic { 1866 // CPU status method 1867 aml::Method::new( 1868 "CSTA".into(), 1869 1, 1870 true, 1871 vec![ 1872 // Take lock defined above 1873 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1874 // Write CPU number (in first argument) to I/O port via field 1875 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 1876 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1877 // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning) 1878 &aml::If::new( 1879 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE), 1880 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 1881 ), 1882 // Release lock 1883 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1884 // Return 0 or 0xf 1885 &aml::Return::new(&aml::Local(0)), 1886 ], 1887 ) 1888 .to_aml_bytes(sink); 1889 1890 let mut cpu_notifies = Vec::new(); 1891 for cpu_id in 0..self.max_vcpus { 1892 cpu_notifies.push(CpuNotify { cpu_id }); 1893 } 1894 1895 let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new(); 1896 for cpu_id in
            let mut cpu_notifies = Vec::new();
            for cpu_id in 0..self.max_vcpus {
                cpu_notifies.push(CpuNotify { cpu_id });
            }

            let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
            for cpu_id in 0..self.max_vcpus {
                cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
            }

            aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);

            aml::Method::new(
                "CEJ0".into(),
                1,
                true,
                vec![
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to the CPU selector field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    // Set CEJ0 bit
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(sink);

            aml::Method::new(
                "CSCN".into(),
                0,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    &aml::While::new(
                        &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
                        vec![
                            // Write CPU number (held in Local0) to the CPU selector field
                            &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
                            // Check if CINS bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
                                // Notify device if it is
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &aml::ONE],
                                    ),
                                    // Reset CINS bit
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CINS"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            // Check if CRMV bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
                                // Notify device if it is (with the eject constant 0x3)
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &3u8],
                                    ),
                                    // Reset CRMV bit
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CRMV"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                        ],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(sink)
        } else {
            aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
        }
    }
}
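
// The AML below exposes the CPU hotplug controller to the guest. The CPU
// manager MMIO window (CPU_MANAGER_ACPI_SIZE = 12 bytes) is mapped through
// the "PRST" OpRegion and, as laid out by the two Field definitions, consists
// of a 32-bit CPU selector (CSEL) at offset 0, the per-CPU status/event bits
// (CPEN: enabled, CINS: insertion pending, CRMV: removal pending, CEJ0: eject
// request) plus the CCMD byte at offsets 4-5, and a 32-bit data register
// (CDAT) at offset 8.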
impl Aml for CpuManager {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        #[cfg(target_arch = "x86_64")]
        if let Some(acpi_address) = self.acpi_address {
            // CPU hotplug controller
            aml::Device::new(
                "_SB_.PRES".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose CPU and then read back status
                    &aml::Mutex::new("CPLK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCachable::NotCacheable,
                            true,
                            acpi_address.0,
                            acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
                            None,
                        )]),
                    ),
                    // OpRegion and Fields map the MMIO range into individual field values
                    &aml::OpRegion::new(
                        "PRST".into(),
                        aml::OpRegionSpace::SystemMemory,
                        &(acpi_address.0 as usize),
                        &CPU_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CPEN", 1),
                            aml::FieldEntry::Named(*b"CINS", 1),
                            aml::FieldEntry::Named(*b"CRMV", 1),
                            aml::FieldEntry::Named(*b"CEJ0", 1),
                            aml::FieldEntry::Reserved(4),
                            aml::FieldEntry::Named(*b"CCMD", 8),
                        ],
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"CSEL", 32),
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CDAT", 32),
                        ],
                    ),
                ],
            )
            .to_aml_bytes(sink);
        }

        // CPU devices
        let hid = aml::Name::new("_HID".into(), &"ACPI0010");
        let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
        // Bundle methods together under a common object
        let methods = CpuMethods {
            max_vcpus: self.config.max_vcpus,
            dynamic: self.dynamic,
        };
        let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];

        let mut cpu_devices = Vec::new();
        for cpu_id in 0..self.config.max_vcpus {
            let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
            let cpu_device = Cpu {
                cpu_id,
                proximity_domain,
                dynamic: self.dynamic,
            };

            cpu_devices.push(cpu_device);
        }

        for cpu_device in cpu_devices.iter() {
            cpu_data_inner.push(cpu_device);
        }

        aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
    }
}

impl Pausable for CpuManager {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        // Tell the vCPUs to pause themselves next time they exit
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        for vcpu in self.vcpus.iter() {
            let mut vcpu = vcpu.lock().unwrap();
            vcpu.pause()?;
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            if !self.config.kvm_hyperv {
                vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
                    MigratableError::Pause(anyhow!(
                        "Could not notify guest it has been paused {:?}",
                        e
                    ))
                })?;
            }
        }

        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        for vcpu in self.vcpus.iter() {
            vcpu.lock().unwrap().resume()?;
        }

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the vCPU threads.
        // Once unparked, the next thing they will do is check the pause boolean.
        // Since it will be set to false, they will exit their pause loop and
        // resume running the guest.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }
        Ok(())
    }
}
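
// Snapshotting the CpuManager simply aggregates the per-vCPU snapshots, keyed
// by each vCPU's own snapshot id, so that restore can hand every vCPU its
// saved state back individually.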
impl Snapshottable for CpuManager {
    fn id(&self) -> String {
        CPU_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut cpu_manager_snapshot = Snapshot::default();

        // The CpuManager snapshot is a collection of all vCPU snapshots.
        for vcpu in &self.vcpus {
            let mut vcpu = vcpu.lock().unwrap();
            cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
        }

        Ok(cpu_manager_snapshot)
    }
}

impl Transportable for CpuManager {}
impl Migratable for CpuManager {}

#[cfg(feature = "guest_debug")]
impl Debuggable for CpuManager {
    #[cfg(feature = "kvm")]
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.vcpus[cpu_id]
            .lock()
            .unwrap()
            .vcpu
            .set_guest_debug(addrs, singlestep)
            .map_err(DebuggableError::SetDebug)
    }

    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        Ok(())
    }

    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        // General purpose registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, R8-R15
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let regs = [
            gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
            gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
        ];

        // GDB exposes 32-bit eflags instead of 64-bit rflags.
        // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
        let eflags = gregs.rflags as u32;
        let rip = gregs.rip;

        // Segment registers: CS, SS, DS, ES, FS, GS
        let sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let segments = X86SegmentRegs {
            cs: sregs.cs.selector as u32,
            ss: sregs.ss.selector as u32,
            ds: sregs.ds.selector as u32,
            es: sregs.es.selector as u32,
            fs: sregs.fs.selector as u32,
            gs: sregs.gs.selector as u32,
        };

        // TODO: Add other registers

        Ok(CoreRegs {
            regs,
            eflags,
            rip,
            segments,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "aarch64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        Ok(CoreRegs {
            x: gregs.regs.regs,
            sp: gregs.regs.sp,
            pc: gregs.regs.pc,
            ..Default::default()
        })
    }
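
    // Writing registers back from GDB mirrors read_regs: the 64-bit rflags is
    // rebuilt by keeping the current upper 32 bits and replacing the lower
    // half with the 32-bit eflags value GDB provides, and only the segment
    // selectors are updated in the special registers.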
    #[cfg(target_arch = "x86_64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let orig_gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let gregs = StandardRegisters {
            rax: regs.regs[0],
            rbx: regs.regs[1],
            rcx: regs.regs[2],
            rdx: regs.regs[3],
            rsi: regs.regs[4],
            rdi: regs.regs[5],
            rbp: regs.regs[6],
            rsp: regs.regs[7],
            r8: regs.regs[8],
            r9: regs.regs[9],
            r10: regs.regs[10],
            r11: regs.regs[11],
            r12: regs.regs[12],
            r13: regs.regs[13],
            r14: regs.regs[14],
            r15: regs.regs[15],
            rip: regs.rip,
            // Update only the lower 32 bits of rflags.
            rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
        };

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // Since GDB cares only about the selectors, we call get_sregs() first
        // and modify just those.
        let mut sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        sregs.cs.selector = regs.segments.cs as u16;
        sregs.ss.selector = regs.segments.ss as u16;
        sregs.ds.selector = regs.segments.ds as u16;
        sregs.es.selector = regs.segments.es as u16;
        sregs.fs.selector = regs.segments.fs as u16;
        sregs.gs.selector = regs.segments.gs as u16;

        self.set_sregs(cpu_id as u8, &sregs)
            .map_err(DebuggableError::WriteRegs)?;

        // TODO: Add other registers

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let mut gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;

        gregs.regs.regs = regs.x;
        gregs.regs.sp = regs.sp;
        gregs.regs.pc = regs.pc;

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        Ok(())
    }

    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        let mut buf = vec![0; len];
        let mut total_read = 0_u64;

        while total_read < len as u64 {
            let gaddr = vaddr.0 + total_read;
            let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
            guest_memory
                .memory()
                .read(
                    &mut buf[total_read as usize..total_read as usize + read_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::ReadMem)?;
            total_read += read_len;
        }
        Ok(buf)
    }
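
    // write_mem mirrors read_mem: the virtual range is translated and copied
    // one page at a time, since contiguous guest virtual addresses may map to
    // non-contiguous guest physical pages.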
    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        let mut total_written = 0_u64;

        while total_written < data.len() as u64 {
            let gaddr = vaddr.0 + total_written;
            let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let write_len = std::cmp::min(
                data.len() as u64 - total_written,
                psize - (paddr & (psize - 1)),
            );
            guest_memory
                .memory()
                .write(
                    &data[total_written as usize..total_written as usize + write_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::WriteMem)?;
            total_written += write_len;
        }
        Ok(())
    }

    fn active_vcpus(&self) -> usize {
        self.present_vcpus() as usize
    }
}
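
// Guest coredump support: for every vCPU two ELF notes are emitted, a
// standard NT_PRSTATUS note (name "CORE") holding the user-visible register
// set, and a VMM-specific note written under the name "QEMU" (type 0)
// carrying the full CPU state: segments, descriptor tables, control
// registers and KERNEL_GS_BASE.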
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for CpuManager {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl CpuElf64Writable for CpuManager {
    fn cpu_write_elf64_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Elf, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<X86_64ElfPrStatus>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: NT_PRSTATUS,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
            buf.resize(pos + 4, 0);
            buf.splice(pos.., "CORE".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
            buf.resize(pos + 32 + 4, 0);
            let pid = vcpu_id as u64;
            let bytes: &[u8] = pid.as_slice();
            buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */

            pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();

            let orig_rax: u64 = 0;
            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
                gregs.r10,
            ];
            let regs2 = [
                gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            debug!(
                "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
                gregs.rip,
                gregs.rsp,
                sregs.gs.base,
                sregs.cs.selector,
                sregs.ss.selector,
                sregs.ds.selector,
            );

            let regs = X86_64UserRegs {
                regs1,
                regs2,
                rip: gregs.rip,
                cs: sregs.cs.selector as u64,
                eflags: gregs.rflags,
                rsp: gregs.rsp,
                ss: sregs.ss.selector as u64,
                fs_base: sregs.fs.base,
                gs_base: sregs.gs.base,
                ds: sregs.ds.selector as u64,
                es: sregs.es.selector as u64,
                fs: sregs.fs.selector as u64,
                gs: sregs.gs.selector as u64,
            };

            // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) };
            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }

    fn cpu_write_vmm_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Vmm, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<DumpCpusState>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: 0,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);

            buf.resize(pos + 4, 0);
            buf.splice(pos.., "QEMU".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);

            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
                gregs.rbp,
            ];

            let regs2 = [
                gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
                gregs.r15,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            let mut msrs = vec![MsrEntry {
                index: msr_index::MSR_KERNEL_GS_BASE,
                ..Default::default()
            }];

            self.vcpus[vcpu_id as usize]
                .lock()
                .unwrap()
                .vcpu
                .get_msrs(&mut msrs)
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
            let kernel_gs_base = msrs[0].data;

            let cs = CpuSegment::new(sregs.cs);
            let ds = CpuSegment::new(sregs.ds);
            let es = CpuSegment::new(sregs.es);
            let fs = CpuSegment::new(sregs.fs);
            let gs = CpuSegment::new(sregs.gs);
            let ss = CpuSegment::new(sregs.ss);
            let ldt = CpuSegment::new(sregs.ldt);
            let tr = CpuSegment::new(sregs.tr);
            let gdt = CpuSegment::new_from_table(sregs.gdt);
            let idt = CpuSegment::new_from_table(sregs.idt);
            let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
            let regs = DumpCpusState {
                version: 1,
                size: size_of::<DumpCpusState>() as u32,
                regs1,
                regs2,
                rip: gregs.rip,
                rflags: gregs.rflags,
                cs,
                ds,
                es,
                fs,
                gs,
                ss,
                ldt,
                tr,
                gdt,
                idt,
                cr,
                kernel_gs_base,
            };

            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use arch::x86_64::interrupts::*;
    use arch::x86_64::regs::*;
    use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};
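
    // set_lint programs the local APIC LVT0/LVT1 entries (ExtINT and NMI
    // delivery modes): the test computes the expected LVT values from the
    // pre-call state and asserts the vCPU's LAPIC matches after set_lint.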
    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail if there is no irqchip created beforehand.
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
        let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
        let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below fails. Decide whether we should remove
        // this assertion altogether.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }
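
    // setup_msrs writes the boot-time MSR list (see boot_msr_entries); the
    // test reads MSR_IA32_MISC_ENABLE back from the vCPU and compares it
    // against the corresponding boot entry.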
    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }];

        // get_msrs returns the number of MSRs that it succeeded in reading. We only want to
        // read one in this test case.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we did setup_msrs. We need to assert that the
        // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
        // expect.
        let entry_vec = vcpu.boot_msr_entries();
        assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
    }

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let expected_regs: StandardRegisters = StandardRegisters {
            rflags: 0x0000000000000002u64,
            rbx: arch::layout::PVH_INFO_START.0,
            rip: 1,
            ..Default::default()
        };

        setup_regs(&vcpu, expected_regs.rip).unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use arch::{aarch64::regs, layout};
    use hypervisor::kvm::aarch64::is_system_register;
    use hypervisor::kvm::kvm_bindings::{
        kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
        KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
    };
    use hypervisor::{arm64_core_reg_id, offset_of};
    use std::mem;

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
        // Must fail when the vCPU is not initialized yet.
        assert!(res.is_err());

        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();
        vcpu.vcpu_init(&kvi).unwrap();

        assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
    }

    #[test]
    fn test_read_mpidr() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when the vCPU is not initialized yet.
        assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());

        vcpu.vcpu_init(&kvi).unwrap();
        assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
    }

    #[test]
    fn test_is_system_register() {
        let offset = offset_of!(user_pt_regs, pc);
        let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
        assert!(!is_system_register(regid));
        let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
        assert!(is_system_register(regid));
    }
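
    // Saving and restoring core registers is only valid after vcpu_init():
    // the test first checks that get_regs()/set_regs() fail with a
    // descriptive error on an uninitialized vCPU, then verifies the round
    // trip (PSTATE 0x3C5 being the expected KVM reset value, i.e. EL1h with
    // DAIF masked).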
    #[test]
    fn test_save_restore_core_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when the vCPU is not initialized yet.
        let res = vcpu.get_regs();
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to get core register: Exec format error (os error 8)"
        );

        let mut state = kvm_regs::default();
        let res = vcpu.set_regs(&state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to set core register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        let res = vcpu.get_regs();
        assert!(res.is_ok());
        state = res.unwrap();
        assert_eq!(state.regs.pstate, 0x3C5);

        assert!(vcpu.set_regs(&state).is_ok());
    }

    #[test]
    fn test_get_set_mpstate() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        let res = vcpu.get_mp_state();
        assert!(res.is_ok());
        assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
    }
}