// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use crate::config::CpusConfig;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
    GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
    NT_PRSTATUS,
};
#[cfg(feature = "guest_debug")]
use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
#[cfg(target_arch = "x86_64")]
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::GuestMemoryMmap;
use crate::CPU_MANAGER_SNAPSHOT_ID;
use acpi_tables::{aml, sdt::Sdt, Aml};
use anyhow::anyhow;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use arch::aarch64::regs;
use arch::EntryPoint;
use arch::NumaNodes;
#[cfg(target_arch = "aarch64")]
use devices::gic::Gic;
use devices::interrupt_controller::InterruptController;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use hypervisor::aarch64::StandardRegisters;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::msr_index;
#[cfg(target_arch = "x86_64")]
use hypervisor::arch::x86::CpuIdEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::MsrEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
#[cfg(target_arch = "aarch64")]
use hypervisor::kvm::kvm_bindings;
#[cfg(all(target_arch = "aarch64", feature = "kvm"))]
use hypervisor::kvm::kvm_ioctls::Cap;
#[cfg(feature = "tdx")]
use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps};
use libc::{c_void, siginfo_t};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use linux_loader::elf::Elf64_Nhdr;
use seccompiler::{apply_filter, SeccompAction};
use std::collections::BTreeMap;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::io::Write;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::mem::size_of;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};
use thiserror::Error;
use tracer::trace_scoped;
use vm_device::BusDevice;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use vm_memory::ByteValued;
#[cfg(feature = "guest_debug")]
use vm_memory::{Bytes, GuestAddressSpace};
use vm_memory::{GuestAddress, GuestMemoryAtomic};
use vm_migration::{
    snapshot_from_id,
    Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
use zerocopy::AsBytes;

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),

    #[error("Maximum number of vCPUs exceeds host limit")]
    MaximumVcpusExceeded,
}
pub type Result<T> = result::Result<T, Error>;

#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct LocalApic {
    pub r#type: u8,
    pub length: u8,
    pub processor_id: u8,
    pub apic_id: u8,
    pub flags: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default, AsBytes)]
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
pub ioapic_id: u8, 196 _reserved: u8, 197 pub apic_address: u32, 198 pub gsi_base: u32, 199 } 200 201 #[cfg(target_arch = "aarch64")] 202 #[allow(dead_code)] 203 #[repr(packed)] 204 #[derive(AsBytes)] 205 struct GicC { 206 pub r#type: u8, 207 pub length: u8, 208 pub reserved0: u16, 209 pub cpu_interface_number: u32, 210 pub uid: u32, 211 pub flags: u32, 212 pub parking_version: u32, 213 pub performance_interrupt: u32, 214 pub parked_address: u64, 215 pub base_address: u64, 216 pub gicv_base_address: u64, 217 pub gich_base_address: u64, 218 pub vgic_interrupt: u32, 219 pub gicr_base_address: u64, 220 pub mpidr: u64, 221 pub proc_power_effi_class: u8, 222 pub reserved1: u8, 223 pub spe_overflow_interrupt: u16, 224 } 225 226 #[cfg(target_arch = "aarch64")] 227 #[allow(dead_code)] 228 #[repr(packed)] 229 #[derive(AsBytes)] 230 struct GicD { 231 pub r#type: u8, 232 pub length: u8, 233 pub reserved0: u16, 234 pub gic_id: u32, 235 pub base_address: u64, 236 pub global_irq_base: u32, 237 pub version: u8, 238 pub reserved1: [u8; 3], 239 } 240 241 #[cfg(target_arch = "aarch64")] 242 #[allow(dead_code)] 243 #[repr(packed)] 244 #[derive(AsBytes)] 245 struct GicR { 246 pub r#type: u8, 247 pub length: u8, 248 pub reserved: u16, 249 pub base_address: u64, 250 pub range_length: u32, 251 } 252 253 #[cfg(target_arch = "aarch64")] 254 #[allow(dead_code)] 255 #[repr(packed)] 256 #[derive(AsBytes)] 257 struct GicIts { 258 pub r#type: u8, 259 pub length: u8, 260 pub reserved0: u16, 261 pub translation_id: u32, 262 pub base_address: u64, 263 pub reserved1: u32, 264 } 265 266 #[cfg(target_arch = "aarch64")] 267 #[allow(dead_code)] 268 #[repr(packed)] 269 #[derive(AsBytes)] 270 struct ProcessorHierarchyNode { 271 pub r#type: u8, 272 pub length: u8, 273 pub reserved: u16, 274 pub flags: u32, 275 pub parent: u32, 276 pub acpi_processor_id: u32, 277 pub num_private_resources: u32, 278 } 279 280 #[allow(dead_code)] 281 #[repr(packed)] 282 #[derive(Default, AsBytes)] 283 struct InterruptSourceOverride { 284 pub r#type: u8, 285 pub length: u8, 286 pub bus: u8, 287 pub source: u8, 288 pub gsi: u32, 289 pub flags: u16, 290 } 291 292 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 293 macro_rules! round_up { 294 ($n:expr,$d:expr) => { 295 (($n / ($d + 1)) + 1) * $d 296 }; 297 } 298 299 /// A wrapper around creating and using a kvm-based VCPU. 300 pub struct Vcpu { 301 // The hypervisor abstracted CPU. 302 vcpu: Arc<dyn hypervisor::Vcpu>, 303 id: u8, 304 #[cfg(target_arch = "aarch64")] 305 mpidr: u64, 306 saved_state: Option<CpuState>, 307 } 308 309 impl Vcpu { 310 /// Constructs a new VCPU for `vm`. 311 /// 312 /// # Arguments 313 /// 314 /// * `id` - Represents the CPU number between [0, max vcpus). 315 /// * `vm` - The virtual machine this vcpu will get attached to. 316 /// * `vm_ops` - Optional object for exit handling. 317 pub fn new( 318 id: u8, 319 vm: &Arc<dyn hypervisor::Vm>, 320 vm_ops: Option<Arc<dyn VmOps>>, 321 ) -> Result<Self> { 322 let vcpu = vm 323 .create_vcpu(id, vm_ops) 324 .map_err(|e| Error::VcpuCreate(e.into()))?; 325 // Initially the cpuid per vCPU is the one supported by this VM. 326 Ok(Vcpu { 327 vcpu, 328 id, 329 #[cfg(target_arch = "aarch64")] 330 mpidr: 0, 331 saved_state: None, 332 }) 333 } 334 335 /// Configures a vcpu and should be called once per vcpu when created. 336 /// 337 /// # Arguments 338 /// 339 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 340 /// * `guest_memory` - Guest memory. 
341 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 342 pub fn configure( 343 &mut self, 344 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 345 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 346 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 347 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 348 ) -> Result<()> { 349 #[cfg(target_arch = "aarch64")] 350 { 351 self.init(vm)?; 352 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup) 353 .map_err(Error::VcpuConfiguration)?; 354 } 355 info!("Configuring vCPU: cpu_id = {}", self.id); 356 #[cfg(target_arch = "x86_64")] 357 arch::configure_vcpu(&self.vcpu, self.id, boot_setup, cpuid, kvm_hyperv) 358 .map_err(Error::VcpuConfiguration)?; 359 360 Ok(()) 361 } 362 363 /// Gets the MPIDR register value. 364 #[cfg(target_arch = "aarch64")] 365 pub fn get_mpidr(&self) -> u64 { 366 self.mpidr 367 } 368 369 /// Gets the saved vCPU state. 370 #[cfg(target_arch = "aarch64")] 371 pub fn get_saved_state(&self) -> Option<CpuState> { 372 self.saved_state.clone() 373 } 374 375 /// Initializes an aarch64 specific vcpu for booting Linux. 376 #[cfg(target_arch = "aarch64")] 377 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 378 let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); 379 380 // This reads back the kernel's preferred target type. 381 vm.get_preferred_target(&mut kvi) 382 .map_err(Error::VcpuArmPreferredTarget)?; 383 // We already checked that the capability is supported. 384 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; 385 if vm 386 .as_any() 387 .downcast_ref::<hypervisor::kvm::KvmVm>() 388 .unwrap() 389 .check_extension(Cap::ArmPmuV3) 390 { 391 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3; 392 } 393 // Non-boot cpus are powered off initially. 394 if self.id > 0 { 395 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; 396 } 397 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit) 398 } 399 400 /// Runs the VCPU until it exits, returning the reason. 401 /// 402 /// Note that the state of the VCPU and associated VM must be setup first for this to do 403 /// anything useful. 
404 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 405 self.vcpu.run() 406 } 407 } 408 409 impl Pausable for Vcpu {} 410 impl Snapshottable for Vcpu { 411 fn id(&self) -> String { 412 self.id.to_string() 413 } 414 415 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 416 let saved_state = self 417 .vcpu 418 .state() 419 .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?; 420 421 self.saved_state = Some(saved_state.clone()); 422 423 Ok(Snapshot::from_data(SnapshotData::new_from_state( 424 &saved_state, 425 )?)) 426 } 427 } 428 429 pub struct CpuManager { 430 hypervisor_type: HypervisorType, 431 config: CpusConfig, 432 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 433 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 434 #[cfg(target_arch = "x86_64")] 435 cpuid: Vec<CpuIdEntry>, 436 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 437 vm: Arc<dyn hypervisor::Vm>, 438 vcpus_kill_signalled: Arc<AtomicBool>, 439 vcpus_pause_signalled: Arc<AtomicBool>, 440 exit_evt: EventFd, 441 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 442 reset_evt: EventFd, 443 #[cfg(feature = "guest_debug")] 444 vm_debug_evt: EventFd, 445 vcpu_states: Vec<VcpuState>, 446 selected_cpu: u8, 447 vcpus: Vec<Arc<Mutex<Vcpu>>>, 448 seccomp_action: SeccompAction, 449 vm_ops: Arc<dyn VmOps>, 450 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 451 acpi_address: Option<GuestAddress>, 452 proximity_domain_per_cpu: BTreeMap<u8, u32>, 453 affinity: BTreeMap<u8, Vec<u8>>, 454 dynamic: bool, 455 } 456 457 const CPU_ENABLE_FLAG: usize = 0; 458 const CPU_INSERTING_FLAG: usize = 1; 459 const CPU_REMOVING_FLAG: usize = 2; 460 const CPU_EJECT_FLAG: usize = 3; 461 462 const CPU_STATUS_OFFSET: u64 = 4; 463 const CPU_SELECTION_OFFSET: u64 = 0; 464 465 impl BusDevice for CpuManager { 466 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 467 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
468 data.fill(0); 469 470 match offset { 471 CPU_SELECTION_OFFSET => { 472 data[0] = self.selected_cpu; 473 } 474 CPU_STATUS_OFFSET => { 475 if self.selected_cpu < self.max_vcpus() { 476 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 477 if state.active() { 478 data[0] |= 1 << CPU_ENABLE_FLAG; 479 } 480 if state.inserting { 481 data[0] |= 1 << CPU_INSERTING_FLAG; 482 } 483 if state.removing { 484 data[0] |= 1 << CPU_REMOVING_FLAG; 485 } 486 } else { 487 warn!("Out of range vCPU id: {}", self.selected_cpu); 488 } 489 } 490 _ => { 491 warn!( 492 "Unexpected offset for accessing CPU manager device: {:#}", 493 offset 494 ); 495 } 496 } 497 } 498 499 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 500 match offset { 501 CPU_SELECTION_OFFSET => { 502 self.selected_cpu = data[0]; 503 } 504 CPU_STATUS_OFFSET => { 505 if self.selected_cpu < self.max_vcpus() { 506 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 507 // The ACPI code writes back a 1 to acknowledge the insertion 508 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 509 && state.inserting 510 { 511 state.inserting = false; 512 } 513 // Ditto for removal 514 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 515 && state.removing 516 { 517 state.removing = false; 518 } 519 // Trigger removal of vCPU 520 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 521 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 522 error!("Error removing vCPU: {:?}", e); 523 } 524 } 525 } else { 526 warn!("Out of range vCPU id: {}", self.selected_cpu); 527 } 528 } 529 _ => { 530 warn!( 531 "Unexpected offset for accessing CPU manager device: {:#}", 532 offset 533 ); 534 } 535 } 536 None 537 } 538 } 539 540 #[derive(Default)] 541 struct VcpuState { 542 inserting: bool, 543 removing: bool, 544 handle: Option<thread::JoinHandle<()>>, 545 kill: Arc<AtomicBool>, 546 vcpu_run_interrupted: Arc<AtomicBool>, 547 } 548 549 impl VcpuState { 550 fn active(&self) -> bool { 551 self.handle.is_some() 552 } 553 554 fn signal_thread(&self) { 555 if let Some(handle) = self.handle.as_ref() { 556 loop { 557 // SAFETY: FFI call with correct arguments 558 unsafe { 559 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 560 } 561 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 562 break; 563 } else { 564 // This is more effective than thread::yield_now() at 565 // avoiding a priority inversion with the vCPU thread 566 thread::sleep(std::time::Duration::from_millis(1)); 567 } 568 } 569 } 570 } 571 572 fn join_thread(&mut self) -> Result<()> { 573 if let Some(handle) = self.handle.take() { 574 handle.join().map_err(Error::ThreadCleanup)? 
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}

impl CpuManager {
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vm_ops: Arc<dyn VmOps>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        numa_nodes: &NumaNodes,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
            return Err(Error::MaximumVcpusExceeded);
        }

        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
        let hypervisor_type = hypervisor.hypervisor_type();

        #[cfg(target_arch = "x86_64")]
        if config.features.amx {
            const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
            const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
            const XFEATURE_XTILEDATA: usize = 18;
            const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;

            // SAFETY: the syscall is only modifying kernel internal
            // data structures that the kernel is itself expected to safeguard.
            let amx_tile = unsafe {
                libc::syscall(
                    libc::SYS_arch_prctl,
                    ARCH_REQ_XCOMP_GUEST_PERM,
                    XFEATURE_XTILEDATA,
                )
            };

            if amx_tile != 0 {
                return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
            } else {
                let mask: usize = 0;
                // SAFETY: the mask being modified (not marked mutable as it is
                // modified in unsafe only which is permitted) isn't in use elsewhere.
633 let result = unsafe { 634 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 635 }; 636 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 637 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 638 } 639 } 640 } 641 642 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 643 let mut cpu_list = Vec::new(); 644 for (proximity_domain, numa_node) in numa_nodes.iter() { 645 for cpu in numa_node.cpus.iter() { 646 cpu_list.push((*cpu, *proximity_domain)) 647 } 648 } 649 cpu_list 650 } 651 .into_iter() 652 .collect(); 653 654 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 655 cpu_affinity 656 .iter() 657 .map(|a| (a.vcpu, a.host_cpus.clone())) 658 .collect() 659 } else { 660 BTreeMap::new() 661 }; 662 663 #[cfg(feature = "tdx")] 664 let dynamic = !tdx_enabled; 665 #[cfg(not(feature = "tdx"))] 666 let dynamic = true; 667 668 Ok(Arc::new(Mutex::new(CpuManager { 669 hypervisor_type, 670 config: config.clone(), 671 interrupt_controller: None, 672 #[cfg(target_arch = "x86_64")] 673 cpuid: Vec::new(), 674 vm, 675 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 676 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 677 vcpu_states, 678 exit_evt, 679 reset_evt, 680 #[cfg(feature = "guest_debug")] 681 vm_debug_evt, 682 selected_cpu: 0, 683 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 684 seccomp_action, 685 vm_ops, 686 acpi_address: None, 687 proximity_domain_per_cpu, 688 affinity, 689 dynamic, 690 }))) 691 } 692 693 #[cfg(target_arch = "x86_64")] 694 pub fn populate_cpuid( 695 &mut self, 696 memory_manager: &Arc<Mutex<MemoryManager>>, 697 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 698 #[cfg(feature = "tdx")] tdx_enabled: bool, 699 ) -> Result<()> { 700 let sgx_epc_sections = memory_manager 701 .lock() 702 .unwrap() 703 .sgx_epc_region() 704 .as_ref() 705 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 706 707 let topology = self.config.topology.clone().map_or_else( 708 || { 709 #[cfg(feature = "mshv")] 710 if matches!(hypervisor.hypervisor_type(), HypervisorType::Mshv) { 711 return Some((1, self.boot_vcpus(), 1)); 712 } 713 None 714 }, 715 |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)), 716 ); 717 718 self.cpuid = { 719 let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits); 720 arch::generate_common_cpuid( 721 hypervisor, 722 topology, 723 sgx_epc_sections, 724 phys_bits, 725 self.config.kvm_hyperv, 726 #[cfg(feature = "tdx")] 727 tdx_enabled, 728 ) 729 .map_err(Error::CommonCpuId)? 730 }; 731 732 Ok(()) 733 } 734 735 fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> { 736 info!("Creating vCPU: cpu_id = {}", cpu_id); 737 738 let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vm_ops.clone()))?; 739 740 if let Some(snapshot) = snapshot { 741 // AArch64 vCPUs should be initialized after created. 742 #[cfg(target_arch = "aarch64")] 743 vcpu.init(&self.vm)?; 744 745 let state: CpuState = snapshot.to_state().map_err(|e| { 746 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e)) 747 })?; 748 vcpu.vcpu 749 .set_state(&state) 750 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?; 751 752 vcpu.saved_state = Some(state); 753 } 754 755 let vcpu = Arc::new(Mutex::new(vcpu)); 756 757 // Adding vCPU to the CpuManager's vCPU list. 
758 self.vcpus.push(vcpu.clone()); 759 760 Ok(vcpu) 761 } 762 763 pub fn configure_vcpu( 764 &self, 765 vcpu: Arc<Mutex<Vcpu>>, 766 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 767 ) -> Result<()> { 768 let mut vcpu = vcpu.lock().unwrap(); 769 770 #[cfg(target_arch = "x86_64")] 771 assert!(!self.cpuid.is_empty()); 772 773 #[cfg(target_arch = "x86_64")] 774 vcpu.configure(boot_setup, self.cpuid.clone(), self.config.kvm_hyperv)?; 775 776 #[cfg(target_arch = "aarch64")] 777 vcpu.configure(&self.vm, boot_setup)?; 778 779 Ok(()) 780 } 781 782 /// Only create new vCPUs if there aren't any inactive ones to reuse 783 fn create_vcpus( 784 &mut self, 785 desired_vcpus: u8, 786 snapshot: Option<Snapshot>, 787 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 788 let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![]; 789 info!( 790 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 791 desired_vcpus, 792 self.config.max_vcpus, 793 self.vcpus.len(), 794 self.present_vcpus() 795 ); 796 797 if desired_vcpus > self.config.max_vcpus { 798 return Err(Error::DesiredVCpuCountExceedsMax); 799 } 800 801 // Only create vCPUs in excess of all the allocated vCPUs. 802 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 803 vcpus.push(self.create_vcpu( 804 cpu_id, 805 // TODO: The special format of the CPU id can be removed once 806 // ready to break live upgrade. 807 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()), 808 )?); 809 } 810 811 Ok(vcpus) 812 } 813 814 #[cfg(target_arch = "aarch64")] 815 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 816 for cpu in self.vcpus.iter() { 817 let cpu = cpu.lock().unwrap(); 818 // Check if PMU attr is available, if not, log the information. 819 if cpu.vcpu.has_pmu_support() { 820 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 821 } else { 822 debug!( 823 "PMU attribute is not supported in vCPU{}, skip PMU init!", 824 cpu.id 825 ); 826 return Ok(false); 827 } 828 } 829 830 Ok(true) 831 } 832 833 pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> { 834 self.vcpus.clone() 835 } 836 837 fn start_vcpu( 838 &mut self, 839 vcpu: Arc<Mutex<Vcpu>>, 840 vcpu_id: u8, 841 vcpu_thread_barrier: Arc<Barrier>, 842 inserting: bool, 843 ) -> Result<()> { 844 let reset_evt = self.reset_evt.try_clone().unwrap(); 845 let exit_evt = self.exit_evt.try_clone().unwrap(); 846 #[cfg(feature = "kvm")] 847 let hypervisor_type = self.hypervisor_type; 848 #[cfg(feature = "guest_debug")] 849 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 850 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 851 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 852 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 853 854 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 855 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 856 .vcpu_run_interrupted 857 .clone(); 858 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 859 860 // Prepare the CPU set the current vCPU is expected to run onto. 
861 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 862 // SAFETY: all zeros is a valid pattern 863 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 864 // SAFETY: FFI call, trivially safe 865 unsafe { libc::CPU_ZERO(&mut cpuset) }; 866 for host_cpu in host_cpus { 867 // SAFETY: FFI call, trivially safe 868 unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) }; 869 } 870 cpuset 871 }); 872 873 // Retrieve seccomp filter for vcpu thread 874 let vcpu_seccomp_filter = 875 get_seccomp_filter(&self.seccomp_action, Thread::Vcpu, self.hypervisor_type) 876 .map_err(Error::CreateSeccompFilter)?; 877 878 #[cfg(target_arch = "x86_64")] 879 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 880 881 info!("Starting vCPU: cpu_id = {}", vcpu_id); 882 883 let handle = Some( 884 thread::Builder::new() 885 .name(format!("vcpu{vcpu_id}")) 886 .spawn(move || { 887 // Schedule the thread to run on the expected CPU set 888 if let Some(cpuset) = cpuset.as_ref() { 889 // SAFETY: FFI call with correct arguments 890 let ret = unsafe { 891 libc::sched_setaffinity( 892 0, 893 std::mem::size_of::<libc::cpu_set_t>(), 894 cpuset as *const libc::cpu_set_t, 895 ) 896 }; 897 898 if ret != 0 { 899 error!( 900 "Failed scheduling the vCPU {} on the expected CPU set: {}", 901 vcpu_id, 902 io::Error::last_os_error() 903 ); 904 return; 905 } 906 } 907 908 // Apply seccomp filter for vcpu thread. 909 if !vcpu_seccomp_filter.is_empty() { 910 if let Err(e) = 911 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 912 { 913 error!("Error applying seccomp filter: {:?}", e); 914 return; 915 } 916 } 917 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 918 // This uses an async signal safe handler to kill the vcpu handles. 919 register_signal_handler(SIGRTMIN(), handle_signal) 920 .expect("Failed to register vcpu signal handler"); 921 // Block until all CPUs are ready. 922 vcpu_thread_barrier.wait(); 923 924 std::panic::catch_unwind(move || { 925 loop { 926 // If we are being told to pause, we park the thread 927 // until the pause boolean is toggled. 928 // The resume operation is responsible for toggling 929 // the boolean and unpark the thread. 930 // We enter a loop because park() could spuriously 931 // return. We will then park() again unless the 932 // pause boolean has been toggled. 933 934 // Need to use Ordering::SeqCst as we have multiple 935 // loads and stores to different atomics and we need 936 // to see them in a consistent order in all threads 937 938 if vcpu_pause_signalled.load(Ordering::SeqCst) { 939 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are 940 // completed by returning to KVM_RUN. From the kernel docs: 941 // 942 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, 943 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding 944 // operations are complete (and guest state is consistent) only after userspace 945 // has re-entered the kernel with KVM_RUN. The kernel side will first finish 946 // incomplete operations and then check for pending signals. 947 // The pending state of the operation is not preserved in state which is 948 // visible to userspace, thus userspace should ensure that the operation is 949 // completed before performing a live migration. 
Userspace can re-enter the 950 // guest with an unmasked signal pending or with the immediate_exit field set 951 // to complete pending operations without allowing any further instructions 952 // to be executed. 953 954 #[cfg(feature = "kvm")] 955 if matches!(hypervisor_type, HypervisorType::Kvm) { 956 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); 957 if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { 958 error!("Unexpected VM exit on \"immediate_exit\" run"); 959 break; 960 } 961 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); 962 } 963 964 vcpu_run_interrupted.store(true, Ordering::SeqCst); 965 while vcpu_pause_signalled.load(Ordering::SeqCst) { 966 thread::park(); 967 } 968 vcpu_run_interrupted.store(false, Ordering::SeqCst); 969 } 970 971 // We've been told to terminate 972 if vcpu_kill_signalled.load(Ordering::SeqCst) 973 || vcpu_kill.load(Ordering::SeqCst) 974 { 975 vcpu_run_interrupted.store(true, Ordering::SeqCst); 976 break; 977 } 978 979 #[cfg(feature = "tdx")] 980 let mut vcpu = vcpu.lock().unwrap(); 981 #[cfg(not(feature = "tdx"))] 982 let vcpu = vcpu.lock().unwrap(); 983 // vcpu.run() returns false on a triple-fault so trigger a reset 984 match vcpu.run() { 985 Ok(run) => match run { 986 #[cfg(feature = "kvm")] 987 VmExit::Debug => { 988 info!("VmExit::Debug"); 989 #[cfg(feature = "guest_debug")] 990 { 991 vcpu_pause_signalled.store(true, Ordering::SeqCst); 992 let raw_tid = get_raw_tid(vcpu_id as usize); 993 vm_debug_evt.write(raw_tid as u64).unwrap(); 994 } 995 } 996 #[cfg(target_arch = "x86_64")] 997 VmExit::IoapicEoi(vector) => { 998 if let Some(interrupt_controller) = 999 &interrupt_controller_clone 1000 { 1001 interrupt_controller 1002 .lock() 1003 .unwrap() 1004 .end_of_interrupt(vector); 1005 } 1006 } 1007 VmExit::Ignore => {} 1008 VmExit::Hyperv => {} 1009 VmExit::Reset => { 1010 info!("VmExit::Reset"); 1011 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1012 reset_evt.write(1).unwrap(); 1013 break; 1014 } 1015 VmExit::Shutdown => { 1016 info!("VmExit::Shutdown"); 1017 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1018 exit_evt.write(1).unwrap(); 1019 break; 1020 } 1021 #[cfg(feature = "tdx")] 1022 VmExit::Tdx => { 1023 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1024 match vcpu.get_tdx_exit_details() { 1025 Ok(details) => match details { 1026 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1027 TdxExitDetails::SetupEventNotifyInterrupt => { 1028 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1029 } 1030 }, 1031 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1032 } 1033 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1034 } else { 1035 // We should never reach this code as 1036 // this means the design from the code 1037 // is wrong. 
                                            unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
                                        }
                                    }
                                    _ => {
                                        error!(
                                            "VCPU generated error: {:?}",
                                            Error::UnexpectedVmExit
                                        );
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        exit_evt.write(1).unwrap();
                                        break;
                                    }
                                },

                                Err(e) => {
                                    error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
                                    vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                    exit_evt.write(1).unwrap();
                                    break;
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }
                        }
                    })
                    .or_else(|_| {
                        panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
                        error!("vCPU thread panicked");
                        panic_exit_evt.write(1)
                    })
                    .ok();
                })
                .map_err(Error::VcpuSpawn)?,
        );

        // On hot plug calls into this function entry_point is None. It is for
        // those hotplug CPU additions that we need to set the inserting flag.
        self.vcpu_states[usize::from(vcpu_id)].handle = handle;
        self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;

        Ok(())
    }

    /// Start up as many vCPU threads as needed to reach `desired_vcpus`
    fn activate_vcpus(
        &mut self,
        desired_vcpus: u8,
        inserting: bool,
        paused: Option<bool>,
    ) -> Result<()> {
        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        let vcpu_thread_barrier = Arc::new(Barrier::new(
            (desired_vcpus - self.present_vcpus() + 1) as usize,
        ));

        if let Some(paused) = paused {
            self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
        }

        info!(
            "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
            desired_vcpus,
            self.vcpus.len(),
            self.present_vcpus(),
            self.vcpus_pause_signalled.load(Ordering::SeqCst)
        );

        // This reuses any inactive vCPUs as well as any that were newly created
        for vcpu_id in self.present_vcpus()..desired_vcpus {
            let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
            self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
        }

        // Unblock all CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal, actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
        }
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" so that it can be reused
        state.kill.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(
        &mut self,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        trace_scoped!("create_boot_vcpus");

        self.create_vcpus(self.boot_vcpus(), snapshot)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
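    //
    // A sketch of the expected call order, inferred from the methods in this file
    // rather than stated by the original comments: create_boot_vcpus() builds the
    // vCPU objects, configure_vcpu() is called for each of them with the boot
    // entry point, and start_boot_vcpus() then spawns the vCPU threads through
    // activate_vcpus(), which uses a shared thread barrier to release them together.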
1156 pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> { 1157 self.activate_vcpus(self.boot_vcpus(), false, Some(paused)) 1158 } 1159 1160 pub fn start_restored_vcpus(&mut self) -> Result<()> { 1161 self.activate_vcpus(self.vcpus.len() as u8, false, Some(true)) 1162 .map_err(|e| { 1163 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) 1164 })?; 1165 1166 Ok(()) 1167 } 1168 1169 pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> { 1170 if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal { 1171 return Ok(false); 1172 } 1173 1174 if !self.dynamic { 1175 return Ok(false); 1176 } 1177 1178 match desired_vcpus.cmp(&self.present_vcpus()) { 1179 cmp::Ordering::Greater => { 1180 let vcpus = self.create_vcpus(desired_vcpus, None)?; 1181 for vcpu in vcpus { 1182 self.configure_vcpu(vcpu, None)? 1183 } 1184 self.activate_vcpus(desired_vcpus, true, None)?; 1185 Ok(true) 1186 } 1187 cmp::Ordering::Less => { 1188 self.mark_vcpus_for_removal(desired_vcpus); 1189 Ok(true) 1190 } 1191 _ => Ok(false), 1192 } 1193 } 1194 1195 pub fn shutdown(&mut self) -> Result<()> { 1196 // Tell the vCPUs to stop themselves next time they go through the loop 1197 self.vcpus_kill_signalled.store(true, Ordering::SeqCst); 1198 1199 // Toggle the vCPUs pause boolean 1200 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 1201 1202 // Unpark all the VCPU threads. 1203 for state in self.vcpu_states.iter() { 1204 state.unpark_thread(); 1205 } 1206 1207 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 1208 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 1209 // above. 1210 for state in self.vcpu_states.iter() { 1211 state.signal_thread(); 1212 } 1213 1214 // Wait for all the threads to finish. This removes the state from the vector. 1215 for mut state in self.vcpu_states.drain(..) { 1216 state.join_thread()?; 1217 } 1218 1219 Ok(()) 1220 } 1221 1222 #[cfg(feature = "tdx")] 1223 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1224 for vcpu in &self.vcpus { 1225 vcpu.lock() 1226 .unwrap() 1227 .vcpu 1228 .tdx_init(hob_address) 1229 .map_err(Error::InitializeTdx)?; 1230 } 1231 Ok(()) 1232 } 1233 1234 pub fn boot_vcpus(&self) -> u8 { 1235 self.config.boot_vcpus 1236 } 1237 1238 pub fn max_vcpus(&self) -> u8 { 1239 self.config.max_vcpus 1240 } 1241 1242 #[cfg(target_arch = "x86_64")] 1243 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1244 assert!(!self.cpuid.is_empty()); 1245 self.cpuid.clone() 1246 } 1247 1248 fn present_vcpus(&self) -> u8 { 1249 self.vcpu_states 1250 .iter() 1251 .fold(0, |acc, state| acc + state.active() as u8) 1252 } 1253 1254 #[cfg(target_arch = "aarch64")] 1255 pub fn get_mpidrs(&self) -> Vec<u64> { 1256 self.vcpus 1257 .iter() 1258 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1259 .collect() 1260 } 1261 1262 #[cfg(target_arch = "aarch64")] 1263 pub fn get_saved_states(&self) -> Vec<CpuState> { 1264 self.vcpus 1265 .iter() 1266 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1267 .collect() 1268 } 1269 1270 #[cfg(target_arch = "aarch64")] 1271 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1272 self.config 1273 .topology 1274 .clone() 1275 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1276 } 1277 1278 pub fn create_madt(&self) -> Sdt { 1279 use crate::acpi; 1280 // This is also checked in the commandline parsing. 
1281 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1282 1283 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1284 #[cfg(target_arch = "x86_64")] 1285 { 1286 madt.write(36, arch::layout::APIC_START.0); 1287 1288 for cpu in 0..self.config.max_vcpus { 1289 let lapic = LocalApic { 1290 r#type: acpi::ACPI_APIC_PROCESSOR, 1291 length: 8, 1292 processor_id: cpu, 1293 apic_id: cpu, 1294 flags: if cpu < self.config.boot_vcpus { 1295 1 << MADT_CPU_ENABLE_FLAG 1296 } else { 1297 0 1298 } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG, 1299 }; 1300 madt.append(lapic); 1301 } 1302 1303 madt.append(Ioapic { 1304 r#type: acpi::ACPI_APIC_IO, 1305 length: 12, 1306 ioapic_id: 0, 1307 apic_address: arch::layout::IOAPIC_START.0 as u32, 1308 gsi_base: 0, 1309 ..Default::default() 1310 }); 1311 1312 madt.append(InterruptSourceOverride { 1313 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1314 length: 10, 1315 bus: 0, 1316 source: 4, 1317 gsi: 4, 1318 flags: 0, 1319 }); 1320 } 1321 1322 #[cfg(target_arch = "aarch64")] 1323 { 1324 /* Notes: 1325 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1326 */ 1327 1328 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 1329 for cpu in 0..self.config.boot_vcpus { 1330 let vcpu = &self.vcpus[cpu as usize]; 1331 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1332 /* ARMv8 MPIDR format: 1333 Bits [63:40] Must be zero 1334 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1335 Bits [31:24] Must be zero 1336 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1337 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1338 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1339 */ 1340 let mpidr_mask = 0xff_00ff_ffff; 1341 let gicc = GicC { 1342 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1343 length: 80, 1344 reserved0: 0, 1345 cpu_interface_number: cpu as u32, 1346 uid: cpu as u32, 1347 flags: 1, 1348 parking_version: 0, 1349 performance_interrupt: 0, 1350 parked_address: 0, 1351 base_address: 0, 1352 gicv_base_address: 0, 1353 gich_base_address: 0, 1354 vgic_interrupt: 0, 1355 gicr_base_address: 0, 1356 mpidr: mpidr & mpidr_mask, 1357 proc_power_effi_class: 0, 1358 reserved1: 0, 1359 spe_overflow_interrupt: 0, 1360 }; 1361 1362 madt.append(gicc); 1363 } 1364 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1365 1366 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1367 let gicd = GicD { 1368 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1369 length: 24, 1370 reserved0: 0, 1371 gic_id: 0, 1372 base_address: vgic_config.dist_addr, 1373 global_irq_base: 0, 1374 version: 3, 1375 reserved1: [0; 3], 1376 }; 1377 madt.append(gicd); 1378 1379 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1380 let gicr = GicR { 1381 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1382 length: 16, 1383 reserved: 0, 1384 base_address: vgic_config.redists_addr, 1385 range_length: vgic_config.redists_size as u32, 1386 }; 1387 madt.append(gicr); 1388 1389 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 
1390 let gicits = GicIts { 1391 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1392 length: 20, 1393 reserved0: 0, 1394 translation_id: 0, 1395 base_address: vgic_config.msi_addr, 1396 reserved1: 0, 1397 }; 1398 madt.append(gicits); 1399 1400 madt.update_checksum(); 1401 } 1402 1403 madt 1404 } 1405 1406 #[cfg(target_arch = "aarch64")] 1407 pub fn create_pptt(&self) -> Sdt { 1408 let pptt_start = 0; 1409 let mut cpus = 0; 1410 let mut uid = 0; 1411 // If topology is not specified, the default setting is: 1412 // 1 package, multiple cores, 1 thread per core 1413 // This is also the behavior when PPTT is missing. 1414 let (threads_per_core, cores_per_package, packages) = 1415 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1416 1417 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1418 1419 for cluster_idx in 0..packages { 1420 if cpus < self.config.boot_vcpus as usize { 1421 let cluster_offset = pptt.len() - pptt_start; 1422 let cluster_hierarchy_node = ProcessorHierarchyNode { 1423 r#type: 0, 1424 length: 20, 1425 reserved: 0, 1426 flags: 0x2, 1427 parent: 0, 1428 acpi_processor_id: cluster_idx as u32, 1429 num_private_resources: 0, 1430 }; 1431 pptt.append(cluster_hierarchy_node); 1432 1433 for core_idx in 0..cores_per_package { 1434 let core_offset = pptt.len() - pptt_start; 1435 1436 if threads_per_core > 1 { 1437 let core_hierarchy_node = ProcessorHierarchyNode { 1438 r#type: 0, 1439 length: 20, 1440 reserved: 0, 1441 flags: 0x2, 1442 parent: cluster_offset as u32, 1443 acpi_processor_id: core_idx as u32, 1444 num_private_resources: 0, 1445 }; 1446 pptt.append(core_hierarchy_node); 1447 1448 for _thread_idx in 0..threads_per_core { 1449 let thread_hierarchy_node = ProcessorHierarchyNode { 1450 r#type: 0, 1451 length: 20, 1452 reserved: 0, 1453 flags: 0xE, 1454 parent: core_offset as u32, 1455 acpi_processor_id: uid as u32, 1456 num_private_resources: 0, 1457 }; 1458 pptt.append(thread_hierarchy_node); 1459 uid += 1; 1460 } 1461 } else { 1462 let thread_hierarchy_node = ProcessorHierarchyNode { 1463 r#type: 0, 1464 length: 20, 1465 reserved: 0, 1466 flags: 0xA, 1467 parent: cluster_offset as u32, 1468 acpi_processor_id: uid as u32, 1469 num_private_resources: 0, 1470 }; 1471 pptt.append(thread_hierarchy_node); 1472 uid += 1; 1473 } 1474 } 1475 cpus += (cores_per_package * threads_per_core) as usize; 1476 } 1477 } 1478 1479 pptt.update_checksum(); 1480 pptt 1481 } 1482 1483 #[cfg(feature = "guest_debug")] 1484 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1485 self.vcpus[usize::from(cpu_id)] 1486 .lock() 1487 .unwrap() 1488 .vcpu 1489 .get_regs() 1490 .map_err(Error::CpuDebug) 1491 } 1492 1493 #[cfg(feature = "guest_debug")] 1494 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1495 self.vcpus[usize::from(cpu_id)] 1496 .lock() 1497 .unwrap() 1498 .vcpu 1499 .set_regs(regs) 1500 .map_err(Error::CpuDebug) 1501 } 1502 1503 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1504 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1505 self.vcpus[usize::from(cpu_id)] 1506 .lock() 1507 .unwrap() 1508 .vcpu 1509 .get_sregs() 1510 .map_err(Error::CpuDebug) 1511 } 1512 1513 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1514 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1515 self.vcpus[usize::from(cpu_id)] 1516 .lock() 1517 .unwrap() 1518 .vcpu 1519 .set_sregs(sregs) 1520 .map_err(Error::CpuDebug) 1521 } 1522 1523 #[cfg(all(target_arch = "x86_64", feature = 
"guest_debug"))] 1524 fn translate_gva( 1525 &self, 1526 _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1527 cpu_id: u8, 1528 gva: u64, 1529 ) -> Result<u64> { 1530 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1531 .lock() 1532 .unwrap() 1533 .vcpu 1534 .translate_gva(gva, /* flags: unused */ 0) 1535 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1536 Ok(gpa) 1537 } 1538 1539 /// 1540 /// On AArch64, `translate_gva` API is not provided by KVM. We implemented 1541 /// it in VMM by walking through translation tables. 1542 /// 1543 /// Address translation is big topic, here we only focus the scenario that 1544 /// happens in VMM while debugging kernel. This `translate_gva` 1545 /// implementation is restricted to: 1546 /// - Exception Level 1 1547 /// - Translate high address range only (kernel space) 1548 /// 1549 /// This implementation supports following Arm-v8a features related to 1550 /// address translation: 1551 /// - FEAT_LPA 1552 /// - FEAT_LVA 1553 /// - FEAT_LPA2 1554 /// 1555 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 1556 fn translate_gva( 1557 &self, 1558 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1559 cpu_id: u8, 1560 gva: u64, 1561 ) -> Result<u64> { 1562 let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)] 1563 .lock() 1564 .unwrap() 1565 .vcpu 1566 .get_sys_reg(regs::TCR_EL1) 1567 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1568 let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)] 1569 .lock() 1570 .unwrap() 1571 .vcpu 1572 .get_sys_reg(regs::TTBR1_EL1) 1573 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1574 let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)] 1575 .lock() 1576 .unwrap() 1577 .vcpu 1578 .get_sys_reg(regs::ID_AA64MMFR0_EL1) 1579 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1580 1581 // Bit 55 of the VA determines the range, high (0xFFFxxx...) 1582 // or low (0x000xxx...). 1583 let high_range = extract_bits_64!(gva, 55, 1); 1584 if high_range == 0 { 1585 info!("VA (0x{:x}) range is not supported!", gva); 1586 return Ok(gva); 1587 } 1588 1589 // High range size offset 1590 let tsz = extract_bits_64!(tcr_el1, 16, 6); 1591 // Granule size 1592 let tg = extract_bits_64!(tcr_el1, 30, 2); 1593 // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2 1594 let ds = extract_bits_64!(tcr_el1, 59, 1); 1595 1596 if tsz == 0 { 1597 info!("VA translation is not ready!"); 1598 return Ok(gva); 1599 } 1600 1601 // VA size is determined by TCR_BL1.T1SZ 1602 let va_size = 64 - tsz; 1603 // Number of bits in VA consumed in each level of translation 1604 let stride = match tg { 1605 3 => 13, // 64KB granule size 1606 1 => 11, // 16KB granule size 1607 _ => 9, // 4KB, default 1608 }; 1609 // Starting level of walking 1610 let mut level = 4 - (va_size - 4) / stride; 1611 1612 // PA or IPA size is determined 1613 let tcr_ips = extract_bits_64!(tcr_el1, 32, 3); 1614 #[allow(clippy::identity_op)] 1615 let pa_range = extract_bits_64!(id_aa64mmfr0_el1, 0, 4); 1616 // The IPA size in TCR_BL1 and PA Range in ID_AA64MMFR0_EL1 should match. 1617 // To be safe, we use the minimum value if they are different. 
        let pa_range = std::cmp::min(tcr_ips, pa_range);
        // PA size in bits
        let pa_size = match pa_range {
            0 => 32,
            1 => 36,
            2 => 40,
            3 => 42,
            4 => 44,
            5 => 48,
            6 => 52,
            _ => {
                return Err(Error::TranslateVirtualAddress(anyhow!(format!(
                    "PA range not supported {pa_range}"
                ))))
            }
        };

        let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
        let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
        // If FEAT_LPA2 is present, the translation table descriptor holds
        // 50 bits of the table address of next level.
        // Otherwise, it is 48 bits.
        let descaddrmask = if ds == 1 {
            !0u64 >> (64 - 50) // mask with 50 least significant bits
        } else {
            !0u64 >> (64 - 48) // mask with 48 least significant bits
        };
        let descaddrmask = descaddrmask & !indexmask_grainsize;

        // Translation table base address
        #[allow(clippy::identity_op)]
        let mut descaddr: u64 = extract_bits_64!(ttbr1_el1, 0, 48);
        // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
        // address bits [48:51] come from TTBR1_EL1 bits [2:5].
        if pa_size == 52 {
            descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
        }

        // Loop through tables of each level
        loop {
            // Table offset for current level
            let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
            descaddr |= table_offset;
            descaddr &= !7u64;

            let mut buf = [0; 8];
            guest_memory
                .memory()
                .read(&mut buf, GuestAddress(descaddr))
                .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
            let descriptor = u64::from_le_bytes(buf);

            descaddr = descriptor & descaddrmask;
            // In the case of FEAT_LPA, the next-level translation table address
            // bits [48:51] come from bits [12:15] of the current descriptor.
            // For FEAT_LPA2, the next-level translation table address
            // bits [50:51] come from bits [8:9] of the current descriptor, and
            // bits [48:49] come from bits [48:49] of the descriptor which was
            // handled previously.
            if pa_size == 52 {
                if ds == 1 {
                    // FEAT_LPA2
                    descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
                } else {
                    // FEAT_LPA
                    descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
                }
            }

            if (descriptor & 2) != 0 && (level < 3) {
                // This is a table entry. Go down to next level.
1689 level += 1; 1690 indexmask = indexmask_grainsize; 1691 continue; 1692 } 1693 1694 break; 1695 } 1696 1697 // We have reached either: 1698 // - a page entry at level 3 or 1699 // - a block entry at level 1 or 2 1700 let page_size = 1u64 << ((stride * (4 - level)) + 3); 1701 descaddr &= !(page_size - 1); 1702 descaddr |= gva & (page_size - 1); 1703 1704 Ok(descaddr) 1705 } 1706 1707 pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) { 1708 self.acpi_address = Some(acpi_address); 1709 } 1710 1711 pub(crate) fn set_interrupt_controller( 1712 &mut self, 1713 interrupt_controller: Arc<Mutex<dyn InterruptController>>, 1714 ) { 1715 self.interrupt_controller = Some(interrupt_controller); 1716 } 1717 } 1718 1719 struct Cpu { 1720 cpu_id: u8, 1721 proximity_domain: u32, 1722 dynamic: bool, 1723 } 1724 1725 #[cfg(target_arch = "x86_64")] 1726 const MADT_CPU_ENABLE_FLAG: usize = 0; 1727 1728 #[cfg(target_arch = "x86_64")] 1729 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; 1730 1731 impl Cpu { 1732 #[cfg(target_arch = "x86_64")] 1733 fn generate_mat(&self) -> Vec<u8> { 1734 let lapic = LocalApic { 1735 r#type: 0, 1736 length: 8, 1737 processor_id: self.cpu_id, 1738 apic_id: self.cpu_id, 1739 flags: 1 << MADT_CPU_ENABLE_FLAG, 1740 }; 1741 1742 let mut mat_data: Vec<u8> = Vec::new(); 1743 mat_data.resize(std::mem::size_of_val(&lapic), 0); 1744 // SAFETY: mat_data is large enough to hold lapic 1745 unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic }; 1746 1747 mat_data 1748 } 1749 } 1750 1751 impl Aml for Cpu { 1752 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1753 #[cfg(target_arch = "x86_64")] 1754 let mat_data: Vec<u8> = self.generate_mat(); 1755 #[allow(clippy::if_same_then_else)] 1756 if self.dynamic { 1757 aml::Device::new( 1758 format!("C{:03}", self.cpu_id).as_str().into(), 1759 vec![ 1760 &aml::Name::new("_HID".into(), &"ACPI0007"), 1761 &aml::Name::new("_UID".into(), &self.cpu_id), 1762 // Currently, AArch64 cannot support following fields. 1763 /* 1764 _STA return value: 1765 Bit [0] – Set if the device is present. 1766 Bit [1] – Set if the device is enabled and decoding its resources. 1767 Bit [2] – Set if the device should be shown in the UI. 1768 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1769 Bit [4] – Set if the battery is present. 1770 Bits [31:5] – Reserved (must be cleared). 
                */
                #[cfg(target_arch = "x86_64")]
                &aml::Method::new(
                    "_STA".into(),
                    0,
                    false,
                    // Call into CSTA method which will interrogate device
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "CSTA".into(),
                        vec![&self.cpu_id],
                    ))],
                ),
                &aml::Method::new(
                    "_PXM".into(),
                    0,
                    false,
                    vec![&aml::Return::new(&self.proximity_domain)],
                ),
                // The Linux kernel expects every CPU device to have a _MAT entry
                // containing the LAPIC for this processor with the enabled bit set
                // even if it is disabled in the MADT (non-boot CPU)
                #[cfg(target_arch = "x86_64")]
                &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
                // Trigger CPU ejection
                #[cfg(target_arch = "x86_64")]
                &aml::Method::new(
                    "_EJ0".into(),
                    1,
                    false,
                    // Call into CEJ0 method which will actually eject device
                    vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
                ),
            ],
        )
        .to_aml_bytes(sink);
        } else {
            aml::Device::new(
                format!("C{:03}", self.cpu_id).as_str().into(),
                vec![
                    &aml::Name::new("_HID".into(), &"ACPI0007"),
                    &aml::Name::new("_UID".into(), &self.cpu_id),
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_STA".into(),
                        0,
                        false,
                        // Mark CPU present see CSTA implementation
                        vec![&aml::Return::new(&0xfu8)],
                    ),
                    &aml::Method::new(
                        "_PXM".into(),
                        0,
                        false,
                        vec![&aml::Return::new(&self.proximity_domain)],
                    ),
                    // The Linux kernel expects every CPU device to have a _MAT entry
                    // containing the LAPIC for this processor with the enabled bit set
                    // even if it is disabled in the MADT (non-boot CPU)
                    #[cfg(target_arch = "x86_64")]
                    &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
                ],
            )
            .to_aml_bytes(sink);
        }
    }
}

struct CpuNotify {
    cpu_id: u8,
}

impl Aml for CpuNotify {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        let object = aml::Path::new(&format!("C{:03}", self.cpu_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .to_aml_bytes(sink)
    }
}

struct CpuMethods {
    max_vcpus: u8,
    dynamic: bool,
}

impl Aml for CpuMethods {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        if self.dynamic {
            // CPU status method
            aml::Method::new(
                "CSTA".into(),
                1,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                    &aml::If::new(
                        &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
                        vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                    // Return 0 or 0xf
                    &aml::Return::new(&aml::Local(0)),
                ],
            )
            .to_aml_bytes(sink);

            let mut cpu_notifies = Vec::new();
            for cpu_id in 0..self.max_vcpus {
                cpu_notifies.push(CpuNotify { cpu_id });
            }

            let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
            for cpu_id in
0..self.max_vcpus { 1892 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 1893 } 1894 1895 aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink); 1896 1897 aml::Method::new( 1898 "CEJ0".into(), 1899 1, 1900 true, 1901 vec![ 1902 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1903 // Write CPU number (in first argument) to I/O port via field 1904 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 1905 // Set CEJ0 bit 1906 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 1907 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1908 ], 1909 ) 1910 .to_aml_bytes(sink); 1911 1912 aml::Method::new( 1913 "CSCN".into(), 1914 0, 1915 true, 1916 vec![ 1917 // Take lock defined above 1918 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1919 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1920 &aml::While::new( 1921 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 1922 vec![ 1923 // Write CPU number (in first argument) to I/O port via field 1924 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 1925 // Check if CINS bit is set 1926 &aml::If::new( 1927 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 1928 // Notify device if it is 1929 vec![ 1930 &aml::MethodCall::new( 1931 "CTFY".into(), 1932 vec![&aml::Local(0), &aml::ONE], 1933 ), 1934 // Reset CINS bit 1935 &aml::Store::new( 1936 &aml::Path::new("\\_SB_.PRES.CINS"), 1937 &aml::ONE, 1938 ), 1939 ], 1940 ), 1941 // Check if CRMV bit is set 1942 &aml::If::new( 1943 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 1944 // Notify device if it is (with the eject constant 0x3) 1945 vec![ 1946 &aml::MethodCall::new( 1947 "CTFY".into(), 1948 vec![&aml::Local(0), &3u8], 1949 ), 1950 // Reset CRMV bit 1951 &aml::Store::new( 1952 &aml::Path::new("\\_SB_.PRES.CRMV"), 1953 &aml::ONE, 1954 ), 1955 ], 1956 ), 1957 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 1958 ], 1959 ), 1960 // Release lock 1961 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1962 ], 1963 ) 1964 .to_aml_bytes(sink) 1965 } else { 1966 aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink) 1967 } 1968 } 1969 } 1970 1971 impl Aml for CpuManager { 1972 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1973 #[cfg(target_arch = "x86_64")] 1974 if let Some(acpi_address) = self.acpi_address { 1975 // CPU hotplug controller 1976 aml::Device::new( 1977 "_SB_.PRES".into(), 1978 vec![ 1979 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 1980 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 1981 // Mutex to protect concurrent access as we write to choose CPU and then read back status 1982 &aml::Mutex::new("CPLK".into(), 0), 1983 &aml::Name::new( 1984 "_CRS".into(), 1985 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 1986 aml::AddressSpaceCachable::NotCacheable, 1987 true, 1988 acpi_address.0, 1989 acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 1990 None, 1991 )]), 1992 ), 1993 // OpRegion and Fields map MMIO range into individual field values 1994 &aml::OpRegion::new( 1995 "PRST".into(), 1996 aml::OpRegionSpace::SystemMemory, 1997 &(acpi_address.0 as usize), 1998 &CPU_MANAGER_ACPI_SIZE, 1999 ), 2000 &aml::Field::new( 2001 "PRST".into(), 2002 aml::FieldAccessType::Byte, 2003 aml::FieldLockRule::NoLock, 2004 aml::FieldUpdateRule::WriteAsZeroes, 2005 vec![ 2006 aml::FieldEntry::Reserved(32), 2007 aml::FieldEntry::Named(*b"CPEN", 1), 2008 aml::FieldEntry::Named(*b"CINS", 1), 2009 
aml::FieldEntry::Named(*b"CRMV", 1), 2010 aml::FieldEntry::Named(*b"CEJ0", 1), 2011 aml::FieldEntry::Reserved(4), 2012 aml::FieldEntry::Named(*b"CCMD", 8), 2013 ], 2014 ), 2015 &aml::Field::new( 2016 "PRST".into(), 2017 aml::FieldAccessType::DWord, 2018 aml::FieldLockRule::NoLock, 2019 aml::FieldUpdateRule::Preserve, 2020 vec![ 2021 aml::FieldEntry::Named(*b"CSEL", 32), 2022 aml::FieldEntry::Reserved(32), 2023 aml::FieldEntry::Named(*b"CDAT", 32), 2024 ], 2025 ), 2026 ], 2027 ) 2028 .to_aml_bytes(sink); 2029 } 2030 2031 // CPU devices 2032 let hid = aml::Name::new("_HID".into(), &"ACPI0010"); 2033 let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05")); 2034 // Bundle methods together under a common object 2035 let methods = CpuMethods { 2036 max_vcpus: self.config.max_vcpus, 2037 dynamic: self.dynamic, 2038 }; 2039 let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods]; 2040 2041 let mut cpu_devices = Vec::new(); 2042 for cpu_id in 0..self.config.max_vcpus { 2043 let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0); 2044 let cpu_device = Cpu { 2045 cpu_id, 2046 proximity_domain, 2047 dynamic: self.dynamic, 2048 }; 2049 2050 cpu_devices.push(cpu_device); 2051 } 2052 2053 for cpu_device in cpu_devices.iter() { 2054 cpu_data_inner.push(cpu_device); 2055 } 2056 2057 aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink) 2058 } 2059 } 2060 2061 impl Pausable for CpuManager { 2062 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2063 // Tell the vCPUs to pause themselves next time they exit 2064 self.vcpus_pause_signalled.store(true, Ordering::SeqCst); 2065 2066 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 2067 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 2068 // above. 2069 for state in self.vcpu_states.iter() { 2070 state.signal_thread(); 2071 } 2072 2073 for vcpu in self.vcpus.iter() { 2074 let mut vcpu = vcpu.lock().unwrap(); 2075 vcpu.pause()?; 2076 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2077 if !self.config.kvm_hyperv { 2078 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { 2079 MigratableError::Pause(anyhow!( 2080 "Could not notify guest it has been paused {:?}", 2081 e 2082 )) 2083 })?; 2084 } 2085 } 2086 2087 Ok(()) 2088 } 2089 2090 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2091 for vcpu in self.vcpus.iter() { 2092 vcpu.lock().unwrap().resume()?; 2093 } 2094 2095 // Toggle the vCPUs pause boolean 2096 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 2097 2098 // Unpark all the VCPU threads. 2099 // Once unparked, the next thing they will do is checking for the pause 2100 // boolean. Since it'll be set to false, they will exit their pause loop 2101 // and go back to vmx root. 2102 for state in self.vcpu_states.iter() { 2103 state.unpark_thread(); 2104 } 2105 Ok(()) 2106 } 2107 } 2108 2109 impl Snapshottable for CpuManager { 2110 fn id(&self) -> String { 2111 CPU_MANAGER_SNAPSHOT_ID.to_string() 2112 } 2113 2114 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2115 let mut cpu_manager_snapshot = Snapshot::default(); 2116 2117 // The CpuManager snapshot is a collection of all vCPUs snapshots. 
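// Each vCPU's state is stored under that vCPU's own snapshot id, so the
// restore path can later hand every vCPU back its matching entry.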
2118 for vcpu in &self.vcpus { 2119 let mut vcpu = vcpu.lock().unwrap(); 2120 cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?); 2121 } 2122 2123 Ok(cpu_manager_snapshot) 2124 } 2125 } 2126 2127 impl Transportable for CpuManager {} 2128 impl Migratable for CpuManager {} 2129 2130 #[cfg(feature = "guest_debug")] 2131 impl Debuggable for CpuManager { 2132 #[cfg(feature = "kvm")] 2133 fn set_guest_debug( 2134 &self, 2135 cpu_id: usize, 2136 addrs: &[GuestAddress], 2137 singlestep: bool, 2138 ) -> std::result::Result<(), DebuggableError> { 2139 self.vcpus[cpu_id] 2140 .lock() 2141 .unwrap() 2142 .vcpu 2143 .set_guest_debug(addrs, singlestep) 2144 .map_err(DebuggableError::SetDebug) 2145 } 2146 2147 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2148 Ok(()) 2149 } 2150 2151 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2152 Ok(()) 2153 } 2154 2155 #[cfg(target_arch = "x86_64")] 2156 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2157 // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15 2158 let gregs = self 2159 .get_regs(cpu_id as u8) 2160 .map_err(DebuggableError::ReadRegs)?; 2161 let regs = [ 2162 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp, 2163 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15, 2164 ]; 2165 2166 // GDB exposes 32-bit eflags instead of 64-bit rflags. 2167 // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml 2168 let eflags = gregs.rflags as u32; 2169 let rip = gregs.rip; 2170 2171 // Segment registers: CS, SS, DS, ES, FS, GS 2172 let sregs = self 2173 .get_sregs(cpu_id as u8) 2174 .map_err(DebuggableError::ReadRegs)?; 2175 let segments = X86SegmentRegs { 2176 cs: sregs.cs.selector as u32, 2177 ss: sregs.ss.selector as u32, 2178 ds: sregs.ds.selector as u32, 2179 es: sregs.es.selector as u32, 2180 fs: sregs.fs.selector as u32, 2181 gs: sregs.gs.selector as u32, 2182 }; 2183 2184 // TODO: Add other registers 2185 2186 Ok(CoreRegs { 2187 regs, 2188 eflags, 2189 rip, 2190 segments, 2191 ..Default::default() 2192 }) 2193 } 2194 2195 #[cfg(target_arch = "aarch64")] 2196 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2197 let gregs = self 2198 .get_regs(cpu_id as u8) 2199 .map_err(DebuggableError::ReadRegs)?; 2200 Ok(CoreRegs { 2201 x: gregs.regs.regs, 2202 sp: gregs.regs.sp, 2203 pc: gregs.regs.pc, 2204 ..Default::default() 2205 }) 2206 } 2207 2208 #[cfg(target_arch = "x86_64")] 2209 fn write_regs( 2210 &self, 2211 cpu_id: usize, 2212 regs: &CoreRegs, 2213 ) -> std::result::Result<(), DebuggableError> { 2214 let orig_gregs = self 2215 .get_regs(cpu_id as u8) 2216 .map_err(DebuggableError::ReadRegs)?; 2217 let gregs = StandardRegisters { 2218 rax: regs.regs[0], 2219 rbx: regs.regs[1], 2220 rcx: regs.regs[2], 2221 rdx: regs.regs[3], 2222 rsi: regs.regs[4], 2223 rdi: regs.regs[5], 2224 rbp: regs.regs[6], 2225 rsp: regs.regs[7], 2226 r8: regs.regs[8], 2227 r9: regs.regs[9], 2228 r10: regs.regs[10], 2229 r11: regs.regs[11], 2230 r12: regs.regs[12], 2231 r13: regs.regs[13], 2232 r14: regs.regs[14], 2233 r15: regs.regs[15], 2234 rip: regs.rip, 2235 // Update the lower 32-bit of rflags. 
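// GDB only transfers a 32-bit eflags value, so keep the upper 32 bits of the
// current rflags and splice the incoming eflags into the lower half.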
2236 rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64), 2237 }; 2238 2239 self.set_regs(cpu_id as u8, &gregs) 2240 .map_err(DebuggableError::WriteRegs)?; 2241 2242 // Segment registers: CS, SS, DS, ES, FS, GS 2243 // Since GDB only cares about the selectors, we call get_sregs() first. 2244 let mut sregs = self 2245 .get_sregs(cpu_id as u8) 2246 .map_err(DebuggableError::ReadRegs)?;
2247 sregs.cs.selector = regs.segments.cs as u16; 2248 sregs.ss.selector = regs.segments.ss as u16; 2249 sregs.ds.selector = regs.segments.ds as u16; 2250 sregs.es.selector = regs.segments.es as u16; 2251 sregs.fs.selector = regs.segments.fs as u16; 2252 sregs.gs.selector = regs.segments.gs as u16; 2253 2254 self.set_sregs(cpu_id as u8, &sregs) 2255 .map_err(DebuggableError::WriteRegs)?; 2256 2257 // TODO: Add other registers 2258 2259 Ok(()) 2260 } 2261
2262 #[cfg(target_arch = "aarch64")] 2263 fn write_regs( 2264 &self, 2265 cpu_id: usize, 2266 regs: &CoreRegs, 2267 ) -> std::result::Result<(), DebuggableError> { 2268 let mut gregs = self 2269 .get_regs(cpu_id as u8) 2270 .map_err(DebuggableError::ReadRegs)?; 2271 2272 gregs.regs.regs = regs.x; 2273 gregs.regs.sp = regs.sp; 2274 gregs.regs.pc = regs.pc; 2275 2276 self.set_regs(cpu_id as u8, &gregs) 2277 .map_err(DebuggableError::WriteRegs)?; 2278 2279 Ok(()) 2280 } 2281
2282 fn read_mem( 2283 &self, 2284 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2285 cpu_id: usize, 2286 vaddr: GuestAddress, 2287 len: usize, 2288 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2289 let mut buf = vec![0; len]; 2290 let mut total_read = 0_u64; 2291 2292 while total_read < len as u64 { 2293 let gaddr = vaddr.0 + total_read; 2294 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2295 Ok(paddr) => paddr, 2296 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 2297 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2298 }; 2299 let psize = arch::PAGE_SIZE as u64; 2300 let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1))); 2301 guest_memory 2302 .memory() 2303 .read( 2304 &mut buf[total_read as usize..total_read as usize + read_len as usize], 2305 GuestAddress(paddr), 2306 ) 2307 .map_err(DebuggableError::ReadMem)?; 2308 total_read += read_len; 2309 } 2310 Ok(buf) 2311 } 2312
2313 fn write_mem( 2314 &self, 2315 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2316 cpu_id: usize, 2317 vaddr: &GuestAddress, 2318 data: &[u8], 2319 ) -> std::result::Result<(), DebuggableError> { 2320 let mut total_written = 0_u64; 2321 2322 while total_written < data.len() as u64 { 2323 let gaddr = vaddr.0 + total_written; 2324 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2325 Ok(paddr) => paddr, 2326 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
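// Any other translation failure aborts the whole write_mem() call.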
2327 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2328 }; 2329 let psize = arch::PAGE_SIZE as u64; 2330 let write_len = std::cmp::min( 2331 data.len() as u64 - total_written, 2332 psize - (paddr & (psize - 1)), 2333 ); 2334 guest_memory 2335 .memory() 2336 .write( 2337 &data[total_written as usize..total_written as usize + write_len as usize], 2338 GuestAddress(paddr), 2339 ) 2340 .map_err(DebuggableError::WriteMem)?; 2341 total_written += write_len; 2342 } 2343 Ok(()) 2344 } 2345 2346 fn active_vcpus(&self) -> usize { 2347 self.present_vcpus() as usize 2348 } 2349 } 2350 2351 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2352 impl Elf64Writable for CpuManager {} 2353 2354 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2355 impl CpuElf64Writable for CpuManager { 2356 fn cpu_write_elf64_note( 2357 &mut self, 2358 dump_state: &DumpState, 2359 ) -> std::result::Result<(), GuestDebuggableError> { 2360 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2361 for vcpu in &self.vcpus { 2362 let note_size = self.get_note_size(NoteDescType::Elf, 1); 2363 let mut pos: usize = 0; 2364 let mut buf = vec![0; note_size as usize]; 2365 let descsz = size_of::<X86_64ElfPrStatus>(); 2366 let vcpu_id = vcpu.lock().unwrap().id; 2367 2368 let note = Elf64_Nhdr { 2369 n_namesz: COREDUMP_NAME_SIZE, 2370 n_descsz: descsz as u32, 2371 n_type: NT_PRSTATUS, 2372 }; 2373 2374 let bytes: &[u8] = note.as_slice(); 2375 buf.splice(0.., bytes.to_vec()); 2376 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2377 buf.resize(pos + 4, 0); 2378 buf.splice(pos.., "CORE".to_string().into_bytes()); 2379 2380 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2381 buf.resize(pos + 32 + 4, 0); 2382 let pid = vcpu_id as u64; 2383 let bytes: &[u8] = pid.as_slice(); 2384 buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */ 2385 2386 pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>(); 2387 2388 let orig_rax: u64 = 0; 2389 let gregs = self.vcpus[usize::from(vcpu_id)] 2390 .lock() 2391 .unwrap() 2392 .vcpu 2393 .get_regs() 2394 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2395 2396 let regs1 = [ 2397 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11, 2398 gregs.r10, 2399 ]; 2400 let regs2 = [ 2401 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax, 2402 ]; 2403 2404 let sregs = self.vcpus[usize::from(vcpu_id)] 2405 .lock() 2406 .unwrap() 2407 .vcpu 2408 .get_sregs() 2409 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2410 2411 debug!( 2412 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}", 2413 gregs.rip, 2414 gregs.rsp, 2415 sregs.gs.base, 2416 sregs.cs.selector, 2417 sregs.ss.selector, 2418 sregs.ds.selector, 2419 ); 2420 2421 let regs = X86_64UserRegs { 2422 regs1, 2423 regs2, 2424 rip: gregs.rip, 2425 cs: sregs.cs.selector as u64, 2426 eflags: gregs.rflags, 2427 rsp: gregs.rsp, 2428 ss: sregs.ss.selector as u64, 2429 fs_base: sregs.fs.base, 2430 gs_base: sregs.gs.base, 2431 ds: sregs.ds.selector as u64, 2432 es: sregs.es.selector as u64, 2433 fs: sregs.fs.selector as u64, 2434 gs: sregs.gs.selector as u64, 2435 }; 2436 2437 // let bytes: &[u8] = unsafe { any_as_u8_slice(®s) }; 2438 let bytes: &[u8] = regs.as_slice(); 2439 buf.resize(note_size as usize, 0); 2440 buf.splice(pos.., bytes.to_vec()); 2441 buf.resize(note_size as usize, 0); 2442 2443 coredump_file 2444 .write(&buf) 2445 .map_err(GuestDebuggableError::CoredumpFile)?; 2446 } 2447 2448 
Ok(()) 2449 } 2450 2451 fn cpu_write_vmm_note( 2452 &mut self, 2453 dump_state: &DumpState, 2454 ) -> std::result::Result<(), GuestDebuggableError> { 2455 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2456 for vcpu in &self.vcpus { 2457 let note_size = self.get_note_size(NoteDescType::Vmm, 1); 2458 let mut pos: usize = 0; 2459 let mut buf = vec![0; note_size as usize]; 2460 let descsz = size_of::<DumpCpusState>(); 2461 let vcpu_id = vcpu.lock().unwrap().id; 2462 2463 let note = Elf64_Nhdr { 2464 n_namesz: COREDUMP_NAME_SIZE, 2465 n_descsz: descsz as u32, 2466 n_type: 0, 2467 }; 2468 2469 let bytes: &[u8] = note.as_slice(); 2470 buf.splice(0.., bytes.to_vec()); 2471 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2472 2473 buf.resize(pos + 4, 0); 2474 buf.splice(pos.., "QEMU".to_string().into_bytes()); 2475 2476 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2477 2478 let gregs = self.vcpus[usize::from(vcpu_id)] 2479 .lock() 2480 .unwrap() 2481 .vcpu 2482 .get_regs() 2483 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2484 2485 let regs1 = [ 2486 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp, 2487 gregs.rbp, 2488 ]; 2489 2490 let regs2 = [ 2491 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, 2492 gregs.r15, 2493 ]; 2494 2495 let sregs = self.vcpus[usize::from(vcpu_id)] 2496 .lock() 2497 .unwrap() 2498 .vcpu 2499 .get_sregs() 2500 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2501 2502 let mut msrs = vec![MsrEntry { 2503 index: msr_index::MSR_KERNEL_GS_BASE, 2504 ..Default::default() 2505 }]; 2506 2507 self.vcpus[vcpu_id as usize] 2508 .lock() 2509 .unwrap() 2510 .vcpu 2511 .get_msrs(&mut msrs) 2512 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?; 2513 let kernel_gs_base = msrs[0].data; 2514 2515 let cs = CpuSegment::new(sregs.cs); 2516 let ds = CpuSegment::new(sregs.ds); 2517 let es = CpuSegment::new(sregs.es); 2518 let fs = CpuSegment::new(sregs.fs); 2519 let gs = CpuSegment::new(sregs.gs); 2520 let ss = CpuSegment::new(sregs.ss); 2521 let ldt = CpuSegment::new(sregs.ldt); 2522 let tr = CpuSegment::new(sregs.tr); 2523 let gdt = CpuSegment::new_from_table(sregs.gdt); 2524 let idt = CpuSegment::new_from_table(sregs.idt); 2525 let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4]; 2526 let regs = DumpCpusState { 2527 version: 1, 2528 size: size_of::<DumpCpusState>() as u32, 2529 regs1, 2530 regs2, 2531 rip: gregs.rip, 2532 rflags: gregs.rflags, 2533 cs, 2534 ds, 2535 es, 2536 fs, 2537 gs, 2538 ss, 2539 ldt, 2540 tr, 2541 gdt, 2542 idt, 2543 cr, 2544 kernel_gs_base, 2545 }; 2546 2547 let bytes: &[u8] = regs.as_slice(); 2548 buf.resize(note_size as usize, 0); 2549 buf.splice(pos.., bytes.to_vec()); 2550 buf.resize(note_size as usize, 0); 2551 2552 coredump_file 2553 .write(&buf) 2554 .map_err(GuestDebuggableError::CoredumpFile)?; 2555 } 2556 2557 Ok(()) 2558 } 2559 } 2560 2561 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2562 #[cfg(test)] 2563 mod tests { 2564 use arch::x86_64::interrupts::*; 2565 use arch::x86_64::regs::*; 2566 use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters}; 2567 2568 #[test] 2569 fn test_setlint() { 2570 let hv = hypervisor::new().unwrap(); 2571 let vm = hv.create_vm().expect("new VM fd creation failed"); 2572 assert!(hv.check_required_extensions().is_ok()); 2573 // Calling get_lapic will fail if there is no irqchip before hand. 
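// (With KVM, the in-kernel local APIC is only created by the KVM_CREATE_IRQCHIP
// ioctl, hence the create_irq_chip() call below.)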
2574 assert!(vm.create_irq_chip().is_ok()); 2575 let vcpu = vm.create_vcpu(0, None).unwrap(); 2576 let klapic_before: LapicState = vcpu.get_lapic().unwrap(); 2577 2578 // Compute the value that is expected to represent LVT0 and LVT1. 2579 let lint0 = klapic_before.get_klapic_reg(APIC_LVT0); 2580 let lint1 = klapic_before.get_klapic_reg(APIC_LVT1); 2581 let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT); 2582 let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI); 2583 2584 set_lint(&vcpu).unwrap(); 2585
2586 // Compute the value that represents LVT0 and LVT1 after set_lint. 2587 let klapic_actual: LapicState = vcpu.get_lapic().unwrap(); 2588 let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0); 2589 let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1); 2590 assert_eq!(lint0_mode_expected, lint0_mode_actual); 2591 assert_eq!(lint1_mode_expected, lint1_mode_actual); 2592 } 2593
2594 #[test] 2595 fn test_setup_fpu() { 2596 let hv = hypervisor::new().unwrap(); 2597 let vm = hv.create_vm().expect("new VM fd creation failed"); 2598 let vcpu = vm.create_vcpu(0, None).unwrap(); 2599 setup_fpu(&vcpu).unwrap(); 2600 2601 let expected_fpu: FpuState = FpuState { 2602 fcw: 0x37f, 2603 mxcsr: 0x1f80, 2604 ..Default::default() 2605 }; 2606 let actual_fpu: FpuState = vcpu.get_fpu().unwrap(); 2607 // TODO: auto-generate kvm-related structures with PartialEq on. 2608 assert_eq!(expected_fpu.fcw, actual_fpu.fcw); 2609 // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything. 2610 // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c. 2611 // The mxcsr will stay 0 and the assert below would fail. Decide whether we should 2612 // remove it altogether. 2613 // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr); 2614 } 2615
2616 #[test] 2617 fn test_setup_msrs() { 2618 use hypervisor::arch::x86::{msr_index, MsrEntry}; 2619 2620 let hv = hypervisor::new().unwrap(); 2621 let vm = hv.create_vm().expect("new VM fd creation failed"); 2622 let vcpu = vm.create_vcpu(0, None).unwrap(); 2623 setup_msrs(&vcpu).unwrap(); 2624 2625 // This test will check against the last MSR entry configured (the tenth one). 2626 // See create_msr_entries for details. 2627 let mut msrs = vec![MsrEntry { 2628 index: msr_index::MSR_IA32_MISC_ENABLE, 2629 ..Default::default() 2630 }]; 2631
2632 // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read 1 2633 // in this test case. 2634 let read_msrs = vcpu.get_msrs(&mut msrs).unwrap(); 2635 assert_eq!(read_msrs, 1); 2636 2637 // Official entries that were set up when we did setup_msrs. We need to assert that the 2638 // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we 2639 // expect.
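// boot_msr_entries() is expected to return the same list that setup_msrs programmed,
// so its tenth element (index 9) should equal the MsrEntry read back above.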
2640 let entry_vec = vcpu.boot_msr_entries(); 2641 assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]); 2642 } 2643 2644 #[test] 2645 fn test_setup_regs() { 2646 let hv = hypervisor::new().unwrap(); 2647 let vm = hv.create_vm().expect("new VM fd creation failed"); 2648 let vcpu = vm.create_vcpu(0, None).unwrap(); 2649 2650 let expected_regs: StandardRegisters = StandardRegisters { 2651 rflags: 0x0000000000000002u64, 2652 rbx: arch::layout::PVH_INFO_START.0, 2653 rip: 1, 2654 ..Default::default() 2655 }; 2656 2657 setup_regs(&vcpu, expected_regs.rip).unwrap(); 2658 2659 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2660 assert_eq!(actual_regs, expected_regs); 2661 } 2662 } 2663 2664 #[cfg(target_arch = "aarch64")] 2665 #[cfg(test)] 2666 mod tests { 2667 use arch::{aarch64::regs, layout}; 2668 use hypervisor::kvm::aarch64::is_system_register; 2669 use hypervisor::kvm::kvm_bindings::{ 2670 kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, 2671 KVM_REG_ARM_CORE, KVM_REG_SIZE_U64, 2672 }; 2673 use hypervisor::{arm64_core_reg_id, offset_of}; 2674 use std::mem; 2675 2676 #[test] 2677 fn test_setup_regs() { 2678 let hv = hypervisor::new().unwrap(); 2679 let vm = hv.create_vm().unwrap(); 2680 let vcpu = vm.create_vcpu(0, None).unwrap(); 2681 2682 let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0); 2683 // Must fail when vcpu is not initialized yet. 2684 assert!(res.is_err()); 2685 2686 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2687 vm.get_preferred_target(&mut kvi).unwrap(); 2688 vcpu.vcpu_init(&kvi).unwrap(); 2689 2690 assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok()); 2691 } 2692 2693 #[test] 2694 fn test_read_mpidr() { 2695 let hv = hypervisor::new().unwrap(); 2696 let vm = hv.create_vm().unwrap(); 2697 let vcpu = vm.create_vcpu(0, None).unwrap(); 2698 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2699 vm.get_preferred_target(&mut kvi).unwrap(); 2700 2701 // Must fail when vcpu is not initialized yet. 2702 assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err()); 2703 2704 vcpu.vcpu_init(&kvi).unwrap(); 2705 assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000); 2706 } 2707 2708 #[test] 2709 fn test_is_system_register() { 2710 let offset = offset_of!(user_pt_regs, pc); 2711 let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset); 2712 assert!(!is_system_register(regid)); 2713 let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64; 2714 assert!(is_system_register(regid)); 2715 } 2716 2717 #[test] 2718 fn test_save_restore_core_regs() { 2719 let hv = hypervisor::new().unwrap(); 2720 let vm = hv.create_vm().unwrap(); 2721 let vcpu = vm.create_vcpu(0, None).unwrap(); 2722 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2723 vm.get_preferred_target(&mut kvi).unwrap(); 2724 2725 // Must fail when vcpu is not initialized yet. 
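// KVM reports ENOEXEC (errno 8, "Exec format error") for register accesses made
// before KVM_ARM_VCPU_INIT, which is what the error strings below check.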
2726 let res = vcpu.get_regs(); 2727 assert!(res.is_err()); 2728 assert_eq!( 2729 format!("{}", res.unwrap_err()), 2730 "Failed to get core register: Exec format error (os error 8)" 2731 ); 2732 2733 let mut state = kvm_regs::default(); 2734 let res = vcpu.set_regs(&state); 2735 assert!(res.is_err()); 2736 assert_eq!( 2737 format!("{}", res.unwrap_err()), 2738 "Failed to set core register: Exec format error (os error 8)" 2739 ); 2740 2741 vcpu.vcpu_init(&kvi).unwrap(); 2742 let res = vcpu.get_regs(); 2743 assert!(res.is_ok()); 2744 state = res.unwrap(); 2745 assert_eq!(state.regs.pstate, 0x3C5); 2746 2747 assert!(vcpu.set_regs(&state).is_ok()); 2748 } 2749 2750 #[test] 2751 fn test_get_set_mpstate() { 2752 let hv = hypervisor::new().unwrap(); 2753 let vm = hv.create_vm().unwrap(); 2754 let vcpu = vm.create_vcpu(0, None).unwrap(); 2755 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2756 vm.get_preferred_target(&mut kvi).unwrap(); 2757 2758 let res = vcpu.get_mp_state(); 2759 assert!(res.is_ok()); 2760 assert!(vcpu.set_mp_state(res.unwrap()).is_ok()); 2761 } 2762 } 2763