1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::CpusConfig; 15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 16 use crate::coredump::{ 17 CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable, 18 GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE, 19 NT_PRSTATUS, 20 }; 21 #[cfg(feature = "guest_debug")] 22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError}; 23 #[cfg(target_arch = "x86_64")] 24 use crate::memory_manager::MemoryManager; 25 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 26 #[cfg(target_arch = "x86_64")] 27 use crate::vm::physical_bits; 28 use crate::GuestMemoryMmap; 29 use crate::CPU_MANAGER_SNAPSHOT_ID; 30 use acpi_tables::{aml, sdt::Sdt, Aml}; 31 use anyhow::anyhow; 32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 33 use arch::aarch64::regs; 34 use arch::EntryPoint; 35 use arch::NumaNodes; 36 #[cfg(target_arch = "aarch64")] 37 use devices::gic::Gic; 38 use devices::interrupt_controller::InterruptController; 39 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 40 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 41 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 42 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs}; 43 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 44 use hypervisor::aarch64::StandardRegisters; 45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 46 use hypervisor::arch::x86::msr_index; 47 #[cfg(target_arch = "x86_64")] 48 use hypervisor::arch::x86::CpuIdEntry; 49 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 50 use hypervisor::arch::x86::MsrEntry; 51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 52 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters}; 53 #[cfg(target_arch = "aarch64")] 54 use hypervisor::kvm::kvm_bindings; 55 #[cfg(all(target_arch = "aarch64", feature = "kvm"))] 56 use hypervisor::kvm::kvm_ioctls::Cap; 57 #[cfg(feature = "tdx")] 58 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus}; 59 use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps}; 60 use libc::{c_void, siginfo_t}; 61 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 62 use linux_loader::elf::Elf64_Nhdr; 63 use seccompiler::{apply_filter, SeccompAction}; 64 use std::collections::BTreeMap; 65 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 66 use std::io::Write; 67 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 68 use std::mem::size_of; 69 use std::os::unix::thread::JoinHandleExt; 70 use std::sync::atomic::{AtomicBool, Ordering}; 71 use std::sync::{Arc, Barrier, Mutex}; 72 use std::{cmp, io, result, thread}; 73 use thiserror::Error; 74 use tracer::trace_scoped; 75 use vm_device::BusDevice; 76 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 77 use vm_memory::ByteValued; 78 #[cfg(feature = "guest_debug")] 79 use vm_memory::{Bytes, GuestAddressSpace}; 80 use vm_memory::{GuestAddress, GuestMemoryAtomic}; 81 use vm_migration::{ 82 snapshot_from_id, 
    Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
use zerocopy::AsBytes;

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
macro_rules! extract_bits_64_without_offset {
    ($value: tt, $length: tt) => {
        $value & (!0u64 >> (64 - $length))
    };
}

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),

    #[error("Maximum number of vCPUs exceeds host limit")]
    MaximumVcpusExceeded,
}
pub type Result<T> = result::Result<T, Error>;

#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct LocalX2Apic {
    pub r#type: u8,
    pub length: u8,
    pub
_reserved: u16, 192 pub apic_id: u32, 193 pub flags: u32, 194 pub processor_id: u32, 195 } 196 197 #[allow(dead_code)] 198 #[repr(packed)] 199 #[derive(Default, AsBytes)] 200 struct Ioapic { 201 pub r#type: u8, 202 pub length: u8, 203 pub ioapic_id: u8, 204 _reserved: u8, 205 pub apic_address: u32, 206 pub gsi_base: u32, 207 } 208 209 #[cfg(target_arch = "aarch64")] 210 #[allow(dead_code)] 211 #[repr(packed)] 212 #[derive(AsBytes)] 213 struct GicC { 214 pub r#type: u8, 215 pub length: u8, 216 pub reserved0: u16, 217 pub cpu_interface_number: u32, 218 pub uid: u32, 219 pub flags: u32, 220 pub parking_version: u32, 221 pub performance_interrupt: u32, 222 pub parked_address: u64, 223 pub base_address: u64, 224 pub gicv_base_address: u64, 225 pub gich_base_address: u64, 226 pub vgic_interrupt: u32, 227 pub gicr_base_address: u64, 228 pub mpidr: u64, 229 pub proc_power_effi_class: u8, 230 pub reserved1: u8, 231 pub spe_overflow_interrupt: u16, 232 } 233 234 #[cfg(target_arch = "aarch64")] 235 #[allow(dead_code)] 236 #[repr(packed)] 237 #[derive(AsBytes)] 238 struct GicD { 239 pub r#type: u8, 240 pub length: u8, 241 pub reserved0: u16, 242 pub gic_id: u32, 243 pub base_address: u64, 244 pub global_irq_base: u32, 245 pub version: u8, 246 pub reserved1: [u8; 3], 247 } 248 249 #[cfg(target_arch = "aarch64")] 250 #[allow(dead_code)] 251 #[repr(packed)] 252 #[derive(AsBytes)] 253 struct GicR { 254 pub r#type: u8, 255 pub length: u8, 256 pub reserved: u16, 257 pub base_address: u64, 258 pub range_length: u32, 259 } 260 261 #[cfg(target_arch = "aarch64")] 262 #[allow(dead_code)] 263 #[repr(packed)] 264 #[derive(AsBytes)] 265 struct GicIts { 266 pub r#type: u8, 267 pub length: u8, 268 pub reserved0: u16, 269 pub translation_id: u32, 270 pub base_address: u64, 271 pub reserved1: u32, 272 } 273 274 #[cfg(target_arch = "aarch64")] 275 #[allow(dead_code)] 276 #[repr(packed)] 277 #[derive(AsBytes)] 278 struct ProcessorHierarchyNode { 279 pub r#type: u8, 280 pub length: u8, 281 pub reserved: u16, 282 pub flags: u32, 283 pub parent: u32, 284 pub acpi_processor_id: u32, 285 pub num_private_resources: u32, 286 } 287 288 #[allow(dead_code)] 289 #[repr(packed)] 290 #[derive(Default, AsBytes)] 291 struct InterruptSourceOverride { 292 pub r#type: u8, 293 pub length: u8, 294 pub bus: u8, 295 pub source: u8, 296 pub gsi: u32, 297 pub flags: u16, 298 } 299 300 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 301 macro_rules! round_up { 302 ($n:expr,$d:expr) => { 303 (($n / ($d + 1)) + 1) * $d 304 }; 305 } 306 307 /// A wrapper around creating and using a kvm-based VCPU. 308 pub struct Vcpu { 309 // The hypervisor abstracted CPU. 310 vcpu: Arc<dyn hypervisor::Vcpu>, 311 id: u8, 312 #[cfg(target_arch = "aarch64")] 313 mpidr: u64, 314 saved_state: Option<CpuState>, 315 } 316 317 impl Vcpu { 318 /// Constructs a new VCPU for `vm`. 319 /// 320 /// # Arguments 321 /// 322 /// * `id` - Represents the CPU number between [0, max vcpus). 323 /// * `vm` - The virtual machine this vcpu will get attached to. 324 /// * `vm_ops` - Optional object for exit handling. 325 pub fn new( 326 id: u8, 327 vm: &Arc<dyn hypervisor::Vm>, 328 vm_ops: Option<Arc<dyn VmOps>>, 329 ) -> Result<Self> { 330 let vcpu = vm 331 .create_vcpu(id, vm_ops) 332 .map_err(|e| Error::VcpuCreate(e.into()))?; 333 // Initially the cpuid per vCPU is the one supported by this VM. 
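        // `saved_state` stays `None` until this vCPU is snapshotted, or until it is
        // restored from a snapshot in `CpuManager::create_vcpu()`.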
334 Ok(Vcpu { 335 vcpu, 336 id, 337 #[cfg(target_arch = "aarch64")] 338 mpidr: 0, 339 saved_state: None, 340 }) 341 } 342 343 /// Configures a vcpu and should be called once per vcpu when created. 344 /// 345 /// # Arguments 346 /// 347 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 348 /// * `guest_memory` - Guest memory. 349 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 350 pub fn configure( 351 &mut self, 352 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 353 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 354 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 355 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 356 ) -> Result<()> { 357 #[cfg(target_arch = "aarch64")] 358 { 359 self.init(vm)?; 360 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup) 361 .map_err(Error::VcpuConfiguration)?; 362 } 363 info!("Configuring vCPU: cpu_id = {}", self.id); 364 #[cfg(target_arch = "x86_64")] 365 arch::configure_vcpu(&self.vcpu, self.id, boot_setup, cpuid, kvm_hyperv) 366 .map_err(Error::VcpuConfiguration)?; 367 368 Ok(()) 369 } 370 371 /// Gets the MPIDR register value. 372 #[cfg(target_arch = "aarch64")] 373 pub fn get_mpidr(&self) -> u64 { 374 self.mpidr 375 } 376 377 /// Gets the saved vCPU state. 378 #[cfg(target_arch = "aarch64")] 379 pub fn get_saved_state(&self) -> Option<CpuState> { 380 self.saved_state.clone() 381 } 382 383 /// Initializes an aarch64 specific vcpu for booting Linux. 384 #[cfg(target_arch = "aarch64")] 385 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 386 let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); 387 388 // This reads back the kernel's preferred target type. 389 vm.get_preferred_target(&mut kvi) 390 .map_err(Error::VcpuArmPreferredTarget)?; 391 // We already checked that the capability is supported. 392 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; 393 if vm 394 .as_any() 395 .downcast_ref::<hypervisor::kvm::KvmVm>() 396 .unwrap() 397 .check_extension(Cap::ArmPmuV3) 398 { 399 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3; 400 } 401 // Non-boot cpus are powered off initially. 402 if self.id > 0 { 403 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; 404 } 405 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit) 406 } 407 408 /// Runs the VCPU until it exits, returning the reason. 409 /// 410 /// Note that the state of the VCPU and associated VM must be setup first for this to do 411 /// anything useful. 
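    ///
    /// A minimal caller sketch (hypothetical; the real consumer is the vCPU thread
    /// loop in `CpuManager::start_vcpu()` below):
    ///
    /// ```ignore
    /// match vcpu.run() {
    ///     Ok(VmExit::Reset) => { /* guest requested a reboot */ }
    ///     Ok(VmExit::Shutdown) => { /* guest powered off */ }
    ///     Ok(_) => { /* other exits are handled or ignored */ }
    ///     Err(e) => { /* surface the hypervisor error */ }
    /// }
    /// ```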
412 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 413 self.vcpu.run() 414 } 415 } 416 417 impl Pausable for Vcpu {} 418 impl Snapshottable for Vcpu { 419 fn id(&self) -> String { 420 self.id.to_string() 421 } 422 423 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 424 let saved_state = self 425 .vcpu 426 .state() 427 .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?; 428 429 self.saved_state = Some(saved_state.clone()); 430 431 Ok(Snapshot::from_data(SnapshotData::new_from_state( 432 &saved_state, 433 )?)) 434 } 435 } 436 437 pub struct CpuManager { 438 hypervisor_type: HypervisorType, 439 config: CpusConfig, 440 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 441 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 442 #[cfg(target_arch = "x86_64")] 443 cpuid: Vec<CpuIdEntry>, 444 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 445 vm: Arc<dyn hypervisor::Vm>, 446 vcpus_kill_signalled: Arc<AtomicBool>, 447 vcpus_pause_signalled: Arc<AtomicBool>, 448 exit_evt: EventFd, 449 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 450 reset_evt: EventFd, 451 #[cfg(feature = "guest_debug")] 452 vm_debug_evt: EventFd, 453 vcpu_states: Vec<VcpuState>, 454 selected_cpu: u8, 455 vcpus: Vec<Arc<Mutex<Vcpu>>>, 456 seccomp_action: SeccompAction, 457 vm_ops: Arc<dyn VmOps>, 458 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 459 acpi_address: Option<GuestAddress>, 460 proximity_domain_per_cpu: BTreeMap<u8, u32>, 461 affinity: BTreeMap<u8, Vec<u8>>, 462 dynamic: bool, 463 } 464 465 const CPU_ENABLE_FLAG: usize = 0; 466 const CPU_INSERTING_FLAG: usize = 1; 467 const CPU_REMOVING_FLAG: usize = 2; 468 const CPU_EJECT_FLAG: usize = 3; 469 470 const CPU_STATUS_OFFSET: u64 = 4; 471 const CPU_SELECTION_OFFSET: u64 = 0; 472 473 impl BusDevice for CpuManager { 474 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 475 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
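        // Zero the buffer first so any offset that is not handled below reads back
        // as 0 rather than stale data.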
476 data.fill(0); 477 478 match offset { 479 CPU_SELECTION_OFFSET => { 480 data[0] = self.selected_cpu; 481 } 482 CPU_STATUS_OFFSET => { 483 if self.selected_cpu < self.max_vcpus() { 484 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 485 if state.active() { 486 data[0] |= 1 << CPU_ENABLE_FLAG; 487 } 488 if state.inserting { 489 data[0] |= 1 << CPU_INSERTING_FLAG; 490 } 491 if state.removing { 492 data[0] |= 1 << CPU_REMOVING_FLAG; 493 } 494 } else { 495 warn!("Out of range vCPU id: {}", self.selected_cpu); 496 } 497 } 498 _ => { 499 warn!( 500 "Unexpected offset for accessing CPU manager device: {:#}", 501 offset 502 ); 503 } 504 } 505 } 506 507 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 508 match offset { 509 CPU_SELECTION_OFFSET => { 510 self.selected_cpu = data[0]; 511 } 512 CPU_STATUS_OFFSET => { 513 if self.selected_cpu < self.max_vcpus() { 514 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 515 // The ACPI code writes back a 1 to acknowledge the insertion 516 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 517 && state.inserting 518 { 519 state.inserting = false; 520 } 521 // Ditto for removal 522 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 523 && state.removing 524 { 525 state.removing = false; 526 } 527 // Trigger removal of vCPU 528 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 529 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 530 error!("Error removing vCPU: {:?}", e); 531 } 532 } 533 } else { 534 warn!("Out of range vCPU id: {}", self.selected_cpu); 535 } 536 } 537 _ => { 538 warn!( 539 "Unexpected offset for accessing CPU manager device: {:#}", 540 offset 541 ); 542 } 543 } 544 None 545 } 546 } 547 548 #[derive(Default)] 549 struct VcpuState { 550 inserting: bool, 551 removing: bool, 552 handle: Option<thread::JoinHandle<()>>, 553 kill: Arc<AtomicBool>, 554 vcpu_run_interrupted: Arc<AtomicBool>, 555 paused: Arc<AtomicBool>, 556 } 557 558 impl VcpuState { 559 fn active(&self) -> bool { 560 self.handle.is_some() 561 } 562 563 fn signal_thread(&self) { 564 if let Some(handle) = self.handle.as_ref() { 565 loop { 566 // SAFETY: FFI call with correct arguments 567 unsafe { 568 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 569 } 570 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 571 break; 572 } else { 573 // This is more effective than thread::yield_now() at 574 // avoiding a priority inversion with the vCPU thread 575 thread::sleep(std::time::Duration::from_millis(1)); 576 } 577 } 578 } 579 } 580 581 fn join_thread(&mut self) -> Result<()> { 582 if let Some(handle) = self.handle.take() { 583 handle.join().map_err(Error::ThreadCleanup)? 
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}

impl CpuManager {
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vm_ops: Arc<dyn VmOps>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        numa_nodes: &NumaNodes,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
            return Err(Error::MaximumVcpusExceeded);
        }

        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
        let hypervisor_type = hypervisor.hypervisor_type();

        #[cfg(target_arch = "x86_64")]
        if config.features.amx {
            const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
            const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
            const XFEATURE_XTILEDATA: usize = 18;
            const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;

            // SAFETY: the syscall is only modifying kernel internal
            // data structures that the kernel is itself expected to safeguard.
            let amx_tile = unsafe {
                libc::syscall(
                    libc::SYS_arch_prctl,
                    ARCH_REQ_XCOMP_GUEST_PERM,
                    XFEATURE_XTILEDATA,
                )
            };

            if amx_tile != 0 {
                return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
            } else {
                let mask: usize = 0;
                // SAFETY: the mask being modified (not marked mutable as it is
                // modified in unsafe only which is permitted) isn't in use elsewhere.
642 let result = unsafe { 643 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 644 }; 645 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 646 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 647 } 648 } 649 } 650 651 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 652 let mut cpu_list = Vec::new(); 653 for (proximity_domain, numa_node) in numa_nodes.iter() { 654 for cpu in numa_node.cpus.iter() { 655 cpu_list.push((*cpu, *proximity_domain)) 656 } 657 } 658 cpu_list 659 } 660 .into_iter() 661 .collect(); 662 663 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 664 cpu_affinity 665 .iter() 666 .map(|a| (a.vcpu, a.host_cpus.clone())) 667 .collect() 668 } else { 669 BTreeMap::new() 670 }; 671 672 #[cfg(feature = "tdx")] 673 let dynamic = !tdx_enabled; 674 #[cfg(not(feature = "tdx"))] 675 let dynamic = true; 676 677 Ok(Arc::new(Mutex::new(CpuManager { 678 hypervisor_type, 679 config: config.clone(), 680 interrupt_controller: None, 681 #[cfg(target_arch = "x86_64")] 682 cpuid: Vec::new(), 683 vm, 684 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 685 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 686 vcpu_states, 687 exit_evt, 688 reset_evt, 689 #[cfg(feature = "guest_debug")] 690 vm_debug_evt, 691 selected_cpu: 0, 692 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 693 seccomp_action, 694 vm_ops, 695 acpi_address: None, 696 proximity_domain_per_cpu, 697 affinity, 698 dynamic, 699 }))) 700 } 701 702 #[cfg(target_arch = "x86_64")] 703 pub fn populate_cpuid( 704 &mut self, 705 memory_manager: &Arc<Mutex<MemoryManager>>, 706 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 707 #[cfg(feature = "tdx")] tdx_enabled: bool, 708 ) -> Result<()> { 709 let sgx_epc_sections = memory_manager 710 .lock() 711 .unwrap() 712 .sgx_epc_region() 713 .as_ref() 714 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 715 716 let topology = self.config.topology.clone().map_or_else( 717 || { 718 #[cfg(feature = "mshv")] 719 if matches!(hypervisor.hypervisor_type(), HypervisorType::Mshv) { 720 return Some((1, self.boot_vcpus(), 1)); 721 } 722 None 723 }, 724 |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)), 725 ); 726 727 self.cpuid = { 728 let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits); 729 arch::generate_common_cpuid( 730 hypervisor, 731 topology, 732 sgx_epc_sections, 733 phys_bits, 734 self.config.kvm_hyperv, 735 #[cfg(feature = "tdx")] 736 tdx_enabled, 737 ) 738 .map_err(Error::CommonCpuId)? 739 }; 740 741 Ok(()) 742 } 743 744 fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> { 745 info!("Creating vCPU: cpu_id = {}", cpu_id); 746 747 let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vm_ops.clone()))?; 748 749 if let Some(snapshot) = snapshot { 750 // AArch64 vCPUs should be initialized after created. 751 #[cfg(target_arch = "aarch64")] 752 vcpu.init(&self.vm)?; 753 754 let state: CpuState = snapshot.to_state().map_err(|e| { 755 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e)) 756 })?; 757 vcpu.vcpu 758 .set_state(&state) 759 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?; 760 761 vcpu.saved_state = Some(state); 762 } 763 764 let vcpu = Arc::new(Mutex::new(vcpu)); 765 766 // Adding vCPU to the CpuManager's vCPU list. 
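        // The list is never shrunk: vCPUs that were removed stay allocated so a
        // later resize can reuse them instead of creating new ones (see
        // `create_vcpus()`).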
767 self.vcpus.push(vcpu.clone()); 768 769 Ok(vcpu) 770 } 771 772 pub fn configure_vcpu( 773 &self, 774 vcpu: Arc<Mutex<Vcpu>>, 775 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 776 ) -> Result<()> { 777 let mut vcpu = vcpu.lock().unwrap(); 778 779 #[cfg(target_arch = "x86_64")] 780 assert!(!self.cpuid.is_empty()); 781 782 #[cfg(target_arch = "x86_64")] 783 vcpu.configure(boot_setup, self.cpuid.clone(), self.config.kvm_hyperv)?; 784 785 #[cfg(target_arch = "aarch64")] 786 vcpu.configure(&self.vm, boot_setup)?; 787 788 Ok(()) 789 } 790 791 /// Only create new vCPUs if there aren't any inactive ones to reuse 792 fn create_vcpus( 793 &mut self, 794 desired_vcpus: u8, 795 snapshot: Option<Snapshot>, 796 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 797 let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![]; 798 info!( 799 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 800 desired_vcpus, 801 self.config.max_vcpus, 802 self.vcpus.len(), 803 self.present_vcpus() 804 ); 805 806 if desired_vcpus > self.config.max_vcpus { 807 return Err(Error::DesiredVCpuCountExceedsMax); 808 } 809 810 // Only create vCPUs in excess of all the allocated vCPUs. 811 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 812 vcpus.push(self.create_vcpu( 813 cpu_id, 814 // TODO: The special format of the CPU id can be removed once 815 // ready to break live upgrade. 816 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()), 817 )?); 818 } 819 820 Ok(vcpus) 821 } 822 823 #[cfg(target_arch = "aarch64")] 824 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 825 for cpu in self.vcpus.iter() { 826 let cpu = cpu.lock().unwrap(); 827 // Check if PMU attr is available, if not, log the information. 828 if cpu.vcpu.has_pmu_support() { 829 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 830 } else { 831 debug!( 832 "PMU attribute is not supported in vCPU{}, skip PMU init!", 833 cpu.id 834 ); 835 return Ok(false); 836 } 837 } 838 839 Ok(true) 840 } 841 842 pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> { 843 self.vcpus.clone() 844 } 845 846 fn start_vcpu( 847 &mut self, 848 vcpu: Arc<Mutex<Vcpu>>, 849 vcpu_id: u8, 850 vcpu_thread_barrier: Arc<Barrier>, 851 inserting: bool, 852 ) -> Result<()> { 853 let reset_evt = self.reset_evt.try_clone().unwrap(); 854 let exit_evt = self.exit_evt.try_clone().unwrap(); 855 #[cfg(feature = "kvm")] 856 let hypervisor_type = self.hypervisor_type; 857 #[cfg(feature = "guest_debug")] 858 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 859 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 860 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 861 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 862 863 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 864 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 865 .vcpu_run_interrupted 866 .clone(); 867 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 868 let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone(); 869 870 // Prepare the CPU set the current vCPU is expected to run onto. 
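        // `self.affinity` maps a vCPU index to the host CPUs it may run on; the
        // resulting cpu_set_t is applied with sched_setaffinity() from inside the
        // vCPU thread spawned below.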
871 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 872 // SAFETY: all zeros is a valid pattern 873 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 874 // SAFETY: FFI call, trivially safe 875 unsafe { libc::CPU_ZERO(&mut cpuset) }; 876 for host_cpu in host_cpus { 877 // SAFETY: FFI call, trivially safe 878 unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) }; 879 } 880 cpuset 881 }); 882 883 // Retrieve seccomp filter for vcpu thread 884 let vcpu_seccomp_filter = 885 get_seccomp_filter(&self.seccomp_action, Thread::Vcpu, self.hypervisor_type) 886 .map_err(Error::CreateSeccompFilter)?; 887 888 #[cfg(target_arch = "x86_64")] 889 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 890 891 info!("Starting vCPU: cpu_id = {}", vcpu_id); 892 893 let handle = Some( 894 thread::Builder::new() 895 .name(format!("vcpu{vcpu_id}")) 896 .spawn(move || { 897 // Schedule the thread to run on the expected CPU set 898 if let Some(cpuset) = cpuset.as_ref() { 899 // SAFETY: FFI call with correct arguments 900 let ret = unsafe { 901 libc::sched_setaffinity( 902 0, 903 std::mem::size_of::<libc::cpu_set_t>(), 904 cpuset as *const libc::cpu_set_t, 905 ) 906 }; 907 908 if ret != 0 { 909 error!( 910 "Failed scheduling the vCPU {} on the expected CPU set: {}", 911 vcpu_id, 912 io::Error::last_os_error() 913 ); 914 return; 915 } 916 } 917 918 // Apply seccomp filter for vcpu thread. 919 if !vcpu_seccomp_filter.is_empty() { 920 if let Err(e) = 921 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 922 { 923 error!("Error applying seccomp filter: {:?}", e); 924 return; 925 } 926 } 927 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 928 // This uses an async signal safe handler to kill the vcpu handles. 929 register_signal_handler(SIGRTMIN(), handle_signal) 930 .expect("Failed to register vcpu signal handler"); 931 // Block until all CPUs are ready. 932 vcpu_thread_barrier.wait(); 933 934 std::panic::catch_unwind(move || { 935 loop { 936 // If we are being told to pause, we park the thread 937 // until the pause boolean is toggled. 938 // The resume operation is responsible for toggling 939 // the boolean and unpark the thread. 940 // We enter a loop because park() could spuriously 941 // return. We will then park() again unless the 942 // pause boolean has been toggled. 943 944 // Need to use Ordering::SeqCst as we have multiple 945 // loads and stores to different atomics and we need 946 // to see them in a consistent order in all threads 947 948 if vcpu_pause_signalled.load(Ordering::SeqCst) { 949 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are 950 // completed by returning to KVM_RUN. From the kernel docs: 951 // 952 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, 953 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding 954 // operations are complete (and guest state is consistent) only after userspace 955 // has re-entered the kernel with KVM_RUN. The kernel side will first finish 956 // incomplete operations and then check for pending signals. 957 // The pending state of the operation is not preserved in state which is 958 // visible to userspace, thus userspace should ensure that the operation is 959 // completed before performing a live migration. 
Userspace can re-enter the 960 // guest with an unmasked signal pending or with the immediate_exit field set 961 // to complete pending operations without allowing any further instructions 962 // to be executed. 963 964 #[cfg(feature = "kvm")] 965 if matches!(hypervisor_type, HypervisorType::Kvm) { 966 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); 967 if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { 968 error!("Unexpected VM exit on \"immediate_exit\" run"); 969 break; 970 } 971 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); 972 } 973 974 vcpu_run_interrupted.store(true, Ordering::SeqCst); 975 976 vcpu_paused.store(true, Ordering::SeqCst); 977 while vcpu_pause_signalled.load(Ordering::SeqCst) { 978 thread::park(); 979 } 980 vcpu_run_interrupted.store(false, Ordering::SeqCst); 981 } 982 983 // We've been told to terminate 984 if vcpu_kill_signalled.load(Ordering::SeqCst) 985 || vcpu_kill.load(Ordering::SeqCst) 986 { 987 vcpu_run_interrupted.store(true, Ordering::SeqCst); 988 break; 989 } 990 991 #[cfg(feature = "tdx")] 992 let mut vcpu = vcpu.lock().unwrap(); 993 #[cfg(not(feature = "tdx"))] 994 let vcpu = vcpu.lock().unwrap(); 995 // vcpu.run() returns false on a triple-fault so trigger a reset 996 match vcpu.run() { 997 Ok(run) => match run { 998 #[cfg(feature = "kvm")] 999 VmExit::Debug => { 1000 info!("VmExit::Debug"); 1001 #[cfg(feature = "guest_debug")] 1002 { 1003 vcpu_pause_signalled.store(true, Ordering::SeqCst); 1004 let raw_tid = get_raw_tid(vcpu_id as usize); 1005 vm_debug_evt.write(raw_tid as u64).unwrap(); 1006 } 1007 } 1008 #[cfg(target_arch = "x86_64")] 1009 VmExit::IoapicEoi(vector) => { 1010 if let Some(interrupt_controller) = 1011 &interrupt_controller_clone 1012 { 1013 interrupt_controller 1014 .lock() 1015 .unwrap() 1016 .end_of_interrupt(vector); 1017 } 1018 } 1019 VmExit::Ignore => {} 1020 VmExit::Hyperv => {} 1021 VmExit::Reset => { 1022 info!("VmExit::Reset"); 1023 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1024 reset_evt.write(1).unwrap(); 1025 break; 1026 } 1027 VmExit::Shutdown => { 1028 info!("VmExit::Shutdown"); 1029 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1030 exit_evt.write(1).unwrap(); 1031 break; 1032 } 1033 #[cfg(feature = "tdx")] 1034 VmExit::Tdx => { 1035 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1036 match vcpu.get_tdx_exit_details() { 1037 Ok(details) => match details { 1038 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1039 TdxExitDetails::SetupEventNotifyInterrupt => { 1040 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1041 } 1042 }, 1043 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1044 } 1045 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1046 } else { 1047 // We should never reach this code as 1048 // this means the design from the code 1049 // is wrong. 
1050 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1051 } 1052 } 1053 _ => { 1054 error!( 1055 "VCPU generated error: {:?}", 1056 Error::UnexpectedVmExit 1057 ); 1058 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1059 exit_evt.write(1).unwrap(); 1060 break; 1061 } 1062 }, 1063 1064 Err(e) => { 1065 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1066 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1067 exit_evt.write(1).unwrap(); 1068 break; 1069 } 1070 } 1071 1072 // We've been told to terminate 1073 if vcpu_kill_signalled.load(Ordering::SeqCst) 1074 || vcpu_kill.load(Ordering::SeqCst) 1075 { 1076 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1077 break; 1078 } 1079 } 1080 }) 1081 .or_else(|_| { 1082 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1083 error!("vCPU thread panicked"); 1084 panic_exit_evt.write(1) 1085 }) 1086 .ok(); 1087 }) 1088 .map_err(Error::VcpuSpawn)?, 1089 ); 1090 1091 // On hot plug calls into this function entry_point is None. It is for 1092 // those hotplug CPU additions that we need to set the inserting flag. 1093 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1094 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1095 1096 Ok(()) 1097 } 1098 1099 /// Start up as many vCPUs threads as needed to reach `desired_vcpus` 1100 fn activate_vcpus( 1101 &mut self, 1102 desired_vcpus: u8, 1103 inserting: bool, 1104 paused: Option<bool>, 1105 ) -> Result<()> { 1106 if desired_vcpus > self.config.max_vcpus { 1107 return Err(Error::DesiredVCpuCountExceedsMax); 1108 } 1109 1110 let vcpu_thread_barrier = Arc::new(Barrier::new( 1111 (desired_vcpus - self.present_vcpus() + 1) as usize, 1112 )); 1113 1114 if let Some(paused) = paused { 1115 self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); 1116 } 1117 1118 info!( 1119 "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", 1120 desired_vcpus, 1121 self.vcpus.len(), 1122 self.present_vcpus(), 1123 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1124 ); 1125 1126 // This reuses any inactive vCPUs as well as any that were newly created 1127 for vcpu_id in self.present_vcpus()..desired_vcpus { 1128 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1129 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1130 } 1131 1132 // Unblock all CPU threads. 1133 vcpu_thread_barrier.wait(); 1134 Ok(()) 1135 } 1136 1137 fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) { 1138 // Mark vCPUs for removal, actual removal happens on ejection 1139 for cpu_id in desired_vcpus..self.present_vcpus() { 1140 self.vcpu_states[usize::from(cpu_id)].removing = true; 1141 } 1142 } 1143 1144 fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> { 1145 info!("Removing vCPU: cpu_id = {}", cpu_id); 1146 let state = &mut self.vcpu_states[usize::from(cpu_id)]; 1147 state.kill.store(true, Ordering::SeqCst); 1148 state.signal_thread(); 1149 state.join_thread()?; 1150 state.handle = None; 1151 1152 // Once the thread has exited, clear the "kill" so that it can reused 1153 state.kill.store(false, Ordering::SeqCst); 1154 1155 Ok(()) 1156 } 1157 1158 pub fn create_boot_vcpus( 1159 &mut self, 1160 snapshot: Option<Snapshot>, 1161 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 1162 trace_scoped!("create_boot_vcpus"); 1163 1164 self.create_vcpus(self.boot_vcpus(), snapshot) 1165 } 1166 1167 // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running. 
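    // Rough boot-time call order (as used elsewhere in this crate): create_boot_vcpus(),
    // then configure_vcpu() for each returned vCPU, then start_boot_vcpus(paused).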
1168 pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> { 1169 self.activate_vcpus(self.boot_vcpus(), false, Some(paused)) 1170 } 1171 1172 pub fn start_restored_vcpus(&mut self) -> Result<()> { 1173 self.activate_vcpus(self.vcpus.len() as u8, false, Some(true)) 1174 .map_err(|e| { 1175 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) 1176 })?; 1177 1178 Ok(()) 1179 } 1180 1181 pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> { 1182 if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal { 1183 return Ok(false); 1184 } 1185 1186 if !self.dynamic { 1187 return Ok(false); 1188 } 1189 1190 match desired_vcpus.cmp(&self.present_vcpus()) { 1191 cmp::Ordering::Greater => { 1192 let vcpus = self.create_vcpus(desired_vcpus, None)?; 1193 for vcpu in vcpus { 1194 self.configure_vcpu(vcpu, None)? 1195 } 1196 self.activate_vcpus(desired_vcpus, true, None)?; 1197 Ok(true) 1198 } 1199 cmp::Ordering::Less => { 1200 self.mark_vcpus_for_removal(desired_vcpus); 1201 Ok(true) 1202 } 1203 _ => Ok(false), 1204 } 1205 } 1206 1207 pub fn shutdown(&mut self) -> Result<()> { 1208 // Tell the vCPUs to stop themselves next time they go through the loop 1209 self.vcpus_kill_signalled.store(true, Ordering::SeqCst); 1210 1211 // Toggle the vCPUs pause boolean 1212 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 1213 1214 // Unpark all the VCPU threads. 1215 for state in self.vcpu_states.iter() { 1216 state.unpark_thread(); 1217 } 1218 1219 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 1220 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 1221 // above. 1222 for state in self.vcpu_states.iter() { 1223 state.signal_thread(); 1224 } 1225 1226 // Wait for all the threads to finish. This removes the state from the vector. 1227 for mut state in self.vcpu_states.drain(..) { 1228 state.join_thread()?; 1229 } 1230 1231 Ok(()) 1232 } 1233 1234 #[cfg(feature = "tdx")] 1235 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1236 for vcpu in &self.vcpus { 1237 vcpu.lock() 1238 .unwrap() 1239 .vcpu 1240 .tdx_init(hob_address) 1241 .map_err(Error::InitializeTdx)?; 1242 } 1243 Ok(()) 1244 } 1245 1246 pub fn boot_vcpus(&self) -> u8 { 1247 self.config.boot_vcpus 1248 } 1249 1250 pub fn max_vcpus(&self) -> u8 { 1251 self.config.max_vcpus 1252 } 1253 1254 #[cfg(target_arch = "x86_64")] 1255 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1256 assert!(!self.cpuid.is_empty()); 1257 self.cpuid.clone() 1258 } 1259 1260 fn present_vcpus(&self) -> u8 { 1261 self.vcpu_states 1262 .iter() 1263 .fold(0, |acc, state| acc + state.active() as u8) 1264 } 1265 1266 #[cfg(target_arch = "aarch64")] 1267 pub fn get_mpidrs(&self) -> Vec<u64> { 1268 self.vcpus 1269 .iter() 1270 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1271 .collect() 1272 } 1273 1274 #[cfg(target_arch = "aarch64")] 1275 pub fn get_saved_states(&self) -> Vec<CpuState> { 1276 self.vcpus 1277 .iter() 1278 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1279 .collect() 1280 } 1281 1282 #[cfg(target_arch = "aarch64")] 1283 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1284 self.config 1285 .topology 1286 .clone() 1287 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1288 } 1289 1290 pub fn create_madt(&self) -> Sdt { 1291 use crate::acpi; 1292 // This is also checked in the commandline parsing. 
1293 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1294 1295 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1296 #[cfg(target_arch = "x86_64")] 1297 { 1298 madt.write(36, arch::layout::APIC_START.0); 1299 1300 for cpu in 0..self.config.max_vcpus { 1301 let lapic = LocalX2Apic { 1302 r#type: acpi::ACPI_X2APIC_PROCESSOR, 1303 length: 16, 1304 processor_id: cpu.into(), 1305 apic_id: cpu.into(), 1306 flags: if cpu < self.config.boot_vcpus { 1307 1 << MADT_CPU_ENABLE_FLAG 1308 } else { 1309 0 1310 } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG, 1311 _reserved: 0, 1312 }; 1313 madt.append(lapic); 1314 } 1315 1316 madt.append(Ioapic { 1317 r#type: acpi::ACPI_APIC_IO, 1318 length: 12, 1319 ioapic_id: 0, 1320 apic_address: arch::layout::IOAPIC_START.0 as u32, 1321 gsi_base: 0, 1322 ..Default::default() 1323 }); 1324 1325 madt.append(InterruptSourceOverride { 1326 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1327 length: 10, 1328 bus: 0, 1329 source: 4, 1330 gsi: 4, 1331 flags: 0, 1332 }); 1333 } 1334 1335 #[cfg(target_arch = "aarch64")] 1336 { 1337 /* Notes: 1338 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1339 */ 1340 1341 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 1342 for cpu in 0..self.config.boot_vcpus { 1343 let vcpu = &self.vcpus[cpu as usize]; 1344 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1345 /* ARMv8 MPIDR format: 1346 Bits [63:40] Must be zero 1347 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1348 Bits [31:24] Must be zero 1349 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1350 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1351 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1352 */ 1353 let mpidr_mask = 0xff_00ff_ffff; 1354 let gicc = GicC { 1355 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1356 length: 80, 1357 reserved0: 0, 1358 cpu_interface_number: cpu as u32, 1359 uid: cpu as u32, 1360 flags: 1, 1361 parking_version: 0, 1362 performance_interrupt: 0, 1363 parked_address: 0, 1364 base_address: 0, 1365 gicv_base_address: 0, 1366 gich_base_address: 0, 1367 vgic_interrupt: 0, 1368 gicr_base_address: 0, 1369 mpidr: mpidr & mpidr_mask, 1370 proc_power_effi_class: 0, 1371 reserved1: 0, 1372 spe_overflow_interrupt: 0, 1373 }; 1374 1375 madt.append(gicc); 1376 } 1377 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1378 1379 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1380 let gicd = GicD { 1381 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1382 length: 24, 1383 reserved0: 0, 1384 gic_id: 0, 1385 base_address: vgic_config.dist_addr, 1386 global_irq_base: 0, 1387 version: 3, 1388 reserved1: [0; 3], 1389 }; 1390 madt.append(gicd); 1391 1392 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1393 let gicr = GicR { 1394 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1395 length: 16, 1396 reserved: 0, 1397 base_address: vgic_config.redists_addr, 1398 range_length: vgic_config.redists_size as u32, 1399 }; 1400 madt.append(gicr); 1401 1402 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 
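            // The ITS entry advertises the MSI doorbell region of the virtual GICv3,
            // taken from the same default vGIC configuration used for GICD/GICR above.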
1403 let gicits = GicIts { 1404 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1405 length: 20, 1406 reserved0: 0, 1407 translation_id: 0, 1408 base_address: vgic_config.msi_addr, 1409 reserved1: 0, 1410 }; 1411 madt.append(gicits); 1412 1413 madt.update_checksum(); 1414 } 1415 1416 madt 1417 } 1418 1419 #[cfg(target_arch = "aarch64")] 1420 pub fn create_pptt(&self) -> Sdt { 1421 let pptt_start = 0; 1422 let mut cpus = 0; 1423 let mut uid = 0; 1424 // If topology is not specified, the default setting is: 1425 // 1 package, multiple cores, 1 thread per core 1426 // This is also the behavior when PPTT is missing. 1427 let (threads_per_core, cores_per_package, packages) = 1428 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1429 1430 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1431 1432 for cluster_idx in 0..packages { 1433 if cpus < self.config.boot_vcpus as usize { 1434 let cluster_offset = pptt.len() - pptt_start; 1435 let cluster_hierarchy_node = ProcessorHierarchyNode { 1436 r#type: 0, 1437 length: 20, 1438 reserved: 0, 1439 flags: 0x2, 1440 parent: 0, 1441 acpi_processor_id: cluster_idx as u32, 1442 num_private_resources: 0, 1443 }; 1444 pptt.append(cluster_hierarchy_node); 1445 1446 for core_idx in 0..cores_per_package { 1447 let core_offset = pptt.len() - pptt_start; 1448 1449 if threads_per_core > 1 { 1450 let core_hierarchy_node = ProcessorHierarchyNode { 1451 r#type: 0, 1452 length: 20, 1453 reserved: 0, 1454 flags: 0x2, 1455 parent: cluster_offset as u32, 1456 acpi_processor_id: core_idx as u32, 1457 num_private_resources: 0, 1458 }; 1459 pptt.append(core_hierarchy_node); 1460 1461 for _thread_idx in 0..threads_per_core { 1462 let thread_hierarchy_node = ProcessorHierarchyNode { 1463 r#type: 0, 1464 length: 20, 1465 reserved: 0, 1466 flags: 0xE, 1467 parent: core_offset as u32, 1468 acpi_processor_id: uid as u32, 1469 num_private_resources: 0, 1470 }; 1471 pptt.append(thread_hierarchy_node); 1472 uid += 1; 1473 } 1474 } else { 1475 let thread_hierarchy_node = ProcessorHierarchyNode { 1476 r#type: 0, 1477 length: 20, 1478 reserved: 0, 1479 flags: 0xA, 1480 parent: cluster_offset as u32, 1481 acpi_processor_id: uid as u32, 1482 num_private_resources: 0, 1483 }; 1484 pptt.append(thread_hierarchy_node); 1485 uid += 1; 1486 } 1487 } 1488 cpus += (cores_per_package * threads_per_core) as usize; 1489 } 1490 } 1491 1492 pptt.update_checksum(); 1493 pptt 1494 } 1495 1496 #[cfg(feature = "guest_debug")] 1497 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1498 self.vcpus[usize::from(cpu_id)] 1499 .lock() 1500 .unwrap() 1501 .vcpu 1502 .get_regs() 1503 .map_err(Error::CpuDebug) 1504 } 1505 1506 #[cfg(feature = "guest_debug")] 1507 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1508 self.vcpus[usize::from(cpu_id)] 1509 .lock() 1510 .unwrap() 1511 .vcpu 1512 .set_regs(regs) 1513 .map_err(Error::CpuDebug) 1514 } 1515 1516 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1517 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1518 self.vcpus[usize::from(cpu_id)] 1519 .lock() 1520 .unwrap() 1521 .vcpu 1522 .get_sregs() 1523 .map_err(Error::CpuDebug) 1524 } 1525 1526 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1527 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1528 self.vcpus[usize::from(cpu_id)] 1529 .lock() 1530 .unwrap() 1531 .vcpu 1532 .set_sregs(sregs) 1533 .map_err(Error::CpuDebug) 1534 } 1535 1536 #[cfg(all(target_arch = "x86_64", feature = 
"guest_debug"))] 1537 fn translate_gva( 1538 &self, 1539 _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1540 cpu_id: u8, 1541 gva: u64, 1542 ) -> Result<u64> { 1543 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1544 .lock() 1545 .unwrap() 1546 .vcpu 1547 .translate_gva(gva, /* flags: unused */ 0) 1548 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1549 Ok(gpa) 1550 } 1551 1552 /// 1553 /// On AArch64, `translate_gva` API is not provided by KVM. We implemented 1554 /// it in VMM by walking through translation tables. 1555 /// 1556 /// Address translation is big topic, here we only focus the scenario that 1557 /// happens in VMM while debugging kernel. This `translate_gva` 1558 /// implementation is restricted to: 1559 /// - Exception Level 1 1560 /// - Translate high address range only (kernel space) 1561 /// 1562 /// This implementation supports following Arm-v8a features related to 1563 /// address translation: 1564 /// - FEAT_LPA 1565 /// - FEAT_LVA 1566 /// - FEAT_LPA2 1567 /// 1568 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 1569 fn translate_gva( 1570 &self, 1571 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1572 cpu_id: u8, 1573 gva: u64, 1574 ) -> Result<u64> { 1575 let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)] 1576 .lock() 1577 .unwrap() 1578 .vcpu 1579 .get_sys_reg(regs::TCR_EL1) 1580 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1581 let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)] 1582 .lock() 1583 .unwrap() 1584 .vcpu 1585 .get_sys_reg(regs::TTBR1_EL1) 1586 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1587 let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)] 1588 .lock() 1589 .unwrap() 1590 .vcpu 1591 .get_sys_reg(regs::ID_AA64MMFR0_EL1) 1592 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1593 1594 // Bit 55 of the VA determines the range, high (0xFFFxxx...) 1595 // or low (0x000xxx...). 1596 let high_range = extract_bits_64!(gva, 55, 1); 1597 if high_range == 0 { 1598 info!("VA (0x{:x}) range is not supported!", gva); 1599 return Ok(gva); 1600 } 1601 1602 // High range size offset 1603 let tsz = extract_bits_64!(tcr_el1, 16, 6); 1604 // Granule size 1605 let tg = extract_bits_64!(tcr_el1, 30, 2); 1606 // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2 1607 let ds = extract_bits_64!(tcr_el1, 59, 1); 1608 1609 if tsz == 0 { 1610 info!("VA translation is not ready!"); 1611 return Ok(gva); 1612 } 1613 1614 // VA size is determined by TCR_BL1.T1SZ 1615 let va_size = 64 - tsz; 1616 // Number of bits in VA consumed in each level of translation 1617 let stride = match tg { 1618 3 => 13, // 64KB granule size 1619 1 => 11, // 16KB granule size 1620 _ => 9, // 4KB, default 1621 }; 1622 // Starting level of walking 1623 let mut level = 4 - (va_size - 4) / stride; 1624 1625 // PA or IPA size is determined 1626 let tcr_ips = extract_bits_64!(tcr_el1, 32, 3); 1627 let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4); 1628 // The IPA size in TCR_BL1 and PA Range in ID_AA64MMFR0_EL1 should match. 1629 // To be safe, we use the minimum value if they are different. 
        let pa_range = std::cmp::min(tcr_ips, pa_range);
        // PA size in bits
        let pa_size = match pa_range {
            0 => 32,
            1 => 36,
            2 => 40,
            3 => 42,
            4 => 44,
            5 => 48,
            6 => 52,
            _ => {
                return Err(Error::TranslateVirtualAddress(anyhow!(format!(
                    "PA range not supported {pa_range}"
                ))))
            }
        };

        let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
        let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
        // If FEAT_LPA2 is present, the translation table descriptor holds
        // 50 bits of the next-level table address.
        // Otherwise, it holds 48 bits.
        let descaddrmask = if ds == 1 {
            !0u64 >> (64 - 50) // mask with 50 least significant bits
        } else {
            !0u64 >> (64 - 48) // mask with 48 least significant bits
        };
        let descaddrmask = descaddrmask & !indexmask_grainsize;

        // Translation table base address
        let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
        // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
        // address bits [48:51] come from TTBR1_EL1 bits [2:5].
        if pa_size == 52 {
            descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
        }

        // Loop through tables of each level
        loop {
            // Table offset for current level
            let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
            descaddr |= table_offset;
            descaddr &= !7u64;

            let mut buf = [0; 8];
            guest_memory
                .memory()
                .read(&mut buf, GuestAddress(descaddr))
                .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
            let descriptor = u64::from_le_bytes(buf);

            descaddr = descriptor & descaddrmask;
            // In the case of FEAT_LPA, the next-level translation table address
            // bits [48:51] come from bits [12:15] of the current descriptor.
            // For FEAT_LPA2, the next-level translation table address
            // bits [50:51] come from bits [8:9] of the current descriptor, and
            // bits [48:49] come from bits [48:49] of the descriptor which was
            // handled previously.
            if pa_size == 52 {
                if ds == 1 {
                    // FEAT_LPA2
                    descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
                } else {
                    // FEAT_LPA
                    descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
                }
            }

            if (descriptor & 2) != 0 && (level < 3) {
                // This is a table entry. Go down to the next level.
1700 level += 1; 1701 indexmask = indexmask_grainsize; 1702 continue; 1703 } 1704 1705 break; 1706 } 1707 1708 // We have reached either: 1709 // - a page entry at level 3 or 1710 // - a block entry at level 1 or 2 1711 let page_size = 1u64 << ((stride * (4 - level)) + 3); 1712 descaddr &= !(page_size - 1); 1713 descaddr |= gva & (page_size - 1); 1714 1715 Ok(descaddr) 1716 } 1717 1718 pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) { 1719 self.acpi_address = Some(acpi_address); 1720 } 1721 1722 pub(crate) fn set_interrupt_controller( 1723 &mut self, 1724 interrupt_controller: Arc<Mutex<dyn InterruptController>>, 1725 ) { 1726 self.interrupt_controller = Some(interrupt_controller); 1727 } 1728 1729 pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> { 1730 &self.vcpus_kill_signalled 1731 } 1732 } 1733 1734 struct Cpu { 1735 cpu_id: u8, 1736 proximity_domain: u32, 1737 dynamic: bool, 1738 } 1739 1740 #[cfg(target_arch = "x86_64")] 1741 const MADT_CPU_ENABLE_FLAG: usize = 0; 1742 1743 #[cfg(target_arch = "x86_64")] 1744 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; 1745 1746 impl Cpu { 1747 #[cfg(target_arch = "x86_64")] 1748 fn generate_mat(&self) -> Vec<u8> { 1749 let lapic = LocalX2Apic { 1750 r#type: crate::acpi::ACPI_X2APIC_PROCESSOR, 1751 length: 16, 1752 processor_id: self.cpu_id.into(), 1753 apic_id: self.cpu_id.into(), 1754 flags: 1 << MADT_CPU_ENABLE_FLAG, 1755 _reserved: 0, 1756 }; 1757 1758 let mut mat_data: Vec<u8> = Vec::new(); 1759 mat_data.resize(std::mem::size_of_val(&lapic), 0); 1760 // SAFETY: mat_data is large enough to hold lapic 1761 unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic }; 1762 1763 mat_data 1764 } 1765 } 1766 1767 impl Aml for Cpu { 1768 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1769 #[cfg(target_arch = "x86_64")] 1770 let mat_data: Vec<u8> = self.generate_mat(); 1771 #[allow(clippy::if_same_then_else)] 1772 if self.dynamic { 1773 aml::Device::new( 1774 format!("C{:03X}", self.cpu_id).as_str().into(), 1775 vec![ 1776 &aml::Name::new("_HID".into(), &"ACPI0007"), 1777 &aml::Name::new("_UID".into(), &self.cpu_id), 1778 // Currently, AArch64 cannot support following fields. 1779 /* 1780 _STA return value: 1781 Bit [0] – Set if the device is present. 1782 Bit [1] – Set if the device is enabled and decoding its resources. 1783 Bit [2] – Set if the device should be shown in the UI. 1784 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1785 Bit [4] – Set if the battery is present. 1786 Bits [31:5] – Reserved (must be cleared). 
1787 */ 1788 #[cfg(target_arch = "x86_64")] 1789 &aml::Method::new( 1790 "_STA".into(), 1791 0, 1792 false, 1793 // Call into CSTA method which will interrogate device 1794 vec![&aml::Return::new(&aml::MethodCall::new( 1795 "CSTA".into(), 1796 vec![&self.cpu_id], 1797 ))], 1798 ), 1799 &aml::Method::new( 1800 "_PXM".into(), 1801 0, 1802 false, 1803 vec![&aml::Return::new(&self.proximity_domain)], 1804 ), 1805 // The Linux kernel expects every CPU device to have a _MAT entry 1806 // containing the LAPIC for this processor with the enabled bit set 1807 // even it if is disabled in the MADT (non-boot CPU) 1808 #[cfg(target_arch = "x86_64")] 1809 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 1810 // Trigger CPU ejection 1811 #[cfg(target_arch = "x86_64")] 1812 &aml::Method::new( 1813 "_EJ0".into(), 1814 1, 1815 false, 1816 // Call into CEJ0 method which will actually eject device 1817 vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])], 1818 ), 1819 ], 1820 ) 1821 .to_aml_bytes(sink); 1822 } else { 1823 aml::Device::new( 1824 format!("C{:03X}", self.cpu_id).as_str().into(), 1825 vec![ 1826 &aml::Name::new("_HID".into(), &"ACPI0007"), 1827 &aml::Name::new("_UID".into(), &self.cpu_id), 1828 #[cfg(target_arch = "x86_64")] 1829 &aml::Method::new( 1830 "_STA".into(), 1831 0, 1832 false, 1833 // Mark CPU present see CSTA implementation 1834 vec![&aml::Return::new(&0xfu8)], 1835 ), 1836 &aml::Method::new( 1837 "_PXM".into(), 1838 0, 1839 false, 1840 vec![&aml::Return::new(&self.proximity_domain)], 1841 ), 1842 // The Linux kernel expects every CPU device to have a _MAT entry 1843 // containing the LAPIC for this processor with the enabled bit set 1844 // even it if is disabled in the MADT (non-boot CPU) 1845 #[cfg(target_arch = "x86_64")] 1846 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 1847 ], 1848 ) 1849 .to_aml_bytes(sink); 1850 } 1851 } 1852 } 1853 1854 struct CpuNotify { 1855 cpu_id: u8, 1856 } 1857 1858 impl Aml for CpuNotify { 1859 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1860 let object = aml::Path::new(&format!("C{:03X}", self.cpu_id)); 1861 aml::If::new( 1862 &aml::Equal::new(&aml::Arg(0), &self.cpu_id), 1863 vec![&aml::Notify::new(&object, &aml::Arg(1))], 1864 ) 1865 .to_aml_bytes(sink) 1866 } 1867 } 1868 1869 struct CpuMethods { 1870 max_vcpus: u8, 1871 dynamic: bool, 1872 } 1873 1874 impl Aml for CpuMethods { 1875 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1876 if self.dynamic { 1877 // CPU status method 1878 aml::Method::new( 1879 "CSTA".into(), 1880 1, 1881 true, 1882 vec![ 1883 // Take lock defined above 1884 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1885 // Write CPU number (in first argument) to I/O port via field 1886 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 1887 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1888 // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning) 1889 &aml::If::new( 1890 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE), 1891 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 1892 ), 1893 // Release lock 1894 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1895 // Return 0 or 0xf 1896 &aml::Return::new(&aml::Local(0)), 1897 ], 1898 ) 1899 .to_aml_bytes(sink); 1900 1901 let mut cpu_notifies = Vec::new(); 1902 for cpu_id in 0..self.max_vcpus { 1903 cpu_notifies.push(CpuNotify { cpu_id }); 1904 } 1905 1906 let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new(); 1907 for cpu_id in 
0..self.max_vcpus { 1908 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 1909 } 1910 1911 aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink); 1912 1913 aml::Method::new( 1914 "CEJ0".into(), 1915 1, 1916 true, 1917 vec![ 1918 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1919 // Write CPU number (in first argument) to I/O port via field 1920 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 1921 // Set CEJ0 bit 1922 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 1923 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1924 ], 1925 ) 1926 .to_aml_bytes(sink); 1927 1928 aml::Method::new( 1929 "CSCN".into(), 1930 0, 1931 true, 1932 vec![ 1933 // Take lock defined above 1934 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1935 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1936 &aml::While::new( 1937 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 1938 vec![ 1939 // Write CPU number (in first argument) to I/O port via field 1940 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 1941 // Check if CINS bit is set 1942 &aml::If::new( 1943 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 1944 // Notify device if it is 1945 vec![ 1946 &aml::MethodCall::new( 1947 "CTFY".into(), 1948 vec![&aml::Local(0), &aml::ONE], 1949 ), 1950 // Reset CINS bit 1951 &aml::Store::new( 1952 &aml::Path::new("\\_SB_.PRES.CINS"), 1953 &aml::ONE, 1954 ), 1955 ], 1956 ), 1957 // Check if CRMV bit is set 1958 &aml::If::new( 1959 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 1960 // Notify device if it is (with the eject constant 0x3) 1961 vec![ 1962 &aml::MethodCall::new( 1963 "CTFY".into(), 1964 vec![&aml::Local(0), &3u8], 1965 ), 1966 // Reset CRMV bit 1967 &aml::Store::new( 1968 &aml::Path::new("\\_SB_.PRES.CRMV"), 1969 &aml::ONE, 1970 ), 1971 ], 1972 ), 1973 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 1974 ], 1975 ), 1976 // Release lock 1977 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1978 ], 1979 ) 1980 .to_aml_bytes(sink) 1981 } else { 1982 aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink) 1983 } 1984 } 1985 } 1986 1987 impl Aml for CpuManager { 1988 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1989 #[cfg(target_arch = "x86_64")] 1990 if let Some(acpi_address) = self.acpi_address { 1991 // CPU hotplug controller 1992 aml::Device::new( 1993 "_SB_.PRES".into(), 1994 vec![ 1995 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 1996 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 1997 // Mutex to protect concurrent access as we write to choose CPU and then read back status 1998 &aml::Mutex::new("CPLK".into(), 0), 1999 &aml::Name::new( 2000 "_CRS".into(), 2001 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2002 aml::AddressSpaceCachable::NotCacheable, 2003 true, 2004 acpi_address.0, 2005 acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 2006 None, 2007 )]), 2008 ), 2009 // OpRegion and Fields map MMIO range into individual field values 2010 &aml::OpRegion::new( 2011 "PRST".into(), 2012 aml::OpRegionSpace::SystemMemory, 2013 &(acpi_address.0 as usize), 2014 &CPU_MANAGER_ACPI_SIZE, 2015 ), 2016 &aml::Field::new( 2017 "PRST".into(), 2018 aml::FieldAccessType::Byte, 2019 aml::FieldLockRule::NoLock, 2020 aml::FieldUpdateRule::WriteAsZeroes, 2021 vec![ 2022 aml::FieldEntry::Reserved(32), 2023 aml::FieldEntry::Named(*b"CPEN", 1), 2024 aml::FieldEntry::Named(*b"CINS", 1), 2025 
                            aml::FieldEntry::Named(*b"CRMV", 1),
                            aml::FieldEntry::Named(*b"CEJ0", 1),
                            aml::FieldEntry::Reserved(4),
                            aml::FieldEntry::Named(*b"CCMD", 8),
                        ],
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"CSEL", 32),
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CDAT", 32),
                        ],
                    ),
                ],
            )
            .to_aml_bytes(sink);
        }

        // CPU devices
        let hid = aml::Name::new("_HID".into(), &"ACPI0010");
        let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
        // Bundle methods together under a common object
        let methods = CpuMethods {
            max_vcpus: self.config.max_vcpus,
            dynamic: self.dynamic,
        };
        let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];

        let mut cpu_devices = Vec::new();
        for cpu_id in 0..self.config.max_vcpus {
            let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
            let cpu_device = Cpu {
                cpu_id,
                proximity_domain,
                dynamic: self.dynamic,
            };

            cpu_devices.push(cpu_device);
        }

        for cpu_device in cpu_devices.iter() {
            cpu_data_inner.push(cpu_device);
        }

        aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
    }
}

impl Pausable for CpuManager {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        // Tell the vCPUs to pause themselves next time they exit
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        for vcpu in self.vcpus.iter() {
            let mut vcpu = vcpu.lock().unwrap();
            vcpu.pause()?;
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            if !self.config.kvm_hyperv {
                vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
                    MigratableError::Pause(anyhow!(
                        "Could not notify guest it has been paused {:?}",
                        e
                    ))
                })?;
            }
        }

        // The vCPU thread will change its paused state before parking; wait here for each
        // active vCPU to change its state, to ensure it has parked.
        for state in self.vcpu_states.iter() {
            if state.active() {
                while !state.paused.load(Ordering::SeqCst) {
                    // To avoid a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }

        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        for vcpu in self.vcpus.iter() {
            vcpu.lock().unwrap().resume()?;
        }

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        // Once unparked, the next thing they will do is check the pause
        // boolean. Since it'll be set to false, they will exit their pause loop
        // and go back to VMX root.
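        // Clear each vCPU's `paused` flag before unparking its thread; pause()
        // polls this flag to determine when every active vCPU has actually parked.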
2129 for state in self.vcpu_states.iter() { 2130 state.paused.store(false, Ordering::SeqCst); 2131 state.unpark_thread(); 2132 } 2133 Ok(()) 2134 } 2135 } 2136 2137 impl Snapshottable for CpuManager { 2138 fn id(&self) -> String { 2139 CPU_MANAGER_SNAPSHOT_ID.to_string() 2140 } 2141 2142 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2143 let mut cpu_manager_snapshot = Snapshot::default(); 2144 2145 // The CpuManager snapshot is a collection of all vCPUs snapshots. 2146 for vcpu in &self.vcpus { 2147 let mut vcpu = vcpu.lock().unwrap(); 2148 cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?); 2149 } 2150 2151 Ok(cpu_manager_snapshot) 2152 } 2153 } 2154 2155 impl Transportable for CpuManager {} 2156 impl Migratable for CpuManager {} 2157 2158 #[cfg(feature = "guest_debug")] 2159 impl Debuggable for CpuManager { 2160 #[cfg(feature = "kvm")] 2161 fn set_guest_debug( 2162 &self, 2163 cpu_id: usize, 2164 addrs: &[GuestAddress], 2165 singlestep: bool, 2166 ) -> std::result::Result<(), DebuggableError> { 2167 self.vcpus[cpu_id] 2168 .lock() 2169 .unwrap() 2170 .vcpu 2171 .set_guest_debug(addrs, singlestep) 2172 .map_err(DebuggableError::SetDebug) 2173 } 2174 2175 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2176 Ok(()) 2177 } 2178 2179 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2180 Ok(()) 2181 } 2182 2183 #[cfg(target_arch = "x86_64")] 2184 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2185 // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15 2186 let gregs = self 2187 .get_regs(cpu_id as u8) 2188 .map_err(DebuggableError::ReadRegs)?; 2189 let regs = [ 2190 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp, 2191 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15, 2192 ]; 2193 2194 // GDB exposes 32-bit eflags instead of 64-bit rflags. 
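        // Bits 63:32 of RFLAGS are reserved, so truncating to 32 bits drops no
        // architectural state (e.g. rflags 0x0000_0000_0000_0246 -> eflags 0x246).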
        // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
        let eflags = gregs.rflags as u32;
        let rip = gregs.rip;

        // Segment registers: CS, SS, DS, ES, FS, GS
        let sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let segments = X86SegmentRegs {
            cs: sregs.cs.selector as u32,
            ss: sregs.ss.selector as u32,
            ds: sregs.ds.selector as u32,
            es: sregs.es.selector as u32,
            fs: sregs.fs.selector as u32,
            gs: sregs.gs.selector as u32,
        };

        // TODO: Add other registers

        Ok(CoreRegs {
            regs,
            eflags,
            rip,
            segments,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "aarch64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        Ok(CoreRegs {
            x: gregs.regs.regs,
            sp: gregs.regs.sp,
            pc: gregs.regs.pc,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let orig_gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let gregs = StandardRegisters {
            rax: regs.regs[0],
            rbx: regs.regs[1],
            rcx: regs.regs[2],
            rdx: regs.regs[3],
            rsi: regs.regs[4],
            rdi: regs.regs[5],
            rbp: regs.regs[6],
            rsp: regs.regs[7],
            r8: regs.regs[8],
            r9: regs.regs[9],
            r10: regs.regs[10],
            r11: regs.regs[11],
            r12: regs.regs[12],
            r13: regs.regs[13],
            r14: regs.regs[14],
            r15: regs.regs[15],
            rip: regs.rip,
            // Update the lower 32 bits of rflags.
            rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
        };

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // Since GDB cares only about selectors, we call get_sregs() first.
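        // Reading the full special registers first and overwriting only the selector
        // fields keeps the hidden parts of each segment register (base, limit, access
        // rights) intact when they are written back below.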
2272 let mut sregs = self 2273 .get_sregs(cpu_id as u8) 2274 .map_err(DebuggableError::ReadRegs)?; 2275 sregs.cs.selector = regs.segments.cs as u16; 2276 sregs.ss.selector = regs.segments.ss as u16; 2277 sregs.ds.selector = regs.segments.ds as u16; 2278 sregs.es.selector = regs.segments.es as u16; 2279 sregs.fs.selector = regs.segments.fs as u16; 2280 sregs.gs.selector = regs.segments.gs as u16; 2281 2282 self.set_sregs(cpu_id as u8, &sregs) 2283 .map_err(DebuggableError::WriteRegs)?; 2284 2285 // TODO: Add other registers 2286 2287 Ok(()) 2288 } 2289 2290 #[cfg(target_arch = "aarch64")] 2291 fn write_regs( 2292 &self, 2293 cpu_id: usize, 2294 regs: &CoreRegs, 2295 ) -> std::result::Result<(), DebuggableError> { 2296 let mut gregs = self 2297 .get_regs(cpu_id as u8) 2298 .map_err(DebuggableError::ReadRegs)?; 2299 2300 gregs.regs.regs = regs.x; 2301 gregs.regs.sp = regs.sp; 2302 gregs.regs.pc = regs.pc; 2303 2304 self.set_regs(cpu_id as u8, &gregs) 2305 .map_err(DebuggableError::WriteRegs)?; 2306 2307 Ok(()) 2308 } 2309 2310 fn read_mem( 2311 &self, 2312 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2313 cpu_id: usize, 2314 vaddr: GuestAddress, 2315 len: usize, 2316 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2317 let mut buf = vec![0; len]; 2318 let mut total_read = 0_u64; 2319 2320 while total_read < len as u64 { 2321 let gaddr = vaddr.0 + total_read; 2322 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2323 Ok(paddr) => paddr, 2324 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 2325 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2326 }; 2327 let psize = arch::PAGE_SIZE as u64; 2328 let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1))); 2329 guest_memory 2330 .memory() 2331 .read( 2332 &mut buf[total_read as usize..total_read as usize + read_len as usize], 2333 GuestAddress(paddr), 2334 ) 2335 .map_err(DebuggableError::ReadMem)?; 2336 total_read += read_len; 2337 } 2338 Ok(buf) 2339 } 2340 2341 fn write_mem( 2342 &self, 2343 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2344 cpu_id: usize, 2345 vaddr: &GuestAddress, 2346 data: &[u8], 2347 ) -> std::result::Result<(), DebuggableError> { 2348 let mut total_written = 0_u64; 2349 2350 while total_written < data.len() as u64 { 2351 let gaddr = vaddr.0 + total_written; 2352 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2353 Ok(paddr) => paddr, 2354 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let write_len = std::cmp::min(
                data.len() as u64 - total_written,
                psize - (paddr & (psize - 1)),
            );
            guest_memory
                .memory()
                .write(
                    &data[total_written as usize..total_written as usize + write_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::WriteMem)?;
            total_written += write_len;
        }
        Ok(())
    }

    fn active_vcpus(&self) -> usize {
        self.present_vcpus() as usize
    }
}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for CpuManager {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl CpuElf64Writable for CpuManager {
    fn cpu_write_elf64_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Elf, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<X86_64ElfPrStatus>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: NT_PRSTATUS,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
            buf.resize(pos + 4, 0);
            buf.splice(pos.., "CORE".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
            buf.resize(pos + 32 + 4, 0);
            let pid = vcpu_id as u64;
            let bytes: &[u8] = pid.as_slice();
            buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */

            pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();

            let orig_rax: u64 = 0;
            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
                gregs.r10,
            ];
            let regs2 = [
                gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            debug!(
                "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
                gregs.rip,
                gregs.rsp,
                sregs.gs.base,
                sregs.cs.selector,
                sregs.ss.selector,
                sregs.ds.selector,
            );

            let regs = X86_64UserRegs {
                regs1,
                regs2,
                rip: gregs.rip,
                cs: sregs.cs.selector as u64,
                eflags: gregs.rflags,
                rsp: gregs.rsp,
                ss: sregs.ss.selector as u64,
                fs_base: sregs.fs.base,
                gs_base: sregs.gs.base,
                ds: sregs.ds.selector as u64,
                es: sregs.es.selector as u64,
                fs: sregs.fs.selector as u64,
                gs: sregs.gs.selector as u64,
            };

            // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) };
            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

Ok(()) 2477 } 2478 2479 fn cpu_write_vmm_note( 2480 &mut self, 2481 dump_state: &DumpState, 2482 ) -> std::result::Result<(), GuestDebuggableError> { 2483 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2484 for vcpu in &self.vcpus { 2485 let note_size = self.get_note_size(NoteDescType::Vmm, 1); 2486 let mut pos: usize = 0; 2487 let mut buf = vec![0; note_size as usize]; 2488 let descsz = size_of::<DumpCpusState>(); 2489 let vcpu_id = vcpu.lock().unwrap().id; 2490 2491 let note = Elf64_Nhdr { 2492 n_namesz: COREDUMP_NAME_SIZE, 2493 n_descsz: descsz as u32, 2494 n_type: 0, 2495 }; 2496 2497 let bytes: &[u8] = note.as_slice(); 2498 buf.splice(0.., bytes.to_vec()); 2499 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2500 2501 buf.resize(pos + 4, 0); 2502 buf.splice(pos.., "QEMU".to_string().into_bytes()); 2503 2504 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2505 2506 let gregs = self.vcpus[usize::from(vcpu_id)] 2507 .lock() 2508 .unwrap() 2509 .vcpu 2510 .get_regs() 2511 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2512 2513 let regs1 = [ 2514 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp, 2515 gregs.rbp, 2516 ]; 2517 2518 let regs2 = [ 2519 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, 2520 gregs.r15, 2521 ]; 2522 2523 let sregs = self.vcpus[usize::from(vcpu_id)] 2524 .lock() 2525 .unwrap() 2526 .vcpu 2527 .get_sregs() 2528 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2529 2530 let mut msrs = vec![MsrEntry { 2531 index: msr_index::MSR_KERNEL_GS_BASE, 2532 ..Default::default() 2533 }]; 2534 2535 self.vcpus[vcpu_id as usize] 2536 .lock() 2537 .unwrap() 2538 .vcpu 2539 .get_msrs(&mut msrs) 2540 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?; 2541 let kernel_gs_base = msrs[0].data; 2542 2543 let cs = CpuSegment::new(sregs.cs); 2544 let ds = CpuSegment::new(sregs.ds); 2545 let es = CpuSegment::new(sregs.es); 2546 let fs = CpuSegment::new(sregs.fs); 2547 let gs = CpuSegment::new(sregs.gs); 2548 let ss = CpuSegment::new(sregs.ss); 2549 let ldt = CpuSegment::new(sregs.ldt); 2550 let tr = CpuSegment::new(sregs.tr); 2551 let gdt = CpuSegment::new_from_table(sregs.gdt); 2552 let idt = CpuSegment::new_from_table(sregs.idt); 2553 let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4]; 2554 let regs = DumpCpusState { 2555 version: 1, 2556 size: size_of::<DumpCpusState>() as u32, 2557 regs1, 2558 regs2, 2559 rip: gregs.rip, 2560 rflags: gregs.rflags, 2561 cs, 2562 ds, 2563 es, 2564 fs, 2565 gs, 2566 ss, 2567 ldt, 2568 tr, 2569 gdt, 2570 idt, 2571 cr, 2572 kernel_gs_base, 2573 }; 2574 2575 let bytes: &[u8] = regs.as_slice(); 2576 buf.resize(note_size as usize, 0); 2577 buf.splice(pos.., bytes.to_vec()); 2578 buf.resize(note_size as usize, 0); 2579 2580 coredump_file 2581 .write(&buf) 2582 .map_err(GuestDebuggableError::CoredumpFile)?; 2583 } 2584 2585 Ok(()) 2586 } 2587 } 2588 2589 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2590 #[cfg(test)] 2591 mod tests { 2592 use arch::x86_64::interrupts::*; 2593 use arch::x86_64::regs::*; 2594 use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters}; 2595 2596 #[test] 2597 fn test_setlint() { 2598 let hv = hypervisor::new().unwrap(); 2599 let vm = hv.create_vm().expect("new VM fd creation failed"); 2600 assert!(hv.check_required_extensions().is_ok()); 2601 // Calling get_lapic will fail if there is no irqchip before hand. 
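        // set_lint() is expected to program the delivery mode field (bits 10:8 of each
        // LVT entry) to ExtINT for LVT0 and NMI for LVT1; the expected values are derived
        // below with set_apic_delivery_mode and compared after the call.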
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
        let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
        let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should
        // remove it at all.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }];

        // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read 1
        // in this test case scenario.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we did setup_msrs. We need to assert that the
        // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
        // expect.
2668 let entry_vec = vcpu.boot_msr_entries(); 2669 assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]); 2670 } 2671 2672 #[test] 2673 fn test_setup_regs() { 2674 let hv = hypervisor::new().unwrap(); 2675 let vm = hv.create_vm().expect("new VM fd creation failed"); 2676 let vcpu = vm.create_vcpu(0, None).unwrap(); 2677 2678 let expected_regs: StandardRegisters = StandardRegisters { 2679 rflags: 0x0000000000000002u64, 2680 rbx: arch::layout::PVH_INFO_START.0, 2681 rip: 1, 2682 ..Default::default() 2683 }; 2684 2685 setup_regs(&vcpu, expected_regs.rip).unwrap(); 2686 2687 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2688 assert_eq!(actual_regs, expected_regs); 2689 } 2690 } 2691 2692 #[cfg(target_arch = "aarch64")] 2693 #[cfg(test)] 2694 mod tests { 2695 use arch::{aarch64::regs, layout}; 2696 use hypervisor::kvm::aarch64::is_system_register; 2697 use hypervisor::kvm::kvm_bindings::{ 2698 kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, 2699 KVM_REG_ARM_CORE, KVM_REG_SIZE_U64, 2700 }; 2701 use hypervisor::{arm64_core_reg_id, offset_of}; 2702 use std::mem; 2703 2704 #[test] 2705 fn test_setup_regs() { 2706 let hv = hypervisor::new().unwrap(); 2707 let vm = hv.create_vm().unwrap(); 2708 let vcpu = vm.create_vcpu(0, None).unwrap(); 2709 2710 let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0); 2711 // Must fail when vcpu is not initialized yet. 2712 assert!(res.is_err()); 2713 2714 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2715 vm.get_preferred_target(&mut kvi).unwrap(); 2716 vcpu.vcpu_init(&kvi).unwrap(); 2717 2718 assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok()); 2719 } 2720 2721 #[test] 2722 fn test_read_mpidr() { 2723 let hv = hypervisor::new().unwrap(); 2724 let vm = hv.create_vm().unwrap(); 2725 let vcpu = vm.create_vcpu(0, None).unwrap(); 2726 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2727 vm.get_preferred_target(&mut kvi).unwrap(); 2728 2729 // Must fail when vcpu is not initialized yet. 2730 assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err()); 2731 2732 vcpu.vcpu_init(&kvi).unwrap(); 2733 assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000); 2734 } 2735 2736 #[test] 2737 fn test_is_system_register() { 2738 let offset = offset_of!(user_pt_regs, pc); 2739 let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset); 2740 assert!(!is_system_register(regid)); 2741 let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64; 2742 assert!(is_system_register(regid)); 2743 } 2744 2745 #[test] 2746 fn test_save_restore_core_regs() { 2747 let hv = hypervisor::new().unwrap(); 2748 let vm = hv.create_vm().unwrap(); 2749 let vcpu = vm.create_vcpu(0, None).unwrap(); 2750 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2751 vm.get_preferred_target(&mut kvi).unwrap(); 2752 2753 // Must fail when vcpu is not initialized yet. 
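        // Before KVM_ARM_VCPU_INIT has been called, KVM rejects register accesses with
        // ENOEXEC (os error 8, "Exec format error"); the assertions below check the
        // error text verbatim.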
2754 let res = vcpu.get_regs(); 2755 assert!(res.is_err()); 2756 assert_eq!( 2757 format!("{}", res.unwrap_err()), 2758 "Failed to get core register: Exec format error (os error 8)" 2759 ); 2760 2761 let mut state = kvm_regs::default(); 2762 let res = vcpu.set_regs(&state); 2763 assert!(res.is_err()); 2764 assert_eq!( 2765 format!("{}", res.unwrap_err()), 2766 "Failed to set core register: Exec format error (os error 8)" 2767 ); 2768 2769 vcpu.vcpu_init(&kvi).unwrap(); 2770 let res = vcpu.get_regs(); 2771 assert!(res.is_ok()); 2772 state = res.unwrap(); 2773 assert_eq!(state.regs.pstate, 0x3C5); 2774 2775 assert!(vcpu.set_regs(&state).is_ok()); 2776 } 2777 2778 #[test] 2779 fn test_get_set_mpstate() { 2780 let hv = hypervisor::new().unwrap(); 2781 let vm = hv.create_vm().unwrap(); 2782 let vcpu = vm.create_vcpu(0, None).unwrap(); 2783 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 2784 vm.get_preferred_target(&mut kvi).unwrap(); 2785 2786 let res = vcpu.get_mp_state(); 2787 assert!(res.is_ok()); 2788 assert!(vcpu.set_mp_state(res.unwrap()).is_ok()); 2789 } 2790 } 2791