1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::CpusConfig; 15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 16 use crate::coredump::{ 17 CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable, 18 GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE, 19 NT_PRSTATUS, 20 }; 21 #[cfg(feature = "guest_debug")] 22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError}; 23 #[cfg(target_arch = "x86_64")] 24 use crate::memory_manager::MemoryManager; 25 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 26 #[cfg(target_arch = "x86_64")] 27 use crate::vm::physical_bits; 28 use crate::GuestMemoryMmap; 29 use crate::CPU_MANAGER_SNAPSHOT_ID; 30 use acpi_tables::{aml, sdt::Sdt, Aml}; 31 use anyhow::anyhow; 32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 33 use arch::aarch64::regs; 34 use arch::EntryPoint; 35 use arch::NumaNodes; 36 #[cfg(target_arch = "aarch64")] 37 use devices::gic::Gic; 38 use devices::interrupt_controller::InterruptController; 39 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 40 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 41 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 42 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs}; 43 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 44 use hypervisor::aarch64::StandardRegisters; 45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 46 use hypervisor::arch::x86::msr_index; 47 #[cfg(target_arch = "x86_64")] 48 use hypervisor::arch::x86::CpuIdEntry; 49 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 50 use hypervisor::arch::x86::MsrEntry; 51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 52 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters}; 53 #[cfg(target_arch = "aarch64")] 54 use hypervisor::kvm::kvm_bindings; 55 #[cfg(all(target_arch = "aarch64", feature = "kvm"))] 56 use hypervisor::kvm::kvm_ioctls::Cap; 57 #[cfg(feature = "tdx")] 58 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus}; 59 #[cfg(target_arch = "x86_64")] 60 use hypervisor::CpuVendor; 61 use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps}; 62 use libc::{c_void, siginfo_t}; 63 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 64 use linux_loader::elf::Elf64_Nhdr; 65 use seccompiler::{apply_filter, SeccompAction}; 66 use std::collections::BTreeMap; 67 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 68 use std::io::Write; 69 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 70 use std::mem::size_of; 71 use std::os::unix::thread::JoinHandleExt; 72 use std::sync::atomic::{AtomicBool, Ordering}; 73 use std::sync::{Arc, Barrier, Mutex}; 74 use std::{cmp, io, result, thread}; 75 use thiserror::Error; 76 use tracer::trace_scoped; 77 use vm_device::BusDevice; 78 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 79 use vm_memory::ByteValued; 80 #[cfg(feature = "guest_debug")] 81 use vm_memory::{Bytes, GuestAddressSpace}; 82 use vm_memory::{GuestAddress, 
GuestMemoryAtomic}; 83 use vm_migration::{ 84 snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, 85 Transportable, 86 }; 87 use vmm_sys_util::eventfd::EventFd; 88 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN}; 89 use zerocopy::AsBytes; 90 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 91 /// Extract the specified bits of a 64-bit integer. 92 /// For example, to extrace 2 bits from offset 1 (zero based) of `6u64`, 93 /// following expression should return 3 (`0b11`): 94 /// `extract_bits_64!(0b0000_0110u64, 1, 2)` 95 /// 96 macro_rules! extract_bits_64 { 97 ($value: tt, $offset: tt, $length: tt) => { 98 ($value >> $offset) & (!0u64 >> (64 - $length)) 99 }; 100 } 101 102 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 103 macro_rules! extract_bits_64_without_offset { 104 ($value: tt, $length: tt) => { 105 $value & (!0u64 >> (64 - $length)) 106 }; 107 } 108 109 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc; 110 111 #[derive(Debug, Error)] 112 pub enum Error { 113 #[error("Error creating vCPU: {0}")] 114 VcpuCreate(#[source] anyhow::Error), 115 116 #[error("Error running bCPU: {0}")] 117 VcpuRun(#[source] anyhow::Error), 118 119 #[error("Error spawning vCPU thread: {0}")] 120 VcpuSpawn(#[source] io::Error), 121 122 #[error("Error generating common CPUID: {0}")] 123 CommonCpuId(#[source] arch::Error), 124 125 #[error("Error configuring vCPU: {0}")] 126 VcpuConfiguration(#[source] arch::Error), 127 128 #[error("Still pending removed vcpu")] 129 VcpuPendingRemovedVcpu, 130 131 #[cfg(target_arch = "aarch64")] 132 #[error("Error fetching preferred target: {0}")] 133 VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError), 134 135 #[cfg(target_arch = "aarch64")] 136 #[error("Error initialising vCPU: {0}")] 137 VcpuArmInit(#[source] hypervisor::HypervisorCpuError), 138 139 #[error("Failed to join on vCPU threads: {0:?}")] 140 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 141 142 #[error("Error adding CpuManager to MMIO bus: {0}")] 143 BusError(#[source] vm_device::BusError), 144 145 #[error("Requested vCPUs exceed maximum")] 146 DesiredVCpuCountExceedsMax, 147 148 #[error("Cannot create seccomp filter: {0}")] 149 CreateSeccompFilter(#[source] seccompiler::Error), 150 151 #[error("Cannot apply seccomp filter: {0}")] 152 ApplySeccompFilter(#[source] seccompiler::Error), 153 154 #[error("Error starting vCPU after restore: {0}")] 155 StartRestoreVcpu(#[source] anyhow::Error), 156 157 #[error("Unexpected VmExit")] 158 UnexpectedVmExit, 159 160 #[error("Failed to allocate MMIO address for CpuManager")] 161 AllocateMmmioAddress, 162 163 #[cfg(feature = "tdx")] 164 #[error("Error initializing TDX: {0}")] 165 InitializeTdx(#[source] hypervisor::HypervisorCpuError), 166 167 #[cfg(target_arch = "aarch64")] 168 #[error("Error initializing PMU: {0}")] 169 InitPmu(#[source] hypervisor::HypervisorCpuError), 170 171 #[cfg(feature = "guest_debug")] 172 #[error("Error during CPU debug: {0}")] 173 CpuDebug(#[source] hypervisor::HypervisorCpuError), 174 175 #[cfg(feature = "guest_debug")] 176 #[error("Error translating virtual address: {0}")] 177 TranslateVirtualAddress(#[source] anyhow::Error), 178 179 #[cfg(target_arch = "x86_64")] 180 #[error("Error setting up AMX: {0}")] 181 AmxEnable(#[source] anyhow::Error), 182 183 #[error("Maximum number of vCPUs exceeds host limit")] 184 MaximumVcpusExceeded, 185 } 186 pub type Result<T> = result::Result<T, Error>; 187 188 #[cfg(target_arch = "x86_64")] 189 
#[allow(dead_code)] 190 #[repr(packed)] 191 #[derive(AsBytes)] 192 struct LocalX2Apic { 193 pub r#type: u8, 194 pub length: u8, 195 pub _reserved: u16, 196 pub apic_id: u32, 197 pub flags: u32, 198 pub processor_id: u32, 199 } 200 201 #[allow(dead_code)] 202 #[repr(packed)] 203 #[derive(Default, AsBytes)] 204 struct Ioapic { 205 pub r#type: u8, 206 pub length: u8, 207 pub ioapic_id: u8, 208 _reserved: u8, 209 pub apic_address: u32, 210 pub gsi_base: u32, 211 } 212 213 #[cfg(target_arch = "aarch64")] 214 #[allow(dead_code)] 215 #[repr(packed)] 216 #[derive(AsBytes)] 217 struct GicC { 218 pub r#type: u8, 219 pub length: u8, 220 pub reserved0: u16, 221 pub cpu_interface_number: u32, 222 pub uid: u32, 223 pub flags: u32, 224 pub parking_version: u32, 225 pub performance_interrupt: u32, 226 pub parked_address: u64, 227 pub base_address: u64, 228 pub gicv_base_address: u64, 229 pub gich_base_address: u64, 230 pub vgic_interrupt: u32, 231 pub gicr_base_address: u64, 232 pub mpidr: u64, 233 pub proc_power_effi_class: u8, 234 pub reserved1: u8, 235 pub spe_overflow_interrupt: u16, 236 } 237 238 #[cfg(target_arch = "aarch64")] 239 #[allow(dead_code)] 240 #[repr(packed)] 241 #[derive(AsBytes)] 242 struct GicD { 243 pub r#type: u8, 244 pub length: u8, 245 pub reserved0: u16, 246 pub gic_id: u32, 247 pub base_address: u64, 248 pub global_irq_base: u32, 249 pub version: u8, 250 pub reserved1: [u8; 3], 251 } 252 253 #[cfg(target_arch = "aarch64")] 254 #[allow(dead_code)] 255 #[repr(packed)] 256 #[derive(AsBytes)] 257 struct GicR { 258 pub r#type: u8, 259 pub length: u8, 260 pub reserved: u16, 261 pub base_address: u64, 262 pub range_length: u32, 263 } 264 265 #[cfg(target_arch = "aarch64")] 266 #[allow(dead_code)] 267 #[repr(packed)] 268 #[derive(AsBytes)] 269 struct GicIts { 270 pub r#type: u8, 271 pub length: u8, 272 pub reserved0: u16, 273 pub translation_id: u32, 274 pub base_address: u64, 275 pub reserved1: u32, 276 } 277 278 #[cfg(target_arch = "aarch64")] 279 #[allow(dead_code)] 280 #[repr(packed)] 281 #[derive(AsBytes)] 282 struct ProcessorHierarchyNode { 283 pub r#type: u8, 284 pub length: u8, 285 pub reserved: u16, 286 pub flags: u32, 287 pub parent: u32, 288 pub acpi_processor_id: u32, 289 pub num_private_resources: u32, 290 } 291 292 #[allow(dead_code)] 293 #[repr(packed)] 294 #[derive(Default, AsBytes)] 295 struct InterruptSourceOverride { 296 pub r#type: u8, 297 pub length: u8, 298 pub bus: u8, 299 pub source: u8, 300 pub gsi: u32, 301 pub flags: u16, 302 } 303 304 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 305 macro_rules! round_up { 306 ($n:expr,$d:expr) => { 307 (($n / ($d + 1)) + 1) * $d 308 }; 309 } 310 311 /// A wrapper around creating and using a kvm-based VCPU. 312 pub struct Vcpu { 313 // The hypervisor abstracted CPU. 314 vcpu: Arc<dyn hypervisor::Vcpu>, 315 id: u8, 316 #[cfg(target_arch = "aarch64")] 317 mpidr: u64, 318 saved_state: Option<CpuState>, 319 #[cfg(target_arch = "x86_64")] 320 vendor: CpuVendor, 321 } 322 323 impl Vcpu { 324 /// Constructs a new VCPU for `vm`. 325 /// 326 /// # Arguments 327 /// 328 /// * `id` - Represents the CPU number between [0, max vcpus). 329 /// * `vm` - The virtual machine this vcpu will get attached to. 330 /// * `vm_ops` - Optional object for exit handling. 
331 /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0) 332 pub fn new( 333 id: u8, 334 vm: &Arc<dyn hypervisor::Vm>, 335 vm_ops: Option<Arc<dyn VmOps>>, 336 #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor, 337 ) -> Result<Self> { 338 let vcpu = vm 339 .create_vcpu(id, vm_ops) 340 .map_err(|e| Error::VcpuCreate(e.into()))?; 341 // Initially the cpuid per vCPU is the one supported by this VM. 342 Ok(Vcpu { 343 vcpu, 344 id, 345 #[cfg(target_arch = "aarch64")] 346 mpidr: 0, 347 saved_state: None, 348 #[cfg(target_arch = "x86_64")] 349 vendor: cpu_vendor, 350 }) 351 } 352 353 /// Configures a vcpu and should be called once per vcpu when created. 354 /// 355 /// # Arguments 356 /// 357 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 358 /// * `guest_memory` - Guest memory. 359 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 360 pub fn configure( 361 &mut self, 362 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 363 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 364 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 365 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 366 #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>, 367 ) -> Result<()> { 368 #[cfg(target_arch = "aarch64")] 369 { 370 self.init(vm)?; 371 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup) 372 .map_err(Error::VcpuConfiguration)?; 373 } 374 info!("Configuring vCPU: cpu_id = {}", self.id); 375 #[cfg(target_arch = "x86_64")] 376 arch::configure_vcpu( 377 &self.vcpu, 378 self.id, 379 boot_setup, 380 cpuid, 381 kvm_hyperv, 382 self.vendor, 383 topology, 384 ) 385 .map_err(Error::VcpuConfiguration)?; 386 387 Ok(()) 388 } 389 390 /// Gets the MPIDR register value. 391 #[cfg(target_arch = "aarch64")] 392 pub fn get_mpidr(&self) -> u64 { 393 self.mpidr 394 } 395 396 /// Gets the saved vCPU state. 397 #[cfg(target_arch = "aarch64")] 398 pub fn get_saved_state(&self) -> Option<CpuState> { 399 self.saved_state.clone() 400 } 401 402 /// Initializes an aarch64 specific vcpu for booting Linux. 403 #[cfg(target_arch = "aarch64")] 404 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 405 let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); 406 407 // This reads back the kernel's preferred target type. 408 vm.get_preferred_target(&mut kvi) 409 .map_err(Error::VcpuArmPreferredTarget)?; 410 // We already checked that the capability is supported. 411 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; 412 if vm 413 .as_any() 414 .downcast_ref::<hypervisor::kvm::KvmVm>() 415 .unwrap() 416 .check_extension(Cap::ArmPmuV3) 417 { 418 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3; 419 } 420 // Non-boot cpus are powered off initially. 421 if self.id > 0 { 422 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; 423 } 424 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit) 425 } 426 427 /// Runs the VCPU until it exits, returning the reason. 428 /// 429 /// Note that the state of the VCPU and associated VM must be setup first for this to do 430 /// anything useful. 
431 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 432 self.vcpu.run() 433 } 434 } 435 436 impl Pausable for Vcpu {} 437 impl Snapshottable for Vcpu { 438 fn id(&self) -> String { 439 self.id.to_string() 440 } 441 442 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 443 let saved_state = self 444 .vcpu 445 .state() 446 .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?; 447 448 self.saved_state = Some(saved_state.clone()); 449 450 Ok(Snapshot::from_data(SnapshotData::new_from_state( 451 &saved_state, 452 )?)) 453 } 454 } 455 456 pub struct CpuManager { 457 config: CpusConfig, 458 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 459 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 460 #[cfg(target_arch = "x86_64")] 461 cpuid: Vec<CpuIdEntry>, 462 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 463 vm: Arc<dyn hypervisor::Vm>, 464 vcpus_kill_signalled: Arc<AtomicBool>, 465 vcpus_pause_signalled: Arc<AtomicBool>, 466 exit_evt: EventFd, 467 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 468 reset_evt: EventFd, 469 #[cfg(feature = "guest_debug")] 470 vm_debug_evt: EventFd, 471 vcpu_states: Vec<VcpuState>, 472 selected_cpu: u8, 473 vcpus: Vec<Arc<Mutex<Vcpu>>>, 474 seccomp_action: SeccompAction, 475 vm_ops: Arc<dyn VmOps>, 476 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 477 acpi_address: Option<GuestAddress>, 478 proximity_domain_per_cpu: BTreeMap<u8, u32>, 479 affinity: BTreeMap<u8, Vec<u8>>, 480 dynamic: bool, 481 hypervisor: Arc<dyn hypervisor::Hypervisor>, 482 } 483 484 const CPU_ENABLE_FLAG: usize = 0; 485 const CPU_INSERTING_FLAG: usize = 1; 486 const CPU_REMOVING_FLAG: usize = 2; 487 const CPU_EJECT_FLAG: usize = 3; 488 489 const CPU_STATUS_OFFSET: u64 = 4; 490 const CPU_SELECTION_OFFSET: u64 = 0; 491 492 impl BusDevice for CpuManager { 493 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 494 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
495 data.fill(0); 496 497 match offset { 498 CPU_SELECTION_OFFSET => { 499 data[0] = self.selected_cpu; 500 } 501 CPU_STATUS_OFFSET => { 502 if self.selected_cpu < self.max_vcpus() { 503 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 504 if state.active() { 505 data[0] |= 1 << CPU_ENABLE_FLAG; 506 } 507 if state.inserting { 508 data[0] |= 1 << CPU_INSERTING_FLAG; 509 } 510 if state.removing { 511 data[0] |= 1 << CPU_REMOVING_FLAG; 512 } 513 } else { 514 warn!("Out of range vCPU id: {}", self.selected_cpu); 515 } 516 } 517 _ => { 518 warn!( 519 "Unexpected offset for accessing CPU manager device: {:#}", 520 offset 521 ); 522 } 523 } 524 } 525 526 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 527 match offset { 528 CPU_SELECTION_OFFSET => { 529 self.selected_cpu = data[0]; 530 } 531 CPU_STATUS_OFFSET => { 532 if self.selected_cpu < self.max_vcpus() { 533 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 534 // The ACPI code writes back a 1 to acknowledge the insertion 535 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 536 && state.inserting 537 { 538 state.inserting = false; 539 } 540 // Ditto for removal 541 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 542 && state.removing 543 { 544 state.removing = false; 545 } 546 // Trigger removal of vCPU 547 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 548 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 549 error!("Error removing vCPU: {:?}", e); 550 } 551 } 552 } else { 553 warn!("Out of range vCPU id: {}", self.selected_cpu); 554 } 555 } 556 _ => { 557 warn!( 558 "Unexpected offset for accessing CPU manager device: {:#}", 559 offset 560 ); 561 } 562 } 563 None 564 } 565 } 566 567 #[derive(Default)] 568 struct VcpuState { 569 inserting: bool, 570 removing: bool, 571 pending_removal: Arc<AtomicBool>, 572 handle: Option<thread::JoinHandle<()>>, 573 kill: Arc<AtomicBool>, 574 vcpu_run_interrupted: Arc<AtomicBool>, 575 paused: Arc<AtomicBool>, 576 } 577 578 impl VcpuState { 579 fn active(&self) -> bool { 580 self.handle.is_some() 581 } 582 583 fn signal_thread(&self) { 584 if let Some(handle) = self.handle.as_ref() { 585 loop { 586 // SAFETY: FFI call with correct arguments 587 unsafe { 588 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 589 } 590 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 591 break; 592 } else { 593 // This is more effective than thread::yield_now() at 594 // avoiding a priority inversion with the vCPU thread 595 thread::sleep(std::time::Duration::from_millis(1)); 596 } 597 } 598 } 599 } 600 601 fn join_thread(&mut self) -> Result<()> { 602 if let Some(handle) = self.handle.take() { 603 handle.join().map_err(Error::ThreadCleanup)? 
604 } 605 606 Ok(()) 607 } 608 609 fn unpark_thread(&self) { 610 if let Some(handle) = self.handle.as_ref() { 611 handle.thread().unpark() 612 } 613 } 614 } 615 616 impl CpuManager { 617 #[allow(unused_variables)] 618 #[allow(clippy::too_many_arguments)] 619 pub fn new( 620 config: &CpusConfig, 621 vm: Arc<dyn hypervisor::Vm>, 622 exit_evt: EventFd, 623 reset_evt: EventFd, 624 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 625 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 626 seccomp_action: SeccompAction, 627 vm_ops: Arc<dyn VmOps>, 628 #[cfg(feature = "tdx")] tdx_enabled: bool, 629 numa_nodes: &NumaNodes, 630 ) -> Result<Arc<Mutex<CpuManager>>> { 631 if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() { 632 return Err(Error::MaximumVcpusExceeded); 633 } 634 635 let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); 636 vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); 637 let hypervisor_type = hypervisor.hypervisor_type(); 638 #[cfg(target_arch = "x86_64")] 639 let cpu_vendor = hypervisor.get_cpu_vendor(); 640 641 #[cfg(target_arch = "x86_64")] 642 if config.features.amx { 643 const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; 644 const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; 645 const XFEATURE_XTILEDATA: usize = 18; 646 const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; 647 648 // SAFETY: the syscall is only modifying kernel internal 649 // data structures that the kernel is itself expected to safeguard. 650 let amx_tile = unsafe { 651 libc::syscall( 652 libc::SYS_arch_prctl, 653 ARCH_REQ_XCOMP_GUEST_PERM, 654 XFEATURE_XTILEDATA, 655 ) 656 }; 657 658 if amx_tile != 0 { 659 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 660 } else { 661 let mask: usize = 0; 662 // SAFETY: the mask being modified (not marked mutable as it is 663 // modified in unsafe only which is permitted) isn't in use elsewhere. 
664 let result = unsafe { 665 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 666 }; 667 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 668 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 669 } 670 } 671 } 672 673 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 674 let mut cpu_list = Vec::new(); 675 for (proximity_domain, numa_node) in numa_nodes.iter() { 676 for cpu in numa_node.cpus.iter() { 677 cpu_list.push((*cpu, *proximity_domain)) 678 } 679 } 680 cpu_list 681 } 682 .into_iter() 683 .collect(); 684 685 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 686 cpu_affinity 687 .iter() 688 .map(|a| (a.vcpu, a.host_cpus.clone())) 689 .collect() 690 } else { 691 BTreeMap::new() 692 }; 693 694 #[cfg(feature = "tdx")] 695 let dynamic = !tdx_enabled; 696 #[cfg(not(feature = "tdx"))] 697 let dynamic = true; 698 699 Ok(Arc::new(Mutex::new(CpuManager { 700 config: config.clone(), 701 interrupt_controller: None, 702 #[cfg(target_arch = "x86_64")] 703 cpuid: Vec::new(), 704 vm, 705 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 706 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 707 vcpu_states, 708 exit_evt, 709 reset_evt, 710 #[cfg(feature = "guest_debug")] 711 vm_debug_evt, 712 selected_cpu: 0, 713 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 714 seccomp_action, 715 vm_ops, 716 acpi_address: None, 717 proximity_domain_per_cpu, 718 affinity, 719 dynamic, 720 hypervisor: hypervisor.clone(), 721 }))) 722 } 723 724 #[cfg(target_arch = "x86_64")] 725 pub fn populate_cpuid( 726 &mut self, 727 memory_manager: &Arc<Mutex<MemoryManager>>, 728 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 729 #[cfg(feature = "tdx")] tdx: bool, 730 ) -> Result<()> { 731 let sgx_epc_sections = memory_manager 732 .lock() 733 .unwrap() 734 .sgx_epc_region() 735 .as_ref() 736 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 737 738 self.cpuid = { 739 let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits); 740 arch::generate_common_cpuid( 741 hypervisor, 742 &arch::CpuidConfig { 743 sgx_epc_sections, 744 phys_bits, 745 kvm_hyperv: self.config.kvm_hyperv, 746 #[cfg(feature = "tdx")] 747 tdx, 748 amx: self.config.features.amx, 749 }, 750 ) 751 .map_err(Error::CommonCpuId)? 752 }; 753 754 Ok(()) 755 } 756 757 fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> { 758 info!("Creating vCPU: cpu_id = {}", cpu_id); 759 760 let mut vcpu = Vcpu::new( 761 cpu_id, 762 &self.vm, 763 Some(self.vm_ops.clone()), 764 #[cfg(target_arch = "x86_64")] 765 self.hypervisor.get_cpu_vendor(), 766 )?; 767 768 if let Some(snapshot) = snapshot { 769 // AArch64 vCPUs should be initialized after created. 770 #[cfg(target_arch = "aarch64")] 771 vcpu.init(&self.vm)?; 772 773 let state: CpuState = snapshot.to_state().map_err(|e| { 774 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e)) 775 })?; 776 vcpu.vcpu 777 .set_state(&state) 778 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?; 779 780 vcpu.saved_state = Some(state); 781 } 782 783 let vcpu = Arc::new(Mutex::new(vcpu)); 784 785 // Adding vCPU to the CpuManager's vCPU list. 
786 self.vcpus.push(vcpu.clone()); 787 788 Ok(vcpu) 789 } 790 791 pub fn configure_vcpu( 792 &self, 793 vcpu: Arc<Mutex<Vcpu>>, 794 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 795 ) -> Result<()> { 796 let mut vcpu = vcpu.lock().unwrap(); 797 798 #[cfg(target_arch = "x86_64")] 799 assert!(!self.cpuid.is_empty()); 800 801 #[cfg(target_arch = "x86_64")] 802 let topology = self.config.topology.clone().map_or_else( 803 || { 804 #[cfg(feature = "mshv")] 805 if matches!(self.hypervisor.hypervisor_type(), HypervisorType::Mshv) { 806 return Some((1, self.boot_vcpus(), 1)); 807 } 808 None 809 }, 810 |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)), 811 ); 812 #[cfg(target_arch = "x86_64")] 813 vcpu.configure( 814 boot_setup, 815 self.cpuid.clone(), 816 self.config.kvm_hyperv, 817 topology, 818 )?; 819 820 #[cfg(target_arch = "aarch64")] 821 vcpu.configure(&self.vm, boot_setup)?; 822 823 Ok(()) 824 } 825 826 /// Only create new vCPUs if there aren't any inactive ones to reuse 827 fn create_vcpus( 828 &mut self, 829 desired_vcpus: u8, 830 snapshot: Option<Snapshot>, 831 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 832 let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![]; 833 info!( 834 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 835 desired_vcpus, 836 self.config.max_vcpus, 837 self.vcpus.len(), 838 self.present_vcpus() 839 ); 840 841 if desired_vcpus > self.config.max_vcpus { 842 return Err(Error::DesiredVCpuCountExceedsMax); 843 } 844 845 // Only create vCPUs in excess of all the allocated vCPUs. 846 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 847 vcpus.push(self.create_vcpu( 848 cpu_id, 849 // TODO: The special format of the CPU id can be removed once 850 // ready to break live upgrade. 851 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()), 852 )?); 853 } 854 855 Ok(vcpus) 856 } 857 858 #[cfg(target_arch = "aarch64")] 859 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 860 for cpu in self.vcpus.iter() { 861 let cpu = cpu.lock().unwrap(); 862 // Check if PMU attr is available, if not, log the information. 863 if cpu.vcpu.has_pmu_support() { 864 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 865 } else { 866 debug!( 867 "PMU attribute is not supported in vCPU{}, skip PMU init!", 868 cpu.id 869 ); 870 return Ok(false); 871 } 872 } 873 874 Ok(true) 875 } 876 877 pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> { 878 self.vcpus.clone() 879 } 880 881 fn start_vcpu( 882 &mut self, 883 vcpu: Arc<Mutex<Vcpu>>, 884 vcpu_id: u8, 885 vcpu_thread_barrier: Arc<Barrier>, 886 inserting: bool, 887 ) -> Result<()> { 888 let reset_evt = self.reset_evt.try_clone().unwrap(); 889 let exit_evt = self.exit_evt.try_clone().unwrap(); 890 #[cfg(feature = "kvm")] 891 let hypervisor_type = self.hypervisor.hypervisor_type(); 892 #[cfg(feature = "guest_debug")] 893 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 894 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 895 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 896 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 897 898 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 899 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 900 .vcpu_run_interrupted 901 .clone(); 902 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 903 let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone(); 904 905 // Prepare the CPU set the current vCPU is expected to run onto. 
906 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 907 // SAFETY: all zeros is a valid pattern 908 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 909 // SAFETY: FFI call, trivially safe 910 unsafe { libc::CPU_ZERO(&mut cpuset) }; 911 for host_cpu in host_cpus { 912 // SAFETY: FFI call, trivially safe 913 unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) }; 914 } 915 cpuset 916 }); 917 918 // Retrieve seccomp filter for vcpu thread 919 let vcpu_seccomp_filter = get_seccomp_filter( 920 &self.seccomp_action, 921 Thread::Vcpu, 922 self.hypervisor.hypervisor_type(), 923 ) 924 .map_err(Error::CreateSeccompFilter)?; 925 926 #[cfg(target_arch = "x86_64")] 927 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 928 929 info!("Starting vCPU: cpu_id = {}", vcpu_id); 930 931 let handle = Some( 932 thread::Builder::new() 933 .name(format!("vcpu{vcpu_id}")) 934 .spawn(move || { 935 // Schedule the thread to run on the expected CPU set 936 if let Some(cpuset) = cpuset.as_ref() { 937 // SAFETY: FFI call with correct arguments 938 let ret = unsafe { 939 libc::sched_setaffinity( 940 0, 941 std::mem::size_of::<libc::cpu_set_t>(), 942 cpuset as *const libc::cpu_set_t, 943 ) 944 }; 945 946 if ret != 0 { 947 error!( 948 "Failed scheduling the vCPU {} on the expected CPU set: {}", 949 vcpu_id, 950 io::Error::last_os_error() 951 ); 952 return; 953 } 954 } 955 956 // Apply seccomp filter for vcpu thread. 957 if !vcpu_seccomp_filter.is_empty() { 958 if let Err(e) = 959 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 960 { 961 error!("Error applying seccomp filter: {:?}", e); 962 return; 963 } 964 } 965 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 966 // This uses an async signal safe handler to kill the vcpu handles. 967 register_signal_handler(SIGRTMIN(), handle_signal) 968 .expect("Failed to register vcpu signal handler"); 969 // Block until all CPUs are ready. 970 vcpu_thread_barrier.wait(); 971 972 std::panic::catch_unwind(move || { 973 loop { 974 // If we are being told to pause, we park the thread 975 // until the pause boolean is toggled. 976 // The resume operation is responsible for toggling 977 // the boolean and unpark the thread. 978 // We enter a loop because park() could spuriously 979 // return. We will then park() again unless the 980 // pause boolean has been toggled. 981 982 // Need to use Ordering::SeqCst as we have multiple 983 // loads and stores to different atomics and we need 984 // to see them in a consistent order in all threads 985 986 if vcpu_pause_signalled.load(Ordering::SeqCst) { 987 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are 988 // completed by returning to KVM_RUN. From the kernel docs: 989 // 990 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, 991 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding 992 // operations are complete (and guest state is consistent) only after userspace 993 // has re-entered the kernel with KVM_RUN. The kernel side will first finish 994 // incomplete operations and then check for pending signals. 995 // The pending state of the operation is not preserved in state which is 996 // visible to userspace, thus userspace should ensure that the operation is 997 // completed before performing a live migration. 
Userspace can re-enter the 998 // guest with an unmasked signal pending or with the immediate_exit field set 999 // to complete pending operations without allowing any further instructions 1000 // to be executed. 1001 1002 #[cfg(feature = "kvm")] 1003 if matches!(hypervisor_type, HypervisorType::Kvm) { 1004 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); 1005 if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { 1006 error!("Unexpected VM exit on \"immediate_exit\" run"); 1007 break; 1008 } 1009 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); 1010 } 1011 1012 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1013 1014 vcpu_paused.store(true, Ordering::SeqCst); 1015 while vcpu_pause_signalled.load(Ordering::SeqCst) { 1016 thread::park(); 1017 } 1018 vcpu_run_interrupted.store(false, Ordering::SeqCst); 1019 } 1020 1021 // We've been told to terminate 1022 if vcpu_kill_signalled.load(Ordering::SeqCst) 1023 || vcpu_kill.load(Ordering::SeqCst) 1024 { 1025 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1026 break; 1027 } 1028 1029 #[cfg(feature = "tdx")] 1030 let mut vcpu = vcpu.lock().unwrap(); 1031 #[cfg(not(feature = "tdx"))] 1032 let vcpu = vcpu.lock().unwrap(); 1033 // vcpu.run() returns false on a triple-fault so trigger a reset 1034 match vcpu.run() { 1035 Ok(run) => match run { 1036 #[cfg(feature = "kvm")] 1037 VmExit::Debug => { 1038 info!("VmExit::Debug"); 1039 #[cfg(feature = "guest_debug")] 1040 { 1041 vcpu_pause_signalled.store(true, Ordering::SeqCst); 1042 let raw_tid = get_raw_tid(vcpu_id as usize); 1043 vm_debug_evt.write(raw_tid as u64).unwrap(); 1044 } 1045 } 1046 #[cfg(target_arch = "x86_64")] 1047 VmExit::IoapicEoi(vector) => { 1048 if let Some(interrupt_controller) = 1049 &interrupt_controller_clone 1050 { 1051 interrupt_controller 1052 .lock() 1053 .unwrap() 1054 .end_of_interrupt(vector); 1055 } 1056 } 1057 VmExit::Ignore => {} 1058 VmExit::Hyperv => {} 1059 VmExit::Reset => { 1060 info!("VmExit::Reset"); 1061 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1062 reset_evt.write(1).unwrap(); 1063 break; 1064 } 1065 VmExit::Shutdown => { 1066 info!("VmExit::Shutdown"); 1067 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1068 exit_evt.write(1).unwrap(); 1069 break; 1070 } 1071 #[cfg(feature = "tdx")] 1072 VmExit::Tdx => { 1073 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1074 match vcpu.get_tdx_exit_details() { 1075 Ok(details) => match details { 1076 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1077 TdxExitDetails::SetupEventNotifyInterrupt => { 1078 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1079 } 1080 }, 1081 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1082 } 1083 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1084 } else { 1085 // We should never reach this code as 1086 // this means the design from the code 1087 // is wrong. 
1088 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1089 } 1090 } 1091 _ => { 1092 error!( 1093 "VCPU generated error: {:?}", 1094 Error::UnexpectedVmExit 1095 ); 1096 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1097 exit_evt.write(1).unwrap(); 1098 break; 1099 } 1100 }, 1101 1102 Err(e) => { 1103 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1104 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1105 exit_evt.write(1).unwrap(); 1106 break; 1107 } 1108 } 1109 1110 // We've been told to terminate 1111 if vcpu_kill_signalled.load(Ordering::SeqCst) 1112 || vcpu_kill.load(Ordering::SeqCst) 1113 { 1114 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1115 break; 1116 } 1117 } 1118 }) 1119 .or_else(|_| { 1120 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1121 error!("vCPU thread panicked"); 1122 panic_exit_evt.write(1) 1123 }) 1124 .ok(); 1125 }) 1126 .map_err(Error::VcpuSpawn)?, 1127 ); 1128 1129 // On hot plug calls into this function entry_point is None. It is for 1130 // those hotplug CPU additions that we need to set the inserting flag. 1131 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1132 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1133 1134 Ok(()) 1135 } 1136 1137 /// Start up as many vCPUs threads as needed to reach `desired_vcpus` 1138 fn activate_vcpus( 1139 &mut self, 1140 desired_vcpus: u8, 1141 inserting: bool, 1142 paused: Option<bool>, 1143 ) -> Result<()> { 1144 if desired_vcpus > self.config.max_vcpus { 1145 return Err(Error::DesiredVCpuCountExceedsMax); 1146 } 1147 1148 let vcpu_thread_barrier = Arc::new(Barrier::new( 1149 (desired_vcpus - self.present_vcpus() + 1) as usize, 1150 )); 1151 1152 if let Some(paused) = paused { 1153 self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); 1154 } 1155 1156 info!( 1157 "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", 1158 desired_vcpus, 1159 self.vcpus.len(), 1160 self.present_vcpus(), 1161 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1162 ); 1163 1164 // This reuses any inactive vCPUs as well as any that were newly created 1165 for vcpu_id in self.present_vcpus()..desired_vcpus { 1166 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1167 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1168 } 1169 1170 // Unblock all CPU threads. 
1171 vcpu_thread_barrier.wait(); 1172 Ok(()) 1173 } 1174 1175 fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) { 1176 // Mark vCPUs for removal, actual removal happens on ejection 1177 for cpu_id in desired_vcpus..self.present_vcpus() { 1178 self.vcpu_states[usize::from(cpu_id)].removing = true; 1179 self.vcpu_states[usize::from(cpu_id)] 1180 .pending_removal 1181 .store(true, Ordering::SeqCst); 1182 } 1183 } 1184 1185 pub fn check_pending_removed_vcpu(&mut self) -> bool { 1186 for state in self.vcpu_states.iter() { 1187 if state.active() && state.pending_removal.load(Ordering::SeqCst) { 1188 return true; 1189 } 1190 } 1191 false 1192 } 1193 1194 fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> { 1195 info!("Removing vCPU: cpu_id = {}", cpu_id); 1196 let state = &mut self.vcpu_states[usize::from(cpu_id)]; 1197 state.kill.store(true, Ordering::SeqCst); 1198 state.signal_thread(); 1199 state.join_thread()?; 1200 state.handle = None; 1201 1202 // Once the thread has exited, clear the "kill" so that it can reused 1203 state.kill.store(false, Ordering::SeqCst); 1204 state.pending_removal.store(false, Ordering::SeqCst); 1205 1206 Ok(()) 1207 } 1208 1209 pub fn create_boot_vcpus( 1210 &mut self, 1211 snapshot: Option<Snapshot>, 1212 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 1213 trace_scoped!("create_boot_vcpus"); 1214 1215 self.create_vcpus(self.boot_vcpus(), snapshot) 1216 } 1217 1218 // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running. 1219 pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> { 1220 self.activate_vcpus(self.boot_vcpus(), false, Some(paused)) 1221 } 1222 1223 pub fn start_restored_vcpus(&mut self) -> Result<()> { 1224 self.activate_vcpus(self.vcpus.len() as u8, false, Some(true)) 1225 .map_err(|e| { 1226 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) 1227 })?; 1228 1229 Ok(()) 1230 } 1231 1232 pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> { 1233 if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal { 1234 return Ok(false); 1235 } 1236 1237 if !self.dynamic { 1238 return Ok(false); 1239 } 1240 1241 if self.check_pending_removed_vcpu() { 1242 return Err(Error::VcpuPendingRemovedVcpu); 1243 } 1244 1245 match desired_vcpus.cmp(&self.present_vcpus()) { 1246 cmp::Ordering::Greater => { 1247 let vcpus = self.create_vcpus(desired_vcpus, None)?; 1248 for vcpu in vcpus { 1249 self.configure_vcpu(vcpu, None)? 1250 } 1251 self.activate_vcpus(desired_vcpus, true, None)?; 1252 Ok(true) 1253 } 1254 cmp::Ordering::Less => { 1255 self.mark_vcpus_for_removal(desired_vcpus); 1256 Ok(true) 1257 } 1258 _ => Ok(false), 1259 } 1260 } 1261 1262 pub fn shutdown(&mut self) -> Result<()> { 1263 // Tell the vCPUs to stop themselves next time they go through the loop 1264 self.vcpus_kill_signalled.store(true, Ordering::SeqCst); 1265 1266 // Toggle the vCPUs pause boolean 1267 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 1268 1269 // Unpark all the VCPU threads. 1270 for state in self.vcpu_states.iter() { 1271 state.unpark_thread(); 1272 } 1273 1274 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 1275 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 1276 // above. 1277 for state in self.vcpu_states.iter() { 1278 state.signal_thread(); 1279 } 1280 1281 // Wait for all the threads to finish. This removes the state from the vector. 1282 for mut state in self.vcpu_states.drain(..) 
{ 1283 state.join_thread()?; 1284 } 1285 1286 Ok(()) 1287 } 1288 1289 #[cfg(feature = "tdx")] 1290 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1291 for vcpu in &self.vcpus { 1292 vcpu.lock() 1293 .unwrap() 1294 .vcpu 1295 .tdx_init(hob_address) 1296 .map_err(Error::InitializeTdx)?; 1297 } 1298 Ok(()) 1299 } 1300 1301 pub fn boot_vcpus(&self) -> u8 { 1302 self.config.boot_vcpus 1303 } 1304 1305 pub fn max_vcpus(&self) -> u8 { 1306 self.config.max_vcpus 1307 } 1308 1309 #[cfg(target_arch = "x86_64")] 1310 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1311 assert!(!self.cpuid.is_empty()); 1312 self.cpuid.clone() 1313 } 1314 1315 fn present_vcpus(&self) -> u8 { 1316 self.vcpu_states 1317 .iter() 1318 .fold(0, |acc, state| acc + state.active() as u8) 1319 } 1320 1321 #[cfg(target_arch = "aarch64")] 1322 pub fn get_mpidrs(&self) -> Vec<u64> { 1323 self.vcpus 1324 .iter() 1325 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1326 .collect() 1327 } 1328 1329 #[cfg(target_arch = "aarch64")] 1330 pub fn get_saved_states(&self) -> Vec<CpuState> { 1331 self.vcpus 1332 .iter() 1333 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1334 .collect() 1335 } 1336 1337 #[cfg(target_arch = "aarch64")] 1338 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1339 self.config 1340 .topology 1341 .clone() 1342 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1343 } 1344 1345 pub fn create_madt(&self) -> Sdt { 1346 use crate::acpi; 1347 // This is also checked in the commandline parsing. 1348 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1349 1350 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1351 #[cfg(target_arch = "x86_64")] 1352 { 1353 madt.write(36, arch::layout::APIC_START.0); 1354 1355 for cpu in 0..self.config.max_vcpus { 1356 let lapic = LocalX2Apic { 1357 r#type: acpi::ACPI_X2APIC_PROCESSOR, 1358 length: 16, 1359 processor_id: cpu.into(), 1360 apic_id: cpu.into(), 1361 flags: if cpu < self.config.boot_vcpus { 1362 1 << MADT_CPU_ENABLE_FLAG 1363 } else { 1364 0 1365 } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG, 1366 _reserved: 0, 1367 }; 1368 madt.append(lapic); 1369 } 1370 1371 madt.append(Ioapic { 1372 r#type: acpi::ACPI_APIC_IO, 1373 length: 12, 1374 ioapic_id: 0, 1375 apic_address: arch::layout::IOAPIC_START.0 as u32, 1376 gsi_base: 0, 1377 ..Default::default() 1378 }); 1379 1380 madt.append(InterruptSourceOverride { 1381 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1382 length: 10, 1383 bus: 0, 1384 source: 4, 1385 gsi: 4, 1386 flags: 0, 1387 }); 1388 } 1389 1390 #[cfg(target_arch = "aarch64")] 1391 { 1392 /* Notes: 1393 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1394 */ 1395 1396 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 
1397 for cpu in 0..self.config.boot_vcpus { 1398 let vcpu = &self.vcpus[cpu as usize]; 1399 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1400 /* ARMv8 MPIDR format: 1401 Bits [63:40] Must be zero 1402 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1403 Bits [31:24] Must be zero 1404 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1405 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1406 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1407 */ 1408 let mpidr_mask = 0xff_00ff_ffff; 1409 let gicc = GicC { 1410 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1411 length: 80, 1412 reserved0: 0, 1413 cpu_interface_number: cpu as u32, 1414 uid: cpu as u32, 1415 flags: 1, 1416 parking_version: 0, 1417 performance_interrupt: 0, 1418 parked_address: 0, 1419 base_address: 0, 1420 gicv_base_address: 0, 1421 gich_base_address: 0, 1422 vgic_interrupt: 0, 1423 gicr_base_address: 0, 1424 mpidr: mpidr & mpidr_mask, 1425 proc_power_effi_class: 0, 1426 reserved1: 0, 1427 spe_overflow_interrupt: 0, 1428 }; 1429 1430 madt.append(gicc); 1431 } 1432 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1433 1434 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1435 let gicd = GicD { 1436 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1437 length: 24, 1438 reserved0: 0, 1439 gic_id: 0, 1440 base_address: vgic_config.dist_addr, 1441 global_irq_base: 0, 1442 version: 3, 1443 reserved1: [0; 3], 1444 }; 1445 madt.append(gicd); 1446 1447 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1448 let gicr = GicR { 1449 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1450 length: 16, 1451 reserved: 0, 1452 base_address: vgic_config.redists_addr, 1453 range_length: vgic_config.redists_size as u32, 1454 }; 1455 madt.append(gicr); 1456 1457 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 1458 let gicits = GicIts { 1459 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1460 length: 20, 1461 reserved0: 0, 1462 translation_id: 0, 1463 base_address: vgic_config.msi_addr, 1464 reserved1: 0, 1465 }; 1466 madt.append(gicits); 1467 1468 madt.update_checksum(); 1469 } 1470 1471 madt 1472 } 1473 1474 #[cfg(target_arch = "aarch64")] 1475 pub fn create_pptt(&self) -> Sdt { 1476 let pptt_start = 0; 1477 let mut cpus = 0; 1478 let mut uid = 0; 1479 // If topology is not specified, the default setting is: 1480 // 1 package, multiple cores, 1 thread per core 1481 // This is also the behavior when PPTT is missing. 
1482 let (threads_per_core, cores_per_package, packages) = 1483 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1484 1485 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1486 1487 for cluster_idx in 0..packages { 1488 if cpus < self.config.boot_vcpus as usize { 1489 let cluster_offset = pptt.len() - pptt_start; 1490 let cluster_hierarchy_node = ProcessorHierarchyNode { 1491 r#type: 0, 1492 length: 20, 1493 reserved: 0, 1494 flags: 0x2, 1495 parent: 0, 1496 acpi_processor_id: cluster_idx as u32, 1497 num_private_resources: 0, 1498 }; 1499 pptt.append(cluster_hierarchy_node); 1500 1501 for core_idx in 0..cores_per_package { 1502 let core_offset = pptt.len() - pptt_start; 1503 1504 if threads_per_core > 1 { 1505 let core_hierarchy_node = ProcessorHierarchyNode { 1506 r#type: 0, 1507 length: 20, 1508 reserved: 0, 1509 flags: 0x2, 1510 parent: cluster_offset as u32, 1511 acpi_processor_id: core_idx as u32, 1512 num_private_resources: 0, 1513 }; 1514 pptt.append(core_hierarchy_node); 1515 1516 for _thread_idx in 0..threads_per_core { 1517 let thread_hierarchy_node = ProcessorHierarchyNode { 1518 r#type: 0, 1519 length: 20, 1520 reserved: 0, 1521 flags: 0xE, 1522 parent: core_offset as u32, 1523 acpi_processor_id: uid as u32, 1524 num_private_resources: 0, 1525 }; 1526 pptt.append(thread_hierarchy_node); 1527 uid += 1; 1528 } 1529 } else { 1530 let thread_hierarchy_node = ProcessorHierarchyNode { 1531 r#type: 0, 1532 length: 20, 1533 reserved: 0, 1534 flags: 0xA, 1535 parent: cluster_offset as u32, 1536 acpi_processor_id: uid as u32, 1537 num_private_resources: 0, 1538 }; 1539 pptt.append(thread_hierarchy_node); 1540 uid += 1; 1541 } 1542 } 1543 cpus += (cores_per_package * threads_per_core) as usize; 1544 } 1545 } 1546 1547 pptt.update_checksum(); 1548 pptt 1549 } 1550 1551 #[cfg(feature = "guest_debug")] 1552 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1553 self.vcpus[usize::from(cpu_id)] 1554 .lock() 1555 .unwrap() 1556 .vcpu 1557 .get_regs() 1558 .map_err(Error::CpuDebug) 1559 } 1560 1561 #[cfg(feature = "guest_debug")] 1562 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1563 self.vcpus[usize::from(cpu_id)] 1564 .lock() 1565 .unwrap() 1566 .vcpu 1567 .set_regs(regs) 1568 .map_err(Error::CpuDebug) 1569 } 1570 1571 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1572 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1573 self.vcpus[usize::from(cpu_id)] 1574 .lock() 1575 .unwrap() 1576 .vcpu 1577 .get_sregs() 1578 .map_err(Error::CpuDebug) 1579 } 1580 1581 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1582 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1583 self.vcpus[usize::from(cpu_id)] 1584 .lock() 1585 .unwrap() 1586 .vcpu 1587 .set_sregs(sregs) 1588 .map_err(Error::CpuDebug) 1589 } 1590 1591 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1592 fn translate_gva( 1593 &self, 1594 _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1595 cpu_id: u8, 1596 gva: u64, 1597 ) -> Result<u64> { 1598 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1599 .lock() 1600 .unwrap() 1601 .vcpu 1602 .translate_gva(gva, /* flags: unused */ 0) 1603 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1604 Ok(gpa) 1605 } 1606 1607 /// 1608 /// On AArch64, `translate_gva` API is not provided by KVM. We implemented 1609 /// it in VMM by walking through translation tables. 
1610 /// 1611 /// Address translation is big topic, here we only focus the scenario that 1612 /// happens in VMM while debugging kernel. This `translate_gva` 1613 /// implementation is restricted to: 1614 /// - Exception Level 1 1615 /// - Translate high address range only (kernel space) 1616 /// 1617 /// This implementation supports following Arm-v8a features related to 1618 /// address translation: 1619 /// - FEAT_LPA 1620 /// - FEAT_LVA 1621 /// - FEAT_LPA2 1622 /// 1623 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 1624 fn translate_gva( 1625 &self, 1626 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1627 cpu_id: u8, 1628 gva: u64, 1629 ) -> Result<u64> { 1630 let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)] 1631 .lock() 1632 .unwrap() 1633 .vcpu 1634 .get_sys_reg(regs::TCR_EL1) 1635 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1636 let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)] 1637 .lock() 1638 .unwrap() 1639 .vcpu 1640 .get_sys_reg(regs::TTBR1_EL1) 1641 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1642 let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)] 1643 .lock() 1644 .unwrap() 1645 .vcpu 1646 .get_sys_reg(regs::ID_AA64MMFR0_EL1) 1647 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1648 1649 // Bit 55 of the VA determines the range, high (0xFFFxxx...) 1650 // or low (0x000xxx...). 1651 let high_range = extract_bits_64!(gva, 55, 1); 1652 if high_range == 0 { 1653 info!("VA (0x{:x}) range is not supported!", gva); 1654 return Ok(gva); 1655 } 1656 1657 // High range size offset 1658 let tsz = extract_bits_64!(tcr_el1, 16, 6); 1659 // Granule size 1660 let tg = extract_bits_64!(tcr_el1, 30, 2); 1661 // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2 1662 let ds = extract_bits_64!(tcr_el1, 59, 1); 1663 1664 if tsz == 0 { 1665 info!("VA translation is not ready!"); 1666 return Ok(gva); 1667 } 1668 1669 // VA size is determined by TCR_BL1.T1SZ 1670 let va_size = 64 - tsz; 1671 // Number of bits in VA consumed in each level of translation 1672 let stride = match tg { 1673 3 => 13, // 64KB granule size 1674 1 => 11, // 16KB granule size 1675 _ => 9, // 4KB, default 1676 }; 1677 // Starting level of walking 1678 let mut level = 4 - (va_size - 4) / stride; 1679 1680 // PA or IPA size is determined 1681 let tcr_ips = extract_bits_64!(tcr_el1, 32, 3); 1682 let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4); 1683 // The IPA size in TCR_BL1 and PA Range in ID_AA64MMFR0_EL1 should match. 1684 // To be safe, we use the minimum value if they are different. 1685 let pa_range = std::cmp::min(tcr_ips, pa_range); 1686 // PA size in bits 1687 let pa_size = match pa_range { 1688 0 => 32, 1689 1 => 36, 1690 2 => 40, 1691 3 => 42, 1692 4 => 44, 1693 5 => 48, 1694 6 => 52, 1695 _ => { 1696 return Err(Error::TranslateVirtualAddress(anyhow!(format!( 1697 "PA range not supported {pa_range}" 1698 )))) 1699 } 1700 }; 1701 1702 let indexmask_grainsize = (!0u64) >> (64 - (stride + 3)); 1703 let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level)))); 1704 // If FEAT_LPA2 is present, the translation table descriptor holds 1705 // 50 bits of the table address of next level. 1706 // Otherwise, it is 48 bits. 
1707 let descaddrmask = if ds == 1 { 1708 !0u64 >> (64 - 50) // mask with 50 least significant bits 1709 } else { 1710 !0u64 >> (64 - 48) // mask with 48 least significant bits 1711 }; 1712 let descaddrmask = descaddrmask & !indexmask_grainsize; 1713 1714 // Translation table base address 1715 let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48); 1716 // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table 1717 // address bits [48:51] comes from TTBR1_EL1 bits [2:5]. 1718 if pa_size == 52 { 1719 descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48; 1720 } 1721 1722 // Loop through tables of each level 1723 loop { 1724 // Table offset for current level 1725 let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask; 1726 descaddr |= table_offset; 1727 descaddr &= !7u64; 1728 1729 let mut buf = [0; 8]; 1730 guest_memory 1731 .memory() 1732 .read(&mut buf, GuestAddress(descaddr)) 1733 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1734 let descriptor = u64::from_le_bytes(buf); 1735 1736 descaddr = descriptor & descaddrmask; 1737 // In the case of FEAT_LPA, the next-level translation table address 1738 // bits [48:51] comes from bits [12:15] of the current descriptor. 1739 // For FEAT_LPA2, the next-level translation table address 1740 // bits [50:51] comes from bits [8:9] of the current descriptor, 1741 // bits [48:49] comes from bits [48:49] of the descriptor which was 1742 // handled previously. 1743 if pa_size == 52 { 1744 if ds == 1 { 1745 // FEAT_LPA2 1746 descaddr |= extract_bits_64!(descriptor, 8, 2) << 50; 1747 } else { 1748 // FEAT_LPA 1749 descaddr |= extract_bits_64!(descriptor, 12, 4) << 48; 1750 } 1751 } 1752 1753 if (descriptor & 2) != 0 && (level < 3) { 1754 // This is a table entry. Go down to next level. 
1755 level += 1; 1756 indexmask = indexmask_grainsize; 1757 continue; 1758 } 1759 1760 break; 1761 } 1762 1763 // We have reached either: 1764 // - a page entry at level 3 or 1765 // - a block entry at level 1 or 2 1766 let page_size = 1u64 << ((stride * (4 - level)) + 3); 1767 descaddr &= !(page_size - 1); 1768 descaddr |= gva & (page_size - 1); 1769 1770 Ok(descaddr) 1771 } 1772 1773 pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) { 1774 self.acpi_address = Some(acpi_address); 1775 } 1776 1777 pub(crate) fn set_interrupt_controller( 1778 &mut self, 1779 interrupt_controller: Arc<Mutex<dyn InterruptController>>, 1780 ) { 1781 self.interrupt_controller = Some(interrupt_controller); 1782 } 1783 1784 pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> { 1785 &self.vcpus_kill_signalled 1786 } 1787 } 1788 1789 struct Cpu { 1790 cpu_id: u8, 1791 proximity_domain: u32, 1792 dynamic: bool, 1793 } 1794 1795 #[cfg(target_arch = "x86_64")] 1796 const MADT_CPU_ENABLE_FLAG: usize = 0; 1797 1798 #[cfg(target_arch = "x86_64")] 1799 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; 1800 1801 impl Cpu { 1802 #[cfg(target_arch = "x86_64")] 1803 fn generate_mat(&self) -> Vec<u8> { 1804 let lapic = LocalX2Apic { 1805 r#type: crate::acpi::ACPI_X2APIC_PROCESSOR, 1806 length: 16, 1807 processor_id: self.cpu_id.into(), 1808 apic_id: self.cpu_id.into(), 1809 flags: 1 << MADT_CPU_ENABLE_FLAG, 1810 _reserved: 0, 1811 }; 1812 1813 let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)]; 1814 // SAFETY: mat_data is large enough to hold lapic 1815 unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic }; 1816 1817 mat_data 1818 } 1819 } 1820 1821 impl Aml for Cpu { 1822 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1823 #[cfg(target_arch = "x86_64")] 1824 let mat_data: Vec<u8> = self.generate_mat(); 1825 #[allow(clippy::if_same_then_else)] 1826 if self.dynamic { 1827 aml::Device::new( 1828 format!("C{:03X}", self.cpu_id).as_str().into(), 1829 vec![ 1830 &aml::Name::new("_HID".into(), &"ACPI0007"), 1831 &aml::Name::new("_UID".into(), &self.cpu_id), 1832 // Currently, AArch64 cannot support following fields. 1833 /* 1834 _STA return value: 1835 Bit [0] – Set if the device is present. 1836 Bit [1] – Set if the device is enabled and decoding its resources. 1837 Bit [2] – Set if the device should be shown in the UI. 1838 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1839 Bit [4] – Set if the battery is present. 1840 Bits [31:5] – Reserved (must be cleared). 
                    */
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_STA".into(),
                        0,
                        false,
                        // Call into CSTA method which will interrogate device
                        vec![&aml::Return::new(&aml::MethodCall::new(
                            "CSTA".into(),
                            vec![&self.cpu_id],
                        ))],
                    ),
                    &aml::Method::new(
                        "_PXM".into(),
                        0,
                        false,
                        vec![&aml::Return::new(&self.proximity_domain)],
                    ),
                    // The Linux kernel expects every CPU device to have a _MAT entry
                    // containing the LAPIC for this processor with the enabled bit set
                    // even if it is disabled in the MADT (non-boot CPU)
                    #[cfg(target_arch = "x86_64")]
                    &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
                    // Trigger CPU ejection
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_EJ0".into(),
                        1,
                        false,
                        // Call into CEJ0 method which will actually eject device
                        vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
                    ),
                ],
            )
            .to_aml_bytes(sink);
        } else {
            aml::Device::new(
                format!("C{:03X}", self.cpu_id).as_str().into(),
                vec![
                    &aml::Name::new("_HID".into(), &"ACPI0007"),
                    &aml::Name::new("_UID".into(), &self.cpu_id),
                    #[cfg(target_arch = "x86_64")]
                    &aml::Method::new(
                        "_STA".into(),
                        0,
                        false,
                        // Mark CPU present, see the CSTA implementation
                        vec![&aml::Return::new(&0xfu8)],
                    ),
                    &aml::Method::new(
                        "_PXM".into(),
                        0,
                        false,
                        vec![&aml::Return::new(&self.proximity_domain)],
                    ),
                    // The Linux kernel expects every CPU device to have a _MAT entry
                    // containing the LAPIC for this processor with the enabled bit set
                    // even if it is disabled in the MADT (non-boot CPU)
                    #[cfg(target_arch = "x86_64")]
                    &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
                ],
            )
            .to_aml_bytes(sink);
        }
    }
}

struct CpuNotify {
    cpu_id: u8,
}

impl Aml for CpuNotify {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .to_aml_bytes(sink)
    }
}

struct CpuMethods {
    max_vcpus: u8,
    dynamic: bool,
}

impl Aml for CpuMethods {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        if self.dynamic {
            // CPU status method
            aml::Method::new(
                "CSTA".into(),
                1,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    // Check if the CPEN bit is set; if so, make the local variable 0xf
                    // (see _STA for details of the meaning)
                    &aml::If::new(
                        &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
                        vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                    // Return 0 or 0xf
                    &aml::Return::new(&aml::Local(0)),
                ],
            )
            .to_aml_bytes(sink);

            let mut cpu_notifies = Vec::new();
            for cpu_id in 0..self.max_vcpus {
                cpu_notifies.push(CpuNotify { cpu_id });
            }

            let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
            for cpu_id in 0..self.max_vcpus {
                cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
            }

            aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);

            aml::Method::new(
                "CEJ0".into(),
                1,
                true,
                vec![
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    // Set CEJ0 bit
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(sink);

            aml::Method::new(
                "CSCN".into(),
                0,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    &aml::While::new(
                        &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
                        vec![
                            // Write CPU number (in first argument) to I/O port via field
                            &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
                            // Check if CINS bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
                                // Notify device if it is
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &aml::ONE],
                                    ),
                                    // Reset CINS bit
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CINS"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            // Check if CRMV bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
                                // Notify device if it is (with the eject constant 0x3)
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &3u8],
                                    ),
                                    // Reset CRMV bit
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CRMV"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                        ],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(sink)
        } else {
            aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
        }
    }
}

impl Aml for CpuManager {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        #[cfg(target_arch = "x86_64")]
        if let Some(acpi_address) = self.acpi_address {
            // CPU hotplug controller
            aml::Device::new(
                "_SB_.PRES".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose CPU and then read back status
                    &aml::Mutex::new("CPLK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCacheable::NotCacheable,
                            true,
                            acpi_address.0,
                            acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
                            None,
                        )]),
                    ),
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "PRST".into(),
                        aml::OpRegionSpace::SystemMemory,
                        &(acpi_address.0 as usize),
                        &CPU_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CPEN", 1),
                            aml::FieldEntry::Named(*b"CINS", 1),
                            aml::FieldEntry::Named(*b"CRMV", 1),
                            aml::FieldEntry::Named(*b"CEJ0", 1),
                            aml::FieldEntry::Reserved(4),
                            aml::FieldEntry::Named(*b"CCMD", 8),
                        ],
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"CSEL", 32),
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CDAT", 32),
                        ],
                    ),
                ],
            )
            .to_aml_bytes(sink);
        }

        // CPU devices
        let hid = aml::Name::new("_HID".into(), &"ACPI0010");
        let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
        // Bundle methods together under a common object
        let methods = CpuMethods {
            max_vcpus: self.config.max_vcpus,
            dynamic: self.dynamic,
        };
        let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];

        let mut cpu_devices = Vec::new();
        for cpu_id in 0..self.config.max_vcpus {
            let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
            let cpu_device = Cpu {
                cpu_id,
                proximity_domain,
                dynamic: self.dynamic,
            };

            cpu_devices.push(cpu_device);
        }

        for cpu_device in cpu_devices.iter() {
            cpu_data_inner.push(cpu_device);
        }

        aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
    }
}

impl Pausable for CpuManager {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        // Tell the vCPUs to pause themselves next time they exit
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        for vcpu in self.vcpus.iter() {
            let mut vcpu = vcpu.lock().unwrap();
            vcpu.pause()?;
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            if !self.config.kvm_hyperv {
                vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
                    MigratableError::Pause(anyhow!(
                        "Could not notify guest it has been paused {:?}",
                        e
                    ))
                })?;
            }
        }

        // The vCPU thread will change its paused state before parking; wait here for each
        // activated vCPU to change its state, ensuring it has parked.
        for state in self.vcpu_states.iter() {
            if state.active() {
                while !state.paused.load(Ordering::SeqCst) {
                    // To avoid a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }

        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        for vcpu in self.vcpus.iter() {
            vcpu.lock().unwrap().resume()?;
        }

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the vCPU threads.
        // Once unparked, the next thing they will do is check the pause boolean.
        // Since it is now false, they will exit their pause loop and go back to
        // running guest code.
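        // Note: std::thread::Thread::unpark() stores a wake-up token even if the target
        // thread has not parked yet, so there is no lost wakeup here; a vCPU thread that
        // checks the boolean before parking simply consumes the token on its next park().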
        for state in self.vcpu_states.iter() {
            state.paused.store(false, Ordering::SeqCst);
            state.unpark_thread();
        }
        Ok(())
    }
}

impl Snapshottable for CpuManager {
    fn id(&self) -> String {
        CPU_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut cpu_manager_snapshot = Snapshot::default();

        // The CpuManager snapshot is a collection of all vCPU snapshots.
        for vcpu in &self.vcpus {
            let mut vcpu = vcpu.lock().unwrap();
            cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
        }

        Ok(cpu_manager_snapshot)
    }
}

impl Transportable for CpuManager {}
impl Migratable for CpuManager {}

#[cfg(feature = "guest_debug")]
impl Debuggable for CpuManager {
    #[cfg(feature = "kvm")]
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.vcpus[cpu_id]
            .lock()
            .unwrap()
            .vcpu
            .set_guest_debug(addrs, singlestep)
            .map_err(DebuggableError::SetDebug)
    }

    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        Ok(())
    }

    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, R8-R15
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let regs = [
            gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
            gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
        ];

        // GDB exposes 32-bit eflags instead of 64-bit rflags.
        // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
        let eflags = gregs.rflags as u32;
        let rip = gregs.rip;

        // Segment registers: CS, SS, DS, ES, FS, GS
        let sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let segments = X86SegmentRegs {
            cs: sregs.cs.selector as u32,
            ss: sregs.ss.selector as u32,
            ds: sregs.ds.selector as u32,
            es: sregs.es.selector as u32,
            fs: sregs.fs.selector as u32,
            gs: sregs.gs.selector as u32,
        };

        // TODO: Add other registers

        Ok(CoreRegs {
            regs,
            eflags,
            rip,
            segments,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "aarch64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        Ok(CoreRegs {
            x: gregs.regs.regs,
            sp: gregs.regs.sp,
            pc: gregs.regs.pc,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let orig_gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let gregs = StandardRegisters {
            rax: regs.regs[0],
            rbx: regs.regs[1],
            rcx: regs.regs[2],
            rdx: regs.regs[3],
            rsi: regs.regs[4],
            rdi: regs.regs[5],
            rbp: regs.regs[6],
            rsp: regs.regs[7],
            r8: regs.regs[8],
            r9: regs.regs[9],
            r10: regs.regs[10],
            r11: regs.regs[11],
            r12: regs.regs[12],
            r13: regs.regs[13],
            r14: regs.regs[14],
            r15: regs.regs[15],
            rip: regs.rip,
            // Update only the lower 32 bits of rflags.
            rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
        };

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // Since GDB only cares about the selectors, we call get_sregs() first.
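        // Only the 16-bit selector of each segment register is taken from GDB below; the
        // remaining descriptor state (base, limit, access rights) returned by get_sregs()
        // is written back unchanged.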
        let mut sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        sregs.cs.selector = regs.segments.cs as u16;
        sregs.ss.selector = regs.segments.ss as u16;
        sregs.ds.selector = regs.segments.ds as u16;
        sregs.es.selector = regs.segments.es as u16;
        sregs.fs.selector = regs.segments.fs as u16;
        sregs.gs.selector = regs.segments.gs as u16;

        self.set_sregs(cpu_id as u8, &sregs)
            .map_err(DebuggableError::WriteRegs)?;

        // TODO: Add other registers

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let mut gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;

        gregs.regs.regs = regs.x;
        gregs.regs.sp = regs.sp;
        gregs.regs.pc = regs.pc;

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        Ok(())
    }

    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        let mut buf = vec![0; len];
        let mut total_read = 0_u64;

        while total_read < len as u64 {
            let gaddr = vaddr.0 + total_read;
            let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let write_len = std::cmp::min(
                data.len() as u64 - total_written,
                psize - (paddr & (psize - 1)),
            );
            guest_memory
                .memory()
                .write(
                    &data[total_written as usize..total_written as usize + write_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::WriteMem)?;
            total_written += write_len;
        }
        Ok(())
    }

    fn active_vcpus(&self) -> usize {
        self.present_vcpus() as usize
    }
}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for CpuManager {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl CpuElf64Writable for CpuManager {
    fn cpu_write_elf64_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Elf, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<X86_64ElfPrStatus>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: NT_PRSTATUS,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
            buf.resize(pos + 4, 0);
            buf.splice(pos.., "CORE".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
            buf.resize(pos + 32 + 4, 0);
            let pid = vcpu_id as u64;
            let bytes: &[u8] = pid.as_slice();
            buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */

            pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();

            let orig_rax: u64 = 0;
            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
                gregs.r10,
            ];
            let regs2 = [
                gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi,
                orig_rax,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            debug!(
                "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
                gregs.rip,
                gregs.rsp,
                sregs.gs.base,
                sregs.cs.selector,
                sregs.ss.selector,
                sregs.ds.selector,
            );

            let regs = X86_64UserRegs {
                regs1,
                regs2,
                rip: gregs.rip,
                cs: sregs.cs.selector as u64,
                eflags: gregs.rflags,
                rsp: gregs.rsp,
                ss: sregs.ss.selector as u64,
                fs_base: sregs.fs.base,
                gs_base: sregs.gs.base,
                ds: sregs.ds.selector as u64,
                es: sregs.es.selector as u64,
                fs: sregs.fs.selector as u64,
                gs: sregs.gs.selector as u64,
            };

            // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) };
            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }

    fn cpu_write_vmm_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Vmm, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<DumpCpusState>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: 0,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);

            buf.resize(pos + 4, 0);
            buf.splice(pos.., "QEMU".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);

            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
                gregs.rbp,
            ];

            let regs2 = [
                gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
                gregs.r15,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            let mut msrs = vec![MsrEntry {
                index: msr_index::MSR_KERNEL_GS_BASE,
                ..Default::default()
            }];

            self.vcpus[vcpu_id as usize]
                .lock()
                .unwrap()
                .vcpu
                .get_msrs(&mut msrs)
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
            let kernel_gs_base = msrs[0].data;

            let cs = CpuSegment::new(sregs.cs);
            let ds = CpuSegment::new(sregs.ds);
            let es = CpuSegment::new(sregs.es);
            let fs = CpuSegment::new(sregs.fs);
            let gs = CpuSegment::new(sregs.gs);
            let ss = CpuSegment::new(sregs.ss);
            let ldt = CpuSegment::new(sregs.ldt);
            let tr = CpuSegment::new(sregs.tr);
            let gdt = CpuSegment::new_from_table(sregs.gdt);
            let idt = CpuSegment::new_from_table(sregs.idt);
            let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
            let regs = DumpCpusState {
                version: 1,
                size: size_of::<DumpCpusState>() as u32,
                regs1,
                regs2,
                rip: gregs.rip,
                rflags: gregs.rflags,
                cs,
                ds,
                es,
                fs,
                gs,
                ss,
                ldt,
                tr,
                gdt,
                idt,
                cr,
                kernel_gs_base,
            };

            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use arch::x86_64::interrupts::*;
    use arch::x86_64::regs::*;
    use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};

    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail if there is no irqchip beforehand.
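        // (KVM only exposes the local APIC state via KVM_GET_LAPIC once the in-kernel
        // irqchip has been created, which is what create_irq_chip() requests here.)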
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
        let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
        let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below fails. Decide whether we should
        // remove it entirely.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }];

        // get_msrs returns the number of MSRs that it succeeded in reading. We only want to
        // read one in this test case.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we called setup_msrs. We need to assert that
        // the tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data
        // we expect.
        let entry_vec = vcpu.boot_msr_entries();
        assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
    }

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let expected_regs: StandardRegisters = StandardRegisters {
            rflags: 0x0000000000000002u64,
            rbx: arch::layout::PVH_INFO_START.0,
            rip: 1,
            ..Default::default()
        };

        setup_regs(&vcpu, expected_regs.rip).unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use arch::{aarch64::regs, layout};
    use hypervisor::kvm::aarch64::is_system_register;
    use hypervisor::kvm::kvm_bindings::{
        kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
        KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
    };
    use hypervisor::{arm64_core_reg_id, offset_of};
    use std::mem;

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
        // Must fail when the vCPU is not initialized yet.
        assert!(res.is_err());

        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();
        vcpu.vcpu_init(&kvi).unwrap();

        assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
    }

    #[test]
    fn test_read_mpidr() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when the vCPU is not initialized yet.
        assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());

        vcpu.vcpu_init(&kvi).unwrap();
        assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
    }

    #[test]
    fn test_is_system_register() {
        let offset = offset_of!(user_pt_regs, pc);
        let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
        assert!(!is_system_register(regid));
        let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
        assert!(is_system_register(regid));
    }

    #[test]
    fn test_save_restore_core_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when the vCPU is not initialized yet.
        let res = vcpu.get_regs();
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to get core register: Exec format error (os error 8)"
        );

        let mut state = kvm_regs::default();
        let res = vcpu.set_regs(&state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to set core register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        let res = vcpu.get_regs();
        assert!(res.is_ok());
        state = res.unwrap();
        assert_eq!(state.regs.pstate, 0x3C5);

        assert!(vcpu.set_regs(&state).is_ok());
    }

    #[test]
    fn test_get_set_mpstate() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        let res = vcpu.get_mp_state();
        assert!(res.is_ok());
        assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
    }
}