// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use std::collections::BTreeMap;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::io::Write;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::mem::size_of;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};

#[cfg(not(target_arch = "riscv64"))]
use acpi_tables::sdt::Sdt;
use acpi_tables::{aml, Aml};
use anyhow::anyhow;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::get_x2apic_id;
use arch::{EntryPoint, NumaNodes};
#[cfg(target_arch = "aarch64")]
use devices::gic::Gic;
use devices::interrupt_controller::InterruptController;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use hypervisor::arch::aarch64::regs::{ID_AA64MMFR0_EL1, TCR_EL1, TTBR1_EL1};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::msr_index;
#[cfg(target_arch = "x86_64")]
use hypervisor::arch::x86::CpuIdEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::MsrEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::SpecialRegisters;
#[cfg(feature = "tdx")]
use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuVendor;
#[cfg(feature = "kvm")]
use hypervisor::HypervisorType;
#[cfg(feature = "guest_debug")]
use hypervisor::StandardRegisters;
use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps};
use libc::{c_void, siginfo_t};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use linux_loader::elf::Elf64_Nhdr;
use seccompiler::{apply_filter, SeccompAction};
use thiserror::Error;
use tracer::trace_scoped;
use vm_device::BusDevice;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use vm_memory::ByteValued;
#[cfg(feature = "guest_debug")]
use vm_memory::{Bytes, GuestAddressSpace};
use vm_memory::{GuestAddress, GuestMemoryAtomic};
use vm_migration::{
    snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
use zerocopy::{FromBytes, Immutable, IntoBytes};

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
    GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
    NT_PRSTATUS,
};
#[cfg(feature = "guest_debug")]
use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
#[cfg(target_arch = "x86_64")]
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::vm_config::CpusConfig;
use crate::{GuestMemoryMmap, CPU_MANAGER_SNAPSHOT_ID};

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
macro_rules! extract_bits_64_without_offset {
    ($value: tt, $length: tt) => {
        $value & (!0u64 >> (64 - $length))
    };
}

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[error("Removed vCPU is still pending removal")]
    VcpuPendingRemovedVcpu,

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error setting vCPU processor features: {0}")]
    VcpuSetProcessorFeatures(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error finalising vCPU: {0}")]
    VcpuArmFinalize(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising GICR base address: {0}")]
    VcpuSetGicrBaseAddr(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),

    #[error("Maximum number of vCPUs exceeds host limit")]
    MaximumVcpusExceeded,

    #[cfg(feature = "sev_snp")]
    #[error("Failed to set sev control register: {0}")]
    SetSevControlRegister(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "x86_64")]
    #[error("Failed to inject NMI")]
    NmiError(hypervisor::HypervisorCpuError),
}
pub type Result<T> = result::Result<T, Error>;

#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
#[repr(C, packed)]
#[derive(IntoBytes, Immutable, FromBytes)]
struct LocalX2Apic {
    pub r#type: u8,
    pub length: u8,
    pub _reserved: u16,
    pub apic_id: u32,
    pub flags: u32,
    pub processor_id: u32,
}

#[allow(dead_code)]
#[repr(C, packed)]
#[derive(Default, IntoBytes, Immutable, FromBytes)]
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
    pub ioapic_id: u8,
    _reserved: u8,
    pub apic_address: u32,
    pub gsi_base: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(C, packed)]
#[derive(IntoBytes, Immutable, FromBytes)]
struct GicC {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub cpu_interface_number: u32,
    pub uid: u32,
    pub flags: u32,
    pub parking_version: u32,
    pub performance_interrupt: u32,
    pub parked_address: u64,
    pub base_address: u64,
    pub gicv_base_address: u64,
    pub gich_base_address: u64,
    pub vgic_interrupt: u32,
    pub gicr_base_address: u64,
    pub mpidr: u64,
    pub proc_power_effi_class: u8,
    pub reserved1: u8,
    pub spe_overflow_interrupt: u16,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(C, packed)]
#[derive(IntoBytes, Immutable, FromBytes)]
struct GicD {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub gic_id: u32,
    pub base_address: u64,
    pub global_irq_base: u32,
    pub version: u8,
    pub reserved1: [u8; 3],
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(C, packed)]
#[derive(IntoBytes, Immutable, FromBytes)]
struct GicR {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub base_address: u64,
    pub range_length: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(C, packed)]
#[derive(IntoBytes, Immutable, FromBytes)]
struct GicIts {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub translation_id: u32,
    pub base_address: u64,
    pub reserved1: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(C, packed)]
#[derive(IntoBytes, Immutable, FromBytes)]
struct ProcessorHierarchyNode {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub flags: u32,
    pub parent: u32,
    pub acpi_processor_id: u32,
    pub num_private_resources: u32,
}

#[allow(dead_code)]
#[repr(C, packed)]
#[derive(Default, IntoBytes, Immutable, FromBytes)]
struct InterruptSourceOverride {
    pub r#type: u8,
    pub length: u8,
    pub bus: u8,
    pub source: u8,
    pub gsi: u32,
    pub flags: u16,
}
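
// Editor's illustrative note (not part of the original source): these
// `#[repr(C, packed)]` structures are emitted byte-for-byte into the ACPI
// tables, so their in-memory size must equal the `length` value the code
// later writes into each entry. A minimal sketch of that invariant, assuming
// the zerocopy `IntoBytes` derive above:
//
//     use zerocopy::IntoBytes;
//     let ioapic = Ioapic { r#type: 1, length: 12, ..Default::default() };
//     assert_eq!(ioapic.as_bytes().len(), usize::from(ioapic.length));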
"guest_debug"))] 328 macro_rules! round_up { 329 ($n:expr,$d:expr) => { 330 (($n / ($d + 1)) + 1) * $d 331 }; 332 } 333 334 /// A wrapper around creating and using a kvm-based VCPU. 335 pub struct Vcpu { 336 // The hypervisor abstracted CPU. 337 vcpu: Arc<dyn hypervisor::Vcpu>, 338 id: u8, 339 #[cfg(target_arch = "aarch64")] 340 mpidr: u64, 341 saved_state: Option<CpuState>, 342 #[cfg(target_arch = "x86_64")] 343 vendor: CpuVendor, 344 } 345 346 impl Vcpu { 347 /// Constructs a new VCPU for `vm`. 348 /// 349 /// # Arguments 350 /// 351 /// * `id` - Represents the CPU number between [0, max vcpus). 352 /// * `vm` - The virtual machine this vcpu will get attached to. 353 /// * `vm_ops` - Optional object for exit handling. 354 /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0) 355 pub fn new( 356 id: u8, 357 apic_id: u8, 358 vm: &Arc<dyn hypervisor::Vm>, 359 vm_ops: Option<Arc<dyn VmOps>>, 360 #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor, 361 ) -> Result<Self> { 362 let vcpu = vm 363 .create_vcpu(apic_id, vm_ops) 364 .map_err(|e| Error::VcpuCreate(e.into()))?; 365 // Initially the cpuid per vCPU is the one supported by this VM. 366 Ok(Vcpu { 367 vcpu, 368 id, 369 #[cfg(target_arch = "aarch64")] 370 mpidr: 0, 371 saved_state: None, 372 #[cfg(target_arch = "x86_64")] 373 vendor: cpu_vendor, 374 }) 375 } 376 377 /// Configures a vcpu and should be called once per vcpu when created. 378 /// 379 /// # Arguments 380 /// 381 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 382 /// * `guest_memory` - Guest memory. 383 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 384 pub fn configure( 385 &mut self, 386 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 387 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 388 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 389 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 390 #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>, 391 ) -> Result<()> { 392 #[cfg(target_arch = "aarch64")] 393 { 394 self.init(vm)?; 395 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup) 396 .map_err(Error::VcpuConfiguration)?; 397 } 398 #[cfg(target_arch = "riscv64")] 399 arch::configure_vcpu(&self.vcpu, self.id, boot_setup).map_err(Error::VcpuConfiguration)?; 400 info!("Configuring vCPU: cpu_id = {}", self.id); 401 #[cfg(target_arch = "x86_64")] 402 arch::configure_vcpu( 403 &self.vcpu, 404 self.id, 405 boot_setup, 406 cpuid, 407 kvm_hyperv, 408 self.vendor, 409 topology, 410 ) 411 .map_err(Error::VcpuConfiguration)?; 412 413 Ok(()) 414 } 415 416 /// Gets the MPIDR register value. 417 #[cfg(target_arch = "aarch64")] 418 pub fn get_mpidr(&self) -> u64 { 419 self.mpidr 420 } 421 422 /// Gets the saved vCPU state. 423 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] 424 pub fn get_saved_state(&self) -> Option<CpuState> { 425 self.saved_state.clone() 426 } 427 428 /// Initializes an aarch64 specific vcpu for booting Linux. 429 #[cfg(target_arch = "aarch64")] 430 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 431 use std::arch::is_aarch64_feature_detected; 432 #[allow(clippy::nonminimal_bool)] 433 let sve_supported = 434 is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2"); 435 let mut kvi = self.vcpu.create_vcpu_init(); 436 437 // This reads back the kernel's preferred target type. 
        vm.get_preferred_target(&mut kvi)
            .map_err(Error::VcpuArmPreferredTarget)?;

        self.vcpu
            .vcpu_set_processor_features(vm, &mut kvi, self.id)
            .map_err(Error::VcpuSetProcessorFeatures)?;

        self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)?;

        if sve_supported {
            let finalized_features = self.vcpu.vcpu_get_finalized_features();
            self.vcpu
                .vcpu_finalize(finalized_features)
                .map_err(Error::VcpuArmFinalize)?;
        }
        Ok(())
    }

    /// Runs the VCPU until it exits, returning the reason.
    ///
    /// Note that the state of the VCPU and associated VM must be set up first for this to do
    /// anything useful.
    pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
        self.vcpu.run()
    }

    #[cfg(feature = "sev_snp")]
    pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> {
        self.vcpu
            .set_sev_control_register(vmsa_pfn)
            .map_err(Error::SetSevControlRegister)
    }

    ///
    /// Sets the vCPU's GIC redistributor base address.
    ///
    #[cfg(target_arch = "aarch64")]
    pub fn set_gic_redistributor_addr(
        &self,
        base_redist_addr: u64,
        redist_size: u64,
    ) -> Result<()> {
        let gicr_base = base_redist_addr + (arch::layout::GIC_V3_REDIST_SIZE * self.id as u64);
        assert!(gicr_base + arch::layout::GIC_V3_REDIST_SIZE <= base_redist_addr + redist_size);
        self.vcpu
            .set_gic_redistributor_addr(gicr_base)
            .map_err(Error::VcpuSetGicrBaseAddr)?;
        Ok(())
    }
}

impl Pausable for Vcpu {}
impl Snapshottable for Vcpu {
    fn id(&self) -> String {
        self.id.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let saved_state = self
            .vcpu
            .state()
            .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?;

        self.saved_state = Some(saved_state.clone());

        Ok(Snapshot::from_data(SnapshotData::new_from_state(
            &saved_state,
        )?))
    }
}

pub struct CpuManager {
    config: CpusConfig,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
    #[cfg(target_arch = "x86_64")]
    cpuid: Vec<CpuIdEntry>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm: Arc<dyn hypervisor::Vm>,
    vcpus_kill_signalled: Arc<AtomicBool>,
    vcpus_pause_signalled: Arc<AtomicBool>,
    vcpus_kick_signalled: Arc<AtomicBool>,
    exit_evt: EventFd,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    reset_evt: EventFd,
    #[cfg(feature = "guest_debug")]
    vm_debug_evt: EventFd,
    vcpu_states: Vec<VcpuState>,
    selected_cpu: u8,
    vcpus: Vec<Arc<Mutex<Vcpu>>>,
    seccomp_action: SeccompAction,
    vm_ops: Arc<dyn VmOps>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    acpi_address: Option<GuestAddress>,
    proximity_domain_per_cpu: BTreeMap<u8, u32>,
    affinity: BTreeMap<u8, Vec<usize>>,
    dynamic: bool,
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    #[cfg(feature = "sev_snp")]
    sev_snp_enabled: bool,
}

const CPU_ENABLE_FLAG: usize = 0;
const CPU_INSERTING_FLAG: usize = 1;
const CPU_REMOVING_FLAG: usize = 2;
const CPU_EJECT_FLAG: usize = 3;

const CPU_STATUS_OFFSET: u64 = 4;
const CPU_SELECTION_OFFSET: u64 = 0;
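
// Editor's illustrative note (not part of the original source): the constants
// above define a tiny MMIO register protocol used by the guest's ACPI methods.
// Offset 0 selects a vCPU and offset 4 holds its status/control bits. A sketch
// of how a status byte reads for a vCPU that is running and being hot-added:
//
//     let status = (1 << CPU_ENABLE_FLAG) | (1 << CPU_INSERTING_FLAG);
//     assert_eq!(status, 0b0000_0011);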

impl BusDevice for CpuManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
        data.fill(0);

        match offset {
            CPU_SELECTION_OFFSET => {
                data[0] = self.selected_cpu;
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &self.vcpu_states[usize::from(self.selected_cpu)];
                    if state.active() {
                        data[0] |= 1 << CPU_ENABLE_FLAG;
                    }
                    if state.inserting {
                        data[0] |= 1 << CPU_INSERTING_FLAG;
                    }
                    if state.removing {
                        data[0] |= 1 << CPU_REMOVING_FLAG;
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            CPU_SELECTION_OFFSET => {
                self.selected_cpu = data[0];
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
                    // The ACPI code writes back a 1 to acknowledge the insertion
                    if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
                        && state.inserting
                    {
                        state.inserting = false;
                    }
                    // Ditto for removal
                    if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
                        && state.removing
                    {
                        state.removing = false;
                    }
                    // Trigger removal of vCPU
                    if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
                        if let Err(e) = self.remove_vcpu(self.selected_cpu) {
                            error!("Error removing vCPU: {:?}", e);
                        }
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
        None
    }
}

#[derive(Default)]
struct VcpuState {
    inserting: bool,
    removing: bool,
    pending_removal: Arc<AtomicBool>,
    handle: Option<thread::JoinHandle<()>>,
    kill: Arc<AtomicBool>,
    vcpu_run_interrupted: Arc<AtomicBool>,
    paused: Arc<AtomicBool>,
}

impl VcpuState {
    fn active(&self) -> bool {
        self.handle.is_some()
    }

    fn signal_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            loop {
                // SAFETY: FFI call with correct arguments
                unsafe {
                    libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
                }
                if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
                    break;
                } else {
                    // This is more effective than thread::yield_now() at
                    // avoiding a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }
    }

    fn join_thread(&mut self) -> Result<()> {
        if let Some(handle) = self.handle.take() {
            handle.join().map_err(Error::ThreadCleanup)?
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}

impl CpuManager {
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vm_ops: Arc<dyn VmOps>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        numa_nodes: &NumaNodes,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
            return Err(Error::MaximumVcpusExceeded);
        }

        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
        let hypervisor_type = hypervisor.hypervisor_type();
        #[cfg(target_arch = "x86_64")]
        let cpu_vendor = hypervisor.get_cpu_vendor();

        #[cfg(target_arch = "x86_64")]
        if config.features.amx {
            const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
            const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
            const XFEATURE_XTILEDATA: usize = 18;
            const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;

            // SAFETY: the syscall is only modifying kernel internal
            // data structures that the kernel is itself expected to safeguard.
            let amx_tile = unsafe {
                libc::syscall(
                    libc::SYS_arch_prctl,
                    ARCH_REQ_XCOMP_GUEST_PERM,
                    XFEATURE_XTILEDATA,
                )
            };

            if amx_tile != 0 {
                return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
            } else {
                let mask: usize = 0;
                // SAFETY: the mask being modified (not marked mutable, as it is only
                // modified inside the unsafe block, which is permitted) isn't in use elsewhere.
                let result = unsafe {
                    libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
                };
                if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
                    return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
                }
            }
        }

        let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
            let mut cpu_list = Vec::new();
            for (proximity_domain, numa_node) in numa_nodes.iter() {
                for cpu in numa_node.cpus.iter() {
                    cpu_list.push((*cpu, *proximity_domain))
                }
            }
            cpu_list
        }
        .into_iter()
        .collect();

        let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
            cpu_affinity
                .iter()
                .map(|a| (a.vcpu, a.host_cpus.clone()))
                .collect()
        } else {
            BTreeMap::new()
        };

        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        Ok(Arc::new(Mutex::new(CpuManager {
            config: config.clone(),
            interrupt_controller: None,
            #[cfg(target_arch = "x86_64")]
            cpuid: Vec::new(),
            vm,
            vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_kick_signalled: Arc::new(AtomicBool::new(false)),
            vcpu_states,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            selected_cpu: 0,
            vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
            seccomp_action,
            vm_ops,
            acpi_address: None,
            proximity_domain_per_cpu,
            affinity,
            dynamic,
            hypervisor: hypervisor.clone(),
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
        })))
    }

    #[cfg(target_arch = "x86_64")]
    pub fn populate_cpuid(
        &mut self,
        memory_manager: &Arc<Mutex<MemoryManager>>,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx: bool,
    ) -> Result<()> {
        let sgx_epc_sections = memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());

        self.cpuid = {
            let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
            arch::generate_common_cpuid(
                hypervisor,
                &arch::CpuidConfig {
                    sgx_epc_sections,
                    phys_bits,
                    kvm_hyperv: self.config.kvm_hyperv,
                    #[cfg(feature = "tdx")]
                    tdx,
                    amx: self.config.features.amx,
                },
            )
            .map_err(Error::CommonCpuId)?
        };

        Ok(())
    }

    fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
        info!("Creating vCPU: cpu_id = {}", cpu_id);

        #[cfg(target_arch = "x86_64")]
        let topology = self.get_vcpu_topology();
        #[cfg(target_arch = "x86_64")]
        let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology);
        #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
        let x2apic_id = cpu_id as u32;

        let mut vcpu = Vcpu::new(
            cpu_id,
            x2apic_id as u8,
            &self.vm,
            Some(self.vm_ops.clone()),
            #[cfg(target_arch = "x86_64")]
            self.hypervisor.get_cpu_vendor(),
        )?;

        if let Some(snapshot) = snapshot {
            // AArch64 vCPUs should be initialized after creation.
            #[cfg(target_arch = "aarch64")]
            vcpu.init(&self.vm)?;

            let state: CpuState = snapshot.to_state().map_err(|e| {
                Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
            })?;
            vcpu.vcpu
                .set_state(&state)
                .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;

            vcpu.saved_state = Some(state);
        }

        let vcpu = Arc::new(Mutex::new(vcpu));

        // Adding vCPU to the CpuManager's vCPU list.
        self.vcpus.push(vcpu.clone());

        Ok(vcpu)
    }

    pub fn configure_vcpu(
        &self,
        vcpu: Arc<Mutex<Vcpu>>,
        boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
    ) -> Result<()> {
        let mut vcpu = vcpu.lock().unwrap();

        #[cfg(feature = "sev_snp")]
        if self.sev_snp_enabled {
            if let Some((kernel_entry_point, _)) = boot_setup {
                vcpu.set_sev_control_register(
                    kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE,
                )?;
            }

            // The traditional way of configuring a vCPU doesn't work for SEV-SNP guests.
            // All the vCPU configuration for a SEV-SNP guest is provided via the VMSA.
            return Ok(());
        }

        #[cfg(target_arch = "x86_64")]
        assert!(!self.cpuid.is_empty());

        #[cfg(target_arch = "x86_64")]
        let topology = self.config.topology.clone().map_or_else(
            || Some((1, self.boot_vcpus(), 1)),
            |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
        );
        #[cfg(target_arch = "x86_64")]
        vcpu.configure(
            boot_setup,
            self.cpuid.clone(),
            self.config.kvm_hyperv,
            topology,
        )?;

        #[cfg(target_arch = "aarch64")]
        vcpu.configure(&self.vm, boot_setup)?;

        #[cfg(target_arch = "riscv64")]
        vcpu.configure(boot_setup)?;

        Ok(())
    }

    /// Only create new vCPUs if there aren't any inactive ones to reuse
    fn create_vcpus(
        &mut self,
        desired_vcpus: u8,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
        info!(
            "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.config.max_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        // Only create vCPUs in excess of all the allocated vCPUs.
        for cpu_id in self.vcpus.len() as u8..desired_vcpus {
            vcpus.push(self.create_vcpu(
                cpu_id,
                // TODO: The special format of the CPU id can be removed once
                // ready to break live upgrade.
                snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
            )?);
        }

        Ok(vcpus)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn init_pmu(&self, irq: u32) -> Result<bool> {
        for cpu in self.vcpus.iter() {
            let cpu = cpu.lock().unwrap();
            // Check if the PMU attribute is available; if not, log it.
            if cpu.vcpu.has_pmu_support() {
                cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
            } else {
                debug!(
                    "PMU attribute is not supported in vCPU{}, skipping PMU init!",
                    cpu.id
                );
                return Ok(false);
            }
        }

        Ok(true)
    }

    pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
        self.vcpus.clone()
    }

    fn start_vcpu(
        &mut self,
        vcpu: Arc<Mutex<Vcpu>>,
        vcpu_id: u8,
        vcpu_thread_barrier: Arc<Barrier>,
        inserting: bool,
    ) -> Result<()> {
        let reset_evt = self.reset_evt.try_clone().unwrap();
        let exit_evt = self.exit_evt.try_clone().unwrap();
        #[cfg(feature = "kvm")]
        let hypervisor_type = self.hypervisor.hypervisor_type();
        #[cfg(feature = "guest_debug")]
        let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
        let panic_exit_evt = self.exit_evt.try_clone().unwrap();
        let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
        let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
        let vcpu_kick_signalled = self.vcpus_kick_signalled.clone();

        let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
        let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
            .vcpu_run_interrupted
            .clone();
        let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
        let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();

        // Prepare the CPU set the current vCPU is expected to run on.
        let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
            // SAFETY: all zeros is a valid pattern
            let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
            // SAFETY: FFI call, trivially safe
            unsafe { libc::CPU_ZERO(&mut cpuset) };
            for host_cpu in host_cpus {
                // SAFETY: FFI call, trivially safe
                unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) };
            }
            cpuset
        });

        // Retrieve seccomp filter for vcpu thread
        let vcpu_seccomp_filter = get_seccomp_filter(
            &self.seccomp_action,
            Thread::Vcpu,
            self.hypervisor.hypervisor_type(),
        )
        .map_err(Error::CreateSeccompFilter)?;

        #[cfg(target_arch = "x86_64")]
        let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();

        info!("Starting vCPU: cpu_id = {}", vcpu_id);

        let handle = Some(
            thread::Builder::new()
                .name(format!("vcpu{vcpu_id}"))
                .spawn(move || {
                    // Schedule the thread to run on the expected CPU set
                    if let Some(cpuset) = cpuset.as_ref() {
                        // SAFETY: FFI call with correct arguments
                        let ret = unsafe {
                            libc::sched_setaffinity(
                                0,
                                std::mem::size_of::<libc::cpu_set_t>(),
                                cpuset as *const libc::cpu_set_t,
                            )
                        };

                        if ret != 0 {
                            error!(
                                "Failed scheduling the vCPU {} on the expected CPU set: {}",
                                vcpu_id,
                                io::Error::last_os_error()
                            );
                            return;
                        }
                    }

                    // Apply seccomp filter for vcpu thread.
                    if !vcpu_seccomp_filter.is_empty() {
                        if let Err(e) =
                            apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
                        {
                            error!("Error applying seccomp filter: {:?}", e);
                            return;
                        }
                    }
                    extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
                    // This uses an async signal safe handler to kill the vcpu handles.
                    register_signal_handler(SIGRTMIN(), handle_signal)
                        .expect("Failed to register vcpu signal handler");
                    // Block until all CPUs are ready.
                    vcpu_thread_barrier.wait();

                    std::panic::catch_unwind(move || {
                        loop {
                            // If we are being told to pause, we park the thread
                            // until the pause boolean is toggled.
                            // The resume operation is responsible for toggling
                            // the boolean and unparking the thread.
                            // We enter a loop because park() could spuriously
                            // return. We will then park() again unless the
                            // pause boolean has been toggled.

                            // Need to use Ordering::SeqCst as we have multiple
                            // loads and stores to different atomics and we need
                            // to see them in a consistent order in all threads

                            if vcpu_pause_signalled.load(Ordering::SeqCst) {
                                // As a pause can be caused by PIO & MMIO exits, we need to ensure they are
                                // completed by returning to KVM_RUN. From the kernel docs:
                                //
                                // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
                                // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
                                // operations are complete (and guest state is consistent) only after userspace
                                // has re-entered the kernel with KVM_RUN. The kernel side will first finish
                                // incomplete operations and then check for pending signals.
                                // The pending state of the operation is not preserved in state which is
                                // visible to userspace, thus userspace should ensure that the operation is
                                // completed before performing a live migration. Userspace can re-enter the
                                // guest with an unmasked signal pending or with the immediate_exit field set
                                // to complete pending operations without allowing any further instructions
                                // to be executed.

                                #[cfg(feature = "kvm")]
                                if matches!(hypervisor_type, HypervisorType::Kvm) {
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
                                    if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
                                        error!("Unexpected VM exit on \"immediate_exit\" run");
                                        break;
                                    }
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
                                }

                                vcpu_run_interrupted.store(true, Ordering::SeqCst);

                                vcpu_paused.store(true, Ordering::SeqCst);
                                while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                    thread::park();
                                }
                                vcpu_run_interrupted.store(false, Ordering::SeqCst);
                            }

                            if vcpu_kick_signalled.load(Ordering::SeqCst) {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                #[cfg(target_arch = "x86_64")]
                                match vcpu.lock().as_ref().unwrap().vcpu.nmi() {
                                    Ok(()) => {}
                                    Err(e) => {
                                        error!("Error injecting NMI: {}", e);
                                        break;
                                    }
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }

                            #[cfg(feature = "tdx")]
                            let mut vcpu = vcpu.lock().unwrap();
                            #[cfg(not(feature = "tdx"))]
                            let vcpu = vcpu.lock().unwrap();
                            // vcpu.run() returns false on a triple-fault so trigger a reset
                            match vcpu.run() {
                                Ok(run) => match run {
                                    #[cfg(feature = "kvm")]
                                    VmExit::Debug => {
                                        info!("VmExit::Debug");
                                        #[cfg(feature = "guest_debug")]
                                        {
                                            vcpu_pause_signalled.store(true, Ordering::SeqCst);
                                            let raw_tid = get_raw_tid(vcpu_id as usize);
                                            vm_debug_evt.write(raw_tid as u64).unwrap();
                                        }
                                    }
                                    #[cfg(target_arch = "x86_64")]
                                    VmExit::IoapicEoi(vector) => {
                                        if let Some(interrupt_controller) =
                                            &interrupt_controller_clone
                                        {
                                            interrupt_controller
                                                .lock()
                                                .unwrap()
                                                .end_of_interrupt(vector);
                                        }
                                    }
                                    VmExit::Ignore => {}
                                    VmExit::Hyperv => {}
                                    VmExit::Reset => {
                                        info!("VmExit::Reset");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        reset_evt.write(1).unwrap();
                                        break;
                                    }
                                    VmExit::Shutdown => {
                                        info!("VmExit::Shutdown");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        exit_evt.write(1).unwrap();
                                        break;
                                    }
                                    #[cfg(feature = "tdx")]
                                    VmExit::Tdx => {
                                        if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
                                            match vcpu.get_tdx_exit_details() {
                                                Ok(details) => match details {
                                                    TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
                                                    TdxExitDetails::SetupEventNotifyInterrupt => {
                                                        warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
                                                    }
                                                },
                                                Err(e) => error!("Unexpected TDX VMCALL: {}", e),
                                            }
                                            vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
                                        } else {
                                            // We should never reach this code, as getting here
                                            // would mean the design of the code is wrong.
                                            unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
                                        }
                                    }
                                },

                                Err(e) => {
                                    error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
                                    vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                    exit_evt.write(1).unwrap();
                                    break;
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }
                        }
                    })
                    .or_else(|_| {
                        panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
                        error!("vCPU thread panicked");
                        panic_exit_evt.write(1)
                    })
                    .ok();
                })
                .map_err(Error::VcpuSpawn)?,
        );

        // On hotplug, calls into this function have entry_point set to None. It is for
        // those hotplugged CPU additions that we need to set the inserting flag.
        self.vcpu_states[usize::from(vcpu_id)].handle = handle;
        self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;

        Ok(())
    }
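
    // Editor's illustrative note (not part of the original source): the vCPU loop
    // above implements pausing cooperatively. A minimal sketch of the manager-side
    // sequence, based on the flags and helpers defined in this file:
    //
    //     self.vcpus_pause_signalled.store(true, Ordering::SeqCst); // request a pause
    //     for state in self.vcpu_states.iter() {
    //         state.signal_thread(); // SIGRTMIN interrupts KVM_RUN so the loop sees the flag
    //     }
    //     // Each vCPU thread then parks until the flag is cleared and unpark_thread() is called.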

    /// Start up as many vCPU threads as needed to reach `desired_vcpus`
    fn activate_vcpus(
        &mut self,
        desired_vcpus: u8,
        inserting: bool,
        paused: Option<bool>,
    ) -> Result<()> {
        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        let vcpu_thread_barrier = Arc::new(Barrier::new(
            (desired_vcpus - self.present_vcpus() + 1) as usize,
        ));

        if let Some(paused) = paused {
            self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
        }

        info!(
            "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
            desired_vcpus,
            self.vcpus.len(),
            self.present_vcpus(),
            self.vcpus_pause_signalled.load(Ordering::SeqCst)
        );

        // This reuses any inactive vCPUs as well as any that were newly created
        for vcpu_id in self.present_vcpus()..desired_vcpus {
            let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
            self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
        }

        // Unblock all CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal; actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
            self.vcpu_states[usize::from(cpu_id)]
                .pending_removal
                .store(true, Ordering::SeqCst);
        }
    }

    pub fn check_pending_removed_vcpu(&mut self) -> bool {
        for state in self.vcpu_states.iter() {
            if state.active() && state.pending_removal.load(Ordering::SeqCst) {
                return true;
            }
        }
        false
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" flag so that it can be reused
        state.kill.store(false, Ordering::SeqCst);
        state.pending_removal.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(
        &mut self,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        trace_scoped!("create_boot_vcpus");

        self.create_vcpus(self.boot_vcpus(), snapshot)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
    pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
        self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
    }

    pub fn start_restored_vcpus(&mut self) -> Result<()> {
        self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
            .map_err(|e| {
                Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
            })?;

        Ok(())
    }

    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal {
            return Ok(false);
        }

        if !self.dynamic {
            return Ok(false);
        }

        if self.check_pending_removed_vcpu() {
            return Err(Error::VcpuPendingRemovedVcpu);
        }

        match desired_vcpus.cmp(&self.present_vcpus()) {
            cmp::Ordering::Greater => {
                let vcpus = self.create_vcpus(desired_vcpus, None)?;
                for vcpu in vcpus {
                    self.configure_vcpu(vcpu, None)?
                }
                self.activate_vcpus(desired_vcpus, true, None)?;
                Ok(true)
            }
            cmp::Ordering::Less => {
                self.mark_vcpus_for_removal(desired_vcpus);
                Ok(true)
            }
            _ => Ok(false),
        }
    }
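
    // Editor's illustrative note (not part of the original source): a rough sketch of
    // how `resize()` is expected to be driven for CPU hotplug; the `cpu_manager`
    // binding is hypothetical and only shown for illustration:
    //
    //     let changed = cpu_manager.lock().unwrap().resize(6)?;
    //     if changed {
    //         // Scale up: new vCPUs were created, configured and activated with the
    //         // `inserting` flag set, so the guest is notified via the ACPI CSCN method.
    //         // Scale down: vCPUs were only marked `removing`; their threads are reaped
    //         // later when the guest ejects them (CPU_EJECT_FLAG -> remove_vcpu()).
    //     }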

    pub fn shutdown(&mut self) -> Result<()> {
        // Tell the vCPUs to stop themselves next time they go through the loop
        self.vcpus_kill_signalled.store(true, Ordering::SeqCst);

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        // Wait for all the threads to finish. This removes the state from the vector.
        for mut state in self.vcpu_states.drain(..) {
            state.join_thread()?;
        }

        Ok(())
    }

    #[cfg(feature = "tdx")]
    pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
        for vcpu in &self.vcpus {
            vcpu.lock()
                .unwrap()
                .vcpu
                .tdx_init(hob_address)
                .map_err(Error::InitializeTdx)?;
        }
        Ok(())
    }

    pub fn boot_vcpus(&self) -> u8 {
        self.config.boot_vcpus
    }

    pub fn max_vcpus(&self) -> u8 {
        self.config.max_vcpus
    }

    #[cfg(target_arch = "x86_64")]
    pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
        assert!(!self.cpuid.is_empty());
        self.cpuid.clone()
    }

    fn present_vcpus(&self) -> u8 {
        self.vcpu_states
            .iter()
            .fold(0, |acc, state| acc + state.active() as u8)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidrs(&self) -> Vec<u64> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_mpidr())
            .collect()
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_states(&self) -> Vec<CpuState> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
            .collect()
    }

    pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
        self.config
            .topology
            .clone()
            .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
    }

    #[cfg(not(target_arch = "riscv64"))]
    pub fn create_madt(&self) -> Sdt {
        use crate::acpi;
        // This is also checked in the command-line parsing.
        assert!(self.config.boot_vcpus <= self.config.max_vcpus);

        let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1);
        #[cfg(target_arch = "x86_64")]
        {
            madt.write(36, arch::layout::APIC_START.0);

            for cpu in 0..self.config.max_vcpus {
                let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology());

                let lapic = LocalX2Apic {
                    r#type: acpi::ACPI_X2APIC_PROCESSOR,
                    length: 16,
                    processor_id: cpu.into(),
                    apic_id: x2apic_id,
                    flags: if cpu < self.config.boot_vcpus {
                        1 << MADT_CPU_ENABLE_FLAG
                    } else {
                        0
                    } | (1 << MADT_CPU_ONLINE_CAPABLE_FLAG),
                    _reserved: 0,
                };
                madt.append(lapic);
            }

            madt.append(Ioapic {
                r#type: acpi::ACPI_APIC_IO,
                length: 12,
                ioapic_id: 0,
                apic_address: arch::layout::IOAPIC_START.0 as u32,
                gsi_base: 0,
                ..Default::default()
            });

            madt.append(InterruptSourceOverride {
                r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
                length: 10,
                bus: 0,
                source: 4,
                gsi: 4,
                flags: 0,
            });
        }

        #[cfg(target_arch = "aarch64")]
        {
            /* Notes:
             * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
             */

            // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
            for cpu in 0..self.config.boot_vcpus {
                let vcpu = &self.vcpus[cpu as usize];
                let mpidr = vcpu.lock().unwrap().get_mpidr();
                /* ARMv8 MPIDR format:
                     Bits [63:40] Must be zero
                     Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
                     Bits [31:24] Must be zero
                     Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
                     Bits [15:8]  Aff1 : Match Aff1 of target processor MPIDR
                     Bits [7:0]   Aff0 : Match Aff0 of target processor MPIDR
                */
                let mpidr_mask = 0xff_00ff_ffff;
                let gicc = GicC {
                    r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
                    length: 80,
                    reserved0: 0,
                    cpu_interface_number: cpu as u32,
                    uid: cpu as u32,
                    flags: 1,
                    parking_version: 0,
                    performance_interrupt: 0,
                    parked_address: 0,
                    base_address: 0,
                    gicv_base_address: 0,
                    gich_base_address: 0,
                    vgic_interrupt: 0,
                    gicr_base_address: 0,
                    mpidr: mpidr & mpidr_mask,
                    proc_power_effi_class: 0,
                    reserved1: 0,
                    spe_overflow_interrupt: 0,
                };

                madt.append(gicc);
            }
            let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());

            // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
            let gicd = GicD {
                r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
                length: 24,
                reserved0: 0,
                gic_id: 0,
                base_address: vgic_config.dist_addr,
                global_irq_base: 0,
                version: 3,
                reserved1: [0; 3],
            };
            madt.append(gicd);

            // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
            let gicr = GicR {
                r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
                length: 16,
                reserved: 0,
                base_address: vgic_config.redists_addr,
                range_length: vgic_config.redists_size as u32,
            };
            madt.append(gicr);

            // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
            let gicits = GicIts {
                r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
                length: 20,
                reserved0: 0,
                translation_id: 0,
                base_address: vgic_config.msi_addr,
                reserved1: 0,
            };
            madt.append(gicits);

            madt.update_checksum();
        }

        madt
    }

    #[cfg(target_arch = "aarch64")]
    pub fn create_pptt(&self) -> Sdt {
        let pptt_start = 0;
        let mut cpus = 0;
        let mut uid = 0;
        // If topology is not specified, the default setting is:
        // 1 package, multiple cores, 1 thread per core
        // This is also the behavior when PPTT is missing.
        let (threads_per_core, cores_per_package, packages) =
            self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));

        let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1);

        for cluster_idx in 0..packages {
            if cpus < self.config.boot_vcpus as usize {
                let cluster_offset = pptt.len() - pptt_start;
                let cluster_hierarchy_node = ProcessorHierarchyNode {
                    r#type: 0,
                    length: 20,
                    reserved: 0,
                    flags: 0x2,
                    parent: 0,
                    acpi_processor_id: cluster_idx as u32,
                    num_private_resources: 0,
                };
                pptt.append(cluster_hierarchy_node);

                for core_idx in 0..cores_per_package {
                    let core_offset = pptt.len() - pptt_start;

                    if threads_per_core > 1 {
                        let core_hierarchy_node = ProcessorHierarchyNode {
                            r#type: 0,
                            length: 20,
                            reserved: 0,
                            flags: 0x2,
                            parent: cluster_offset as u32,
                            acpi_processor_id: core_idx as u32,
                            num_private_resources: 0,
                        };
                        pptt.append(core_hierarchy_node);

                        for _thread_idx in 0..threads_per_core {
                            let thread_hierarchy_node = ProcessorHierarchyNode {
                                r#type: 0,
                                length: 20,
                                reserved: 0,
                                flags: 0xE,
                                parent: core_offset as u32,
                                acpi_processor_id: uid as u32,
                                num_private_resources: 0,
                            };
                            pptt.append(thread_hierarchy_node);
                            uid += 1;
                        }
                    } else {
                        let thread_hierarchy_node = ProcessorHierarchyNode {
                            r#type: 0,
                            length: 20,
                            reserved: 0,
                            flags: 0xA,
                            parent: cluster_offset as u32,
                            acpi_processor_id: uid as u32,
                            num_private_resources: 0,
                        };
                        pptt.append(thread_hierarchy_node);
                        uid += 1;
                    }
                }
                cpus += (cores_per_package * threads_per_core) as usize;
            }
        }

        pptt.update_checksum();
        pptt
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn create_standard_regs(&self, cpu_id: u8) -> StandardRegisters {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .create_standard_regs()
    }

    #[cfg(feature = "guest_debug")]
    fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_regs()
            .map_err(Error::CpuDebug)
    }

    #[cfg(feature = "guest_debug")]
    fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .set_regs(regs)
            .map_err(Error::CpuDebug)
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sregs()
            .map_err(Error::CpuDebug)
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .set_sregs(sregs)
            .map_err(Error::CpuDebug)
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn translate_gva(
        &self,
        _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: u8,
        gva: u64,
    ) -> Result<u64> {
        let (gpa, _) = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .translate_gva(gva, /* flags: unused */ 0)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        Ok(gpa)
    }

    ///
    /// On AArch64, the `translate_gva` API is not provided by KVM. We implement
    /// it in the VMM by walking through the translation tables.
    ///
    /// Address translation is a big topic; here we only focus on the scenario
    /// that matters to the VMM while debugging the kernel. This `translate_gva`
    /// implementation is restricted to:
    /// - Exception Level 1
    /// - Translating the high address range only (kernel space)
    ///
    /// This implementation supports the following Arm-v8a features related to
    /// address translation:
    /// - FEAT_LPA
    /// - FEAT_LVA
    /// - FEAT_LPA2
    ///
    #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
    fn translate_gva(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: u8,
        gva: u64,
    ) -> Result<u64> {
        let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(TCR_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(TTBR1_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(ID_AA64MMFR0_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;

        // Bit 55 of the VA determines the range, high (0xFFFxxx...)
        // or low (0x000xxx...).
        let high_range = extract_bits_64!(gva, 55, 1);
        if high_range == 0 {
            info!("VA (0x{:x}) range is not supported!", gva);
            return Ok(gva);
        }

        // High range size offset
        let tsz = extract_bits_64!(tcr_el1, 16, 6);
        // Granule size
        let tg = extract_bits_64!(tcr_el1, 30, 2);
        // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
        let ds = extract_bits_64!(tcr_el1, 59, 1);

        if tsz == 0 {
            info!("VA translation is not ready!");
            return Ok(gva);
        }

        // VA size is determined by TCR_EL1.T1SZ
        let va_size = 64 - tsz;
        // Number of bits in VA consumed in each level of translation
        let stride = match tg {
            3 => 13, // 64KB granule size
            1 => 11, // 16KB granule size
            _ => 9,  // 4KB, default
        };
        // Starting level of walking
        let mut level = 4 - (va_size - 4) / stride;
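
        // Editor's illustrative note (not part of the original source): a worked
        // example of the two expressions above, assuming a 4 KiB granule (stride = 9)
        // and TCR_EL1.T1SZ = 16: va_size = 64 - 16 = 48, and the walk starts at
        // level = 4 - (48 - 4) / 9 = 4 - 4 = 0, i.e. a full four-level walk
        // (levels 0 through 3).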

        // PA or IPA size is determined
        let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
        let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
        // The IPA size in TCR_EL1 and the PA Range in ID_AA64MMFR0_EL1 should match.
        // To be safe, we use the minimum value if they are different.
        let pa_range = std::cmp::min(tcr_ips, pa_range);
        // PA size in bits
        let pa_size = match pa_range {
            0 => 32,
            1 => 36,
            2 => 40,
            3 => 42,
            4 => 44,
            5 => 48,
            6 => 52,
            _ => {
                return Err(Error::TranslateVirtualAddress(anyhow!(format!(
                    "PA range not supported {pa_range}"
                ))))
            }
        };

        let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
        let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
        // If FEAT_LPA2 is present, the translation table descriptor holds
        // 50 bits of the table address of the next level.
        // Otherwise, it is 48 bits.
        let descaddrmask = if ds == 1 {
            !0u64 >> (64 - 50) // mask with 50 least significant bits
        } else {
            !0u64 >> (64 - 48) // mask with 48 least significant bits
        };
        let descaddrmask = descaddrmask & !indexmask_grainsize;

        // Translation table base address
        let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
        // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
        // address bits [48:51] come from TTBR1_EL1 bits [2:5].
        if pa_size == 52 {
            descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
        }

        // Loop through tables of each level
        loop {
            // Table offset for current level
            let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
            descaddr |= table_offset;
            descaddr &= !7u64;

            let mut buf = [0; 8];
            guest_memory
                .memory()
                .read(&mut buf, GuestAddress(descaddr))
                .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
            let descriptor = u64::from_le_bytes(buf);

            descaddr = descriptor & descaddrmask;
            // In the case of FEAT_LPA, the next-level translation table address
            // bits [48:51] come from bits [12:15] of the current descriptor.
            // For FEAT_LPA2, the next-level translation table address
            // bits [50:51] come from bits [8:9] of the current descriptor, and
            // bits [48:49] come from bits [48:49] of the descriptor which was
            // handled previously.
            if pa_size == 52 {
                if ds == 1 {
                    // FEAT_LPA2
                    descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
                } else {
                    // FEAT_LPA
                    descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
                }
            }

            if (descriptor & 2) != 0 && (level < 3) {
                // This is a table entry. Go down to the next level.
                level += 1;
                indexmask = indexmask_grainsize;
                continue;
            }

            break;
        }

        // We have reached either:
        // - a page entry at level 3 or
        // - a block entry at level 1 or 2
        let page_size = 1u64 << ((stride * (4 - level)) + 3);
        descaddr &= !(page_size - 1);
        descaddr |= gva & (page_size - 1);

        Ok(descaddr)
    }

    pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
        self.acpi_address = Some(acpi_address);
    }

    pub(crate) fn set_interrupt_controller(
        &mut self,
        interrupt_controller: Arc<Mutex<dyn InterruptController>>,
    ) {
        self.interrupt_controller = Some(interrupt_controller);
    }

    pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
        &self.vcpus_kill_signalled
    }

    #[cfg(feature = "igvm")]
    pub(crate) fn get_cpuid_leaf(
        &self,
        cpu_id: u8,
        eax: u32,
        ecx: u32,
        xfem: u64,
        xss: u64,
    ) -> Result<[u32; 4]> {
        let leaf_info = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_cpuid_values(eax, ecx, xfem, xss)
            .unwrap();
        Ok(leaf_info)
    }

    #[cfg(feature = "sev_snp")]
    pub(crate) fn sev_snp_enabled(&self) -> bool {
        self.sev_snp_enabled
    }

    pub(crate) fn nmi(&self) -> Result<()> {
        self.vcpus_kick_signalled.store(true, Ordering::SeqCst);

        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        self.vcpus_kick_signalled.store(false, Ordering::SeqCst);

        Ok(())
    }
}

struct Cpu {
    cpu_id: u8,
    proximity_domain: u32,
    dynamic: bool,
    #[cfg(target_arch = "x86_64")]
    topology: Option<(u8, u8, u8)>,
}

#[cfg(target_arch = "x86_64")]
const MADT_CPU_ENABLE_FLAG: usize = 0;

#[cfg(target_arch = "x86_64")]
const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;

impl Cpu {
    #[cfg(target_arch = "x86_64")]
    fn generate_mat(&self) -> Vec<u8> {
        let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);

        let lapic = LocalX2Apic {
            r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
            length: 16,
            processor_id: self.cpu_id.into(),
            apic_id: x2apic_id,
            flags: 1 << MADT_CPU_ENABLE_FLAG,
            _reserved: 0,
        };

        let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
        // SAFETY: mat_data is large enough to hold lapic
        unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };

        mat_data
    }
}
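
// Editor's illustrative note (not part of the original source): `generate_mat()`
// returns the raw bytes of a single `LocalX2Apic` entry, which the _MAT object
// below hands back to the guest. The buffer length therefore matches the entry's
// declared ACPI `length` of 16 bytes:
//
//     assert_eq!(std::mem::size_of::<LocalX2Apic>(), 16);
//     // => Cpu::generate_mat() always yields a 16-byte buffer.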
1973 */ 1974 #[cfg(target_arch = "x86_64")] 1975 &aml::Method::new( 1976 "_STA".into(), 1977 0, 1978 false, 1979 // Call into CSTA method which will interrogate device 1980 vec![&aml::Return::new(&aml::MethodCall::new( 1981 "CSTA".into(), 1982 vec![&self.cpu_id], 1983 ))], 1984 ), 1985 &aml::Method::new( 1986 "_PXM".into(), 1987 0, 1988 false, 1989 vec![&aml::Return::new(&self.proximity_domain)], 1990 ), 1991 // The Linux kernel expects every CPU device to have a _MAT entry 1992 // containing the LAPIC for this processor with the enabled bit set 1993 // even it if is disabled in the MADT (non-boot CPU) 1994 #[cfg(target_arch = "x86_64")] 1995 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 1996 // Trigger CPU ejection 1997 #[cfg(target_arch = "x86_64")] 1998 &aml::Method::new( 1999 "_EJ0".into(), 2000 1, 2001 false, 2002 // Call into CEJ0 method which will actually eject device 2003 vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])], 2004 ), 2005 ], 2006 ) 2007 .to_aml_bytes(sink); 2008 } else { 2009 aml::Device::new( 2010 format!("C{:03X}", self.cpu_id).as_str().into(), 2011 vec![ 2012 &aml::Name::new("_HID".into(), &"ACPI0007"), 2013 &aml::Name::new("_UID".into(), &self.cpu_id), 2014 #[cfg(target_arch = "x86_64")] 2015 &aml::Method::new( 2016 "_STA".into(), 2017 0, 2018 false, 2019 // Mark CPU present see CSTA implementation 2020 vec![&aml::Return::new(&0xfu8)], 2021 ), 2022 &aml::Method::new( 2023 "_PXM".into(), 2024 0, 2025 false, 2026 vec![&aml::Return::new(&self.proximity_domain)], 2027 ), 2028 // The Linux kernel expects every CPU device to have a _MAT entry 2029 // containing the LAPIC for this processor with the enabled bit set 2030 // even it if is disabled in the MADT (non-boot CPU) 2031 #[cfg(target_arch = "x86_64")] 2032 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 2033 ], 2034 ) 2035 .to_aml_bytes(sink); 2036 } 2037 } 2038 } 2039 2040 struct CpuNotify { 2041 cpu_id: u8, 2042 } 2043 2044 impl Aml for CpuNotify { 2045 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2046 let object = aml::Path::new(&format!("C{:03X}", self.cpu_id)); 2047 aml::If::new( 2048 &aml::Equal::new(&aml::Arg(0), &self.cpu_id), 2049 vec![&aml::Notify::new(&object, &aml::Arg(1))], 2050 ) 2051 .to_aml_bytes(sink) 2052 } 2053 } 2054 2055 struct CpuMethods { 2056 max_vcpus: u8, 2057 dynamic: bool, 2058 } 2059 2060 impl Aml for CpuMethods { 2061 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2062 if self.dynamic { 2063 // CPU status method 2064 aml::Method::new( 2065 "CSTA".into(), 2066 1, 2067 true, 2068 vec![ 2069 // Take lock defined above 2070 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2071 // Write CPU number (in first argument) to I/O port via field 2072 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 2073 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2074 // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning) 2075 &aml::If::new( 2076 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE), 2077 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 2078 ), 2079 // Release lock 2080 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2081 // Return 0 or 0xf 2082 &aml::Return::new(&aml::Local(0)), 2083 ], 2084 ) 2085 .to_aml_bytes(sink); 2086 2087 let mut cpu_notifies = Vec::new(); 2088 for cpu_id in 0..self.max_vcpus { 2089 cpu_notifies.push(CpuNotify { cpu_id }); 2090 } 2091 2092 let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new(); 2093 for cpu_id in 
0..self.max_vcpus { 2094 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 2095 } 2096 2097 aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink); 2098 2099 aml::Method::new( 2100 "CEJ0".into(), 2101 1, 2102 true, 2103 vec![ 2104 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2105 // Write CPU number (in first argument) to I/O port via field 2106 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 2107 // Set CEJ0 bit 2108 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 2109 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2110 ], 2111 ) 2112 .to_aml_bytes(sink); 2113 2114 aml::Method::new( 2115 "CSCN".into(), 2116 0, 2117 true, 2118 vec![ 2119 // Take lock defined above 2120 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2121 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2122 &aml::While::new( 2123 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 2124 vec![ 2125 // Write CPU number (in first argument) to I/O port via field 2126 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 2127 // Check if CINS bit is set 2128 &aml::If::new( 2129 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 2130 // Notify device if it is 2131 vec![ 2132 &aml::MethodCall::new( 2133 "CTFY".into(), 2134 vec![&aml::Local(0), &aml::ONE], 2135 ), 2136 // Reset CINS bit 2137 &aml::Store::new( 2138 &aml::Path::new("\\_SB_.PRES.CINS"), 2139 &aml::ONE, 2140 ), 2141 ], 2142 ), 2143 // Check if CRMV bit is set 2144 &aml::If::new( 2145 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 2146 // Notify device if it is (with the eject constant 0x3) 2147 vec![ 2148 &aml::MethodCall::new( 2149 "CTFY".into(), 2150 vec![&aml::Local(0), &3u8], 2151 ), 2152 // Reset CRMV bit 2153 &aml::Store::new( 2154 &aml::Path::new("\\_SB_.PRES.CRMV"), 2155 &aml::ONE, 2156 ), 2157 ], 2158 ), 2159 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 2160 ], 2161 ), 2162 // Release lock 2163 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2164 ], 2165 ) 2166 .to_aml_bytes(sink) 2167 } else { 2168 aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink) 2169 } 2170 } 2171 } 2172 2173 impl Aml for CpuManager { 2174 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2175 #[cfg(target_arch = "x86_64")] 2176 if let Some(acpi_address) = self.acpi_address { 2177 // CPU hotplug controller 2178 aml::Device::new( 2179 "_SB_.PRES".into(), 2180 vec![ 2181 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 2182 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 2183 // Mutex to protect concurrent access as we write to choose CPU and then read back status 2184 &aml::Mutex::new("CPLK".into(), 0), 2185 &aml::Name::new( 2186 "_CRS".into(), 2187 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2188 aml::AddressSpaceCacheable::NotCacheable, 2189 true, 2190 acpi_address.0, 2191 acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 2192 None, 2193 )]), 2194 ), 2195 // OpRegion and Fields map MMIO range into individual field values 2196 &aml::OpRegion::new( 2197 "PRST".into(), 2198 aml::OpRegionSpace::SystemMemory, 2199 &(acpi_address.0 as usize), 2200 &CPU_MANAGER_ACPI_SIZE, 2201 ), 2202 &aml::Field::new( 2203 "PRST".into(), 2204 aml::FieldAccessType::Byte, 2205 aml::FieldLockRule::NoLock, 2206 aml::FieldUpdateRule::WriteAsZeroes, 2207 vec![ 2208 aml::FieldEntry::Reserved(32), 2209 aml::FieldEntry::Named(*b"CPEN", 1), 2210 aml::FieldEntry::Named(*b"CINS", 1), 2211 
aml::FieldEntry::Named(*b"CRMV", 1),
2212 aml::FieldEntry::Named(*b"CEJ0", 1),
2213 aml::FieldEntry::Reserved(4),
2214 aml::FieldEntry::Named(*b"CCMD", 8),
2215 ],
2216 ),
2217 &aml::Field::new(
2218 "PRST".into(),
2219 aml::FieldAccessType::DWord,
2220 aml::FieldLockRule::NoLock,
2221 aml::FieldUpdateRule::Preserve,
2222 vec![
2223 aml::FieldEntry::Named(*b"CSEL", 32),
2224 aml::FieldEntry::Reserved(32),
2225 aml::FieldEntry::Named(*b"CDAT", 32),
2226 ],
2227 ),
2228 ],
2229 )
2230 .to_aml_bytes(sink);
2231 }
2232
2233 // CPU devices
2234 let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2235 let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2236 // Bundle methods together under a common object
2237 let methods = CpuMethods {
2238 max_vcpus: self.config.max_vcpus,
2239 dynamic: self.dynamic,
2240 };
2241 let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];
2242
2243 #[cfg(target_arch = "x86_64")]
2244 let topology = self.get_vcpu_topology();
2245 let mut cpu_devices = Vec::new();
2246 for cpu_id in 0..self.config.max_vcpus {
2247 let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2248 let cpu_device = Cpu {
2249 cpu_id,
2250 proximity_domain,
2251 dynamic: self.dynamic,
2252 #[cfg(target_arch = "x86_64")]
2253 topology,
2254 };
2255
2256 cpu_devices.push(cpu_device);
2257 }
2258
2259 for cpu_device in cpu_devices.iter() {
2260 cpu_data_inner.push(cpu_device);
2261 }
2262
2263 aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2264 }
2265 }
2266
2267 impl Pausable for CpuManager {
2268 fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2269 // Tell the vCPUs to pause themselves next time they exit
2270 self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2271
2272 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2273 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2274 // above.
2275 for state in self.vcpu_states.iter() {
2276 state.signal_thread();
2277 }
2278
2279 for vcpu in self.vcpus.iter() {
2280 let mut vcpu = vcpu.lock().unwrap();
2281 vcpu.pause()?;
2282 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2283 if !self.config.kvm_hyperv {
2284 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2285 MigratableError::Pause(anyhow!(
2286 "Could not notify guest it has been paused {:?}",
2287 e
2288 ))
2289 })?;
2290 }
2291 }
2292
2293 // Each vCPU thread sets its paused state before parking, so wait here for every
2294 // activated vCPU to change its state, ensuring it has parked.
2295 for state in self.vcpu_states.iter() {
2296 if state.active() {
2297 while !state.paused.load(Ordering::SeqCst) {
2298 // To avoid a priority inversion with the vCPU thread
2299 thread::sleep(std::time::Duration::from_millis(1));
2300 }
2301 }
2302 }
2303
2304 Ok(())
2305 }
2306
2307 fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2308 for vcpu in self.vcpus.iter() {
2309 vcpu.lock().unwrap().resume()?;
2310 }
2311
2312 // Clear the vCPUs' pause boolean
2313 self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2314
2315 // Unpark all the vCPU threads.
2316 // Once unparked, the next thing they will do is check the pause
2317 // boolean. Since it is now false, they will exit their pause loop
2318 // and go back to running the guest.
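// For each vCPU thread: mark it as no longer paused, then unpark it so it can
// leave its pause loop and resume running.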
2319 for state in self.vcpu_states.iter() { 2320 state.paused.store(false, Ordering::SeqCst); 2321 state.unpark_thread(); 2322 } 2323 Ok(()) 2324 } 2325 } 2326 2327 impl Snapshottable for CpuManager { 2328 fn id(&self) -> String { 2329 CPU_MANAGER_SNAPSHOT_ID.to_string() 2330 } 2331 2332 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2333 let mut cpu_manager_snapshot = Snapshot::default(); 2334 2335 // The CpuManager snapshot is a collection of all vCPUs snapshots. 2336 for vcpu in &self.vcpus { 2337 let mut vcpu = vcpu.lock().unwrap(); 2338 cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?); 2339 } 2340 2341 Ok(cpu_manager_snapshot) 2342 } 2343 } 2344 2345 impl Transportable for CpuManager {} 2346 impl Migratable for CpuManager {} 2347 2348 #[cfg(feature = "guest_debug")] 2349 impl Debuggable for CpuManager { 2350 #[cfg(feature = "kvm")] 2351 fn set_guest_debug( 2352 &self, 2353 cpu_id: usize, 2354 addrs: &[GuestAddress], 2355 singlestep: bool, 2356 ) -> std::result::Result<(), DebuggableError> { 2357 self.vcpus[cpu_id] 2358 .lock() 2359 .unwrap() 2360 .vcpu 2361 .set_guest_debug(addrs, singlestep) 2362 .map_err(DebuggableError::SetDebug) 2363 } 2364 2365 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2366 Ok(()) 2367 } 2368 2369 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2370 Ok(()) 2371 } 2372 2373 #[cfg(target_arch = "x86_64")] 2374 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2375 // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15 2376 let gregs = self 2377 .get_regs(cpu_id as u8) 2378 .map_err(DebuggableError::ReadRegs)?; 2379 let regs = [ 2380 gregs.get_rax(), 2381 gregs.get_rbx(), 2382 gregs.get_rcx(), 2383 gregs.get_rdx(), 2384 gregs.get_rsi(), 2385 gregs.get_rdi(), 2386 gregs.get_rbp(), 2387 gregs.get_rsp(), 2388 gregs.get_r8(), 2389 gregs.get_r9(), 2390 gregs.get_r10(), 2391 gregs.get_r11(), 2392 gregs.get_r12(), 2393 gregs.get_r13(), 2394 gregs.get_r14(), 2395 gregs.get_r15(), 2396 ]; 2397 2398 // GDB exposes 32-bit eflags instead of 64-bit rflags. 
2399 // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2400 let eflags = gregs.get_rflags() as u32;
2401 let rip = gregs.get_rip();
2402
2403 // Segment registers: CS, SS, DS, ES, FS, GS
2404 let sregs = self
2405 .get_sregs(cpu_id as u8)
2406 .map_err(DebuggableError::ReadRegs)?;
2407 let segments = X86SegmentRegs {
2408 cs: sregs.cs.selector as u32,
2409 ss: sregs.ss.selector as u32,
2410 ds: sregs.ds.selector as u32,
2411 es: sregs.es.selector as u32,
2412 fs: sregs.fs.selector as u32,
2413 gs: sregs.gs.selector as u32,
2414 };
2415
2416 // TODO: Add other registers
2417
2418 Ok(CoreRegs {
2419 regs,
2420 eflags,
2421 rip,
2422 segments,
2423 ..Default::default()
2424 })
2425 }
2426
2427 #[cfg(target_arch = "aarch64")]
2428 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2429 let gregs = self
2430 .get_regs(cpu_id as u8)
2431 .map_err(DebuggableError::ReadRegs)?;
2432 Ok(CoreRegs {
2433 x: gregs.get_regs(),
2434 sp: gregs.get_sp(),
2435 pc: gregs.get_pc(),
2436 ..Default::default()
2437 })
2438 }
2439
2440 #[cfg(target_arch = "x86_64")]
2441 fn write_regs(
2442 &self,
2443 cpu_id: usize,
2444 regs: &CoreRegs,
2445 ) -> std::result::Result<(), DebuggableError> {
2446 let orig_gregs = self
2447 .get_regs(cpu_id as u8)
2448 .map_err(DebuggableError::ReadRegs)?;
2449 let mut gregs = self.create_standard_regs(cpu_id as u8);
2450 gregs.set_rax(regs.regs[0]);
2451 gregs.set_rbx(regs.regs[1]);
2452 gregs.set_rcx(regs.regs[2]);
2453 gregs.set_rdx(regs.regs[3]);
2454 gregs.set_rsi(regs.regs[4]);
2455 gregs.set_rdi(regs.regs[5]);
2456 gregs.set_rbp(regs.regs[6]);
2457 gregs.set_rsp(regs.regs[7]);
2458 gregs.set_r8(regs.regs[8]);
2459 gregs.set_r9(regs.regs[9]);
2460 gregs.set_r10(regs.regs[10]);
2461 gregs.set_r11(regs.regs[11]);
2462 gregs.set_r12(regs.regs[12]);
2463 gregs.set_r13(regs.regs[13]);
2464 gregs.set_r14(regs.regs[14]);
2465 gregs.set_r15(regs.regs[15]);
2466 gregs.set_rip(regs.rip);
2467 // Update only the lower 32 bits of rflags.
2468 gregs.set_rflags((orig_gregs.get_rflags() & !(u32::MAX as u64)) | (regs.eflags as u64));
2469
2470 self.set_regs(cpu_id as u8, &gregs)
2471 .map_err(DebuggableError::WriteRegs)?;
2472
2473 // Segment registers: CS, SS, DS, ES, FS, GS
2474 // Since GDB only cares about the selectors, we call get_sregs() first.
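// Note that only the 16-bit selector of each segment register is overwritten below;
// the base, limit and attribute fields returned by get_sregs() are left untouched.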
2475 let mut sregs = self 2476 .get_sregs(cpu_id as u8) 2477 .map_err(DebuggableError::ReadRegs)?; 2478 sregs.cs.selector = regs.segments.cs as u16; 2479 sregs.ss.selector = regs.segments.ss as u16; 2480 sregs.ds.selector = regs.segments.ds as u16; 2481 sregs.es.selector = regs.segments.es as u16; 2482 sregs.fs.selector = regs.segments.fs as u16; 2483 sregs.gs.selector = regs.segments.gs as u16; 2484 2485 self.set_sregs(cpu_id as u8, &sregs) 2486 .map_err(DebuggableError::WriteRegs)?; 2487 2488 // TODO: Add other registers 2489 2490 Ok(()) 2491 } 2492 2493 #[cfg(target_arch = "aarch64")] 2494 fn write_regs( 2495 &self, 2496 cpu_id: usize, 2497 regs: &CoreRegs, 2498 ) -> std::result::Result<(), DebuggableError> { 2499 let mut gregs = self 2500 .get_regs(cpu_id as u8) 2501 .map_err(DebuggableError::ReadRegs)?; 2502 2503 gregs.set_regs(regs.x); 2504 gregs.set_sp(regs.sp); 2505 gregs.set_pc(regs.pc); 2506 2507 self.set_regs(cpu_id as u8, &gregs) 2508 .map_err(DebuggableError::WriteRegs)?; 2509 2510 Ok(()) 2511 } 2512 2513 fn read_mem( 2514 &self, 2515 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2516 cpu_id: usize, 2517 vaddr: GuestAddress, 2518 len: usize, 2519 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2520 let mut buf = vec![0; len]; 2521 let mut total_read = 0_u64; 2522 2523 while total_read < len as u64 { 2524 let gaddr = vaddr.0 + total_read; 2525 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2526 Ok(paddr) => paddr, 2527 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 2528 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2529 }; 2530 let psize = arch::PAGE_SIZE as u64; 2531 let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1))); 2532 guest_memory 2533 .memory() 2534 .read( 2535 &mut buf[total_read as usize..total_read as usize + read_len as usize], 2536 GuestAddress(paddr), 2537 ) 2538 .map_err(DebuggableError::ReadMem)?; 2539 total_read += read_len; 2540 } 2541 Ok(buf) 2542 } 2543 2544 fn write_mem( 2545 &self, 2546 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2547 cpu_id: usize, 2548 vaddr: &GuestAddress, 2549 data: &[u8], 2550 ) -> std::result::Result<(), DebuggableError> { 2551 let mut total_written = 0_u64; 2552 2553 while total_written < data.len() as u64 { 2554 let gaddr = vaddr.0 + total_written; 2555 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2556 Ok(paddr) => paddr, 2557 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 
2558 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2559 }; 2560 let psize = arch::PAGE_SIZE as u64; 2561 let write_len = std::cmp::min( 2562 data.len() as u64 - total_written, 2563 psize - (paddr & (psize - 1)), 2564 ); 2565 guest_memory 2566 .memory() 2567 .write( 2568 &data[total_written as usize..total_written as usize + write_len as usize], 2569 GuestAddress(paddr), 2570 ) 2571 .map_err(DebuggableError::WriteMem)?; 2572 total_written += write_len; 2573 } 2574 Ok(()) 2575 } 2576 2577 fn active_vcpus(&self) -> usize { 2578 self.present_vcpus() as usize 2579 } 2580 } 2581 2582 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2583 impl Elf64Writable for CpuManager {} 2584 2585 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2586 impl CpuElf64Writable for CpuManager { 2587 fn cpu_write_elf64_note( 2588 &mut self, 2589 dump_state: &DumpState, 2590 ) -> std::result::Result<(), GuestDebuggableError> { 2591 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2592 for vcpu in &self.vcpus { 2593 let note_size = self.get_note_size(NoteDescType::Elf, 1); 2594 let mut pos: usize = 0; 2595 let mut buf = vec![0; note_size as usize]; 2596 let descsz = size_of::<X86_64ElfPrStatus>(); 2597 let vcpu_id = vcpu.lock().unwrap().id; 2598 2599 let note = Elf64_Nhdr { 2600 n_namesz: COREDUMP_NAME_SIZE, 2601 n_descsz: descsz as u32, 2602 n_type: NT_PRSTATUS, 2603 }; 2604 2605 let bytes: &[u8] = note.as_slice(); 2606 buf.splice(0.., bytes.to_vec()); 2607 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2608 buf.resize(pos + 4, 0); 2609 buf.splice(pos.., "CORE".to_string().into_bytes()); 2610 2611 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2612 buf.resize(pos + 32 + 4, 0); 2613 let pid = vcpu_id as u64; 2614 let bytes: &[u8] = pid.as_slice(); 2615 buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */ 2616 2617 pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>(); 2618 2619 let orig_rax: u64 = 0; 2620 let gregs = self.vcpus[usize::from(vcpu_id)] 2621 .lock() 2622 .unwrap() 2623 .vcpu 2624 .get_regs() 2625 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2626 2627 let regs1 = [ 2628 gregs.get_r15(), 2629 gregs.get_r14(), 2630 gregs.get_r13(), 2631 gregs.get_r12(), 2632 gregs.get_rbp(), 2633 gregs.get_rbx(), 2634 gregs.get_r11(), 2635 gregs.get_r10(), 2636 ]; 2637 let regs2 = [ 2638 gregs.get_r9(), 2639 gregs.get_r8(), 2640 gregs.get_rax(), 2641 gregs.get_rcx(), 2642 gregs.get_rdx(), 2643 gregs.get_rsi(), 2644 gregs.get_rdi(), 2645 orig_rax, 2646 ]; 2647 2648 let sregs = self.vcpus[usize::from(vcpu_id)] 2649 .lock() 2650 .unwrap() 2651 .vcpu 2652 .get_sregs() 2653 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2654 2655 debug!( 2656 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}", 2657 gregs.get_rip(), 2658 gregs.get_rsp(), 2659 sregs.gs.base, 2660 sregs.cs.selector, 2661 sregs.ss.selector, 2662 sregs.ds.selector, 2663 ); 2664 2665 let regs = X86_64UserRegs { 2666 regs1, 2667 regs2, 2668 rip: gregs.get_rip(), 2669 cs: sregs.cs.selector as u64, 2670 eflags: gregs.get_rflags(), 2671 rsp: gregs.get_rsp(), 2672 ss: sregs.ss.selector as u64, 2673 fs_base: sregs.fs.base, 2674 gs_base: sregs.gs.base, 2675 ds: sregs.ds.selector as u64, 2676 es: sregs.es.selector as u64, 2677 fs: sregs.fs.selector as u64, 2678 gs: sregs.gs.selector as u64, 2679 }; 2680 2681 // let bytes: &[u8] = unsafe { any_as_u8_slice(®s) }; 2682 let bytes: &[u8] = regs.as_slice(); 2683 buf.resize(note_size as usize, 0); 2684 
buf.splice(pos.., bytes.to_vec()); 2685 buf.resize(note_size as usize, 0); 2686 2687 coredump_file 2688 .write(&buf) 2689 .map_err(GuestDebuggableError::CoredumpFile)?; 2690 } 2691 2692 Ok(()) 2693 } 2694 2695 fn cpu_write_vmm_note( 2696 &mut self, 2697 dump_state: &DumpState, 2698 ) -> std::result::Result<(), GuestDebuggableError> { 2699 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2700 for vcpu in &self.vcpus { 2701 let note_size = self.get_note_size(NoteDescType::Vmm, 1); 2702 let mut pos: usize = 0; 2703 let mut buf = vec![0; note_size as usize]; 2704 let descsz = size_of::<DumpCpusState>(); 2705 let vcpu_id = vcpu.lock().unwrap().id; 2706 2707 let note = Elf64_Nhdr { 2708 n_namesz: COREDUMP_NAME_SIZE, 2709 n_descsz: descsz as u32, 2710 n_type: 0, 2711 }; 2712 2713 let bytes: &[u8] = note.as_slice(); 2714 buf.splice(0.., bytes.to_vec()); 2715 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2716 2717 buf.resize(pos + 4, 0); 2718 buf.splice(pos.., "QEMU".to_string().into_bytes()); 2719 2720 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2721 2722 let gregs = self.vcpus[usize::from(vcpu_id)] 2723 .lock() 2724 .unwrap() 2725 .vcpu 2726 .get_regs() 2727 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2728 2729 let regs1 = [ 2730 gregs.get_rax(), 2731 gregs.get_rbx(), 2732 gregs.get_rcx(), 2733 gregs.get_rdx(), 2734 gregs.get_rsi(), 2735 gregs.get_rdi(), 2736 gregs.get_rsp(), 2737 gregs.get_rbp(), 2738 ]; 2739 2740 let regs2 = [ 2741 gregs.get_r8(), 2742 gregs.get_r9(), 2743 gregs.get_r10(), 2744 gregs.get_r11(), 2745 gregs.get_r12(), 2746 gregs.get_r13(), 2747 gregs.get_r14(), 2748 gregs.get_r15(), 2749 ]; 2750 2751 let sregs = self.vcpus[usize::from(vcpu_id)] 2752 .lock() 2753 .unwrap() 2754 .vcpu 2755 .get_sregs() 2756 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2757 2758 let mut msrs = vec![MsrEntry { 2759 index: msr_index::MSR_KERNEL_GS_BASE, 2760 ..Default::default() 2761 }]; 2762 2763 self.vcpus[vcpu_id as usize] 2764 .lock() 2765 .unwrap() 2766 .vcpu 2767 .get_msrs(&mut msrs) 2768 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?; 2769 let kernel_gs_base = msrs[0].data; 2770 2771 let cs = CpuSegment::new(sregs.cs); 2772 let ds = CpuSegment::new(sregs.ds); 2773 let es = CpuSegment::new(sregs.es); 2774 let fs = CpuSegment::new(sregs.fs); 2775 let gs = CpuSegment::new(sregs.gs); 2776 let ss = CpuSegment::new(sregs.ss); 2777 let ldt = CpuSegment::new(sregs.ldt); 2778 let tr = CpuSegment::new(sregs.tr); 2779 let gdt = CpuSegment::new_from_table(sregs.gdt); 2780 let idt = CpuSegment::new_from_table(sregs.idt); 2781 let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4]; 2782 let regs = DumpCpusState { 2783 version: 1, 2784 size: size_of::<DumpCpusState>() as u32, 2785 regs1, 2786 regs2, 2787 rip: gregs.get_rip(), 2788 rflags: gregs.get_rflags(), 2789 cs, 2790 ds, 2791 es, 2792 fs, 2793 gs, 2794 ss, 2795 ldt, 2796 tr, 2797 gdt, 2798 idt, 2799 cr, 2800 kernel_gs_base, 2801 }; 2802 2803 let bytes: &[u8] = regs.as_slice(); 2804 buf.resize(note_size as usize, 0); 2805 buf.splice(pos.., bytes.to_vec()); 2806 buf.resize(note_size as usize, 0); 2807 2808 coredump_file 2809 .write(&buf) 2810 .map_err(GuestDebuggableError::CoredumpFile)?; 2811 } 2812 2813 Ok(()) 2814 } 2815 } 2816 2817 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2818 #[cfg(test)] 2819 mod tests { 2820 use arch::layout::{BOOT_STACK_POINTER, ZERO_PAGE_START}; 2821 use arch::x86_64::interrupts::*; 2822 use 
arch::x86_64::regs::*;
2823 use hypervisor::arch::x86::{FpuState, LapicState};
2824 use hypervisor::StandardRegisters;
2825 use linux_loader::loader::bootparam::setup_header;
2826
2827 #[test]
2828 fn test_setlint() {
2829 let hv = hypervisor::new().unwrap();
2830 let vm = hv.create_vm().expect("new VM fd creation failed");
2831 hv.check_required_extensions().unwrap();
2832 // Calling get_lapic will fail if there is no irqchip beforehand.
2833 vm.create_irq_chip().unwrap();
2834 let vcpu = vm.create_vcpu(0, None).unwrap();
2835 let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2836
2837 // Compute the value that is expected to represent LVT0 and LVT1.
2838 let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2839 let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2840 let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2841 let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2842
2843 set_lint(&vcpu).unwrap();
2844
2845 // Compute the value that represents LVT0 and LVT1 after set_lint.
2846 let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2847 let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2848 let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2849 assert_eq!(lint0_mode_expected, lint0_mode_actual);
2850 assert_eq!(lint1_mode_expected, lint1_mode_actual);
2851 }
2852
2853 #[test]
2854 fn test_setup_fpu() {
2855 let hv = hypervisor::new().unwrap();
2856 let vm = hv.create_vm().expect("new VM fd creation failed");
2857 let vcpu = vm.create_vcpu(0, None).unwrap();
2858 setup_fpu(&vcpu).unwrap();
2859
2860 let expected_fpu: FpuState = FpuState {
2861 fcw: 0x37f,
2862 mxcsr: 0x1f80,
2863 ..Default::default()
2864 };
2865 let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2866 // TODO: auto-generate kvm related structures with PartialEq on.
2867 assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2868 // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
2869 // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
2870 // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should
2871 // remove it altogether.
2872 // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2873 }
2874
2875 #[test]
2876 fn test_setup_msrs() {
2877 use hypervisor::arch::x86::{msr_index, MsrEntry};
2878
2879 let hv = hypervisor::new().unwrap();
2880 let vm = hv.create_vm().expect("new VM fd creation failed");
2881 let vcpu = vm.create_vcpu(0, None).unwrap();
2882 setup_msrs(&vcpu).unwrap();
2883
2884 // This test will check against the last MSR entry configured (the tenth one).
2885 // See create_msr_entries for details.
2886 let mut msrs = vec![MsrEntry {
2887 index: msr_index::MSR_IA32_MISC_ENABLE,
2888 ..Default::default()
2889 }];
2890
2891 // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read
2892 // one in this test case.
2893 let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2894 assert_eq!(read_msrs, 1);
2895
2896 // These are the entries that were set up when we called setup_msrs. We need to assert that
2897 // the tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
2898 // expect.
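// Since assert_eq! compares the whole MsrEntry, this checks both the index and the
// data field of the tenth boot entry against the value read back from the vCPU.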
2899 let entry_vec = vcpu.boot_msr_entries(); 2900 assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]); 2901 } 2902 2903 #[test] 2904 fn test_setup_regs_for_pvh() { 2905 let hv = hypervisor::new().unwrap(); 2906 let vm = hv.create_vm().expect("new VM fd creation failed"); 2907 let vcpu = vm.create_vcpu(0, None).unwrap(); 2908 2909 let mut expected_regs: StandardRegisters = vcpu.create_standard_regs(); 2910 expected_regs.set_rflags(0x0000000000000002u64); 2911 expected_regs.set_rbx(arch::layout::PVH_INFO_START.0); 2912 expected_regs.set_rip(1); 2913 2914 setup_regs( 2915 &vcpu, 2916 arch::EntryPoint { 2917 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()), 2918 setup_header: None, 2919 }, 2920 ) 2921 .unwrap(); 2922 2923 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2924 assert_eq!(actual_regs, expected_regs); 2925 } 2926 2927 #[test] 2928 fn test_setup_regs_for_bzimage() { 2929 let hv = hypervisor::new().unwrap(); 2930 let vm = hv.create_vm().expect("new VM fd creation failed"); 2931 let vcpu = vm.create_vcpu(0, None).unwrap(); 2932 2933 let mut expected_regs: StandardRegisters = vcpu.create_standard_regs(); 2934 expected_regs.set_rflags(0x0000000000000002u64); 2935 expected_regs.set_rip(1); 2936 expected_regs.set_rsp(BOOT_STACK_POINTER.0); 2937 expected_regs.set_rsi(ZERO_PAGE_START.0); 2938 2939 setup_regs( 2940 &vcpu, 2941 arch::EntryPoint { 2942 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()), 2943 setup_header: Some(setup_header { 2944 ..Default::default() 2945 }), 2946 }, 2947 ) 2948 .unwrap(); 2949 2950 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2951 assert_eq!(actual_regs, expected_regs); 2952 } 2953 } 2954 2955 #[cfg(target_arch = "aarch64")] 2956 #[cfg(test)] 2957 mod tests { 2958 #[cfg(feature = "kvm")] 2959 use std::mem; 2960 2961 use arch::layout; 2962 use hypervisor::arch::aarch64::regs::MPIDR_EL1; 2963 #[cfg(feature = "kvm")] 2964 use hypervisor::kvm::aarch64::is_system_register; 2965 #[cfg(feature = "kvm")] 2966 use hypervisor::kvm::kvm_bindings::{ 2967 user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_ARM_CORE, KVM_REG_SIZE_U64, 2968 }; 2969 #[cfg(feature = "kvm")] 2970 use hypervisor::{arm64_core_reg_id, offset_of}; 2971 2972 #[test] 2973 fn test_setup_regs() { 2974 let hv = hypervisor::new().unwrap(); 2975 let vm = hv.create_vm().unwrap(); 2976 let vcpu = vm.create_vcpu(0, None).unwrap(); 2977 2978 // Must fail when vcpu is not initialized yet. 2979 vcpu.setup_regs(0, 0x0, layout::FDT_START.0).unwrap_err(); 2980 2981 let mut kvi = vcpu.create_vcpu_init(); 2982 vm.get_preferred_target(&mut kvi).unwrap(); 2983 vcpu.vcpu_init(&kvi).unwrap(); 2984 2985 vcpu.setup_regs(0, 0x0, layout::FDT_START.0).unwrap(); 2986 } 2987 2988 #[test] 2989 fn test_read_mpidr() { 2990 let hv = hypervisor::new().unwrap(); 2991 let vm = hv.create_vm().unwrap(); 2992 let vcpu = vm.create_vcpu(0, None).unwrap(); 2993 let mut kvi = vcpu.create_vcpu_init(); 2994 vm.get_preferred_target(&mut kvi).unwrap(); 2995 2996 // Must fail when vcpu is not initialized yet. 
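// (After vcpu_init() below the register becomes readable; the expected value of
// 0x8000_0000 corresponds to MPIDR_EL1 with every affinity field at zero and
// bit 31, which is RES1, set.)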
2997 vcpu.get_sys_reg(MPIDR_EL1).unwrap_err(); 2998 2999 vcpu.vcpu_init(&kvi).unwrap(); 3000 assert_eq!(vcpu.get_sys_reg(MPIDR_EL1).unwrap(), 0x80000000); 3001 } 3002 3003 #[cfg(feature = "kvm")] 3004 #[test] 3005 fn test_is_system_register() { 3006 let offset = offset_of!(user_pt_regs, pc); 3007 let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset); 3008 assert!(!is_system_register(regid)); 3009 let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64; 3010 assert!(is_system_register(regid)); 3011 } 3012 3013 #[test] 3014 fn test_save_restore_core_regs() { 3015 let hv = hypervisor::new().unwrap(); 3016 let vm = hv.create_vm().unwrap(); 3017 let vcpu = vm.create_vcpu(0, None).unwrap(); 3018 let mut kvi = vcpu.create_vcpu_init(); 3019 vm.get_preferred_target(&mut kvi).unwrap(); 3020 3021 // Must fail when vcpu is not initialized yet. 3022 assert_eq!( 3023 format!("{}", vcpu.get_regs().unwrap_err()), 3024 "Failed to get aarch64 core register: Exec format error (os error 8)" 3025 ); 3026 3027 let mut state = vcpu.create_standard_regs(); 3028 assert_eq!( 3029 format!("{}", vcpu.set_regs(&state).unwrap_err()), 3030 "Failed to set aarch64 core register: Exec format error (os error 8)" 3031 ); 3032 3033 vcpu.vcpu_init(&kvi).unwrap(); 3034 state = vcpu.get_regs().unwrap(); 3035 assert_eq!(state.get_pstate(), 0x3C5); 3036 3037 vcpu.set_regs(&state).unwrap(); 3038 } 3039 3040 #[test] 3041 fn test_get_set_mpstate() { 3042 let hv = hypervisor::new().unwrap(); 3043 let vm = hv.create_vm().unwrap(); 3044 let vcpu = vm.create_vcpu(0, None).unwrap(); 3045 let mut kvi = vcpu.create_vcpu_init(); 3046 vm.get_preferred_target(&mut kvi).unwrap(); 3047 3048 let state = vcpu.get_mp_state().unwrap(); 3049 vcpu.set_mp_state(state).unwrap(); 3050 } 3051 } 3052