1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use std::collections::BTreeMap; 15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 16 use std::io::Write; 17 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 18 use std::mem::size_of; 19 use std::os::unix::thread::JoinHandleExt; 20 use std::sync::atomic::{AtomicBool, Ordering}; 21 use std::sync::{Arc, Barrier, Mutex}; 22 use std::{cmp, io, result, thread}; 23 24 use acpi_tables::sdt::Sdt; 25 use acpi_tables::{aml, Aml}; 26 use anyhow::anyhow; 27 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 28 use arch::aarch64::regs; 29 #[cfg(target_arch = "x86_64")] 30 use arch::x86_64::get_x2apic_id; 31 use arch::{EntryPoint, NumaNodes}; 32 #[cfg(target_arch = "aarch64")] 33 use devices::gic::Gic; 34 use devices::interrupt_controller::InterruptController; 35 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 36 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 37 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 38 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs}; 39 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 40 use hypervisor::arch::x86::msr_index; 41 #[cfg(target_arch = "x86_64")] 42 use hypervisor::arch::x86::CpuIdEntry; 43 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 44 use hypervisor::arch::x86::MsrEntry; 45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 46 use hypervisor::arch::x86::SpecialRegisters; 47 #[cfg(feature = "tdx")] 48 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus}; 49 #[cfg(target_arch = "x86_64")] 50 use hypervisor::CpuVendor; 51 #[cfg(feature = "kvm")] 52 use hypervisor::HypervisorType; 53 #[cfg(feature = "guest_debug")] 54 use hypervisor::StandardRegisters; 55 use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps}; 56 use libc::{c_void, siginfo_t}; 57 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 58 use linux_loader::elf::Elf64_Nhdr; 59 use seccompiler::{apply_filter, SeccompAction}; 60 use thiserror::Error; 61 use tracer::trace_scoped; 62 use vm_device::BusDevice; 63 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 64 use vm_memory::ByteValued; 65 #[cfg(feature = "guest_debug")] 66 use vm_memory::{Bytes, GuestAddressSpace}; 67 use vm_memory::{GuestAddress, GuestMemoryAtomic}; 68 use vm_migration::{ 69 snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, 70 Transportable, 71 }; 72 use vmm_sys_util::eventfd::EventFd; 73 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN}; 74 use zerocopy::AsBytes; 75 76 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 77 use crate::coredump::{ 78 CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable, 79 GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE, 80 NT_PRSTATUS, 81 }; 82 #[cfg(feature = "guest_debug")] 83 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError}; 84 #[cfg(target_arch = "x86_64")] 85 use crate::memory_manager::MemoryManager; 86 use 
crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::vm_config::CpusConfig;
use crate::{GuestMemoryMmap, CPU_MANAGER_SNAPSHOT_ID};

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
macro_rules! extract_bits_64_without_offset {
    ($value: tt, $length: tt) => {
        $value & (!0u64 >> (64 - $length))
    };
}

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[error("Still pending removed vCPU")]
    VcpuPendingRemovedVcpu,

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error setting vCPU processor features: {0}")]
    VcpuSetProcessorFeatures(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error finalising vCPU: {0}")]
    VcpuArmFinalize(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up
AMX: {0}")] 191 AmxEnable(#[source] anyhow::Error), 192 193 #[error("Maximum number of vCPUs exceeds host limit")] 194 MaximumVcpusExceeded, 195 196 #[cfg(feature = "sev_snp")] 197 #[error("Failed to set sev control register: {0}")] 198 SetSevControlRegister(#[source] hypervisor::HypervisorCpuError), 199 200 #[cfg(target_arch = "x86_64")] 201 #[error("Failed to inject NMI")] 202 NmiError(hypervisor::HypervisorCpuError), 203 } 204 pub type Result<T> = result::Result<T, Error>; 205 206 #[cfg(target_arch = "x86_64")] 207 #[allow(dead_code)] 208 #[repr(C, packed)] 209 #[derive(AsBytes)] 210 struct LocalX2Apic { 211 pub r#type: u8, 212 pub length: u8, 213 pub _reserved: u16, 214 pub apic_id: u32, 215 pub flags: u32, 216 pub processor_id: u32, 217 } 218 219 #[allow(dead_code)] 220 #[repr(C, packed)] 221 #[derive(Default, AsBytes)] 222 struct Ioapic { 223 pub r#type: u8, 224 pub length: u8, 225 pub ioapic_id: u8, 226 _reserved: u8, 227 pub apic_address: u32, 228 pub gsi_base: u32, 229 } 230 231 #[cfg(target_arch = "aarch64")] 232 #[allow(dead_code)] 233 #[repr(C, packed)] 234 #[derive(AsBytes)] 235 struct GicC { 236 pub r#type: u8, 237 pub length: u8, 238 pub reserved0: u16, 239 pub cpu_interface_number: u32, 240 pub uid: u32, 241 pub flags: u32, 242 pub parking_version: u32, 243 pub performance_interrupt: u32, 244 pub parked_address: u64, 245 pub base_address: u64, 246 pub gicv_base_address: u64, 247 pub gich_base_address: u64, 248 pub vgic_interrupt: u32, 249 pub gicr_base_address: u64, 250 pub mpidr: u64, 251 pub proc_power_effi_class: u8, 252 pub reserved1: u8, 253 pub spe_overflow_interrupt: u16, 254 } 255 256 #[cfg(target_arch = "aarch64")] 257 #[allow(dead_code)] 258 #[repr(C, packed)] 259 #[derive(AsBytes)] 260 struct GicD { 261 pub r#type: u8, 262 pub length: u8, 263 pub reserved0: u16, 264 pub gic_id: u32, 265 pub base_address: u64, 266 pub global_irq_base: u32, 267 pub version: u8, 268 pub reserved1: [u8; 3], 269 } 270 271 #[cfg(target_arch = "aarch64")] 272 #[allow(dead_code)] 273 #[repr(C, packed)] 274 #[derive(AsBytes)] 275 struct GicR { 276 pub r#type: u8, 277 pub length: u8, 278 pub reserved: u16, 279 pub base_address: u64, 280 pub range_length: u32, 281 } 282 283 #[cfg(target_arch = "aarch64")] 284 #[allow(dead_code)] 285 #[repr(C, packed)] 286 #[derive(AsBytes)] 287 struct GicIts { 288 pub r#type: u8, 289 pub length: u8, 290 pub reserved0: u16, 291 pub translation_id: u32, 292 pub base_address: u64, 293 pub reserved1: u32, 294 } 295 296 #[cfg(target_arch = "aarch64")] 297 #[allow(dead_code)] 298 #[repr(C, packed)] 299 #[derive(AsBytes)] 300 struct ProcessorHierarchyNode { 301 pub r#type: u8, 302 pub length: u8, 303 pub reserved: u16, 304 pub flags: u32, 305 pub parent: u32, 306 pub acpi_processor_id: u32, 307 pub num_private_resources: u32, 308 } 309 310 #[allow(dead_code)] 311 #[repr(C, packed)] 312 #[derive(Default, AsBytes)] 313 struct InterruptSourceOverride { 314 pub r#type: u8, 315 pub length: u8, 316 pub bus: u8, 317 pub source: u8, 318 pub gsi: u32, 319 pub flags: u16, 320 } 321 322 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 323 macro_rules! round_up { 324 ($n:expr,$d:expr) => { 325 (($n / ($d + 1)) + 1) * $d 326 }; 327 } 328 329 /// A wrapper around creating and using a kvm-based VCPU. 330 pub struct Vcpu { 331 // The hypervisor abstracted CPU. 
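    // Handle returned by hypervisor::Vm::create_vcpu(); it is shared with the
    // vCPU thread, while `saved_state` below is only filled on snapshot/restore.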
332 vcpu: Arc<dyn hypervisor::Vcpu>, 333 id: u8, 334 #[cfg(target_arch = "aarch64")] 335 mpidr: u64, 336 saved_state: Option<CpuState>, 337 #[cfg(target_arch = "x86_64")] 338 vendor: CpuVendor, 339 } 340 341 impl Vcpu { 342 /// Constructs a new VCPU for `vm`. 343 /// 344 /// # Arguments 345 /// 346 /// * `id` - Represents the CPU number between [0, max vcpus). 347 /// * `vm` - The virtual machine this vcpu will get attached to. 348 /// * `vm_ops` - Optional object for exit handling. 349 /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0) 350 pub fn new( 351 id: u8, 352 apic_id: u8, 353 vm: &Arc<dyn hypervisor::Vm>, 354 vm_ops: Option<Arc<dyn VmOps>>, 355 #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor, 356 ) -> Result<Self> { 357 let vcpu = vm 358 .create_vcpu(apic_id, vm_ops) 359 .map_err(|e| Error::VcpuCreate(e.into()))?; 360 // Initially the cpuid per vCPU is the one supported by this VM. 361 Ok(Vcpu { 362 vcpu, 363 id, 364 #[cfg(target_arch = "aarch64")] 365 mpidr: 0, 366 saved_state: None, 367 #[cfg(target_arch = "x86_64")] 368 vendor: cpu_vendor, 369 }) 370 } 371 372 /// Configures a vcpu and should be called once per vcpu when created. 373 /// 374 /// # Arguments 375 /// 376 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 377 /// * `guest_memory` - Guest memory. 378 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 379 pub fn configure( 380 &mut self, 381 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 382 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 383 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 384 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 385 #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>, 386 ) -> Result<()> { 387 #[cfg(target_arch = "aarch64")] 388 { 389 self.init(vm)?; 390 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup) 391 .map_err(Error::VcpuConfiguration)?; 392 } 393 info!("Configuring vCPU: cpu_id = {}", self.id); 394 #[cfg(target_arch = "x86_64")] 395 arch::configure_vcpu( 396 &self.vcpu, 397 self.id, 398 boot_setup, 399 cpuid, 400 kvm_hyperv, 401 self.vendor, 402 topology, 403 ) 404 .map_err(Error::VcpuConfiguration)?; 405 406 Ok(()) 407 } 408 409 /// Gets the MPIDR register value. 410 #[cfg(target_arch = "aarch64")] 411 pub fn get_mpidr(&self) -> u64 { 412 self.mpidr 413 } 414 415 /// Gets the saved vCPU state. 416 #[cfg(target_arch = "aarch64")] 417 pub fn get_saved_state(&self) -> Option<CpuState> { 418 self.saved_state.clone() 419 } 420 421 /// Initializes an aarch64 specific vcpu for booting Linux. 422 #[cfg(target_arch = "aarch64")] 423 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 424 use std::arch::is_aarch64_feature_detected; 425 #[allow(clippy::nonminimal_bool)] 426 let sve_supported = 427 is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2"); 428 let mut kvi = self.vcpu.create_vcpu_init(); 429 430 // This reads back the kernel's preferred target type. 
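        // On KVM this wraps the KVM_ARM_PREFERRED_TARGET ioctl; the target and
        // feature bits in `kvi` are then adjusted before the vCPU init below.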
431 vm.get_preferred_target(&mut kvi) 432 .map_err(Error::VcpuArmPreferredTarget)?; 433 434 self.vcpu 435 .vcpu_set_processor_features(vm, &mut kvi, self.id) 436 .map_err(Error::VcpuSetProcessorFeatures)?; 437 438 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)?; 439 440 if sve_supported { 441 let finalized_features = self.vcpu.vcpu_get_finalized_features(); 442 self.vcpu 443 .vcpu_finalize(finalized_features) 444 .map_err(Error::VcpuArmFinalize)?; 445 } 446 Ok(()) 447 } 448 449 /// Runs the VCPU until it exits, returning the reason. 450 /// 451 /// Note that the state of the VCPU and associated VM must be setup first for this to do 452 /// anything useful. 453 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 454 self.vcpu.run() 455 } 456 457 #[cfg(feature = "sev_snp")] 458 pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> { 459 self.vcpu 460 .set_sev_control_register(vmsa_pfn) 461 .map_err(Error::SetSevControlRegister) 462 } 463 } 464 465 impl Pausable for Vcpu {} 466 impl Snapshottable for Vcpu { 467 fn id(&self) -> String { 468 self.id.to_string() 469 } 470 471 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 472 let saved_state = self 473 .vcpu 474 .state() 475 .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?; 476 477 self.saved_state = Some(saved_state.clone()); 478 479 Ok(Snapshot::from_data(SnapshotData::new_from_state( 480 &saved_state, 481 )?)) 482 } 483 } 484 485 pub struct CpuManager { 486 config: CpusConfig, 487 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 488 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 489 #[cfg(target_arch = "x86_64")] 490 cpuid: Vec<CpuIdEntry>, 491 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 492 vm: Arc<dyn hypervisor::Vm>, 493 vcpus_kill_signalled: Arc<AtomicBool>, 494 vcpus_pause_signalled: Arc<AtomicBool>, 495 vcpus_kick_signalled: Arc<AtomicBool>, 496 exit_evt: EventFd, 497 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 498 reset_evt: EventFd, 499 #[cfg(feature = "guest_debug")] 500 vm_debug_evt: EventFd, 501 vcpu_states: Vec<VcpuState>, 502 selected_cpu: u8, 503 vcpus: Vec<Arc<Mutex<Vcpu>>>, 504 seccomp_action: SeccompAction, 505 vm_ops: Arc<dyn VmOps>, 506 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 507 acpi_address: Option<GuestAddress>, 508 proximity_domain_per_cpu: BTreeMap<u8, u32>, 509 affinity: BTreeMap<u8, Vec<usize>>, 510 dynamic: bool, 511 hypervisor: Arc<dyn hypervisor::Hypervisor>, 512 #[cfg(feature = "sev_snp")] 513 sev_snp_enabled: bool, 514 } 515 516 const CPU_ENABLE_FLAG: usize = 0; 517 const CPU_INSERTING_FLAG: usize = 1; 518 const CPU_REMOVING_FLAG: usize = 2; 519 const CPU_EJECT_FLAG: usize = 3; 520 521 const CPU_STATUS_OFFSET: u64 = 4; 522 const CPU_SELECTION_OFFSET: u64 = 0; 523 524 impl BusDevice for CpuManager { 525 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 526 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
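        // Register layout of this hotplug device (constants defined above):
        //   offset 0 (CPU_SELECTION_OFFSET): selected vCPU id
        //   offset 4 (CPU_STATUS_OFFSET): bit 0 enabled, bit 1 inserting,
        //                                 bit 2 removing, bit 3 eject (write only)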
527 data.fill(0); 528 529 match offset { 530 CPU_SELECTION_OFFSET => { 531 data[0] = self.selected_cpu; 532 } 533 CPU_STATUS_OFFSET => { 534 if self.selected_cpu < self.max_vcpus() { 535 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 536 if state.active() { 537 data[0] |= 1 << CPU_ENABLE_FLAG; 538 } 539 if state.inserting { 540 data[0] |= 1 << CPU_INSERTING_FLAG; 541 } 542 if state.removing { 543 data[0] |= 1 << CPU_REMOVING_FLAG; 544 } 545 } else { 546 warn!("Out of range vCPU id: {}", self.selected_cpu); 547 } 548 } 549 _ => { 550 warn!( 551 "Unexpected offset for accessing CPU manager device: {:#}", 552 offset 553 ); 554 } 555 } 556 } 557 558 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 559 match offset { 560 CPU_SELECTION_OFFSET => { 561 self.selected_cpu = data[0]; 562 } 563 CPU_STATUS_OFFSET => { 564 if self.selected_cpu < self.max_vcpus() { 565 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 566 // The ACPI code writes back a 1 to acknowledge the insertion 567 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 568 && state.inserting 569 { 570 state.inserting = false; 571 } 572 // Ditto for removal 573 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 574 && state.removing 575 { 576 state.removing = false; 577 } 578 // Trigger removal of vCPU 579 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 580 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 581 error!("Error removing vCPU: {:?}", e); 582 } 583 } 584 } else { 585 warn!("Out of range vCPU id: {}", self.selected_cpu); 586 } 587 } 588 _ => { 589 warn!( 590 "Unexpected offset for accessing CPU manager device: {:#}", 591 offset 592 ); 593 } 594 } 595 None 596 } 597 } 598 599 #[derive(Default)] 600 struct VcpuState { 601 inserting: bool, 602 removing: bool, 603 pending_removal: Arc<AtomicBool>, 604 handle: Option<thread::JoinHandle<()>>, 605 kill: Arc<AtomicBool>, 606 vcpu_run_interrupted: Arc<AtomicBool>, 607 paused: Arc<AtomicBool>, 608 } 609 610 impl VcpuState { 611 fn active(&self) -> bool { 612 self.handle.is_some() 613 } 614 615 fn signal_thread(&self) { 616 if let Some(handle) = self.handle.as_ref() { 617 loop { 618 // SAFETY: FFI call with correct arguments 619 unsafe { 620 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 621 } 622 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 623 break; 624 } else { 625 // This is more effective than thread::yield_now() at 626 // avoiding a priority inversion with the vCPU thread 627 thread::sleep(std::time::Duration::from_millis(1)); 628 } 629 } 630 } 631 } 632 633 fn join_thread(&mut self) -> Result<()> { 634 if let Some(handle) = self.handle.take() { 635 handle.join().map_err(Error::ThreadCleanup)? 
636 } 637 638 Ok(()) 639 } 640 641 fn unpark_thread(&self) { 642 if let Some(handle) = self.handle.as_ref() { 643 handle.thread().unpark() 644 } 645 } 646 } 647 648 impl CpuManager { 649 #[allow(unused_variables)] 650 #[allow(clippy::too_many_arguments)] 651 pub fn new( 652 config: &CpusConfig, 653 vm: Arc<dyn hypervisor::Vm>, 654 exit_evt: EventFd, 655 reset_evt: EventFd, 656 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 657 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 658 seccomp_action: SeccompAction, 659 vm_ops: Arc<dyn VmOps>, 660 #[cfg(feature = "tdx")] tdx_enabled: bool, 661 numa_nodes: &NumaNodes, 662 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 663 ) -> Result<Arc<Mutex<CpuManager>>> { 664 if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() { 665 return Err(Error::MaximumVcpusExceeded); 666 } 667 668 let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); 669 vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); 670 let hypervisor_type = hypervisor.hypervisor_type(); 671 #[cfg(target_arch = "x86_64")] 672 let cpu_vendor = hypervisor.get_cpu_vendor(); 673 674 #[cfg(target_arch = "x86_64")] 675 if config.features.amx { 676 const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; 677 const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; 678 const XFEATURE_XTILEDATA: usize = 18; 679 const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; 680 681 // SAFETY: the syscall is only modifying kernel internal 682 // data structures that the kernel is itself expected to safeguard. 683 let amx_tile = unsafe { 684 libc::syscall( 685 libc::SYS_arch_prctl, 686 ARCH_REQ_XCOMP_GUEST_PERM, 687 XFEATURE_XTILEDATA, 688 ) 689 }; 690 691 if amx_tile != 0 { 692 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 693 } else { 694 let mask: usize = 0; 695 // SAFETY: the mask being modified (not marked mutable as it is 696 // modified in unsafe only which is permitted) isn't in use elsewhere. 
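                // Read the guest xcomp permission bitmap back and verify the
                // XTILEDATA bit really was granted before reporting AMX support.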
697 let result = unsafe { 698 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 699 }; 700 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 701 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 702 } 703 } 704 } 705 706 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 707 let mut cpu_list = Vec::new(); 708 for (proximity_domain, numa_node) in numa_nodes.iter() { 709 for cpu in numa_node.cpus.iter() { 710 cpu_list.push((*cpu, *proximity_domain)) 711 } 712 } 713 cpu_list 714 } 715 .into_iter() 716 .collect(); 717 718 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 719 cpu_affinity 720 .iter() 721 .map(|a| (a.vcpu, a.host_cpus.clone())) 722 .collect() 723 } else { 724 BTreeMap::new() 725 }; 726 727 #[cfg(feature = "tdx")] 728 let dynamic = !tdx_enabled; 729 #[cfg(not(feature = "tdx"))] 730 let dynamic = true; 731 732 Ok(Arc::new(Mutex::new(CpuManager { 733 config: config.clone(), 734 interrupt_controller: None, 735 #[cfg(target_arch = "x86_64")] 736 cpuid: Vec::new(), 737 vm, 738 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 739 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 740 vcpus_kick_signalled: Arc::new(AtomicBool::new(false)), 741 vcpu_states, 742 exit_evt, 743 reset_evt, 744 #[cfg(feature = "guest_debug")] 745 vm_debug_evt, 746 selected_cpu: 0, 747 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 748 seccomp_action, 749 vm_ops, 750 acpi_address: None, 751 proximity_domain_per_cpu, 752 affinity, 753 dynamic, 754 hypervisor: hypervisor.clone(), 755 #[cfg(feature = "sev_snp")] 756 sev_snp_enabled, 757 }))) 758 } 759 760 #[cfg(target_arch = "x86_64")] 761 pub fn populate_cpuid( 762 &mut self, 763 memory_manager: &Arc<Mutex<MemoryManager>>, 764 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 765 #[cfg(feature = "tdx")] tdx: bool, 766 ) -> Result<()> { 767 let sgx_epc_sections = memory_manager 768 .lock() 769 .unwrap() 770 .sgx_epc_region() 771 .as_ref() 772 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 773 774 self.cpuid = { 775 let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits); 776 arch::generate_common_cpuid( 777 hypervisor, 778 &arch::CpuidConfig { 779 sgx_epc_sections, 780 phys_bits, 781 kvm_hyperv: self.config.kvm_hyperv, 782 #[cfg(feature = "tdx")] 783 tdx, 784 amx: self.config.features.amx, 785 }, 786 ) 787 .map_err(Error::CommonCpuId)? 788 }; 789 790 Ok(()) 791 } 792 793 fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> { 794 info!("Creating vCPU: cpu_id = {}", cpu_id); 795 796 #[cfg(target_arch = "x86_64")] 797 let topology = self.get_vcpu_topology(); 798 #[cfg(target_arch = "x86_64")] 799 let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology); 800 #[cfg(target_arch = "aarch64")] 801 let x2apic_id = cpu_id as u32; 802 803 let mut vcpu = Vcpu::new( 804 cpu_id, 805 x2apic_id as u8, 806 &self.vm, 807 Some(self.vm_ops.clone()), 808 #[cfg(target_arch = "x86_64")] 809 self.hypervisor.get_cpu_vendor(), 810 )?; 811 812 if let Some(snapshot) = snapshot { 813 // AArch64 vCPUs should be initialized after created. 
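            // On restore the vCPU has to go through vcpu_init() before
            // set_state() below can push the saved registers back in.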
814 #[cfg(target_arch = "aarch64")] 815 vcpu.init(&self.vm)?; 816 817 let state: CpuState = snapshot.to_state().map_err(|e| { 818 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e)) 819 })?; 820 vcpu.vcpu 821 .set_state(&state) 822 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?; 823 824 vcpu.saved_state = Some(state); 825 } 826 827 let vcpu = Arc::new(Mutex::new(vcpu)); 828 829 // Adding vCPU to the CpuManager's vCPU list. 830 self.vcpus.push(vcpu.clone()); 831 832 Ok(vcpu) 833 } 834 835 pub fn configure_vcpu( 836 &self, 837 vcpu: Arc<Mutex<Vcpu>>, 838 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 839 ) -> Result<()> { 840 let mut vcpu = vcpu.lock().unwrap(); 841 842 #[cfg(feature = "sev_snp")] 843 if self.sev_snp_enabled { 844 if let Some((kernel_entry_point, _)) = boot_setup { 845 vcpu.set_sev_control_register( 846 kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE, 847 )?; 848 } 849 850 // Traditional way to configure vcpu doesn't work for SEV-SNP guests. 851 // All the vCPU configuration for SEV-SNP guest is provided via VMSA. 852 return Ok(()); 853 } 854 855 #[cfg(target_arch = "x86_64")] 856 assert!(!self.cpuid.is_empty()); 857 858 #[cfg(target_arch = "x86_64")] 859 let topology = self.config.topology.clone().map_or_else( 860 || Some((1, self.boot_vcpus(), 1)), 861 |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)), 862 ); 863 #[cfg(target_arch = "x86_64")] 864 vcpu.configure( 865 boot_setup, 866 self.cpuid.clone(), 867 self.config.kvm_hyperv, 868 topology, 869 )?; 870 871 #[cfg(target_arch = "aarch64")] 872 vcpu.configure(&self.vm, boot_setup)?; 873 874 Ok(()) 875 } 876 877 /// Only create new vCPUs if there aren't any inactive ones to reuse 878 fn create_vcpus( 879 &mut self, 880 desired_vcpus: u8, 881 snapshot: Option<Snapshot>, 882 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 883 let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![]; 884 info!( 885 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 886 desired_vcpus, 887 self.config.max_vcpus, 888 self.vcpus.len(), 889 self.present_vcpus() 890 ); 891 892 if desired_vcpus > self.config.max_vcpus { 893 return Err(Error::DesiredVCpuCountExceedsMax); 894 } 895 896 // Only create vCPUs in excess of all the allocated vCPUs. 897 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 898 vcpus.push(self.create_vcpu( 899 cpu_id, 900 // TODO: The special format of the CPU id can be removed once 901 // ready to break live upgrade. 902 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()), 903 )?); 904 } 905 906 Ok(vcpus) 907 } 908 909 #[cfg(target_arch = "aarch64")] 910 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 911 for cpu in self.vcpus.iter() { 912 let cpu = cpu.lock().unwrap(); 913 // Check if PMU attr is available, if not, log the information. 
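            // Bail out with Ok(false) as soon as one vCPU lacks PMU support so
            // the caller knows the PMU was not initialised for this guest.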
914 if cpu.vcpu.has_pmu_support() { 915 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 916 } else { 917 debug!( 918 "PMU attribute is not supported in vCPU{}, skip PMU init!", 919 cpu.id 920 ); 921 return Ok(false); 922 } 923 } 924 925 Ok(true) 926 } 927 928 pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> { 929 self.vcpus.clone() 930 } 931 932 fn start_vcpu( 933 &mut self, 934 vcpu: Arc<Mutex<Vcpu>>, 935 vcpu_id: u8, 936 vcpu_thread_barrier: Arc<Barrier>, 937 inserting: bool, 938 ) -> Result<()> { 939 let reset_evt = self.reset_evt.try_clone().unwrap(); 940 let exit_evt = self.exit_evt.try_clone().unwrap(); 941 #[cfg(feature = "kvm")] 942 let hypervisor_type = self.hypervisor.hypervisor_type(); 943 #[cfg(feature = "guest_debug")] 944 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 945 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 946 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 947 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 948 let vcpu_kick_signalled = self.vcpus_kick_signalled.clone(); 949 950 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 951 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 952 .vcpu_run_interrupted 953 .clone(); 954 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 955 let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone(); 956 957 // Prepare the CPU set the current vCPU is expected to run onto. 958 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 959 // SAFETY: all zeros is a valid pattern 960 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 961 // SAFETY: FFI call, trivially safe 962 unsafe { libc::CPU_ZERO(&mut cpuset) }; 963 for host_cpu in host_cpus { 964 // SAFETY: FFI call, trivially safe 965 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) }; 966 } 967 cpuset 968 }); 969 970 // Retrieve seccomp filter for vcpu thread 971 let vcpu_seccomp_filter = get_seccomp_filter( 972 &self.seccomp_action, 973 Thread::Vcpu, 974 self.hypervisor.hypervisor_type(), 975 ) 976 .map_err(Error::CreateSeccompFilter)?; 977 978 #[cfg(target_arch = "x86_64")] 979 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 980 981 info!("Starting vCPU: cpu_id = {}", vcpu_id); 982 983 let handle = Some( 984 thread::Builder::new() 985 .name(format!("vcpu{vcpu_id}")) 986 .spawn(move || { 987 // Schedule the thread to run on the expected CPU set 988 if let Some(cpuset) = cpuset.as_ref() { 989 // SAFETY: FFI call with correct arguments 990 let ret = unsafe { 991 libc::sched_setaffinity( 992 0, 993 std::mem::size_of::<libc::cpu_set_t>(), 994 cpuset as *const libc::cpu_set_t, 995 ) 996 }; 997 998 if ret != 0 { 999 error!( 1000 "Failed scheduling the vCPU {} on the expected CPU set: {}", 1001 vcpu_id, 1002 io::Error::last_os_error() 1003 ); 1004 return; 1005 } 1006 } 1007 1008 // Apply seccomp filter for vcpu thread. 1009 if !vcpu_seccomp_filter.is_empty() { 1010 if let Err(e) = 1011 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 1012 { 1013 error!("Error applying seccomp filter: {:?}", e); 1014 return; 1015 } 1016 } 1017 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 1018 // This uses an async signal safe handler to kill the vcpu handles. 1019 register_signal_handler(SIGRTMIN(), handle_signal) 1020 .expect("Failed to register vcpu signal handler"); 1021 // Block until all CPUs are ready. 
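                    // The barrier is sized in activate_vcpus() as the number of
                    // vCPU threads being started plus one; the extra slot is the
                    // caller, whose final wait() releases all vCPU threads at once.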
                    vcpu_thread_barrier.wait();

                    std::panic::catch_unwind(move || {
                        loop {
                            // If we are being told to pause, we park the thread
                            // until the pause boolean is toggled.
                            // The resume operation is responsible for toggling
                            // the boolean and unpark the thread.
                            // We enter a loop because park() could spuriously
                            // return. We will then park() again unless the
                            // pause boolean has been toggled.

                            // Need to use Ordering::SeqCst as we have multiple
                            // loads and stores to different atomics and we need
                            // to see them in a consistent order in all threads

                            if vcpu_pause_signalled.load(Ordering::SeqCst) {
                                // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
                                // completed by returning to KVM_RUN. From the kernel docs:
                                //
                                // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
                                // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
                                // operations are complete (and guest state is consistent) only after userspace
                                // has re-entered the kernel with KVM_RUN. The kernel side will first finish
                                // incomplete operations and then check for pending signals.
                                // The pending state of the operation is not preserved in state which is
                                // visible to userspace, thus userspace should ensure that the operation is
                                // completed before performing a live migration. Userspace can re-enter the
                                // guest with an unmasked signal pending or with the immediate_exit field set
                                // to complete pending operations without allowing any further instructions
                                // to be executed.

                                #[cfg(feature = "kvm")]
                                if matches!(hypervisor_type, HypervisorType::Kvm) {
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
                                    if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
                                        error!("Unexpected VM exit on \"immediate_exit\" run");
                                        break;
                                    }
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
                                }

                                vcpu_run_interrupted.store(true, Ordering::SeqCst);

                                vcpu_paused.store(true, Ordering::SeqCst);
                                while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                    thread::park();
                                }
                                vcpu_run_interrupted.store(false, Ordering::SeqCst);
                            }

                            if vcpu_kick_signalled.load(Ordering::SeqCst) {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                #[cfg(target_arch = "x86_64")]
                                match vcpu.lock().as_ref().unwrap().vcpu.nmi() {
                                    Ok(()) => {}
                                    Err(e) => {
                                        error!("Error injecting NMI: {}", e);
                                        break;
                                    }
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }

                            #[cfg(feature = "tdx")]
                            let mut vcpu = vcpu.lock().unwrap();
                            #[cfg(not(feature = "tdx"))]
                            let vcpu = vcpu.lock().unwrap();
                            // vcpu.run() reports a triple fault as a reset exit, so trigger a reset
                            match vcpu.run() {
                                Ok(run) => match run {
                                    #[cfg(feature = "kvm")]
                                    VmExit::Debug => {
                                        info!("VmExit::Debug");
                                        #[cfg(feature = "guest_debug")]
                                        {
                                            vcpu_pause_signalled.store(true, Ordering::SeqCst);
                                            let raw_tid = get_raw_tid(vcpu_id as usize);
                                            vm_debug_evt.write(raw_tid as u64).unwrap();
                                        }
                                    }
                                    #[cfg(target_arch = "x86_64")]
                                    VmExit::IoapicEoi(vector) => {
                                        if let Some(interrupt_controller) =
                                            &interrupt_controller_clone
1114 { 1115 interrupt_controller 1116 .lock() 1117 .unwrap() 1118 .end_of_interrupt(vector); 1119 } 1120 } 1121 VmExit::Ignore => {} 1122 VmExit::Hyperv => {} 1123 VmExit::Reset => { 1124 info!("VmExit::Reset"); 1125 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1126 reset_evt.write(1).unwrap(); 1127 break; 1128 } 1129 VmExit::Shutdown => { 1130 info!("VmExit::Shutdown"); 1131 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1132 exit_evt.write(1).unwrap(); 1133 break; 1134 } 1135 #[cfg(feature = "tdx")] 1136 VmExit::Tdx => { 1137 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1138 match vcpu.get_tdx_exit_details() { 1139 Ok(details) => match details { 1140 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1141 TdxExitDetails::SetupEventNotifyInterrupt => { 1142 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1143 } 1144 }, 1145 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1146 } 1147 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1148 } else { 1149 // We should never reach this code as 1150 // this means the design from the code 1151 // is wrong. 1152 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1153 } 1154 } 1155 }, 1156 1157 Err(e) => { 1158 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1159 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1160 exit_evt.write(1).unwrap(); 1161 break; 1162 } 1163 } 1164 1165 // We've been told to terminate 1166 if vcpu_kill_signalled.load(Ordering::SeqCst) 1167 || vcpu_kill.load(Ordering::SeqCst) 1168 { 1169 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1170 break; 1171 } 1172 } 1173 }) 1174 .or_else(|_| { 1175 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1176 error!("vCPU thread panicked"); 1177 panic_exit_evt.write(1) 1178 }) 1179 .ok(); 1180 }) 1181 .map_err(Error::VcpuSpawn)?, 1182 ); 1183 1184 // On hot plug calls into this function entry_point is None. It is for 1185 // those hotplug CPU additions that we need to set the inserting flag. 1186 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1187 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1188 1189 Ok(()) 1190 } 1191 1192 /// Start up as many vCPUs threads as needed to reach `desired_vcpus` 1193 fn activate_vcpus( 1194 &mut self, 1195 desired_vcpus: u8, 1196 inserting: bool, 1197 paused: Option<bool>, 1198 ) -> Result<()> { 1199 if desired_vcpus > self.config.max_vcpus { 1200 return Err(Error::DesiredVCpuCountExceedsMax); 1201 } 1202 1203 let vcpu_thread_barrier = Arc::new(Barrier::new( 1204 (desired_vcpus - self.present_vcpus() + 1) as usize, 1205 )); 1206 1207 if let Some(paused) = paused { 1208 self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); 1209 } 1210 1211 info!( 1212 "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", 1213 desired_vcpus, 1214 self.vcpus.len(), 1215 self.present_vcpus(), 1216 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1217 ); 1218 1219 // This reuses any inactive vCPUs as well as any that were newly created 1220 for vcpu_id in self.present_vcpus()..desired_vcpus { 1221 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1222 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1223 } 1224 1225 // Unblock all CPU threads. 
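        // Caller side of the barrier each newly spawned vCPU thread waits on
        // after installing its seccomp filter and signal handler.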
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal, actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
            self.vcpu_states[usize::from(cpu_id)]
                .pending_removal
                .store(true, Ordering::SeqCst);
        }
    }

    pub fn check_pending_removed_vcpu(&mut self) -> bool {
        for state in self.vcpu_states.iter() {
            if state.active() && state.pending_removal.load(Ordering::SeqCst) {
                return true;
            }
        }
        false
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" so that it can be reused
        state.kill.store(false, Ordering::SeqCst);
        state.pending_removal.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(
        &mut self,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        trace_scoped!("create_boot_vcpus");

        self.create_vcpus(self.boot_vcpus(), snapshot)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
    pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
        self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
    }

    pub fn start_restored_vcpus(&mut self) -> Result<()> {
        self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
            .map_err(|e| {
                Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
            })?;

        Ok(())
    }

    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal {
            return Ok(false);
        }

        if !self.dynamic {
            return Ok(false);
        }

        if self.check_pending_removed_vcpu() {
            return Err(Error::VcpuPendingRemovedVcpu);
        }

        match desired_vcpus.cmp(&self.present_vcpus()) {
            cmp::Ordering::Greater => {
                let vcpus = self.create_vcpus(desired_vcpus, None)?;
                for vcpu in vcpus {
                    self.configure_vcpu(vcpu, None)?
                }
                self.activate_vcpus(desired_vcpus, true, None)?;
                Ok(true)
            }
            cmp::Ordering::Less => {
                self.mark_vcpus_for_removal(desired_vcpus);
                Ok(true)
            }
            _ => Ok(false),
        }
    }

    pub fn shutdown(&mut self) -> Result<()> {
        // Tell the vCPUs to stop themselves next time they go through the loop
        self.vcpus_kill_signalled.store(true, Ordering::SeqCst);

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        // Wait for all the threads to finish. This removes the state from the vector.
        for mut state in self.vcpu_states.drain(..)
{ 1338 state.join_thread()?; 1339 } 1340 1341 Ok(()) 1342 } 1343 1344 #[cfg(feature = "tdx")] 1345 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1346 for vcpu in &self.vcpus { 1347 vcpu.lock() 1348 .unwrap() 1349 .vcpu 1350 .tdx_init(hob_address) 1351 .map_err(Error::InitializeTdx)?; 1352 } 1353 Ok(()) 1354 } 1355 1356 pub fn boot_vcpus(&self) -> u8 { 1357 self.config.boot_vcpus 1358 } 1359 1360 pub fn max_vcpus(&self) -> u8 { 1361 self.config.max_vcpus 1362 } 1363 1364 #[cfg(target_arch = "x86_64")] 1365 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1366 assert!(!self.cpuid.is_empty()); 1367 self.cpuid.clone() 1368 } 1369 1370 fn present_vcpus(&self) -> u8 { 1371 self.vcpu_states 1372 .iter() 1373 .fold(0, |acc, state| acc + state.active() as u8) 1374 } 1375 1376 #[cfg(target_arch = "aarch64")] 1377 pub fn get_mpidrs(&self) -> Vec<u64> { 1378 self.vcpus 1379 .iter() 1380 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1381 .collect() 1382 } 1383 1384 #[cfg(target_arch = "aarch64")] 1385 pub fn get_saved_states(&self) -> Vec<CpuState> { 1386 self.vcpus 1387 .iter() 1388 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1389 .collect() 1390 } 1391 1392 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1393 self.config 1394 .topology 1395 .clone() 1396 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1397 } 1398 1399 pub fn create_madt(&self) -> Sdt { 1400 use crate::acpi; 1401 // This is also checked in the commandline parsing. 1402 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1403 1404 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1405 #[cfg(target_arch = "x86_64")] 1406 { 1407 madt.write(36, arch::layout::APIC_START.0); 1408 1409 for cpu in 0..self.config.max_vcpus { 1410 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology()); 1411 1412 let lapic = LocalX2Apic { 1413 r#type: acpi::ACPI_X2APIC_PROCESSOR, 1414 length: 16, 1415 processor_id: cpu.into(), 1416 apic_id: x2apic_id, 1417 flags: if cpu < self.config.boot_vcpus { 1418 1 << MADT_CPU_ENABLE_FLAG 1419 } else { 1420 0 1421 } | (1 << MADT_CPU_ONLINE_CAPABLE_FLAG), 1422 _reserved: 0, 1423 }; 1424 madt.append(lapic); 1425 } 1426 1427 madt.append(Ioapic { 1428 r#type: acpi::ACPI_APIC_IO, 1429 length: 12, 1430 ioapic_id: 0, 1431 apic_address: arch::layout::IOAPIC_START.0 as u32, 1432 gsi_base: 0, 1433 ..Default::default() 1434 }); 1435 1436 madt.append(InterruptSourceOverride { 1437 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1438 length: 10, 1439 bus: 0, 1440 source: 4, 1441 gsi: 4, 1442 flags: 0, 1443 }); 1444 } 1445 1446 #[cfg(target_arch = "aarch64")] 1447 { 1448 /* Notes: 1449 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1450 */ 1451 1452 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 
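            // One GICC entry per boot vCPU; only the MPIDR affinity bits and the
            // enabled flag are meaningful here, while the GIC base addresses are
            // carried by the GICD/GICR/ITS structures appended below.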
1453 for cpu in 0..self.config.boot_vcpus { 1454 let vcpu = &self.vcpus[cpu as usize]; 1455 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1456 /* ARMv8 MPIDR format: 1457 Bits [63:40] Must be zero 1458 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1459 Bits [31:24] Must be zero 1460 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1461 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1462 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1463 */ 1464 let mpidr_mask = 0xff_00ff_ffff; 1465 let gicc = GicC { 1466 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1467 length: 80, 1468 reserved0: 0, 1469 cpu_interface_number: cpu as u32, 1470 uid: cpu as u32, 1471 flags: 1, 1472 parking_version: 0, 1473 performance_interrupt: 0, 1474 parked_address: 0, 1475 base_address: 0, 1476 gicv_base_address: 0, 1477 gich_base_address: 0, 1478 vgic_interrupt: 0, 1479 gicr_base_address: 0, 1480 mpidr: mpidr & mpidr_mask, 1481 proc_power_effi_class: 0, 1482 reserved1: 0, 1483 spe_overflow_interrupt: 0, 1484 }; 1485 1486 madt.append(gicc); 1487 } 1488 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1489 1490 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1491 let gicd = GicD { 1492 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1493 length: 24, 1494 reserved0: 0, 1495 gic_id: 0, 1496 base_address: vgic_config.dist_addr, 1497 global_irq_base: 0, 1498 version: 3, 1499 reserved1: [0; 3], 1500 }; 1501 madt.append(gicd); 1502 1503 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1504 let gicr = GicR { 1505 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1506 length: 16, 1507 reserved: 0, 1508 base_address: vgic_config.redists_addr, 1509 range_length: vgic_config.redists_size as u32, 1510 }; 1511 madt.append(gicr); 1512 1513 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 1514 let gicits = GicIts { 1515 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1516 length: 20, 1517 reserved0: 0, 1518 translation_id: 0, 1519 base_address: vgic_config.msi_addr, 1520 reserved1: 0, 1521 }; 1522 madt.append(gicits); 1523 1524 madt.update_checksum(); 1525 } 1526 1527 madt 1528 } 1529 1530 #[cfg(target_arch = "aarch64")] 1531 pub fn create_pptt(&self) -> Sdt { 1532 let pptt_start = 0; 1533 let mut cpus = 0; 1534 let mut uid = 0; 1535 // If topology is not specified, the default setting is: 1536 // 1 package, multiple cores, 1 thread per core 1537 // This is also the behavior when PPTT is missing. 
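        // For instance, a topology of 2 threads per core, 4 cores and 1 package
        // yields (2, 4, 1) here; the default below is one package of
        // single-threaded cores.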
1538 let (threads_per_core, cores_per_package, packages) = 1539 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1540 1541 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1542 1543 for cluster_idx in 0..packages { 1544 if cpus < self.config.boot_vcpus as usize { 1545 let cluster_offset = pptt.len() - pptt_start; 1546 let cluster_hierarchy_node = ProcessorHierarchyNode { 1547 r#type: 0, 1548 length: 20, 1549 reserved: 0, 1550 flags: 0x2, 1551 parent: 0, 1552 acpi_processor_id: cluster_idx as u32, 1553 num_private_resources: 0, 1554 }; 1555 pptt.append(cluster_hierarchy_node); 1556 1557 for core_idx in 0..cores_per_package { 1558 let core_offset = pptt.len() - pptt_start; 1559 1560 if threads_per_core > 1 { 1561 let core_hierarchy_node = ProcessorHierarchyNode { 1562 r#type: 0, 1563 length: 20, 1564 reserved: 0, 1565 flags: 0x2, 1566 parent: cluster_offset as u32, 1567 acpi_processor_id: core_idx as u32, 1568 num_private_resources: 0, 1569 }; 1570 pptt.append(core_hierarchy_node); 1571 1572 for _thread_idx in 0..threads_per_core { 1573 let thread_hierarchy_node = ProcessorHierarchyNode { 1574 r#type: 0, 1575 length: 20, 1576 reserved: 0, 1577 flags: 0xE, 1578 parent: core_offset as u32, 1579 acpi_processor_id: uid as u32, 1580 num_private_resources: 0, 1581 }; 1582 pptt.append(thread_hierarchy_node); 1583 uid += 1; 1584 } 1585 } else { 1586 let thread_hierarchy_node = ProcessorHierarchyNode { 1587 r#type: 0, 1588 length: 20, 1589 reserved: 0, 1590 flags: 0xA, 1591 parent: cluster_offset as u32, 1592 acpi_processor_id: uid as u32, 1593 num_private_resources: 0, 1594 }; 1595 pptt.append(thread_hierarchy_node); 1596 uid += 1; 1597 } 1598 } 1599 cpus += (cores_per_package * threads_per_core) as usize; 1600 } 1601 } 1602 1603 pptt.update_checksum(); 1604 pptt 1605 } 1606 1607 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1608 fn create_standard_regs(&self, cpu_id: u8) -> StandardRegisters { 1609 self.vcpus[usize::from(cpu_id)] 1610 .lock() 1611 .unwrap() 1612 .vcpu 1613 .create_standard_regs() 1614 } 1615 1616 #[cfg(feature = "guest_debug")] 1617 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1618 self.vcpus[usize::from(cpu_id)] 1619 .lock() 1620 .unwrap() 1621 .vcpu 1622 .get_regs() 1623 .map_err(Error::CpuDebug) 1624 } 1625 1626 #[cfg(feature = "guest_debug")] 1627 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1628 self.vcpus[usize::from(cpu_id)] 1629 .lock() 1630 .unwrap() 1631 .vcpu 1632 .set_regs(regs) 1633 .map_err(Error::CpuDebug) 1634 } 1635 1636 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1637 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1638 self.vcpus[usize::from(cpu_id)] 1639 .lock() 1640 .unwrap() 1641 .vcpu 1642 .get_sregs() 1643 .map_err(Error::CpuDebug) 1644 } 1645 1646 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1647 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1648 self.vcpus[usize::from(cpu_id)] 1649 .lock() 1650 .unwrap() 1651 .vcpu 1652 .set_sregs(sregs) 1653 .map_err(Error::CpuDebug) 1654 } 1655 1656 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1657 fn translate_gva( 1658 &self, 1659 _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1660 cpu_id: u8, 1661 gva: u64, 1662 ) -> Result<u64> { 1663 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1664 .lock() 1665 .unwrap() 1666 .vcpu 1667 .translate_gva(gva, /* flags: unused */ 0) 1668 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 
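        // On x86_64 the hypervisor walks the guest page tables for us (e.g.
        // KVM_TRANSLATE), so only the resulting GPA is returned.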
        Ok(gpa)
    }

    ///
    /// On AArch64, the `translate_gva` API is not provided by KVM, so we
    /// implement it in the VMM by walking the translation tables.
    ///
    /// Address translation is a big topic; here we only focus on the scenario
    /// that matters to the VMM while debugging the guest kernel. This
    /// `translate_gva` implementation is restricted to:
    /// - Exception Level 1
    /// - Translate high address range only (kernel space)
    ///
    /// This implementation supports the following Armv8-A features related to
    /// address translation:
    /// - FEAT_LPA
    /// - FEAT_LVA
    /// - FEAT_LPA2
    ///
    #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
    fn translate_gva(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: u8,
        gva: u64,
    ) -> Result<u64> {
        let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TCR_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TTBR1_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::ID_AA64MMFR0_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;

        // Bit 55 of the VA determines the range, high (0xFFFxxx...)
        // or low (0x000xxx...).
        let high_range = extract_bits_64!(gva, 55, 1);
        if high_range == 0 {
            info!("VA (0x{:x}) range is not supported!", gva);
            return Ok(gva);
        }

        // High range size offset
        let tsz = extract_bits_64!(tcr_el1, 16, 6);
        // Granule size
        let tg = extract_bits_64!(tcr_el1, 30, 2);
        // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
        let ds = extract_bits_64!(tcr_el1, 59, 1);

        if tsz == 0 {
            info!("VA translation is not ready!");
            return Ok(gva);
        }

        // VA size is determined by TCR_EL1.T1SZ
        let va_size = 64 - tsz;
        // Number of bits in VA consumed in each level of translation
        let stride = match tg {
            3 => 13, // 64KB granule size
            1 => 11, // 16KB granule size
            _ => 9,  // 4KB, default
        };
        // Starting level of walking
        let mut level = 4 - (va_size - 4) / stride;

        // PA or IPA size is determined
        let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
        let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
        // The IPA size in TCR_EL1 and PA Range in ID_AA64MMFR0_EL1 should match.
        // To be safe, we use the minimum value if they are different.
        let pa_range = std::cmp::min(tcr_ips, pa_range);
        // PA size in bits
        let pa_size = match pa_range {
            0 => 32,
            1 => 36,
            2 => 40,
            3 => 42,
            4 => 44,
            5 => 48,
            6 => 52,
            _ => {
                return Err(Error::TranslateVirtualAddress(anyhow!(format!(
                    "PA range not supported {pa_range}"
                ))))
            }
        };

        let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
        let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
        // If FEAT_LPA2 is present, the translation table descriptor holds
        // 50 bits of the table address of next level.
        // Otherwise, it is 48 bits.
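        // Concretely: ds == 1 keeps descriptor bits [49:0] (0x0003_FFFF_FFFF_FFFF),
        // otherwise bits [47:0] (0x0000_FFFF_FFFF_FFFF), before the low,
        // granule-sized index bits are masked off below.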
1772 let descaddrmask = if ds == 1 { 1773 !0u64 >> (64 - 50) // mask with 50 least significant bits 1774 } else { 1775 !0u64 >> (64 - 48) // mask with 48 least significant bits 1776 }; 1777 let descaddrmask = descaddrmask & !indexmask_grainsize; 1778 1779 // Translation table base address 1780 let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48); 1781 // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table 1782 // address bits [48:51] comes from TTBR1_EL1 bits [2:5]. 1783 if pa_size == 52 { 1784 descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48; 1785 } 1786 1787 // Loop through tables of each level 1788 loop { 1789 // Table offset for current level 1790 let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask; 1791 descaddr |= table_offset; 1792 descaddr &= !7u64; 1793 1794 let mut buf = [0; 8]; 1795 guest_memory 1796 .memory() 1797 .read(&mut buf, GuestAddress(descaddr)) 1798 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1799 let descriptor = u64::from_le_bytes(buf); 1800 1801 descaddr = descriptor & descaddrmask; 1802 // In the case of FEAT_LPA, the next-level translation table address 1803 // bits [48:51] comes from bits [12:15] of the current descriptor. 1804 // For FEAT_LPA2, the next-level translation table address 1805 // bits [50:51] comes from bits [8:9] of the current descriptor, 1806 // bits [48:49] comes from bits [48:49] of the descriptor which was 1807 // handled previously. 1808 if pa_size == 52 { 1809 if ds == 1 { 1810 // FEAT_LPA2 1811 descaddr |= extract_bits_64!(descriptor, 8, 2) << 50; 1812 } else { 1813 // FEAT_LPA 1814 descaddr |= extract_bits_64!(descriptor, 12, 4) << 48; 1815 } 1816 } 1817 1818 if (descriptor & 2) != 0 && (level < 3) { 1819 // This is a table entry. Go down to next level. 
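                // Descriptor bit 1 distinguishes a table/page descriptor (1) from
                // a block descriptor (0); only levels below 3 can point to a
                // next-level table.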
1820 level += 1; 1821 indexmask = indexmask_grainsize; 1822 continue; 1823 } 1824 1825 break; 1826 } 1827 1828 // We have reached either: 1829 // - a page entry at level 3 or 1830 // - a block entry at level 1 or 2 1831 let page_size = 1u64 << ((stride * (4 - level)) + 3); 1832 descaddr &= !(page_size - 1); 1833 descaddr |= gva & (page_size - 1); 1834 1835 Ok(descaddr) 1836 } 1837 1838 pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) { 1839 self.acpi_address = Some(acpi_address); 1840 } 1841 1842 pub(crate) fn set_interrupt_controller( 1843 &mut self, 1844 interrupt_controller: Arc<Mutex<dyn InterruptController>>, 1845 ) { 1846 self.interrupt_controller = Some(interrupt_controller); 1847 } 1848 1849 pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> { 1850 &self.vcpus_kill_signalled 1851 } 1852 1853 #[cfg(feature = "igvm")] 1854 pub(crate) fn get_cpuid_leaf( 1855 &self, 1856 cpu_id: u8, 1857 eax: u32, 1858 ecx: u32, 1859 xfem: u64, 1860 xss: u64, 1861 ) -> Result<[u32; 4]> { 1862 let leaf_info = self.vcpus[usize::from(cpu_id)] 1863 .lock() 1864 .unwrap() 1865 .vcpu 1866 .get_cpuid_values(eax, ecx, xfem, xss) 1867 .unwrap(); 1868 Ok(leaf_info) 1869 } 1870 1871 #[cfg(feature = "sev_snp")] 1872 pub(crate) fn sev_snp_enabled(&self) -> bool { 1873 self.sev_snp_enabled 1874 } 1875 1876 pub(crate) fn nmi(&self) -> Result<()> { 1877 self.vcpus_kick_signalled.store(true, Ordering::SeqCst); 1878 1879 for state in self.vcpu_states.iter() { 1880 state.signal_thread(); 1881 } 1882 1883 self.vcpus_kick_signalled.store(false, Ordering::SeqCst); 1884 1885 Ok(()) 1886 } 1887 } 1888 1889 struct Cpu { 1890 cpu_id: u8, 1891 proximity_domain: u32, 1892 dynamic: bool, 1893 #[cfg(target_arch = "x86_64")] 1894 topology: Option<(u8, u8, u8)>, 1895 } 1896 1897 #[cfg(target_arch = "x86_64")] 1898 const MADT_CPU_ENABLE_FLAG: usize = 0; 1899 1900 #[cfg(target_arch = "x86_64")] 1901 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; 1902 1903 impl Cpu { 1904 #[cfg(target_arch = "x86_64")] 1905 fn generate_mat(&self) -> Vec<u8> { 1906 let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology); 1907 1908 let lapic = LocalX2Apic { 1909 r#type: crate::acpi::ACPI_X2APIC_PROCESSOR, 1910 length: 16, 1911 processor_id: self.cpu_id.into(), 1912 apic_id: x2apic_id, 1913 flags: 1 << MADT_CPU_ENABLE_FLAG, 1914 _reserved: 0, 1915 }; 1916 1917 let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)]; 1918 // SAFETY: mat_data is large enough to hold lapic 1919 unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic }; 1920 1921 mat_data 1922 } 1923 } 1924 1925 impl Aml for Cpu { 1926 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 1927 #[cfg(target_arch = "x86_64")] 1928 let mat_data: Vec<u8> = self.generate_mat(); 1929 #[allow(clippy::if_same_then_else)] 1930 if self.dynamic { 1931 aml::Device::new( 1932 format!("C{:03X}", self.cpu_id).as_str().into(), 1933 vec![ 1934 &aml::Name::new("_HID".into(), &"ACPI0007"), 1935 &aml::Name::new("_UID".into(), &self.cpu_id), 1936 // Currently, AArch64 cannot support following fields. 1937 /* 1938 _STA return value: 1939 Bit [0] – Set if the device is present. 1940 Bit [1] – Set if the device is enabled and decoding its resources. 1941 Bit [2] – Set if the device should be shown in the UI. 1942 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1943 Bit [4] – Set if the battery is present. 1944 Bits [31:5] – Reserved (must be cleared). 
1945 */ 1946 #[cfg(target_arch = "x86_64")] 1947 &aml::Method::new( 1948 "_STA".into(), 1949 0, 1950 false, 1951 // Call into CSTA method which will interrogate device 1952 vec![&aml::Return::new(&aml::MethodCall::new( 1953 "CSTA".into(), 1954 vec![&self.cpu_id], 1955 ))], 1956 ), 1957 &aml::Method::new( 1958 "_PXM".into(), 1959 0, 1960 false, 1961 vec![&aml::Return::new(&self.proximity_domain)], 1962 ), 1963 // The Linux kernel expects every CPU device to have a _MAT entry 1964 // containing the LAPIC for this processor with the enabled bit set 1965 // even it if is disabled in the MADT (non-boot CPU) 1966 #[cfg(target_arch = "x86_64")] 1967 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 1968 // Trigger CPU ejection 1969 #[cfg(target_arch = "x86_64")] 1970 &aml::Method::new( 1971 "_EJ0".into(), 1972 1, 1973 false, 1974 // Call into CEJ0 method which will actually eject device 1975 vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])], 1976 ), 1977 ], 1978 ) 1979 .to_aml_bytes(sink); 1980 } else { 1981 aml::Device::new( 1982 format!("C{:03X}", self.cpu_id).as_str().into(), 1983 vec![ 1984 &aml::Name::new("_HID".into(), &"ACPI0007"), 1985 &aml::Name::new("_UID".into(), &self.cpu_id), 1986 #[cfg(target_arch = "x86_64")] 1987 &aml::Method::new( 1988 "_STA".into(), 1989 0, 1990 false, 1991 // Mark CPU present see CSTA implementation 1992 vec![&aml::Return::new(&0xfu8)], 1993 ), 1994 &aml::Method::new( 1995 "_PXM".into(), 1996 0, 1997 false, 1998 vec![&aml::Return::new(&self.proximity_domain)], 1999 ), 2000 // The Linux kernel expects every CPU device to have a _MAT entry 2001 // containing the LAPIC for this processor with the enabled bit set 2002 // even it if is disabled in the MADT (non-boot CPU) 2003 #[cfg(target_arch = "x86_64")] 2004 &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)), 2005 ], 2006 ) 2007 .to_aml_bytes(sink); 2008 } 2009 } 2010 } 2011 2012 struct CpuNotify { 2013 cpu_id: u8, 2014 } 2015 2016 impl Aml for CpuNotify { 2017 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2018 let object = aml::Path::new(&format!("C{:03X}", self.cpu_id)); 2019 aml::If::new( 2020 &aml::Equal::new(&aml::Arg(0), &self.cpu_id), 2021 vec![&aml::Notify::new(&object, &aml::Arg(1))], 2022 ) 2023 .to_aml_bytes(sink) 2024 } 2025 } 2026 2027 struct CpuMethods { 2028 max_vcpus: u8, 2029 dynamic: bool, 2030 } 2031 2032 impl Aml for CpuMethods { 2033 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2034 if self.dynamic { 2035 // CPU status method 2036 aml::Method::new( 2037 "CSTA".into(), 2038 1, 2039 true, 2040 vec![ 2041 // Take lock defined above 2042 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2043 // Write CPU number (in first argument) to I/O port via field 2044 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 2045 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2046 // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning) 2047 &aml::If::new( 2048 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE), 2049 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 2050 ), 2051 // Release lock 2052 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2053 // Return 0 or 0xf 2054 &aml::Return::new(&aml::Local(0)), 2055 ], 2056 ) 2057 .to_aml_bytes(sink); 2058 2059 let mut cpu_notifies = Vec::new(); 2060 for cpu_id in 0..self.max_vcpus { 2061 cpu_notifies.push(CpuNotify { cpu_id }); 2062 } 2063 2064 let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new(); 2065 for cpu_id in 
0..self.max_vcpus { 2066 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 2067 } 2068 2069 aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink); 2070 2071 aml::Method::new( 2072 "CEJ0".into(), 2073 1, 2074 true, 2075 vec![ 2076 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2077 // Write CPU number (in first argument) to I/O port via field 2078 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 2079 // Set CEJ0 bit 2080 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 2081 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2082 ], 2083 ) 2084 .to_aml_bytes(sink); 2085 2086 aml::Method::new( 2087 "CSCN".into(), 2088 0, 2089 true, 2090 vec![ 2091 // Take lock defined above 2092 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 2093 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2094 &aml::While::new( 2095 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 2096 vec![ 2097 // Write CPU number (in first argument) to I/O port via field 2098 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 2099 // Check if CINS bit is set 2100 &aml::If::new( 2101 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 2102 // Notify device if it is 2103 vec![ 2104 &aml::MethodCall::new( 2105 "CTFY".into(), 2106 vec![&aml::Local(0), &aml::ONE], 2107 ), 2108 // Reset CINS bit 2109 &aml::Store::new( 2110 &aml::Path::new("\\_SB_.PRES.CINS"), 2111 &aml::ONE, 2112 ), 2113 ], 2114 ), 2115 // Check if CRMV bit is set 2116 &aml::If::new( 2117 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 2118 // Notify device if it is (with the eject constant 0x3) 2119 vec![ 2120 &aml::MethodCall::new( 2121 "CTFY".into(), 2122 vec![&aml::Local(0), &3u8], 2123 ), 2124 // Reset CRMV bit 2125 &aml::Store::new( 2126 &aml::Path::new("\\_SB_.PRES.CRMV"), 2127 &aml::ONE, 2128 ), 2129 ], 2130 ), 2131 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 2132 ], 2133 ), 2134 // Release lock 2135 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 2136 ], 2137 ) 2138 .to_aml_bytes(sink) 2139 } else { 2140 aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink) 2141 } 2142 } 2143 } 2144 2145 impl Aml for CpuManager { 2146 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2147 #[cfg(target_arch = "x86_64")] 2148 if let Some(acpi_address) = self.acpi_address { 2149 // CPU hotplug controller 2150 aml::Device::new( 2151 "_SB_.PRES".into(), 2152 vec![ 2153 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 2154 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 2155 // Mutex to protect concurrent access as we write to choose CPU and then read back status 2156 &aml::Mutex::new("CPLK".into(), 0), 2157 &aml::Name::new( 2158 "_CRS".into(), 2159 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2160 aml::AddressSpaceCacheable::NotCacheable, 2161 true, 2162 acpi_address.0, 2163 acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 2164 None, 2165 )]), 2166 ), 2167 // OpRegion and Fields map MMIO range into individual field values 2168 &aml::OpRegion::new( 2169 "PRST".into(), 2170 aml::OpRegionSpace::SystemMemory, 2171 &(acpi_address.0 as usize), 2172 &CPU_MANAGER_ACPI_SIZE, 2173 ), 2174 &aml::Field::new( 2175 "PRST".into(), 2176 aml::FieldAccessType::Byte, 2177 aml::FieldLockRule::NoLock, 2178 aml::FieldUpdateRule::WriteAsZeroes, 2179 vec![ 2180 aml::FieldEntry::Reserved(32), 2181 aml::FieldEntry::Named(*b"CPEN", 1), 2182 aml::FieldEntry::Named(*b"CINS", 1), 2183 
aml::FieldEntry::Named(*b"CRMV", 1), 2184 aml::FieldEntry::Named(*b"CEJ0", 1), 2185 aml::FieldEntry::Reserved(4), 2186 aml::FieldEntry::Named(*b"CCMD", 8), 2187 ], 2188 ), 2189 &aml::Field::new( 2190 "PRST".into(), 2191 aml::FieldAccessType::DWord, 2192 aml::FieldLockRule::NoLock, 2193 aml::FieldUpdateRule::Preserve, 2194 vec![ 2195 aml::FieldEntry::Named(*b"CSEL", 32), 2196 aml::FieldEntry::Reserved(32), 2197 aml::FieldEntry::Named(*b"CDAT", 32), 2198 ], 2199 ), 2200 ], 2201 ) 2202 .to_aml_bytes(sink); 2203 } 2204 2205 // CPU devices 2206 let hid = aml::Name::new("_HID".into(), &"ACPI0010"); 2207 let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05")); 2208 // Bundle methods together under a common object 2209 let methods = CpuMethods { 2210 max_vcpus: self.config.max_vcpus, 2211 dynamic: self.dynamic, 2212 }; 2213 let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods]; 2214 2215 #[cfg(target_arch = "x86_64")] 2216 let topology = self.get_vcpu_topology(); 2217 let mut cpu_devices = Vec::new(); 2218 for cpu_id in 0..self.config.max_vcpus { 2219 let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0); 2220 let cpu_device = Cpu { 2221 cpu_id, 2222 proximity_domain, 2223 dynamic: self.dynamic, 2224 #[cfg(target_arch = "x86_64")] 2225 topology, 2226 }; 2227 2228 cpu_devices.push(cpu_device); 2229 } 2230 2231 for cpu_device in cpu_devices.iter() { 2232 cpu_data_inner.push(cpu_device); 2233 } 2234 2235 aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink) 2236 } 2237 } 2238 2239 impl Pausable for CpuManager { 2240 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2241 // Tell the vCPUs to pause themselves next time they exit 2242 self.vcpus_pause_signalled.store(true, Ordering::SeqCst); 2243 2244 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 2245 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 2246 // above. 2247 for state in self.vcpu_states.iter() { 2248 state.signal_thread(); 2249 } 2250 2251 for vcpu in self.vcpus.iter() { 2252 let mut vcpu = vcpu.lock().unwrap(); 2253 vcpu.pause()?; 2254 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2255 if !self.config.kvm_hyperv { 2256 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { 2257 MigratableError::Pause(anyhow!( 2258 "Could not notify guest it has been paused {:?}", 2259 e 2260 )) 2261 })?; 2262 } 2263 } 2264 2265 // The vCPU thread will change its paused state before parking, wait here for each 2266 // activated vCPU change their state to ensure they have parked. 2267 for state in self.vcpu_states.iter() { 2268 if state.active() { 2269 while !state.paused.load(Ordering::SeqCst) { 2270 // To avoid a priority inversion with the vCPU thread 2271 thread::sleep(std::time::Duration::from_millis(1)); 2272 } 2273 } 2274 } 2275 2276 Ok(()) 2277 } 2278 2279 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2280 for vcpu in self.vcpus.iter() { 2281 vcpu.lock().unwrap().resume()?; 2282 } 2283 2284 // Toggle the vCPUs pause boolean 2285 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 2286 2287 // Unpark all the VCPU threads. 2288 // Once unparked, the next thing they will do is checking for the pause 2289 // boolean. Since it'll be set to false, they will exit their pause loop 2290 // and go back to vmx root. 
2291 for state in self.vcpu_states.iter() { 2292 state.paused.store(false, Ordering::SeqCst); 2293 state.unpark_thread(); 2294 } 2295 Ok(()) 2296 } 2297 } 2298 2299 impl Snapshottable for CpuManager { 2300 fn id(&self) -> String { 2301 CPU_MANAGER_SNAPSHOT_ID.to_string() 2302 } 2303 2304 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2305 let mut cpu_manager_snapshot = Snapshot::default(); 2306 2307 // The CpuManager snapshot is a collection of all vCPUs snapshots. 2308 for vcpu in &self.vcpus { 2309 let mut vcpu = vcpu.lock().unwrap(); 2310 cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?); 2311 } 2312 2313 Ok(cpu_manager_snapshot) 2314 } 2315 } 2316 2317 impl Transportable for CpuManager {} 2318 impl Migratable for CpuManager {} 2319 2320 #[cfg(feature = "guest_debug")] 2321 impl Debuggable for CpuManager { 2322 #[cfg(feature = "kvm")] 2323 fn set_guest_debug( 2324 &self, 2325 cpu_id: usize, 2326 addrs: &[GuestAddress], 2327 singlestep: bool, 2328 ) -> std::result::Result<(), DebuggableError> { 2329 self.vcpus[cpu_id] 2330 .lock() 2331 .unwrap() 2332 .vcpu 2333 .set_guest_debug(addrs, singlestep) 2334 .map_err(DebuggableError::SetDebug) 2335 } 2336 2337 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2338 Ok(()) 2339 } 2340 2341 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2342 Ok(()) 2343 } 2344 2345 #[cfg(target_arch = "x86_64")] 2346 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2347 // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15 2348 let gregs = self 2349 .get_regs(cpu_id as u8) 2350 .map_err(DebuggableError::ReadRegs)?; 2351 let regs = [ 2352 gregs.get_rax(), 2353 gregs.get_rbx(), 2354 gregs.get_rcx(), 2355 gregs.get_rdx(), 2356 gregs.get_rsi(), 2357 gregs.get_rdi(), 2358 gregs.get_rbp(), 2359 gregs.get_rsp(), 2360 gregs.get_r8(), 2361 gregs.get_r9(), 2362 gregs.get_r10(), 2363 gregs.get_r11(), 2364 gregs.get_r12(), 2365 gregs.get_r13(), 2366 gregs.get_r14(), 2367 gregs.get_r15(), 2368 ]; 2369 2370 // GDB exposes 32-bit eflags instead of 64-bit rflags. 
2371 // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml 2372 let eflags = gregs.get_rflags() as u32; 2373 let rip = gregs.get_rip(); 2374 2375 // Segment registers: CS, SS, DS, ES, FS, GS 2376 let sregs = self 2377 .get_sregs(cpu_id as u8) 2378 .map_err(DebuggableError::ReadRegs)?; 2379 let segments = X86SegmentRegs { 2380 cs: sregs.cs.selector as u32, 2381 ss: sregs.ss.selector as u32, 2382 ds: sregs.ds.selector as u32, 2383 es: sregs.es.selector as u32, 2384 fs: sregs.fs.selector as u32, 2385 gs: sregs.gs.selector as u32, 2386 }; 2387 2388 // TODO: Add other registers 2389 2390 Ok(CoreRegs { 2391 regs, 2392 eflags, 2393 rip, 2394 segments, 2395 ..Default::default() 2396 }) 2397 } 2398 2399 #[cfg(target_arch = "aarch64")] 2400 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2401 let gregs = self 2402 .get_regs(cpu_id as u8) 2403 .map_err(DebuggableError::ReadRegs)?; 2404 Ok(CoreRegs { 2405 x: gregs.get_regs(), 2406 sp: gregs.get_sp(), 2407 pc: gregs.get_pc(), 2408 ..Default::default() 2409 }) 2410 } 2411 2412 #[cfg(target_arch = "x86_64")] 2413 fn write_regs( 2414 &self, 2415 cpu_id: usize, 2416 regs: &CoreRegs, 2417 ) -> std::result::Result<(), DebuggableError> { 2418 let orig_gregs = self 2419 .get_regs(cpu_id as u8) 2420 .map_err(DebuggableError::ReadRegs)?; 2421 let mut gregs = self.create_standard_regs(cpu_id as u8); 2422 gregs.set_rax(regs.regs[0]); 2423 gregs.set_rbx(regs.regs[1]); 2424 gregs.set_rcx(regs.regs[2]); 2425 gregs.set_rdx(regs.regs[3]); 2426 gregs.set_rsi(regs.regs[4]); 2427 gregs.set_rdi(regs.regs[5]); 2428 gregs.set_rbp(regs.regs[6]); 2429 gregs.set_rsp(regs.regs[7]); 2430 gregs.set_r8(regs.regs[8]); 2431 gregs.set_r9(regs.regs[9]); 2432 gregs.set_r10(regs.regs[10]); 2433 gregs.set_r11(regs.regs[11]); 2434 gregs.set_r12(regs.regs[12]); 2435 gregs.set_r13(regs.regs[13]); 2436 gregs.set_r14(regs.regs[14]); 2437 gregs.set_r15(regs.regs[15]); 2438 gregs.set_rip(regs.rip); 2439 // Update the lower 32 bits of rflags. 2440 gregs.set_rflags((orig_gregs.get_rflags() & !(u32::MAX as u64)) | (regs.eflags as u64)); 2441 2442 self.set_regs(cpu_id as u8, &gregs) 2443 .map_err(DebuggableError::WriteRegs)?; 2444 2445 // Segment registers: CS, SS, DS, ES, FS, GS 2446 // Since GDB cares only about the selectors, we call get_sregs() first.
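// Only the selector fields are overwritten below; the base, limit and access rights
// returned by get_sregs() are left untouched.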
2447 let mut sregs = self 2448 .get_sregs(cpu_id as u8) 2449 .map_err(DebuggableError::ReadRegs)?; 2450 sregs.cs.selector = regs.segments.cs as u16; 2451 sregs.ss.selector = regs.segments.ss as u16; 2452 sregs.ds.selector = regs.segments.ds as u16; 2453 sregs.es.selector = regs.segments.es as u16; 2454 sregs.fs.selector = regs.segments.fs as u16; 2455 sregs.gs.selector = regs.segments.gs as u16; 2456 2457 self.set_sregs(cpu_id as u8, &sregs) 2458 .map_err(DebuggableError::WriteRegs)?; 2459 2460 // TODO: Add other registers 2461 2462 Ok(()) 2463 } 2464 2465 #[cfg(target_arch = "aarch64")] 2466 fn write_regs( 2467 &self, 2468 cpu_id: usize, 2469 regs: &CoreRegs, 2470 ) -> std::result::Result<(), DebuggableError> { 2471 let mut gregs = self 2472 .get_regs(cpu_id as u8) 2473 .map_err(DebuggableError::ReadRegs)?; 2474 2475 gregs.set_regs(regs.x); 2476 gregs.set_sp(regs.sp); 2477 gregs.set_pc(regs.pc); 2478 2479 self.set_regs(cpu_id as u8, &gregs) 2480 .map_err(DebuggableError::WriteRegs)?; 2481 2482 Ok(()) 2483 } 2484 2485 fn read_mem( 2486 &self, 2487 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2488 cpu_id: usize, 2489 vaddr: GuestAddress, 2490 len: usize, 2491 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2492 let mut buf = vec![0; len]; 2493 let mut total_read = 0_u64; 2494 2495 while total_read < len as u64 { 2496 let gaddr = vaddr.0 + total_read; 2497 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2498 Ok(paddr) => paddr, 2499 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 2500 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2501 }; 2502 let psize = arch::PAGE_SIZE as u64; 2503 let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1))); 2504 guest_memory 2505 .memory() 2506 .read( 2507 &mut buf[total_read as usize..total_read as usize + read_len as usize], 2508 GuestAddress(paddr), 2509 ) 2510 .map_err(DebuggableError::ReadMem)?; 2511 total_read += read_len; 2512 } 2513 Ok(buf) 2514 } 2515 2516 fn write_mem( 2517 &self, 2518 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2519 cpu_id: usize, 2520 vaddr: &GuestAddress, 2521 data: &[u8], 2522 ) -> std::result::Result<(), DebuggableError> { 2523 let mut total_written = 0_u64; 2524 2525 while total_written < data.len() as u64 { 2526 let gaddr = vaddr.0 + total_written; 2527 let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) { 2528 Ok(paddr) => paddr, 2529 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 
2530 Err(e) => return Err(DebuggableError::TranslateGva(e)), 2531 }; 2532 let psize = arch::PAGE_SIZE as u64; 2533 let write_len = std::cmp::min( 2534 data.len() as u64 - total_written, 2535 psize - (paddr & (psize - 1)), 2536 ); 2537 guest_memory 2538 .memory() 2539 .write( 2540 &data[total_written as usize..total_written as usize + write_len as usize], 2541 GuestAddress(paddr), 2542 ) 2543 .map_err(DebuggableError::WriteMem)?; 2544 total_written += write_len; 2545 } 2546 Ok(()) 2547 } 2548 2549 fn active_vcpus(&self) -> usize { 2550 self.present_vcpus() as usize 2551 } 2552 } 2553 2554 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2555 impl Elf64Writable for CpuManager {} 2556 2557 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2558 impl CpuElf64Writable for CpuManager { 2559 fn cpu_write_elf64_note( 2560 &mut self, 2561 dump_state: &DumpState, 2562 ) -> std::result::Result<(), GuestDebuggableError> { 2563 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2564 for vcpu in &self.vcpus { 2565 let note_size = self.get_note_size(NoteDescType::Elf, 1); 2566 let mut pos: usize = 0; 2567 let mut buf = vec![0; note_size as usize]; 2568 let descsz = size_of::<X86_64ElfPrStatus>(); 2569 let vcpu_id = vcpu.lock().unwrap().id; 2570 2571 let note = Elf64_Nhdr { 2572 n_namesz: COREDUMP_NAME_SIZE, 2573 n_descsz: descsz as u32, 2574 n_type: NT_PRSTATUS, 2575 }; 2576 2577 let bytes: &[u8] = note.as_slice(); 2578 buf.splice(0.., bytes.to_vec()); 2579 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2580 buf.resize(pos + 4, 0); 2581 buf.splice(pos.., "CORE".to_string().into_bytes()); 2582 2583 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2584 buf.resize(pos + 32 + 4, 0); 2585 let pid = vcpu_id as u64; 2586 let bytes: &[u8] = pid.as_slice(); 2587 buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */ 2588 2589 pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>(); 2590 2591 let orig_rax: u64 = 0; 2592 let gregs = self.vcpus[usize::from(vcpu_id)] 2593 .lock() 2594 .unwrap() 2595 .vcpu 2596 .get_regs() 2597 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2598 2599 let regs1 = [ 2600 gregs.get_r15(), 2601 gregs.get_r14(), 2602 gregs.get_r13(), 2603 gregs.get_r12(), 2604 gregs.get_rbp(), 2605 gregs.get_rbx(), 2606 gregs.get_r11(), 2607 gregs.get_r10(), 2608 ]; 2609 let regs2 = [ 2610 gregs.get_r9(), 2611 gregs.get_r8(), 2612 gregs.get_rax(), 2613 gregs.get_rcx(), 2614 gregs.get_rdx(), 2615 gregs.get_rsi(), 2616 gregs.get_rdi(), 2617 orig_rax, 2618 ]; 2619 2620 let sregs = self.vcpus[usize::from(vcpu_id)] 2621 .lock() 2622 .unwrap() 2623 .vcpu 2624 .get_sregs() 2625 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2626 2627 debug!( 2628 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}", 2629 gregs.get_rip(), 2630 gregs.get_rsp(), 2631 sregs.gs.base, 2632 sregs.cs.selector, 2633 sregs.ss.selector, 2634 sregs.ds.selector, 2635 ); 2636 2637 let regs = X86_64UserRegs { 2638 regs1, 2639 regs2, 2640 rip: gregs.get_rip(), 2641 cs: sregs.cs.selector as u64, 2642 eflags: gregs.get_rflags(), 2643 rsp: gregs.get_rsp(), 2644 ss: sregs.ss.selector as u64, 2645 fs_base: sregs.fs.base, 2646 gs_base: sregs.gs.base, 2647 ds: sregs.ds.selector as u64, 2648 es: sregs.es.selector as u64, 2649 fs: sregs.fs.selector as u64, 2650 gs: sregs.gs.selector as u64, 2651 }; 2652 2653 // let bytes: &[u8] = unsafe { any_as_u8_slice(®s) }; 2654 let bytes: &[u8] = regs.as_slice(); 2655 buf.resize(note_size as usize, 0); 2656 
buf.splice(pos.., bytes.to_vec()); 2657 buf.resize(note_size as usize, 0); 2658 2659 coredump_file 2660 .write(&buf) 2661 .map_err(GuestDebuggableError::CoredumpFile)?; 2662 } 2663 2664 Ok(()) 2665 } 2666 2667 fn cpu_write_vmm_note( 2668 &mut self, 2669 dump_state: &DumpState, 2670 ) -> std::result::Result<(), GuestDebuggableError> { 2671 let mut coredump_file = dump_state.file.as_ref().unwrap(); 2672 for vcpu in &self.vcpus { 2673 let note_size = self.get_note_size(NoteDescType::Vmm, 1); 2674 let mut pos: usize = 0; 2675 let mut buf = vec![0; note_size as usize]; 2676 let descsz = size_of::<DumpCpusState>(); 2677 let vcpu_id = vcpu.lock().unwrap().id; 2678 2679 let note = Elf64_Nhdr { 2680 n_namesz: COREDUMP_NAME_SIZE, 2681 n_descsz: descsz as u32, 2682 n_type: 0, 2683 }; 2684 2685 let bytes: &[u8] = note.as_slice(); 2686 buf.splice(0.., bytes.to_vec()); 2687 pos += round_up!(size_of::<Elf64_Nhdr>(), 4); 2688 2689 buf.resize(pos + 4, 0); 2690 buf.splice(pos.., "QEMU".to_string().into_bytes()); 2691 2692 pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); 2693 2694 let gregs = self.vcpus[usize::from(vcpu_id)] 2695 .lock() 2696 .unwrap() 2697 .vcpu 2698 .get_regs() 2699 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; 2700 2701 let regs1 = [ 2702 gregs.get_rax(), 2703 gregs.get_rbx(), 2704 gregs.get_rcx(), 2705 gregs.get_rdx(), 2706 gregs.get_rsi(), 2707 gregs.get_rdi(), 2708 gregs.get_rsp(), 2709 gregs.get_rbp(), 2710 ]; 2711 2712 let regs2 = [ 2713 gregs.get_r8(), 2714 gregs.get_r9(), 2715 gregs.get_r10(), 2716 gregs.get_r11(), 2717 gregs.get_r12(), 2718 gregs.get_r13(), 2719 gregs.get_r14(), 2720 gregs.get_r15(), 2721 ]; 2722 2723 let sregs = self.vcpus[usize::from(vcpu_id)] 2724 .lock() 2725 .unwrap() 2726 .vcpu 2727 .get_sregs() 2728 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; 2729 2730 let mut msrs = vec![MsrEntry { 2731 index: msr_index::MSR_KERNEL_GS_BASE, 2732 ..Default::default() 2733 }]; 2734 2735 self.vcpus[vcpu_id as usize] 2736 .lock() 2737 .unwrap() 2738 .vcpu 2739 .get_msrs(&mut msrs) 2740 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?; 2741 let kernel_gs_base = msrs[0].data; 2742 2743 let cs = CpuSegment::new(sregs.cs); 2744 let ds = CpuSegment::new(sregs.ds); 2745 let es = CpuSegment::new(sregs.es); 2746 let fs = CpuSegment::new(sregs.fs); 2747 let gs = CpuSegment::new(sregs.gs); 2748 let ss = CpuSegment::new(sregs.ss); 2749 let ldt = CpuSegment::new(sregs.ldt); 2750 let tr = CpuSegment::new(sregs.tr); 2751 let gdt = CpuSegment::new_from_table(sregs.gdt); 2752 let idt = CpuSegment::new_from_table(sregs.idt); 2753 let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4]; 2754 let regs = DumpCpusState { 2755 version: 1, 2756 size: size_of::<DumpCpusState>() as u32, 2757 regs1, 2758 regs2, 2759 rip: gregs.get_rip(), 2760 rflags: gregs.get_rflags(), 2761 cs, 2762 ds, 2763 es, 2764 fs, 2765 gs, 2766 ss, 2767 ldt, 2768 tr, 2769 gdt, 2770 idt, 2771 cr, 2772 kernel_gs_base, 2773 }; 2774 2775 let bytes: &[u8] = regs.as_slice(); 2776 buf.resize(note_size as usize, 0); 2777 buf.splice(pos.., bytes.to_vec()); 2778 buf.resize(note_size as usize, 0); 2779 2780 coredump_file 2781 .write(&buf) 2782 .map_err(GuestDebuggableError::CoredumpFile)?; 2783 } 2784 2785 Ok(()) 2786 } 2787 } 2788 2789 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2790 #[cfg(test)] 2791 mod tests { 2792 use arch::layout::{BOOT_STACK_POINTER, ZERO_PAGE_START}; 2793 use arch::x86_64::interrupts::*; 2794 use 
arch::x86_64::regs::*; 2795 use hypervisor::arch::x86::{FpuState, LapicState}; 2796 use hypervisor::StandardRegisters; 2797 use linux_loader::loader::bootparam::setup_header; 2798 2799 #[test] 2800 fn test_setlint() { 2801 let hv = hypervisor::new().unwrap(); 2802 let vm = hv.create_vm().expect("new VM fd creation failed"); 2803 hv.check_required_extensions().unwrap(); 2804 // Calling get_lapic will fail if there is no irqchip beforehand. 2805 vm.create_irq_chip().unwrap(); 2806 let vcpu = vm.create_vcpu(0, None).unwrap(); 2807 let klapic_before: LapicState = vcpu.get_lapic().unwrap(); 2808 2809 // Compute the value that is expected to represent LVT0 and LVT1. 2810 let lint0 = klapic_before.get_klapic_reg(APIC_LVT0); 2811 let lint1 = klapic_before.get_klapic_reg(APIC_LVT1); 2812 let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT); 2813 let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI); 2814 2815 set_lint(&vcpu).unwrap(); 2816 2817 // Compute the value that represents LVT0 and LVT1 after set_lint. 2818 let klapic_actual: LapicState = vcpu.get_lapic().unwrap(); 2819 let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0); 2820 let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1); 2821 assert_eq!(lint0_mode_expected, lint0_mode_actual); 2822 assert_eq!(lint1_mode_expected, lint1_mode_actual); 2823 } 2824 2825 #[test] 2826 fn test_setup_fpu() { 2827 let hv = hypervisor::new().unwrap(); 2828 let vm = hv.create_vm().expect("new VM fd creation failed"); 2829 let vcpu = vm.create_vcpu(0, None).unwrap(); 2830 setup_fpu(&vcpu).unwrap(); 2831 2832 let expected_fpu: FpuState = FpuState { 2833 fcw: 0x37f, 2834 mxcsr: 0x1f80, 2835 ..Default::default() 2836 }; 2837 let actual_fpu: FpuState = vcpu.get_fpu().unwrap(); 2838 // TODO: auto-generate kvm-related structures with PartialEq on. 2839 assert_eq!(expected_fpu.fcw, actual_fpu.fcw); 2840 // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything. 2841 // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c. 2842 // The mxcsr will stay 0 and the assert below would fail. Decide whether we should 2843 // remove it altogether. 2844 // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr); 2845 } 2846 2847 #[test] 2848 fn test_setup_msrs() { 2849 use hypervisor::arch::x86::{msr_index, MsrEntry}; 2850 2851 let hv = hypervisor::new().unwrap(); 2852 let vm = hv.create_vm().expect("new VM fd creation failed"); 2853 let vcpu = vm.create_vcpu(0, None).unwrap(); 2854 setup_msrs(&vcpu).unwrap(); 2855 2856 // This test will check against the last MSR entry configured (the tenth one). 2857 // See create_msr_entries for details. 2858 let mut msrs = vec![MsrEntry { 2859 index: msr_index::MSR_IA32_MISC_ENABLE, 2860 ..Default::default() 2861 }]; 2862 2863 // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read one 2864 // in this test case. 2865 let read_msrs = vcpu.get_msrs(&mut msrs).unwrap(); 2866 assert_eq!(read_msrs, 1); 2867 2868 // Official entries that were set up when we called setup_msrs. We need to assert that the 2869 // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we 2870 // expect.
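// boot_msr_entries() regenerates the list programmed by setup_msrs(), so index 9 below
// corresponds to the MSR_IA32_MISC_ENABLE entry that was just read back.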
2871 let entry_vec = vcpu.boot_msr_entries(); 2872 assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]); 2873 } 2874 2875 #[test] 2876 fn test_setup_regs_for_pvh() { 2877 let hv = hypervisor::new().unwrap(); 2878 let vm = hv.create_vm().expect("new VM fd creation failed"); 2879 let vcpu = vm.create_vcpu(0, None).unwrap(); 2880 2881 let mut expected_regs: StandardRegisters = vcpu.create_standard_regs(); 2882 expected_regs.set_rflags(0x0000000000000002u64); 2883 expected_regs.set_rbx(arch::layout::PVH_INFO_START.0); 2884 expected_regs.set_rip(1); 2885 2886 setup_regs( 2887 &vcpu, 2888 arch::EntryPoint { 2889 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()), 2890 setup_header: None, 2891 }, 2892 ) 2893 .unwrap(); 2894 2895 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2896 assert_eq!(actual_regs, expected_regs); 2897 } 2898 2899 #[test] 2900 fn test_setup_regs_for_bzimage() { 2901 let hv = hypervisor::new().unwrap(); 2902 let vm = hv.create_vm().expect("new VM fd creation failed"); 2903 let vcpu = vm.create_vcpu(0, None).unwrap(); 2904 2905 let mut expected_regs: StandardRegisters = vcpu.create_standard_regs(); 2906 expected_regs.set_rflags(0x0000000000000002u64); 2907 expected_regs.set_rip(1); 2908 expected_regs.set_rsp(BOOT_STACK_POINTER.0); 2909 expected_regs.set_rsi(ZERO_PAGE_START.0); 2910 2911 setup_regs( 2912 &vcpu, 2913 arch::EntryPoint { 2914 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()), 2915 setup_header: Some(setup_header { 2916 ..Default::default() 2917 }), 2918 }, 2919 ) 2920 .unwrap(); 2921 2922 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 2923 assert_eq!(actual_regs, expected_regs); 2924 } 2925 } 2926 2927 #[cfg(target_arch = "aarch64")] 2928 #[cfg(test)] 2929 mod tests { 2930 use std::mem; 2931 2932 use arch::aarch64::regs; 2933 use arch::layout; 2934 use hypervisor::kvm::aarch64::is_system_register; 2935 use hypervisor::kvm::kvm_bindings::{ 2936 user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_ARM_CORE, KVM_REG_SIZE_U64, 2937 }; 2938 use hypervisor::{arm64_core_reg_id, offset_of}; 2939 2940 #[test] 2941 fn test_setup_regs() { 2942 let hv = hypervisor::new().unwrap(); 2943 let vm = hv.create_vm().unwrap(); 2944 let vcpu = vm.create_vcpu(0, None).unwrap(); 2945 2946 // Must fail when vcpu is not initialized yet. 2947 vcpu.setup_regs(0, 0x0, layout::FDT_START.0).unwrap_err(); 2948 2949 let mut kvi = vcpu.create_vcpu_init(); 2950 vm.get_preferred_target(&mut kvi).unwrap(); 2951 vcpu.vcpu_init(&kvi).unwrap(); 2952 2953 vcpu.setup_regs(0, 0x0, layout::FDT_START.0).unwrap(); 2954 } 2955 2956 #[test] 2957 fn test_read_mpidr() { 2958 let hv = hypervisor::new().unwrap(); 2959 let vm = hv.create_vm().unwrap(); 2960 let vcpu = vm.create_vcpu(0, None).unwrap(); 2961 let mut kvi = vcpu.create_vcpu_init(); 2962 vm.get_preferred_target(&mut kvi).unwrap(); 2963 2964 // Must fail when vcpu is not initialized yet. 
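// (KVM rejects register accesses with ENOEXEC, "Exec format error", until KVM_ARM_VCPU_INIT
// has been issued, as the core-register test below also demonstrates.)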
2965 vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap_err(); 2966 2967 vcpu.vcpu_init(&kvi).unwrap(); 2968 assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000); 2969 } 2970 2971 #[test] 2972 fn test_is_system_register() { 2973 let offset = offset_of!(user_pt_regs, pc); 2974 let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset); 2975 assert!(!is_system_register(regid)); 2976 let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64; 2977 assert!(is_system_register(regid)); 2978 } 2979 2980 #[test] 2981 fn test_save_restore_core_regs() { 2982 let hv = hypervisor::new().unwrap(); 2983 let vm = hv.create_vm().unwrap(); 2984 let vcpu = vm.create_vcpu(0, None).unwrap(); 2985 let mut kvi = vcpu.create_vcpu_init(); 2986 vm.get_preferred_target(&mut kvi).unwrap(); 2987 2988 // Must fail when vcpu is not initialized yet. 2989 assert_eq!( 2990 format!("{}", vcpu.get_regs().unwrap_err()), 2991 "Failed to get aarch64 core register: Exec format error (os error 8)" 2992 ); 2993 2994 let mut state = vcpu.create_standard_regs(); 2995 assert_eq!( 2996 format!("{}", vcpu.set_regs(&state).unwrap_err()), 2997 "Failed to set aarch64 core register: Exec format error (os error 8)" 2998 ); 2999 3000 vcpu.vcpu_init(&kvi).unwrap(); 3001 state = vcpu.get_regs().unwrap(); 3002 assert_eq!(state.get_pstate(), 0x3C5); 3003 3004 vcpu.set_regs(&state).unwrap(); 3005 } 3006 3007 #[test] 3008 fn test_get_set_mpstate() { 3009 let hv = hypervisor::new().unwrap(); 3010 let vm = hv.create_vm().unwrap(); 3011 let vcpu = vm.create_vcpu(0, None).unwrap(); 3012 let mut kvi = vcpu.create_vcpu_init(); 3013 vm.get_preferred_target(&mut kvi).unwrap(); 3014 3015 let state = vcpu.get_mp_state().unwrap(); 3016 vcpu.set_mp_state(state).unwrap(); 3017 } 3018 } 3019