// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use crate::config::CpusConfig;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
    GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
    NT_PRSTATUS,
};
#[cfg(feature = "guest_debug")]
use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
#[cfg(target_arch = "x86_64")]
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::GuestMemoryMmap;
use crate::CPU_MANAGER_SNAPSHOT_ID;
use acpi_tables::{aml, aml::Aml, sdt::Sdt};
use anyhow::anyhow;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use arch::aarch64::regs;
use arch::EntryPoint;
use arch::NumaNodes;
#[cfg(target_arch = "aarch64")]
use devices::gic::Gic;
use devices::interrupt_controller::InterruptController;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use hypervisor::aarch64::StandardRegisters;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::msr_index;
#[cfg(target_arch = "x86_64")]
use hypervisor::arch::x86::CpuIdEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::MsrEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
#[cfg(target_arch = "aarch64")]
use hypervisor::kvm::kvm_bindings;
#[cfg(feature = "tdx")]
use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps};
use libc::{c_void, siginfo_t};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use linux_loader::elf::Elf64_Nhdr;
use seccompiler::{apply_filter, SeccompAction};
use std::collections::BTreeMap;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::io::Write;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::mem::size_of;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};
use thiserror::Error;
use tracer::trace_scoped;
use vm_device::BusDevice;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use vm_memory::ByteValued;
#[cfg(feature = "guest_debug")]
use vm_memory::{Bytes, GuestAddressSpace};
use vm_memory::{GuestAddress, GuestMemoryAtomic};
use vm_migration::{
    snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),
}
pub type Result<T> = result::Result<T, Error>;

#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
#[repr(packed)]
struct LocalApic {
    pub r#type: u8,
    pub length: u8,
    pub processor_id: u8,
    pub apic_id: u8,
    pub flags: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default)]
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
    pub ioapic_id: u8,
    _reserved: u8,
    pub apic_address: u32,
    pub gsi_base: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicC {
    pub r#type: u8,
    pub length: u8,
    pub
reserved0: u16, 201 pub cpu_interface_number: u32, 202 pub uid: u32, 203 pub flags: u32, 204 pub parking_version: u32, 205 pub performance_interrupt: u32, 206 pub parked_address: u64, 207 pub base_address: u64, 208 pub gicv_base_address: u64, 209 pub gich_base_address: u64, 210 pub vgic_interrupt: u32, 211 pub gicr_base_address: u64, 212 pub mpidr: u64, 213 pub proc_power_effi_class: u8, 214 pub reserved1: u8, 215 pub spe_overflow_interrupt: u16, 216 } 217 218 #[cfg(target_arch = "aarch64")] 219 #[allow(dead_code)] 220 #[repr(packed)] 221 struct GicD { 222 pub r#type: u8, 223 pub length: u8, 224 pub reserved0: u16, 225 pub gic_id: u32, 226 pub base_address: u64, 227 pub global_irq_base: u32, 228 pub version: u8, 229 pub reserved1: [u8; 3], 230 } 231 232 #[cfg(target_arch = "aarch64")] 233 #[allow(dead_code)] 234 #[repr(packed)] 235 struct GicR { 236 pub r#type: u8, 237 pub length: u8, 238 pub reserved: u16, 239 pub base_address: u64, 240 pub range_length: u32, 241 } 242 243 #[cfg(target_arch = "aarch64")] 244 #[allow(dead_code)] 245 #[repr(packed)] 246 struct GicIts { 247 pub r#type: u8, 248 pub length: u8, 249 pub reserved0: u16, 250 pub translation_id: u32, 251 pub base_address: u64, 252 pub reserved1: u32, 253 } 254 255 #[cfg(target_arch = "aarch64")] 256 #[allow(dead_code)] 257 #[repr(packed)] 258 struct ProcessorHierarchyNode { 259 pub r#type: u8, 260 pub length: u8, 261 pub reserved: u16, 262 pub flags: u32, 263 pub parent: u32, 264 pub acpi_processor_id: u32, 265 pub num_private_resources: u32, 266 } 267 268 #[allow(dead_code)] 269 #[repr(packed)] 270 #[derive(Default)] 271 struct InterruptSourceOverride { 272 pub r#type: u8, 273 pub length: u8, 274 pub bus: u8, 275 pub source: u8, 276 pub gsi: u32, 277 pub flags: u16, 278 } 279 280 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 281 macro_rules! round_up { 282 ($n:expr,$d:expr) => { 283 (($n / ($d + 1)) + 1) * $d 284 }; 285 } 286 287 /// A wrapper around creating and using a kvm-based VCPU. 288 pub struct Vcpu { 289 // The hypervisor abstracted CPU. 290 vcpu: Arc<dyn hypervisor::Vcpu>, 291 id: u8, 292 #[cfg(target_arch = "aarch64")] 293 mpidr: u64, 294 saved_state: Option<CpuState>, 295 } 296 297 impl Vcpu { 298 /// Constructs a new VCPU for `vm`. 299 /// 300 /// # Arguments 301 /// 302 /// * `id` - Represents the CPU number between [0, max vcpus). 303 /// * `vm` - The virtual machine this vcpu will get attached to. 304 /// * `vm_ops` - Optional object for exit handling. 305 pub fn new( 306 id: u8, 307 vm: &Arc<dyn hypervisor::Vm>, 308 vm_ops: Option<Arc<dyn VmOps>>, 309 ) -> Result<Self> { 310 let vcpu = vm 311 .create_vcpu(id, vm_ops) 312 .map_err(|e| Error::VcpuCreate(e.into()))?; 313 // Initially the cpuid per vCPU is the one supported by this VM. 314 Ok(Vcpu { 315 vcpu, 316 id, 317 #[cfg(target_arch = "aarch64")] 318 mpidr: 0, 319 saved_state: None, 320 }) 321 } 322 323 /// Configures a vcpu and should be called once per vcpu when created. 324 /// 325 /// # Arguments 326 /// 327 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 328 /// * `guest_memory` - Guest memory. 329 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 
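    ///
    /// A rough usage sketch mirroring `CpuManager::configure_vcpu` further
    /// down in this file (the `vm`, `boot_setup`, `cpuid` and `kvm_hyperv`
    /// bindings here are illustrative placeholders, not defaults):
    ///
    /// ```ignore
    /// #[cfg(target_arch = "x86_64")]
    /// vcpu.configure(boot_setup, cpuid.clone(), kvm_hyperv)?;
    ///
    /// #[cfg(target_arch = "aarch64")]
    /// vcpu.configure(&vm, boot_setup)?;
    /// ```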
330 pub fn configure( 331 &mut self, 332 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 333 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 334 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 335 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 336 ) -> Result<()> { 337 #[cfg(target_arch = "aarch64")] 338 { 339 self.init(vm)?; 340 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup) 341 .map_err(Error::VcpuConfiguration)?; 342 } 343 info!("Configuring vCPU: cpu_id = {}", self.id); 344 #[cfg(target_arch = "x86_64")] 345 arch::configure_vcpu(&self.vcpu, self.id, boot_setup, cpuid, kvm_hyperv) 346 .map_err(Error::VcpuConfiguration)?; 347 348 Ok(()) 349 } 350 351 /// Gets the MPIDR register value. 352 #[cfg(target_arch = "aarch64")] 353 pub fn get_mpidr(&self) -> u64 { 354 self.mpidr 355 } 356 357 /// Gets the saved vCPU state. 358 #[cfg(target_arch = "aarch64")] 359 pub fn get_saved_state(&self) -> Option<CpuState> { 360 self.saved_state.clone() 361 } 362 363 /// Initializes an aarch64 specific vcpu for booting Linux. 364 #[cfg(target_arch = "aarch64")] 365 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 366 let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); 367 368 // This reads back the kernel's preferred target type. 369 vm.get_preferred_target(&mut kvi) 370 .map_err(Error::VcpuArmPreferredTarget)?; 371 // We already checked that the capability is supported. 372 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; 373 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3; 374 // Non-boot cpus are powered off initially. 375 if self.id > 0 { 376 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; 377 } 378 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit) 379 } 380 381 /// Runs the VCPU until it exits, returning the reason. 382 /// 383 /// Note that the state of the VCPU and associated VM must be setup first for this to do 384 /// anything useful. 
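    ///
    /// A minimal sketch of how the result is consumed by the vCPU thread
    /// spawned in `CpuManager::start_vcpu` below (simplified; the real loop
    /// also handles debug, IOAPIC EOI and TDX specific exits):
    ///
    /// ```ignore
    /// match vcpu.run() {
    ///     Ok(VmExit::Reset) => { /* write to the reset EventFd and stop */ }
    ///     Ok(VmExit::Shutdown) => { /* write to the exit EventFd and stop */ }
    ///     Ok(VmExit::Ignore) | Ok(VmExit::Hyperv) => { /* keep running */ }
    ///     Ok(_) | Err(_) => { /* log the error and stop the thread */ }
    /// }
    /// ```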
385 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 386 self.vcpu.run() 387 } 388 } 389 390 impl Pausable for Vcpu {} 391 impl Snapshottable for Vcpu { 392 fn id(&self) -> String { 393 self.id.to_string() 394 } 395 396 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 397 let saved_state = self 398 .vcpu 399 .state() 400 .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?; 401 402 self.saved_state = Some(saved_state.clone()); 403 404 Ok(Snapshot::from_data(SnapshotData::new_from_state( 405 &saved_state, 406 )?)) 407 } 408 } 409 410 pub struct CpuManager { 411 hypervisor_type: HypervisorType, 412 config: CpusConfig, 413 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 414 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 415 #[cfg(target_arch = "x86_64")] 416 cpuid: Vec<CpuIdEntry>, 417 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 418 vm: Arc<dyn hypervisor::Vm>, 419 vcpus_kill_signalled: Arc<AtomicBool>, 420 vcpus_pause_signalled: Arc<AtomicBool>, 421 exit_evt: EventFd, 422 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 423 reset_evt: EventFd, 424 #[cfg(feature = "guest_debug")] 425 vm_debug_evt: EventFd, 426 vcpu_states: Vec<VcpuState>, 427 selected_cpu: u8, 428 vcpus: Vec<Arc<Mutex<Vcpu>>>, 429 seccomp_action: SeccompAction, 430 vm_ops: Arc<dyn VmOps>, 431 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 432 acpi_address: Option<GuestAddress>, 433 proximity_domain_per_cpu: BTreeMap<u8, u32>, 434 affinity: BTreeMap<u8, Vec<u8>>, 435 dynamic: bool, 436 } 437 438 const CPU_ENABLE_FLAG: usize = 0; 439 const CPU_INSERTING_FLAG: usize = 1; 440 const CPU_REMOVING_FLAG: usize = 2; 441 const CPU_EJECT_FLAG: usize = 3; 442 443 const CPU_STATUS_OFFSET: u64 = 4; 444 const CPU_SELECTION_OFFSET: u64 = 0; 445 446 impl BusDevice for CpuManager { 447 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 448 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
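        // Register layout of this CPU hotplug device (matched by the CSTA,
        // CSCN and CEJ0 AML methods defined further down in this file):
        //   offset 0 (CPU_SELECTION_OFFSET): id of the currently selected vCPU
        //   offset 4 (CPU_STATUS_OFFSET): status/control bits for the selected
        //     vCPU, where bit 0 = enabled, bit 1 = inserting, bit 2 = removing
        //     and bit 3 = eject (write-only, handled in `write` below).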
449 data.fill(0); 450 451 match offset { 452 CPU_SELECTION_OFFSET => { 453 data[0] = self.selected_cpu; 454 } 455 CPU_STATUS_OFFSET => { 456 if self.selected_cpu < self.max_vcpus() { 457 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 458 if state.active() { 459 data[0] |= 1 << CPU_ENABLE_FLAG; 460 } 461 if state.inserting { 462 data[0] |= 1 << CPU_INSERTING_FLAG; 463 } 464 if state.removing { 465 data[0] |= 1 << CPU_REMOVING_FLAG; 466 } 467 } else { 468 warn!("Out of range vCPU id: {}", self.selected_cpu); 469 } 470 } 471 _ => { 472 warn!( 473 "Unexpected offset for accessing CPU manager device: {:#}", 474 offset 475 ); 476 } 477 } 478 } 479 480 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 481 match offset { 482 CPU_SELECTION_OFFSET => { 483 self.selected_cpu = data[0]; 484 } 485 CPU_STATUS_OFFSET => { 486 if self.selected_cpu < self.max_vcpus() { 487 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 488 // The ACPI code writes back a 1 to acknowledge the insertion 489 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 490 && state.inserting 491 { 492 state.inserting = false; 493 } 494 // Ditto for removal 495 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 496 && state.removing 497 { 498 state.removing = false; 499 } 500 // Trigger removal of vCPU 501 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 502 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 503 error!("Error removing vCPU: {:?}", e); 504 } 505 } 506 } else { 507 warn!("Out of range vCPU id: {}", self.selected_cpu); 508 } 509 } 510 _ => { 511 warn!( 512 "Unexpected offset for accessing CPU manager device: {:#}", 513 offset 514 ); 515 } 516 } 517 None 518 } 519 } 520 521 #[derive(Default)] 522 struct VcpuState { 523 inserting: bool, 524 removing: bool, 525 handle: Option<thread::JoinHandle<()>>, 526 kill: Arc<AtomicBool>, 527 vcpu_run_interrupted: Arc<AtomicBool>, 528 } 529 530 impl VcpuState { 531 fn active(&self) -> bool { 532 self.handle.is_some() 533 } 534 535 fn signal_thread(&self) { 536 if let Some(handle) = self.handle.as_ref() { 537 loop { 538 // SAFETY: FFI call with correct arguments 539 unsafe { 540 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 541 } 542 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 543 break; 544 } else { 545 // This is more effective than thread::yield_now() at 546 // avoiding a priority inversion with the vCPU thread 547 thread::sleep(std::time::Duration::from_millis(1)); 548 } 549 } 550 } 551 } 552 553 fn join_thread(&mut self) -> Result<()> { 554 if let Some(handle) = self.handle.take() { 555 handle.join().map_err(Error::ThreadCleanup)? 
556 } 557 558 Ok(()) 559 } 560 561 fn unpark_thread(&self) { 562 if let Some(handle) = self.handle.as_ref() { 563 handle.thread().unpark() 564 } 565 } 566 } 567 568 impl CpuManager { 569 #[allow(unused_variables)] 570 #[allow(clippy::too_many_arguments)] 571 pub fn new( 572 config: &CpusConfig, 573 vm: Arc<dyn hypervisor::Vm>, 574 exit_evt: EventFd, 575 reset_evt: EventFd, 576 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 577 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 578 seccomp_action: SeccompAction, 579 vm_ops: Arc<dyn VmOps>, 580 #[cfg(feature = "tdx")] tdx_enabled: bool, 581 numa_nodes: &NumaNodes, 582 ) -> Result<Arc<Mutex<CpuManager>>> { 583 let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); 584 vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); 585 let hypervisor_type = hypervisor.hypervisor_type(); 586 587 #[cfg(target_arch = "x86_64")] 588 if config.features.amx { 589 const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; 590 const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; 591 const XFEATURE_XTILEDATA: usize = 18; 592 const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; 593 594 // SAFETY: the syscall is only modifing kernel internal 595 // data structures that the kernel is itself expected to safeguard. 596 let amx_tile = unsafe { 597 libc::syscall( 598 libc::SYS_arch_prctl, 599 ARCH_REQ_XCOMP_GUEST_PERM, 600 XFEATURE_XTILEDATA, 601 ) 602 }; 603 604 if amx_tile != 0 { 605 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 606 } else { 607 let mask: usize = 0; 608 // SAFETY: the mask being modified (not marked mutable as it is 609 // modified in unsafe only which is permitted) isn't in use elsewhere. 610 let result = unsafe { 611 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 612 }; 613 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 614 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 615 } 616 } 617 } 618 619 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 620 let mut cpu_list = Vec::new(); 621 for (proximity_domain, numa_node) in numa_nodes.iter() { 622 for cpu in numa_node.cpus.iter() { 623 cpu_list.push((*cpu, *proximity_domain)) 624 } 625 } 626 cpu_list 627 } 628 .into_iter() 629 .collect(); 630 631 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 632 cpu_affinity 633 .iter() 634 .map(|a| (a.vcpu, a.host_cpus.clone())) 635 .collect() 636 } else { 637 BTreeMap::new() 638 }; 639 640 #[cfg(feature = "tdx")] 641 let dynamic = !tdx_enabled; 642 #[cfg(not(feature = "tdx"))] 643 let dynamic = true; 644 645 Ok(Arc::new(Mutex::new(CpuManager { 646 hypervisor_type, 647 config: config.clone(), 648 interrupt_controller: None, 649 #[cfg(target_arch = "x86_64")] 650 cpuid: Vec::new(), 651 vm, 652 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 653 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 654 vcpu_states, 655 exit_evt, 656 reset_evt, 657 #[cfg(feature = "guest_debug")] 658 vm_debug_evt, 659 selected_cpu: 0, 660 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 661 seccomp_action, 662 vm_ops, 663 acpi_address: None, 664 proximity_domain_per_cpu, 665 affinity, 666 dynamic, 667 }))) 668 } 669 670 #[cfg(target_arch = "x86_64")] 671 pub fn populate_cpuid( 672 &mut self, 673 memory_manager: &Arc<Mutex<MemoryManager>>, 674 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 675 #[cfg(feature = "tdx")] tdx_enabled: bool, 676 ) -> Result<()> { 677 let sgx_epc_sections = memory_manager 
678 .lock() 679 .unwrap() 680 .sgx_epc_region() 681 .as_ref() 682 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 683 self.cpuid = { 684 let phys_bits = physical_bits(self.config.max_phys_bits); 685 arch::generate_common_cpuid( 686 hypervisor, 687 self.config 688 .topology 689 .clone() 690 .map(|t| (t.threads_per_core, t.cores_per_die, t.dies_per_package)), 691 sgx_epc_sections, 692 phys_bits, 693 self.config.kvm_hyperv, 694 #[cfg(feature = "tdx")] 695 tdx_enabled, 696 ) 697 .map_err(Error::CommonCpuId)? 698 }; 699 700 Ok(()) 701 } 702 703 fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> { 704 info!("Creating vCPU: cpu_id = {}", cpu_id); 705 706 let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vm_ops.clone()))?; 707 708 if let Some(snapshot) = snapshot { 709 // AArch64 vCPUs should be initialized after created. 710 #[cfg(target_arch = "aarch64")] 711 vcpu.init(&self.vm)?; 712 713 let state: CpuState = snapshot.to_state().map_err(|e| { 714 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e)) 715 })?; 716 vcpu.vcpu 717 .set_state(&state) 718 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?; 719 720 vcpu.saved_state = Some(state); 721 } 722 723 let vcpu = Arc::new(Mutex::new(vcpu)); 724 725 // Adding vCPU to the CpuManager's vCPU list. 726 self.vcpus.push(vcpu.clone()); 727 728 Ok(vcpu) 729 } 730 731 pub fn configure_vcpu( 732 &self, 733 vcpu: Arc<Mutex<Vcpu>>, 734 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 735 ) -> Result<()> { 736 let mut vcpu = vcpu.lock().unwrap(); 737 738 #[cfg(target_arch = "x86_64")] 739 assert!(!self.cpuid.is_empty()); 740 741 #[cfg(target_arch = "x86_64")] 742 vcpu.configure(boot_setup, self.cpuid.clone(), self.config.kvm_hyperv)?; 743 744 #[cfg(target_arch = "aarch64")] 745 vcpu.configure(&self.vm, boot_setup)?; 746 747 Ok(()) 748 } 749 750 /// Only create new vCPUs if there aren't any inactive ones to reuse 751 fn create_vcpus( 752 &mut self, 753 desired_vcpus: u8, 754 snapshot: Option<Snapshot>, 755 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 756 let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![]; 757 info!( 758 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 759 desired_vcpus, 760 self.config.max_vcpus, 761 self.vcpus.len(), 762 self.present_vcpus() 763 ); 764 765 if desired_vcpus > self.config.max_vcpus { 766 return Err(Error::DesiredVCpuCountExceedsMax); 767 } 768 769 // Only create vCPUs in excess of all the allocated vCPUs. 770 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 771 vcpus.push(self.create_vcpu( 772 cpu_id, 773 // TODO: The special format of the CPU id can be removed once 774 // ready to break live upgrade. 775 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()), 776 )?); 777 } 778 779 Ok(vcpus) 780 } 781 782 #[cfg(target_arch = "aarch64")] 783 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 784 for cpu in self.vcpus.iter() { 785 let cpu = cpu.lock().unwrap(); 786 // Check if PMU attr is available, if not, log the information. 
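            // If any vCPU lacks the PMU attribute we return Ok(false) without
            // initializing the remaining vCPUs; Ok(true) therefore means every
            // vCPU had its PMU initialized with `irq`.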
787 if cpu.vcpu.has_pmu_support() { 788 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 789 } else { 790 debug!( 791 "PMU attribute is not supported in vCPU{}, skip PMU init!", 792 cpu.id 793 ); 794 return Ok(false); 795 } 796 } 797 798 Ok(true) 799 } 800 801 pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> { 802 self.vcpus.clone() 803 } 804 805 fn start_vcpu( 806 &mut self, 807 vcpu: Arc<Mutex<Vcpu>>, 808 vcpu_id: u8, 809 vcpu_thread_barrier: Arc<Barrier>, 810 inserting: bool, 811 ) -> Result<()> { 812 let reset_evt = self.reset_evt.try_clone().unwrap(); 813 let exit_evt = self.exit_evt.try_clone().unwrap(); 814 #[cfg(feature = "guest_debug")] 815 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 816 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 817 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 818 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 819 820 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 821 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 822 .vcpu_run_interrupted 823 .clone(); 824 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 825 826 // Prepare the CPU set the current vCPU is expected to run onto. 827 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 828 // SAFETY: all zeros is a valid pattern 829 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 830 // SAFETY: FFI call, trivially safe 831 unsafe { libc::CPU_ZERO(&mut cpuset) }; 832 for host_cpu in host_cpus { 833 // SAFETY: FFI call, trivially safe 834 unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) }; 835 } 836 cpuset 837 }); 838 839 // Retrieve seccomp filter for vcpu thread 840 let vcpu_seccomp_filter = 841 get_seccomp_filter(&self.seccomp_action, Thread::Vcpu, self.hypervisor_type) 842 .map_err(Error::CreateSeccompFilter)?; 843 844 #[cfg(target_arch = "x86_64")] 845 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 846 847 info!("Starting vCPU: cpu_id = {}", vcpu_id); 848 849 let handle = Some( 850 thread::Builder::new() 851 .name(format!("vcpu{vcpu_id}")) 852 .spawn(move || { 853 // Schedule the thread to run on the expected CPU set 854 if let Some(cpuset) = cpuset.as_ref() { 855 // SAFETY: FFI call with correct arguments 856 let ret = unsafe { 857 libc::sched_setaffinity( 858 0, 859 std::mem::size_of::<libc::cpu_set_t>(), 860 cpuset as *const libc::cpu_set_t, 861 ) 862 }; 863 864 if ret != 0 { 865 error!( 866 "Failed scheduling the vCPU {} on the expected CPU set: {}", 867 vcpu_id, 868 io::Error::last_os_error() 869 ); 870 return; 871 } 872 } 873 874 // Apply seccomp filter for vcpu thread. 875 if !vcpu_seccomp_filter.is_empty() { 876 if let Err(e) = 877 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 878 { 879 error!("Error applying seccomp filter: {:?}", e); 880 return; 881 } 882 } 883 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 884 // This uses an async signal safe handler to kill the vcpu handles. 885 register_signal_handler(SIGRTMIN(), handle_signal) 886 .expect("Failed to register vcpu signal handler"); 887 // Block until all CPUs are ready. 888 vcpu_thread_barrier.wait(); 889 890 std::panic::catch_unwind(move || { 891 loop { 892 // If we are being told to pause, we park the thread 893 // until the pause boolean is toggled. 894 // The resume operation is responsible for toggling 895 // the boolean and unpark the thread. 896 // We enter a loop because park() could spuriously 897 // return. 
We will then park() again unless the 898 // pause boolean has been toggled. 899 900 // Need to use Ordering::SeqCst as we have multiple 901 // loads and stores to different atomics and we need 902 // to see them in a consistent order in all threads 903 904 if vcpu_pause_signalled.load(Ordering::SeqCst) { 905 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are 906 // completed by returning to KVM_RUN. From the kernel docs: 907 // 908 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, 909 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding 910 // operations are complete (and guest state is consistent) only after userspace 911 // has re-entered the kernel with KVM_RUN. The kernel side will first finish 912 // incomplete operations and then check for pending signals. 913 // The pending state of the operation is not preserved in state which is 914 // visible to userspace, thus userspace should ensure that the operation is 915 // completed before performing a live migration. Userspace can re-enter the 916 // guest with an unmasked signal pending or with the immediate_exit field set 917 // to complete pending operations without allowing any further instructions 918 // to be executed. 919 920 #[cfg(feature = "kvm")] 921 { 922 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); 923 if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { 924 error!("Unexpected VM exit on \"immediate_exit\" run"); 925 break; 926 } 927 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); 928 } 929 930 vcpu_run_interrupted.store(true, Ordering::SeqCst); 931 while vcpu_pause_signalled.load(Ordering::SeqCst) { 932 thread::park(); 933 } 934 vcpu_run_interrupted.store(false, Ordering::SeqCst); 935 } 936 937 // We've been told to terminate 938 if vcpu_kill_signalled.load(Ordering::SeqCst) 939 || vcpu_kill.load(Ordering::SeqCst) 940 { 941 vcpu_run_interrupted.store(true, Ordering::SeqCst); 942 break; 943 } 944 945 #[cfg(feature = "tdx")] 946 let mut vcpu = vcpu.lock().unwrap(); 947 #[cfg(not(feature = "tdx"))] 948 let vcpu = vcpu.lock().unwrap(); 949 // vcpu.run() returns false on a triple-fault so trigger a reset 950 match vcpu.run() { 951 Ok(run) => match run { 952 #[cfg(feature = "kvm")] 953 VmExit::Debug => { 954 info!("VmExit::Debug"); 955 #[cfg(feature = "guest_debug")] 956 { 957 vcpu_pause_signalled.store(true, Ordering::SeqCst); 958 let raw_tid = get_raw_tid(vcpu_id as usize); 959 vm_debug_evt.write(raw_tid as u64).unwrap(); 960 } 961 } 962 #[cfg(target_arch = "x86_64")] 963 VmExit::IoapicEoi(vector) => { 964 if let Some(interrupt_controller) = 965 &interrupt_controller_clone 966 { 967 interrupt_controller 968 .lock() 969 .unwrap() 970 .end_of_interrupt(vector); 971 } 972 } 973 VmExit::Ignore => {} 974 VmExit::Hyperv => {} 975 VmExit::Reset => { 976 info!("VmExit::Reset"); 977 vcpu_run_interrupted.store(true, Ordering::SeqCst); 978 reset_evt.write(1).unwrap(); 979 break; 980 } 981 VmExit::Shutdown => { 982 info!("VmExit::Shutdown"); 983 vcpu_run_interrupted.store(true, Ordering::SeqCst); 984 exit_evt.write(1).unwrap(); 985 break; 986 } 987 #[cfg(feature = "tdx")] 988 VmExit::Tdx => { 989 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 990 match vcpu.get_tdx_exit_details() { 991 Ok(details) => match details { 992 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 993 TdxExitDetails::SetupEventNotifyInterrupt => { 994 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 995 } 
996 }, 997 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 998 } 999 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1000 } else { 1001 // We should never reach this code as 1002 // this means the design from the code 1003 // is wrong. 1004 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1005 } 1006 } 1007 _ => { 1008 error!( 1009 "VCPU generated error: {:?}", 1010 Error::UnexpectedVmExit 1011 ); 1012 break; 1013 } 1014 }, 1015 1016 Err(e) => { 1017 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1018 break; 1019 } 1020 } 1021 1022 // We've been told to terminate 1023 if vcpu_kill_signalled.load(Ordering::SeqCst) 1024 || vcpu_kill.load(Ordering::SeqCst) 1025 { 1026 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1027 break; 1028 } 1029 } 1030 }) 1031 .or_else(|_| { 1032 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1033 error!("vCPU thread panicked"); 1034 panic_exit_evt.write(1) 1035 }) 1036 .ok(); 1037 }) 1038 .map_err(Error::VcpuSpawn)?, 1039 ); 1040 1041 // On hot plug calls into this function entry_point is None. It is for 1042 // those hotplug CPU additions that we need to set the inserting flag. 1043 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1044 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1045 1046 Ok(()) 1047 } 1048 1049 /// Start up as many vCPUs threads as needed to reach `desired_vcpus` 1050 fn activate_vcpus( 1051 &mut self, 1052 desired_vcpus: u8, 1053 inserting: bool, 1054 paused: Option<bool>, 1055 ) -> Result<()> { 1056 if desired_vcpus > self.config.max_vcpus { 1057 return Err(Error::DesiredVCpuCountExceedsMax); 1058 } 1059 1060 let vcpu_thread_barrier = Arc::new(Barrier::new( 1061 (desired_vcpus - self.present_vcpus() + 1) as usize, 1062 )); 1063 1064 if let Some(paused) = paused { 1065 self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); 1066 } 1067 1068 info!( 1069 "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", 1070 desired_vcpus, 1071 self.vcpus.len(), 1072 self.present_vcpus(), 1073 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1074 ); 1075 1076 // This reuses any inactive vCPUs as well as any that were newly created 1077 for vcpu_id in self.present_vcpus()..desired_vcpus { 1078 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1079 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1080 } 1081 1082 // Unblock all CPU threads. 1083 vcpu_thread_barrier.wait(); 1084 Ok(()) 1085 } 1086 1087 fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) { 1088 // Mark vCPUs for removal, actual removal happens on ejection 1089 for cpu_id in desired_vcpus..self.present_vcpus() { 1090 self.vcpu_states[usize::from(cpu_id)].removing = true; 1091 } 1092 } 1093 1094 fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> { 1095 info!("Removing vCPU: cpu_id = {}", cpu_id); 1096 let mut state = &mut self.vcpu_states[usize::from(cpu_id)]; 1097 state.kill.store(true, Ordering::SeqCst); 1098 state.signal_thread(); 1099 state.join_thread()?; 1100 state.handle = None; 1101 1102 // Once the thread has exited, clear the "kill" so that it can reused 1103 state.kill.store(false, Ordering::SeqCst); 1104 1105 Ok(()) 1106 } 1107 1108 pub fn create_boot_vcpus( 1109 &mut self, 1110 snapshot: Option<Snapshot>, 1111 ) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 1112 trace_scoped!("create_boot_vcpus"); 1113 1114 self.create_vcpus(self.boot_vcpus(), snapshot) 1115 } 1116 1117 // Starts all the vCPUs that the VM is booting with. 
Blocks until all vCPUs are running. 1118 pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> { 1119 self.activate_vcpus(self.boot_vcpus(), false, Some(paused)) 1120 } 1121 1122 pub fn start_restored_vcpus(&mut self) -> Result<()> { 1123 self.activate_vcpus(self.vcpus.len() as u8, false, Some(true)) 1124 .map_err(|e| { 1125 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) 1126 })?; 1127 1128 Ok(()) 1129 } 1130 1131 pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> { 1132 if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal { 1133 return Ok(false); 1134 } 1135 1136 if !self.dynamic { 1137 return Ok(false); 1138 } 1139 1140 match desired_vcpus.cmp(&self.present_vcpus()) { 1141 cmp::Ordering::Greater => { 1142 let vcpus = self.create_vcpus(desired_vcpus, None)?; 1143 for vcpu in vcpus { 1144 self.configure_vcpu(vcpu, None)? 1145 } 1146 self.activate_vcpus(desired_vcpus, true, None)?; 1147 Ok(true) 1148 } 1149 cmp::Ordering::Less => { 1150 self.mark_vcpus_for_removal(desired_vcpus); 1151 Ok(true) 1152 } 1153 _ => Ok(false), 1154 } 1155 } 1156 1157 pub fn shutdown(&mut self) -> Result<()> { 1158 // Tell the vCPUs to stop themselves next time they go through the loop 1159 self.vcpus_kill_signalled.store(true, Ordering::SeqCst); 1160 1161 // Toggle the vCPUs pause boolean 1162 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 1163 1164 // Unpark all the VCPU threads. 1165 for state in self.vcpu_states.iter() { 1166 state.unpark_thread(); 1167 } 1168 1169 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 1170 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 1171 // above. 1172 for state in self.vcpu_states.iter() { 1173 state.signal_thread(); 1174 } 1175 1176 // Wait for all the threads to finish. This removes the state from the vector. 1177 for mut state in self.vcpu_states.drain(..) { 1178 state.join_thread()?; 1179 } 1180 1181 Ok(()) 1182 } 1183 1184 #[cfg(feature = "tdx")] 1185 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1186 for vcpu in &self.vcpus { 1187 vcpu.lock() 1188 .unwrap() 1189 .vcpu 1190 .tdx_init(hob_address) 1191 .map_err(Error::InitializeTdx)?; 1192 } 1193 Ok(()) 1194 } 1195 1196 pub fn boot_vcpus(&self) -> u8 { 1197 self.config.boot_vcpus 1198 } 1199 1200 pub fn max_vcpus(&self) -> u8 { 1201 self.config.max_vcpus 1202 } 1203 1204 #[cfg(target_arch = "x86_64")] 1205 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1206 assert!(!self.cpuid.is_empty()); 1207 self.cpuid.clone() 1208 } 1209 1210 fn present_vcpus(&self) -> u8 { 1211 self.vcpu_states 1212 .iter() 1213 .fold(0, |acc, state| acc + state.active() as u8) 1214 } 1215 1216 #[cfg(target_arch = "aarch64")] 1217 pub fn get_mpidrs(&self) -> Vec<u64> { 1218 self.vcpus 1219 .iter() 1220 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1221 .collect() 1222 } 1223 1224 #[cfg(target_arch = "aarch64")] 1225 pub fn get_saved_states(&self) -> Vec<CpuState> { 1226 self.vcpus 1227 .iter() 1228 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1229 .collect() 1230 } 1231 1232 #[cfg(target_arch = "aarch64")] 1233 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1234 self.config 1235 .topology 1236 .clone() 1237 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1238 } 1239 1240 pub fn create_madt(&self) -> Sdt { 1241 use crate::acpi; 1242 // This is also checked in the commandline parsing. 
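        // The MADT built here contains, on x86_64, one Local APIC entry per
        // possible vCPU (only boot vCPUs carry the enabled flag), one I/O APIC
        // entry and one interrupt source override; on aarch64 it contains one
        // GICC entry per boot vCPU plus the GICD, GICR and GIC ITS structures.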
1243 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1244 1245 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1246 #[cfg(target_arch = "x86_64")] 1247 { 1248 madt.write(36, arch::layout::APIC_START); 1249 1250 for cpu in 0..self.config.max_vcpus { 1251 let lapic = LocalApic { 1252 r#type: acpi::ACPI_APIC_PROCESSOR, 1253 length: 8, 1254 processor_id: cpu, 1255 apic_id: cpu, 1256 flags: if cpu < self.config.boot_vcpus { 1257 1 << MADT_CPU_ENABLE_FLAG 1258 } else { 1259 0 1260 } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG, 1261 }; 1262 madt.append(lapic); 1263 } 1264 1265 madt.append(Ioapic { 1266 r#type: acpi::ACPI_APIC_IO, 1267 length: 12, 1268 ioapic_id: 0, 1269 apic_address: arch::layout::IOAPIC_START.0 as u32, 1270 gsi_base: 0, 1271 ..Default::default() 1272 }); 1273 1274 madt.append(InterruptSourceOverride { 1275 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1276 length: 10, 1277 bus: 0, 1278 source: 4, 1279 gsi: 4, 1280 flags: 0, 1281 }); 1282 } 1283 1284 #[cfg(target_arch = "aarch64")] 1285 { 1286 /* Notes: 1287 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1288 */ 1289 1290 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 1291 for cpu in 0..self.config.boot_vcpus { 1292 let vcpu = &self.vcpus[cpu as usize]; 1293 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1294 /* ARMv8 MPIDR format: 1295 Bits [63:40] Must be zero 1296 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1297 Bits [31:24] Must be zero 1298 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1299 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1300 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1301 */ 1302 let mpidr_mask = 0xff_00ff_ffff; 1303 let gicc = GicC { 1304 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1305 length: 80, 1306 reserved0: 0, 1307 cpu_interface_number: cpu as u32, 1308 uid: cpu as u32, 1309 flags: 1, 1310 parking_version: 0, 1311 performance_interrupt: 0, 1312 parked_address: 0, 1313 base_address: 0, 1314 gicv_base_address: 0, 1315 gich_base_address: 0, 1316 vgic_interrupt: 0, 1317 gicr_base_address: 0, 1318 mpidr: mpidr & mpidr_mask, 1319 proc_power_effi_class: 0, 1320 reserved1: 0, 1321 spe_overflow_interrupt: 0, 1322 }; 1323 1324 madt.append(gicc); 1325 } 1326 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1327 1328 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1329 let gicd = GicD { 1330 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1331 length: 24, 1332 reserved0: 0, 1333 gic_id: 0, 1334 base_address: vgic_config.dist_addr, 1335 global_irq_base: 0, 1336 version: 3, 1337 reserved1: [0; 3], 1338 }; 1339 madt.append(gicd); 1340 1341 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1342 let gicr = GicR { 1343 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1344 length: 16, 1345 reserved: 0, 1346 base_address: vgic_config.redists_addr, 1347 range_length: vgic_config.redists_size as u32, 1348 }; 1349 madt.append(gicr); 1350 1351 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 
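            // As with the GICD and GICR entries above, the ITS base address
            // below comes from the default vGIC configuration
            // (`vgic_config.msi_addr`).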
1352 let gicits = GicIts { 1353 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1354 length: 20, 1355 reserved0: 0, 1356 translation_id: 0, 1357 base_address: vgic_config.msi_addr, 1358 reserved1: 0, 1359 }; 1360 madt.append(gicits); 1361 1362 madt.update_checksum(); 1363 } 1364 1365 madt 1366 } 1367 1368 #[cfg(target_arch = "aarch64")] 1369 pub fn create_pptt(&self) -> Sdt { 1370 let pptt_start = 0; 1371 let mut cpus = 0; 1372 let mut uid = 0; 1373 // If topology is not specified, the default setting is: 1374 // 1 package, multiple cores, 1 thread per core 1375 // This is also the behavior when PPTT is missing. 1376 let (threads_per_core, cores_per_package, packages) = 1377 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1378 1379 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1380 1381 for cluster_idx in 0..packages { 1382 if cpus < self.config.boot_vcpus as usize { 1383 let cluster_offset = pptt.len() - pptt_start; 1384 let cluster_hierarchy_node = ProcessorHierarchyNode { 1385 r#type: 0, 1386 length: 20, 1387 reserved: 0, 1388 flags: 0x2, 1389 parent: 0, 1390 acpi_processor_id: cluster_idx as u32, 1391 num_private_resources: 0, 1392 }; 1393 pptt.append(cluster_hierarchy_node); 1394 1395 for core_idx in 0..cores_per_package { 1396 let core_offset = pptt.len() - pptt_start; 1397 1398 if threads_per_core > 1 { 1399 let core_hierarchy_node = ProcessorHierarchyNode { 1400 r#type: 0, 1401 length: 20, 1402 reserved: 0, 1403 flags: 0x2, 1404 parent: cluster_offset as u32, 1405 acpi_processor_id: core_idx as u32, 1406 num_private_resources: 0, 1407 }; 1408 pptt.append(core_hierarchy_node); 1409 1410 for _thread_idx in 0..threads_per_core { 1411 let thread_hierarchy_node = ProcessorHierarchyNode { 1412 r#type: 0, 1413 length: 20, 1414 reserved: 0, 1415 flags: 0xE, 1416 parent: core_offset as u32, 1417 acpi_processor_id: uid as u32, 1418 num_private_resources: 0, 1419 }; 1420 pptt.append(thread_hierarchy_node); 1421 uid += 1; 1422 } 1423 } else { 1424 let thread_hierarchy_node = ProcessorHierarchyNode { 1425 r#type: 0, 1426 length: 20, 1427 reserved: 0, 1428 flags: 0xA, 1429 parent: cluster_offset as u32, 1430 acpi_processor_id: uid as u32, 1431 num_private_resources: 0, 1432 }; 1433 pptt.append(thread_hierarchy_node); 1434 uid += 1; 1435 } 1436 } 1437 cpus += (cores_per_package * threads_per_core) as usize; 1438 } 1439 } 1440 1441 pptt.update_checksum(); 1442 pptt 1443 } 1444 1445 #[cfg(feature = "guest_debug")] 1446 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1447 self.vcpus[usize::from(cpu_id)] 1448 .lock() 1449 .unwrap() 1450 .vcpu 1451 .get_regs() 1452 .map_err(Error::CpuDebug) 1453 } 1454 1455 #[cfg(feature = "guest_debug")] 1456 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1457 self.vcpus[usize::from(cpu_id)] 1458 .lock() 1459 .unwrap() 1460 .vcpu 1461 .set_regs(regs) 1462 .map_err(Error::CpuDebug) 1463 } 1464 1465 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1466 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1467 self.vcpus[usize::from(cpu_id)] 1468 .lock() 1469 .unwrap() 1470 .vcpu 1471 .get_sregs() 1472 .map_err(Error::CpuDebug) 1473 } 1474 1475 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1476 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1477 self.vcpus[usize::from(cpu_id)] 1478 .lock() 1479 .unwrap() 1480 .vcpu 1481 .set_sregs(sregs) 1482 .map_err(Error::CpuDebug) 1483 } 1484 1485 #[cfg(all(target_arch = "x86_64", feature = 
"guest_debug"))] 1486 fn translate_gva( 1487 &self, 1488 _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1489 cpu_id: u8, 1490 gva: u64, 1491 ) -> Result<u64> { 1492 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1493 .lock() 1494 .unwrap() 1495 .vcpu 1496 .translate_gva(gva, /* flags: unused */ 0) 1497 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1498 Ok(gpa) 1499 } 1500 1501 /// 1502 /// On AArch64, `translate_gva` API is not provided by KVM. We implemented 1503 /// it in VMM by walking through translation tables. 1504 /// 1505 /// Address translation is big topic, here we only focus the scenario that 1506 /// happens in VMM while debugging kernel. This `translate_gva` 1507 /// implementation is restricted to: 1508 /// - Exception Level 1 1509 /// - Translate high address range only (kernel space) 1510 /// 1511 /// This implementation supports following Arm-v8a features related to 1512 /// address translation: 1513 /// - FEAT_LPA 1514 /// - FEAT_LVA 1515 /// - FEAT_LPA2 1516 /// 1517 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 1518 fn translate_gva( 1519 &self, 1520 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 1521 cpu_id: u8, 1522 gva: u64, 1523 ) -> Result<u64> { 1524 let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)] 1525 .lock() 1526 .unwrap() 1527 .vcpu 1528 .get_sys_reg(regs::TCR_EL1) 1529 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1530 let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)] 1531 .lock() 1532 .unwrap() 1533 .vcpu 1534 .get_sys_reg(regs::TTBR1_EL1) 1535 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1536 let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)] 1537 .lock() 1538 .unwrap() 1539 .vcpu 1540 .get_sys_reg(regs::ID_AA64MMFR0_EL1) 1541 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1542 1543 // Bit 55 of the VA determines the range, high (0xFFFxxx...) 1544 // or low (0x000xxx...). 1545 let high_range = extract_bits_64!(gva, 55, 1); 1546 if high_range == 0 { 1547 info!("VA (0x{:x}) range is not supported!", gva); 1548 return Ok(gva); 1549 } 1550 1551 // High range size offset 1552 let tsz = extract_bits_64!(tcr_el1, 16, 6); 1553 // Granule size 1554 let tg = extract_bits_64!(tcr_el1, 30, 2); 1555 // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2 1556 let ds = extract_bits_64!(tcr_el1, 59, 1); 1557 1558 if tsz == 0 { 1559 info!("VA translation is not ready!"); 1560 return Ok(gva); 1561 } 1562 1563 // VA size is determined by TCR_BL1.T1SZ 1564 let va_size = 64 - tsz; 1565 // Number of bits in VA consumed in each level of translation 1566 let stride = match tg { 1567 3 => 13, // 64KB granule size 1568 1 => 11, // 16KB granule size 1569 _ => 9, // 4KB, default 1570 }; 1571 // Starting level of walking 1572 let mut level = 4 - (va_size - 4) / stride; 1573 1574 // PA or IPA size is determined 1575 let tcr_ips = extract_bits_64!(tcr_el1, 32, 3); 1576 #[allow(clippy::identity_op)] 1577 let pa_range = extract_bits_64!(id_aa64mmfr0_el1, 0, 4); 1578 // The IPA size in TCR_BL1 and PA Range in ID_AA64MMFR0_EL1 should match. 1579 // To be safe, we use the minimum value if they are different. 
1580 let pa_range = std::cmp::min(tcr_ips, pa_range); 1581 // PA size in bits 1582 let pa_size = match pa_range { 1583 0 => 32, 1584 1 => 36, 1585 2 => 40, 1586 3 => 42, 1587 4 => 44, 1588 5 => 48, 1589 6 => 52, 1590 _ => { 1591 return Err(Error::TranslateVirtualAddress(anyhow!(format!( 1592 "PA range not supported {pa_range}" 1593 )))) 1594 } 1595 }; 1596 1597 let indexmask_grainsize = (!0u64) >> (64 - (stride + 3)); 1598 let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level)))); 1599 // If FEAT_LPA2 is present, the translation table descriptor holds 1600 // 50 bits of the table address of next level. 1601 // Otherwise, it is 48 bits. 1602 let descaddrmask = if ds == 1 { 1603 !0u64 >> (64 - 50) // mask with 50 least significant bits 1604 } else { 1605 !0u64 >> (64 - 48) // mask with 48 least significant bits 1606 }; 1607 let descaddrmask = descaddrmask & !indexmask_grainsize; 1608 1609 // Translation table base address 1610 #[allow(clippy::identity_op)] 1611 let mut descaddr: u64 = extract_bits_64!(ttbr1_el1, 0, 48); 1612 // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table 1613 // addresss bits [48:51] comes from TTBR1_EL1 bits [2:5]. 1614 if pa_size == 52 { 1615 descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48; 1616 } 1617 1618 // Loop through tables of each level 1619 loop { 1620 // Table offset for current level 1621 let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask; 1622 descaddr |= table_offset; 1623 descaddr &= !7u64; 1624 1625 let mut buf = [0; 8]; 1626 guest_memory 1627 .memory() 1628 .read(&mut buf, GuestAddress(descaddr)) 1629 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1630 let descriptor = u64::from_le_bytes(buf); 1631 1632 descaddr = descriptor & descaddrmask; 1633 // In the case of FEAT_LPA, the next-level translation table address 1634 // bits [48:51] comes from bits [12:15] of the current descriptor. 1635 // For FEAT_LPA2, the next-level translation table address 1636 // bits [50:51] comes from bits [8:9] of the current descriptor, 1637 // bits [48:49] comes from bits [48:49] of the descriptor which was 1638 // handled previously. 1639 if pa_size == 52 { 1640 if ds == 1 { 1641 // FEAT_LPA2 1642 descaddr |= extract_bits_64!(descriptor, 8, 2) << 50; 1643 } else { 1644 // FEAT_LPA 1645 descaddr |= extract_bits_64!(descriptor, 12, 4) << 48; 1646 } 1647 } 1648 1649 if (descriptor & 2) != 0 && (level < 3) { 1650 // This is a table entry. Go down to next level. 
1651 level += 1; 1652 indexmask = indexmask_grainsize; 1653 continue; 1654 } 1655 1656 break; 1657 } 1658 1659 // We have reached either: 1660 // - a page entry at level 3 or 1661 // - a block entry at level 1 or 2 1662 let page_size = 1u64 << ((stride * (4 - level)) + 3); 1663 descaddr &= !(page_size - 1); 1664 descaddr |= gva & (page_size - 1); 1665 1666 Ok(descaddr) 1667 } 1668 1669 pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) { 1670 self.acpi_address = Some(acpi_address); 1671 } 1672 1673 pub(crate) fn set_interrupt_controller( 1674 &mut self, 1675 interrupt_controller: Arc<Mutex<dyn InterruptController>>, 1676 ) { 1677 self.interrupt_controller = Some(interrupt_controller); 1678 } 1679 } 1680 1681 struct Cpu { 1682 cpu_id: u8, 1683 proximity_domain: u32, 1684 dynamic: bool, 1685 } 1686 1687 #[cfg(target_arch = "x86_64")] 1688 const MADT_CPU_ENABLE_FLAG: usize = 0; 1689 1690 #[cfg(target_arch = "x86_64")] 1691 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; 1692 1693 impl Cpu { 1694 #[cfg(target_arch = "x86_64")] 1695 fn generate_mat(&self) -> Vec<u8> { 1696 let lapic = LocalApic { 1697 r#type: 0, 1698 length: 8, 1699 processor_id: self.cpu_id, 1700 apic_id: self.cpu_id, 1701 flags: 1 << MADT_CPU_ENABLE_FLAG, 1702 }; 1703 1704 let mut mat_data: Vec<u8> = Vec::new(); 1705 mat_data.resize(std::mem::size_of_val(&lapic), 0); 1706 // SAFETY: mat_data is large enough to hold lapic 1707 unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic }; 1708 1709 mat_data 1710 } 1711 } 1712 1713 impl Aml for Cpu { 1714 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1715 #[cfg(target_arch = "x86_64")] 1716 let mat_data: Vec<u8> = self.generate_mat(); 1717 #[allow(clippy::if_same_then_else)] 1718 if self.dynamic { 1719 aml::Device::new( 1720 format!("C{:03}", self.cpu_id).as_str().into(), 1721 vec![ 1722 &aml::Name::new("_HID".into(), &"ACPI0007"), 1723 &aml::Name::new("_UID".into(), &self.cpu_id), 1724 // Currently, AArch64 cannot support following fields. 1725 /* 1726 _STA return value: 1727 Bit [0] – Set if the device is present. 1728 Bit [1] – Set if the device is enabled and decoding its resources. 1729 Bit [2] – Set if the device should be shown in the UI. 1730 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1731 Bit [4] – Set if the battery is present. 1732 Bits [31:5] – Reserved (must be cleared). 
1733 */ 1734 #[cfg(target_arch = "x86_64")] 1735 &aml::Method::new( 1736 "_STA".into(), 1737 0, 1738 false, 1739 // Call into CSTA method which will interrogate device 1740 vec![&aml::Return::new(&aml::MethodCall::new( 1741 "CSTA".into(), 1742 vec![&self.cpu_id], 1743 ))], 1744 ), 1745 &aml::Method::new( 1746 "_PXM".into(), 1747 0, 1748 false, 1749 vec![&aml::Return::new(&self.proximity_domain)], 1750 ), 1751 // The Linux kernel expects every CPU device to have a _MAT entry 1752 // containing the LAPIC for this processor with the enabled bit set 1753 // even it if is disabled in the MADT (non-boot CPU) 1754 #[cfg(target_arch = "x86_64")] 1755 &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)), 1756 // Trigger CPU ejection 1757 #[cfg(target_arch = "x86_64")] 1758 &aml::Method::new( 1759 "_EJ0".into(), 1760 1, 1761 false, 1762 // Call into CEJ0 method which will actually eject device 1763 vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])], 1764 ), 1765 ], 1766 ) 1767 .append_aml_bytes(bytes); 1768 } else { 1769 aml::Device::new( 1770 format!("C{:03}", self.cpu_id).as_str().into(), 1771 vec![ 1772 &aml::Name::new("_HID".into(), &"ACPI0007"), 1773 &aml::Name::new("_UID".into(), &self.cpu_id), 1774 #[cfg(target_arch = "x86_64")] 1775 &aml::Method::new( 1776 "_STA".into(), 1777 0, 1778 false, 1779 // Mark CPU present see CSTA implementation 1780 vec![&aml::Return::new(&0xfu8)], 1781 ), 1782 &aml::Method::new( 1783 "_PXM".into(), 1784 0, 1785 false, 1786 vec![&aml::Return::new(&self.proximity_domain)], 1787 ), 1788 // The Linux kernel expects every CPU device to have a _MAT entry 1789 // containing the LAPIC for this processor with the enabled bit set 1790 // even it if is disabled in the MADT (non-boot CPU) 1791 #[cfg(target_arch = "x86_64")] 1792 &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)), 1793 ], 1794 ) 1795 .append_aml_bytes(bytes); 1796 } 1797 } 1798 } 1799 1800 struct CpuNotify { 1801 cpu_id: u8, 1802 } 1803 1804 impl Aml for CpuNotify { 1805 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1806 let object = aml::Path::new(&format!("C{:03}", self.cpu_id)); 1807 aml::If::new( 1808 &aml::Equal::new(&aml::Arg(0), &self.cpu_id), 1809 vec![&aml::Notify::new(&object, &aml::Arg(1))], 1810 ) 1811 .append_aml_bytes(bytes) 1812 } 1813 } 1814 1815 struct CpuMethods { 1816 max_vcpus: u8, 1817 dynamic: bool, 1818 } 1819 1820 impl Aml for CpuMethods { 1821 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1822 if self.dynamic { 1823 // CPU status method 1824 aml::Method::new( 1825 "CSTA".into(), 1826 1, 1827 true, 1828 vec![ 1829 // Take lock defined above 1830 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1831 // Write CPU number (in first argument) to I/O port via field 1832 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 1833 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1834 // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning) 1835 &aml::If::new( 1836 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE), 1837 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 1838 ), 1839 // Release lock 1840 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1841 // Return 0 or 0xf 1842 &aml::Return::new(&aml::Local(0)), 1843 ], 1844 ) 1845 .append_aml_bytes(bytes); 1846 1847 let mut cpu_notifies = Vec::new(); 1848 for cpu_id in 0..self.max_vcpus { 1849 cpu_notifies.push(CpuNotify { cpu_id }); 1850 } 1851 1852 let mut cpu_notifies_refs: Vec<&dyn aml::Aml> = Vec::new(); 1853 for cpu_id in 
0..self.max_vcpus { 1854 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 1855 } 1856 1857 aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).append_aml_bytes(bytes); 1858 1859 aml::Method::new( 1860 "CEJ0".into(), 1861 1, 1862 true, 1863 vec![ 1864 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1865 // Write CPU number (in first argument) to I/O port via field 1866 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 1867 // Set CEJ0 bit 1868 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 1869 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1870 ], 1871 ) 1872 .append_aml_bytes(bytes); 1873 1874 aml::Method::new( 1875 "CSCN".into(), 1876 0, 1877 true, 1878 vec![ 1879 // Take lock defined above 1880 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1881 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1882 &aml::While::new( 1883 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 1884 vec![ 1885 // Write CPU number (in first argument) to I/O port via field 1886 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 1887 // Check if CINS bit is set 1888 &aml::If::new( 1889 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 1890 // Notify device if it is 1891 vec![ 1892 &aml::MethodCall::new( 1893 "CTFY".into(), 1894 vec![&aml::Local(0), &aml::ONE], 1895 ), 1896 // Reset CINS bit 1897 &aml::Store::new( 1898 &aml::Path::new("\\_SB_.PRES.CINS"), 1899 &aml::ONE, 1900 ), 1901 ], 1902 ), 1903 // Check if CRMV bit is set 1904 &aml::If::new( 1905 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 1906 // Notify device if it is (with the eject constant 0x3) 1907 vec![ 1908 &aml::MethodCall::new( 1909 "CTFY".into(), 1910 vec![&aml::Local(0), &3u8], 1911 ), 1912 // Reset CRMV bit 1913 &aml::Store::new( 1914 &aml::Path::new("\\_SB_.PRES.CRMV"), 1915 &aml::ONE, 1916 ), 1917 ], 1918 ), 1919 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 1920 ], 1921 ), 1922 // Release lock 1923 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1924 ], 1925 ) 1926 .append_aml_bytes(bytes) 1927 } else { 1928 aml::Method::new("CSCN".into(), 0, true, vec![]).append_aml_bytes(bytes) 1929 } 1930 } 1931 } 1932 1933 impl Aml for CpuManager { 1934 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1935 #[cfg(target_arch = "x86_64")] 1936 if let Some(acpi_address) = self.acpi_address { 1937 // CPU hotplug controller 1938 aml::Device::new( 1939 "_SB_.PRES".into(), 1940 vec![ 1941 &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")), 1942 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 1943 // Mutex to protect concurrent access as we write to choose CPU and then read back status 1944 &aml::Mutex::new("CPLK".into(), 0), 1945 &aml::Name::new( 1946 "_CRS".into(), 1947 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 1948 aml::AddressSpaceCachable::NotCacheable, 1949 true, 1950 acpi_address.0, 1951 acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 1952 )]), 1953 ), 1954 // OpRegion and Fields map MMIO range into individual field values 1955 &aml::OpRegion::new( 1956 "PRST".into(), 1957 aml::OpRegionSpace::SystemMemory, 1958 acpi_address.0 as usize, 1959 CPU_MANAGER_ACPI_SIZE, 1960 ), 1961 &aml::Field::new( 1962 "PRST".into(), 1963 aml::FieldAccessType::Byte, 1964 aml::FieldUpdateRule::WriteAsZeroes, 1965 vec![ 1966 aml::FieldEntry::Reserved(32), 1967 aml::FieldEntry::Named(*b"CPEN", 1), 1968 aml::FieldEntry::Named(*b"CINS", 1), 1969 aml::FieldEntry::Named(*b"CRMV", 1), 1970 
aml::FieldEntry::Named(*b"CEJ0", 1), 1971 aml::FieldEntry::Reserved(4), 1972 aml::FieldEntry::Named(*b"CCMD", 8), 1973 ], 1974 ), 1975 &aml::Field::new( 1976 "PRST".into(), 1977 aml::FieldAccessType::DWord, 1978 aml::FieldUpdateRule::Preserve, 1979 vec![ 1980 aml::FieldEntry::Named(*b"CSEL", 32), 1981 aml::FieldEntry::Reserved(32), 1982 aml::FieldEntry::Named(*b"CDAT", 32), 1983 ], 1984 ), 1985 ], 1986 ) 1987 .append_aml_bytes(bytes); 1988 } 1989 1990 // CPU devices 1991 let hid = aml::Name::new("_HID".into(), &"ACPI0010"); 1992 let uid = aml::Name::new("_CID".into(), &aml::EisaName::new("PNP0A05")); 1993 // Bundle methods together under a common object 1994 let methods = CpuMethods { 1995 max_vcpus: self.config.max_vcpus, 1996 dynamic: self.dynamic, 1997 }; 1998 let mut cpu_data_inner: Vec<&dyn aml::Aml> = vec![&hid, &uid, &methods]; 1999 2000 let mut cpu_devices = Vec::new(); 2001 for cpu_id in 0..self.config.max_vcpus { 2002 let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0); 2003 let cpu_device = Cpu { 2004 cpu_id, 2005 proximity_domain, 2006 dynamic: self.dynamic, 2007 }; 2008 2009 cpu_devices.push(cpu_device); 2010 } 2011 2012 for cpu_device in cpu_devices.iter() { 2013 cpu_data_inner.push(cpu_device); 2014 } 2015 2016 aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).append_aml_bytes(bytes) 2017 } 2018 } 2019 2020 impl Pausable for CpuManager { 2021 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2022 // Tell the vCPUs to pause themselves next time they exit 2023 self.vcpus_pause_signalled.store(true, Ordering::SeqCst); 2024 2025 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 2026 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 2027 // above. 2028 for state in self.vcpu_states.iter() { 2029 state.signal_thread(); 2030 } 2031 2032 for vcpu in self.vcpus.iter() { 2033 let mut vcpu = vcpu.lock().unwrap(); 2034 vcpu.pause()?; 2035 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2036 if !self.config.kvm_hyperv { 2037 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { 2038 MigratableError::Pause(anyhow!( 2039 "Could not notify guest it has been paused {:?}", 2040 e 2041 )) 2042 })?; 2043 } 2044 } 2045 2046 Ok(()) 2047 } 2048 2049 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2050 for vcpu in self.vcpus.iter() { 2051 vcpu.lock().unwrap().resume()?; 2052 } 2053 2054 // Toggle the vCPUs pause boolean 2055 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 2056 2057 // Unpark all the VCPU threads. 2058 // Once unparked, the next thing they will do is checking for the pause 2059 // boolean. Since it'll be set to false, they will exit their pause loop 2060 // and go back to vmx root. 2061 for state in self.vcpu_states.iter() { 2062 state.unpark_thread(); 2063 } 2064 Ok(()) 2065 } 2066 } 2067 2068 impl Snapshottable for CpuManager { 2069 fn id(&self) -> String { 2070 CPU_MANAGER_SNAPSHOT_ID.to_string() 2071 } 2072 2073 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2074 let mut cpu_manager_snapshot = Snapshot::default(); 2075 2076 // The CpuManager snapshot is a collection of all vCPUs snapshots. 
        for vcpu in &self.vcpus {
            let mut vcpu = vcpu.lock().unwrap();
            cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
        }

        Ok(cpu_manager_snapshot)
    }
}

impl Transportable for CpuManager {}
impl Migratable for CpuManager {}

#[cfg(feature = "guest_debug")]
impl Debuggable for CpuManager {
    #[cfg(feature = "kvm")]
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.vcpus[cpu_id]
            .lock()
            .unwrap()
            .vcpu
            .set_guest_debug(addrs, singlestep)
            .map_err(DebuggableError::SetDebug)
    }

    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        Ok(())
    }

    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let regs = [
            gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
            gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
        ];

        // GDB exposes 32-bit eflags instead of 64-bit rflags.
        // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
        let eflags = gregs.rflags as u32;
        let rip = gregs.rip;

        // Segment registers: CS, SS, DS, ES, FS, GS
        let sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let segments = X86SegmentRegs {
            cs: sregs.cs.selector as u32,
            ss: sregs.ss.selector as u32,
            ds: sregs.ds.selector as u32,
            es: sregs.es.selector as u32,
            fs: sregs.fs.selector as u32,
            gs: sregs.gs.selector as u32,
        };

        // TODO: Add other registers

        Ok(CoreRegs {
            regs,
            eflags,
            rip,
            segments,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "aarch64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        Ok(CoreRegs {
            x: gregs.regs.regs,
            sp: gregs.regs.sp,
            pc: gregs.regs.pc,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let orig_gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let gregs = StandardRegisters {
            rax: regs.regs[0],
            rbx: regs.regs[1],
            rcx: regs.regs[2],
            rdx: regs.regs[3],
            rsi: regs.regs[4],
            rdi: regs.regs[5],
            rbp: regs.regs[6],
            rsp: regs.regs[7],
            r8: regs.regs[8],
            r9: regs.regs[9],
            r10: regs.regs[10],
            r11: regs.regs[11],
            r12: regs.regs[12],
            r13: regs.regs[13],
            r14: regs.regs[14],
            r15: regs.regs[15],
            rip: regs.rip,
            // Update the lower 32 bits of rflags.
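            // GDB only transfers 32-bit eflags, so keep the upper half of the saved
            // rflags and merge in the bits provided by the stub.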
            rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
        };

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // Since GDB cares only about the selectors, we call get_sregs() first.
        let mut sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        sregs.cs.selector = regs.segments.cs as u16;
        sregs.ss.selector = regs.segments.ss as u16;
        sregs.ds.selector = regs.segments.ds as u16;
        sregs.es.selector = regs.segments.es as u16;
        sregs.fs.selector = regs.segments.fs as u16;
        sregs.gs.selector = regs.segments.gs as u16;

        self.set_sregs(cpu_id as u8, &sregs)
            .map_err(DebuggableError::WriteRegs)?;

        // TODO: Add other registers

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let mut gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;

        gregs.regs.regs = regs.x;
        gregs.regs.sp = regs.sp;
        gregs.regs.pc = regs.pc;

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        Ok(())
    }

    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        let mut buf = vec![0; len];
        let mut total_read = 0_u64;

        while total_read < len as u64 {
            let gaddr = vaddr.0 + total_read;
            let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
            guest_memory
                .memory()
                .read(
                    &mut buf[total_read as usize..total_read as usize + read_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::ReadMem)?;
            total_read += read_len;
        }
        Ok(buf)
    }
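
    // Counterpart of read_mem: walk the range one page at a time, translating each
    // guest virtual address to a physical one before writing the slice into guest
    // memory.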
    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        let mut total_written = 0_u64;

        while total_written < data.len() as u64 {
            let gaddr = vaddr.0 + total_written;
            let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let write_len = std::cmp::min(
                data.len() as u64 - total_written,
                psize - (paddr & (psize - 1)),
            );
            guest_memory
                .memory()
                .write(
                    &data[total_written as usize..total_written as usize + write_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::WriteMem)?;
            total_written += write_len;
        }
        Ok(())
    }

    fn active_vcpus(&self) -> usize {
        self.present_vcpus() as usize
    }
}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for CpuManager {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl CpuElf64Writable for CpuManager {
    fn cpu_write_elf64_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Elf, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<X86_64ElfPrStatus>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: NT_PRSTATUS,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
            buf.resize(pos + 4, 0);
            buf.splice(pos.., "CORE".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
            buf.resize(pos + 32 + 4, 0);
            let pid = vcpu_id as u64;
            let bytes: &[u8] = pid.as_slice();
            buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */

            pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();

            let orig_rax: u64 = 0;
            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
                gregs.r10,
            ];
            let regs2 = [
                gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            debug!(
                "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
                gregs.rip,
                gregs.rsp,
                sregs.gs.base,
                sregs.cs.selector,
                sregs.ss.selector,
                sregs.ds.selector,
            );

            let regs = X86_64UserRegs {
                regs1,
                regs2,
                rip: gregs.rip,
                cs: sregs.cs.selector as u64,
                eflags: gregs.rflags,
                rsp: gregs.rsp,
                ss: sregs.ss.selector as u64,
                fs_base: sregs.fs.base,
                gs_base: sregs.gs.base,
                ds: sregs.ds.selector as u64,
                es: sregs.es.selector as u64,
                fs: sregs.fs.selector as u64,
                gs: sregs.gs.selector as u64,
            };

            // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) };
            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }
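
    // Writes one QEMU-style "VMM" note per vCPU carrying the full CPU state
    // (general, segment and control registers plus KERNEL_GS_BASE), complementing
    // the NT_PRSTATUS notes emitted above.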
    fn cpu_write_vmm_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Vmm, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<DumpCpusState>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: 0,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);

            buf.resize(pos + 4, 0);
            buf.splice(pos.., "QEMU".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);

            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
                gregs.rbp,
            ];

            let regs2 = [
                gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
                gregs.r15,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            let mut msrs = vec![MsrEntry {
                index: msr_index::MSR_KERNEL_GS_BASE,
                ..Default::default()
            }];

            self.vcpus[vcpu_id as usize]
                .lock()
                .unwrap()
                .vcpu
                .get_msrs(&mut msrs)
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
            let kernel_gs_base = msrs[0].data;

            let cs = CpuSegment::new(sregs.cs);
            let ds = CpuSegment::new(sregs.ds);
            let es = CpuSegment::new(sregs.es);
            let fs = CpuSegment::new(sregs.fs);
            let gs = CpuSegment::new(sregs.gs);
            let ss = CpuSegment::new(sregs.ss);
            let ldt = CpuSegment::new(sregs.ldt);
            let tr = CpuSegment::new(sregs.tr);
            let gdt = CpuSegment::new_from_table(sregs.gdt);
            let idt = CpuSegment::new_from_table(sregs.idt);
            let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
            let regs = DumpCpusState {
                version: 1,
                size: size_of::<DumpCpusState>() as u32,
                regs1,
                regs2,
                rip: gregs.rip,
                rflags: gregs.rflags,
                cs,
                ds,
                es,
                fs,
                gs,
                ss,
                ldt,
                tr,
                gdt,
                idt,
                cr,
                kernel_gs_base,
            };

            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use arch::x86_64::interrupts::*;
    use arch::x86_64::regs::*;
    use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};

    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail if there is no irqchip beforehand.
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
        let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
        let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr stays 0, so the assert below would fail; it is left commented out
        // until we decide whether to remove it entirely.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }];

        // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read
        // one in this test case.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we did setup_msrs. We need to assert that the
        // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
        // expect.
        let entry_vec = vcpu.boot_msr_entries();
        assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
    }

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let expected_regs: StandardRegisters = StandardRegisters {
            rflags: 0x0000000000000002u64,
            rbx: arch::layout::PVH_INFO_START.0,
            rip: 1,
            ..Default::default()
        };

        setup_regs(&vcpu, expected_regs.rip).unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use arch::{aarch64::regs, layout};
    use hypervisor::kvm::aarch64::is_system_register;
    use hypervisor::kvm::kvm_bindings::{
        kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
        KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
    };
    use hypervisor::{arm64_core_reg_id, offset__of};
    use std::mem;

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
        // Must fail when vcpu is not initialized yet.
        assert!(res.is_err());

        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();
        vcpu.vcpu_init(&kvi).unwrap();

        assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
    }

    #[test]
    fn test_read_mpidr() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());

        vcpu.vcpu_init(&kvi).unwrap();
        assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
    }

    #[test]
    fn test_is_system_register() {
        let offset = offset__of!(user_pt_regs, pc);
        let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
        assert!(!is_system_register(regid));
        let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
        assert!(is_system_register(regid));
    }

    #[test]
    fn test_save_restore_core_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        let res = vcpu.get_regs();
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to get core register: Exec format error (os error 8)"
        );

        let mut state = kvm_regs::default();
        let res = vcpu.set_regs(&state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to set core register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        let res = vcpu.get_regs();
        assert!(res.is_ok());
        state = res.unwrap();
        assert_eq!(state.regs.pstate, 0x3C5);

        assert!(vcpu.set_regs(&state).is_ok());
    }

    #[test]
    fn test_get_set_mpstate() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        let res = vcpu.get_mp_state();
        assert!(res.is_ok());
        assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
    }
}