1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::CpusConfig; 15 #[cfg(feature = "guest_debug")] 16 use crate::coredump::{ 17 CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable, 18 GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE, 19 NT_PRSTATUS, 20 }; 21 #[cfg(feature = "guest_debug")] 22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError}; 23 use crate::memory_manager::MemoryManager; 24 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 25 #[cfg(target_arch = "x86_64")] 26 use crate::vm::physical_bits; 27 use crate::GuestMemoryMmap; 28 use crate::CPU_MANAGER_SNAPSHOT_ID; 29 use acpi_tables::{aml, aml::Aml, sdt::Sdt}; 30 use anyhow::anyhow; 31 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 32 use arch::aarch64::regs; 33 use arch::EntryPoint; 34 use arch::NumaNodes; 35 #[cfg(target_arch = "aarch64")] 36 use devices::gic::Gic; 37 use devices::interrupt_controller::InterruptController; 38 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 39 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 40 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 41 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs}; 42 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 43 use hypervisor::aarch64::StandardRegisters; 44 #[cfg(feature = "guest_debug")] 45 use hypervisor::arch::x86::msr_index; 46 #[cfg(target_arch = "x86_64")] 47 use hypervisor::arch::x86::CpuIdEntry; 48 #[cfg(feature = "guest_debug")] 49 use hypervisor::arch::x86::MsrEntry; 50 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 51 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters}; 52 #[cfg(target_arch = "aarch64")] 53 use hypervisor::kvm::kvm_bindings; 54 #[cfg(feature = "tdx")] 55 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus}; 56 use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps}; 57 use libc::{c_void, siginfo_t}; 58 #[cfg(feature = "guest_debug")] 59 use linux_loader::elf::Elf64_Nhdr; 60 use seccompiler::{apply_filter, SeccompAction}; 61 use std::collections::BTreeMap; 62 #[cfg(feature = "guest_debug")] 63 use std::io::Write; 64 #[cfg(feature = "guest_debug")] 65 use std::mem::size_of; 66 use std::os::unix::thread::JoinHandleExt; 67 use std::sync::atomic::{AtomicBool, Ordering}; 68 use std::sync::{Arc, Barrier, Mutex}; 69 use std::{cmp, io, result, thread}; 70 use thiserror::Error; 71 use tracer::trace_scoped; 72 use vm_device::BusDevice; 73 #[cfg(feature = "guest_debug")] 74 use vm_memory::ByteValued; 75 #[cfg(feature = "guest_debug")] 76 use vm_memory::{Bytes, GuestAddressSpace}; 77 use vm_memory::{GuestAddress, GuestMemoryAtomic}; 78 use vm_migration::{ 79 Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable, 80 Transportable, 81 }; 82 use vmm_sys_util::eventfd::EventFd; 83 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN}; 84 85 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 86 /// Extract the specified bits of a 64-bit integer. 
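/// `$offset` is the position of the least significant bit to extract and
/// `$length` is the number of bits to keep.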
87 /// For example, to extrace 2 bits from offset 1 (zero based) of `6u64`, 88 /// following expression should return 3 (`0b11`): 89 /// `extract_bits_64!(0b0000_0110u64, 1, 2)` 90 /// 91 macro_rules! extract_bits_64 { 92 ($value: tt, $offset: tt, $length: tt) => { 93 ($value >> $offset) & (!0u64 >> (64 - $length)) 94 }; 95 } 96 97 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc; 98 99 #[derive(Debug, Error)] 100 pub enum Error { 101 #[error("Error creating vCPU: {0}")] 102 VcpuCreate(#[source] anyhow::Error), 103 104 #[error("Error running bCPU: {0}")] 105 VcpuRun(#[source] anyhow::Error), 106 107 #[error("Error spawning vCPU thread: {0}")] 108 VcpuSpawn(#[source] io::Error), 109 110 #[error("Error generating common CPUID: {0}")] 111 CommonCpuId(#[source] arch::Error), 112 113 #[error("Error configuring vCPU: {0}")] 114 VcpuConfiguration(#[source] arch::Error), 115 116 #[cfg(target_arch = "aarch64")] 117 #[error("Error fetching preferred target: {0}")] 118 VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError), 119 120 #[cfg(target_arch = "aarch64")] 121 #[error("Error initialising vCPU: {0}")] 122 VcpuArmInit(#[source] hypervisor::HypervisorCpuError), 123 124 #[error("Failed to join on vCPU threads: {0:?}")] 125 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 126 127 #[error("Error adding CpuManager to MMIO bus: {0}")] 128 BusError(#[source] vm_device::BusError), 129 130 #[error("Requested vCPUs exceed maximum")] 131 DesiredVCpuCountExceedsMax, 132 133 #[error("Cannot create seccomp filter: {0}")] 134 CreateSeccompFilter(#[source] seccompiler::Error), 135 136 #[error("Cannot apply seccomp filter: {0}")] 137 ApplySeccompFilter(#[source] seccompiler::Error), 138 139 #[error("Error starting vCPU after restore: {0}")] 140 StartRestoreVcpu(#[source] anyhow::Error), 141 142 #[error("Unexpected VmExit")] 143 UnexpectedVmExit, 144 145 #[error("Failed to allocate MMIO address for CpuManager")] 146 AllocateMmmioAddress, 147 148 #[cfg(feature = "tdx")] 149 #[error("Error initializing TDX: {0}")] 150 InitializeTdx(#[source] hypervisor::HypervisorCpuError), 151 152 #[cfg(target_arch = "aarch64")] 153 #[error("Error initializing PMU: {0}")] 154 InitPmu(#[source] hypervisor::HypervisorCpuError), 155 156 #[cfg(feature = "guest_debug")] 157 #[error("Error during CPU debug: {0}")] 158 CpuDebug(#[source] hypervisor::HypervisorCpuError), 159 160 #[cfg(feature = "guest_debug")] 161 #[error("Error translating virtual address: {0}")] 162 TranslateVirtualAddress(#[source] anyhow::Error), 163 164 #[cfg(target_arch = "x86_64")] 165 #[error("Error setting up AMX: {0}")] 166 AmxEnable(#[source] anyhow::Error), 167 } 168 pub type Result<T> = result::Result<T, Error>; 169 170 #[cfg(target_arch = "x86_64")] 171 #[allow(dead_code)] 172 #[repr(packed)] 173 struct LocalApic { 174 pub r#type: u8, 175 pub length: u8, 176 pub processor_id: u8, 177 pub apic_id: u8, 178 pub flags: u32, 179 } 180 181 #[allow(dead_code)] 182 #[repr(packed)] 183 #[derive(Default)] 184 struct Ioapic { 185 pub r#type: u8, 186 pub length: u8, 187 pub ioapic_id: u8, 188 _reserved: u8, 189 pub apic_address: u32, 190 pub gsi_base: u32, 191 } 192 193 #[cfg(target_arch = "aarch64")] 194 #[allow(dead_code)] 195 #[repr(packed)] 196 struct GicC { 197 pub r#type: u8, 198 pub length: u8, 199 pub reserved0: u16, 200 pub cpu_interface_number: u32, 201 pub uid: u32, 202 pub flags: u32, 203 pub parking_version: u32, 204 pub performance_interrupt: u32, 205 pub parked_address: u64, 206 pub base_address: u64, 207 pub gicv_base_address: 
u64, 208 pub gich_base_address: u64, 209 pub vgic_interrupt: u32, 210 pub gicr_base_address: u64, 211 pub mpidr: u64, 212 pub proc_power_effi_class: u8, 213 pub reserved1: u8, 214 pub spe_overflow_interrupt: u16, 215 } 216 217 #[cfg(target_arch = "aarch64")] 218 #[allow(dead_code)] 219 #[repr(packed)] 220 struct GicD { 221 pub r#type: u8, 222 pub length: u8, 223 pub reserved0: u16, 224 pub gic_id: u32, 225 pub base_address: u64, 226 pub global_irq_base: u32, 227 pub version: u8, 228 pub reserved1: [u8; 3], 229 } 230 231 #[cfg(target_arch = "aarch64")] 232 #[allow(dead_code)] 233 #[repr(packed)] 234 struct GicR { 235 pub r#type: u8, 236 pub length: u8, 237 pub reserved: u16, 238 pub base_address: u64, 239 pub range_length: u32, 240 } 241 242 #[cfg(target_arch = "aarch64")] 243 #[allow(dead_code)] 244 #[repr(packed)] 245 struct GicIts { 246 pub r#type: u8, 247 pub length: u8, 248 pub reserved0: u16, 249 pub translation_id: u32, 250 pub base_address: u64, 251 pub reserved1: u32, 252 } 253 254 #[cfg(target_arch = "aarch64")] 255 #[allow(dead_code)] 256 #[repr(packed)] 257 struct ProcessorHierarchyNode { 258 pub r#type: u8, 259 pub length: u8, 260 pub reserved: u16, 261 pub flags: u32, 262 pub parent: u32, 263 pub acpi_processor_id: u32, 264 pub num_private_resources: u32, 265 } 266 267 #[allow(dead_code)] 268 #[repr(packed)] 269 #[derive(Default)] 270 struct InterruptSourceOverride { 271 pub r#type: u8, 272 pub length: u8, 273 pub bus: u8, 274 pub source: u8, 275 pub gsi: u32, 276 pub flags: u16, 277 } 278 279 #[cfg(feature = "guest_debug")] 280 macro_rules! round_up { 281 ($n:expr,$d:expr) => { 282 (($n / ($d + 1)) + 1) * $d 283 }; 284 } 285 286 /// A wrapper around creating and using a kvm-based VCPU. 287 pub struct Vcpu { 288 // The hypervisor abstracted CPU. 289 vcpu: Arc<dyn hypervisor::Vcpu>, 290 id: u8, 291 #[cfg(target_arch = "aarch64")] 292 mpidr: u64, 293 saved_state: Option<CpuState>, 294 } 295 296 impl Vcpu { 297 /// Constructs a new VCPU for `vm`. 298 /// 299 /// # Arguments 300 /// 301 /// * `id` - Represents the CPU number between [0, max vcpus). 302 /// * `vm` - The virtual machine this vcpu will get attached to. 303 /// * `vm_ops` - Optional object for exit handling. 304 pub fn new( 305 id: u8, 306 vm: &Arc<dyn hypervisor::Vm>, 307 vm_ops: Option<Arc<dyn VmOps>>, 308 ) -> Result<Self> { 309 let vcpu = vm 310 .create_vcpu(id, vm_ops) 311 .map_err(|e| Error::VcpuCreate(e.into()))?; 312 // Initially the cpuid per vCPU is the one supported by this VM. 313 Ok(Vcpu { 314 vcpu, 315 id, 316 #[cfg(target_arch = "aarch64")] 317 mpidr: 0, 318 saved_state: None, 319 }) 320 } 321 322 /// Configures a vcpu and should be called once per vcpu when created. 323 /// 324 /// # Arguments 325 /// 326 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 327 /// * `vm_memory` - Guest memory. 328 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 
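    /// * `kvm_hyperv` - (x86_64) Whether KVM Hyper-V emulation is enabled.
    ///
    /// A minimal x86_64 usage sketch, assuming the caller has already prepared
    /// `entry_point`, `guest_memory` and `cpuid`:
    ///
    /// ```ignore
    /// vcpu.configure(Some(entry_point), &guest_memory, cpuid.clone(), false)?;
    /// ```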
329 pub fn configure( 330 &mut self, 331 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 332 kernel_entry_point: Option<EntryPoint>, 333 #[cfg(target_arch = "x86_64")] vm_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 334 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 335 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 336 ) -> Result<()> { 337 #[cfg(target_arch = "aarch64")] 338 { 339 self.init(vm)?; 340 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, kernel_entry_point) 341 .map_err(Error::VcpuConfiguration)?; 342 } 343 info!("Configuring vCPU: cpu_id = {}", self.id); 344 #[cfg(target_arch = "x86_64")] 345 arch::configure_vcpu( 346 &self.vcpu, 347 self.id, 348 kernel_entry_point, 349 vm_memory, 350 cpuid, 351 kvm_hyperv, 352 ) 353 .map_err(Error::VcpuConfiguration)?; 354 355 Ok(()) 356 } 357 358 /// Gets the MPIDR register value. 359 #[cfg(target_arch = "aarch64")] 360 pub fn get_mpidr(&self) -> u64 { 361 self.mpidr 362 } 363 364 /// Gets the saved vCPU state. 365 #[cfg(target_arch = "aarch64")] 366 pub fn get_saved_state(&self) -> Option<CpuState> { 367 self.saved_state.clone() 368 } 369 370 /// Initializes an aarch64 specific vcpu for booting Linux. 371 #[cfg(target_arch = "aarch64")] 372 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 373 let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); 374 375 // This reads back the kernel's preferred target type. 376 vm.get_preferred_target(&mut kvi) 377 .map_err(Error::VcpuArmPreferredTarget)?; 378 // We already checked that the capability is supported. 379 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; 380 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3; 381 // Non-boot cpus are powered off initially. 382 if self.id > 0 { 383 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; 384 } 385 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit) 386 } 387 388 /// Runs the VCPU until it exits, returning the reason. 389 /// 390 /// Note that the state of the VCPU and associated VM must be setup first for this to do 391 /// anything useful. 
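    /// The returned `VmExit` describes why the guest stopped (I/O, reset,
    /// shutdown, ...) so that the caller's run loop can handle it and then
    /// re-enter the vCPU.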
392 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 393 self.vcpu.run() 394 } 395 } 396 397 const VCPU_SNAPSHOT_ID: &str = "vcpu"; 398 impl Pausable for Vcpu {} 399 impl Snapshottable for Vcpu { 400 fn id(&self) -> String { 401 VCPU_SNAPSHOT_ID.to_string() 402 } 403 404 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 405 let saved_state = self 406 .vcpu 407 .state() 408 .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?; 409 410 let mut vcpu_snapshot = Snapshot::new(&format!("{:03}", self.id)); 411 vcpu_snapshot.add_data_section(SnapshotDataSection::new_from_state( 412 VCPU_SNAPSHOT_ID, 413 &saved_state, 414 )?); 415 416 self.saved_state = Some(saved_state); 417 418 Ok(vcpu_snapshot) 419 } 420 421 fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> { 422 let saved_state: CpuState = snapshot.to_state(VCPU_SNAPSHOT_ID)?; 423 424 self.vcpu 425 .set_state(&saved_state) 426 .map_err(|e| MigratableError::Pause(anyhow!("Could not set the vCPU state {:?}", e)))?; 427 428 self.saved_state = Some(saved_state); 429 430 Ok(()) 431 } 432 } 433 434 pub struct CpuManager { 435 hypervisor_type: HypervisorType, 436 config: CpusConfig, 437 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 438 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 439 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 440 vm_memory: GuestMemoryAtomic<GuestMemoryMmap>, 441 #[cfg(target_arch = "x86_64")] 442 cpuid: Vec<CpuIdEntry>, 443 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 444 vm: Arc<dyn hypervisor::Vm>, 445 vcpus_kill_signalled: Arc<AtomicBool>, 446 vcpus_pause_signalled: Arc<AtomicBool>, 447 exit_evt: EventFd, 448 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 449 reset_evt: EventFd, 450 #[cfg(feature = "guest_debug")] 451 vm_debug_evt: EventFd, 452 vcpu_states: Vec<VcpuState>, 453 selected_cpu: u8, 454 vcpus: Vec<Arc<Mutex<Vcpu>>>, 455 seccomp_action: SeccompAction, 456 vm_ops: Arc<dyn VmOps>, 457 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 458 acpi_address: Option<GuestAddress>, 459 proximity_domain_per_cpu: BTreeMap<u8, u32>, 460 affinity: BTreeMap<u8, Vec<u8>>, 461 dynamic: bool, 462 } 463 464 const CPU_ENABLE_FLAG: usize = 0; 465 const CPU_INSERTING_FLAG: usize = 1; 466 const CPU_REMOVING_FLAG: usize = 2; 467 const CPU_EJECT_FLAG: usize = 3; 468 469 const CPU_STATUS_OFFSET: u64 = 4; 470 const CPU_SELECTION_OFFSET: u64 = 0; 471 472 impl BusDevice for CpuManager { 473 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 474 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
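        // Guest-visible layout (matching the CSEL/CPEN/CINS/CRMV/CEJ0 AML
        // fields declared in `impl Aml for CpuManager`): offset 0 selects a
        // vCPU, offset 4 exposes that vCPU's status bits as defined by the
        // CPU_*_FLAG constants above.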
475 data.fill(0); 476 477 match offset { 478 CPU_SELECTION_OFFSET => { 479 data[0] = self.selected_cpu; 480 } 481 CPU_STATUS_OFFSET => { 482 if self.selected_cpu < self.max_vcpus() { 483 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 484 if state.active() { 485 data[0] |= 1 << CPU_ENABLE_FLAG; 486 } 487 if state.inserting { 488 data[0] |= 1 << CPU_INSERTING_FLAG; 489 } 490 if state.removing { 491 data[0] |= 1 << CPU_REMOVING_FLAG; 492 } 493 } else { 494 warn!("Out of range vCPU id: {}", self.selected_cpu); 495 } 496 } 497 _ => { 498 warn!( 499 "Unexpected offset for accessing CPU manager device: {:#}", 500 offset 501 ); 502 } 503 } 504 } 505 506 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 507 match offset { 508 CPU_SELECTION_OFFSET => { 509 self.selected_cpu = data[0]; 510 } 511 CPU_STATUS_OFFSET => { 512 if self.selected_cpu < self.max_vcpus() { 513 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 514 // The ACPI code writes back a 1 to acknowledge the insertion 515 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 516 && state.inserting 517 { 518 state.inserting = false; 519 } 520 // Ditto for removal 521 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 522 && state.removing 523 { 524 state.removing = false; 525 } 526 // Trigger removal of vCPU 527 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 528 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 529 error!("Error removing vCPU: {:?}", e); 530 } 531 } 532 } else { 533 warn!("Out of range vCPU id: {}", self.selected_cpu); 534 } 535 } 536 _ => { 537 warn!( 538 "Unexpected offset for accessing CPU manager device: {:#}", 539 offset 540 ); 541 } 542 } 543 None 544 } 545 } 546 547 #[derive(Default)] 548 struct VcpuState { 549 inserting: bool, 550 removing: bool, 551 handle: Option<thread::JoinHandle<()>>, 552 kill: Arc<AtomicBool>, 553 vcpu_run_interrupted: Arc<AtomicBool>, 554 } 555 556 impl VcpuState { 557 fn active(&self) -> bool { 558 self.handle.is_some() 559 } 560 561 fn signal_thread(&self) { 562 if let Some(handle) = self.handle.as_ref() { 563 loop { 564 // SAFETY: FFI call with correct arguments 565 unsafe { 566 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 567 } 568 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 569 break; 570 } else { 571 // This is more effective than thread::yield_now() at 572 // avoiding a priority inversion with the vCPU thread 573 thread::sleep(std::time::Duration::from_millis(1)); 574 } 575 } 576 } 577 } 578 579 fn join_thread(&mut self) -> Result<()> { 580 if let Some(handle) = self.handle.take() { 581 handle.join().map_err(Error::ThreadCleanup)? 
582 } 583 584 Ok(()) 585 } 586 587 fn unpark_thread(&self) { 588 if let Some(handle) = self.handle.as_ref() { 589 handle.thread().unpark() 590 } 591 } 592 } 593 594 impl CpuManager { 595 #[allow(unused_variables)] 596 #[allow(clippy::too_many_arguments)] 597 pub fn new( 598 config: &CpusConfig, 599 memory_manager: &Arc<Mutex<MemoryManager>>, 600 vm: Arc<dyn hypervisor::Vm>, 601 exit_evt: EventFd, 602 reset_evt: EventFd, 603 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 604 hypervisor: Arc<dyn hypervisor::Hypervisor>, 605 seccomp_action: SeccompAction, 606 vm_ops: Arc<dyn VmOps>, 607 #[cfg(feature = "tdx")] tdx_enabled: bool, 608 numa_nodes: &NumaNodes, 609 ) -> Result<Arc<Mutex<CpuManager>>> { 610 let guest_memory = memory_manager.lock().unwrap().guest_memory(); 611 let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); 612 vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); 613 let hypervisor_type = hypervisor.hypervisor_type(); 614 615 #[cfg(target_arch = "x86_64")] 616 let sgx_epc_sections = memory_manager 617 .lock() 618 .unwrap() 619 .sgx_epc_region() 620 .as_ref() 621 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 622 #[cfg(target_arch = "x86_64")] 623 let cpuid = { 624 let phys_bits = physical_bits(config.max_phys_bits); 625 arch::generate_common_cpuid( 626 hypervisor, 627 config 628 .topology 629 .clone() 630 .map(|t| (t.threads_per_core, t.cores_per_die, t.dies_per_package)), 631 sgx_epc_sections, 632 phys_bits, 633 config.kvm_hyperv, 634 #[cfg(feature = "tdx")] 635 tdx_enabled, 636 ) 637 .map_err(Error::CommonCpuId)? 638 }; 639 #[cfg(target_arch = "x86_64")] 640 if config.features.amx { 641 const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; 642 const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; 643 const XFEATURE_XTILEDATA: usize = 18; 644 const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; 645 646 // SAFETY: the syscall is only modifing kernel internal 647 // data structures that the kernel is itself expected to safeguard. 648 let amx_tile = unsafe { 649 libc::syscall( 650 libc::SYS_arch_prctl, 651 ARCH_REQ_XCOMP_GUEST_PERM, 652 XFEATURE_XTILEDATA, 653 ) 654 }; 655 656 if amx_tile != 0 { 657 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 658 } else { 659 let mask: usize = 0; 660 // SAFETY: the mask being modified (not marked mutable as it is 661 // modified in unsafe only which is permitted) isn't in use elsewhere. 
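            // ARCH_REQ_XCOMP_GUEST_PERM above asked the kernel to grant the
            // guest permission to use the XTILEDATA state component;
            // ARCH_GET_XCOMP_GUEST_PERM reads the granted bitmask back so we
            // can verify that the request actually took effect.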
662 let result = unsafe { 663 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 664 }; 665 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 666 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 667 } 668 } 669 } 670 671 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 672 let mut cpu_list = Vec::new(); 673 for (proximity_domain, numa_node) in numa_nodes.iter() { 674 for cpu in numa_node.cpus.iter() { 675 cpu_list.push((*cpu, *proximity_domain)) 676 } 677 } 678 cpu_list 679 } 680 .into_iter() 681 .collect(); 682 683 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 684 cpu_affinity 685 .iter() 686 .map(|a| (a.vcpu, a.host_cpus.clone())) 687 .collect() 688 } else { 689 BTreeMap::new() 690 }; 691 692 #[cfg(feature = "tdx")] 693 let dynamic = !tdx_enabled; 694 #[cfg(not(feature = "tdx"))] 695 let dynamic = true; 696 697 Ok(Arc::new(Mutex::new(CpuManager { 698 hypervisor_type, 699 config: config.clone(), 700 interrupt_controller: None, 701 vm_memory: guest_memory, 702 #[cfg(target_arch = "x86_64")] 703 cpuid, 704 vm, 705 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 706 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 707 vcpu_states, 708 exit_evt, 709 reset_evt, 710 #[cfg(feature = "guest_debug")] 711 vm_debug_evt, 712 selected_cpu: 0, 713 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 714 seccomp_action, 715 vm_ops, 716 acpi_address: None, 717 proximity_domain_per_cpu, 718 affinity, 719 dynamic, 720 }))) 721 } 722 723 fn create_vcpu(&mut self, cpu_id: u8) -> Result<Arc<Mutex<Vcpu>>> { 724 info!("Creating vCPU: cpu_id = {}", cpu_id); 725 726 let vcpu = Arc::new(Mutex::new(Vcpu::new( 727 cpu_id, 728 &self.vm, 729 Some(self.vm_ops.clone()), 730 )?)); 731 732 // Adding vCPU to the CpuManager's vCPU list. 733 self.vcpus.push(vcpu.clone()); 734 735 Ok(vcpu) 736 } 737 738 pub fn configure_vcpu( 739 &self, 740 vcpu: Arc<Mutex<Vcpu>>, 741 entry_point: Option<EntryPoint>, 742 snapshot: Option<Snapshot>, 743 ) -> Result<()> { 744 let mut vcpu = vcpu.lock().unwrap(); 745 746 if let Some(snapshot) = snapshot { 747 // AArch64 vCPUs should be initialized after created. 748 #[cfg(target_arch = "aarch64")] 749 vcpu.init(&self.vm)?; 750 751 vcpu.restore(snapshot).expect("Failed to restore vCPU"); 752 } else { 753 #[cfg(target_arch = "x86_64")] 754 vcpu.configure( 755 entry_point, 756 &self.vm_memory, 757 self.cpuid.clone(), 758 self.config.kvm_hyperv, 759 ) 760 .expect("Failed to configure vCPU"); 761 762 #[cfg(target_arch = "aarch64")] 763 vcpu.configure(&self.vm, entry_point) 764 .expect("Failed to configure vCPU"); 765 } 766 767 Ok(()) 768 } 769 770 /// Only create new vCPUs if there aren't any inactive ones to reuse 771 fn create_vcpus(&mut self, desired_vcpus: u8) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 772 let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![]; 773 info!( 774 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 775 desired_vcpus, 776 self.config.max_vcpus, 777 self.vcpus.len(), 778 self.present_vcpus() 779 ); 780 781 if desired_vcpus > self.config.max_vcpus { 782 return Err(Error::DesiredVCpuCountExceedsMax); 783 } 784 785 // Only create vCPUs in excess of all the allocated vCPUs. 
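        // For example, with 2 vCPUs already allocated and desired_vcpus == 4,
        // only ids 2 and 3 are created here; existing but inactive vCPUs are
        // reused when activate_vcpus() starts their threads.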
786 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 787 vcpus.push(self.create_vcpu(cpu_id)?); 788 } 789 790 Ok(vcpus) 791 } 792 793 #[cfg(target_arch = "aarch64")] 794 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 795 for cpu in self.vcpus.iter() { 796 let cpu = cpu.lock().unwrap(); 797 // Check if PMU attr is available, if not, log the information. 798 if cpu.vcpu.has_pmu_support() { 799 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 800 } else { 801 debug!( 802 "PMU attribute is not supported in vCPU{}, skip PMU init!", 803 cpu.id 804 ); 805 return Ok(false); 806 } 807 } 808 809 Ok(true) 810 } 811 812 pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> { 813 self.vcpus.clone() 814 } 815 816 fn start_vcpu( 817 &mut self, 818 vcpu: Arc<Mutex<Vcpu>>, 819 vcpu_id: u8, 820 vcpu_thread_barrier: Arc<Barrier>, 821 inserting: bool, 822 ) -> Result<()> { 823 let reset_evt = self.reset_evt.try_clone().unwrap(); 824 let exit_evt = self.exit_evt.try_clone().unwrap(); 825 #[cfg(feature = "guest_debug")] 826 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 827 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 828 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 829 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 830 831 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 832 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 833 .vcpu_run_interrupted 834 .clone(); 835 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 836 837 // Prepare the CPU set the current vCPU is expected to run onto. 838 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 839 // SAFETY: all zeros is a valid pattern 840 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 841 // SAFETY: FFI call, trivially safe 842 unsafe { libc::CPU_ZERO(&mut cpuset) }; 843 for host_cpu in host_cpus { 844 // SAFETY: FFI call, trivially safe 845 unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) }; 846 } 847 cpuset 848 }); 849 850 // Retrieve seccomp filter for vcpu thread 851 let vcpu_seccomp_filter = 852 get_seccomp_filter(&self.seccomp_action, Thread::Vcpu, self.hypervisor_type) 853 .map_err(Error::CreateSeccompFilter)?; 854 855 #[cfg(target_arch = "x86_64")] 856 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 857 858 info!("Starting vCPU: cpu_id = {}", vcpu_id); 859 860 let handle = Some( 861 thread::Builder::new() 862 .name(format!("vcpu{}", vcpu_id)) 863 .spawn(move || { 864 // Schedule the thread to run on the expected CPU set 865 if let Some(cpuset) = cpuset.as_ref() { 866 // SAFETY: FFI call with correct arguments 867 let ret = unsafe { 868 libc::sched_setaffinity( 869 0, 870 std::mem::size_of::<libc::cpu_set_t>(), 871 cpuset as *const libc::cpu_set_t, 872 ) 873 }; 874 875 if ret != 0 { 876 error!( 877 "Failed scheduling the vCPU {} on the expected CPU set: {}", 878 vcpu_id, 879 io::Error::last_os_error() 880 ); 881 return; 882 } 883 } 884 885 // Apply seccomp filter for vcpu thread. 886 if !vcpu_seccomp_filter.is_empty() { 887 if let Err(e) = 888 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 889 { 890 error!("Error applying seccomp filter: {:?}", e); 891 return; 892 } 893 } 894 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 895 // This uses an async signal safe handler to kill the vcpu handles. 896 register_signal_handler(SIGRTMIN(), handle_signal) 897 .expect("Failed to register vcpu signal handler"); 898 // Block until all CPUs are ready. 
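                    // The barrier was sized by activate_vcpus() to cover every
                    // vCPU thread being started plus the caller, so no vCPU
                    // enters its run loop until all of them have been spawned
                    // and the caller releases the barrier.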
899 vcpu_thread_barrier.wait(); 900 901 std::panic::catch_unwind(move || { 902 loop { 903 // If we are being told to pause, we park the thread 904 // until the pause boolean is toggled. 905 // The resume operation is responsible for toggling 906 // the boolean and unpark the thread. 907 // We enter a loop because park() could spuriously 908 // return. We will then park() again unless the 909 // pause boolean has been toggled. 910 911 // Need to use Ordering::SeqCst as we have multiple 912 // loads and stores to different atomics and we need 913 // to see them in a consistent order in all threads 914 915 if vcpu_pause_signalled.load(Ordering::SeqCst) { 916 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are 917 // completed by returning to KVM_RUN. From the kernel docs: 918 // 919 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, 920 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding 921 // operations are complete (and guest state is consistent) only after userspace 922 // has re-entered the kernel with KVM_RUN. The kernel side will first finish 923 // incomplete operations and then check for pending signals. 924 // The pending state of the operation is not preserved in state which is 925 // visible to userspace, thus userspace should ensure that the operation is 926 // completed before performing a live migration. Userspace can re-enter the 927 // guest with an unmasked signal pending or with the immediate_exit field set 928 // to complete pending operations without allowing any further instructions 929 // to be executed. 930 931 #[cfg(feature = "kvm")] 932 { 933 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); 934 if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { 935 error!("Unexpected VM exit on \"immediate_exit\" run"); 936 break; 937 } 938 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); 939 } 940 941 vcpu_run_interrupted.store(true, Ordering::SeqCst); 942 while vcpu_pause_signalled.load(Ordering::SeqCst) { 943 thread::park(); 944 } 945 vcpu_run_interrupted.store(false, Ordering::SeqCst); 946 } 947 948 // We've been told to terminate 949 if vcpu_kill_signalled.load(Ordering::SeqCst) 950 || vcpu_kill.load(Ordering::SeqCst) 951 { 952 vcpu_run_interrupted.store(true, Ordering::SeqCst); 953 break; 954 } 955 956 #[cfg(feature = "tdx")] 957 let mut vcpu = vcpu.lock().unwrap(); 958 #[cfg(not(feature = "tdx"))] 959 let vcpu = vcpu.lock().unwrap(); 960 // vcpu.run() returns false on a triple-fault so trigger a reset 961 match vcpu.run() { 962 Ok(run) => match run { 963 #[cfg(feature = "kvm")] 964 VmExit::Debug => { 965 info!("VmExit::Debug"); 966 #[cfg(feature = "guest_debug")] 967 { 968 vcpu_pause_signalled.store(true, Ordering::SeqCst); 969 let raw_tid = get_raw_tid(vcpu_id as usize); 970 vm_debug_evt.write(raw_tid as u64).unwrap(); 971 } 972 } 973 #[cfg(target_arch = "x86_64")] 974 VmExit::IoapicEoi(vector) => { 975 if let Some(interrupt_controller) = 976 &interrupt_controller_clone 977 { 978 interrupt_controller 979 .lock() 980 .unwrap() 981 .end_of_interrupt(vector); 982 } 983 } 984 VmExit::Ignore => {} 985 VmExit::Hyperv => {} 986 VmExit::Reset => { 987 info!("VmExit::Reset"); 988 vcpu_run_interrupted.store(true, Ordering::SeqCst); 989 reset_evt.write(1).unwrap(); 990 break; 991 } 992 VmExit::Shutdown => { 993 info!("VmExit::Shutdown"); 994 vcpu_run_interrupted.store(true, Ordering::SeqCst); 995 exit_evt.write(1).unwrap(); 996 break; 997 } 998 #[cfg(feature = "tdx")] 999 
VmExit::Tdx => { 1000 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1001 match vcpu.get_tdx_exit_details() { 1002 Ok(details) => match details { 1003 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1004 TdxExitDetails::SetupEventNotifyInterrupt => { 1005 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1006 } 1007 }, 1008 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1009 } 1010 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1011 } else { 1012 // We should never reach this code as 1013 // this means the design from the code 1014 // is wrong. 1015 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1016 } 1017 } 1018 _ => { 1019 error!( 1020 "VCPU generated error: {:?}", 1021 Error::UnexpectedVmExit 1022 ); 1023 break; 1024 } 1025 }, 1026 1027 Err(e) => { 1028 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1029 break; 1030 } 1031 } 1032 1033 // We've been told to terminate 1034 if vcpu_kill_signalled.load(Ordering::SeqCst) 1035 || vcpu_kill.load(Ordering::SeqCst) 1036 { 1037 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1038 break; 1039 } 1040 } 1041 }) 1042 .or_else(|_| { 1043 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1044 error!("vCPU thread panicked"); 1045 panic_exit_evt.write(1) 1046 }) 1047 .ok(); 1048 }) 1049 .map_err(Error::VcpuSpawn)?, 1050 ); 1051 1052 // On hot plug calls into this function entry_point is None. It is for 1053 // those hotplug CPU additions that we need to set the inserting flag. 1054 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1055 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1056 1057 Ok(()) 1058 } 1059 1060 /// Start up as many vCPUs threads as needed to reach `desired_vcpus` 1061 fn activate_vcpus( 1062 &mut self, 1063 desired_vcpus: u8, 1064 inserting: bool, 1065 paused: Option<bool>, 1066 ) -> Result<()> { 1067 if desired_vcpus > self.config.max_vcpus { 1068 return Err(Error::DesiredVCpuCountExceedsMax); 1069 } 1070 1071 let vcpu_thread_barrier = Arc::new(Barrier::new( 1072 (desired_vcpus - self.present_vcpus() + 1) as usize, 1073 )); 1074 1075 if let Some(paused) = paused { 1076 self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); 1077 } 1078 1079 info!( 1080 "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", 1081 desired_vcpus, 1082 self.vcpus.len(), 1083 self.present_vcpus(), 1084 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1085 ); 1086 1087 // This reuses any inactive vCPUs as well as any that were newly created 1088 for vcpu_id in self.present_vcpus()..desired_vcpus { 1089 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1090 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1091 } 1092 1093 // Unblock all CPU threads. 
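        // This is the "+ 1" participant in the barrier created above: once the
        // caller reaches it, all freshly spawned vCPU threads are released at
        // the same time.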
1094 vcpu_thread_barrier.wait(); 1095 Ok(()) 1096 } 1097 1098 fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) { 1099 // Mark vCPUs for removal, actual removal happens on ejection 1100 for cpu_id in desired_vcpus..self.present_vcpus() { 1101 self.vcpu_states[usize::from(cpu_id)].removing = true; 1102 } 1103 } 1104 1105 fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> { 1106 info!("Removing vCPU: cpu_id = {}", cpu_id); 1107 let mut state = &mut self.vcpu_states[usize::from(cpu_id)]; 1108 state.kill.store(true, Ordering::SeqCst); 1109 state.signal_thread(); 1110 state.join_thread()?; 1111 state.handle = None; 1112 1113 // Once the thread has exited, clear the "kill" so that it can reused 1114 state.kill.store(false, Ordering::SeqCst); 1115 1116 Ok(()) 1117 } 1118 1119 pub fn create_boot_vcpus(&mut self) -> Result<Vec<Arc<Mutex<Vcpu>>>> { 1120 trace_scoped!("create_boot_vcpus"); 1121 1122 self.create_vcpus(self.boot_vcpus()) 1123 } 1124 1125 // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running. 1126 pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> { 1127 self.activate_vcpus(self.boot_vcpus(), false, Some(paused)) 1128 } 1129 1130 pub fn start_restored_vcpus(&mut self) -> Result<()> { 1131 self.activate_vcpus(self.vcpus.len() as u8, false, Some(true)) 1132 .map_err(|e| { 1133 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) 1134 })?; 1135 1136 Ok(()) 1137 } 1138 1139 pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> { 1140 if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal { 1141 return Ok(false); 1142 } 1143 1144 if !self.dynamic { 1145 return Ok(false); 1146 } 1147 1148 match desired_vcpus.cmp(&self.present_vcpus()) { 1149 cmp::Ordering::Greater => { 1150 let vcpus = self.create_vcpus(desired_vcpus)?; 1151 for vcpu in vcpus { 1152 self.configure_vcpu(vcpu, None, None)? 1153 } 1154 self.activate_vcpus(desired_vcpus, true, None)?; 1155 Ok(true) 1156 } 1157 cmp::Ordering::Less => { 1158 self.mark_vcpus_for_removal(desired_vcpus); 1159 Ok(true) 1160 } 1161 _ => Ok(false), 1162 } 1163 } 1164 1165 pub fn shutdown(&mut self) -> Result<()> { 1166 // Tell the vCPUs to stop themselves next time they go through the loop 1167 self.vcpus_kill_signalled.store(true, Ordering::SeqCst); 1168 1169 // Toggle the vCPUs pause boolean 1170 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 1171 1172 // Unpark all the VCPU threads. 1173 for state in self.vcpu_states.iter() { 1174 state.unpark_thread(); 1175 } 1176 1177 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 1178 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 1179 // above. 1180 for state in self.vcpu_states.iter() { 1181 state.signal_thread(); 1182 } 1183 1184 // Wait for all the threads to finish. This removes the state from the vector. 1185 for mut state in self.vcpu_states.drain(..) 
{ 1186 state.join_thread()?; 1187 } 1188 1189 Ok(()) 1190 } 1191 1192 #[cfg(feature = "tdx")] 1193 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1194 for vcpu in &self.vcpus { 1195 vcpu.lock() 1196 .unwrap() 1197 .vcpu 1198 .tdx_init(hob_address) 1199 .map_err(Error::InitializeTdx)?; 1200 } 1201 Ok(()) 1202 } 1203 1204 pub fn boot_vcpus(&self) -> u8 { 1205 self.config.boot_vcpus 1206 } 1207 1208 pub fn max_vcpus(&self) -> u8 { 1209 self.config.max_vcpus 1210 } 1211 1212 #[cfg(target_arch = "x86_64")] 1213 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1214 self.cpuid.clone() 1215 } 1216 1217 fn present_vcpus(&self) -> u8 { 1218 self.vcpu_states 1219 .iter() 1220 .fold(0, |acc, state| acc + state.active() as u8) 1221 } 1222 1223 #[cfg(target_arch = "aarch64")] 1224 pub fn get_mpidrs(&self) -> Vec<u64> { 1225 self.vcpus 1226 .iter() 1227 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1228 .collect() 1229 } 1230 1231 #[cfg(target_arch = "aarch64")] 1232 pub fn get_saved_states(&self) -> Vec<CpuState> { 1233 self.vcpus 1234 .iter() 1235 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1236 .collect() 1237 } 1238 1239 #[cfg(target_arch = "aarch64")] 1240 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1241 self.config 1242 .topology 1243 .clone() 1244 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1245 } 1246 1247 pub fn create_madt(&self) -> Sdt { 1248 use crate::acpi; 1249 // This is also checked in the commandline parsing. 1250 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1251 1252 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1253 #[cfg(target_arch = "x86_64")] 1254 { 1255 madt.write(36, arch::layout::APIC_START); 1256 1257 for cpu in 0..self.config.max_vcpus { 1258 let lapic = LocalApic { 1259 r#type: acpi::ACPI_APIC_PROCESSOR, 1260 length: 8, 1261 processor_id: cpu, 1262 apic_id: cpu, 1263 flags: if cpu < self.config.boot_vcpus { 1264 1 << MADT_CPU_ENABLE_FLAG 1265 } else { 1266 0 1267 } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG, 1268 }; 1269 madt.append(lapic); 1270 } 1271 1272 madt.append(Ioapic { 1273 r#type: acpi::ACPI_APIC_IO, 1274 length: 12, 1275 ioapic_id: 0, 1276 apic_address: arch::layout::IOAPIC_START.0 as u32, 1277 gsi_base: 0, 1278 ..Default::default() 1279 }); 1280 1281 madt.append(InterruptSourceOverride { 1282 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1283 length: 10, 1284 bus: 0, 1285 source: 4, 1286 gsi: 4, 1287 flags: 0, 1288 }); 1289 } 1290 1291 #[cfg(target_arch = "aarch64")] 1292 { 1293 /* Notes: 1294 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1295 */ 1296 1297 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 
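            // One GICC entry is emitted per boot vCPU; `flags: 1` marks the
            // interface enabled and the MPIDR affinity fields (Aff0-Aff3) are
            // copied from the vCPU's MPIDR with the reserved bits masked off.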
1298 for cpu in 0..self.config.boot_vcpus { 1299 let vcpu = &self.vcpus[cpu as usize]; 1300 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1301 /* ARMv8 MPIDR format: 1302 Bits [63:40] Must be zero 1303 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1304 Bits [31:24] Must be zero 1305 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1306 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1307 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1308 */ 1309 let mpidr_mask = 0xff_00ff_ffff; 1310 let gicc = GicC { 1311 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1312 length: 80, 1313 reserved0: 0, 1314 cpu_interface_number: cpu as u32, 1315 uid: cpu as u32, 1316 flags: 1, 1317 parking_version: 0, 1318 performance_interrupt: 0, 1319 parked_address: 0, 1320 base_address: 0, 1321 gicv_base_address: 0, 1322 gich_base_address: 0, 1323 vgic_interrupt: 0, 1324 gicr_base_address: 0, 1325 mpidr: mpidr & mpidr_mask, 1326 proc_power_effi_class: 0, 1327 reserved1: 0, 1328 spe_overflow_interrupt: 0, 1329 }; 1330 1331 madt.append(gicc); 1332 } 1333 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1334 1335 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1336 let gicd = GicD { 1337 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1338 length: 24, 1339 reserved0: 0, 1340 gic_id: 0, 1341 base_address: vgic_config.dist_addr, 1342 global_irq_base: 0, 1343 version: 3, 1344 reserved1: [0; 3], 1345 }; 1346 madt.append(gicd); 1347 1348 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1349 let gicr = GicR { 1350 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1351 length: 16, 1352 reserved: 0, 1353 base_address: vgic_config.redists_addr, 1354 range_length: vgic_config.redists_size as u32, 1355 }; 1356 madt.append(gicr); 1357 1358 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 1359 let gicits = GicIts { 1360 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1361 length: 20, 1362 reserved0: 0, 1363 translation_id: 0, 1364 base_address: vgic_config.msi_addr, 1365 reserved1: 0, 1366 }; 1367 madt.append(gicits); 1368 1369 madt.update_checksum(); 1370 } 1371 1372 madt 1373 } 1374 1375 #[cfg(target_arch = "aarch64")] 1376 pub fn create_pptt(&self) -> Sdt { 1377 let pptt_start = 0; 1378 let mut cpus = 0; 1379 let mut uid = 0; 1380 // If topology is not specified, the default setting is: 1381 // 1 package, multiple cores, 1 thread per core 1382 // This is also the behavior when PPTT is missing. 
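        // For example, with threads_per_core = 2, cores_per_package = 2 and
        // packages = 1, the table below contains one package node (flags 0x2),
        // two core nodes (flags 0x2) and four thread leaf nodes (flags 0xE),
        // one per vCPU.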
1383 let (threads_per_core, cores_per_package, packages) = 1384 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1385 1386 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1387 1388 for cluster_idx in 0..packages { 1389 if cpus < self.config.boot_vcpus as usize { 1390 let cluster_offset = pptt.len() - pptt_start; 1391 let cluster_hierarchy_node = ProcessorHierarchyNode { 1392 r#type: 0, 1393 length: 20, 1394 reserved: 0, 1395 flags: 0x2, 1396 parent: 0, 1397 acpi_processor_id: cluster_idx as u32, 1398 num_private_resources: 0, 1399 }; 1400 pptt.append(cluster_hierarchy_node); 1401 1402 for core_idx in 0..cores_per_package { 1403 let core_offset = pptt.len() - pptt_start; 1404 1405 if threads_per_core > 1 { 1406 let core_hierarchy_node = ProcessorHierarchyNode { 1407 r#type: 0, 1408 length: 20, 1409 reserved: 0, 1410 flags: 0x2, 1411 parent: cluster_offset as u32, 1412 acpi_processor_id: core_idx as u32, 1413 num_private_resources: 0, 1414 }; 1415 pptt.append(core_hierarchy_node); 1416 1417 for _thread_idx in 0..threads_per_core { 1418 let thread_hierarchy_node = ProcessorHierarchyNode { 1419 r#type: 0, 1420 length: 20, 1421 reserved: 0, 1422 flags: 0xE, 1423 parent: core_offset as u32, 1424 acpi_processor_id: uid as u32, 1425 num_private_resources: 0, 1426 }; 1427 pptt.append(thread_hierarchy_node); 1428 uid += 1; 1429 } 1430 } else { 1431 let thread_hierarchy_node = ProcessorHierarchyNode { 1432 r#type: 0, 1433 length: 20, 1434 reserved: 0, 1435 flags: 0xA, 1436 parent: cluster_offset as u32, 1437 acpi_processor_id: uid as u32, 1438 num_private_resources: 0, 1439 }; 1440 pptt.append(thread_hierarchy_node); 1441 uid += 1; 1442 } 1443 } 1444 cpus += (cores_per_package * threads_per_core) as usize; 1445 } 1446 } 1447 1448 pptt.update_checksum(); 1449 pptt 1450 } 1451 1452 #[cfg(feature = "guest_debug")] 1453 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1454 self.vcpus[usize::from(cpu_id)] 1455 .lock() 1456 .unwrap() 1457 .vcpu 1458 .get_regs() 1459 .map_err(Error::CpuDebug) 1460 } 1461 1462 #[cfg(feature = "guest_debug")] 1463 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1464 self.vcpus[usize::from(cpu_id)] 1465 .lock() 1466 .unwrap() 1467 .vcpu 1468 .set_regs(regs) 1469 .map_err(Error::CpuDebug) 1470 } 1471 1472 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1473 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1474 self.vcpus[usize::from(cpu_id)] 1475 .lock() 1476 .unwrap() 1477 .vcpu 1478 .get_sregs() 1479 .map_err(Error::CpuDebug) 1480 } 1481 1482 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1483 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1484 self.vcpus[usize::from(cpu_id)] 1485 .lock() 1486 .unwrap() 1487 .vcpu 1488 .set_sregs(sregs) 1489 .map_err(Error::CpuDebug) 1490 } 1491 1492 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1493 fn translate_gva(&self, cpu_id: u8, gva: u64) -> Result<u64> { 1494 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1495 .lock() 1496 .unwrap() 1497 .vcpu 1498 .translate_gva(gva, /* flags: unused */ 0) 1499 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1500 Ok(gpa) 1501 } 1502 1503 /// 1504 /// On AArch64, `translate_gva` API is not provided by KVM. We implemented 1505 /// it in VMM by walking through translation tables. 1506 /// 1507 /// Address translation is big topic, here we only focus the scenario that 1508 /// happens in VMM while debugging kernel. 
    /// This `translate_gva` implementation is restricted to:
    /// - Exception Level 1
    /// - Translate high address range only (kernel space)
    ///
    /// This implementation supports the following Armv8-A features related to
    /// address translation:
    /// - FEAT_LPA
    /// - FEAT_LVA
    /// - FEAT_LPA2
    ///
    #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
    fn translate_gva(&self, cpu_id: u8, gva: u64) -> Result<u64> {
        let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TCR_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TTBR1_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::ID_AA64MMFR0_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;

        // Bit 55 of the VA determines the range, high (0xFFFxxx...)
        // or low (0x000xxx...).
        let high_range = extract_bits_64!(gva, 55, 1);
        if high_range == 0 {
            info!("VA (0x{:x}) range is not supported!", gva);
            return Ok(gva);
        }

        // High range size offset
        let tsz = extract_bits_64!(tcr_el1, 16, 6);
        // Granule size
        let tg = extract_bits_64!(tcr_el1, 30, 2);
        // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
        let ds = extract_bits_64!(tcr_el1, 59, 1);

        if tsz == 0 {
            info!("VA translation is not ready!");
            return Ok(gva);
        }

        // VA size is determined by TCR_EL1.T1SZ
        let va_size = 64 - tsz;
        // Number of bits in VA consumed in each level of translation
        let stride = match tg {
            3 => 13, // 64KB granule size
            1 => 11, // 16KB granule size
            _ => 9,  // 4KB, default
        };
        // Starting level of walking
        let mut level = 4 - (va_size - 4) / stride;

        // PA or IPA size is determined
        let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
        #[allow(clippy::identity_op)]
        let pa_range = extract_bits_64!(id_aa64mmfr0_el1, 0, 4);
        // The IPA size in TCR_EL1 and the PA Range in ID_AA64MMFR0_EL1 should match.
        // To be safe, we use the minimum value if they are different.
        let pa_range = std::cmp::min(tcr_ips, pa_range);
        // PA size in bits
        let pa_size = match pa_range {
            0 => 32,
            1 => 36,
            2 => 40,
            3 => 42,
            4 => 44,
            5 => 48,
            6 => 52,
            _ => {
                return Err(Error::TranslateVirtualAddress(anyhow!(format!(
                    "PA range not supported {}",
                    pa_range
                ))))
            }
        };

        let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
        let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
        // If FEAT_LPA2 is present, the translation table descriptor holds
        // 50 bits of the table address of the next level.
        // Otherwise, it is 48 bits.
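        // Worked example (assuming 4KB granules, i.e. tg == 0 and stride == 9,
        // with TCR_EL1.T1SZ == 16): va_size == 48, the walk starts at level
        // 4 - (48 - 4) / 9 == 0, each level consumes 9 VA bits and the final
        // page offset is 12 bits.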
        let descaddrmask = if ds == 1 {
            !0u64 >> (64 - 50) // mask with 50 least significant bits
        } else {
            !0u64 >> (64 - 48) // mask with 48 least significant bits
        };
        let descaddrmask = descaddrmask & !indexmask_grainsize;

        // Translation table base address
        #[allow(clippy::identity_op)]
        let mut descaddr: u64 = extract_bits_64!(ttbr1_el1, 0, 48);
        // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
        // address bits [48:51] come from TTBR1_EL1 bits [2:5].
        if pa_size == 52 {
            descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
        }

        // Loop through tables of each level
        loop {
            // Table offset for current level
            let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
            descaddr |= table_offset;
            descaddr &= !7u64;

            let mut buf = [0; 8];
            self.vm_memory
                .memory()
                .read(&mut buf, GuestAddress(descaddr))
                .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
            let descriptor = u64::from_le_bytes(buf);

            descaddr = descriptor & descaddrmask;
            // In the case of FEAT_LPA, the next-level translation table address
            // bits [48:51] come from bits [12:15] of the current descriptor.
            // For FEAT_LPA2, the next-level translation table address
            // bits [50:51] come from bits [8:9] of the current descriptor and
            // bits [48:49] come from bits [48:49] of the descriptor which was
            // handled previously.
            if pa_size == 52 {
                if ds == 1 {
                    // FEAT_LPA2
                    descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
                } else {
                    // FEAT_LPA
                    descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
                }
            }

            if (descriptor & 2) != 0 && (level < 3) {
                // This is a table entry. Go down to the next level.
1649 level += 1; 1650 indexmask = indexmask_grainsize; 1651 continue; 1652 } 1653 1654 break; 1655 } 1656 1657 // We have reached either: 1658 // - a page entry at level 3 or 1659 // - a block entry at level 1 or 2 1660 let page_size = 1u64 << ((stride * (4 - level)) + 3); 1661 descaddr &= !(page_size - 1); 1662 descaddr |= gva & (page_size - 1); 1663 1664 Ok(descaddr) 1665 } 1666 1667 pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) { 1668 self.acpi_address = Some(acpi_address); 1669 } 1670 1671 pub(crate) fn set_interrupt_controller( 1672 &mut self, 1673 interrupt_controller: Arc<Mutex<dyn InterruptController>>, 1674 ) { 1675 self.interrupt_controller = Some(interrupt_controller); 1676 } 1677 } 1678 1679 struct Cpu { 1680 cpu_id: u8, 1681 proximity_domain: u32, 1682 dynamic: bool, 1683 } 1684 1685 #[cfg(target_arch = "x86_64")] 1686 const MADT_CPU_ENABLE_FLAG: usize = 0; 1687 1688 #[cfg(target_arch = "x86_64")] 1689 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; 1690 1691 impl Cpu { 1692 #[cfg(target_arch = "x86_64")] 1693 fn generate_mat(&self) -> Vec<u8> { 1694 let lapic = LocalApic { 1695 r#type: 0, 1696 length: 8, 1697 processor_id: self.cpu_id, 1698 apic_id: self.cpu_id, 1699 flags: 1 << MADT_CPU_ENABLE_FLAG, 1700 }; 1701 1702 let mut mat_data: Vec<u8> = Vec::new(); 1703 mat_data.resize(std::mem::size_of_val(&lapic), 0); 1704 // SAFETY: mat_data is large enough to hold lapic 1705 unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic }; 1706 1707 mat_data 1708 } 1709 } 1710 1711 impl Aml for Cpu { 1712 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1713 #[cfg(target_arch = "x86_64")] 1714 let mat_data: Vec<u8> = self.generate_mat(); 1715 #[allow(clippy::if_same_then_else)] 1716 if self.dynamic { 1717 aml::Device::new( 1718 format!("C{:03}", self.cpu_id).as_str().into(), 1719 vec![ 1720 &aml::Name::new("_HID".into(), &"ACPI0007"), 1721 &aml::Name::new("_UID".into(), &self.cpu_id), 1722 // Currently, AArch64 cannot support following fields. 1723 /* 1724 _STA return value: 1725 Bit [0] – Set if the device is present. 1726 Bit [1] – Set if the device is enabled and decoding its resources. 1727 Bit [2] – Set if the device should be shown in the UI. 1728 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1729 Bit [4] – Set if the battery is present. 1730 Bits [31:5] – Reserved (must be cleared). 
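                A device that is present, enabled, shown in the UI and
                functioning therefore reports _STA = 0xF, which is what the
                CSTA method below returns for an enabled vCPU.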
1731 */ 1732 #[cfg(target_arch = "x86_64")] 1733 &aml::Method::new( 1734 "_STA".into(), 1735 0, 1736 false, 1737 // Call into CSTA method which will interrogate device 1738 vec![&aml::Return::new(&aml::MethodCall::new( 1739 "CSTA".into(), 1740 vec![&self.cpu_id], 1741 ))], 1742 ), 1743 &aml::Method::new( 1744 "_PXM".into(), 1745 0, 1746 false, 1747 vec![&aml::Return::new(&self.proximity_domain)], 1748 ), 1749 // The Linux kernel expects every CPU device to have a _MAT entry 1750 // containing the LAPIC for this processor with the enabled bit set 1751 // even it if is disabled in the MADT (non-boot CPU) 1752 #[cfg(target_arch = "x86_64")] 1753 &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)), 1754 // Trigger CPU ejection 1755 #[cfg(target_arch = "x86_64")] 1756 &aml::Method::new( 1757 "_EJ0".into(), 1758 1, 1759 false, 1760 // Call into CEJ0 method which will actually eject device 1761 vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])], 1762 ), 1763 ], 1764 ) 1765 .append_aml_bytes(bytes); 1766 } else { 1767 aml::Device::new( 1768 format!("C{:03}", self.cpu_id).as_str().into(), 1769 vec![ 1770 &aml::Name::new("_HID".into(), &"ACPI0007"), 1771 &aml::Name::new("_UID".into(), &self.cpu_id), 1772 #[cfg(target_arch = "x86_64")] 1773 &aml::Method::new( 1774 "_STA".into(), 1775 0, 1776 false, 1777 // Mark CPU present see CSTA implementation 1778 vec![&aml::Return::new(&0xfu8)], 1779 ), 1780 &aml::Method::new( 1781 "_PXM".into(), 1782 0, 1783 false, 1784 vec![&aml::Return::new(&self.proximity_domain)], 1785 ), 1786 // The Linux kernel expects every CPU device to have a _MAT entry 1787 // containing the LAPIC for this processor with the enabled bit set 1788 // even it if is disabled in the MADT (non-boot CPU) 1789 #[cfg(target_arch = "x86_64")] 1790 &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)), 1791 ], 1792 ) 1793 .append_aml_bytes(bytes); 1794 } 1795 } 1796 } 1797 1798 struct CpuNotify { 1799 cpu_id: u8, 1800 } 1801 1802 impl Aml for CpuNotify { 1803 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1804 let object = aml::Path::new(&format!("C{:03}", self.cpu_id)); 1805 aml::If::new( 1806 &aml::Equal::new(&aml::Arg(0), &self.cpu_id), 1807 vec![&aml::Notify::new(&object, &aml::Arg(1))], 1808 ) 1809 .append_aml_bytes(bytes) 1810 } 1811 } 1812 1813 struct CpuMethods { 1814 max_vcpus: u8, 1815 dynamic: bool, 1816 } 1817 1818 impl Aml for CpuMethods { 1819 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1820 if self.dynamic { 1821 // CPU status method 1822 aml::Method::new( 1823 "CSTA".into(), 1824 1, 1825 true, 1826 vec![ 1827 // Take lock defined above 1828 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1829 // Write CPU number (in first argument) to I/O port via field 1830 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 1831 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1832 // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning) 1833 &aml::If::new( 1834 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE), 1835 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 1836 ), 1837 // Release lock 1838 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1839 // Return 0 or 0xf 1840 &aml::Return::new(&aml::Local(0)), 1841 ], 1842 ) 1843 .append_aml_bytes(bytes); 1844 1845 let mut cpu_notifies = Vec::new(); 1846 for cpu_id in 0..self.max_vcpus { 1847 cpu_notifies.push(CpuNotify { cpu_id }); 1848 } 1849 1850 let mut cpu_notifies_refs: Vec<&dyn aml::Aml> = Vec::new(); 1851 for cpu_id in 
0..self.max_vcpus { 1852 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 1853 } 1854 1855 aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).append_aml_bytes(bytes); 1856 1857 aml::Method::new( 1858 "CEJ0".into(), 1859 1, 1860 true, 1861 vec![ 1862 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1863 // Write CPU number (in first argument) to I/O port via field 1864 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 1865 // Set CEJ0 bit 1866 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 1867 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1868 ], 1869 ) 1870 .append_aml_bytes(bytes); 1871 1872 aml::Method::new( 1873 "CSCN".into(), 1874 0, 1875 true, 1876 vec![ 1877 // Take lock defined above 1878 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1879 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1880 &aml::While::new( 1881 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 1882 vec![ 1883 // Write CPU number (in first argument) to I/O port via field 1884 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 1885 // Check if CINS bit is set 1886 &aml::If::new( 1887 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 1888 // Notify device if it is 1889 vec![ 1890 &aml::MethodCall::new( 1891 "CTFY".into(), 1892 vec![&aml::Local(0), &aml::ONE], 1893 ), 1894 // Reset CINS bit 1895 &aml::Store::new( 1896 &aml::Path::new("\\_SB_.PRES.CINS"), 1897 &aml::ONE, 1898 ), 1899 ], 1900 ), 1901 // Check if CRMV bit is set 1902 &aml::If::new( 1903 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 1904 // Notify device if it is (with the eject constant 0x3) 1905 vec![ 1906 &aml::MethodCall::new( 1907 "CTFY".into(), 1908 vec![&aml::Local(0), &3u8], 1909 ), 1910 // Reset CRMV bit 1911 &aml::Store::new( 1912 &aml::Path::new("\\_SB_.PRES.CRMV"), 1913 &aml::ONE, 1914 ), 1915 ], 1916 ), 1917 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 1918 ], 1919 ), 1920 // Release lock 1921 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1922 ], 1923 ) 1924 .append_aml_bytes(bytes) 1925 } else { 1926 aml::Method::new("CSCN".into(), 0, true, vec![]).append_aml_bytes(bytes) 1927 } 1928 } 1929 } 1930 1931 impl Aml for CpuManager { 1932 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1933 #[cfg(target_arch = "x86_64")] 1934 if let Some(acpi_address) = self.acpi_address { 1935 // CPU hotplug controller 1936 aml::Device::new( 1937 "_SB_.PRES".into(), 1938 vec![ 1939 &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")), 1940 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 1941 // Mutex to protect concurrent access as we write to choose CPU and then read back status 1942 &aml::Mutex::new("CPLK".into(), 0), 1943 &aml::Name::new( 1944 "_CRS".into(), 1945 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 1946 aml::AddressSpaceCachable::NotCacheable, 1947 true, 1948 acpi_address.0, 1949 acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 1950 )]), 1951 ), 1952 // OpRegion and Fields map MMIO range into individual field values 1953 &aml::OpRegion::new( 1954 "PRST".into(), 1955 aml::OpRegionSpace::SystemMemory, 1956 acpi_address.0 as usize, 1957 CPU_MANAGER_ACPI_SIZE, 1958 ), 1959 &aml::Field::new( 1960 "PRST".into(), 1961 aml::FieldAccessType::Byte, 1962 aml::FieldUpdateRule::WriteAsZeroes, 1963 vec![ 1964 aml::FieldEntry::Reserved(32), 1965 aml::FieldEntry::Named(*b"CPEN", 1), 1966 aml::FieldEntry::Named(*b"CINS", 1), 1967 aml::FieldEntry::Named(*b"CRMV", 1), 1968 

impl Aml for CpuManager {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        #[cfg(target_arch = "x86_64")]
        if let Some(acpi_address) = self.acpi_address {
            // CPU hotplug controller
            aml::Device::new(
                "_SB_.PRES".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose CPU and then read back status
                    &aml::Mutex::new("CPLK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCachable::NotCacheable,
                            true,
                            acpi_address.0,
                            acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
                        )]),
                    ),
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "PRST".into(),
                        aml::OpRegionSpace::SystemMemory,
                        acpi_address.0 as usize,
                        CPU_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CPEN", 1),
                            aml::FieldEntry::Named(*b"CINS", 1),
                            aml::FieldEntry::Named(*b"CRMV", 1),
                            aml::FieldEntry::Named(*b"CEJ0", 1),
                            aml::FieldEntry::Reserved(4),
                            aml::FieldEntry::Named(*b"CCMD", 8),
                        ],
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"CSEL", 32),
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CDAT", 32),
                        ],
                    ),
                ],
            )
            .append_aml_bytes(bytes);
        }

        // CPU devices
        let hid = aml::Name::new("_HID".into(), &"ACPI0010");
        let uid = aml::Name::new("_CID".into(), &aml::EisaName::new("PNP0A05"));
        // Bundle methods together under a common object
        let methods = CpuMethods {
            max_vcpus: self.config.max_vcpus,
            dynamic: self.dynamic,
        };
        let mut cpu_data_inner: Vec<&dyn aml::Aml> = vec![&hid, &uid, &methods];

        let mut cpu_devices = Vec::new();
        for cpu_id in 0..self.config.max_vcpus {
            let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
            let cpu_device = Cpu {
                cpu_id,
                proximity_domain,
                dynamic: self.dynamic,
            };

            cpu_devices.push(cpu_device);
        }

        for cpu_device in cpu_devices.iter() {
            cpu_data_inner.push(cpu_device);
        }

        aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).append_aml_bytes(bytes)
    }
}
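
// Putting the two Field declarations above together, the 0xc-byte MMIO window
// backing the hotplug controller is laid out roughly as follows (offsets are
// relative to `acpi_address`; this is a reading of the AML above, not a spec
// excerpt):
//
//     0x0..0x4  CSEL (DWORD)  index of the vCPU the guest wants to inspect
//     0x4       bit 0: CPEN   selected vCPU is enabled
//               bit 1: CINS   selected vCPU was hot-added (pending notify)
//               bit 2: CRMV   selected vCPU was removed (pending notify)
//               bit 3: CEJ0   set by the guest to complete an ejection
//     0x5       CCMD (byte)   command register
//     0x8..0xc  CDAT (DWORD)  command data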

impl Pausable for CpuManager {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        // Tell the vCPUs to pause themselves next time they exit
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        for vcpu in self.vcpus.iter() {
            let mut vcpu = vcpu.lock().unwrap();
            vcpu.pause()?;
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            if !self.config.kvm_hyperv {
                vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
                    MigratableError::Pause(anyhow!(
                        "Could not notify guest it has been paused {:?}",
                        e
                    ))
                })?;
            }
        }

        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        for vcpu in self.vcpus.iter() {
            vcpu.lock().unwrap().resume()?;
        }

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the vCPU threads.
        // Once unparked, the next thing they will do is check the pause boolean.
        // Since it will be false, they will exit their pause loop and resume
        // running the guest.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }
        Ok(())
    }
}

impl Snapshottable for CpuManager {
    fn id(&self) -> String {
        CPU_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut cpu_manager_snapshot = Snapshot::new(CPU_MANAGER_SNAPSHOT_ID);

        // The CpuManager snapshot is a collection of all the vCPU snapshots.
        for vcpu in &self.vcpus {
            let cpu_snapshot = vcpu.lock().unwrap().snapshot()?;
            cpu_manager_snapshot.add_snapshot(cpu_snapshot);
        }

        Ok(cpu_manager_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        for (cpu_id, snapshot) in snapshot.snapshots.iter() {
            let cpu_id = cpu_id.parse::<usize>().unwrap();
            info!("Restoring VCPU {}", cpu_id);
            let vcpu = self.vcpus[cpu_id].clone();
            self.configure_vcpu(vcpu, None, Some(*snapshot.clone()))
                .map_err(|e| {
                    MigratableError::Restore(anyhow!("Could not configure vCPU {:?}", e))
                })?
        }

        Ok(())
    }
}

impl Transportable for CpuManager {}
impl Migratable for CpuManager {}

#[cfg(feature = "guest_debug")]
impl Debuggable for CpuManager {
    #[cfg(feature = "kvm")]
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.vcpus[cpu_id]
            .lock()
            .unwrap()
            .vcpu
            .set_guest_debug(addrs, singlestep)
            .map_err(DebuggableError::SetDebug)
    }

    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        Ok(())
    }

    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, R8-R15
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let regs = [
            gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
            gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
        ];

        // GDB exposes 32-bit eflags instead of 64-bit rflags.
        // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
        let eflags = gregs.rflags as u32;
        let rip = gregs.rip;

        // Segment registers: CS, SS, DS, ES, FS, GS
        let sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let segments = X86SegmentRegs {
            cs: sregs.cs.selector as u32,
            ss: sregs.ss.selector as u32,
            ds: sregs.ds.selector as u32,
            es: sregs.es.selector as u32,
            fs: sregs.fs.selector as u32,
            gs: sregs.gs.selector as u32,
        };

        // TODO: Add other registers

        Ok(CoreRegs {
            regs,
            eflags,
            rip,
            segments,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "aarch64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        Ok(CoreRegs {
            x: gregs.regs.regs,
            sp: gregs.regs.sp,
            pc: gregs.regs.pc,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let orig_gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let gregs = StandardRegisters {
            rax: regs.regs[0],
            rbx: regs.regs[1],
            rcx: regs.regs[2],
            rdx: regs.regs[3],
            rsi: regs.regs[4],
            rdi: regs.regs[5],
            rbp: regs.regs[6],
            rsp: regs.regs[7],
            r8: regs.regs[8],
            r9: regs.regs[9],
            r10: regs.regs[10],
            r11: regs.regs[11],
            r12: regs.regs[12],
            r13: regs.regs[13],
            r14: regs.regs[14],
            r15: regs.regs[15],
            rip: regs.rip,
            // Update only the lower 32 bits of rflags; GDB only supplies eflags.
            rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
        };

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // Since GDB only cares about the selectors, we call get_sregs() first
        // and patch the selectors into the current values.
        let mut sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        sregs.cs.selector = regs.segments.cs as u16;
        sregs.ss.selector = regs.segments.ss as u16;
        sregs.ds.selector = regs.segments.ds as u16;
        sregs.es.selector = regs.segments.es as u16;
        sregs.fs.selector = regs.segments.fs as u16;
        sregs.gs.selector = regs.segments.gs as u16;

        self.set_sregs(cpu_id as u8, &sregs)
            .map_err(DebuggableError::WriteRegs)?;

        // TODO: Add other registers

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let mut gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;

        gregs.regs.regs = regs.x;
        gregs.regs.sp = regs.sp;
        gregs.regs.pc = regs.pc;

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        Ok(())
    }
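
    // read_mem()/write_mem() below split each access at guest-page boundaries so
    // that every chunk can be translated (GVA -> GPA) independently. A worked
    // example with illustrative numbers, assuming a 0x1000-byte page:
    //
    //     len = 0x20, paddr = 0x1ff8
    //     chunk = min(0x20, 0x1000 - (0x1ff8 & 0xfff)) = min(0x20, 0x8) = 8 bytes
    //
    // so 8 bytes are accessed at 0x1ff8 and the remaining 0x18 bytes are
    // translated again starting from the next page.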

    fn read_mem(
        &self,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        let mut buf = vec![0; len];
        let mut total_read = 0_u64;

        while total_read < len as u64 {
            let gaddr = vaddr.0 + total_read;
            let paddr = match self.translate_gva(cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
            self.vm_memory
                .memory()
                .read(
                    &mut buf[total_read as usize..total_read as usize + read_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::ReadMem)?;
            total_read += read_len;
        }
        Ok(buf)
    }

    fn write_mem(
        &self,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        let mut total_written = 0_u64;

        while total_written < data.len() as u64 {
            let gaddr = vaddr.0 + total_written;
            let paddr = match self.translate_gva(cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let write_len = std::cmp::min(
                data.len() as u64 - total_written,
                psize - (paddr & (psize - 1)),
            );
            self.vm_memory
                .memory()
                .write(
                    &data[total_written as usize..total_written as usize + write_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::WriteMem)?;
            total_written += write_len;
        }
        Ok(())
    }

    fn active_vcpus(&self) -> usize {
        self.present_vcpus() as usize
    }
}

#[cfg(feature = "guest_debug")]
impl Elf64Writable for CpuManager {}
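
// The two note writers below emit one ELF note per vCPU, using the usual
// layout: an Elf64_Nhdr, the owner name padded to a 4-byte boundary, then the
// descriptor. Sketched out (sizes come from the constants and structs used
// below; offsets are indicative only):
//
//     +-------------------+
//     | Elf64_Nhdr        |  n_namesz, n_descsz, n_type (NT_PRSTATUS or 0)
//     +-------------------+
//     | "CORE" / "QEMU"   |  owner name, padded to a 4-byte boundary
//     +-------------------+
//     | descriptor        |  X86_64ElfPrStatus (incl. registers) or DumpCpusState
//     +-------------------+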

#[cfg(feature = "guest_debug")]
impl CpuElf64Writable for CpuManager {
    fn cpu_write_elf64_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Elf, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<X86_64ElfPrStatus>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: NT_PRSTATUS,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
            buf.resize(pos + 4, 0);
            buf.splice(pos.., "CORE".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
            buf.resize(pos + 32 + 4, 0);
            let pid = vcpu_id as u64;
            let bytes: &[u8] = pid.as_slice();
            buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */

            pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();

            let orig_rax: u64 = 0;
            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
                gregs.r10,
            ];
            let regs2 = [
                gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            debug!(
                "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
                gregs.rip,
                gregs.rsp,
                sregs.gs.base,
                sregs.cs.selector,
                sregs.ss.selector,
                sregs.ds.selector,
            );

            let regs = X86_64UserRegs {
                regs1,
                regs2,
                rip: gregs.rip,
                cs: sregs.cs.selector as u64,
                eflags: gregs.rflags,
                rsp: gregs.rsp,
                ss: sregs.ss.selector as u64,
                fs_base: sregs.fs.base,
                gs_base: sregs.gs.base,
                ds: sregs.ds.selector as u64,
                es: sregs.es.selector as u64,
                fs: sregs.fs.selector as u64,
                gs: sregs.gs.selector as u64,
            };

            // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) };
            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }

    fn cpu_write_vmm_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Vmm, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<DumpCpusState>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: 0,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);

            buf.resize(pos + 4, 0);
            buf.splice(pos.., "QEMU".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);

            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
                gregs.rbp,
            ];

            let regs2 = [
                gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
                gregs.r15,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            let mut msrs = vec![MsrEntry {
                index: msr_index::MSR_KERNEL_GS_BASE,
                ..Default::default()
            }];

            self.vcpus[vcpu_id as usize]
                .lock()
                .unwrap()
                .vcpu
                .get_msrs(&mut msrs)
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
            let kernel_gs_base = msrs[0].data;

            let cs = CpuSegment::new(sregs.cs);
            let ds = CpuSegment::new(sregs.ds);
            let es = CpuSegment::new(sregs.es);
            let fs = CpuSegment::new(sregs.fs);
            let gs = CpuSegment::new(sregs.gs);
            let ss = CpuSegment::new(sregs.ss);
            let ldt = CpuSegment::new(sregs.ldt);
            let tr = CpuSegment::new(sregs.tr);
            let gdt = CpuSegment::new_from_table(sregs.gdt);
            let idt = CpuSegment::new_from_table(sregs.idt);
            let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
            let regs = DumpCpusState {
                version: 1,
                size: size_of::<DumpCpusState>() as u32,
                regs1,
                regs2,
                rip: gregs.rip,
                rflags: gregs.rflags,
                cs,
                ds,
                es,
                fs,
                gs,
                ss,
                ldt,
                tr,
                gdt,
                idt,
                cr,
                kernel_gs_base,
            };

            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use arch::x86_64::interrupts::*;
    use arch::x86_64::regs::*;
    use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};

    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail if there is no irqchip created beforehand.
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
        let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
        let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should
        // remove it altogether.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }];

        // get_msrs returns the number of MSRs it succeeded in reading. We only want to read 1
        // in this test case scenario.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we did setup_msrs. We need to assert that the
        // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
        // expect.
        let entry_vec = vcpu.boot_msr_entries();
        assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
    }

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let expected_regs: StandardRegisters = StandardRegisters {
            rflags: 0x0000000000000002u64,
            rbx: arch::layout::PVH_INFO_START.0,
            rip: 1,
            ..Default::default()
        };

        setup_regs(&vcpu, expected_regs.rip).unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use arch::{aarch64::regs, layout};
    use hypervisor::kvm::aarch64::is_system_register;
    use hypervisor::kvm::kvm_bindings::{
        kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
        KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
    };
    use hypervisor::{arm64_core_reg_id, offset__of};
    use std::mem;

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
        // Must fail when vcpu is not initialized yet.
        assert!(res.is_err());

        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();
        vcpu.vcpu_init(&kvi).unwrap();

        assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
    }

    #[test]
    fn test_read_mpidr() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());

        vcpu.vcpu_init(&kvi).unwrap();
        assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
    }

    #[test]
    fn test_is_system_register() {
        let offset = offset__of!(user_pt_regs, pc);
        let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
        assert!(!is_system_register(regid));
        let regid = KVM_REG_ARM64 as u64 | KVM_REG_SIZE_U64 as u64 | KVM_REG_ARM64_SYSREG as u64;
        assert!(is_system_register(regid));
    }

    #[test]
    fn test_save_restore_core_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        let res = vcpu.get_regs();
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to get core register: Exec format error (os error 8)"
        );

        let mut state = kvm_regs::default();
        let res = vcpu.set_regs(&state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to set core register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        let res = vcpu.get_regs();
        assert!(res.is_ok());
        state = res.unwrap();
        assert_eq!(state.regs.pstate, 0x3C5);

        assert!(vcpu.set_regs(&state).is_ok());
    }

    #[test]
    fn test_get_set_mpstate() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        let res = vcpu.get_mp_state();
        assert!(res.is_ok());
        assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
    }
}