1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::CpusConfig; 15 #[cfg(feature = "guest_debug")] 16 use crate::coredump::{ 17 CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable, 18 GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE, 19 NT_PRSTATUS, 20 }; 21 use crate::device_manager::DeviceManager; 22 #[cfg(feature = "guest_debug")] 23 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError}; 24 use crate::memory_manager::MemoryManager; 25 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 26 #[cfg(target_arch = "x86_64")] 27 use crate::vm::physical_bits; 28 use crate::GuestMemoryMmap; 29 use crate::CPU_MANAGER_SNAPSHOT_ID; 30 use acpi_tables::{aml, aml::Aml, sdt::Sdt}; 31 use anyhow::anyhow; 32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 33 use arch::aarch64::regs; 34 use arch::EntryPoint; 35 use arch::NumaNodes; 36 #[cfg(target_arch = "aarch64")] 37 use devices::gic::Gic; 38 use devices::interrupt_controller::InterruptController; 39 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 40 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 41 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 42 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs}; 43 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 44 use hypervisor::aarch64::StandardRegisters; 45 #[cfg(feature = "guest_debug")] 46 use hypervisor::arch::x86::msr_index; 47 #[cfg(target_arch = "x86_64")] 48 use hypervisor::arch::x86::CpuIdEntry; 49 #[cfg(feature = "guest_debug")] 50 use hypervisor::arch::x86::MsrEntry; 51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 52 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters}; 53 #[cfg(target_arch = "aarch64")] 54 use hypervisor::kvm::kvm_bindings; 55 #[cfg(feature = "tdx")] 56 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus}; 57 use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps}; 58 use libc::{c_void, siginfo_t}; 59 #[cfg(feature = "guest_debug")] 60 use linux_loader::elf::Elf64_Nhdr; 61 use seccompiler::{apply_filter, SeccompAction}; 62 use std::collections::BTreeMap; 63 #[cfg(feature = "guest_debug")] 64 use std::io::Write; 65 #[cfg(feature = "guest_debug")] 66 use std::mem::size_of; 67 use std::os::unix::thread::JoinHandleExt; 68 use std::sync::atomic::{AtomicBool, Ordering}; 69 use std::sync::{Arc, Barrier, Mutex}; 70 use std::{cmp, io, result, thread}; 71 use thiserror::Error; 72 use tracer::trace_scoped; 73 use vm_device::BusDevice; 74 #[cfg(feature = "guest_debug")] 75 use vm_memory::ByteValued; 76 #[cfg(feature = "guest_debug")] 77 use vm_memory::{Bytes, GuestAddressSpace}; 78 use vm_memory::{GuestAddress, GuestMemoryAtomic}; 79 use vm_migration::{ 80 Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable, 81 Transportable, 82 }; 83 use vmm_sys_util::eventfd::EventFd; 84 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN}; 85 86 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 87 /// Extract the 
specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),
}
pub type Result<T> = result::Result<T, Error>;

#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
#[repr(packed)]
struct LocalApic {
    pub r#type: u8,
    pub length: u8,
    pub processor_id: u8,
    pub apic_id: u8,
    pub flags: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default)]
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
    pub ioapic_id: u8,
    _reserved: u8,
    pub apic_address: u32,
    pub gsi_base: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicC {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub cpu_interface_number: u32,
    pub uid: u32,
    pub flags: u32,
    pub parking_version: u32,
    pub performance_interrupt: u32,
    pub parked_address: u64,
    pub base_address: u64,
    pub gicv_base_address: u64,
    pub gich_base_address: u64,
    pub vgic_interrupt: u32,
    pub gicr_base_address: u64,
    pub mpidr: u64,
    pub proc_power_effi_class: u8,
    pub reserved1: u8,
    pub spe_overflow_interrupt: u16,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicD {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub gic_id: u32,
    pub base_address: u64,
    pub global_irq_base: u32,
    pub version: u8,
    pub reserved1: [u8; 3],
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicR {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub base_address: u64,
    pub range_length: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicIts {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub translation_id: u32,
    pub base_address: u64,
    pub reserved1: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct ProcessorHierarchyNode {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub flags: u32,
    pub parent: u32,
    pub acpi_processor_id: u32,
    pub num_private_resources: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default)]
struct InterruptSourceOverride {
    pub r#type: u8,
    pub length: u8,
    pub bus: u8,
    pub source: u8,
    pub gsi: u32,
    pub flags: u16,
}

// Round `$n` up to the nearest multiple of `$d`.
#[cfg(feature = "guest_debug")]
macro_rules! round_up {
    ($n:expr,$d:expr) => {
        (($n + $d - 1) / $d) * $d
    };
}

/// A wrapper around creating and using a kvm-based VCPU.
pub struct Vcpu {
    // The hypervisor abstracted CPU.
    vcpu: Arc<dyn hypervisor::Vcpu>,
    id: u8,
    #[cfg(target_arch = "aarch64")]
    mpidr: u64,
    saved_state: Option<CpuState>,
}

impl Vcpu {
    /// Constructs a new VCPU for `vm`.
    ///
    /// # Arguments
    ///
    /// * `id` - Represents the CPU number between [0, max vcpus).
    /// * `vm` - The virtual machine this vcpu will get attached to.
    /// * `vm_ops` - Optional object for exit handling.
    pub fn new(
        id: u8,
        vm: &Arc<dyn hypervisor::Vm>,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> Result<Self> {
        let vcpu = vm
            .create_vcpu(id, vm_ops)
            .map_err(|e| Error::VcpuCreate(e.into()))?;
        // Initially the cpuid per vCPU is the one supported by this VM.
        Ok(Vcpu {
            vcpu,
            id,
            #[cfg(target_arch = "aarch64")]
            mpidr: 0,
            saved_state: None,
        })
    }

    /// Configures a vCPU; should be called once per vCPU when it is created.
    ///
    /// # Arguments
    ///
    /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
    /// * `vm_memory` - Guest memory.
    /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
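    /// * `kvm_hyperv` - (x86_64) Whether KVM Hyper-V emulation is enabled for the guest.
    /// * `vm` - (aarch64) The VM this vCPU belongs to; used to initialize the vCPU.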
330 pub fn configure( 331 &mut self, 332 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 333 kernel_entry_point: Option<EntryPoint>, 334 #[cfg(target_arch = "x86_64")] vm_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 335 #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>, 336 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 337 ) -> Result<()> { 338 #[cfg(target_arch = "aarch64")] 339 { 340 self.init(vm)?; 341 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, kernel_entry_point) 342 .map_err(Error::VcpuConfiguration)?; 343 } 344 info!("Configuring vCPU: cpu_id = {}", self.id); 345 #[cfg(target_arch = "x86_64")] 346 arch::configure_vcpu( 347 &self.vcpu, 348 self.id, 349 kernel_entry_point, 350 vm_memory, 351 cpuid, 352 kvm_hyperv, 353 ) 354 .map_err(Error::VcpuConfiguration)?; 355 356 Ok(()) 357 } 358 359 /// Gets the MPIDR register value. 360 #[cfg(target_arch = "aarch64")] 361 pub fn get_mpidr(&self) -> u64 { 362 self.mpidr 363 } 364 365 /// Gets the saved vCPU state. 366 #[cfg(target_arch = "aarch64")] 367 pub fn get_saved_state(&self) -> Option<CpuState> { 368 self.saved_state.clone() 369 } 370 371 /// Initializes an aarch64 specific vcpu for booting Linux. 372 #[cfg(target_arch = "aarch64")] 373 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 374 let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); 375 376 // This reads back the kernel's preferred target type. 377 vm.get_preferred_target(&mut kvi) 378 .map_err(Error::VcpuArmPreferredTarget)?; 379 // We already checked that the capability is supported. 380 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; 381 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3; 382 // Non-boot cpus are powered off initially. 383 if self.id > 0 { 384 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; 385 } 386 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit) 387 } 388 389 /// Runs the VCPU until it exits, returning the reason. 390 /// 391 /// Note that the state of the VCPU and associated VM must be setup first for this to do 392 /// anything useful. 
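    ///
    /// A minimal caller sketch (illustrative only; the real loop lives in
    /// `CpuManager::start_vcpu`):
    /// ```ignore
    /// match vcpu.run() {
    ///     Ok(VmExit::Reset) => { /* forward the reset request to the VMM */ }
    ///     Ok(_) => { /* keep running */ }
    ///     Err(e) => { /* report the hypervisor error */ }
    /// }
    /// ```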
393 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 394 self.vcpu.run() 395 } 396 } 397 398 const VCPU_SNAPSHOT_ID: &str = "vcpu"; 399 impl Pausable for Vcpu {} 400 impl Snapshottable for Vcpu { 401 fn id(&self) -> String { 402 VCPU_SNAPSHOT_ID.to_string() 403 } 404 405 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 406 let saved_state = self 407 .vcpu 408 .state() 409 .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?; 410 411 let mut vcpu_snapshot = Snapshot::new(&format!("{:03}", self.id)); 412 vcpu_snapshot.add_data_section(SnapshotDataSection::new_from_state( 413 VCPU_SNAPSHOT_ID, 414 &saved_state, 415 )?); 416 417 self.saved_state = Some(saved_state); 418 419 Ok(vcpu_snapshot) 420 } 421 422 fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> { 423 let saved_state: CpuState = snapshot.to_state(VCPU_SNAPSHOT_ID)?; 424 425 self.vcpu 426 .set_state(&saved_state) 427 .map_err(|e| MigratableError::Pause(anyhow!("Could not set the vCPU state {:?}", e)))?; 428 429 self.saved_state = Some(saved_state); 430 431 Ok(()) 432 } 433 } 434 435 pub struct CpuManager { 436 hypervisor_type: HypervisorType, 437 config: CpusConfig, 438 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 439 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 440 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 441 vm_memory: GuestMemoryAtomic<GuestMemoryMmap>, 442 #[cfg(target_arch = "x86_64")] 443 cpuid: Vec<CpuIdEntry>, 444 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 445 vm: Arc<dyn hypervisor::Vm>, 446 vcpus_kill_signalled: Arc<AtomicBool>, 447 vcpus_pause_signalled: Arc<AtomicBool>, 448 exit_evt: EventFd, 449 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 450 reset_evt: EventFd, 451 #[cfg(feature = "guest_debug")] 452 vm_debug_evt: EventFd, 453 vcpu_states: Vec<VcpuState>, 454 selected_cpu: u8, 455 vcpus: Vec<Arc<Mutex<Vcpu>>>, 456 seccomp_action: SeccompAction, 457 vm_ops: Arc<dyn VmOps>, 458 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 459 acpi_address: Option<GuestAddress>, 460 proximity_domain_per_cpu: BTreeMap<u8, u32>, 461 affinity: BTreeMap<u8, Vec<u8>>, 462 dynamic: bool, 463 } 464 465 const CPU_ENABLE_FLAG: usize = 0; 466 const CPU_INSERTING_FLAG: usize = 1; 467 const CPU_REMOVING_FLAG: usize = 2; 468 const CPU_EJECT_FLAG: usize = 3; 469 470 const CPU_STATUS_OFFSET: u64 = 4; 471 const CPU_SELECTION_OFFSET: u64 = 0; 472 473 impl BusDevice for CpuManager { 474 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 475 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
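        // Layout of this small MMIO region (matching the constants above):
        //   offset 0x0 (CPU_SELECTION_OFFSET): id of the currently selected vCPU
        //   offset 0x4 (CPU_STATUS_OFFSET): status/control bits for that vCPU,
        //     bit 0 = enabled, bit 1 = inserting, bit 2 = removing, bit 3 = eject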
476 data.fill(0); 477 478 match offset { 479 CPU_SELECTION_OFFSET => { 480 data[0] = self.selected_cpu; 481 } 482 CPU_STATUS_OFFSET => { 483 if self.selected_cpu < self.max_vcpus() { 484 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 485 if state.active() { 486 data[0] |= 1 << CPU_ENABLE_FLAG; 487 } 488 if state.inserting { 489 data[0] |= 1 << CPU_INSERTING_FLAG; 490 } 491 if state.removing { 492 data[0] |= 1 << CPU_REMOVING_FLAG; 493 } 494 } else { 495 warn!("Out of range vCPU id: {}", self.selected_cpu); 496 } 497 } 498 _ => { 499 warn!( 500 "Unexpected offset for accessing CPU manager device: {:#}", 501 offset 502 ); 503 } 504 } 505 } 506 507 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 508 match offset { 509 CPU_SELECTION_OFFSET => { 510 self.selected_cpu = data[0]; 511 } 512 CPU_STATUS_OFFSET => { 513 if self.selected_cpu < self.max_vcpus() { 514 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 515 // The ACPI code writes back a 1 to acknowledge the insertion 516 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 517 && state.inserting 518 { 519 state.inserting = false; 520 } 521 // Ditto for removal 522 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 523 && state.removing 524 { 525 state.removing = false; 526 } 527 // Trigger removal of vCPU 528 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 529 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 530 error!("Error removing vCPU: {:?}", e); 531 } 532 } 533 } else { 534 warn!("Out of range vCPU id: {}", self.selected_cpu); 535 } 536 } 537 _ => { 538 warn!( 539 "Unexpected offset for accessing CPU manager device: {:#}", 540 offset 541 ); 542 } 543 } 544 None 545 } 546 } 547 548 #[derive(Default)] 549 struct VcpuState { 550 inserting: bool, 551 removing: bool, 552 handle: Option<thread::JoinHandle<()>>, 553 kill: Arc<AtomicBool>, 554 vcpu_run_interrupted: Arc<AtomicBool>, 555 } 556 557 impl VcpuState { 558 fn active(&self) -> bool { 559 self.handle.is_some() 560 } 561 562 fn signal_thread(&self) { 563 if let Some(handle) = self.handle.as_ref() { 564 loop { 565 unsafe { 566 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 567 } 568 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 569 break; 570 } else { 571 // This is more effective than thread::yield_now() at 572 // avoiding a priority inversion with the vCPU thread 573 thread::sleep(std::time::Duration::from_millis(1)); 574 } 575 } 576 } 577 } 578 579 fn join_thread(&mut self) -> Result<()> { 580 if let Some(handle) = self.handle.take() { 581 handle.join().map_err(Error::ThreadCleanup)? 
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}

impl CpuManager {
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        device_manager: &Arc<Mutex<DeviceManager>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vm_ops: Arc<dyn VmOps>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        numa_nodes: &NumaNodes,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        let guest_memory = memory_manager.lock().unwrap().guest_memory();
        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
        let hypervisor_type = hypervisor.hypervisor_type();

        #[cfg(target_arch = "x86_64")]
        let sgx_epc_sections = memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
        #[cfg(target_arch = "x86_64")]
        let cpuid = {
            let phys_bits = physical_bits(config.max_phys_bits);
            arch::generate_common_cpuid(
                hypervisor,
                config
                    .topology
                    .clone()
                    .map(|t| (t.threads_per_core, t.cores_per_die, t.dies_per_package)),
                sgx_epc_sections,
                phys_bits,
                config.kvm_hyperv,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CommonCpuId)?
        };
        #[cfg(target_arch = "x86_64")]
        if config.features.amx {
            const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
            const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
            const XFEATURE_XTILEDATA: usize = 18;
            const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;

            // This is safe as the syscall is only modifying kernel internal
            // data structures that the kernel is itself expected to safeguard.
            let amx_tile = unsafe {
                libc::syscall(
                    libc::SYS_arch_prctl,
                    ARCH_REQ_XCOMP_GUEST_PERM,
                    XFEATURE_XTILEDATA,
                )
            };

            if amx_tile != 0 {
                return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
            } else {
                // This is safe as the mask being modified (not marked mutable as it is
                // only modified inside the unsafe block, which is permitted) isn't in use elsewhere.
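                // Read back the guest xfeature permission mask with
                // ARCH_GET_XCOMP_GUEST_PERM and verify that the XTILEDATA bit
                // is now set.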
662 let mask: usize = 0; 663 let result = unsafe { 664 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 665 }; 666 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 667 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 668 } 669 } 670 } 671 672 let device_manager = device_manager.lock().unwrap(); 673 674 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 675 let mut cpu_list = Vec::new(); 676 for (proximity_domain, numa_node) in numa_nodes.iter() { 677 for cpu in numa_node.cpus.iter() { 678 cpu_list.push((*cpu, *proximity_domain)) 679 } 680 } 681 cpu_list 682 } 683 .into_iter() 684 .collect(); 685 686 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 687 cpu_affinity 688 .iter() 689 .map(|a| (a.vcpu, a.host_cpus.clone())) 690 .collect() 691 } else { 692 BTreeMap::new() 693 }; 694 695 #[cfg(feature = "tdx")] 696 let dynamic = !tdx_enabled; 697 #[cfg(not(feature = "tdx"))] 698 let dynamic = true; 699 700 let acpi_address = if dynamic { 701 Some( 702 device_manager 703 .allocator() 704 .lock() 705 .unwrap() 706 .allocate_platform_mmio_addresses(None, CPU_MANAGER_ACPI_SIZE as u64, None) 707 .ok_or(Error::AllocateMmmioAddress)?, 708 ) 709 } else { 710 None 711 }; 712 713 let cpu_manager = Arc::new(Mutex::new(CpuManager { 714 hypervisor_type, 715 config: config.clone(), 716 interrupt_controller: device_manager.interrupt_controller().clone(), 717 vm_memory: guest_memory, 718 #[cfg(target_arch = "x86_64")] 719 cpuid, 720 vm, 721 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 722 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 723 vcpu_states, 724 exit_evt, 725 reset_evt, 726 #[cfg(feature = "guest_debug")] 727 vm_debug_evt, 728 selected_cpu: 0, 729 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 730 seccomp_action, 731 vm_ops, 732 acpi_address, 733 proximity_domain_per_cpu, 734 affinity, 735 dynamic, 736 })); 737 738 if let Some(acpi_address) = acpi_address { 739 device_manager 740 .mmio_bus() 741 .insert( 742 cpu_manager.clone(), 743 acpi_address.0, 744 CPU_MANAGER_ACPI_SIZE as u64, 745 ) 746 .map_err(Error::BusError)?; 747 } 748 749 Ok(cpu_manager) 750 } 751 752 fn create_vcpu( 753 &mut self, 754 cpu_id: u8, 755 entry_point: Option<EntryPoint>, 756 snapshot: Option<Snapshot>, 757 ) -> Result<()> { 758 info!("Creating vCPU: cpu_id = {}", cpu_id); 759 760 let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vm_ops.clone()))?; 761 762 if let Some(snapshot) = snapshot { 763 // AArch64 vCPUs should be initialized after created. 764 #[cfg(target_arch = "aarch64")] 765 vcpu.init(&self.vm)?; 766 767 vcpu.restore(snapshot).expect("Failed to restore vCPU"); 768 } else { 769 #[cfg(target_arch = "x86_64")] 770 vcpu.configure( 771 entry_point, 772 &self.vm_memory, 773 self.cpuid.clone(), 774 self.config.kvm_hyperv, 775 ) 776 .expect("Failed to configure vCPU"); 777 778 #[cfg(target_arch = "aarch64")] 779 vcpu.configure(&self.vm, entry_point) 780 .expect("Failed to configure vCPU"); 781 } 782 783 // Adding vCPU to the CpuManager's vCPU list. 
784 let vcpu = Arc::new(Mutex::new(vcpu)); 785 self.vcpus.push(vcpu); 786 787 Ok(()) 788 } 789 790 /// Only create new vCPUs if there aren't any inactive ones to reuse 791 fn create_vcpus(&mut self, desired_vcpus: u8, entry_point: Option<EntryPoint>) -> Result<()> { 792 info!( 793 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 794 desired_vcpus, 795 self.config.max_vcpus, 796 self.vcpus.len(), 797 self.present_vcpus() 798 ); 799 800 if desired_vcpus > self.config.max_vcpus { 801 return Err(Error::DesiredVCpuCountExceedsMax); 802 } 803 804 // Only create vCPUs in excess of all the allocated vCPUs. 805 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 806 self.create_vcpu(cpu_id, entry_point, None)?; 807 } 808 809 Ok(()) 810 } 811 812 #[cfg(target_arch = "aarch64")] 813 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 814 for cpu in self.vcpus.iter() { 815 let cpu = cpu.lock().unwrap(); 816 // Check if PMU attr is available, if not, log the information. 817 if cpu.vcpu.has_pmu_support() { 818 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; 819 } else { 820 debug!( 821 "PMU attribute is not supported in vCPU{}, skip PMU init!", 822 cpu.id 823 ); 824 return Ok(false); 825 } 826 } 827 828 Ok(true) 829 } 830 831 fn start_vcpu( 832 &mut self, 833 vcpu: Arc<Mutex<Vcpu>>, 834 vcpu_id: u8, 835 vcpu_thread_barrier: Arc<Barrier>, 836 inserting: bool, 837 ) -> Result<()> { 838 let reset_evt = self.reset_evt.try_clone().unwrap(); 839 let exit_evt = self.exit_evt.try_clone().unwrap(); 840 #[cfg(feature = "guest_debug")] 841 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 842 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 843 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 844 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 845 846 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 847 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 848 .vcpu_run_interrupted 849 .clone(); 850 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 851 852 // Prepare the CPU set the current vCPU is expected to run onto. 853 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 854 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 855 unsafe { libc::CPU_ZERO(&mut cpuset) }; 856 for host_cpu in host_cpus { 857 unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) }; 858 } 859 cpuset 860 }); 861 862 // Retrieve seccomp filter for vcpu thread 863 let vcpu_seccomp_filter = 864 get_seccomp_filter(&self.seccomp_action, Thread::Vcpu, self.hypervisor_type) 865 .map_err(Error::CreateSeccompFilter)?; 866 867 #[cfg(target_arch = "x86_64")] 868 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 869 870 info!("Starting vCPU: cpu_id = {}", vcpu_id); 871 872 let handle = Some( 873 thread::Builder::new() 874 .name(format!("vcpu{}", vcpu_id)) 875 .spawn(move || { 876 // Schedule the thread to run on the expected CPU set 877 if let Some(cpuset) = cpuset.as_ref() { 878 let ret = unsafe { 879 libc::sched_setaffinity( 880 0, 881 std::mem::size_of::<libc::cpu_set_t>(), 882 cpuset as *const libc::cpu_set_t, 883 ) 884 }; 885 886 if ret != 0 { 887 error!( 888 "Failed scheduling the vCPU {} on the expected CPU set: {}", 889 vcpu_id, 890 io::Error::last_os_error() 891 ); 892 return; 893 } 894 } 895 896 // Apply seccomp filter for vcpu thread. 
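                    // An empty filter (e.g. when the seccomp action is Allow)
                    // means there is nothing to apply.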
897 if !vcpu_seccomp_filter.is_empty() { 898 if let Err(e) = 899 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 900 { 901 error!("Error applying seccomp filter: {:?}", e); 902 return; 903 } 904 } 905 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 906 // This uses an async signal safe handler to kill the vcpu handles. 907 register_signal_handler(SIGRTMIN(), handle_signal) 908 .expect("Failed to register vcpu signal handler"); 909 // Block until all CPUs are ready. 910 vcpu_thread_barrier.wait(); 911 912 std::panic::catch_unwind(move || { 913 loop { 914 // If we are being told to pause, we park the thread 915 // until the pause boolean is toggled. 916 // The resume operation is responsible for toggling 917 // the boolean and unpark the thread. 918 // We enter a loop because park() could spuriously 919 // return. We will then park() again unless the 920 // pause boolean has been toggled. 921 922 // Need to use Ordering::SeqCst as we have multiple 923 // loads and stores to different atomics and we need 924 // to see them in a consistent order in all threads 925 926 if vcpu_pause_signalled.load(Ordering::SeqCst) { 927 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are 928 // completed by returning to KVM_RUN. From the kernel docs: 929 // 930 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, 931 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding 932 // operations are complete (and guest state is consistent) only after userspace 933 // has re-entered the kernel with KVM_RUN. The kernel side will first finish 934 // incomplete operations and then check for pending signals. 935 // The pending state of the operation is not preserved in state which is 936 // visible to userspace, thus userspace should ensure that the operation is 937 // completed before performing a live migration. Userspace can re-enter the 938 // guest with an unmasked signal pending or with the immediate_exit field set 939 // to complete pending operations without allowing any further instructions 940 // to be executed. 
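                            // Hence: request an immediate exit, re-enter the vCPU
                            // once so that any pending PIO/MMIO operation completes,
                            // then clear the flag again before parking below.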
941 942 #[cfg(feature = "kvm")] 943 { 944 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); 945 if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { 946 error!("Unexpected VM exit on \"immediate_exit\" run"); 947 break; 948 } 949 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); 950 } 951 952 vcpu_run_interrupted.store(true, Ordering::SeqCst); 953 while vcpu_pause_signalled.load(Ordering::SeqCst) { 954 thread::park(); 955 } 956 vcpu_run_interrupted.store(false, Ordering::SeqCst); 957 } 958 959 // We've been told to terminate 960 if vcpu_kill_signalled.load(Ordering::SeqCst) 961 || vcpu_kill.load(Ordering::SeqCst) 962 { 963 vcpu_run_interrupted.store(true, Ordering::SeqCst); 964 break; 965 } 966 967 #[cfg(feature = "tdx")] 968 let mut vcpu = vcpu.lock().unwrap(); 969 #[cfg(not(feature = "tdx"))] 970 let vcpu = vcpu.lock().unwrap(); 971 // vcpu.run() returns false on a triple-fault so trigger a reset 972 match vcpu.run() { 973 Ok(run) => match run { 974 #[cfg(feature = "kvm")] 975 VmExit::Debug => { 976 info!("VmExit::Debug"); 977 #[cfg(feature = "guest_debug")] 978 { 979 vcpu_pause_signalled.store(true, Ordering::SeqCst); 980 let raw_tid = get_raw_tid(vcpu_id as usize); 981 vm_debug_evt.write(raw_tid as u64).unwrap(); 982 } 983 } 984 #[cfg(target_arch = "x86_64")] 985 VmExit::IoapicEoi(vector) => { 986 if let Some(interrupt_controller) = 987 &interrupt_controller_clone 988 { 989 interrupt_controller 990 .lock() 991 .unwrap() 992 .end_of_interrupt(vector); 993 } 994 } 995 VmExit::Ignore => {} 996 VmExit::Hyperv => {} 997 VmExit::Reset => { 998 info!("VmExit::Reset"); 999 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1000 reset_evt.write(1).unwrap(); 1001 break; 1002 } 1003 VmExit::Shutdown => { 1004 info!("VmExit::Shutdown"); 1005 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1006 exit_evt.write(1).unwrap(); 1007 break; 1008 } 1009 #[cfg(feature = "tdx")] 1010 VmExit::Tdx => { 1011 if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { 1012 match vcpu.get_tdx_exit_details() { 1013 Ok(details) => match details { 1014 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 1015 TdxExitDetails::SetupEventNotifyInterrupt => { 1016 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 1017 } 1018 }, 1019 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1020 } 1021 vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); 1022 } else { 1023 // We should never reach this code as 1024 // this means the design from the code 1025 // is wrong. 1026 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1027 } 1028 } 1029 _ => { 1030 error!( 1031 "VCPU generated error: {:?}", 1032 Error::UnexpectedVmExit 1033 ); 1034 break; 1035 } 1036 }, 1037 1038 Err(e) => { 1039 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1040 break; 1041 } 1042 } 1043 1044 // We've been told to terminate 1045 if vcpu_kill_signalled.load(Ordering::SeqCst) 1046 || vcpu_kill.load(Ordering::SeqCst) 1047 { 1048 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1049 break; 1050 } 1051 } 1052 }) 1053 .or_else(|_| { 1054 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1055 error!("vCPU thread panicked"); 1056 panic_exit_evt.write(1) 1057 }) 1058 .ok(); 1059 }) 1060 .map_err(Error::VcpuSpawn)?, 1061 ); 1062 1063 // On hot plug calls into this function entry_point is None. It is for 1064 // those hotplug CPU additions that we need to set the inserting flag. 
        self.vcpu_states[usize::from(vcpu_id)].handle = handle;
        self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;

        Ok(())
    }

    /// Start up as many vCPU threads as needed to reach `desired_vcpus`
    fn activate_vcpus(
        &mut self,
        desired_vcpus: u8,
        inserting: bool,
        paused: Option<bool>,
    ) -> Result<()> {
        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        let vcpu_thread_barrier = Arc::new(Barrier::new(
            (desired_vcpus - self.present_vcpus() + 1) as usize,
        ));

        if let Some(paused) = paused {
            self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
        }

        info!(
            "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
            desired_vcpus,
            self.vcpus.len(),
            self.present_vcpus(),
            self.vcpus_pause_signalled.load(Ordering::SeqCst)
        );

        // This reuses any inactive vCPUs as well as any that were newly created
        for vcpu_id in self.present_vcpus()..desired_vcpus {
            let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
            self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
        }

        // Unblock all CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal, actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
        }
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let mut state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" so that it can be reused
        state.kill.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(&mut self, entry_point: Option<EntryPoint>) -> Result<()> {
        trace_scoped!("create_boot_vcpus");

        self.create_vcpus(self.boot_vcpus(), entry_point)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
1137 pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> { 1138 self.activate_vcpus(self.boot_vcpus(), false, Some(paused)) 1139 } 1140 1141 pub fn start_restored_vcpus(&mut self) -> Result<()> { 1142 self.activate_vcpus(self.vcpus.len() as u8, false, Some(true)) 1143 .map_err(|e| { 1144 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) 1145 })?; 1146 1147 Ok(()) 1148 } 1149 1150 pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> { 1151 if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal { 1152 return Ok(false); 1153 } 1154 1155 if !self.dynamic { 1156 return Ok(false); 1157 } 1158 1159 match desired_vcpus.cmp(&self.present_vcpus()) { 1160 cmp::Ordering::Greater => { 1161 self.create_vcpus(desired_vcpus, None)?; 1162 self.activate_vcpus(desired_vcpus, true, None)?; 1163 Ok(true) 1164 } 1165 cmp::Ordering::Less => { 1166 self.mark_vcpus_for_removal(desired_vcpus); 1167 Ok(true) 1168 } 1169 _ => Ok(false), 1170 } 1171 } 1172 1173 pub fn shutdown(&mut self) -> Result<()> { 1174 // Tell the vCPUs to stop themselves next time they go through the loop 1175 self.vcpus_kill_signalled.store(true, Ordering::SeqCst); 1176 1177 // Toggle the vCPUs pause boolean 1178 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 1179 1180 // Unpark all the VCPU threads. 1181 for state in self.vcpu_states.iter() { 1182 state.unpark_thread(); 1183 } 1184 1185 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 1186 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 1187 // above. 1188 for state in self.vcpu_states.iter() { 1189 state.signal_thread(); 1190 } 1191 1192 // Wait for all the threads to finish. This removes the state from the vector. 1193 for mut state in self.vcpu_states.drain(..) { 1194 state.join_thread()?; 1195 } 1196 1197 Ok(()) 1198 } 1199 1200 #[cfg(feature = "tdx")] 1201 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1202 for vcpu in &self.vcpus { 1203 vcpu.lock() 1204 .unwrap() 1205 .vcpu 1206 .tdx_init(hob_address) 1207 .map_err(Error::InitializeTdx)?; 1208 } 1209 Ok(()) 1210 } 1211 1212 pub fn boot_vcpus(&self) -> u8 { 1213 self.config.boot_vcpus 1214 } 1215 1216 pub fn max_vcpus(&self) -> u8 { 1217 self.config.max_vcpus 1218 } 1219 1220 #[cfg(target_arch = "x86_64")] 1221 pub fn common_cpuid(&self) -> Vec<CpuIdEntry> { 1222 self.cpuid.clone() 1223 } 1224 1225 fn present_vcpus(&self) -> u8 { 1226 self.vcpu_states 1227 .iter() 1228 .fold(0, |acc, state| acc + state.active() as u8) 1229 } 1230 1231 #[cfg(target_arch = "aarch64")] 1232 pub fn get_mpidrs(&self) -> Vec<u64> { 1233 self.vcpus 1234 .iter() 1235 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1236 .collect() 1237 } 1238 1239 #[cfg(target_arch = "aarch64")] 1240 pub fn get_saved_states(&self) -> Vec<CpuState> { 1241 self.vcpus 1242 .iter() 1243 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1244 .collect() 1245 } 1246 1247 #[cfg(target_arch = "aarch64")] 1248 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1249 self.config 1250 .topology 1251 .clone() 1252 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1253 } 1254 1255 pub fn create_madt(&self) -> Sdt { 1256 use crate::acpi; 1257 // This is also checked in the commandline parsing. 
1258 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1259 1260 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1261 #[cfg(target_arch = "x86_64")] 1262 { 1263 madt.write(36, arch::layout::APIC_START); 1264 1265 for cpu in 0..self.config.max_vcpus { 1266 let lapic = LocalApic { 1267 r#type: acpi::ACPI_APIC_PROCESSOR, 1268 length: 8, 1269 processor_id: cpu, 1270 apic_id: cpu, 1271 flags: if cpu < self.config.boot_vcpus { 1272 1 << MADT_CPU_ENABLE_FLAG 1273 } else { 1274 0 1275 } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG, 1276 }; 1277 madt.append(lapic); 1278 } 1279 1280 madt.append(Ioapic { 1281 r#type: acpi::ACPI_APIC_IO, 1282 length: 12, 1283 ioapic_id: 0, 1284 apic_address: arch::layout::IOAPIC_START.0 as u32, 1285 gsi_base: 0, 1286 ..Default::default() 1287 }); 1288 1289 madt.append(InterruptSourceOverride { 1290 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1291 length: 10, 1292 bus: 0, 1293 source: 4, 1294 gsi: 4, 1295 flags: 0, 1296 }); 1297 } 1298 1299 #[cfg(target_arch = "aarch64")] 1300 { 1301 /* Notes: 1302 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1303 */ 1304 1305 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 1306 for cpu in 0..self.config.boot_vcpus { 1307 let vcpu = &self.vcpus[cpu as usize]; 1308 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1309 /* ARMv8 MPIDR format: 1310 Bits [63:40] Must be zero 1311 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1312 Bits [31:24] Must be zero 1313 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1314 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1315 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1316 */ 1317 let mpidr_mask = 0xff_00ff_ffff; 1318 let gicc = GicC { 1319 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1320 length: 80, 1321 reserved0: 0, 1322 cpu_interface_number: cpu as u32, 1323 uid: cpu as u32, 1324 flags: 1, 1325 parking_version: 0, 1326 performance_interrupt: 0, 1327 parked_address: 0, 1328 base_address: 0, 1329 gicv_base_address: 0, 1330 gich_base_address: 0, 1331 vgic_interrupt: 0, 1332 gicr_base_address: 0, 1333 mpidr: mpidr & mpidr_mask, 1334 proc_power_effi_class: 0, 1335 reserved1: 0, 1336 spe_overflow_interrupt: 0, 1337 }; 1338 1339 madt.append(gicc); 1340 } 1341 let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into()); 1342 1343 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1344 let gicd = GicD { 1345 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1346 length: 24, 1347 reserved0: 0, 1348 gic_id: 0, 1349 base_address: vgic_config.dist_addr, 1350 global_irq_base: 0, 1351 version: 3, 1352 reserved1: [0; 3], 1353 }; 1354 madt.append(gicd); 1355 1356 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1357 let gicr = GicR { 1358 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1359 length: 16, 1360 reserved: 0, 1361 base_address: vgic_config.redists_addr, 1362 range_length: vgic_config.redists_size as u32, 1363 }; 1364 madt.append(gicr); 1365 1366 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 
1367 let gicits = GicIts { 1368 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1369 length: 20, 1370 reserved0: 0, 1371 translation_id: 0, 1372 base_address: vgic_config.msi_addr, 1373 reserved1: 0, 1374 }; 1375 madt.append(gicits); 1376 1377 madt.update_checksum(); 1378 } 1379 1380 madt 1381 } 1382 1383 #[cfg(target_arch = "aarch64")] 1384 pub fn create_pptt(&self) -> Sdt { 1385 let pptt_start = 0; 1386 let mut cpus = 0; 1387 let mut uid = 0; 1388 // If topology is not specified, the default setting is: 1389 // 1 package, multiple cores, 1 thread per core 1390 // This is also the behavior when PPTT is missing. 1391 let (threads_per_core, cores_per_package, packages) = 1392 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1393 1394 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1395 1396 for cluster_idx in 0..packages { 1397 if cpus < self.config.boot_vcpus as usize { 1398 let cluster_offset = pptt.len() - pptt_start; 1399 let cluster_hierarchy_node = ProcessorHierarchyNode { 1400 r#type: 0, 1401 length: 20, 1402 reserved: 0, 1403 flags: 0x2, 1404 parent: 0, 1405 acpi_processor_id: cluster_idx as u32, 1406 num_private_resources: 0, 1407 }; 1408 pptt.append(cluster_hierarchy_node); 1409 1410 for core_idx in 0..cores_per_package { 1411 let core_offset = pptt.len() - pptt_start; 1412 1413 if threads_per_core > 1 { 1414 let core_hierarchy_node = ProcessorHierarchyNode { 1415 r#type: 0, 1416 length: 20, 1417 reserved: 0, 1418 flags: 0x2, 1419 parent: cluster_offset as u32, 1420 acpi_processor_id: core_idx as u32, 1421 num_private_resources: 0, 1422 }; 1423 pptt.append(core_hierarchy_node); 1424 1425 for _thread_idx in 0..threads_per_core { 1426 let thread_hierarchy_node = ProcessorHierarchyNode { 1427 r#type: 0, 1428 length: 20, 1429 reserved: 0, 1430 flags: 0xE, 1431 parent: core_offset as u32, 1432 acpi_processor_id: uid as u32, 1433 num_private_resources: 0, 1434 }; 1435 pptt.append(thread_hierarchy_node); 1436 uid += 1; 1437 } 1438 } else { 1439 let thread_hierarchy_node = ProcessorHierarchyNode { 1440 r#type: 0, 1441 length: 20, 1442 reserved: 0, 1443 flags: 0xA, 1444 parent: cluster_offset as u32, 1445 acpi_processor_id: uid as u32, 1446 num_private_resources: 0, 1447 }; 1448 pptt.append(thread_hierarchy_node); 1449 uid += 1; 1450 } 1451 } 1452 cpus += (cores_per_package * threads_per_core) as usize; 1453 } 1454 } 1455 1456 pptt.update_checksum(); 1457 pptt 1458 } 1459 1460 #[cfg(feature = "guest_debug")] 1461 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1462 self.vcpus[usize::from(cpu_id)] 1463 .lock() 1464 .unwrap() 1465 .vcpu 1466 .get_regs() 1467 .map_err(Error::CpuDebug) 1468 } 1469 1470 #[cfg(feature = "guest_debug")] 1471 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1472 self.vcpus[usize::from(cpu_id)] 1473 .lock() 1474 .unwrap() 1475 .vcpu 1476 .set_regs(regs) 1477 .map_err(Error::CpuDebug) 1478 } 1479 1480 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1481 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1482 self.vcpus[usize::from(cpu_id)] 1483 .lock() 1484 .unwrap() 1485 .vcpu 1486 .get_sregs() 1487 .map_err(Error::CpuDebug) 1488 } 1489 1490 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1491 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1492 self.vcpus[usize::from(cpu_id)] 1493 .lock() 1494 .unwrap() 1495 .vcpu 1496 .set_sregs(sregs) 1497 .map_err(Error::CpuDebug) 1498 } 1499 1500 #[cfg(all(target_arch = "x86_64", feature = 
"guest_debug"))] 1501 fn translate_gva(&self, cpu_id: u8, gva: u64) -> Result<u64> { 1502 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1503 .lock() 1504 .unwrap() 1505 .vcpu 1506 .translate_gva(gva, /* flags: unused */ 0) 1507 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1508 Ok(gpa) 1509 } 1510 1511 /// 1512 /// On AArch64, `translate_gva` API is not provided by KVM. We implemented 1513 /// it in VMM by walking through translation tables. 1514 /// 1515 /// Address translation is big topic, here we only focus the scenario that 1516 /// happens in VMM while debugging kernel. This `translate_gva` 1517 /// implementation is restricted to: 1518 /// - Exception Level 1 1519 /// - Translate high address range only (kernel space) 1520 /// 1521 /// This implementation supports following Arm-v8a features related to 1522 /// address translation: 1523 /// - FEAT_LPA 1524 /// - FEAT_LVA 1525 /// - FEAT_LPA2 1526 /// 1527 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 1528 fn translate_gva(&self, cpu_id: u8, gva: u64) -> Result<u64> { 1529 let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)] 1530 .lock() 1531 .unwrap() 1532 .vcpu 1533 .get_sys_reg(regs::TCR_EL1) 1534 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1535 let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)] 1536 .lock() 1537 .unwrap() 1538 .vcpu 1539 .get_sys_reg(regs::TTBR1_EL1) 1540 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1541 let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)] 1542 .lock() 1543 .unwrap() 1544 .vcpu 1545 .get_sys_reg(regs::ID_AA64MMFR0_EL1) 1546 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; 1547 1548 // Bit 55 of the VA determines the range, high (0xFFFxxx...) 1549 // or low (0x000xxx...). 1550 let high_range = extract_bits_64!(gva, 55, 1); 1551 if high_range == 0 { 1552 info!("VA (0x{:x}) range is not supported!", gva); 1553 return Ok(gva); 1554 } 1555 1556 // High range size offset 1557 let tsz = extract_bits_64!(tcr_el1, 16, 6); 1558 // Granule size 1559 let tg = extract_bits_64!(tcr_el1, 30, 2); 1560 // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2 1561 let ds = extract_bits_64!(tcr_el1, 59, 1); 1562 1563 if tsz == 0 { 1564 info!("VA translation is not ready!"); 1565 return Ok(gva); 1566 } 1567 1568 // VA size is determined by TCR_BL1.T1SZ 1569 let va_size = 64 - tsz; 1570 // Number of bits in VA consumed in each level of translation 1571 let stride = match tg { 1572 3 => 13, // 64KB granule size 1573 1 => 11, // 16KB granule size 1574 _ => 9, // 4KB, default 1575 }; 1576 // Starting level of walking 1577 let mut level = 4 - (va_size - 4) / stride; 1578 1579 // PA or IPA size is determined 1580 let tcr_ips = extract_bits_64!(tcr_el1, 32, 3); 1581 #[allow(clippy::identity_op)] 1582 let pa_range = extract_bits_64!(id_aa64mmfr0_el1, 0, 4); 1583 // The IPA size in TCR_BL1 and PA Range in ID_AA64MMFR0_EL1 should match. 1584 // To be safe, we use the minimum value if they are different. 
        let pa_range = std::cmp::min(tcr_ips, pa_range);
        // PA size in bits
        let pa_size = match pa_range {
            0 => 32,
            1 => 36,
            2 => 40,
            3 => 42,
            4 => 44,
            5 => 48,
            6 => 52,
            _ => {
                return Err(Error::TranslateVirtualAddress(anyhow!(format!(
                    "PA range not supported {}",
                    pa_range
                ))))
            }
        };

        let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
        let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
        // If FEAT_LPA2 is present, the translation table descriptor holds
        // 50 bits of the table address of the next level.
        // Otherwise, it is 48 bits.
        let descaddrmask = if ds == 1 {
            !0u64 >> (64 - 50) // mask with 50 least significant bits
        } else {
            !0u64 >> (64 - 48) // mask with 48 least significant bits
        };
        let descaddrmask = descaddrmask & !indexmask_grainsize;

        // Translation table base address
        #[allow(clippy::identity_op)]
        let mut descaddr: u64 = extract_bits_64!(ttbr1_el1, 0, 48);
        // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
        // address bits [48:51] come from TTBR1_EL1 bits [2:5].
        if pa_size == 52 {
            descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
        }

        // Loop through tables of each level
        loop {
            // Table offset for current level
            let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
            descaddr |= table_offset;
            descaddr &= !7u64;

            let mut buf = [0; 8];
            self.vm_memory
                .memory()
                .read(&mut buf, GuestAddress(descaddr))
                .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
            let descriptor = u64::from_le_bytes(buf);

            descaddr = descriptor & descaddrmask;
            // In the case of FEAT_LPA, the next-level translation table address
            // bits [48:51] come from bits [12:15] of the current descriptor.
            // For FEAT_LPA2, the next-level translation table address
            // bits [50:51] come from bits [8:9] of the current descriptor, and
            // bits [48:49] come from bits [48:49] of the descriptor which was
            // handled previously.
            if pa_size == 52 {
                if ds == 1 {
                    // FEAT_LPA2
                    descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
                } else {
                    // FEAT_LPA
                    descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
                }
            }

            if (descriptor & 2) != 0 && (level < 3) {
                // This is a table entry. Go down to next level.
1657 level += 1; 1658 indexmask = indexmask_grainsize; 1659 continue; 1660 } 1661 1662 break; 1663 } 1664 1665 // We have reached either: 1666 // - a page entry at level 3 or 1667 // - a block entry at level 1 or 2 1668 let page_size = 1u64 << ((stride * (4 - level)) + 3); 1669 descaddr &= !(page_size - 1); 1670 descaddr |= gva & (page_size - 1); 1671 1672 Ok(descaddr) 1673 } 1674 } 1675 1676 struct Cpu { 1677 cpu_id: u8, 1678 proximity_domain: u32, 1679 dynamic: bool, 1680 } 1681 1682 #[cfg(target_arch = "x86_64")] 1683 const MADT_CPU_ENABLE_FLAG: usize = 0; 1684 1685 #[cfg(target_arch = "x86_64")] 1686 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; 1687 1688 impl Cpu { 1689 #[cfg(target_arch = "x86_64")] 1690 fn generate_mat(&self) -> Vec<u8> { 1691 let lapic = LocalApic { 1692 r#type: 0, 1693 length: 8, 1694 processor_id: self.cpu_id, 1695 apic_id: self.cpu_id, 1696 flags: 1 << MADT_CPU_ENABLE_FLAG, 1697 }; 1698 1699 let mut mat_data: Vec<u8> = Vec::new(); 1700 mat_data.resize(std::mem::size_of_val(&lapic), 0); 1701 unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic }; 1702 1703 mat_data 1704 } 1705 } 1706 1707 impl Aml for Cpu { 1708 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1709 #[cfg(target_arch = "x86_64")] 1710 let mat_data: Vec<u8> = self.generate_mat(); 1711 #[allow(clippy::if_same_then_else)] 1712 if self.dynamic { 1713 aml::Device::new( 1714 format!("C{:03}", self.cpu_id).as_str().into(), 1715 vec![ 1716 &aml::Name::new("_HID".into(), &"ACPI0007"), 1717 &aml::Name::new("_UID".into(), &self.cpu_id), 1718 // Currently, AArch64 cannot support following fields. 1719 /* 1720 _STA return value: 1721 Bit [0] – Set if the device is present. 1722 Bit [1] – Set if the device is enabled and decoding its resources. 1723 Bit [2] – Set if the device should be shown in the UI. 1724 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1725 Bit [4] – Set if the battery is present. 1726 Bits [31:5] – Reserved (must be cleared). 
1727 */ 1728 #[cfg(target_arch = "x86_64")] 1729 &aml::Method::new( 1730 "_STA".into(), 1731 0, 1732 false, 1733 // Call into CSTA method which will interrogate device 1734 vec![&aml::Return::new(&aml::MethodCall::new( 1735 "CSTA".into(), 1736 vec![&self.cpu_id], 1737 ))], 1738 ), 1739 &aml::Method::new( 1740 "_PXM".into(), 1741 0, 1742 false, 1743 vec![&aml::Return::new(&self.proximity_domain)], 1744 ), 1745 // The Linux kernel expects every CPU device to have a _MAT entry 1746 // containing the LAPIC for this processor with the enabled bit set 1747 // even it if is disabled in the MADT (non-boot CPU) 1748 #[cfg(target_arch = "x86_64")] 1749 &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)), 1750 // Trigger CPU ejection 1751 #[cfg(target_arch = "x86_64")] 1752 &aml::Method::new( 1753 "_EJ0".into(), 1754 1, 1755 false, 1756 // Call into CEJ0 method which will actually eject device 1757 vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])], 1758 ), 1759 ], 1760 ) 1761 .append_aml_bytes(bytes); 1762 } else { 1763 aml::Device::new( 1764 format!("C{:03}", self.cpu_id).as_str().into(), 1765 vec![ 1766 &aml::Name::new("_HID".into(), &"ACPI0007"), 1767 &aml::Name::new("_UID".into(), &self.cpu_id), 1768 #[cfg(target_arch = "x86_64")] 1769 &aml::Method::new( 1770 "_STA".into(), 1771 0, 1772 false, 1773 // Mark CPU present see CSTA implementation 1774 vec![&aml::Return::new(&0xfu8)], 1775 ), 1776 &aml::Method::new( 1777 "_PXM".into(), 1778 0, 1779 false, 1780 vec![&aml::Return::new(&self.proximity_domain)], 1781 ), 1782 // The Linux kernel expects every CPU device to have a _MAT entry 1783 // containing the LAPIC for this processor with the enabled bit set 1784 // even it if is disabled in the MADT (non-boot CPU) 1785 #[cfg(target_arch = "x86_64")] 1786 &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)), 1787 ], 1788 ) 1789 .append_aml_bytes(bytes); 1790 } 1791 } 1792 } 1793 1794 struct CpuNotify { 1795 cpu_id: u8, 1796 } 1797 1798 impl Aml for CpuNotify { 1799 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1800 let object = aml::Path::new(&format!("C{:03}", self.cpu_id)); 1801 aml::If::new( 1802 &aml::Equal::new(&aml::Arg(0), &self.cpu_id), 1803 vec![&aml::Notify::new(&object, &aml::Arg(1))], 1804 ) 1805 .append_aml_bytes(bytes) 1806 } 1807 } 1808 1809 struct CpuMethods { 1810 max_vcpus: u8, 1811 dynamic: bool, 1812 } 1813 1814 impl Aml for CpuMethods { 1815 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1816 if self.dynamic { 1817 // CPU status method 1818 aml::Method::new( 1819 "CSTA".into(), 1820 1, 1821 true, 1822 vec![ 1823 // Take lock defined above 1824 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1825 // Write CPU number (in first argument) to I/O port via field 1826 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 1827 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1828 // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning) 1829 &aml::If::new( 1830 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE), 1831 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 1832 ), 1833 // Release lock 1834 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1835 // Return 0 or 0xf 1836 &aml::Return::new(&aml::Local(0)), 1837 ], 1838 ) 1839 .append_aml_bytes(bytes); 1840 1841 let mut cpu_notifies = Vec::new(); 1842 for cpu_id in 0..self.max_vcpus { 1843 cpu_notifies.push(CpuNotify { cpu_id }); 1844 } 1845 1846 let mut cpu_notifies_refs: Vec<&dyn aml::Aml> = Vec::new(); 1847 for cpu_id in 
0..self.max_vcpus { 1848 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 1849 } 1850 1851 aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).append_aml_bytes(bytes); 1852 1853 aml::Method::new( 1854 "CEJ0".into(), 1855 1, 1856 true, 1857 vec![ 1858 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1859 // Write CPU number (in first argument) to I/O port via field 1860 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 1861 // Set CEJ0 bit 1862 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 1863 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1864 ], 1865 ) 1866 .append_aml_bytes(bytes); 1867 1868 aml::Method::new( 1869 "CSCN".into(), 1870 0, 1871 true, 1872 vec![ 1873 // Take lock defined above 1874 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1875 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1876 &aml::While::new( 1877 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 1878 vec![ 1879 // Write CPU number (in first argument) to I/O port via field 1880 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 1881 // Check if CINS bit is set 1882 &aml::If::new( 1883 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 1884 // Notify device if it is 1885 vec![ 1886 &aml::MethodCall::new( 1887 "CTFY".into(), 1888 vec![&aml::Local(0), &aml::ONE], 1889 ), 1890 // Reset CINS bit 1891 &aml::Store::new( 1892 &aml::Path::new("\\_SB_.PRES.CINS"), 1893 &aml::ONE, 1894 ), 1895 ], 1896 ), 1897 // Check if CRMV bit is set 1898 &aml::If::new( 1899 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 1900 // Notify device if it is (with the eject constant 0x3) 1901 vec![ 1902 &aml::MethodCall::new( 1903 "CTFY".into(), 1904 vec![&aml::Local(0), &3u8], 1905 ), 1906 // Reset CRMV bit 1907 &aml::Store::new( 1908 &aml::Path::new("\\_SB_.PRES.CRMV"), 1909 &aml::ONE, 1910 ), 1911 ], 1912 ), 1913 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 1914 ], 1915 ), 1916 // Release lock 1917 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1918 ], 1919 ) 1920 .append_aml_bytes(bytes) 1921 } else { 1922 aml::Method::new("CSCN".into(), 0, true, vec![]).append_aml_bytes(bytes) 1923 } 1924 } 1925 } 1926 1927 impl Aml for CpuManager { 1928 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1929 #[cfg(target_arch = "x86_64")] 1930 if let Some(acpi_address) = self.acpi_address { 1931 // CPU hotplug controller 1932 aml::Device::new( 1933 "_SB_.PRES".into(), 1934 vec![ 1935 &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")), 1936 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 1937 // Mutex to protect concurrent access as we write to choose CPU and then read back status 1938 &aml::Mutex::new("CPLK".into(), 0), 1939 &aml::Name::new( 1940 "_CRS".into(), 1941 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 1942 aml::AddressSpaceCachable::NotCacheable, 1943 true, 1944 acpi_address.0 as u64, 1945 acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 1946 )]), 1947 ), 1948 // OpRegion and Fields map MMIO range into individual field values 1949 &aml::OpRegion::new( 1950 "PRST".into(), 1951 aml::OpRegionSpace::SystemMemory, 1952 acpi_address.0 as usize, 1953 CPU_MANAGER_ACPI_SIZE, 1954 ), 1955 &aml::Field::new( 1956 "PRST".into(), 1957 aml::FieldAccessType::Byte, 1958 aml::FieldUpdateRule::WriteAsZeroes, 1959 vec![ 1960 aml::FieldEntry::Reserved(32), 1961 aml::FieldEntry::Named(*b"CPEN", 1), 1962 aml::FieldEntry::Named(*b"CINS", 1), 1963 aml::FieldEntry::Named(*b"CRMV", 1), 1964 
aml::FieldEntry::Named(*b"CEJ0", 1), 1965 aml::FieldEntry::Reserved(4), 1966 aml::FieldEntry::Named(*b"CCMD", 8), 1967 ], 1968 ), 1969 &aml::Field::new( 1970 "PRST".into(), 1971 aml::FieldAccessType::DWord, 1972 aml::FieldUpdateRule::Preserve, 1973 vec![ 1974 aml::FieldEntry::Named(*b"CSEL", 32), 1975 aml::FieldEntry::Reserved(32), 1976 aml::FieldEntry::Named(*b"CDAT", 32), 1977 ], 1978 ), 1979 ], 1980 ) 1981 .append_aml_bytes(bytes); 1982 } 1983 1984 // CPU devices 1985 let hid = aml::Name::new("_HID".into(), &"ACPI0010"); 1986 let uid = aml::Name::new("_CID".into(), &aml::EisaName::new("PNP0A05")); 1987 // Bundle methods together under a common object 1988 let methods = CpuMethods { 1989 max_vcpus: self.config.max_vcpus, 1990 dynamic: self.dynamic, 1991 }; 1992 let mut cpu_data_inner: Vec<&dyn aml::Aml> = vec![&hid, &uid, &methods]; 1993 1994 let mut cpu_devices = Vec::new(); 1995 for cpu_id in 0..self.config.max_vcpus { 1996 let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0); 1997 let cpu_device = Cpu { 1998 cpu_id, 1999 proximity_domain, 2000 dynamic: self.dynamic, 2001 }; 2002 2003 cpu_devices.push(cpu_device); 2004 } 2005 2006 for cpu_device in cpu_devices.iter() { 2007 cpu_data_inner.push(cpu_device); 2008 } 2009 2010 aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).append_aml_bytes(bytes) 2011 } 2012 } 2013 2014 impl Pausable for CpuManager { 2015 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2016 // Tell the vCPUs to pause themselves next time they exit 2017 self.vcpus_pause_signalled.store(true, Ordering::SeqCst); 2018 2019 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 2020 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 2021 // above. 2022 for state in self.vcpu_states.iter() { 2023 state.signal_thread(); 2024 } 2025 2026 for vcpu in self.vcpus.iter() { 2027 let mut vcpu = vcpu.lock().unwrap(); 2028 vcpu.pause()?; 2029 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2030 if !self.config.kvm_hyperv { 2031 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { 2032 MigratableError::Pause(anyhow!( 2033 "Could not notify guest it has been paused {:?}", 2034 e 2035 )) 2036 })?; 2037 } 2038 } 2039 2040 Ok(()) 2041 } 2042 2043 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2044 for vcpu in self.vcpus.iter() { 2045 vcpu.lock().unwrap().resume()?; 2046 } 2047 2048 // Toggle the vCPUs pause boolean 2049 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 2050 2051 // Unpark all the VCPU threads. 2052 // Once unparked, the next thing they will do is checking for the pause 2053 // boolean. Since it'll be set to false, they will exit their pause loop 2054 // and go back to vmx root. 2055 for state in self.vcpu_states.iter() { 2056 state.unpark_thread(); 2057 } 2058 Ok(()) 2059 } 2060 } 2061 2062 impl Snapshottable for CpuManager { 2063 fn id(&self) -> String { 2064 CPU_MANAGER_SNAPSHOT_ID.to_string() 2065 } 2066 2067 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2068 let mut cpu_manager_snapshot = Snapshot::new(CPU_MANAGER_SNAPSHOT_ID); 2069 2070 // The CpuManager snapshot is a collection of all vCPUs snapshots. 
        for vcpu in &self.vcpus {
            let cpu_snapshot = vcpu.lock().unwrap().snapshot()?;
            cpu_manager_snapshot.add_snapshot(cpu_snapshot);
        }

        Ok(cpu_manager_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        for (cpu_id, snapshot) in snapshot.snapshots.iter() {
            info!("Restoring VCPU {}", cpu_id);
            self.create_vcpu(cpu_id.parse::<u8>().unwrap(), None, Some(*snapshot.clone()))
                .map_err(|e| MigratableError::Restore(anyhow!("Could not create vCPU {:?}", e)))?;
        }

        Ok(())
    }
}

impl Transportable for CpuManager {}
impl Migratable for CpuManager {}

#[cfg(feature = "guest_debug")]
impl Debuggable for CpuManager {
    #[cfg(feature = "kvm")]
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.vcpus[cpu_id]
            .lock()
            .unwrap()
            .vcpu
            .set_guest_debug(addrs, singlestep)
            .map_err(DebuggableError::SetDebug)
    }

    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        Ok(())
    }

    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let regs = [
            gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
            gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
        ];

        // GDB exposes 32-bit eflags instead of 64-bit rflags.
        // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
        let eflags = gregs.rflags as u32;
        let rip = gregs.rip;

        // Segment registers: CS, SS, DS, ES, FS, GS
        let sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let segments = X86SegmentRegs {
            cs: sregs.cs.selector as u32,
            ss: sregs.ss.selector as u32,
            ds: sregs.ds.selector as u32,
            es: sregs.es.selector as u32,
            fs: sregs.fs.selector as u32,
            gs: sregs.gs.selector as u32,
        };

        // TODO: Add other registers

        Ok(CoreRegs {
            regs,
            eflags,
            rip,
            segments,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "aarch64")]
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        let gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        Ok(CoreRegs {
            x: gregs.regs.regs,
            sp: gregs.regs.sp,
            pc: gregs.regs.pc,
            ..Default::default()
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let orig_gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        let gregs = StandardRegisters {
            rax: regs.regs[0],
            rbx: regs.regs[1],
            rcx: regs.regs[2],
            rdx: regs.regs[3],
            rsi: regs.regs[4],
            rdi: regs.regs[5],
            rbp: regs.regs[6],
            rsp: regs.regs[7],
            r8: regs.regs[8],
            r9: regs.regs[9],
            r10: regs.regs[10],
            r11: regs.regs[11],
            r12: regs.regs[12],
            r13: regs.regs[13],
            r14: regs.regs[14],
            r15: regs.regs[15],
            rip: regs.rip,
            // Update only the lower 32 bits of rflags; the upper half keeps its current value.
            rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
        };

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // Since GDB only cares about the selectors, we call get_sregs() first.
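        // Only the 16-bit selector values supplied by GDB are overwritten below; the cached
        // base, limit and access-rights parts of each segment register are taken from
        // get_sregs() and written back unchanged. For illustration (hypothetical value): if GDB
        // sends cs = 0x10, only `sregs.cs.selector` changes to 0x10.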
        let mut sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        sregs.cs.selector = regs.segments.cs as u16;
        sregs.ss.selector = regs.segments.ss as u16;
        sregs.ds.selector = regs.segments.ds as u16;
        sregs.es.selector = regs.segments.es as u16;
        sregs.fs.selector = regs.segments.fs as u16;
        sregs.gs.selector = regs.segments.gs as u16;

        self.set_sregs(cpu_id as u8, &sregs)
            .map_err(DebuggableError::WriteRegs)?;

        // TODO: Add other registers

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let mut gregs = self
            .get_regs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;

        gregs.regs.regs = regs.x;
        gregs.regs.sp = regs.sp;
        gregs.regs.pc = regs.pc;

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        Ok(())
    }

    fn read_mem(
        &self,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        let mut buf = vec![0; len];
        let mut total_read = 0_u64;

        while total_read < len as u64 {
            let gaddr = vaddr.0 + total_read;
            let paddr = match self.translate_gva(cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            // Read at most up to the next page boundary so each chunk needs only one GVA->GPA
            // translation.
            let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
            self.vm_memory
                .memory()
                .read(
                    &mut buf[total_read as usize..total_read as usize + read_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::ReadMem)?;
            total_read += read_len;
        }
        Ok(buf)
    }

    fn write_mem(
        &self,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        let mut total_written = 0_u64;

        while total_written < data.len() as u64 {
            let gaddr = vaddr.0 + total_written;
            let paddr = match self.translate_gva(cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let write_len = std::cmp::min(
                data.len() as u64 - total_written,
                psize - (paddr & (psize - 1)),
            );
            self.vm_memory
                .memory()
                .write(
                    &data[total_written as usize..total_written as usize + write_len as usize],
                    GuestAddress(paddr),
                )
                .map_err(DebuggableError::WriteMem)?;
            total_written += write_len;
        }
        Ok(())
    }

    fn active_vcpus(&self) -> usize {
        self.present_vcpus() as usize
    }
}

#[cfg(feature = "guest_debug")]
impl Elf64Writable for CpuManager {}

#[cfg(feature = "guest_debug")]
impl CpuElf64Writable for CpuManager {
    fn cpu_write_elf64_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Elf, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<X86_64ElfPrStatus>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: NT_PRSTATUS,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
            buf.resize(pos + 4, 0);
            buf.splice(pos.., "CORE".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
            buf.resize(pos + 32 + 4, 0);
            let pid = vcpu_id as u64;
            let bytes: &[u8] = pid.as_slice();
            buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */

            pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();

            let orig_rax: u64 = 0;
            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
                gregs.r10,
            ];
            let regs2 = [
                gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            debug!(
                "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
                gregs.rip,
                gregs.rsp,
                sregs.gs.base,
                sregs.cs.selector,
                sregs.ss.selector,
                sregs.ds.selector,
            );

            let regs = X86_64UserRegs {
                regs1,
                regs2,
                rip: gregs.rip,
                cs: sregs.cs.selector as u64,
                eflags: gregs.rflags,
                rsp: gregs.rsp,
                ss: sregs.ss.selector as u64,
                fs_base: sregs.fs.base as u64,
                gs_base: sregs.gs.base as u64,
                ds: sregs.ds.selector as u64,
                es: sregs.es.selector as u64,
                fs: sregs.fs.selector as u64,
                gs: sregs.gs.selector as u64,
            };

            // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) };
            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }

    fn cpu_write_vmm_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Vmm, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<DumpCpusState>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: 0,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);

            buf.resize(pos + 4, 0);
            buf.splice(pos.., "QEMU".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);

            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
                gregs.rbp,
            ];

            let regs2 = [
                gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
                gregs.r15,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            let mut msrs = vec![MsrEntry {
                index: msr_index::MSR_KERNEL_GS_BASE,
                ..Default::default()
            }];

            self.vcpus[vcpu_id as usize]
                .lock()
                .unwrap()
                .vcpu
                .get_msrs(&mut msrs)
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
            let kernel_gs_base = msrs[0].data;

            let cs = CpuSegment::new(sregs.cs);
            let ds = CpuSegment::new(sregs.ds);
            let es = CpuSegment::new(sregs.es);
            let fs = CpuSegment::new(sregs.fs);
            let gs = CpuSegment::new(sregs.gs);
            let ss = CpuSegment::new(sregs.ss);
            let ldt = CpuSegment::new(sregs.ldt);
            let tr = CpuSegment::new(sregs.tr);
            let gdt = CpuSegment::new_from_table(sregs.gdt);
            let idt = CpuSegment::new_from_table(sregs.idt);
            let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
            let regs = DumpCpusState {
                version: 1,
                size: size_of::<DumpCpusState>() as u32,
                regs1,
                regs2,
                rip: gregs.rip,
                rflags: gregs.rflags,
                cs,
                ds,
                es,
                fs,
                gs,
                ss,
                ldt,
                tr,
                gdt,
                idt,
                cr,
                kernel_gs_base,
            };

            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use arch::x86_64::interrupts::*;
    use arch::x86_64::regs::*;
    use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};

    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail if there is no irqchip beforehand.
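        // set_lint() is expected to program LVT0 for ExtINT delivery and LVT1 for NMI delivery,
        // so the test captures the LAPIC state before and after and compares the two LVT
        // registers.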
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
        let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
        let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below would fail. Decide whether or not we should
        // remove it altogether.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }];

        // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read
        // one in this test case.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we did setup_msrs. We need to assert that the
        // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
        // expect.
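        // boot_msr_entries() is assumed to return the same boot-time MSR list that setup_msrs
        // programmed, so its tenth entry (index 9) should equal the MSR_IA32_MISC_ENABLE value
        // read back above.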
        let entry_vec = vcpu.boot_msr_entries();
        assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
    }

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let expected_regs: StandardRegisters = StandardRegisters {
            rflags: 0x0000000000000002u64,
            rbx: arch::layout::PVH_INFO_START.0,
            rip: 1,
            ..Default::default()
        };

        setup_regs(&vcpu, expected_regs.rip).unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use arch::{aarch64::regs, layout};
    use hypervisor::kvm::aarch64::is_system_register;
    use hypervisor::kvm::kvm_bindings::{
        kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
        KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
    };
    use hypervisor::{arm64_core_reg_id, offset__of};
    use std::mem;

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
        // Must fail when vcpu is not initialized yet.
        assert!(res.is_err());

        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();
        vcpu.vcpu_init(&kvi).unwrap();

        assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
    }

    #[test]
    fn test_read_mpidr() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());

        vcpu.vcpu_init(&kvi).unwrap();
        assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
    }

    #[test]
    fn test_is_system_register() {
        let offset = offset__of!(user_pt_regs, pc);
        let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
        assert!(!is_system_register(regid));
        let regid = KVM_REG_ARM64 as u64 | KVM_REG_SIZE_U64 as u64 | KVM_REG_ARM64_SYSREG as u64;
        assert!(is_system_register(regid));
    }

    #[test]
    fn test_save_restore_core_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
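        // KVM reports ENOEXEC ("Exec format error") for register accesses on an aarch64 vCPU
        // that has not gone through vcpu_init() yet, which is what the asserts below check.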
        let res = vcpu.get_regs();
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to get core register: Exec format error (os error 8)"
        );

        let mut state = kvm_regs::default();
        let res = vcpu.set_regs(&state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to set core register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        let res = vcpu.get_regs();
        assert!(res.is_ok());
        state = res.unwrap();
        assert_eq!(state.regs.pstate, 0x3C5);

        assert!(vcpu.set_regs(&state).is_ok());
    }

    #[test]
    fn test_get_set_mpstate() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        let res = vcpu.get_mp_state();
        assert!(res.is_ok());
        assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
    }
}