1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::CpusConfig; 15 use crate::device_manager::DeviceManager; 16 #[cfg(feature = "gdb")] 17 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError}; 18 use crate::memory_manager::MemoryManager; 19 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 20 #[cfg(target_arch = "x86_64")] 21 use crate::vm::physical_bits; 22 use crate::GuestMemoryMmap; 23 use crate::CPU_MANAGER_SNAPSHOT_ID; 24 use acpi_tables::{aml, aml::Aml, sdt::Sdt}; 25 use anyhow::anyhow; 26 use arch::EntryPoint; 27 use arch::NumaNodes; 28 use devices::interrupt_controller::InterruptController; 29 #[cfg(all(target_arch = "x86_64", feature = "gdb"))] 30 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs}; 31 #[cfg(target_arch = "aarch64")] 32 use hypervisor::kvm::kvm_bindings; 33 #[cfg(all(target_arch = "x86_64", feature = "gdb"))] 34 use hypervisor::x86_64::{SpecialRegisters, StandardRegisters}; 35 #[cfg(target_arch = "x86_64")] 36 use hypervisor::CpuId; 37 use hypervisor::{vm::VmmOps, CpuState, HypervisorCpuError, VmExit}; 38 #[cfg(feature = "tdx")] 39 use hypervisor::{TdxExitDetails, TdxExitStatus}; 40 use libc::{c_void, siginfo_t}; 41 use seccompiler::{apply_filter, SeccompAction}; 42 use std::collections::BTreeMap; 43 use std::os::unix::thread::JoinHandleExt; 44 use std::sync::atomic::{AtomicBool, Ordering}; 45 use std::sync::{Arc, Barrier, Mutex}; 46 use std::{cmp, io, result, thread}; 47 use vm_device::BusDevice; 48 use vm_memory::GuestAddress; 49 use vm_memory::GuestMemoryAtomic; 50 use vm_migration::{ 51 Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable, 52 Transportable, 53 }; 54 use vmm_sys_util::eventfd::EventFd; 55 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN}; 56 57 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc; 58 59 #[derive(Debug)] 60 pub enum Error { 61 /// Cannot create the vCPU. 62 VcpuCreate(anyhow::Error), 63 64 /// Cannot run the VCPUs. 65 VcpuRun(anyhow::Error), 66 67 /// Cannot spawn a new vCPU thread. 68 VcpuSpawn(io::Error), 69 70 /// Cannot generate common CPUID 71 CommonCpuId(arch::Error), 72 73 /// Error configuring VCPU 74 VcpuConfiguration(arch::Error), 75 76 #[cfg(target_arch = "aarch64")] 77 /// Error fetching preferred target 78 VcpuArmPreferredTarget(hypervisor::HypervisorVmError), 79 80 #[cfg(target_arch = "aarch64")] 81 /// Error doing vCPU init on Arm. 82 VcpuArmInit(hypervisor::HypervisorCpuError), 83 84 /// Failed to join on vCPU threads 85 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 86 87 /// Cannot add legacy device to Bus. 88 BusError(vm_device::BusError), 89 90 /// Asking for more vCPUs than we can have 91 DesiredVCpuCountExceedsMax, 92 93 /// Cannot create seccomp filter 94 CreateSeccompFilter(seccompiler::Error), 95 96 /// Cannot apply seccomp filter 97 ApplySeccompFilter(seccompiler::Error), 98 99 /// Error starting vCPU after restore 100 StartRestoreVcpu(anyhow::Error), 101 102 /// Error because an unexpected VmExit type was received.
103 UnexpectedVmExit, 104 105 /// Failed to allocate MMIO address 106 AllocateMmmioAddress, 107 108 #[cfg(feature = "tdx")] 109 InitializeTdx(hypervisor::HypervisorCpuError), 110 111 #[cfg(target_arch = "aarch64")] 112 InitPmu(hypervisor::HypervisorCpuError), 113 114 /// Failed scheduling the thread on the expected CPU set. 115 ScheduleCpuSet, 116 117 #[cfg(all(target_arch = "x86_64", feature = "gdb"))] 118 /// Error on debug related CPU ops. 119 CpuDebug(hypervisor::HypervisorCpuError), 120 121 #[cfg(all(target_arch = "x86_64", feature = "gdb"))] 122 /// Failed to translate guest virtual address. 123 TranslateVirtualAddress(hypervisor::HypervisorCpuError), 124 125 #[cfg(all(feature = "amx", target_arch = "x86_64"))] 126 /// Failed to set up AMX. 127 AmxEnable(anyhow::Error), 128 } 129 pub type Result<T> = result::Result<T, Error>; 130 131 #[cfg(target_arch = "x86_64")] 132 #[allow(dead_code)] 133 #[repr(packed)] 134 struct LocalApic { 135 pub r#type: u8, 136 pub length: u8, 137 pub processor_id: u8, 138 pub apic_id: u8, 139 pub flags: u32, 140 } 141 142 #[allow(dead_code)] 143 #[repr(packed)] 144 #[derive(Default)] 145 struct Ioapic { 146 pub r#type: u8, 147 pub length: u8, 148 pub ioapic_id: u8, 149 _reserved: u8, 150 pub apic_address: u32, 151 pub gsi_base: u32, 152 } 153 154 #[cfg(target_arch = "aarch64")] 155 #[allow(dead_code)] 156 #[repr(packed)] 157 struct GicC { 158 pub r#type: u8, 159 pub length: u8, 160 pub reserved0: u16, 161 pub cpu_interface_number: u32, 162 pub uid: u32, 163 pub flags: u32, 164 pub parking_version: u32, 165 pub performance_interrupt: u32, 166 pub parked_address: u64, 167 pub base_address: u64, 168 pub gicv_base_address: u64, 169 pub gich_base_address: u64, 170 pub vgic_interrupt: u32, 171 pub gicr_base_address: u64, 172 pub mpidr: u64, 173 pub proc_power_effi_class: u8, 174 pub reserved1: u8, 175 pub spe_overflow_interrupt: u16, 176 } 177 178 #[cfg(target_arch = "aarch64")] 179 #[allow(dead_code)] 180 #[repr(packed)] 181 struct GicD { 182 pub r#type: u8, 183 pub length: u8, 184 pub reserved0: u16, 185 pub gic_id: u32, 186 pub base_address: u64, 187 pub global_irq_base: u32, 188 pub version: u8, 189 pub reserved1: [u8; 3], 190 } 191 192 #[cfg(target_arch = "aarch64")] 193 #[allow(dead_code)] 194 #[repr(packed)] 195 struct GicR { 196 pub r#type: u8, 197 pub length: u8, 198 pub reserved: u16, 199 pub base_address: u64, 200 pub range_length: u32, 201 } 202 203 #[cfg(target_arch = "aarch64")] 204 #[allow(dead_code)] 205 #[repr(packed)] 206 struct GicIts { 207 pub r#type: u8, 208 pub length: u8, 209 pub reserved0: u16, 210 pub translation_id: u32, 211 pub base_address: u64, 212 pub reserved1: u32, 213 } 214 215 #[cfg(target_arch = "aarch64")] 216 #[allow(dead_code)] 217 #[repr(packed)] 218 struct ProcessorHierarchyNode { 219 pub r#type: u8, 220 pub length: u8, 221 pub reserved: u16, 222 pub flags: u32, 223 pub parent: u32, 224 pub acpi_processor_id: u32, 225 pub num_private_resources: u32, 226 } 227 228 #[allow(dead_code)] 229 #[repr(packed)] 230 #[derive(Default)] 231 struct InterruptSourceOverride { 232 pub r#type: u8, 233 pub length: u8, 234 pub bus: u8, 235 pub source: u8, 236 pub gsi: u32, 237 pub flags: u16, 238 } 239 240 /// A wrapper around creating and using a kvm-based VCPU. 241 pub struct Vcpu { 242 // The hypervisor abstracted CPU. 243 vcpu: Arc<dyn hypervisor::Vcpu>, 244 id: u8, 245 #[cfg(target_arch = "aarch64")] 246 mpidr: u64, 247 saved_state: Option<CpuState>, 248 } 249 250 impl Vcpu { 251 /// Constructs a new VCPU for `vm`.
252 /// 253 /// # Arguments 254 /// 255 /// * `id` - Represents the CPU number between [0, max vcpus). 256 /// * `vm` - The virtual machine this vcpu will get attached to. 257 /// * `vmmops` - Optional object for exit handling. 258 pub fn new( 259 id: u8, 260 vm: &Arc<dyn hypervisor::Vm>, 261 vmmops: Option<Arc<dyn VmmOps>>, 262 ) -> Result<Self> { 263 let vcpu = vm 264 .create_vcpu(id, vmmops) 265 .map_err(|e| Error::VcpuCreate(e.into()))?; 266 // Initially the cpuid per vCPU is the one supported by this VM. 267 Ok(Vcpu { 268 vcpu, 269 id, 270 #[cfg(target_arch = "aarch64")] 271 mpidr: 0, 272 saved_state: None, 273 }) 274 } 275 276 /// Configures a vcpu and should be called once per vcpu when created. 277 /// 278 /// # Arguments 279 /// 280 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 281 /// * `vm_memory` - Guest memory. 282 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 283 pub fn configure( 284 &mut self, 285 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 286 kernel_entry_point: Option<EntryPoint>, 287 #[cfg(target_arch = "x86_64")] vm_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 288 #[cfg(target_arch = "x86_64")] cpuid: CpuId, 289 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 290 ) -> Result<()> { 291 #[cfg(target_arch = "aarch64")] 292 { 293 self.init(vm)?; 294 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, kernel_entry_point) 295 .map_err(Error::VcpuConfiguration)?; 296 } 297 info!("Configuring vCPU: cpu_id = {}", self.id); 298 #[cfg(target_arch = "x86_64")] 299 arch::configure_vcpu( 300 &self.vcpu, 301 self.id, 302 kernel_entry_point, 303 vm_memory, 304 cpuid, 305 kvm_hyperv, 306 ) 307 .map_err(Error::VcpuConfiguration)?; 308 309 Ok(()) 310 } 311 312 /// Gets the MPIDR register value. 313 #[cfg(target_arch = "aarch64")] 314 pub fn get_mpidr(&self) -> u64 { 315 self.mpidr 316 } 317 318 /// Gets the saved vCPU state. 319 #[cfg(target_arch = "aarch64")] 320 pub fn get_saved_state(&self) -> Option<CpuState> { 321 self.saved_state.clone() 322 } 323 324 /// Initializes an aarch64 specific vcpu for booting Linux. 325 #[cfg(target_arch = "aarch64")] 326 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 327 let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); 328 329 // This reads back the kernel's preferred target type. 330 vm.get_preferred_target(&mut kvi) 331 .map_err(Error::VcpuArmPreferredTarget)?; 332 // We already checked that the capability is supported. 333 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; 334 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3; 335 // Non-boot cpus are powered off initially. 336 if self.id > 0 { 337 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; 338 } 339 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit) 340 } 341 342 /// Runs the VCPU until it exits, returning the reason. 343 /// 344 /// Note that the state of the VCPU and associated VM must be setup first for this to do 345 /// anything useful. 
345 /// anything useful.
346 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 347 self.vcpu.run() 348 } 349 } 350 351 const VCPU_SNAPSHOT_ID: &str = "vcpu"; 352 impl Pausable for Vcpu { 353 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 354 self.saved_state = 355 Some(self.vcpu.state().map_err(|e| { 356 MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)) 357 })?); 358 359 Ok(()) 360 } 361 362 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 363 if let Some(vcpu_state) = &self.saved_state { 364 self.vcpu.set_state(vcpu_state).map_err(|e| { 365 MigratableError::Pause(anyhow!("Could not set the vCPU state {:?}", e)) 366 })?; 367 } 368 369 Ok(()) 370 } 371 } 372 impl Snapshottable for Vcpu { 373 fn id(&self) -> String { 374 VCPU_SNAPSHOT_ID.to_string() 375 } 376 377 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 378 let mut vcpu_snapshot = Snapshot::new(&format!("{}", self.id)); 379 vcpu_snapshot.add_data_section(SnapshotDataSection::new_from_state( 380 VCPU_SNAPSHOT_ID, 381 &self.saved_state, 382 )?); 383 384 Ok(vcpu_snapshot) 385 } 386 387 fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> { 388 self.saved_state = Some(snapshot.to_state(VCPU_SNAPSHOT_ID)?); 389 Ok(()) 390 } 391 } 392 393 pub struct CpuManager { 394 config: CpusConfig, 395 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 396 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 397 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 398 vm_memory: GuestMemoryAtomic<GuestMemoryMmap>, 399 #[cfg(target_arch = "x86_64")] 400 cpuid: CpuId, 401 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 402 vm: Arc<dyn hypervisor::Vm>, 403 vcpus_kill_signalled: Arc<AtomicBool>, 404 vcpus_pause_signalled: Arc<AtomicBool>, 405 exit_evt: EventFd, 406 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 407 reset_evt: EventFd, 408 #[cfg(feature = "gdb")] 409 vm_debug_evt: EventFd, 410 vcpu_states: Vec<VcpuState>, 411 selected_cpu: u8, 412 vcpus: Vec<Arc<Mutex<Vcpu>>>, 413 seccomp_action: SeccompAction, 414 vmmops: Arc<dyn VmmOps>, 415 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 416 acpi_address: Option<GuestAddress>, 417 proximity_domain_per_cpu: BTreeMap<u8, u32>, 418 affinity: BTreeMap<u8, Vec<u8>>, 419 dynamic: bool, 420 } 421 422 const CPU_ENABLE_FLAG: usize = 0; 423 const CPU_INSERTING_FLAG: usize = 1; 424 const CPU_REMOVING_FLAG: usize = 2; 425 const CPU_EJECT_FLAG: usize = 3; 426 427 const CPU_STATUS_OFFSET: u64 = 4; 428 const CPU_SELECTION_OFFSET: u64 = 0; 429 430 impl BusDevice for CpuManager { 431 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 432 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
433 data.fill(0); 434 435 match offset { 436 CPU_SELECTION_OFFSET => { 437 data[0] = self.selected_cpu; 438 } 439 CPU_STATUS_OFFSET => { 440 if self.selected_cpu < self.max_vcpus() { 441 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 442 if state.active() { 443 data[0] |= 1 << CPU_ENABLE_FLAG; 444 } 445 if state.inserting { 446 data[0] |= 1 << CPU_INSERTING_FLAG; 447 } 448 if state.removing { 449 data[0] |= 1 << CPU_REMOVING_FLAG; 450 } 451 } else { 452 warn!("Out of range vCPU id: {}", self.selected_cpu); 453 } 454 } 455 _ => { 456 warn!( 457 "Unexpected offset for accessing CPU manager device: {:#}", 458 offset 459 ); 460 } 461 } 462 } 463 464 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 465 match offset { 466 CPU_SELECTION_OFFSET => { 467 self.selected_cpu = data[0]; 468 } 469 CPU_STATUS_OFFSET => { 470 if self.selected_cpu < self.max_vcpus() { 471 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 472 // The ACPI code writes back a 1 to acknowledge the insertion 473 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 474 && state.inserting 475 { 476 state.inserting = false; 477 } 478 // Ditto for removal 479 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) 480 && state.removing 481 { 482 state.removing = false; 483 } 484 // Trigger removal of vCPU 485 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 486 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 487 error!("Error removing vCPU: {:?}", e); 488 } 489 } 490 } else { 491 warn!("Out of range vCPU id: {}", self.selected_cpu); 492 } 493 } 494 _ => { 495 warn!( 496 "Unexpected offset for accessing CPU manager device: {:#}", 497 offset 498 ); 499 } 500 } 501 None 502 } 503 } 504 505 #[derive(Default)] 506 struct VcpuState { 507 inserting: bool, 508 removing: bool, 509 handle: Option<thread::JoinHandle<()>>, 510 kill: Arc<AtomicBool>, 511 vcpu_run_interrupted: Arc<AtomicBool>, 512 } 513 514 impl VcpuState { 515 fn active(&self) -> bool { 516 self.handle.is_some() 517 } 518 519 fn signal_thread(&self) { 520 if let Some(handle) = self.handle.as_ref() { 521 loop { 522 unsafe { 523 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 524 } 525 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 526 break; 527 } else { 528 // This is more effective than thread::yield_now() at 529 // avoiding a priority inversion with the vCPU thread 530 thread::sleep(std::time::Duration::from_millis(1)); 531 } 532 } 533 } 534 } 535 536 fn join_thread(&mut self) -> Result<()> { 537 if let Some(handle) = self.handle.take() { 538 handle.join().map_err(Error::ThreadCleanup)? 
539 } 540 541 Ok(()) 542 } 543 544 fn unpark_thread(&self) { 545 if let Some(handle) = self.handle.as_ref() { 546 handle.thread().unpark() 547 } 548 } 549 } 550 551 impl CpuManager { 552 #[allow(unused_variables)] 553 #[allow(clippy::too_many_arguments)] 554 pub fn new( 555 config: &CpusConfig, 556 device_manager: &Arc<Mutex<DeviceManager>>, 557 memory_manager: &Arc<Mutex<MemoryManager>>, 558 vm: Arc<dyn hypervisor::Vm>, 559 exit_evt: EventFd, 560 reset_evt: EventFd, 561 #[cfg(feature = "gdb")] vm_debug_evt: EventFd, 562 hypervisor: Arc<dyn hypervisor::Hypervisor>, 563 seccomp_action: SeccompAction, 564 vmmops: Arc<dyn VmmOps>, 565 #[cfg(feature = "tdx")] tdx_enabled: bool, 566 numa_nodes: &NumaNodes, 567 ) -> Result<Arc<Mutex<CpuManager>>> { 568 let guest_memory = memory_manager.lock().unwrap().guest_memory(); 569 let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); 570 vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); 571 572 #[cfg(target_arch = "x86_64")] 573 let sgx_epc_sections = memory_manager 574 .lock() 575 .unwrap() 576 .sgx_epc_region() 577 .as_ref() 578 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 579 #[cfg(target_arch = "x86_64")] 580 let cpuid = { 581 let phys_bits = physical_bits(config.max_phys_bits); 582 arch::generate_common_cpuid( 583 hypervisor, 584 config 585 .topology 586 .clone() 587 .map(|t| (t.threads_per_core, t.cores_per_die, t.dies_per_package)), 588 sgx_epc_sections, 589 phys_bits, 590 config.kvm_hyperv, 591 #[cfg(feature = "tdx")] 592 tdx_enabled, 593 ) 594 .map_err(Error::CommonCpuId)? 595 }; 596 #[cfg(all(feature = "amx", target_arch = "x86_64"))] 597 if config.features.amx { 598 const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; 599 const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; 600 const XFEATURE_XTILEDATA: usize = 18; 601 const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; 602 603 // This is safe as the syscall is only modifying kernel internal 604 // data structures that the kernel is itself expected to safeguard. 605 let amx_tile = unsafe { 606 libc::syscall( 607 libc::SYS_arch_prctl, 608 ARCH_REQ_XCOMP_GUEST_PERM, 609 XFEATURE_XTILEDATA, 610 ) 611 }; 612 613 if amx_tile != 0 { 614 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 615 } else { 616 // This is safe as the mask being modified (not marked mutable as it is 617 // only modified inside the unsafe block, which is permitted) isn't in use elsewhere.
618 let mask: usize = 0; 619 let result = unsafe { 620 libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) 621 }; 622 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { 623 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); 624 } 625 } 626 } 627 628 let device_manager = device_manager.lock().unwrap(); 629 630 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 631 let mut cpu_list = Vec::new(); 632 for (proximity_domain, numa_node) in numa_nodes.iter() { 633 for cpu in numa_node.cpus.iter() { 634 cpu_list.push((*cpu, *proximity_domain)) 635 } 636 } 637 cpu_list 638 } 639 .into_iter() 640 .collect(); 641 642 let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { 643 cpu_affinity 644 .iter() 645 .map(|a| (a.vcpu, a.host_cpus.clone())) 646 .collect() 647 } else { 648 BTreeMap::new() 649 }; 650 651 #[cfg(feature = "tdx")] 652 let dynamic = !tdx_enabled; 653 #[cfg(not(feature = "tdx"))] 654 let dynamic = true; 655 656 let acpi_address = if dynamic { 657 Some( 658 device_manager 659 .allocator() 660 .lock() 661 .unwrap() 662 .allocate_platform_mmio_addresses(None, CPU_MANAGER_ACPI_SIZE as u64, None) 663 .ok_or(Error::AllocateMmmioAddress)?, 664 ) 665 } else { 666 None 667 }; 668 669 let cpu_manager = Arc::new(Mutex::new(CpuManager { 670 config: config.clone(), 671 interrupt_controller: device_manager.interrupt_controller().clone(), 672 vm_memory: guest_memory, 673 #[cfg(target_arch = "x86_64")] 674 cpuid, 675 vm, 676 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 677 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 678 vcpu_states, 679 exit_evt, 680 reset_evt, 681 #[cfg(feature = "gdb")] 682 vm_debug_evt, 683 selected_cpu: 0, 684 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 685 seccomp_action, 686 vmmops, 687 acpi_address, 688 proximity_domain_per_cpu, 689 affinity, 690 dynamic, 691 })); 692 693 if let Some(acpi_address) = acpi_address { 694 device_manager 695 .mmio_bus() 696 .insert( 697 cpu_manager.clone(), 698 acpi_address.0, 699 CPU_MANAGER_ACPI_SIZE as u64, 700 ) 701 .map_err(Error::BusError)?; 702 } 703 704 Ok(cpu_manager) 705 } 706 707 fn create_vcpu( 708 &mut self, 709 cpu_id: u8, 710 entry_point: Option<EntryPoint>, 711 snapshot: Option<Snapshot>, 712 ) -> Result<()> { 713 info!("Creating vCPU: cpu_id = {}", cpu_id); 714 715 let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vmmops.clone()))?; 716 717 if let Some(snapshot) = snapshot { 718 // AArch64 vCPUs should be initialized after created. 719 #[cfg(target_arch = "aarch64")] 720 vcpu.init(&self.vm)?; 721 722 vcpu.restore(snapshot).expect("Failed to restore vCPU"); 723 } else { 724 #[cfg(target_arch = "x86_64")] 725 vcpu.configure( 726 entry_point, 727 &self.vm_memory, 728 self.cpuid.clone(), 729 self.config.kvm_hyperv, 730 ) 731 .expect("Failed to configure vCPU"); 732 733 #[cfg(target_arch = "aarch64")] 734 vcpu.configure(&self.vm, entry_point) 735 .expect("Failed to configure vCPU"); 736 } 737 738 // Adding vCPU to the CpuManager's vCPU list. 
739 let vcpu = Arc::new(Mutex::new(vcpu)); 740 self.vcpus.push(vcpu); 741 742 Ok(()) 743 } 744 745 /// Only create new vCPUs if there aren't any inactive ones to reuse 746 fn create_vcpus(&mut self, desired_vcpus: u8, entry_point: Option<EntryPoint>) -> Result<()> { 747 info!( 748 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 749 desired_vcpus, 750 self.config.max_vcpus, 751 self.vcpus.len(), 752 self.present_vcpus() 753 ); 754 755 if desired_vcpus > self.config.max_vcpus { 756 return Err(Error::DesiredVCpuCountExceedsMax); 757 } 758 759 // Only create vCPUs in excess of all the allocated vCPUs. 760 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 761 self.create_vcpu(cpu_id, entry_point, None)?; 762 } 763 764 Ok(()) 765 } 766 767 #[cfg(target_arch = "aarch64")] 768 pub fn init_pmu(&self, irq: u32) -> Result<bool> { 769 let cpu_attr = kvm_bindings::kvm_device_attr { 770 group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL, 771 attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT), 772 addr: 0x0, 773 flags: 0, 774 }; 775 776 for cpu in self.vcpus.iter() { 777 let tmp = irq; 778 let cpu_attr_irq = kvm_bindings::kvm_device_attr { 779 group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL, 780 attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ), 781 addr: &tmp as *const u32 as u64, 782 flags: 0, 783 }; 784 785 // Check if PMU attr is available, if not, log the information. 786 if cpu.lock().unwrap().vcpu.has_vcpu_attr(&cpu_attr).is_ok() { 787 // Set irq for PMU 788 cpu.lock() 789 .unwrap() 790 .vcpu 791 .set_vcpu_attr(&cpu_attr_irq) 792 .map_err(Error::InitPmu)?; 793 794 // Init PMU 795 cpu.lock() 796 .unwrap() 797 .vcpu 798 .set_vcpu_attr(&cpu_attr) 799 .map_err(Error::InitPmu)?; 800 } else { 801 debug!( 802 "PMU attribute is not supported in vCPU{}, skip PMU init!", 803 cpu.lock().unwrap().id 804 ); 805 return Ok(false); 806 } 807 } 808 809 Ok(true) 810 } 811 812 fn start_vcpu( 813 &mut self, 814 vcpu: Arc<Mutex<Vcpu>>, 815 vcpu_id: u8, 816 vcpu_thread_barrier: Arc<Barrier>, 817 inserting: bool, 818 ) -> Result<()> { 819 let reset_evt = self.reset_evt.try_clone().unwrap(); 820 let exit_evt = self.exit_evt.try_clone().unwrap(); 821 #[cfg(feature = "gdb")] 822 let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); 823 let panic_exit_evt = self.exit_evt.try_clone().unwrap(); 824 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 825 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 826 827 let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); 828 let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] 829 .vcpu_run_interrupted 830 .clone(); 831 let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); 832 833 // Prepare the CPU set the current vCPU is expected to run onto. 
834 let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { 835 let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; 836 unsafe { libc::CPU_ZERO(&mut cpuset) }; 837 for host_cpu in host_cpus { 838 unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) }; 839 } 840 cpuset 841 }); 842 843 // Retrieve seccomp filter for vcpu thread 844 let vcpu_seccomp_filter = get_seccomp_filter(&self.seccomp_action, Thread::Vcpu) 845 .map_err(Error::CreateSeccompFilter)?; 846 847 #[cfg(target_arch = "x86_64")] 848 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 849 850 info!("Starting vCPU: cpu_id = {}", vcpu_id); 851 852 let handle = Some( 853 thread::Builder::new() 854 .name(format!("vcpu{}", vcpu_id)) 855 .spawn(move || { 856 // Schedule the thread to run on the expected CPU set 857 if let Some(cpuset) = cpuset.as_ref() { 858 let ret = unsafe { 859 libc::sched_setaffinity( 860 0, 861 std::mem::size_of::<libc::cpu_set_t>(), 862 cpuset as *const libc::cpu_set_t, 863 ) 864 }; 865 866 if ret != 0 { 867 error!( 868 "Failed scheduling the vCPU {} on the expected CPU set: {}", 869 vcpu_id, 870 io::Error::last_os_error() 871 ); 872 return; 873 } 874 } 875 876 // Apply seccomp filter for vcpu thread. 877 if !vcpu_seccomp_filter.is_empty() { 878 if let Err(e) = 879 apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 880 { 881 error!("Error applying seccomp filter: {:?}", e); 882 return; 883 } 884 } 885 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 886 // This uses an async signal safe handler to kill the vcpu handles. 887 register_signal_handler(SIGRTMIN(), handle_signal) 888 .expect("Failed to register vcpu signal handler"); 889 // Block until all CPUs are ready. 890 vcpu_thread_barrier.wait(); 891 892 std::panic::catch_unwind(move || { 893 loop { 894 // If we are being told to pause, we park the thread 895 // until the pause boolean is toggled. 896 // The resume operation is responsible for toggling 897 // the boolean and unpark the thread. 898 // We enter a loop because park() could spuriously 899 // return. We will then park() again unless the 900 // pause boolean has been toggled. 901 902 // Need to use Ordering::SeqCst as we have multiple 903 // loads and stores to different atomics and we need 904 // to see them in a consistent order in all threads 905 906 if vcpu_pause_signalled.load(Ordering::SeqCst) { 907 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are 908 // completed by returning to KVM_RUN. From the kernel docs: 909 // 910 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, 911 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding 912 // operations are complete (and guest state is consistent) only after userspace 913 // has re-entered the kernel with KVM_RUN. The kernel side will first finish 914 // incomplete operations and then check for pending signals. 915 // The pending state of the operation is not preserved in state which is 916 // visible to userspace, thus userspace should ensure that the operation is 917 // completed before performing a live migration. Userspace can re-enter the 918 // guest with an unmasked signal pending or with the immediate_exit field set 919 // to complete pending operations without allowing any further instructions 920 // to be executed. 
921 922 #[cfg(feature = "kvm")] 923 { 924 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); 925 if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { 926 error!("Unexpected VM exit on \"immediate_exit\" run"); 927 break; 928 } 929 vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); 930 } 931 932 vcpu_run_interrupted.store(true, Ordering::SeqCst); 933 while vcpu_pause_signalled.load(Ordering::SeqCst) { 934 thread::park(); 935 } 936 vcpu_run_interrupted.store(false, Ordering::SeqCst); 937 } 938 939 // We've been told to terminate 940 if vcpu_kill_signalled.load(Ordering::SeqCst) 941 || vcpu_kill.load(Ordering::SeqCst) 942 { 943 vcpu_run_interrupted.store(true, Ordering::SeqCst); 944 break; 945 } 946 947 #[cfg(feature = "tdx")] 948 let mut vcpu = vcpu.lock().unwrap(); 949 #[cfg(not(feature = "tdx"))] 950 let vcpu = vcpu.lock().unwrap(); 951 // vcpu.run() returns false on a triple-fault so trigger a reset 952 match vcpu.run() { 953 Ok(run) => match run { 954 #[cfg(all(target_arch = "x86_64", feature = "kvm"))] 955 VmExit::Debug => { 956 info!("VmExit::Debug"); 957 #[cfg(feature = "gdb")] 958 { 959 vcpu_pause_signalled.store(true, Ordering::SeqCst); 960 let raw_tid = get_raw_tid(vcpu_id as usize); 961 vm_debug_evt.write(raw_tid as u64).unwrap(); 962 } 963 } 964 #[cfg(target_arch = "x86_64")] 965 VmExit::IoapicEoi(vector) => { 966 if let Some(interrupt_controller) = 967 &interrupt_controller_clone 968 { 969 interrupt_controller 970 .lock() 971 .unwrap() 972 .end_of_interrupt(vector); 973 } 974 } 975 VmExit::Ignore => {} 976 VmExit::Hyperv => {} 977 VmExit::Reset => { 978 info!("VmExit::Reset"); 979 vcpu_run_interrupted.store(true, Ordering::SeqCst); 980 reset_evt.write(1).unwrap(); 981 break; 982 } 983 VmExit::Shutdown => { 984 info!("VmExit::Shutdown"); 985 vcpu_run_interrupted.store(true, Ordering::SeqCst); 986 exit_evt.write(1).unwrap(); 987 break; 988 } 989 #[cfg(feature = "tdx")] 990 VmExit::Tdx => { 991 if let Some(vcpu_fd) = Arc::get_mut(&mut vcpu.vcpu) { 992 match vcpu_fd.get_tdx_exit_details() { 993 Ok(details) => match details { 994 TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), 995 TdxExitDetails::SetupEventNotifyInterrupt => { 996 warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") 997 } 998 }, 999 Err(e) => error!("Unexpected TDX VMCALL: {}", e), 1000 } 1001 vcpu_fd.set_tdx_status(TdxExitStatus::InvalidOperand); 1002 } else { 1003 // We should never reach this code as 1004 // this means the design from the code 1005 // is wrong. 1006 unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances"); 1007 } 1008 } 1009 _ => { 1010 error!( 1011 "VCPU generated error: {:?}", 1012 Error::UnexpectedVmExit 1013 ); 1014 break; 1015 } 1016 }, 1017 1018 Err(e) => { 1019 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 1020 break; 1021 } 1022 } 1023 1024 // We've been told to terminate 1025 if vcpu_kill_signalled.load(Ordering::SeqCst) 1026 || vcpu_kill.load(Ordering::SeqCst) 1027 { 1028 vcpu_run_interrupted.store(true, Ordering::SeqCst); 1029 break; 1030 } 1031 } 1032 }) 1033 .or_else(|_| { 1034 panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); 1035 error!("vCPU thread panicked"); 1036 panic_exit_evt.write(1) 1037 }) 1038 .ok(); 1039 }) 1040 .map_err(Error::VcpuSpawn)?, 1041 ); 1042 1043 // On hot plug calls into this function entry_point is None. It is for 1044 // those hotplug CPU additions that we need to set the inserting flag. 
1045 self.vcpu_states[usize::from(vcpu_id)].handle = handle; 1046 self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; 1047 1048 Ok(()) 1049 } 1050 1051 /// Start up as many vCPU threads as needed to reach `desired_vcpus` 1052 fn activate_vcpus(&mut self, desired_vcpus: u8, inserting: bool) -> Result<()> { 1053 if desired_vcpus > self.config.max_vcpus { 1054 return Err(Error::DesiredVCpuCountExceedsMax); 1055 } 1056 1057 let vcpu_thread_barrier = Arc::new(Barrier::new( 1058 (desired_vcpus - self.present_vcpus() + 1) as usize, 1059 )); 1060 1061 info!( 1062 "Starting vCPUs: desired = {}, allocated = {}, present = {}", 1063 desired_vcpus, 1064 self.vcpus.len(), 1065 self.present_vcpus() 1066 ); 1067 1068 // This reuses any inactive vCPUs as well as any that were newly created 1069 for vcpu_id in self.present_vcpus()..desired_vcpus { 1070 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1071 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; 1072 } 1073 1074 // Unblock all CPU threads. 1075 vcpu_thread_barrier.wait(); 1076 Ok(()) 1077 } 1078 1079 fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) { 1080 // Mark vCPUs for removal; actual removal happens on ejection 1081 for cpu_id in desired_vcpus..self.present_vcpus() { 1082 self.vcpu_states[usize::from(cpu_id)].removing = true; 1083 } 1084 } 1085 1086 fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> { 1087 info!("Removing vCPU: cpu_id = {}", cpu_id); 1088 let mut state = &mut self.vcpu_states[usize::from(cpu_id)]; 1089 state.kill.store(true, Ordering::SeqCst); 1090 state.signal_thread(); 1091 state.join_thread()?; 1092 state.handle = None; 1093 1094 // Once the thread has exited, clear the "kill" so that it can be reused 1095 state.kill.store(false, Ordering::SeqCst); 1096 1097 Ok(()) 1098 } 1099 1100 pub fn create_boot_vcpus(&mut self, entry_point: Option<EntryPoint>) -> Result<()> { 1101 self.create_vcpus(self.boot_vcpus(), entry_point) 1102 } 1103 1104 // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running. 1105 pub fn start_boot_vcpus(&mut self) -> Result<()> { 1106 self.activate_vcpus(self.boot_vcpus(), false) 1107 } 1108 1109 pub fn start_restored_vcpus(&mut self) -> Result<()> { 1110 let vcpu_numbers = self.vcpus.len() as u8; 1111 let vcpu_thread_barrier = Arc::new(Barrier::new((vcpu_numbers + 1) as usize)); 1112 // Restore the vCPUs in "paused" state. 1113 self.vcpus_pause_signalled.store(true, Ordering::SeqCst); 1114 1115 for vcpu_id in 0..vcpu_numbers { 1116 let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); 1117 1118 self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), false) 1119 .map_err(|e| { 1120 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) 1121 })?; 1122 } 1123 // Unblock all restored CPU threads.
1124 vcpu_thread_barrier.wait(); 1125 Ok(()) 1126 } 1127 1128 pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> { 1129 if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal { 1130 return Ok(false); 1131 } 1132 1133 if !self.dynamic { 1134 return Ok(false); 1135 } 1136 1137 match desired_vcpus.cmp(&self.present_vcpus()) { 1138 cmp::Ordering::Greater => { 1139 self.create_vcpus(desired_vcpus, None)?; 1140 self.activate_vcpus(desired_vcpus, true)?; 1141 Ok(true) 1142 } 1143 cmp::Ordering::Less => { 1144 self.mark_vcpus_for_removal(desired_vcpus); 1145 Ok(true) 1146 } 1147 _ => Ok(false), 1148 } 1149 } 1150 1151 pub fn shutdown(&mut self) -> Result<()> { 1152 // Tell the vCPUs to stop themselves next time they go through the loop 1153 self.vcpus_kill_signalled.store(true, Ordering::SeqCst); 1154 1155 // Toggle the vCPUs pause boolean 1156 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 1157 1158 // Unpark all the VCPU threads. 1159 for state in self.vcpu_states.iter() { 1160 state.unpark_thread(); 1161 } 1162 1163 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 1164 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 1165 // above. 1166 for state in self.vcpu_states.iter() { 1167 state.signal_thread(); 1168 } 1169 1170 // Wait for all the threads to finish. This removes the state from the vector. 1171 for mut state in self.vcpu_states.drain(..) { 1172 state.join_thread()?; 1173 } 1174 1175 Ok(()) 1176 } 1177 1178 #[cfg(feature = "tdx")] 1179 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 1180 for vcpu in &self.vcpus { 1181 vcpu.lock() 1182 .unwrap() 1183 .vcpu 1184 .tdx_init(hob_address) 1185 .map_err(Error::InitializeTdx)?; 1186 } 1187 Ok(()) 1188 } 1189 1190 pub fn boot_vcpus(&self) -> u8 { 1191 self.config.boot_vcpus 1192 } 1193 1194 pub fn max_vcpus(&self) -> u8 { 1195 self.config.max_vcpus 1196 } 1197 1198 #[cfg(target_arch = "x86_64")] 1199 pub fn common_cpuid(&self) -> CpuId { 1200 self.cpuid.clone() 1201 } 1202 1203 fn present_vcpus(&self) -> u8 { 1204 self.vcpu_states 1205 .iter() 1206 .fold(0, |acc, state| acc + state.active() as u8) 1207 } 1208 1209 #[cfg(target_arch = "aarch64")] 1210 pub fn get_mpidrs(&self) -> Vec<u64> { 1211 self.vcpus 1212 .iter() 1213 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 1214 .collect() 1215 } 1216 1217 #[cfg(target_arch = "aarch64")] 1218 pub fn get_saved_states(&self) -> Vec<CpuState> { 1219 self.vcpus 1220 .iter() 1221 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 1222 .collect() 1223 } 1224 1225 #[cfg(target_arch = "aarch64")] 1226 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 1227 self.config 1228 .topology 1229 .clone() 1230 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1231 } 1232 1233 pub fn create_madt(&self) -> Sdt { 1234 use crate::acpi; 1235 // This is also checked in the commandline parsing. 
1236 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1237 1238 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1239 #[cfg(target_arch = "x86_64")] 1240 { 1241 madt.write(36, arch::layout::APIC_START); 1242 1243 for cpu in 0..self.config.max_vcpus { 1244 let lapic = LocalApic { 1245 r#type: acpi::ACPI_APIC_PROCESSOR, 1246 length: 8, 1247 processor_id: cpu, 1248 apic_id: cpu, 1249 flags: if cpu < self.config.boot_vcpus { 1250 1 << MADT_CPU_ENABLE_FLAG 1251 } else { 1252 0 1253 }, 1254 }; 1255 madt.append(lapic); 1256 } 1257 1258 madt.append(Ioapic { 1259 r#type: acpi::ACPI_APIC_IO, 1260 length: 12, 1261 ioapic_id: 0, 1262 apic_address: arch::layout::IOAPIC_START.0 as u32, 1263 gsi_base: 0, 1264 ..Default::default() 1265 }); 1266 1267 madt.append(InterruptSourceOverride { 1268 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1269 length: 10, 1270 bus: 0, 1271 source: 4, 1272 gsi: 4, 1273 flags: 0, 1274 }); 1275 } 1276 1277 #[cfg(target_arch = "aarch64")] 1278 { 1279 use vm_memory::Address; 1280 /* Notes: 1281 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1282 */ 1283 1284 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 1285 for cpu in 0..self.config.boot_vcpus { 1286 let vcpu = &self.vcpus[cpu as usize]; 1287 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1288 /* ARMv8 MPIDR format: 1289 Bits [63:40] Must be zero 1290 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1291 Bits [31:24] Must be zero 1292 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1293 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1294 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1295 */ 1296 let mpidr_mask = 0xff_00ff_ffff; 1297 let gicc = GicC { 1298 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1299 length: 80, 1300 reserved0: 0, 1301 cpu_interface_number: cpu as u32, 1302 uid: cpu as u32, 1303 flags: 1, 1304 parking_version: 0, 1305 performance_interrupt: 0, 1306 parked_address: 0, 1307 base_address: 0, 1308 gicv_base_address: 0, 1309 gich_base_address: 0, 1310 vgic_interrupt: 0, 1311 gicr_base_address: 0, 1312 mpidr: mpidr & mpidr_mask, 1313 proc_power_effi_class: 0, 1314 reserved1: 0, 1315 spe_overflow_interrupt: 0, 1316 }; 1317 1318 madt.append(gicc); 1319 } 1320 1321 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1322 let gicd = GicD { 1323 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1324 length: 24, 1325 reserved0: 0, 1326 gic_id: 0, 1327 base_address: arch::layout::MAPPED_IO_START.raw_value() - 0x0001_0000, 1328 global_irq_base: 0, 1329 version: 3, 1330 reserved1: [0; 3], 1331 }; 1332 madt.append(gicd); 1333 1334 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1335 let gicr_size: u32 = 0x0001_0000 * 2 * (self.config.boot_vcpus as u32); 1336 let gicr_base: u64 = 1337 arch::layout::MAPPED_IO_START.raw_value() - 0x0001_0000 - gicr_size as u64; 1338 let gicr = GicR { 1339 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1340 length: 16, 1341 reserved: 0, 1342 base_address: gicr_base, 1343 range_length: gicr_size, 1344 }; 1345 madt.append(gicr); 1346 1347 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 
1348 let gicits = GicIts { 1349 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1350 length: 20, 1351 reserved0: 0, 1352 translation_id: 0, 1353 base_address: gicr_base - 2 * 0x0001_0000, 1354 reserved1: 0, 1355 }; 1356 madt.append(gicits); 1357 1358 madt.update_checksum(); 1359 } 1360 1361 madt 1362 } 1363 1364 #[cfg(target_arch = "aarch64")] 1365 pub fn create_pptt(&self) -> Sdt { 1366 let pptt_start = 0; 1367 let mut cpus = 0; 1368 let mut uid = 0; 1369 // If topology is not specified, the default setting is: 1370 // 1 package, multiple cores, 1 thread per core 1371 // This is also the behavior when PPTT is missing. 1372 let (threads_per_core, cores_per_package, packages) = 1373 self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); 1374 1375 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1376 1377 for cluster_idx in 0..packages { 1378 if cpus < self.config.boot_vcpus as usize { 1379 let cluster_offset = pptt.len() - pptt_start; 1380 let cluster_hierarchy_node = ProcessorHierarchyNode { 1381 r#type: 0, 1382 length: 20, 1383 reserved: 0, 1384 flags: 0x2, 1385 parent: 0, 1386 acpi_processor_id: cluster_idx as u32, 1387 num_private_resources: 0, 1388 }; 1389 pptt.append(cluster_hierarchy_node); 1390 1391 for core_idx in 0..cores_per_package { 1392 let core_offset = pptt.len() - pptt_start; 1393 1394 if threads_per_core > 1 { 1395 let core_hierarchy_node = ProcessorHierarchyNode { 1396 r#type: 0, 1397 length: 20, 1398 reserved: 0, 1399 flags: 0x2, 1400 parent: cluster_offset as u32, 1401 acpi_processor_id: core_idx as u32, 1402 num_private_resources: 0, 1403 }; 1404 pptt.append(core_hierarchy_node); 1405 1406 for _thread_idx in 0..threads_per_core { 1407 let thread_hierarchy_node = ProcessorHierarchyNode { 1408 r#type: 0, 1409 length: 20, 1410 reserved: 0, 1411 flags: 0xE, 1412 parent: core_offset as u32, 1413 acpi_processor_id: uid as u32, 1414 num_private_resources: 0, 1415 }; 1416 pptt.append(thread_hierarchy_node); 1417 uid += 1; 1418 } 1419 } else { 1420 let thread_hierarchy_node = ProcessorHierarchyNode { 1421 r#type: 0, 1422 length: 20, 1423 reserved: 0, 1424 flags: 0xA, 1425 parent: cluster_offset as u32, 1426 acpi_processor_id: uid as u32, 1427 num_private_resources: 0, 1428 }; 1429 pptt.append(thread_hierarchy_node); 1430 uid += 1; 1431 } 1432 } 1433 cpus += (cores_per_package * threads_per_core) as usize; 1434 } 1435 } 1436 1437 pptt.update_checksum(); 1438 pptt 1439 } 1440 1441 #[cfg(all(target_arch = "x86_64", feature = "gdb"))] 1442 fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> { 1443 self.vcpus[usize::from(cpu_id)] 1444 .lock() 1445 .unwrap() 1446 .vcpu 1447 .get_regs() 1448 .map_err(Error::CpuDebug) 1449 } 1450 1451 #[cfg(all(target_arch = "x86_64", feature = "gdb"))] 1452 fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { 1453 self.vcpus[usize::from(cpu_id)] 1454 .lock() 1455 .unwrap() 1456 .vcpu 1457 .set_regs(regs) 1458 .map_err(Error::CpuDebug) 1459 } 1460 1461 #[cfg(all(target_arch = "x86_64", feature = "gdb"))] 1462 fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> { 1463 self.vcpus[usize::from(cpu_id)] 1464 .lock() 1465 .unwrap() 1466 .vcpu 1467 .get_sregs() 1468 .map_err(Error::CpuDebug) 1469 } 1470 1471 #[cfg(all(target_arch = "x86_64", feature = "gdb"))] 1472 fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { 1473 self.vcpus[usize::from(cpu_id)] 1474 .lock() 1475 .unwrap() 1476 .vcpu 1477 .set_sregs(sregs) 1478 .map_err(Error::CpuDebug) 1479 } 1480 1481 
#[cfg(all(target_arch = "x86_64", feature = "gdb"))] 1482 fn translate_gva(&self, cpu_id: u8, gva: u64) -> Result<u64> { 1483 let (gpa, _) = self.vcpus[usize::from(cpu_id)] 1484 .lock() 1485 .unwrap() 1486 .vcpu 1487 .translate_gva(gva, /* flags: unused */ 0) 1488 .map_err(Error::TranslateVirtualAddress)?; 1489 Ok(gpa) 1490 } 1491 1492 pub fn vcpus_paused(&self) -> bool { 1493 self.vcpus_pause_signalled.load(Ordering::SeqCst) 1494 } 1495 } 1496 1497 struct Cpu { 1498 cpu_id: u8, 1499 proximity_domain: u32, 1500 dynamic: bool, 1501 } 1502 1503 #[cfg(target_arch = "x86_64")] 1504 const MADT_CPU_ENABLE_FLAG: usize = 0; 1505 1506 impl Cpu { 1507 #[cfg(target_arch = "x86_64")] 1508 fn generate_mat(&self) -> Vec<u8> { 1509 let lapic = LocalApic { 1510 r#type: 0, 1511 length: 8, 1512 processor_id: self.cpu_id, 1513 apic_id: self.cpu_id, 1514 flags: 1 << MADT_CPU_ENABLE_FLAG, 1515 }; 1516 1517 let mut mat_data: Vec<u8> = Vec::new(); 1518 mat_data.resize(std::mem::size_of_val(&lapic), 0); 1519 unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic }; 1520 1521 mat_data 1522 } 1523 } 1524 1525 impl Aml for Cpu { 1526 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1527 #[cfg(target_arch = "x86_64")] 1528 let mat_data: Vec<u8> = self.generate_mat(); 1529 #[allow(clippy::if_same_then_else)] 1530 if self.dynamic { 1531 aml::Device::new( 1532 format!("C{:03}", self.cpu_id).as_str().into(), 1533 vec![ 1534 &aml::Name::new("_HID".into(), &"ACPI0007"), 1535 &aml::Name::new("_UID".into(), &self.cpu_id), 1536 // Currently, AArch64 cannot support following fields. 1537 /* 1538 _STA return value: 1539 Bit [0] – Set if the device is present. 1540 Bit [1] – Set if the device is enabled and decoding its resources. 1541 Bit [2] – Set if the device should be shown in the UI. 1542 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1543 Bit [4] – Set if the battery is present. 1544 Bits [31:5] – Reserved (must be cleared). 
1545 */ 1546 #[cfg(target_arch = "x86_64")] 1547 &aml::Method::new( 1548 "_STA".into(), 1549 0, 1550 false, 1551 // Call into CSTA method which will interrogate device 1552 vec![&aml::Return::new(&aml::MethodCall::new( 1553 "CSTA".into(), 1554 vec![&self.cpu_id], 1555 ))], 1556 ), 1557 &aml::Method::new( 1558 "_PXM".into(), 1559 0, 1560 false, 1561 vec![&aml::Return::new(&self.proximity_domain)], 1562 ), 1563 // The Linux kernel expects every CPU device to have a _MAT entry 1564 // containing the LAPIC for this processor with the enabled bit set 1565 // even it if is disabled in the MADT (non-boot CPU) 1566 #[cfg(target_arch = "x86_64")] 1567 &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)), 1568 // Trigger CPU ejection 1569 #[cfg(target_arch = "x86_64")] 1570 &aml::Method::new( 1571 "_EJ0".into(), 1572 1, 1573 false, 1574 // Call into CEJ0 method which will actually eject device 1575 vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])], 1576 ), 1577 ], 1578 ) 1579 .append_aml_bytes(bytes); 1580 } else { 1581 aml::Device::new( 1582 format!("C{:03}", self.cpu_id).as_str().into(), 1583 vec![ 1584 &aml::Name::new("_HID".into(), &"ACPI0007"), 1585 &aml::Name::new("_UID".into(), &self.cpu_id), 1586 #[cfg(target_arch = "x86_64")] 1587 &aml::Method::new( 1588 "_STA".into(), 1589 0, 1590 false, 1591 // Mark CPU present see CSTA implementation 1592 vec![&aml::Return::new(&0xfu8)], 1593 ), 1594 &aml::Method::new( 1595 "_PXM".into(), 1596 0, 1597 false, 1598 vec![&aml::Return::new(&self.proximity_domain)], 1599 ), 1600 // The Linux kernel expects every CPU device to have a _MAT entry 1601 // containing the LAPIC for this processor with the enabled bit set 1602 // even it if is disabled in the MADT (non-boot CPU) 1603 #[cfg(target_arch = "x86_64")] 1604 &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)), 1605 ], 1606 ) 1607 .append_aml_bytes(bytes); 1608 } 1609 } 1610 } 1611 1612 struct CpuNotify { 1613 cpu_id: u8, 1614 } 1615 1616 impl Aml for CpuNotify { 1617 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1618 let object = aml::Path::new(&format!("C{:03}", self.cpu_id)); 1619 aml::If::new( 1620 &aml::Equal::new(&aml::Arg(0), &self.cpu_id), 1621 vec![&aml::Notify::new(&object, &aml::Arg(1))], 1622 ) 1623 .append_aml_bytes(bytes) 1624 } 1625 } 1626 1627 struct CpuMethods { 1628 max_vcpus: u8, 1629 dynamic: bool, 1630 } 1631 1632 impl Aml for CpuMethods { 1633 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1634 if self.dynamic { 1635 // CPU status method 1636 aml::Method::new( 1637 "CSTA".into(), 1638 1, 1639 true, 1640 vec![ 1641 // Take lock defined above 1642 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1643 // Write CPU number (in first argument) to I/O port via field 1644 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 1645 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1646 // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning) 1647 &aml::If::new( 1648 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE), 1649 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 1650 ), 1651 // Release lock 1652 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1653 // Return 0 or 0xf 1654 &aml::Return::new(&aml::Local(0)), 1655 ], 1656 ) 1657 .append_aml_bytes(bytes); 1658 1659 let mut cpu_notifies = Vec::new(); 1660 for cpu_id in 0..self.max_vcpus { 1661 cpu_notifies.push(CpuNotify { cpu_id }); 1662 } 1663 1664 let mut cpu_notifies_refs: Vec<&dyn aml::Aml> = Vec::new(); 1665 for cpu_id in 
0..self.max_vcpus { 1666 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 1667 } 1668 1669 aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).append_aml_bytes(bytes); 1670 1671 aml::Method::new( 1672 "CEJ0".into(), 1673 1, 1674 true, 1675 vec![ 1676 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1677 // Write CPU number (in first argument) to I/O port via field 1678 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 1679 // Set CEJ0 bit 1680 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 1681 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1682 ], 1683 ) 1684 .append_aml_bytes(bytes); 1685 1686 aml::Method::new( 1687 "CSCN".into(), 1688 0, 1689 true, 1690 vec![ 1691 // Take lock defined above 1692 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1693 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1694 &aml::While::new( 1695 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 1696 vec![ 1697 // Write CPU number (in first argument) to I/O port via field 1698 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 1699 // Check if CINS bit is set 1700 &aml::If::new( 1701 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 1702 // Notify device if it is 1703 vec![ 1704 &aml::MethodCall::new( 1705 "CTFY".into(), 1706 vec![&aml::Local(0), &aml::ONE], 1707 ), 1708 // Reset CINS bit 1709 &aml::Store::new( 1710 &aml::Path::new("\\_SB_.PRES.CINS"), 1711 &aml::ONE, 1712 ), 1713 ], 1714 ), 1715 // Check if CRMV bit is set 1716 &aml::If::new( 1717 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 1718 // Notify device if it is (with the eject constant 0x3) 1719 vec![ 1720 &aml::MethodCall::new( 1721 "CTFY".into(), 1722 vec![&aml::Local(0), &3u8], 1723 ), 1724 // Reset CRMV bit 1725 &aml::Store::new( 1726 &aml::Path::new("\\_SB_.PRES.CRMV"), 1727 &aml::ONE, 1728 ), 1729 ], 1730 ), 1731 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 1732 ], 1733 ), 1734 // Release lock 1735 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1736 ], 1737 ) 1738 .append_aml_bytes(bytes) 1739 } else { 1740 aml::Method::new("CSCN".into(), 0, true, vec![]).append_aml_bytes(bytes) 1741 } 1742 } 1743 } 1744 1745 impl Aml for CpuManager { 1746 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1747 #[cfg(target_arch = "x86_64")] 1748 if let Some(acpi_address) = self.acpi_address { 1749 // CPU hotplug controller 1750 aml::Device::new( 1751 "_SB_.PRES".into(), 1752 vec![ 1753 &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")), 1754 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 1755 // Mutex to protect concurrent access as we write to choose CPU and then read back status 1756 &aml::Mutex::new("CPLK".into(), 0), 1757 &aml::Name::new( 1758 "_CRS".into(), 1759 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 1760 aml::AddressSpaceCachable::NotCacheable, 1761 true, 1762 acpi_address.0 as u64, 1763 acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 1764 )]), 1765 ), 1766 // OpRegion and Fields map MMIO range into individual field values 1767 &aml::OpRegion::new( 1768 "PRST".into(), 1769 aml::OpRegionSpace::SystemMemory, 1770 acpi_address.0 as usize, 1771 CPU_MANAGER_ACPI_SIZE, 1772 ), 1773 &aml::Field::new( 1774 "PRST".into(), 1775 aml::FieldAccessType::Byte, 1776 aml::FieldUpdateRule::WriteAsZeroes, 1777 vec![ 1778 aml::FieldEntry::Reserved(32), 1779 aml::FieldEntry::Named(*b"CPEN", 1), 1780 aml::FieldEntry::Named(*b"CINS", 1), 1781 aml::FieldEntry::Named(*b"CRMV", 1), 1782 
aml::FieldEntry::Named(*b"CEJ0", 1), 1783 aml::FieldEntry::Reserved(4), 1784 aml::FieldEntry::Named(*b"CCMD", 8), 1785 ], 1786 ), 1787 &aml::Field::new( 1788 "PRST".into(), 1789 aml::FieldAccessType::DWord, 1790 aml::FieldUpdateRule::Preserve, 1791 vec![ 1792 aml::FieldEntry::Named(*b"CSEL", 32), 1793 aml::FieldEntry::Reserved(32), 1794 aml::FieldEntry::Named(*b"CDAT", 32), 1795 ], 1796 ), 1797 ], 1798 ) 1799 .append_aml_bytes(bytes); 1800 } 1801 1802 // CPU devices 1803 let hid = aml::Name::new("_HID".into(), &"ACPI0010"); 1804 let uid = aml::Name::new("_CID".into(), &aml::EisaName::new("PNP0A05")); 1805 // Bundle methods together under a common object 1806 let methods = CpuMethods { 1807 max_vcpus: self.config.max_vcpus, 1808 dynamic: self.dynamic, 1809 }; 1810 let mut cpu_data_inner: Vec<&dyn aml::Aml> = vec![&hid, &uid, &methods]; 1811 1812 let mut cpu_devices = Vec::new(); 1813 for cpu_id in 0..self.config.max_vcpus { 1814 let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0); 1815 let cpu_device = Cpu { 1816 cpu_id, 1817 proximity_domain, 1818 dynamic: self.dynamic, 1819 }; 1820 1821 cpu_devices.push(cpu_device); 1822 } 1823 1824 for cpu_device in cpu_devices.iter() { 1825 cpu_data_inner.push(cpu_device); 1826 } 1827 1828 aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).append_aml_bytes(bytes) 1829 } 1830 } 1831 1832 impl Pausable for CpuManager { 1833 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 1834 // Tell the vCPUs to pause themselves next time they exit 1835 self.vcpus_pause_signalled.store(true, Ordering::SeqCst); 1836 1837 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 1838 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 1839 // above. 1840 for state in self.vcpu_states.iter() { 1841 state.signal_thread(); 1842 } 1843 1844 for vcpu in self.vcpus.iter() { 1845 let mut vcpu = vcpu.lock().unwrap(); 1846 vcpu.pause()?; 1847 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 1848 if !self.config.kvm_hyperv { 1849 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { 1850 MigratableError::Pause(anyhow!( 1851 "Could not notify guest it has been paused {:?}", 1852 e 1853 )) 1854 })?; 1855 } 1856 } 1857 1858 Ok(()) 1859 } 1860 1861 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 1862 for vcpu in self.vcpus.iter() { 1863 vcpu.lock().unwrap().resume()?; 1864 } 1865 1866 // Toggle the vCPUs pause boolean 1867 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 1868 1869 // Unpark all the VCPU threads. 1870 // Once unparked, the next thing they will do is checking for the pause 1871 // boolean. Since it'll be set to false, they will exit their pause loop 1872 // and go back to vmx root. 1873 for state in self.vcpu_states.iter() { 1874 state.unpark_thread(); 1875 } 1876 Ok(()) 1877 } 1878 } 1879 1880 impl Snapshottable for CpuManager { 1881 fn id(&self) -> String { 1882 CPU_MANAGER_SNAPSHOT_ID.to_string() 1883 } 1884 1885 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 1886 let mut cpu_manager_snapshot = Snapshot::new(CPU_MANAGER_SNAPSHOT_ID); 1887 1888 // The CpuManager snapshot is a collection of all vCPUs snapshots. 
1889 for vcpu in &self.vcpus { 1890 let cpu_snapshot = vcpu.lock().unwrap().snapshot()?; 1891 cpu_manager_snapshot.add_snapshot(cpu_snapshot); 1892 } 1893 1894 Ok(cpu_manager_snapshot) 1895 } 1896 1897 fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> { 1898 for (cpu_id, snapshot) in snapshot.snapshots.iter() { 1899 info!("Restoring VCPU {}", cpu_id); 1900 self.create_vcpu(cpu_id.parse::<u8>().unwrap(), None, Some(*snapshot.clone())) 1901 .map_err(|e| MigratableError::Restore(anyhow!("Could not create vCPU {:?}", e)))?; 1902 } 1903 1904 Ok(()) 1905 } 1906 } 1907 1908 impl Transportable for CpuManager {} 1909 impl Migratable for CpuManager {} 1910 1911 #[cfg(feature = "gdb")] 1912 impl Debuggable for CpuManager { 1913 #[cfg(feature = "kvm")] 1914 fn set_guest_debug( 1915 &self, 1916 cpu_id: usize, 1917 addrs: &[GuestAddress], 1918 singlestep: bool, 1919 ) -> std::result::Result<(), DebuggableError> { 1920 self.vcpus[cpu_id] 1921 .lock() 1922 .unwrap() 1923 .vcpu 1924 .set_guest_debug(addrs, singlestep) 1925 .map_err(DebuggableError::SetDebug) 1926 } 1927 1928 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 1929 Ok(()) 1930 } 1931 1932 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 1933 Ok(()) 1934 } 1935 1936 #[cfg(target_arch = "x86_64")] 1937 fn read_regs(&self, cpu_id: usize) -> std::result::Result<X86_64CoreRegs, DebuggableError> { 1938 // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15 1939 let gregs = self 1940 .get_regs(cpu_id as u8) 1941 .map_err(DebuggableError::ReadRegs)?; 1942 let regs = [ 1943 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp, 1944 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15, 1945 ]; 1946 1947 // GDB exposes 32-bit eflags instead of 64-bit rflags. 1948 // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml 1949 let eflags = gregs.rflags as u32; 1950 let rip = gregs.rip; 1951 1952 // Segment registers: CS, SS, DS, ES, FS, GS 1953 let sregs = self 1954 .get_sregs(cpu_id as u8) 1955 .map_err(DebuggableError::ReadRegs)?; 1956 let segments = X86SegmentRegs { 1957 cs: sregs.cs.selector as u32, 1958 ss: sregs.ss.selector as u32, 1959 ds: sregs.ds.selector as u32, 1960 es: sregs.es.selector as u32, 1961 fs: sregs.fs.selector as u32, 1962 gs: sregs.gs.selector as u32, 1963 }; 1964 1965 // TODO: Add other registers 1966 1967 Ok(X86_64CoreRegs { 1968 regs, 1969 eflags, 1970 rip, 1971 segments, 1972 ..Default::default() 1973 }) 1974 } 1975 1976 #[cfg(target_arch = "x86_64")] 1977 fn write_regs( 1978 &self, 1979 cpu_id: usize, 1980 regs: &X86_64CoreRegs, 1981 ) -> std::result::Result<(), DebuggableError> { 1982 let orig_gregs = self 1983 .get_regs(cpu_id as u8) 1984 .map_err(DebuggableError::ReadRegs)?; 1985 let gregs = StandardRegisters { 1986 rax: regs.regs[0], 1987 rbx: regs.regs[1], 1988 rcx: regs.regs[2], 1989 rdx: regs.regs[3], 1990 rsi: regs.regs[4], 1991 rdi: regs.regs[5], 1992 rbp: regs.regs[6], 1993 rsp: regs.regs[7], 1994 r8: regs.regs[8], 1995 r9: regs.regs[9], 1996 r10: regs.regs[10], 1997 r11: regs.regs[11], 1998 r12: regs.regs[12], 1999 r13: regs.regs[13], 2000 r14: regs.regs[14], 2001 r15: regs.regs[15], 2002 rip: regs.rip, 2003 // Update the lower 32-bit of rflags. 
            rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
        };

        self.set_regs(cpu_id as u8, &gregs)
            .map_err(DebuggableError::WriteRegs)?;

        // Segment registers: CS, SS, DS, ES, FS, GS
        // Since GDB cares only about selectors, we call get_sregs() first.
        let mut sregs = self
            .get_sregs(cpu_id as u8)
            .map_err(DebuggableError::ReadRegs)?;
        sregs.cs.selector = regs.segments.cs as u16;
        sregs.ss.selector = regs.segments.ss as u16;
        sregs.ds.selector = regs.segments.ds as u16;
        sregs.es.selector = regs.segments.es as u16;
        sregs.fs.selector = regs.segments.fs as u16;
        sregs.gs.selector = regs.segments.gs as u16;

        self.set_sregs(cpu_id as u8, &sregs)
            .map_err(DebuggableError::WriteRegs)?;

        // TODO: Add other registers

        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn read_mem(
        &self,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        let mut buf = vec![0; len];
        let mut total_read = 0_u64;

        while total_read < len as u64 {
            let gaddr = vaddr.0 + total_read;
            let paddr = match self.translate_gva(cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
            self.vmmops
                .guest_mem_read(
                    paddr,
                    &mut buf[total_read as usize..total_read as usize + read_len as usize],
                )
                .map_err(DebuggableError::ReadMem)?;
            total_read += read_len;
        }
        Ok(buf)
    }

    #[cfg(target_arch = "x86_64")]
    fn write_mem(
        &self,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        let mut total_written = 0_u64;

        while total_written < data.len() as u64 {
            let gaddr = vaddr.0 + total_written;
            let paddr = match self.translate_gva(cpu_id as u8, gaddr) {
                Ok(paddr) => paddr,
                Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
                Err(e) => return Err(DebuggableError::TranslateGva(e)),
            };
            let psize = arch::PAGE_SIZE as u64;
            let write_len = std::cmp::min(
                data.len() as u64 - total_written,
                psize - (paddr & (psize - 1)),
            );
            self.vmmops
                .guest_mem_write(
                    paddr,
                    &data[total_written as usize..total_written as usize + write_len as usize],
                )
                .map_err(DebuggableError::WriteMem)?;
            total_written += write_len;
        }
        Ok(())
    }

    fn active_vcpus(&self) -> usize {
        self.present_vcpus() as usize
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use arch::x86_64::interrupts::*;
    use arch::x86_64::regs::*;
    use hypervisor::x86_64::{FpuState, LapicState, StandardRegisters};

    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail if there is no irqchip created beforehand.
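        // Create the in-kernel irqchip so that the LAPIC state can be read back below.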
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = get_klapic_reg(&klapic_before, APIC_LVT0);
        let lint1 = get_klapic_reg(&klapic_before, APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = get_klapic_reg(&klapic_actual, APIC_LVT0);
        let lint1_mode_actual = get_klapic_reg(&klapic_actual, APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below would fail. Decide whether or not we should
        // remove it altogether.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::msr_index;
        use hypervisor::x86_64::{MsrEntries, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = MsrEntries::from_entries(&[MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }])
        .unwrap();

        // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read
        // one in this test scenario.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we did setup_msrs. We need to assert that the
        // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
        // expect.
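        // boot_msr_entries() holds the same list that setup_msrs configured, so its tenth
        // entry (index 9) should match the MSR value read back above.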
        let entry_vec = hypervisor::x86_64::boot_msr_entries();
        assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
    }

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let expected_regs: StandardRegisters = StandardRegisters {
            rflags: 0x0000000000000002u64,
            rbx: arch::layout::PVH_INFO_START.0,
            rip: 1,
            ..Default::default()
        };

        setup_regs(&vcpu, expected_regs.rip).unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use arch::aarch64::regs::*;
    use hypervisor::kvm::aarch64::{is_system_register, MPIDR_EL1};
    use hypervisor::kvm::kvm_bindings::{
        kvm_one_reg, kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
        KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
    };
    use hypervisor::{arm64_core_reg_id, offset__of};
    use std::mem;

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let res = setup_regs(&vcpu, 0, 0x0);
        // Must fail when vcpu is not initialized yet.
        assert!(res.is_err());

        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();
        vcpu.vcpu_init(&kvi).unwrap();

        assert!(setup_regs(&vcpu, 0, 0x0).is_ok());
    }

    #[test]
    fn test_read_mpidr() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        assert!(vcpu.read_mpidr().is_err());

        vcpu.vcpu_init(&kvi).unwrap();
        assert_eq!(vcpu.read_mpidr().unwrap(), 0x80000000);
    }

    #[test]
    fn test_is_system_register() {
        let offset = offset__of!(user_pt_regs, pc);
        let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
        assert!(!is_system_register(regid));
        let regid = KVM_REG_ARM64 as u64 | KVM_REG_SIZE_U64 as u64 | KVM_REG_ARM64_SYSREG as u64;
        assert!(is_system_register(regid));
    }

    #[test]
    fn test_save_restore_core_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
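        // Both core_registers() and set_core_registers() are expected to report ENOEXEC
        // ("Exec format error") until vcpu_init() has been called.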
        let mut state = kvm_regs::default();
        let res = vcpu.core_registers(&mut state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to get core register: Exec format error (os error 8)"
        );

        let res = vcpu.set_core_registers(&state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to set core register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        assert!(vcpu.core_registers(&mut state).is_ok());
        assert_eq!(state.regs.pstate, 0x3C5);

        assert!(vcpu.set_core_registers(&state).is_ok());
        let off = offset__of!(user_pt_regs, pstate);
        let pstate = vcpu
            .get_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .expect("Failed to call kvm get one reg");
        assert_eq!(state.regs.pstate, pstate);
    }

    #[test]
    fn test_save_restore_system_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        let mut state: Vec<kvm_one_reg> = Vec::new();
        let res = vcpu.system_registers(&mut state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to retrieve list of registers: Exec format error (os error 8)"
        );

        state.push(kvm_one_reg {
            id: MPIDR_EL1,
            addr: 0x00,
        });
        let res = vcpu.set_system_registers(&state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to set system register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        assert!(vcpu.system_registers(&mut state).is_ok());
        let initial_mpidr: u64 = vcpu.read_mpidr().expect("Fail to read mpidr");
        assert!(state.contains(&kvm_one_reg {
            id: MPIDR_EL1,
            addr: initial_mpidr
        }));

        assert!(vcpu.set_system_registers(&state).is_ok());
        let mpidr: u64 = vcpu.read_mpidr().expect("Fail to read mpidr");
        assert_eq!(initial_mpidr, mpidr);
    }

    #[test]
    fn test_get_set_mpstate() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        let res = vcpu.get_mp_state();
        assert!(res.is_ok());
        assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
    }
}