// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use crate::config::CpusConfig;
use crate::device_manager::DeviceManager;
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::GuestMemoryMmap;
use crate::CPU_MANAGER_SNAPSHOT_ID;
#[cfg(feature = "acpi")]
use acpi_tables::{aml, aml::Aml, sdt::Sdt};
use anyhow::anyhow;
use arch::EntryPoint;
#[cfg(any(target_arch = "aarch64", feature = "acpi"))]
use arch::NumaNodes;
use devices::interrupt_controller::InterruptController;
#[cfg(target_arch = "aarch64")]
use hypervisor::kvm::kvm_bindings;
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuId;
use hypervisor::{vm::VmmOps, CpuState, HypervisorCpuError, VmExit};
use libc::{c_void, siginfo_t};
use seccompiler::{apply_filter, SeccompAction};
#[cfg(feature = "acpi")]
use std::collections::BTreeMap;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};
use vm_device::BusDevice;
#[cfg(feature = "acpi")]
use vm_memory::GuestAddress;
use vm_memory::GuestMemoryAtomic;
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};

#[cfg(feature = "acpi")]
pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug)]
pub enum Error {
    /// Cannot create the vCPU.
    VcpuCreate(anyhow::Error),

    /// Cannot run the VCPUs.
    VcpuRun(anyhow::Error),

    /// Cannot spawn a new vCPU thread.
    VcpuSpawn(io::Error),

    /// Cannot generate common CPUID
    CommonCpuId(arch::Error),

    /// Error configuring VCPU
    VcpuConfiguration(arch::Error),

    #[cfg(target_arch = "aarch64")]
    /// Error fetching preferred target
    VcpuArmPreferredTarget(hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    /// Error doing vCPU init on Arm.
    VcpuArmInit(hypervisor::HypervisorCpuError),

    /// Failed to join on vCPU threads
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    /// Cannot add legacy device to Bus.
    BusError(vm_device::BusError),

    /// Asking for more vCPUs than we can have
    DesiredVCpuCountExceedsMax,

    /// Cannot create seccomp filter
    CreateSeccompFilter(seccompiler::Error),

    /// Cannot apply seccomp filter
    ApplySeccompFilter(seccompiler::Error),

    /// Error starting vCPU after restore
    StartRestoreVcpu(anyhow::Error),

    /// Error because an unexpected VmExit type was received.
    UnexpectedVmExit,

    /// Failed to allocate MMIO address
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    InitializeTdx(hypervisor::HypervisorCpuError),
}
pub type Result<T> = result::Result<T, Error>;

#[cfg(all(target_arch = "x86_64", feature = "acpi"))]
#[allow(dead_code)]
#[repr(packed)]
struct LocalApic {
    pub r#type: u8,
    pub length: u8,
    pub processor_id: u8,
    pub apic_id: u8,
    pub flags: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default)]
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
    pub ioapic_id: u8,
    _reserved: u8,
    pub apic_address: u32,
    pub gsi_base: u32,
}

#[cfg(all(target_arch = "aarch64", feature = "acpi"))]
#[allow(dead_code)]
#[repr(packed)]
struct GicC {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub cpu_interface_number: u32,
    pub uid: u32,
    pub flags: u32,
    pub parking_version: u32,
    pub performance_interrupt: u32,
    pub parked_address: u64,
    pub base_address: u64,
    pub gicv_base_address: u64,
    pub gich_base_address: u64,
    pub vgic_interrupt: u32,
    pub gicr_base_address: u64,
    pub mpidr: u64,
    pub proc_power_effi_class: u8,
    pub reserved1: u8,
    pub spe_overflow_interrupt: u16,
}

#[cfg(all(target_arch = "aarch64", feature = "acpi"))]
#[allow(dead_code)]
#[repr(packed)]
struct GicD {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub gic_id: u32,
    pub base_address: u64,
    pub global_irq_base: u32,
    pub version: u8,
    pub reserved1: [u8; 3],
}

#[cfg(all(target_arch = "aarch64", feature = "acpi"))]
#[allow(dead_code)]
#[repr(packed)]
struct GicR {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub base_address: u64,
    pub range_length: u32,
}

#[cfg(all(target_arch = "aarch64", feature = "acpi"))]
#[allow(dead_code)]
#[repr(packed)]
struct GicIts {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub translation_id: u32,
    pub base_address: u64,
    pub reserved1: u32,
}

#[cfg(all(target_arch = "aarch64", feature = "acpi"))]
#[allow(dead_code)]
#[repr(packed)]
struct ProcessorHierarchyNode {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub flags: u32,
    pub parent: u32,
    pub acpi_processor_id: u32,
    pub num_private_resources: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default)]
struct InterruptSourceOverride {
    pub r#type: u8,
    pub length: u8,
    pub bus: u8,
    pub source: u8,
    pub gsi: u32,
    pub flags: u16,
}

/// A wrapper around creating and using a kvm-based VCPU.
pub struct Vcpu {
    // The hypervisor abstracted CPU.
    vcpu: Arc<dyn hypervisor::Vcpu>,
    id: u8,
    #[cfg(target_arch = "aarch64")]
    mpidr: u64,
    saved_state: Option<CpuState>,
}

impl Vcpu {
    /// Constructs a new VCPU for `vm`.
    ///
    /// # Arguments
    ///
    /// * `id` - Represents the CPU number between [0, max vcpus).
    /// * `vm` - The virtual machine this vcpu will get attached to.
    /// * `vmmops` - Optional object for exit handling.
    pub fn new(
        id: u8,
        vm: &Arc<dyn hypervisor::Vm>,
        vmmops: Option<Arc<dyn VmmOps>>,
    ) -> Result<Arc<Mutex<Self>>> {
        let vcpu = vm
            .create_vcpu(id, vmmops)
            .map_err(|e| Error::VcpuCreate(e.into()))?;
        // Initially the cpuid per vCPU is the one supported by this VM.
        Ok(Arc::new(Mutex::new(Vcpu {
            vcpu,
            id,
            #[cfg(target_arch = "aarch64")]
            mpidr: 0,
            saved_state: None,
        })))
    }

    /// Configures a vcpu and should be called once per vcpu when created.
    ///
    /// # Arguments
    ///
    /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
    /// * `vm_memory` - Guest memory.
    /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
    pub fn configure(
        &mut self,
        #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
        kernel_entry_point: Option<EntryPoint>,
        vm_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        #[cfg(target_arch = "x86_64")] cpuid: CpuId,
        #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
    ) -> Result<()> {
        #[cfg(target_arch = "aarch64")]
        {
            self.init(vm)?;
            self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, kernel_entry_point, vm_memory)
                .map_err(Error::VcpuConfiguration)?;
        }
        info!("Configuring vCPU: cpu_id = {}", self.id);
        #[cfg(target_arch = "x86_64")]
        arch::configure_vcpu(
            &self.vcpu,
            self.id,
            kernel_entry_point,
            vm_memory,
            cpuid,
            kvm_hyperv,
        )
        .map_err(Error::VcpuConfiguration)?;

        Ok(())
    }

    /// Gets the MPIDR register value.
    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidr(&self) -> u64 {
        self.mpidr
    }

    /// Gets the saved vCPU state.
    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_state(&self) -> Option<CpuState> {
        self.saved_state.clone()
    }

    /// Initializes an aarch64 specific vcpu for booting Linux.
    #[cfg(target_arch = "aarch64")]
    pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
        let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();

        // This reads back the kernel's preferred target type.
        vm.get_preferred_target(&mut kvi)
            .map_err(Error::VcpuArmPreferredTarget)?;
        // We already checked that the capability is supported.
        kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
        // Non-boot cpus are powered off initially.
        if self.id > 0 {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
        }
        self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
    }

    /// Runs the VCPU until it exits, returning the reason.
    ///
    /// Note that the state of the VCPU and associated VM must be setup first for this to do
    /// anything useful.
    pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
        self.vcpu.run()
    }
}

const VCPU_SNAPSHOT_ID: &str = "vcpu";
impl Pausable for Vcpu {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        self.saved_state =
            Some(self.vcpu.state().map_err(|e| {
                MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e))
            })?);

        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        if let Some(vcpu_state) = &self.saved_state {
            self.vcpu.set_state(vcpu_state).map_err(|e| {
                MigratableError::Pause(anyhow!("Could not set the vCPU state {:?}", e))
            })?;
        }

        Ok(())
    }
}
impl Snapshottable for Vcpu {
    fn id(&self) -> String {
        VCPU_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vcpu_snapshot = Snapshot::new(&format!("{}", self.id));
        vcpu_snapshot.add_data_section(SnapshotDataSection::new_from_state(
            VCPU_SNAPSHOT_ID,
            &self.saved_state,
        )?);

        Ok(vcpu_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        self.saved_state = Some(snapshot.to_state(VCPU_SNAPSHOT_ID)?);
        Ok(())
    }
}

pub struct CpuManager {
    config: CpusConfig,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm_memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    cpuid: CpuId,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm: Arc<dyn hypervisor::Vm>,
    vcpus_kill_signalled: Arc<AtomicBool>,
    vcpus_pause_signalled: Arc<AtomicBool>,
    exit_evt: EventFd,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    reset_evt: EventFd,
    vcpu_states: Vec<VcpuState>,
    selected_cpu: u8,
    vcpus: Vec<Arc<Mutex<Vcpu>>>,
    seccomp_action: SeccompAction,
    vmmops: Arc<dyn VmmOps>,
    #[cfg(feature = "acpi")]
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    acpi_address: GuestAddress,
    #[cfg(feature = "acpi")]
    proximity_domain_per_cpu: BTreeMap<u8, u32>,
}

const CPU_ENABLE_FLAG: usize = 0;
const CPU_INSERTING_FLAG: usize = 1;
const CPU_REMOVING_FLAG: usize = 2;
const CPU_EJECT_FLAG: usize = 3;

const CPU_STATUS_OFFSET: u64 = 4;
const CPU_SELECTION_OFFSET: u64 = 0;

impl BusDevice for CpuManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
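        // Layout of this small MMIO region (matching the offsets/flags defined
        // above and the PRST field definitions in the AML further down): the
        // byte at offset 0 holds the currently selected vCPU index, the byte at
        // offset 4 holds that vCPU's status/control bits (enable, inserting,
        // removing, eject). Clear the buffer before reporting anything.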
        data.fill(0);

        match offset {
            CPU_SELECTION_OFFSET => {
                data[0] = self.selected_cpu;
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &self.vcpu_states[usize::from(self.selected_cpu)];
                    if state.active() {
                        data[0] |= 1 << CPU_ENABLE_FLAG;
                    }
                    if state.inserting {
                        data[0] |= 1 << CPU_INSERTING_FLAG;
                    }
                    if state.removing {
                        data[0] |= 1 << CPU_REMOVING_FLAG;
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            CPU_SELECTION_OFFSET => {
                self.selected_cpu = data[0];
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
                    // The ACPI code writes back a 1 to acknowledge the insertion
                    if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
                        && state.inserting
                    {
                        state.inserting = false;
                    }
                    // Ditto for removal
                    if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
                        && state.removing
                    {
                        state.removing = false;
                    }
                    // Trigger removal of vCPU
                    if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
                        if let Err(e) = self.remove_vcpu(self.selected_cpu) {
                            error!("Error removing vCPU: {:?}", e);
                        }
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
        None
    }
}

#[derive(Default)]
struct VcpuState {
    inserting: bool,
    removing: bool,
    handle: Option<thread::JoinHandle<()>>,
    kill: Arc<AtomicBool>,
    vcpu_run_interrupted: Arc<AtomicBool>,
}

impl VcpuState {
    fn active(&self) -> bool {
        self.handle.is_some()
    }

    fn signal_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            loop {
                unsafe {
                    libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
                }
                if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
                    break;
                } else {
                    // This is more effective than thread::yield_now() at
                    // avoiding a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }
    }

    fn join_thread(&mut self) -> Result<()> {
        if let Some(handle) = self.handle.take() {
            handle.join().map_err(Error::ThreadCleanup)?
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}

impl CpuManager {
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        device_manager: &Arc<Mutex<DeviceManager>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vmmops: Arc<dyn VmmOps>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        #[cfg(any(target_arch = "aarch64", feature = "acpi"))] numa_nodes: &NumaNodes,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        let guest_memory = memory_manager.lock().unwrap().guest_memory();
        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);

        #[cfg(target_arch = "x86_64")]
        let sgx_epc_sections = memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
        #[cfg(target_arch = "x86_64")]
        let cpuid = {
            let phys_bits = physical_bits(
                config.max_phys_bits,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            );
            arch::generate_common_cpuid(
                hypervisor,
                config
                    .topology
                    .clone()
                    .map(|t| (t.threads_per_core, t.cores_per_die, t.dies_per_package)),
                sgx_epc_sections,
                phys_bits,
                config.kvm_hyperv,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CommonCpuId)?
        };

        let device_manager = device_manager.lock().unwrap();
        #[cfg(feature = "acpi")]
        let acpi_address = device_manager
            .allocator()
            .lock()
            .unwrap()
            .allocate_mmio_addresses(None, CPU_MANAGER_ACPI_SIZE as u64, None)
            .ok_or(Error::AllocateMmmioAddress)?;

        #[cfg(feature = "acpi")]
        let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
            let mut cpu_list = Vec::new();
            for (proximity_domain, numa_node) in numa_nodes.iter() {
                for cpu in numa_node.cpus.iter() {
                    cpu_list.push((*cpu, *proximity_domain))
                }
            }
            cpu_list
        }
        .into_iter()
        .collect();

        let cpu_manager = Arc::new(Mutex::new(CpuManager {
            config: config.clone(),
            interrupt_controller: device_manager.interrupt_controller().clone(),
            vm_memory: guest_memory,
            #[cfg(target_arch = "x86_64")]
            cpuid,
            vm,
            vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
            vcpu_states,
            exit_evt,
            reset_evt,
            selected_cpu: 0,
            vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
            seccomp_action,
            vmmops,
            #[cfg(feature = "acpi")]
            acpi_address,
            #[cfg(feature = "acpi")]
            proximity_domain_per_cpu,
        }));

        #[cfg(feature = "acpi")]
        device_manager
            .mmio_bus()
            .insert(
                cpu_manager.clone(),
                acpi_address.0,
                CPU_MANAGER_ACPI_SIZE as u64,
            )
            .map_err(Error::BusError)?;

        Ok(cpu_manager)
    }

    fn create_vcpu(
        &mut self,
        cpu_id: u8,
        entry_point: Option<EntryPoint>,
        snapshot: Option<Snapshot>,
    ) -> Result<Arc<Mutex<Vcpu>>> {
        info!("Creating vCPU: cpu_id = {}", cpu_id);

        let vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vmmops.clone()))?;

        if let Some(snapshot) = snapshot {
            // AArch64 vCPUs should be initialized after creation.
            #[cfg(target_arch = "aarch64")]
            vcpu.lock().unwrap().init(&self.vm)?;

            vcpu.lock()
                .unwrap()
                .restore(snapshot)
                .expect("Failed to restore vCPU");
        } else {
            let vm_memory = self.vm_memory.clone();

            #[cfg(target_arch = "x86_64")]
            vcpu.lock()
                .unwrap()
                .configure(
                    entry_point,
                    &vm_memory,
                    self.cpuid.clone(),
                    self.config.kvm_hyperv,
                )
                .expect("Failed to configure vCPU");

            #[cfg(target_arch = "aarch64")]
            vcpu.lock()
                .unwrap()
                .configure(&self.vm, entry_point, &vm_memory)
                .expect("Failed to configure vCPU");
        }

        // Adding vCPU to the CpuManager's vCPU list.
        self.vcpus.push(Arc::clone(&vcpu));

        Ok(vcpu)
    }

    /// Only create new vCPUs if there aren't any inactive ones to reuse
    fn create_vcpus(&mut self, desired_vcpus: u8, entry_point: Option<EntryPoint>) -> Result<()> {
        info!(
            "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.config.max_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        // Only create vCPUs in excess of all the allocated vCPUs.
        for cpu_id in self.vcpus.len() as u8..desired_vcpus {
            self.create_vcpu(cpu_id, entry_point, None)?;
        }

        Ok(())
    }

    fn start_vcpu(
        &mut self,
        vcpu: Arc<Mutex<Vcpu>>,
        vcpu_thread_barrier: Arc<Barrier>,
        inserting: bool,
    ) -> Result<()> {
        let cpu_id = vcpu.lock().unwrap().id;
        let reset_evt = self.reset_evt.try_clone().unwrap();
        let exit_evt = self.exit_evt.try_clone().unwrap();
        let panic_exit_evt = self.exit_evt.try_clone().unwrap();
        let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
        let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();

        let vcpu_kill = self.vcpu_states[usize::from(cpu_id)].kill.clone();
        let vcpu_run_interrupted = self.vcpu_states[usize::from(cpu_id)]
            .vcpu_run_interrupted
            .clone();
        let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();

        info!("Starting vCPU: cpu_id = {}", cpu_id);

        // Retrieve seccomp filter for vcpu thread
        let vcpu_seccomp_filter = get_seccomp_filter(&self.seccomp_action, Thread::Vcpu)
            .map_err(Error::CreateSeccompFilter)?;

        #[cfg(target_arch = "x86_64")]
        let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();

        let handle = Some(
            thread::Builder::new()
                .name(format!("vcpu{}", cpu_id))
                .spawn(move || {
                    // Apply seccomp filter for vcpu thread.
                    if !vcpu_seccomp_filter.is_empty() {
                        if let Err(e) =
                            apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
                        {
                            error!("Error applying seccomp filter: {:?}", e);
                            return;
                        }
                    }
                    extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
                    // This uses an async signal safe handler to kill the vcpu handles.
                    register_signal_handler(SIGRTMIN(), handle_signal)
                        .expect("Failed to register vcpu signal handler");
                    // Block until all CPUs are ready.
                    vcpu_thread_barrier.wait();

                    std::panic::catch_unwind(move || {
                        loop {
                            // If we are being told to pause, we park the thread
                            // until the pause boolean is toggled.
                            // The resume operation is responsible for toggling
                            // the boolean and unparking the thread.
                            // We enter a loop because park() could spuriously
                            // return. We will then park() again unless the
                            // pause boolean has been toggled.

                            // Need to use Ordering::SeqCst as we have multiple
                            // loads and stores to different atomics and we need
                            // to see them in a consistent order in all threads

                            if vcpu_pause_signalled.load(Ordering::SeqCst) {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                    thread::park();
                                }
                                vcpu_run_interrupted.store(false, Ordering::SeqCst);
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }

                            // vcpu.run() returns false on a triple-fault so trigger a reset
                            match vcpu.lock().unwrap().run() {
                                Ok(run) => match run {
                                    #[cfg(target_arch = "x86_64")]
                                    VmExit::IoapicEoi(vector) => {
                                        if let Some(interrupt_controller) =
                                            &interrupt_controller_clone
                                        {
                                            interrupt_controller
                                                .lock()
                                                .unwrap()
                                                .end_of_interrupt(vector);
                                        }
                                    }
                                    VmExit::Ignore => {}
                                    VmExit::Hyperv => {}
                                    VmExit::Reset => {
                                        info!("VmExit::Reset");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        reset_evt.write(1).unwrap();
                                        break;
                                    }
                                    VmExit::Shutdown => {
                                        info!("VmExit::Shutdown");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        exit_evt.write(1).unwrap();
                                        break;
                                    }
                                    _ => {
                                        error!(
                                            "VCPU generated error: {:?}",
                                            Error::UnexpectedVmExit
                                        );
                                        break;
                                    }
                                },

                                Err(e) => {
                                    error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
                                    break;
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }
                        }
                    })
                    .or_else(|_| {
                        panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
                        error!("vCPU thread panicked");
                        panic_exit_evt.write(1)
                    })
                    .ok();
                })
                .map_err(Error::VcpuSpawn)?,
        );

        // On hot plug calls into this function entry_point is None. It is for
        // those hotplug CPU additions that we need to set the inserting flag.
        self.vcpu_states[usize::from(cpu_id)].handle = handle;
        self.vcpu_states[usize::from(cpu_id)].inserting = inserting;

        Ok(())
    }

    /// Start up as many vCPU threads as needed to reach `desired_vcpus`
    fn activate_vcpus(&mut self, desired_vcpus: u8, inserting: bool) -> Result<()> {
        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        let vcpu_thread_barrier = Arc::new(Barrier::new(
            (desired_vcpus - self.present_vcpus() + 1) as usize,
        ));

        info!(
            "Starting vCPUs: desired = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        // This reuses any inactive vCPUs as well as any that were newly created
        for cpu_id in self.present_vcpus()..desired_vcpus {
            let vcpu = Arc::clone(&self.vcpus[cpu_id as usize]);
            self.start_vcpu(vcpu, vcpu_thread_barrier.clone(), inserting)?;
        }

        // Unblock all CPU threads.
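        // The barrier was created with (new vCPUs + 1) parties, so this single
        // wait() from the caller releases every freshly spawned vCPU thread at once.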
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal, actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
        }
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let mut state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" so that it can be reused
        state.kill.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(&mut self, entry_point: Option<EntryPoint>) -> Result<()> {
        self.create_vcpus(self.boot_vcpus(), entry_point)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
    pub fn start_boot_vcpus(&mut self) -> Result<()> {
        self.activate_vcpus(self.boot_vcpus(), false)
    }

    pub fn start_restored_vcpus(&mut self) -> Result<()> {
        let vcpu_numbers = self.vcpus.len();
        let vcpu_thread_barrier = Arc::new(Barrier::new((vcpu_numbers + 1) as usize));
        // Restore the vCPUs in "paused" state.
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        for vcpu_index in 0..vcpu_numbers {
            let vcpu = Arc::clone(&self.vcpus[vcpu_index as usize]);

            self.start_vcpu(vcpu, vcpu_thread_barrier.clone(), false)
                .map_err(|e| {
                    Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
                })?;
        }
        // Unblock all restored CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }

    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        match desired_vcpus.cmp(&self.present_vcpus()) {
            cmp::Ordering::Greater => {
                self.create_vcpus(desired_vcpus, None)?;
                self.activate_vcpus(desired_vcpus, true)?;
                Ok(true)
            }
            cmp::Ordering::Less => {
                self.mark_vcpus_for_removal(desired_vcpus);
                Ok(true)
            }
            _ => Ok(false),
        }
    }

    pub fn shutdown(&mut self) -> Result<()> {
        // Tell the vCPUs to stop themselves next time they go through the loop
        self.vcpus_kill_signalled.store(true, Ordering::SeqCst);

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        // Wait for all the threads to finish. This removes the state from the vector.
        for mut state in self.vcpu_states.drain(..) {
            state.join_thread()?;
        }

        Ok(())
    }

    #[cfg(feature = "tdx")]
    pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
        for vcpu in &self.vcpus {
            vcpu.lock()
                .unwrap()
                .vcpu
                .tdx_init(hob_address)
                .map_err(Error::InitializeTdx)?;
        }
        Ok(())
    }

    pub fn boot_vcpus(&self) -> u8 {
        self.config.boot_vcpus
    }

    pub fn max_vcpus(&self) -> u8 {
        self.config.max_vcpus
    }

    #[cfg(target_arch = "x86_64")]
    pub fn common_cpuid(&self) -> CpuId {
        self.cpuid.clone()
    }

    fn present_vcpus(&self) -> u8 {
        self.vcpu_states
            .iter()
            .fold(0, |acc, state| acc + state.active() as u8)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidrs(&self) -> Vec<u64> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_mpidr())
            .collect()
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_states(&self) -> Vec<CpuState> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
            .collect()
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
        self.config
            .topology
            .clone()
            .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
    }

    #[cfg(feature = "acpi")]
    pub fn create_madt(&self) -> Sdt {
        use crate::acpi;
        // This is also checked in the commandline parsing.
        assert!(self.config.boot_vcpus <= self.config.max_vcpus);

        let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1);
        #[cfg(target_arch = "x86_64")]
        {
            madt.write(36, arch::layout::APIC_START);

            for cpu in 0..self.config.max_vcpus {
                let lapic = LocalApic {
                    r#type: acpi::ACPI_APIC_PROCESSOR,
                    length: 8,
                    processor_id: cpu,
                    apic_id: cpu,
                    flags: if cpu < self.config.boot_vcpus {
                        1 << MADT_CPU_ENABLE_FLAG
                    } else {
                        0
                    },
                };
                madt.append(lapic);
            }

            madt.append(Ioapic {
                r#type: acpi::ACPI_APIC_IO,
                length: 12,
                ioapic_id: 0,
                apic_address: arch::layout::IOAPIC_START.0 as u32,
                gsi_base: 0,
                ..Default::default()
            });

            madt.append(InterruptSourceOverride {
                r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
                length: 10,
                bus: 0,
                source: 4,
                gsi: 4,
                flags: 0,
            });
        }

        #[cfg(target_arch = "aarch64")]
        {
            /* Notes:
             * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
             */

            // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
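            // One GICC entry is emitted per boot vCPU; only the affinity fields of
            // the MPIDR are reported, the remaining bits are masked off below.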
            for cpu in 0..self.config.boot_vcpus {
                let vcpu = &self.vcpus[cpu as usize];
                let mpidr = vcpu.lock().unwrap().get_mpidr();
                /* ARMv8 MPIDR format:
                     Bits [63:40] Must be zero
                     Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
                     Bits [31:24] Must be zero
                     Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
                     Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
                     Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
                */
                let mpidr_mask = 0xff_00ff_ffff;
                let gicc = GicC {
                    r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
                    length: 80,
                    reserved0: 0,
                    cpu_interface_number: cpu as u32,
                    uid: cpu as u32,
                    flags: 1,
                    parking_version: 0,
                    performance_interrupt: 0,
                    parked_address: 0,
                    base_address: 0,
                    gicv_base_address: 0,
                    gich_base_address: 0,
                    vgic_interrupt: 0,
                    gicr_base_address: 0,
                    mpidr: mpidr & mpidr_mask,
                    proc_power_effi_class: 0,
                    reserved1: 0,
                    spe_overflow_interrupt: 0,
                };

                madt.append(gicc);
            }

            // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
            let gicd = GicD {
                r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
                length: 24,
                reserved0: 0,
                gic_id: 0,
                base_address: arch::layout::MAPPED_IO_START - 0x0001_0000,
                global_irq_base: 0,
                version: 3,
                reserved1: [0; 3],
            };
            madt.append(gicd);

            // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
            let gicr_size: u32 = 0x0001_0000 * 2 * (self.config.boot_vcpus as u32);
            let gicr_base: u64 = arch::layout::MAPPED_IO_START - 0x0001_0000 - gicr_size as u64;
            let gicr = GicR {
                r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
                length: 16,
                reserved: 0,
                base_address: gicr_base,
                range_length: gicr_size,
            };
            madt.append(gicr);

            // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
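            // The ITS control frame is reported two 64 KiB frames below the
            // redistributor base computed above.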
            let gicits = GicIts {
                r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
                length: 20,
                reserved0: 0,
                translation_id: 0,
                base_address: gicr_base - 2 * 0x0001_0000,
                reserved1: 0,
            };
            madt.append(gicits);

            madt.update_checksum();
        }

        madt
    }

    #[cfg(all(target_arch = "aarch64", feature = "acpi"))]
    pub fn create_pptt(&self) -> Sdt {
        let pptt_start = 0;
        let mut cpus = 0;
        let mut uid = 0;
        let threads_per_core = self.get_vcpu_topology().unwrap_or_default().0 as u8;
        let cores_per_package = self.get_vcpu_topology().unwrap_or_default().1 as u8;
        let packages = self.get_vcpu_topology().unwrap_or_default().2 as u8;

        let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1);

        for cluster_idx in 0..packages {
            if cpus < self.config.boot_vcpus as usize {
                let cluster_offset = pptt.len() - pptt_start;
                let cluster_hierarchy_node = ProcessorHierarchyNode {
                    r#type: 0,
                    length: 20,
                    reserved: 0,
                    flags: 0x2,
                    parent: 0,
                    acpi_processor_id: cluster_idx as u32,
                    num_private_resources: 0,
                };
                pptt.append(cluster_hierarchy_node);

                for core_idx in 0..cores_per_package {
                    let core_offset = pptt.len() - pptt_start;

                    if threads_per_core > 1 {
                        let core_hierarchy_node = ProcessorHierarchyNode {
                            r#type: 0,
                            length: 20,
                            reserved: 0,
                            flags: 0x2,
                            parent: cluster_offset as u32,
                            acpi_processor_id: core_idx as u32,
                            num_private_resources: 0,
                        };
                        pptt.append(core_hierarchy_node);

                        for _thread_idx in 0..threads_per_core {
                            let thread_hierarchy_node = ProcessorHierarchyNode {
                                r#type: 0,
                                length: 20,
                                reserved: 0,
                                flags: 0xE,
                                parent: core_offset as u32,
                                acpi_processor_id: uid as u32,
                                num_private_resources: 0,
                            };
                            pptt.append(thread_hierarchy_node);
                            uid += 1;
                        }
                    } else {
                        let thread_hierarchy_node = ProcessorHierarchyNode {
                            r#type: 0,
                            length: 20,
                            reserved: 0,
                            flags: 0xA,
                            parent: cluster_offset as u32,
                            acpi_processor_id: uid as u32,
                            num_private_resources: 0,
                        };
                        pptt.append(thread_hierarchy_node);
                        uid += 1;
                    }
                }
                cpus += (cores_per_package * threads_per_core) as usize;
            }
        }

        pptt.update_checksum();
        pptt
    }
}

#[cfg(feature = "acpi")]
struct Cpu {
    cpu_id: u8,
    proximity_domain: u32,
}

#[cfg(all(target_arch = "x86_64", feature = "acpi"))]
const MADT_CPU_ENABLE_FLAG: usize = 0;

#[cfg(feature = "acpi")]
impl Cpu {
    #[cfg(target_arch = "x86_64")]
    fn generate_mat(&self) -> Vec<u8> {
        let lapic = LocalApic {
            r#type: 0,
            length: 8,
            processor_id: self.cpu_id,
            apic_id: self.cpu_id,
            flags: 1 << MADT_CPU_ENABLE_FLAG,
        };

        let mut mat_data: Vec<u8> = Vec::new();
        mat_data.resize(std::mem::size_of_val(&lapic), 0);
        unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic };

        mat_data
    }
}

#[cfg(feature = "acpi")]
impl Aml for Cpu {
    fn to_aml_bytes(&self) -> Vec<u8> {
        #[cfg(target_arch = "x86_64")]
        let mat_data: Vec<u8> = self.generate_mat();

        aml::Device::new(
            format!("C{:03}", self.cpu_id).as_str().into(),
            vec![
                &aml::Name::new("_HID".into(), &"ACPI0007"),
                &aml::Name::new("_UID".into(), &self.cpu_id),
                // Currently, AArch64 cannot support the following fields.
                /*
                    _STA return value:
                    Bit [0] – Set if the device is present.
                    Bit [1] – Set if the device is enabled and decoding its resources.
                    Bit [2] – Set if the device should be shown in the UI.
                    Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
                    Bit [4] – Set if the battery is present.
                    Bits [31:5] – Reserved (must be cleared).
                */
                #[cfg(target_arch = "x86_64")]
                &aml::Method::new(
                    "_STA".into(),
                    0,
                    false,
                    // Call into CSTA method which will interrogate device
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "CSTA".into(),
                        vec![&self.cpu_id],
                    ))],
                ),
                &aml::Method::new(
                    "_PXM".into(),
                    0,
                    false,
                    vec![&aml::Return::new(&self.proximity_domain)],
                ),
                // The Linux kernel expects every CPU device to have a _MAT entry
                // containing the LAPIC for this processor with the enabled bit set
                // even if it is disabled in the MADT (non-boot CPU)
                #[cfg(target_arch = "x86_64")]
                &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)),
                // Trigger CPU ejection
                #[cfg(target_arch = "x86_64")]
                &aml::Method::new(
                    "_EJ0".into(),
                    1,
                    false,
                    // Call into CEJ0 method which will actually eject device
                    vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
                ),
            ],
        )
        .to_aml_bytes()
    }
}

#[cfg(feature = "acpi")]
struct CpuNotify {
    cpu_id: u8,
}

#[cfg(feature = "acpi")]
impl Aml for CpuNotify {
    fn to_aml_bytes(&self) -> Vec<u8> {
        let object = aml::Path::new(&format!("C{:03}", self.cpu_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .to_aml_bytes()
    }
}

#[cfg(feature = "acpi")]
struct CpuMethods {
    max_vcpus: u8,
}

#[cfg(feature = "acpi")]
impl Aml for CpuMethods {
    fn to_aml_bytes(&self) -> Vec<u8> {
        let mut bytes = Vec::new();
        bytes.extend_from_slice(
            // CPU status method
            &aml::Method::new(
                "CSTA".into(),
                1,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                    &aml::If::new(
                        &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
                        vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                    // Return 0 or 0xf
                    &aml::Return::new(&aml::Local(0)),
                ],
            )
            .to_aml_bytes(),
        );

        let mut cpu_notifies = Vec::new();
        for cpu_id in 0..self.max_vcpus {
            cpu_notifies.push(CpuNotify { cpu_id });
        }

        let mut cpu_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
        for cpu_id in 0..self.max_vcpus {
            cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
        }

        bytes.extend_from_slice(
            &aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(),
        );

        bytes.extend_from_slice(
            &aml::Method::new(
                "CEJ0".into(),
                1,
                true,
                vec![
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    // Set CEJ0 bit
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(),
        );

        bytes.extend_from_slice(
            &aml::Method::new(
                "CSCN".into(),
                0,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    &aml::While::new(
                        &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
                        vec![
                            // Write CPU number (in first argument) to I/O port via field
                            &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
                            // Check if CINS bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
                                // Notify device if it is
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &aml::ONE],
                                    ),
                                    // Reset CINS bit
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CINS"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            // Check if CRMV bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
                                // Notify device if it is (with the eject constant 0x3)
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &3u8],
                                    ),
                                    // Reset CRMV bit
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CRMV"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                        ],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(),
        );
        bytes
    }
}

#[cfg(feature = "acpi")]
impl Aml for CpuManager {
    fn to_aml_bytes(&self) -> Vec<u8> {
        let mut bytes = Vec::new();
        // CPU hotplug controller
        #[cfg(target_arch = "x86_64")]
        bytes.extend_from_slice(
            &aml::Device::new(
                "_SB_.PRES".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose CPU and then read back status
                    &aml::Mutex::new("CPLK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCachable::NotCacheable,
                            true,
                            self.acpi_address.0 as u64,
                            self.acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
                        )]),
                    ),
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "PRST".into(),
                        aml::OpRegionSpace::SystemMemory,
                        self.acpi_address.0 as usize,
                        CPU_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CPEN", 1),
                            aml::FieldEntry::Named(*b"CINS", 1),
                            aml::FieldEntry::Named(*b"CRMV", 1),
                            aml::FieldEntry::Named(*b"CEJ0", 1),
                            aml::FieldEntry::Reserved(4),
                            aml::FieldEntry::Named(*b"CCMD", 8),
                        ],
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"CSEL", 32),
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CDAT", 32),
                        ],
                    ),
                ],
            )
            .to_aml_bytes(),
        );

        // CPU devices
        let hid = aml::Name::new("_HID".into(), &"ACPI0010");
        let uid = aml::Name::new("_CID".into(), &aml::EisaName::new("PNP0A05"));
        // Bundle methods together under a common object
        let methods = CpuMethods {
            max_vcpus: self.config.max_vcpus,
        };
        let mut cpu_data_inner: Vec<&dyn aml::Aml> = vec![&hid, &uid, &methods];

        let mut cpu_devices = Vec::new();
        for cpu_id in 0..self.config.max_vcpus {
            let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
            let cpu_device = Cpu {
                cpu_id,
                proximity_domain,
            };

            cpu_devices.push(cpu_device);
        }

        for cpu_device in cpu_devices.iter() {
            cpu_data_inner.push(cpu_device);
        }

        bytes.extend_from_slice(
            &aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(),
        );
        bytes
    }
}

impl Pausable for CpuManager {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        // Tell the vCPUs to pause themselves next time they exit
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        for vcpu in self.vcpus.iter() {
            let mut vcpu = vcpu.lock().unwrap();
            vcpu.pause()?;
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            if !self.config.kvm_hyperv {
                vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
                    MigratableError::Pause(anyhow!(
                        "Could not notify guest it has been paused {:?}",
                        e
                    ))
                })?;
            }
        }

        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        for vcpu in self.vcpus.iter() {
            vcpu.lock().unwrap().resume()?;
        }

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        // Once unparked, the next thing they will do is check the pause
        // boolean. Since it'll be set to false, they will exit their pause loop
        // and go back to vmx root.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }
        Ok(())
    }
}

impl Snapshottable for CpuManager {
    fn id(&self) -> String {
        CPU_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut cpu_manager_snapshot = Snapshot::new(CPU_MANAGER_SNAPSHOT_ID);

        // The CpuManager snapshot is a collection of all vCPUs snapshots.
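        // Each vCPU snapshot is keyed by the vCPU id ("0", "1", ...), which is the
        // string that restore() below parses back into a u8 when recreating the vCPUs.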
        for vcpu in &self.vcpus {
            let cpu_snapshot = vcpu.lock().unwrap().snapshot()?;
            cpu_manager_snapshot.add_snapshot(cpu_snapshot);
        }

        Ok(cpu_manager_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        for (cpu_id, snapshot) in snapshot.snapshots.iter() {
            info!("Restoring VCPU {}", cpu_id);
            self.create_vcpu(cpu_id.parse::<u8>().unwrap(), None, Some(*snapshot.clone()))
                .map_err(|e| MigratableError::Restore(anyhow!("Could not create vCPU {:?}", e)))?;
        }

        Ok(())
    }
}

impl Transportable for CpuManager {}
impl Migratable for CpuManager {}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use arch::x86_64::interrupts::*;
    use arch::x86_64::regs::*;
    use hypervisor::x86_64::{FpuState, LapicState, StandardRegisters};

    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail if there is no irqchip beforehand.
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = get_klapic_reg(&klapic_before, APIC_LVT0);
        let lint1 = get_klapic_reg(&klapic_before, APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = get_klapic_reg(&klapic_actual, APIC_LVT0);
        let lint1_mode_actual = get_klapic_reg(&klapic_actual, APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should
        // remove it at all.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::msr_index;
        use hypervisor::x86_64::{MsrEntries, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = MsrEntries::from_entries(&[MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }])
        .unwrap();

        // get_msrs returns the number of msrs that it succeeded in reading. We only want to read 1
        // in this test case scenario.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were setup when we did setup_msrs. We need to assert that the
        // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
        // expect.
        let entry_vec = hypervisor::x86_64::boot_msr_entries();
        assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
    }

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let expected_regs: StandardRegisters = StandardRegisters {
            rflags: 0x0000000000000002u64,
            rbx: arch::layout::PVH_INFO_START.0,
            rip: 1,
            ..Default::default()
        };

        setup_regs(&vcpu, expected_regs.rip).unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use crate::GuestMemoryMmap;
    use arch::aarch64::layout;
    use arch::aarch64::regs::*;
    use hypervisor::kvm::aarch64::{is_system_register, MPIDR_EL1};
    use hypervisor::kvm::kvm_bindings::{
        kvm_one_reg, kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
        KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
    };
    use hypervisor::{arm64_core_reg_id, offset__of};
    use std::mem;
    use vm_memory::GuestAddress;

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let regions = vec![(
            GuestAddress(layout::RAM_64BIT_START),
            (layout::FDT_MAX_SIZE + 0x1000) as usize,
        )];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        let res = setup_regs(&vcpu, 0, 0x0, &mem);
        // Must fail when vcpu is not initialized yet.
        assert!(res.is_err());

        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();
        vcpu.vcpu_init(&kvi).unwrap();

        assert!(setup_regs(&vcpu, 0, 0x0, &mem).is_ok());
    }

    #[test]
    fn test_read_mpidr() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        assert!(vcpu.read_mpidr().is_err());

        vcpu.vcpu_init(&kvi).unwrap();
        assert_eq!(vcpu.read_mpidr().unwrap(), 0x80000000);
    }

    #[test]
    fn test_is_system_register() {
        let offset = offset__of!(user_pt_regs, pc);
        let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
        assert!(!is_system_register(regid));
        let regid = KVM_REG_ARM64 as u64 | KVM_REG_SIZE_U64 as u64 | KVM_REG_ARM64_SYSREG as u64;
        assert!(is_system_register(regid));
    }

    #[test]
    fn test_save_restore_core_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        let mut state = kvm_regs::default();
        let res = vcpu.core_registers(&mut state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to get core register: Exec format error (os error 8)"
        );

        let res = vcpu.set_core_registers(&state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to set core register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        assert!(vcpu.core_registers(&mut state).is_ok());
        assert_eq!(state.regs.pstate, 0x3C5);

        assert!(vcpu.set_core_registers(&state).is_ok());
        let off = offset__of!(user_pt_regs, pstate);
        let pstate = vcpu
            .get_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .expect("Failed to call kvm get one reg");
        assert_eq!(state.regs.pstate, pstate);
    }

    #[test]
    fn test_save_restore_system_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        let mut state: Vec<kvm_one_reg> = Vec::new();
        let res = vcpu.system_registers(&mut state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to retrieve list of registers: Exec format error (os error 8)"
        );

        state.push(kvm_one_reg {
            id: MPIDR_EL1,
            addr: 0x00,
        });
        let res = vcpu.set_system_registers(&state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to set system register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        assert!(vcpu.system_registers(&mut state).is_ok());
        let initial_mpidr: u64 = vcpu.read_mpidr().expect("Fail to read mpidr");
        assert!(state.contains(&kvm_one_reg {
            id: MPIDR_EL1,
            addr: initial_mpidr
        }));

        assert!(vcpu.set_system_registers(&state).is_ok());
        let mpidr: u64 = vcpu.read_mpidr().expect("Fail to read mpidr");
        assert_eq!(initial_mpidr, mpidr);
    }

    #[test]
    fn test_get_set_mpstate() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        let res = vcpu.get_mp_state();
        assert!(res.is_ok());
        assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
    }
}