// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use crate::config::CpusConfig;
use crate::device_manager::DeviceManager;
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
#[cfg(feature = "acpi")]
use crate::vm::NumaNodes;
use crate::GuestMemoryMmap;
use crate::CPU_MANAGER_SNAPSHOT_ID;
#[cfg(feature = "acpi")]
use acpi_tables::{aml, aml::Aml, sdt::Sdt};
use anyhow::anyhow;
use arch::EntryPoint;
use devices::interrupt_controller::InterruptController;
#[cfg(target_arch = "aarch64")]
use hypervisor::kvm::kvm_bindings;
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuId;
use hypervisor::{vm::VmmOps, CpuState, HypervisorCpuError, VmExit};
use libc::{c_void, siginfo_t};
use seccomp::{SeccompAction, SeccompFilter};
#[cfg(feature = "acpi")]
use std::collections::BTreeMap;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};
use vm_device::BusDevice;
#[cfg(feature = "acpi")]
use vm_memory::GuestAddress;
use vm_memory::GuestMemoryAtomic;
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};

#[cfg(feature = "acpi")]
pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
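
// Note: CPU_MANAGER_ACPI_SIZE is 0xc (12) bytes, which matches the PRST field
// layout declared in the `Aml` impl for `CpuManager` below: a 32-bit CPU
// selector (CSEL), one byte of status/control flags, a command byte (CCMD),
// two reserved bytes, and a 32-bit data register (CDAT).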

#[derive(Debug)]
pub enum Error {
    /// Cannot create the vCPU.
    VcpuCreate(anyhow::Error),

    /// Cannot run the VCPUs.
    VcpuRun(anyhow::Error),

    /// Cannot spawn a new vCPU thread.
    VcpuSpawn(io::Error),

    /// Cannot generate common CPUID
    CommonCpuId(arch::Error),

    /// Error configuring VCPU
    VcpuConfiguration(arch::Error),

    #[cfg(target_arch = "aarch64")]
    /// Error fetching preferred target
    VcpuArmPreferredTarget(hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    /// Error doing vCPU init on Arm.
    VcpuArmInit(hypervisor::HypervisorCpuError),

    /// Failed to join on vCPU threads
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    /// Cannot add legacy device to Bus.
    BusError(vm_device::BusError),

    /// Asking for more vCPUs than we can have
    DesiredVCpuCountExceedsMax,

    /// Cannot create seccomp filter
    CreateSeccompFilter(seccomp::SeccompError),

    /// Cannot apply seccomp filter
    ApplySeccompFilter(seccomp::Error),

    /// Error starting vCPU after restore
    StartRestoreVcpu(anyhow::Error),

    /// Error because an unexpected VmExit type was received.
    UnexpectedVmExit,

    /// Failed to allocate MMIO address
    AllocateMmioAddress,

    #[cfg(feature = "tdx")]
    InitializeTdx(hypervisor::HypervisorCpuError),
}
pub type Result<T> = result::Result<T, Error>;

#[cfg(all(target_arch = "x86_64", feature = "acpi"))]
#[allow(dead_code)]
#[repr(packed)]
struct LocalApic {
    pub r#type: u8,
    pub length: u8,
    pub processor_id: u8,
    pub apic_id: u8,
    pub flags: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default)]
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
    pub ioapic_id: u8,
    _reserved: u8,
    pub apic_address: u32,
    pub gsi_base: u32,
}

#[cfg(all(target_arch = "aarch64", feature = "acpi"))]
#[allow(dead_code)]
#[repr(packed)]
struct GicC {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub cpu_interface_number: u32,
    pub uid: u32,
    pub flags: u32,
    pub parking_version: u32,
    pub performance_interrupt: u32,
    pub parked_address: u64,
    pub base_address: u64,
    pub gicv_base_address: u64,
    pub gich_base_address: u64,
    pub vgic_interrupt: u32,
    pub gicr_base_address: u64,
    pub mpidr: u64,
    pub proc_power_effi_class: u8,
    pub reserved1: u8,
    pub spe_overflow_interrupt: u16,
}

#[cfg(all(target_arch = "aarch64", feature = "acpi"))]
#[allow(dead_code)]
#[repr(packed)]
struct GicD {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub gic_id: u32,
    pub base_address: u64,
    pub global_irq_base: u32,
    pub version: u8,
    pub reserved1: [u8; 3],
}

#[cfg(all(target_arch = "aarch64", feature = "acpi"))]
#[allow(dead_code)]
#[repr(packed)]
struct GicR {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub base_address: u64,
    pub range_length: u32,
}

#[cfg(all(target_arch = "aarch64", feature = "acpi"))]
#[allow(dead_code)]
#[repr(packed)]
struct GicIts {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub translation_id: u32,
    pub base_address: u64,
    pub reserved1: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default)]
struct InterruptSourceOverride {
    pub r#type: u8,
    pub length: u8,
    pub bus: u8,
    pub source: u8,
    pub gsi: u32,
    pub flags: u16,
}

/// A wrapper around creating and using a kvm-based VCPU.
pub struct Vcpu {
    // The hypervisor abstracted CPU.
    vcpu: Arc<dyn hypervisor::Vcpu>,
    id: u8,
    #[cfg(target_arch = "aarch64")]
    mpidr: u64,
    saved_state: Option<CpuState>,
}

impl Vcpu {
    /// Constructs a new VCPU for `vm`.
    ///
    /// # Arguments
    ///
    /// * `id` - Represents the CPU number between [0, max vcpus).
    /// * `vm` - The virtual machine this vcpu will get attached to.
    /// * `vmmops` - Optional object for exit handling.
    pub fn new(
        id: u8,
        vm: &Arc<dyn hypervisor::Vm>,
        vmmops: Option<Arc<Box<dyn VmmOps>>>,
    ) -> Result<Arc<Mutex<Self>>> {
        let vcpu = vm
            .create_vcpu(id, vmmops)
            .map_err(|e| Error::VcpuCreate(e.into()))?;
        // Initially the cpuid per vCPU is the one supported by this VM.
        Ok(Arc::new(Mutex::new(Vcpu {
            vcpu,
            id,
            #[cfg(target_arch = "aarch64")]
            mpidr: 0,
            saved_state: None,
        })))
    }

    /// Configures a vCPU. Should be called once per vCPU, right after it is created.
    ///
    /// # Arguments
    ///
    /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
    /// * `vm_memory` - Guest memory.
    /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
    pub fn configure(
        &mut self,
        #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
        kernel_entry_point: Option<EntryPoint>,
        vm_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        #[cfg(target_arch = "x86_64")] cpuid: CpuId,
        #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
    ) -> Result<()> {
        #[cfg(target_arch = "aarch64")]
        {
            self.init(vm)?;
            self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, kernel_entry_point, vm_memory)
                .map_err(Error::VcpuConfiguration)?;
        }
        info!("Configuring vCPU: cpu_id = {}", self.id);
        #[cfg(target_arch = "x86_64")]
        arch::configure_vcpu(
            &self.vcpu,
            self.id,
            kernel_entry_point,
            vm_memory,
            cpuid,
            kvm_hyperv,
        )
        .map_err(Error::VcpuConfiguration)?;

        Ok(())
    }

    /// Gets the MPIDR register value.
    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidr(&self) -> u64 {
        self.mpidr
    }

    /// Gets the saved vCPU state.
    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_state(&self) -> Option<CpuState> {
        self.saved_state.clone()
    }

    /// Initializes an aarch64 specific vcpu for booting Linux.
    #[cfg(target_arch = "aarch64")]
    pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
        let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();

        // This reads back the kernel's preferred target type.
        vm.get_preferred_target(&mut kvi)
            .map_err(Error::VcpuArmPreferredTarget)?;
        // We already checked that the capability is supported.
        kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
        // Non-boot cpus are powered off initially.
        if self.id > 0 {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
        }
        self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
    }

    /// Runs the VCPU until it exits, returning the reason.
    ///
    /// Note that the state of the VCPU and associated VM must be setup first for this to do
    /// anything useful.
    pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
        self.vcpu.run()
    }
}
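
// Illustrative sketch (not part of the build): a typical x86_64 boot flow for
// a single vCPU, assuming `vm`, `entry_point`, `guest_memory` and `cpuid` have
// already been prepared by the caller:
//
//     let vcpu = Vcpu::new(0, &vm, None)?;
//     vcpu.lock()
//         .unwrap()
//         .configure(Some(entry_point), &guest_memory, cpuid, false)?;
//     let exit = vcpu.lock().unwrap().run()?;
//
// In this file the equivalent steps are driven by `CpuManager::create_vcpu()`
// and the per-vCPU thread spawned in `CpuManager::start_vcpu()`.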

const VCPU_SNAPSHOT_ID: &str = "vcpu";
impl Pausable for Vcpu {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        self.saved_state =
            Some(self.vcpu.state().map_err(|e| {
                MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e))
            })?);

        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        if let Some(vcpu_state) = &self.saved_state {
            self.vcpu.set_state(vcpu_state).map_err(|e| {
                MigratableError::Pause(anyhow!("Could not set the vCPU state {:?}", e))
            })?;
        }

        Ok(())
    }
}
impl Snapshottable for Vcpu {
    fn id(&self) -> String {
        VCPU_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vcpu_snapshot = Snapshot::new(&format!("{}", self.id));
        vcpu_snapshot.add_data_section(SnapshotDataSection::new_from_state(
            VCPU_SNAPSHOT_ID,
            &self.saved_state,
        )?);

        Ok(vcpu_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        self.saved_state = Some(snapshot.to_state(VCPU_SNAPSHOT_ID)?);
        Ok(())
    }
}

pub struct CpuManager {
    config: CpusConfig,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm_memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    cpuid: CpuId,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm: Arc<dyn hypervisor::Vm>,
    vcpus_kill_signalled: Arc<AtomicBool>,
    vcpus_pause_signalled: Arc<AtomicBool>,
    exit_evt: EventFd,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    reset_evt: EventFd,
    vcpu_states: Vec<VcpuState>,
    selected_cpu: u8,
    vcpus: Vec<Arc<Mutex<Vcpu>>>,
    seccomp_action: SeccompAction,
    vmmops: Arc<Box<dyn VmmOps>>,
    #[cfg(feature = "acpi")]
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    acpi_address: GuestAddress,
    #[cfg(feature = "acpi")]
    proximity_domain_per_cpu: BTreeMap<u8, u32>,
}

const CPU_ENABLE_FLAG: usize = 0;
const CPU_INSERTING_FLAG: usize = 1;
const CPU_REMOVING_FLAG: usize = 2;
const CPU_EJECT_FLAG: usize = 3;

const CPU_STATUS_OFFSET: u64 = 4;
const CPU_SELECTION_OFFSET: u64 = 0;

impl BusDevice for CpuManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
        data.copy_from_slice(&[0; 8][0..data.len()]);

        match offset {
            CPU_SELECTION_OFFSET => {
                data[0] = self.selected_cpu;
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.present_vcpus() {
                    let state = &self.vcpu_states[usize::from(self.selected_cpu)];
                    if state.active() {
                        data[0] |= 1 << CPU_ENABLE_FLAG;
                    }
                    if state.inserting {
                        data[0] |= 1 << CPU_INSERTING_FLAG;
                    }
                    if state.removing {
                        data[0] |= 1 << CPU_REMOVING_FLAG;
                    }
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            CPU_SELECTION_OFFSET => {
                self.selected_cpu = data[0];
            }
            CPU_STATUS_OFFSET => {
                let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
                // The ACPI code writes back a 1 to acknowledge the insertion
                if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
                    && state.inserting
                {
                    state.inserting = false;
                }
                // Ditto for removal
                if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) && state.removing
                {
                    state.removing = false;
                }
                // Trigger removal of vCPU
                if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
                    if let Err(e) = self.remove_vcpu(self.selected_cpu) {
                        error!("Error removing vCPU: {:?}", e);
                    }
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
        None
    }
}
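
// For reference, the guest-visible register map implemented above (offsets are
// relative to `acpi_address`, matching the PRST field layout emitted in the
// `Aml` impl for `CpuManager` further down):
//
//   offset 0 (CSEL, 32-bit): selects which vCPU the status register refers to
//   offset 4 (status byte):  bit 0 enabled (CPEN), bit 1 inserting (CINS),
//                            bit 2 removing (CRMV), bit 3 eject (CEJ0)
//
// Writing 1 to the inserting/removing bits acknowledges a pending hotplug
// event; writing 1 to the eject bit tears down the selected vCPU thread.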

#[derive(Default)]
struct VcpuState {
    inserting: bool,
    removing: bool,
    handle: Option<thread::JoinHandle<()>>,
    kill: Arc<AtomicBool>,
    vcpu_run_interrupted: Arc<AtomicBool>,
}

impl VcpuState {
    fn active(&self) -> bool {
        self.handle.is_some()
    }

    fn signal_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            loop {
                unsafe {
                    libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
                }
                if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
                    break;
                } else {
                    // This is more effective than thread::yield_now() at
                    // avoiding a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }
    }

    fn join_thread(&mut self) -> Result<()> {
        if let Some(handle) = self.handle.take() {
            handle.join().map_err(Error::ThreadCleanup)?
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}

impl CpuManager {
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        device_manager: &Arc<Mutex<DeviceManager>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vmmops: Arc<Box<dyn VmmOps>>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        #[cfg(feature = "acpi")] numa_nodes: &NumaNodes,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        let guest_memory = memory_manager.lock().unwrap().guest_memory();
        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);

        #[cfg(target_arch = "x86_64")]
        let sgx_epc_sections = memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
        #[cfg(target_arch = "x86_64")]
        let cpuid = {
            let phys_bits = physical_bits(
                config.max_phys_bits,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            );
            arch::generate_common_cpuid(
                hypervisor,
                config
                    .topology
                    .clone()
                    .map(|t| (t.threads_per_core, t.cores_per_die, t.dies_per_package)),
                sgx_epc_sections,
                phys_bits,
                config.kvm_hyperv,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CommonCpuId)?
        };

        let device_manager = device_manager.lock().unwrap();
        #[cfg(feature = "acpi")]
        let acpi_address = device_manager
            .allocator()
            .lock()
            .unwrap()
            .allocate_mmio_addresses(None, CPU_MANAGER_ACPI_SIZE as u64, None)
            .ok_or(Error::AllocateMmioAddress)?;

        #[cfg(feature = "acpi")]
        let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
            let mut cpu_list = Vec::new();
            for (proximity_domain, numa_node) in numa_nodes.iter() {
                for cpu in numa_node.cpus().iter() {
                    cpu_list.push((*cpu, *proximity_domain))
                }
            }
            cpu_list
        }
        .into_iter()
        .collect();

        let cpu_manager = Arc::new(Mutex::new(CpuManager {
            config: config.clone(),
            interrupt_controller: device_manager.interrupt_controller().clone(),
            vm_memory: guest_memory,
            #[cfg(target_arch = "x86_64")]
            cpuid,
            vm,
            vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
            vcpu_states,
            exit_evt,
            reset_evt,
            selected_cpu: 0,
            vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
            seccomp_action,
            vmmops,
            #[cfg(feature = "acpi")]
            acpi_address,
            #[cfg(feature = "acpi")]
            proximity_domain_per_cpu,
        }));

        #[cfg(feature = "acpi")]
        device_manager
            .mmio_bus()
            .insert(
                cpu_manager.clone(),
                acpi_address.0,
                CPU_MANAGER_ACPI_SIZE as u64,
            )
            .map_err(Error::BusError)?;

        Ok(cpu_manager)
    }
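
    // Illustrative sketch (not part of the build): once a `CpuManager` exists,
    // the VM boot path drives it roughly as follows, assuming a previously
    // computed `entry_point`:
    //
    //     let cpu_manager = CpuManager::new(/* ... */)?;
    //     cpu_manager.lock().unwrap().create_boot_vcpus(Some(entry_point))?;
    //     cpu_manager.lock().unwrap().start_boot_vcpus()?;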

    fn create_vcpu(
        &mut self,
        cpu_id: u8,
        entry_point: Option<EntryPoint>,
        snapshot: Option<Snapshot>,
    ) -> Result<Arc<Mutex<Vcpu>>> {
        info!("Creating vCPU: cpu_id = {}", cpu_id);

        let vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vmmops.clone()))?;

        if let Some(snapshot) = snapshot {
            // AArch64 vCPUs should be initialized after creation.
            #[cfg(target_arch = "aarch64")]
            vcpu.lock().unwrap().init(&self.vm)?;

            vcpu.lock()
                .unwrap()
                .restore(snapshot)
                .expect("Failed to restore vCPU");
        } else {
            let vm_memory = self.vm_memory.clone();

            #[cfg(target_arch = "x86_64")]
            vcpu.lock()
                .unwrap()
                .configure(
                    entry_point,
                    &vm_memory,
                    self.cpuid.clone(),
                    self.config.kvm_hyperv,
                )
                .expect("Failed to configure vCPU");

            #[cfg(target_arch = "aarch64")]
            vcpu.lock()
                .unwrap()
                .configure(&self.vm, entry_point, &vm_memory)
                .expect("Failed to configure vCPU");
        }

        // Adding vCPU to the CpuManager's vCPU list.
        self.vcpus.push(Arc::clone(&vcpu));

        Ok(vcpu)
    }

    /// Only create new vCPUs if there aren't any inactive ones to reuse.
    fn create_vcpus(&mut self, desired_vcpus: u8, entry_point: Option<EntryPoint>) -> Result<()> {
        info!(
            "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.config.max_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        // Only create vCPUs in excess of all the allocated vCPUs.
        for cpu_id in self.vcpus.len() as u8..desired_vcpus {
            self.create_vcpu(cpu_id, entry_point, None)?;
        }

        Ok(())
    }

    fn start_vcpu(
        &mut self,
        vcpu: Arc<Mutex<Vcpu>>,
        vcpu_thread_barrier: Arc<Barrier>,
        inserting: bool,
    ) -> Result<()> {
        let cpu_id = vcpu.lock().unwrap().id;
        let reset_evt = self.reset_evt.try_clone().unwrap();
        let exit_evt = self.exit_evt.try_clone().unwrap();
        let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
        let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();

        let vcpu_kill = self.vcpu_states[usize::from(cpu_id)].kill.clone();
        let vcpu_run_interrupted = self.vcpu_states[usize::from(cpu_id)]
            .vcpu_run_interrupted
            .clone();

        info!("Starting vCPU: cpu_id = {}", cpu_id);

        // Retrieve seccomp filter for vcpu thread
        let vcpu_seccomp_filter = get_seccomp_filter(&self.seccomp_action, Thread::Vcpu)
            .map_err(Error::CreateSeccompFilter)?;

        #[cfg(target_arch = "x86_64")]
        let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();

        let handle = Some(
            thread::Builder::new()
                .name(format!("vcpu{}", cpu_id))
                .spawn(move || {
                    // Apply seccomp filter for vcpu thread.
                    if let Err(e) =
                        SeccompFilter::apply(vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
                    {
                        error!("Error applying seccomp filter: {:?}", e);
                        return;
                    }

                    extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
                    // This uses an async signal safe handler to kill the vcpu handles.
                    register_signal_handler(SIGRTMIN(), handle_signal)
                        .expect("Failed to register vcpu signal handler");

                    // Block until all CPUs are ready.
                    vcpu_thread_barrier.wait();

                    loop {
                        // If we are being told to pause, we park the thread
                        // until the pause boolean is toggled.
                        // The resume operation is responsible for toggling
                        // the boolean and unparking the thread.
                        // We enter a loop because park() could spuriously
                        // return. We will then park() again unless the
                        // pause boolean has been toggled.

                        // Need to use Ordering::SeqCst as we have multiple
                        // loads and stores to different atomics and we need
                        // to see them in a consistent order in all threads

                        if vcpu_pause_signalled.load(Ordering::SeqCst) {
                            vcpu_run_interrupted.store(true, Ordering::SeqCst);
                            while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                thread::park();
                            }
                            vcpu_run_interrupted.store(false, Ordering::SeqCst);
                        }

                        // We've been told to terminate
                        if vcpu_kill_signalled.load(Ordering::SeqCst)
                            || vcpu_kill.load(Ordering::SeqCst)
                        {
                            vcpu_run_interrupted.store(true, Ordering::SeqCst);
                            break;
                        }

                        // vcpu.run() returns VmExit::Reset on a triple-fault,
                        // which we use to trigger a VM reset.
                        match vcpu.lock().unwrap().run() {
                            Ok(run) => match run {
                                #[cfg(target_arch = "x86_64")]
                                VmExit::IoapicEoi(vector) => {
                                    if let Some(interrupt_controller) = &interrupt_controller_clone
                                    {
                                        interrupt_controller
                                            .lock()
                                            .unwrap()
                                            .end_of_interrupt(vector);
                                    }
                                }
                                VmExit::Ignore => {}
                                VmExit::Hyperv => {}
                                VmExit::Reset => {
                                    debug!("VmExit::Reset");
                                    vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                    reset_evt.write(1).unwrap();
                                    break;
                                }
                                VmExit::Shutdown => {
                                    debug!("VmExit::Shutdown");
                                    vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                    exit_evt.write(1).unwrap();
                                    break;
                                }
                                _ => {
                                    error!("VCPU generated error: {:?}", Error::UnexpectedVmExit);
                                    break;
                                }
                            },

                            Err(e) => {
                                error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
                                break;
                            }
                        }

                        // We've been told to terminate
                        if vcpu_kill_signalled.load(Ordering::SeqCst)
                            || vcpu_kill.load(Ordering::SeqCst)
                        {
                            vcpu_run_interrupted.store(true, Ordering::SeqCst);
                            break;
                        }
                    }
                })
                .map_err(Error::VcpuSpawn)?,
        );

        // On hot plug calls into this function, entry_point is None. It is for
        // those hotplug CPU additions that we need to set the inserting flag.
        self.vcpu_states[usize::from(cpu_id)].handle = handle;
        self.vcpu_states[usize::from(cpu_id)].inserting = inserting;

        Ok(())
    }

    /// Start up as many vCPU threads as needed to reach `desired_vcpus`.
    fn activate_vcpus(&mut self, desired_vcpus: u8, inserting: bool) -> Result<()> {
        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        let vcpu_thread_barrier = Arc::new(Barrier::new(
            (desired_vcpus - self.present_vcpus() + 1) as usize,
        ));

        info!(
            "Starting vCPUs: desired = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        // This reuses any inactive vCPUs as well as any that were newly created
        for cpu_id in self.present_vcpus()..desired_vcpus {
            let vcpu = Arc::clone(&self.vcpus[cpu_id as usize]);
            self.start_vcpu(vcpu, vcpu_thread_barrier.clone(), inserting)?;
        }

        // Unblock all CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }
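
    // Note on the barrier arithmetic above: activate_vcpus() only spawns
    // threads for vCPUs in the range present_vcpus()..desired_vcpus, so the
    // barrier must count those new threads plus one for the calling thread,
    // i.e. (desired - present) + 1 waiters, before everyone is released.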

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal, actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
        }
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let mut state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" so that it can be reused
        state.kill.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(&mut self, entry_point: Option<EntryPoint>) -> Result<()> {
        self.create_vcpus(self.boot_vcpus(), entry_point)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
    pub fn start_boot_vcpus(&mut self) -> Result<()> {
        self.activate_vcpus(self.boot_vcpus(), false)
    }

    pub fn start_restored_vcpus(&mut self) -> Result<()> {
        let vcpu_numbers = self.vcpus.len();
        let vcpu_thread_barrier = Arc::new(Barrier::new((vcpu_numbers + 1) as usize));
        // Restore the vCPUs in "paused" state.
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        for vcpu_index in 0..vcpu_numbers {
            let vcpu = Arc::clone(&self.vcpus[vcpu_index as usize]);

            self.start_vcpu(vcpu, vcpu_thread_barrier.clone(), false)
                .map_err(|e| {
                    Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
                })?;
        }
        // Unblock all restored CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }

    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        match desired_vcpus.cmp(&self.present_vcpus()) {
            cmp::Ordering::Greater => {
                self.create_vcpus(desired_vcpus, None)?;
                self.activate_vcpus(desired_vcpus, true)?;
                Ok(true)
            }
            cmp::Ordering::Less => {
                self.mark_vcpus_for_removal(desired_vcpus);
                Ok(true)
            }
            _ => Ok(false),
        }
    }
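
    // Hotplug flow, for reference: resize(n) either creates and starts the
    // missing vCPUs with the `inserting` flag set (plug), or only marks the
    // excess ones as `removing` (unplug). The guest then notices the pending
    // event via the CINS/CRMV bits, acknowledges it through the MMIO device
    // above, and on removal finally writes the eject bit, which is what ends
    // the vCPU thread via remove_vcpu().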

    pub fn shutdown(&mut self) -> Result<()> {
        // Tell the vCPUs to stop themselves next time they go through the loop
        self.vcpus_kill_signalled.store(true, Ordering::SeqCst);

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        // Wait for all the threads to finish. This removes the state from the vector.
        for mut state in self.vcpu_states.drain(..) {
            state.join_thread()?;
        }

        Ok(())
    }

    #[cfg(feature = "tdx")]
    pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
        for vcpu in &self.vcpus {
            vcpu.lock()
                .unwrap()
                .vcpu
                .tdx_init(hob_address)
                .map_err(Error::InitializeTdx)?;
        }
        Ok(())
    }

    pub fn boot_vcpus(&self) -> u8 {
        self.config.boot_vcpus
    }

    pub fn max_vcpus(&self) -> u8 {
        self.config.max_vcpus
    }

    #[cfg(target_arch = "x86_64")]
    pub fn common_cpuid(&self) -> CpuId {
        self.cpuid.clone()
    }

    fn present_vcpus(&self) -> u8 {
        self.vcpu_states
            .iter()
            .fold(0, |acc, state| acc + state.active() as u8)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidrs(&self) -> Vec<u64> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_mpidr())
            .collect()
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_states(&self) -> Vec<CpuState> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
            .collect()
    }

    #[cfg(feature = "acpi")]
    pub fn create_madt(&self) -> Sdt {
        use crate::acpi;
        // This is also checked in the commandline parsing.
        assert!(self.config.boot_vcpus <= self.config.max_vcpus);

        let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
        #[cfg(target_arch = "x86_64")]
        {
            madt.write(36, arch::layout::APIC_START);

            for cpu in 0..self.config.max_vcpus {
                let lapic = LocalApic {
                    r#type: acpi::ACPI_APIC_PROCESSOR,
                    length: 8,
                    processor_id: cpu,
                    apic_id: cpu,
                    flags: if cpu < self.config.boot_vcpus {
                        1 << MADT_CPU_ENABLE_FLAG
                    } else {
                        0
                    },
                };
                madt.append(lapic);
            }

            madt.append(Ioapic {
                r#type: acpi::ACPI_APIC_IO,
                length: 12,
                ioapic_id: 0,
                apic_address: arch::layout::IOAPIC_START.0 as u32,
                gsi_base: 0,
                ..Default::default()
            });

            madt.append(InterruptSourceOverride {
                r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
                length: 10,
                bus: 0,
                source: 4,
                gsi: 4,
                flags: 0,
            });
        }

        #[cfg(target_arch = "aarch64")]
        {
            /* Notes:
             * Ignore the Local Interrupt Controller Address at byte offset 36 of the MADT table.
             */

            // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in the ACPI spec.
            for cpu in 0..self.config.boot_vcpus {
                let vcpu = &self.vcpus[cpu as usize];
                let mpidr = vcpu.lock().unwrap().get_mpidr();
                /* ARMv8 MPIDR format:
                   Bits [63:40] Must be zero
                   Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
                   Bits [31:24] Must be zero
                   Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
                   Bits [15:8]  Aff1 : Match Aff1 of target processor MPIDR
                   Bits [7:0]   Aff0 : Match Aff0 of target processor MPIDR
                */
                let mpidr_mask = 0xff_00ff_ffff;
                let gicc = GicC {
                    r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
                    length: 80,
                    reserved0: 0,
                    cpu_interface_number: cpu as u32,
                    uid: cpu as u32,
                    flags: 1,
                    parking_version: 0,
                    performance_interrupt: 0,
                    parked_address: 0,
                    base_address: 0,
                    gicv_base_address: 0,
                    gich_base_address: 0,
                    vgic_interrupt: 0,
                    gicr_base_address: 0,
                    mpidr: mpidr & mpidr_mask,
                    proc_power_effi_class: 0,
                    reserved1: 0,
                    spe_overflow_interrupt: 0,
                };

                madt.append(gicc);
            }
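
            // Guest physical layout of the GIC pieces appended below, working
            // down from arch::layout::MAPPED_IO_START (restating the address
            // arithmetic that follows):
            //
            //   GICD:    the 64KiB page just below MAPPED_IO_START
            //   GICR:    2 * 64KiB per boot vCPU, just below the GICD
            //   GIC ITS: 2 * 64KiB, just below the redistributors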
            // GIC Distributor structure. See section 5.2.12.15 in the ACPI spec.
            let gicd = GicD {
                r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
                length: 24,
                reserved0: 0,
                gic_id: 0,
                base_address: arch::layout::MAPPED_IO_START - 0x0001_0000,
                global_irq_base: 0,
                version: 3,
                reserved1: [0; 3],
            };
            madt.append(gicd);

            // See 5.2.12.17 GIC Redistributor (GICR) Structure in the ACPI spec.
            let gicr_size: u32 = 0x0001_0000 * 2 * (self.config.boot_vcpus as u32);
            let gicr_base: u64 = arch::layout::MAPPED_IO_START - 0x0001_0000 - gicr_size as u64;
            let gicr = GicR {
                r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
                length: 16,
                reserved: 0,
                base_address: gicr_base,
                range_length: gicr_size,
            };
            madt.append(gicr);

            // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in the ACPI spec.
            let gicits = GicIts {
                r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
                length: 20,
                reserved0: 0,
                translation_id: 0,
                base_address: gicr_base - 2 * 0x0001_0000,
                reserved1: 0,
            };
            madt.append(gicits);

            madt.update_checksum();
        }

        madt
    }
}

#[cfg(feature = "acpi")]
struct Cpu {
    cpu_id: u8,
    proximity_domain: u32,
}

#[cfg(all(target_arch = "x86_64", feature = "acpi"))]
const MADT_CPU_ENABLE_FLAG: usize = 0;

#[cfg(feature = "acpi")]
impl Cpu {
    #[cfg(target_arch = "x86_64")]
    fn generate_mat(&self) -> Vec<u8> {
        let lapic = LocalApic {
            r#type: 0,
            length: 8,
            processor_id: self.cpu_id,
            apic_id: self.cpu_id,
            flags: 1 << MADT_CPU_ENABLE_FLAG,
        };

        let mut mat_data: Vec<u8> = Vec::new();
        mat_data.resize(std::mem::size_of_val(&lapic), 0);
        unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic };

        mat_data
    }
}
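
// Note on generate_mat(): because LocalApic is #[repr(packed)], writing the
// struct through a raw pointer cast yields exactly the 8-byte MADT Local APIC
// entry the guest expects in the _MAT buffer, with no padding inserted.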

#[cfg(feature = "acpi")]
impl Aml for Cpu {
    fn to_aml_bytes(&self) -> Vec<u8> {
        #[cfg(target_arch = "x86_64")]
        let mat_data: Vec<u8> = self.generate_mat();

        aml::Device::new(
            format!("C{:03}", self.cpu_id).as_str().into(),
            vec![
                &aml::Name::new("_HID".into(), &"ACPI0007"),
                &aml::Name::new("_UID".into(), &self.cpu_id),
                // Currently, AArch64 cannot support the following fields.
                /*
                _STA return value:
                Bit [0] – Set if the device is present.
                Bit [1] – Set if the device is enabled and decoding its resources.
                Bit [2] – Set if the device should be shown in the UI.
                Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
                Bit [4] – Set if the battery is present.
                Bits [31:5] – Reserved (must be cleared).
                */
                #[cfg(target_arch = "x86_64")]
                &aml::Method::new(
                    "_STA".into(),
                    0,
                    false,
                    // Call into CSTA method which will interrogate device
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "CSTA".into(),
                        vec![&self.cpu_id],
                    ))],
                ),
                &aml::Method::new(
                    "_PXM".into(),
                    0,
                    false,
                    vec![&aml::Return::new(&self.proximity_domain)],
                ),
                // The Linux kernel expects every CPU device to have a _MAT entry
                // containing the LAPIC for this processor with the enabled bit set
                // even if it is disabled in the MADT (non-boot CPU)
                #[cfg(target_arch = "x86_64")]
                &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)),
                // Trigger CPU ejection
                #[cfg(target_arch = "x86_64")]
                &aml::Method::new(
                    "_EJ0".into(),
                    1,
                    false,
                    // Call into CEJ0 method which will actually eject device
                    vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
                ),
            ],
        )
        .to_aml_bytes()
    }
}

#[cfg(feature = "acpi")]
struct CpuNotify {
    cpu_id: u8,
}

#[cfg(feature = "acpi")]
impl Aml for CpuNotify {
    fn to_aml_bytes(&self) -> Vec<u8> {
        let object = aml::Path::new(&format!("C{:03}", self.cpu_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .to_aml_bytes()
    }
}

#[cfg(feature = "acpi")]
struct CpuMethods {
    max_vcpus: u8,
}

#[cfg(feature = "acpi")]
impl Aml for CpuMethods {
    fn to_aml_bytes(&self) -> Vec<u8> {
        let mut bytes = Vec::new();
        bytes.extend_from_slice(
            // CPU status method
            &aml::Method::new(
                "CSTA".into(),
                1,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to the selection register via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                    &aml::If::new(
                        &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
                        vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                    // Return 0 or 0xf
                    &aml::Return::new(&aml::Local(0)),
                ],
            )
            .to_aml_bytes(),
        );

        let mut cpu_notifies = Vec::new();
        for cpu_id in 0..self.max_vcpus {
            cpu_notifies.push(CpuNotify { cpu_id });
        }

        let mut cpu_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
        for cpu_id in 0..self.max_vcpus {
            cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
        }

        bytes.extend_from_slice(
            &aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(),
        );

        bytes.extend_from_slice(
            &aml::Method::new(
                "CEJ0".into(),
                1,
                true,
                vec![
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to the selection register via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    // Set CEJ0 bit
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(),
        );

        bytes.extend_from_slice(
            &aml::Method::new(
                "CSCN".into(),
                0,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    &aml::While::new(
                        &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
                        vec![
                            // Write CPU number (from the loop counter) to the selection register via field
                            &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
                            // Check if CINS bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
                                // Notify device if it is
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &aml::ONE],
                                    ),
                                    // Reset CINS bit
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CINS"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            // Check if CRMV bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
                                // Notify device if it is (with the eject constant 0x3)
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &3u8],
                                    ),
                                    // Reset CRMV bit
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CRMV"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                        ],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(),
        );
        bytes
    }
}
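
// For orientation, the CSCN method built above compiles to roughly the
// following ASL (a hand-written approximation, not generated output; MAXV
// stands for the inlined max_vcpus constant):
//
//     Method (CSCN, 0, Serialized) {
//         Acquire (\_SB_.PRES.CPLK, 0xFFFF)
//         Local0 = Zero
//         While (Local0 < MAXV) {
//             \_SB_.PRES.CSEL = Local0
//             If (\_SB_.PRES.CINS == One) {
//                 CTFY (Local0, One)       // 1 = device check (hotplug)
//                 \_SB_.PRES.CINS = One    // acknowledge insertion
//             }
//             If (\_SB_.PRES.CRMV == One) {
//                 CTFY (Local0, 0x03)      // 3 = eject request
//                 \_SB_.PRES.CRMV = One    // acknowledge removal
//             }
//             Local0 += One
//         }
//         Release (\_SB_.PRES.CPLK)
//     }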

#[cfg(feature = "acpi")]
impl Aml for CpuManager {
    fn to_aml_bytes(&self) -> Vec<u8> {
        let mut bytes = Vec::new();
        // CPU hotplug controller
        #[cfg(target_arch = "x86_64")]
        bytes.extend_from_slice(
            &aml::Device::new(
                "_SB_.PRES".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose CPU and then read back status
                    &aml::Mutex::new("CPLK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCachable::NotCacheable,
                            true,
                            self.acpi_address.0 as u64,
                            self.acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
                        )]),
                    ),
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "PRST".into(),
                        aml::OpRegionSpace::SystemMemory,
                        self.acpi_address.0 as usize,
                        CPU_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CPEN", 1),
                            aml::FieldEntry::Named(*b"CINS", 1),
                            aml::FieldEntry::Named(*b"CRMV", 1),
                            aml::FieldEntry::Named(*b"CEJ0", 1),
                            aml::FieldEntry::Reserved(4),
                            aml::FieldEntry::Named(*b"CCMD", 8),
                        ],
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"CSEL", 32),
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CDAT", 32),
                        ],
                    ),
                ],
            )
            .to_aml_bytes(),
        );

        // CPU devices
        let hid = aml::Name::new("_HID".into(), &"ACPI0010");
        let uid = aml::Name::new("_CID".into(), &aml::EisaName::new("PNP0A05"));
        // Bundle methods together under a common object
        let methods = CpuMethods {
            max_vcpus: self.config.max_vcpus,
        };
        let mut cpu_data_inner: Vec<&dyn aml::Aml> = vec![&hid, &uid, &methods];

        let mut cpu_devices = Vec::new();
        for cpu_id in 0..self.config.max_vcpus {
            let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
            let cpu_device = Cpu {
                cpu_id,
                proximity_domain,
            };

            cpu_devices.push(cpu_device);
        }

        for cpu_device in cpu_devices.iter() {
            cpu_data_inner.push(cpu_device);
        }

        bytes.extend_from_slice(
            &aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(),
        );
        bytes
    }
}

impl Pausable for CpuManager {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        // Tell the vCPUs to pause themselves next time they exit
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        for vcpu in self.vcpus.iter() {
            let mut vcpu = vcpu.lock().unwrap();
            vcpu.pause()?;
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            if !self.config.kvm_hyperv {
                vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
                    MigratableError::Pause(anyhow!(
                        "Could not notify guest it has been paused {:?}",
                        e
                    ))
                })?;
            }
        }

        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        for vcpu in self.vcpus.iter() {
            vcpu.lock().unwrap().resume()?;
        }

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        // Once unparked, the next thing they will do is check the pause
        // boolean. Since it'll be set to false, they will exit their pause loop
        // and go back to vmx root.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }
        Ok(())
    }
}
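
// Ordering matters in pause()/resume() above: the pause flag is set (resp.
// cleared) before the vCPU threads are signalled (resp. unparked), so a thread
// that wakes up always observes the flag value that matches the operation in
// progress. State save/restore happens while the threads sit in their park
// loop, i.e. outside KVM_RUN.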

impl Snapshottable for CpuManager {
    fn id(&self) -> String {
        CPU_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut cpu_manager_snapshot = Snapshot::new(CPU_MANAGER_SNAPSHOT_ID);

        // The CpuManager snapshot is a collection of all vCPUs snapshots.
        for vcpu in &self.vcpus {
            let cpu_snapshot = vcpu.lock().unwrap().snapshot()?;
            cpu_manager_snapshot.add_snapshot(cpu_snapshot);
        }

        Ok(cpu_manager_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        for (cpu_id, snapshot) in snapshot.snapshots.iter() {
            debug!("Restoring VCPU {}", cpu_id);
            self.create_vcpu(cpu_id.parse::<u8>().unwrap(), None, Some(*snapshot.clone()))
                .map_err(|e| MigratableError::Restore(anyhow!("Could not create vCPU {:?}", e)))?;
        }

        Ok(())
    }
}

impl Transportable for CpuManager {}
impl Migratable for CpuManager {}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use arch::x86_64::interrupts::*;
    use arch::x86_64::regs::*;
    use hypervisor::x86_64::{FpuState, LapicState, StandardRegisters};

    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail if there is no irqchip created beforehand.
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = get_klapic_reg(&klapic_before, APIC_LVT0);
        let lint1 = get_klapic_reg(&klapic_before, APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = get_klapic_reg(&klapic_actual, APIC_LVT0);
        let lint1_mode_actual = get_klapic_reg(&klapic_actual, APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should
        // remove it at all.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::msr_index;
        use hypervisor::x86_64::{MsrEntries, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = MsrEntries::from_entries(&[MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }])
        .unwrap();

        // get_msrs returns the number of msrs that it succeeded in reading. We only want to read 1
        // in this test case scenario.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we did setup_msrs. We need to assert that the
        // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
        // expect.
        let entry_vec = hypervisor::x86_64::boot_msr_entries();
        assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
    }

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let expected_regs: StandardRegisters = StandardRegisters {
            rflags: 0x0000000000000002u64,
            rbx: arch::layout::PVH_INFO_START.0,
            rip: 1,
            ..Default::default()
        };

        setup_regs(&vcpu, expected_regs.rip).unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use crate::GuestMemoryMmap;
    use arch::aarch64::layout;
    use arch::aarch64::regs::*;
    use hypervisor::kvm::aarch64::{is_system_register, MPIDR_EL1};
    use hypervisor::kvm::kvm_bindings::{
        kvm_one_reg, kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
        KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
    };
    use hypervisor::{arm64_core_reg_id, offset__of};
    use std::mem;
    use vm_memory::GuestAddress;

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let regions = vec![(
            GuestAddress(layout::RAM_64BIT_START),
            (layout::FDT_MAX_SIZE + 0x1000) as usize,
        )];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        let res = setup_regs(&vcpu, 0, 0x0, &mem);
        // Must fail when the vcpu is not initialized yet.
        assert!(res.is_err());

        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();
        vcpu.vcpu_init(&kvi).unwrap();

        assert!(setup_regs(&vcpu, 0, 0x0, &mem).is_ok());
    }

    #[test]
    fn test_read_mpidr() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when the vcpu is not initialized yet.
        assert!(vcpu.read_mpidr().is_err());

        vcpu.vcpu_init(&kvi).unwrap();
        assert_eq!(vcpu.read_mpidr().unwrap(), 0x80000000);
    }

    #[test]
    fn test_is_system_register() {
        let offset = offset__of!(user_pt_regs, pc);
        let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
        assert!(!is_system_register(regid));
        let regid = KVM_REG_ARM64 as u64 | KVM_REG_SIZE_U64 as u64 | KVM_REG_ARM64_SYSREG as u64;
        assert!(is_system_register(regid));
    }

    #[test]
    fn test_save_restore_core_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when the vcpu is not initialized yet.
        let mut state = kvm_regs::default();
        let res = vcpu.core_registers(&mut state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to get core register: Exec format error (os error 8)"
        );

        let res = vcpu.set_core_registers(&state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to set core register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        assert!(vcpu.core_registers(&mut state).is_ok());
        assert_eq!(state.regs.pstate, 0x3C5);

        assert!(vcpu.set_core_registers(&state).is_ok());
        let off = offset__of!(user_pt_regs, pstate);
        let pstate = vcpu
            .get_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .expect("Failed to call kvm get one reg");
        assert_eq!(state.regs.pstate, pstate);
    }

    #[test]
    fn test_save_restore_system_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when the vcpu is not initialized yet.
        let mut state: Vec<kvm_one_reg> = Vec::new();
        let res = vcpu.system_registers(&mut state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to retrieve list of registers: Exec format error (os error 8)"
        );

        state.push(kvm_one_reg {
            id: MPIDR_EL1,
            addr: 0x00,
        });
        let res = vcpu.set_system_registers(&state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to set system register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        assert!(vcpu.system_registers(&mut state).is_ok());
        let initial_mpidr: u64 = vcpu.read_mpidr().expect("Fail to read mpidr");
        assert!(state.contains(&kvm_one_reg {
            id: MPIDR_EL1,
            addr: initial_mpidr
        }));

        assert!(vcpu.set_system_registers(&state).is_ok());
        let mpidr: u64 = vcpu.read_mpidr().expect("Fail to read mpidr");
        assert_eq!(initial_mpidr, mpidr);
    }

    #[test]
    fn test_get_set_mpstate() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        let res = vcpu.get_mp_state();
        assert!(res.is_ok());
        assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
    }
}