1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::CpusConfig; 15 use crate::device_manager::DeviceManager; 16 use crate::memory_manager::MemoryManager; 17 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 18 #[cfg(target_arch = "x86_64")] 19 use crate::vm::physical_bits; 20 #[cfg(feature = "acpi")] 21 use crate::vm::NumaNodes; 22 use crate::GuestMemoryMmap; 23 use crate::CPU_MANAGER_SNAPSHOT_ID; 24 #[cfg(feature = "acpi")] 25 use acpi_tables::{aml, aml::Aml, sdt::Sdt}; 26 use anyhow::anyhow; 27 use arch::EntryPoint; 28 use devices::interrupt_controller::InterruptController; 29 #[cfg(target_arch = "aarch64")] 30 use hypervisor::kvm::kvm_bindings; 31 #[cfg(target_arch = "x86_64")] 32 use hypervisor::CpuId; 33 use hypervisor::{vm::VmmOps, CpuState, HypervisorCpuError, VmExit}; 34 use libc::{c_void, siginfo_t}; 35 use seccomp::{SeccompAction, SeccompFilter}; 36 #[cfg(feature = "acpi")] 37 use std::collections::BTreeMap; 38 use std::os::unix::thread::JoinHandleExt; 39 use std::sync::atomic::{AtomicBool, Ordering}; 40 use std::sync::{Arc, Barrier, Mutex}; 41 use std::{cmp, io, result, thread}; 42 use vm_device::BusDevice; 43 #[cfg(feature = "acpi")] 44 use vm_memory::GuestAddress; 45 use vm_memory::GuestMemoryAtomic; 46 use vm_migration::{ 47 Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable, 48 Transportable, 49 }; 50 use vmm_sys_util::eventfd::EventFd; 51 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN}; 52 53 #[cfg(feature = "acpi")] 54 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc; 55 56 #[derive(Debug)] 57 pub enum Error { 58 /// Cannot create the vCPU. 59 VcpuCreate(anyhow::Error), 60 61 /// Cannot run the VCPUs. 62 VcpuRun(anyhow::Error), 63 64 /// Cannot spawn a new vCPU thread. 65 VcpuSpawn(io::Error), 66 67 /// Cannot generate common CPUID 68 CommonCpuId(arch::Error), 69 70 /// Error configuring VCPU 71 VcpuConfiguration(arch::Error), 72 73 #[cfg(target_arch = "aarch64")] 74 /// Error fetching prefered target 75 VcpuArmPreferredTarget(hypervisor::HypervisorVmError), 76 77 #[cfg(target_arch = "aarch64")] 78 /// Error doing vCPU init on Arm. 79 VcpuArmInit(hypervisor::HypervisorCpuError), 80 81 /// Failed to join on vCPU threads 82 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 83 84 /// Cannot add legacy device to Bus. 85 BusError(vm_device::BusError), 86 87 /// Asking for more vCPUs that we can have 88 DesiredVCpuCountExceedsMax, 89 90 /// Cannot create seccomp filter 91 CreateSeccompFilter(seccomp::SeccompError), 92 93 /// Cannot apply seccomp filter 94 ApplySeccompFilter(seccomp::Error), 95 96 /// Error starting vCPU after restore 97 StartRestoreVcpu(anyhow::Error), 98 99 /// Error because an unexpected VmExit type was received. 100 UnexpectedVmExit, 101 102 /// Failed to allocate MMIO address 103 AllocateMmmioAddress, 104 105 #[cfg(feature = "tdx")] 106 InitializeTdx(hypervisor::HypervisorCpuError), 107 } 108 pub type Result<T> = result::Result<T, Error>; 109 110 #[cfg(all(target_arch = "x86_64", feature = "acpi"))] 111 #[allow(dead_code)] 112 #[repr(packed)] 113 struct LocalApic { 114 pub r#type: u8, 115 pub length: u8, 116 pub processor_id: u8, 117 pub apic_id: u8, 118 pub flags: u32, 119 } 120 121 #[allow(dead_code)] 122 #[repr(packed)] 123 #[derive(Default)] 124 struct Ioapic { 125 pub r#type: u8, 126 pub length: u8, 127 pub ioapic_id: u8, 128 _reserved: u8, 129 pub apic_address: u32, 130 pub gsi_base: u32, 131 } 132 133 #[cfg(all(target_arch = "aarch64", feature = "acpi"))] 134 #[allow(dead_code)] 135 #[repr(packed)] 136 struct GicC { 137 pub r#type: u8, 138 pub length: u8, 139 pub reserved0: u16, 140 pub cpu_interface_number: u32, 141 pub uid: u32, 142 pub flags: u32, 143 pub parking_version: u32, 144 pub performance_interrupt: u32, 145 pub parked_address: u64, 146 pub base_address: u64, 147 pub gicv_base_address: u64, 148 pub gich_base_address: u64, 149 pub vgic_interrupt: u32, 150 pub gicr_base_address: u64, 151 pub mpidr: u64, 152 pub proc_power_effi_class: u8, 153 pub reserved1: u8, 154 pub spe_overflow_interrupt: u16, 155 } 156 157 #[cfg(all(target_arch = "aarch64", feature = "acpi"))] 158 #[allow(dead_code)] 159 #[repr(packed)] 160 struct GicD { 161 pub r#type: u8, 162 pub length: u8, 163 pub reserved0: u16, 164 pub gic_id: u32, 165 pub base_address: u64, 166 pub global_irq_base: u32, 167 pub version: u8, 168 pub reserved1: [u8; 3], 169 } 170 171 #[cfg(all(target_arch = "aarch64", feature = "acpi"))] 172 #[allow(dead_code)] 173 #[repr(packed)] 174 struct GicR { 175 pub r#type: u8, 176 pub length: u8, 177 pub reserved: u16, 178 pub base_address: u64, 179 pub range_length: u32, 180 } 181 182 #[cfg(all(target_arch = "aarch64", feature = "acpi"))] 183 #[allow(dead_code)] 184 #[repr(packed)] 185 struct GicIts { 186 pub r#type: u8, 187 pub length: u8, 188 pub reserved0: u16, 189 pub translation_id: u32, 190 pub base_address: u64, 191 pub reserved1: u32, 192 } 193 194 #[cfg(all(target_arch = "aarch64", feature = "acpi"))] 195 #[allow(dead_code)] 196 #[repr(packed)] 197 struct ProcessorHierarchyNode { 198 pub r#type: u8, 199 pub length: u8, 200 pub reserved: u16, 201 pub flags: u32, 202 pub parent: u32, 203 pub acpi_processor_id: u32, 204 pub num_private_resources: u32, 205 } 206 207 #[allow(dead_code)] 208 #[repr(packed)] 209 #[derive(Default)] 210 struct InterruptSourceOverride { 211 pub r#type: u8, 212 pub length: u8, 213 pub bus: u8, 214 pub source: u8, 215 pub gsi: u32, 216 pub flags: u16, 217 } 218 219 /// A wrapper around creating and using a kvm-based VCPU. 220 pub struct Vcpu { 221 // The hypervisor abstracted CPU. 222 vcpu: Arc<dyn hypervisor::Vcpu>, 223 id: u8, 224 #[cfg(target_arch = "aarch64")] 225 mpidr: u64, 226 saved_state: Option<CpuState>, 227 } 228 229 impl Vcpu { 230 /// Constructs a new VCPU for `vm`. 231 /// 232 /// # Arguments 233 /// 234 /// * `id` - Represents the CPU number between [0, max vcpus). 235 /// * `vm` - The virtual machine this vcpu will get attached to. 236 /// * `vmmops` - Optional object for exit handling. 237 pub fn new( 238 id: u8, 239 vm: &Arc<dyn hypervisor::Vm>, 240 vmmops: Option<Arc<dyn VmmOps>>, 241 ) -> Result<Arc<Mutex<Self>>> { 242 let vcpu = vm 243 .create_vcpu(id, vmmops) 244 .map_err(|e| Error::VcpuCreate(e.into()))?; 245 // Initially the cpuid per vCPU is the one supported by this VM. 246 Ok(Arc::new(Mutex::new(Vcpu { 247 vcpu, 248 id, 249 #[cfg(target_arch = "aarch64")] 250 mpidr: 0, 251 saved_state: None, 252 }))) 253 } 254 255 /// Configures a vcpu and should be called once per vcpu when created. 256 /// 257 /// # Arguments 258 /// 259 /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. 260 /// * `vm_memory` - Guest memory. 261 /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. 262 pub fn configure( 263 &mut self, 264 #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>, 265 kernel_entry_point: Option<EntryPoint>, 266 vm_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 267 #[cfg(target_arch = "x86_64")] cpuid: CpuId, 268 #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, 269 ) -> Result<()> { 270 #[cfg(target_arch = "aarch64")] 271 { 272 self.init(vm)?; 273 self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, kernel_entry_point, vm_memory) 274 .map_err(Error::VcpuConfiguration)?; 275 } 276 info!("Configuring vCPU: cpu_id = {}", self.id); 277 #[cfg(target_arch = "x86_64")] 278 arch::configure_vcpu( 279 &self.vcpu, 280 self.id, 281 kernel_entry_point, 282 vm_memory, 283 cpuid, 284 kvm_hyperv, 285 ) 286 .map_err(Error::VcpuConfiguration)?; 287 288 Ok(()) 289 } 290 291 /// Gets the MPIDR register value. 292 #[cfg(target_arch = "aarch64")] 293 pub fn get_mpidr(&self) -> u64 { 294 self.mpidr 295 } 296 297 /// Gets the saved vCPU state. 298 #[cfg(target_arch = "aarch64")] 299 pub fn get_saved_state(&self) -> Option<CpuState> { 300 self.saved_state.clone() 301 } 302 303 /// Initializes an aarch64 specific vcpu for booting Linux. 304 #[cfg(target_arch = "aarch64")] 305 pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> { 306 let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); 307 308 // This reads back the kernel's preferred target type. 309 vm.get_preferred_target(&mut kvi) 310 .map_err(Error::VcpuArmPreferredTarget)?; 311 // We already checked that the capability is supported. 312 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; 313 // Non-boot cpus are powered off initially. 314 if self.id > 0 { 315 kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; 316 } 317 self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit) 318 } 319 320 /// Runs the VCPU until it exits, returning the reason. 321 /// 322 /// Note that the state of the VCPU and associated VM must be setup first for this to do 323 /// anything useful. 324 pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> { 325 self.vcpu.run() 326 } 327 } 328 329 const VCPU_SNAPSHOT_ID: &str = "vcpu"; 330 impl Pausable for Vcpu { 331 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 332 self.saved_state = 333 Some(self.vcpu.state().map_err(|e| { 334 MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)) 335 })?); 336 337 Ok(()) 338 } 339 340 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 341 if let Some(vcpu_state) = &self.saved_state { 342 self.vcpu.set_state(vcpu_state).map_err(|e| { 343 MigratableError::Pause(anyhow!("Could not set the vCPU state {:?}", e)) 344 })?; 345 } 346 347 Ok(()) 348 } 349 } 350 impl Snapshottable for Vcpu { 351 fn id(&self) -> String { 352 VCPU_SNAPSHOT_ID.to_string() 353 } 354 355 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 356 let mut vcpu_snapshot = Snapshot::new(&format!("{}", self.id)); 357 vcpu_snapshot.add_data_section(SnapshotDataSection::new_from_state( 358 VCPU_SNAPSHOT_ID, 359 &self.saved_state, 360 )?); 361 362 Ok(vcpu_snapshot) 363 } 364 365 fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> { 366 self.saved_state = Some(snapshot.to_state(VCPU_SNAPSHOT_ID)?); 367 Ok(()) 368 } 369 } 370 371 pub struct CpuManager { 372 config: CpusConfig, 373 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 374 interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>, 375 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 376 vm_memory: GuestMemoryAtomic<GuestMemoryMmap>, 377 #[cfg(target_arch = "x86_64")] 378 cpuid: CpuId, 379 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 380 vm: Arc<dyn hypervisor::Vm>, 381 vcpus_kill_signalled: Arc<AtomicBool>, 382 vcpus_pause_signalled: Arc<AtomicBool>, 383 exit_evt: EventFd, 384 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 385 reset_evt: EventFd, 386 vcpu_states: Vec<VcpuState>, 387 selected_cpu: u8, 388 vcpus: Vec<Arc<Mutex<Vcpu>>>, 389 seccomp_action: SeccompAction, 390 vmmops: Arc<dyn VmmOps>, 391 #[cfg(feature = "acpi")] 392 #[cfg_attr(target_arch = "aarch64", allow(dead_code))] 393 acpi_address: GuestAddress, 394 #[cfg(feature = "acpi")] 395 proximity_domain_per_cpu: BTreeMap<u8, u32>, 396 } 397 398 const CPU_ENABLE_FLAG: usize = 0; 399 const CPU_INSERTING_FLAG: usize = 1; 400 const CPU_REMOVING_FLAG: usize = 2; 401 const CPU_EJECT_FLAG: usize = 3; 402 403 const CPU_STATUS_OFFSET: u64 = 4; 404 const CPU_SELECTION_OFFSET: u64 = 0; 405 406 impl BusDevice for CpuManager { 407 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 408 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 409 data.copy_from_slice(&[0; 8][0..data.len()]); 410 411 match offset { 412 CPU_SELECTION_OFFSET => { 413 data[0] = self.selected_cpu; 414 } 415 CPU_STATUS_OFFSET => { 416 if self.selected_cpu < self.present_vcpus() { 417 let state = &self.vcpu_states[usize::from(self.selected_cpu)]; 418 if state.active() { 419 data[0] |= 1 << CPU_ENABLE_FLAG; 420 } 421 if state.inserting { 422 data[0] |= 1 << CPU_INSERTING_FLAG; 423 } 424 if state.removing { 425 data[0] |= 1 << CPU_REMOVING_FLAG; 426 } 427 } 428 } 429 _ => { 430 warn!( 431 "Unexpected offset for accessing CPU manager device: {:#}", 432 offset 433 ); 434 } 435 } 436 } 437 438 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 439 match offset { 440 CPU_SELECTION_OFFSET => { 441 self.selected_cpu = data[0]; 442 } 443 CPU_STATUS_OFFSET => { 444 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; 445 // The ACPI code writes back a 1 to acknowledge the insertion 446 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) 447 && state.inserting 448 { 449 state.inserting = false; 450 } 451 // Ditto for removal 452 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) && state.removing 453 { 454 state.removing = false; 455 } 456 // Trigger removal of vCPU 457 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { 458 if let Err(e) = self.remove_vcpu(self.selected_cpu) { 459 error!("Error removing vCPU: {:?}", e); 460 } 461 } 462 } 463 _ => { 464 warn!( 465 "Unexpected offset for accessing CPU manager device: {:#}", 466 offset 467 ); 468 } 469 } 470 None 471 } 472 } 473 474 #[derive(Default)] 475 struct VcpuState { 476 inserting: bool, 477 removing: bool, 478 handle: Option<thread::JoinHandle<()>>, 479 kill: Arc<AtomicBool>, 480 vcpu_run_interrupted: Arc<AtomicBool>, 481 } 482 483 impl VcpuState { 484 fn active(&self) -> bool { 485 self.handle.is_some() 486 } 487 488 fn signal_thread(&self) { 489 if let Some(handle) = self.handle.as_ref() { 490 loop { 491 unsafe { 492 libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); 493 } 494 if self.vcpu_run_interrupted.load(Ordering::SeqCst) { 495 break; 496 } else { 497 // This is more effective than thread::yield_now() at 498 // avoiding a priority inversion with the vCPU thread 499 thread::sleep(std::time::Duration::from_millis(1)); 500 } 501 } 502 } 503 } 504 505 fn join_thread(&mut self) -> Result<()> { 506 if let Some(handle) = self.handle.take() { 507 handle.join().map_err(Error::ThreadCleanup)? 508 } 509 510 Ok(()) 511 } 512 513 fn unpark_thread(&self) { 514 if let Some(handle) = self.handle.as_ref() { 515 handle.thread().unpark() 516 } 517 } 518 } 519 520 impl CpuManager { 521 #[allow(unused_variables)] 522 #[allow(clippy::too_many_arguments)] 523 pub fn new( 524 config: &CpusConfig, 525 device_manager: &Arc<Mutex<DeviceManager>>, 526 memory_manager: &Arc<Mutex<MemoryManager>>, 527 vm: Arc<dyn hypervisor::Vm>, 528 exit_evt: EventFd, 529 reset_evt: EventFd, 530 hypervisor: Arc<dyn hypervisor::Hypervisor>, 531 seccomp_action: SeccompAction, 532 vmmops: Arc<dyn VmmOps>, 533 #[cfg(feature = "tdx")] tdx_enabled: bool, 534 #[cfg(feature = "acpi")] numa_nodes: &NumaNodes, 535 ) -> Result<Arc<Mutex<CpuManager>>> { 536 let guest_memory = memory_manager.lock().unwrap().guest_memory(); 537 let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); 538 vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); 539 540 #[cfg(target_arch = "x86_64")] 541 let sgx_epc_sections = memory_manager 542 .lock() 543 .unwrap() 544 .sgx_epc_region() 545 .as_ref() 546 .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); 547 #[cfg(target_arch = "x86_64")] 548 let cpuid = { 549 let phys_bits = physical_bits( 550 config.max_phys_bits, 551 #[cfg(feature = "tdx")] 552 tdx_enabled, 553 ); 554 arch::generate_common_cpuid( 555 hypervisor, 556 config 557 .topology 558 .clone() 559 .map(|t| (t.threads_per_core, t.cores_per_die, t.dies_per_package)), 560 sgx_epc_sections, 561 phys_bits, 562 config.kvm_hyperv, 563 #[cfg(feature = "tdx")] 564 tdx_enabled, 565 ) 566 .map_err(Error::CommonCpuId)? 567 }; 568 569 let device_manager = device_manager.lock().unwrap(); 570 #[cfg(feature = "acpi")] 571 let acpi_address = device_manager 572 .allocator() 573 .lock() 574 .unwrap() 575 .allocate_mmio_addresses(None, CPU_MANAGER_ACPI_SIZE as u64, None) 576 .ok_or(Error::AllocateMmmioAddress)?; 577 578 #[cfg(feature = "acpi")] 579 let proximity_domain_per_cpu: BTreeMap<u8, u32> = { 580 let mut cpu_list = Vec::new(); 581 for (proximity_domain, numa_node) in numa_nodes.iter() { 582 for cpu in numa_node.cpus().iter() { 583 cpu_list.push((*cpu, *proximity_domain)) 584 } 585 } 586 cpu_list 587 } 588 .into_iter() 589 .collect(); 590 591 let cpu_manager = Arc::new(Mutex::new(CpuManager { 592 config: config.clone(), 593 interrupt_controller: device_manager.interrupt_controller().clone(), 594 vm_memory: guest_memory, 595 #[cfg(target_arch = "x86_64")] 596 cpuid, 597 vm, 598 vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), 599 vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), 600 vcpu_states, 601 exit_evt, 602 reset_evt, 603 selected_cpu: 0, 604 vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), 605 seccomp_action, 606 vmmops, 607 #[cfg(feature = "acpi")] 608 acpi_address, 609 #[cfg(feature = "acpi")] 610 proximity_domain_per_cpu, 611 })); 612 613 #[cfg(feature = "acpi")] 614 device_manager 615 .mmio_bus() 616 .insert( 617 cpu_manager.clone(), 618 acpi_address.0, 619 CPU_MANAGER_ACPI_SIZE as u64, 620 ) 621 .map_err(Error::BusError)?; 622 623 Ok(cpu_manager) 624 } 625 626 fn create_vcpu( 627 &mut self, 628 cpu_id: u8, 629 entry_point: Option<EntryPoint>, 630 snapshot: Option<Snapshot>, 631 ) -> Result<Arc<Mutex<Vcpu>>> { 632 info!("Creating vCPU: cpu_id = {}", cpu_id); 633 634 let vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vmmops.clone()))?; 635 636 if let Some(snapshot) = snapshot { 637 // AArch64 vCPUs should be initialized after created. 638 #[cfg(target_arch = "aarch64")] 639 vcpu.lock().unwrap().init(&self.vm)?; 640 641 vcpu.lock() 642 .unwrap() 643 .restore(snapshot) 644 .expect("Failed to restore vCPU"); 645 } else { 646 let vm_memory = self.vm_memory.clone(); 647 648 #[cfg(target_arch = "x86_64")] 649 vcpu.lock() 650 .unwrap() 651 .configure( 652 entry_point, 653 &vm_memory, 654 self.cpuid.clone(), 655 self.config.kvm_hyperv, 656 ) 657 .expect("Failed to configure vCPU"); 658 659 #[cfg(target_arch = "aarch64")] 660 vcpu.lock() 661 .unwrap() 662 .configure(&self.vm, entry_point, &vm_memory) 663 .expect("Failed to configure vCPU"); 664 } 665 666 // Adding vCPU to the CpuManager's vCPU list. 667 self.vcpus.push(Arc::clone(&vcpu)); 668 669 Ok(vcpu) 670 } 671 672 /// Only create new vCPUs if there aren't any inactive ones to reuse 673 fn create_vcpus(&mut self, desired_vcpus: u8, entry_point: Option<EntryPoint>) -> Result<()> { 674 info!( 675 "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", 676 desired_vcpus, 677 self.config.max_vcpus, 678 self.vcpus.len(), 679 self.present_vcpus() 680 ); 681 682 if desired_vcpus > self.config.max_vcpus { 683 return Err(Error::DesiredVCpuCountExceedsMax); 684 } 685 686 // Only create vCPUs in excess of all the allocated vCPUs. 687 for cpu_id in self.vcpus.len() as u8..desired_vcpus { 688 self.create_vcpu(cpu_id, entry_point, None)?; 689 } 690 691 Ok(()) 692 } 693 694 fn start_vcpu( 695 &mut self, 696 vcpu: Arc<Mutex<Vcpu>>, 697 vcpu_thread_barrier: Arc<Barrier>, 698 inserting: bool, 699 ) -> Result<()> { 700 let cpu_id = vcpu.lock().unwrap().id; 701 let reset_evt = self.reset_evt.try_clone().unwrap(); 702 let exit_evt = self.exit_evt.try_clone().unwrap(); 703 let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); 704 let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); 705 706 let vcpu_kill = self.vcpu_states[usize::from(cpu_id)].kill.clone(); 707 let vcpu_run_interrupted = self.vcpu_states[usize::from(cpu_id)] 708 .vcpu_run_interrupted 709 .clone(); 710 711 info!("Starting vCPU: cpu_id = {}", cpu_id); 712 713 // Retrieve seccomp filter for vcpu thread 714 let vcpu_seccomp_filter = get_seccomp_filter(&self.seccomp_action, Thread::Vcpu) 715 .map_err(Error::CreateSeccompFilter)?; 716 717 #[cfg(target_arch = "x86_64")] 718 let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); 719 720 let handle = Some( 721 thread::Builder::new() 722 .name(format!("vcpu{}", cpu_id)) 723 .spawn(move || { 724 // Apply seccomp filter for vcpu thread. 725 if let Err(e) = 726 SeccompFilter::apply(vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) 727 { 728 error!("Error applying seccomp filter: {:?}", e); 729 return; 730 } 731 732 extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} 733 // This uses an async signal safe handler to kill the vcpu handles. 734 register_signal_handler(SIGRTMIN(), handle_signal) 735 .expect("Failed to register vcpu signal handler"); 736 737 // Block until all CPUs are ready. 738 vcpu_thread_barrier.wait(); 739 740 loop { 741 // If we are being told to pause, we park the thread 742 // until the pause boolean is toggled. 743 // The resume operation is responsible for toggling 744 // the boolean and unpark the thread. 745 // We enter a loop because park() could spuriously 746 // return. We will then park() again unless the 747 // pause boolean has been toggled. 748 749 // Need to use Ordering::SeqCst as we have multiple 750 // loads and stores to different atomics and we need 751 // to see them in a consistent order in all threads 752 753 if vcpu_pause_signalled.load(Ordering::SeqCst) { 754 vcpu_run_interrupted.store(true, Ordering::SeqCst); 755 while vcpu_pause_signalled.load(Ordering::SeqCst) { 756 thread::park(); 757 } 758 vcpu_run_interrupted.store(false, Ordering::SeqCst); 759 } 760 761 // We've been told to terminate 762 if vcpu_kill_signalled.load(Ordering::SeqCst) 763 || vcpu_kill.load(Ordering::SeqCst) 764 { 765 vcpu_run_interrupted.store(true, Ordering::SeqCst); 766 break; 767 } 768 769 // vcpu.run() returns false on a triple-fault so trigger a reset 770 match vcpu.lock().unwrap().run() { 771 Ok(run) => match run { 772 #[cfg(target_arch = "x86_64")] 773 VmExit::IoapicEoi(vector) => { 774 if let Some(interrupt_controller) = &interrupt_controller_clone 775 { 776 interrupt_controller 777 .lock() 778 .unwrap() 779 .end_of_interrupt(vector); 780 } 781 } 782 VmExit::Ignore => {} 783 VmExit::Hyperv => {} 784 VmExit::Reset => { 785 debug!("VmExit::Reset"); 786 vcpu_run_interrupted.store(true, Ordering::SeqCst); 787 reset_evt.write(1).unwrap(); 788 break; 789 } 790 VmExit::Shutdown => { 791 debug!("VmExit::Shutdown"); 792 vcpu_run_interrupted.store(true, Ordering::SeqCst); 793 exit_evt.write(1).unwrap(); 794 break; 795 } 796 _ => { 797 error!("VCPU generated error: {:?}", Error::UnexpectedVmExit); 798 break; 799 } 800 }, 801 802 Err(e) => { 803 error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); 804 break; 805 } 806 } 807 808 // We've been told to terminate 809 if vcpu_kill_signalled.load(Ordering::SeqCst) 810 || vcpu_kill.load(Ordering::SeqCst) 811 { 812 vcpu_run_interrupted.store(true, Ordering::SeqCst); 813 break; 814 } 815 } 816 }) 817 .map_err(Error::VcpuSpawn)?, 818 ); 819 820 // On hot plug calls into this function entry_point is None. It is for 821 // those hotplug CPU additions that we need to set the inserting flag. 822 self.vcpu_states[usize::from(cpu_id)].handle = handle; 823 self.vcpu_states[usize::from(cpu_id)].inserting = inserting; 824 825 Ok(()) 826 } 827 828 /// Start up as many vCPUs threads as needed to reach `desired_vcpus` 829 fn activate_vcpus(&mut self, desired_vcpus: u8, inserting: bool) -> Result<()> { 830 if desired_vcpus > self.config.max_vcpus { 831 return Err(Error::DesiredVCpuCountExceedsMax); 832 } 833 834 let vcpu_thread_barrier = Arc::new(Barrier::new( 835 (desired_vcpus - self.present_vcpus() + 1) as usize, 836 )); 837 838 info!( 839 "Starting vCPUs: desired = {}, allocated = {}, present = {}", 840 desired_vcpus, 841 self.vcpus.len(), 842 self.present_vcpus() 843 ); 844 845 // This reuses any inactive vCPUs as well as any that were newly created 846 for cpu_id in self.present_vcpus()..desired_vcpus { 847 let vcpu = Arc::clone(&self.vcpus[cpu_id as usize]); 848 self.start_vcpu(vcpu, vcpu_thread_barrier.clone(), inserting)?; 849 } 850 851 // Unblock all CPU threads. 852 vcpu_thread_barrier.wait(); 853 Ok(()) 854 } 855 856 fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) { 857 // Mark vCPUs for removal, actual removal happens on ejection 858 for cpu_id in desired_vcpus..self.present_vcpus() { 859 self.vcpu_states[usize::from(cpu_id)].removing = true; 860 } 861 } 862 863 fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> { 864 info!("Removing vCPU: cpu_id = {}", cpu_id); 865 let mut state = &mut self.vcpu_states[usize::from(cpu_id)]; 866 state.kill.store(true, Ordering::SeqCst); 867 state.signal_thread(); 868 state.join_thread()?; 869 state.handle = None; 870 871 // Once the thread has exited, clear the "kill" so that it can reused 872 state.kill.store(false, Ordering::SeqCst); 873 874 Ok(()) 875 } 876 877 pub fn create_boot_vcpus(&mut self, entry_point: Option<EntryPoint>) -> Result<()> { 878 self.create_vcpus(self.boot_vcpus(), entry_point) 879 } 880 881 // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running. 882 pub fn start_boot_vcpus(&mut self) -> Result<()> { 883 self.activate_vcpus(self.boot_vcpus(), false) 884 } 885 886 pub fn start_restored_vcpus(&mut self) -> Result<()> { 887 let vcpu_numbers = self.vcpus.len(); 888 let vcpu_thread_barrier = Arc::new(Barrier::new((vcpu_numbers + 1) as usize)); 889 // Restore the vCPUs in "paused" state. 890 self.vcpus_pause_signalled.store(true, Ordering::SeqCst); 891 892 for vcpu_index in 0..vcpu_numbers { 893 let vcpu = Arc::clone(&self.vcpus[vcpu_index as usize]); 894 895 self.start_vcpu(vcpu, vcpu_thread_barrier.clone(), false) 896 .map_err(|e| { 897 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) 898 })?; 899 } 900 // Unblock all restored CPU threads. 901 vcpu_thread_barrier.wait(); 902 Ok(()) 903 } 904 905 pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> { 906 match desired_vcpus.cmp(&self.present_vcpus()) { 907 cmp::Ordering::Greater => { 908 self.create_vcpus(desired_vcpus, None)?; 909 self.activate_vcpus(desired_vcpus, true)?; 910 Ok(true) 911 } 912 cmp::Ordering::Less => { 913 self.mark_vcpus_for_removal(desired_vcpus); 914 Ok(true) 915 } 916 _ => Ok(false), 917 } 918 } 919 920 pub fn shutdown(&mut self) -> Result<()> { 921 // Tell the vCPUs to stop themselves next time they go through the loop 922 self.vcpus_kill_signalled.store(true, Ordering::SeqCst); 923 924 // Toggle the vCPUs pause boolean 925 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 926 927 // Unpark all the VCPU threads. 928 for state in self.vcpu_states.iter() { 929 state.unpark_thread(); 930 } 931 932 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 933 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 934 // above. 935 for state in self.vcpu_states.iter() { 936 state.signal_thread(); 937 } 938 939 // Wait for all the threads to finish. This removes the state from the vector. 940 for mut state in self.vcpu_states.drain(..) { 941 state.join_thread()?; 942 } 943 944 Ok(()) 945 } 946 947 #[cfg(feature = "tdx")] 948 pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { 949 for vcpu in &self.vcpus { 950 vcpu.lock() 951 .unwrap() 952 .vcpu 953 .tdx_init(hob_address) 954 .map_err(Error::InitializeTdx)?; 955 } 956 Ok(()) 957 } 958 959 pub fn boot_vcpus(&self) -> u8 { 960 self.config.boot_vcpus 961 } 962 963 pub fn max_vcpus(&self) -> u8 { 964 self.config.max_vcpus 965 } 966 967 #[cfg(target_arch = "x86_64")] 968 pub fn common_cpuid(&self) -> CpuId { 969 self.cpuid.clone() 970 } 971 972 fn present_vcpus(&self) -> u8 { 973 self.vcpu_states 974 .iter() 975 .fold(0, |acc, state| acc + state.active() as u8) 976 } 977 978 #[cfg(target_arch = "aarch64")] 979 pub fn get_mpidrs(&self) -> Vec<u64> { 980 self.vcpus 981 .iter() 982 .map(|cpu| cpu.lock().unwrap().get_mpidr()) 983 .collect() 984 } 985 986 #[cfg(target_arch = "aarch64")] 987 pub fn get_saved_states(&self) -> Vec<CpuState> { 988 self.vcpus 989 .iter() 990 .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) 991 .collect() 992 } 993 994 #[cfg(target_arch = "aarch64")] 995 pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { 996 self.config 997 .topology 998 .clone() 999 .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) 1000 } 1001 1002 #[cfg(feature = "acpi")] 1003 pub fn create_madt(&self) -> Sdt { 1004 use crate::acpi; 1005 // This is also checked in the commandline parsing. 1006 assert!(self.config.boot_vcpus <= self.config.max_vcpus); 1007 1008 let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); 1009 #[cfg(target_arch = "x86_64")] 1010 { 1011 madt.write(36, arch::layout::APIC_START); 1012 1013 for cpu in 0..self.config.max_vcpus { 1014 let lapic = LocalApic { 1015 r#type: acpi::ACPI_APIC_PROCESSOR, 1016 length: 8, 1017 processor_id: cpu, 1018 apic_id: cpu, 1019 flags: if cpu < self.config.boot_vcpus { 1020 1 << MADT_CPU_ENABLE_FLAG 1021 } else { 1022 0 1023 }, 1024 }; 1025 madt.append(lapic); 1026 } 1027 1028 madt.append(Ioapic { 1029 r#type: acpi::ACPI_APIC_IO, 1030 length: 12, 1031 ioapic_id: 0, 1032 apic_address: arch::layout::IOAPIC_START.0 as u32, 1033 gsi_base: 0, 1034 ..Default::default() 1035 }); 1036 1037 madt.append(InterruptSourceOverride { 1038 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, 1039 length: 10, 1040 bus: 0, 1041 source: 4, 1042 gsi: 4, 1043 flags: 0, 1044 }); 1045 } 1046 1047 #[cfg(target_arch = "aarch64")] 1048 { 1049 /* Notes: 1050 * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. 1051 */ 1052 1053 // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. 1054 for cpu in 0..self.config.boot_vcpus { 1055 let vcpu = &self.vcpus[cpu as usize]; 1056 let mpidr = vcpu.lock().unwrap().get_mpidr(); 1057 /* ARMv8 MPIDR format: 1058 Bits [63:40] Must be zero 1059 Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR 1060 Bits [31:24] Must be zero 1061 Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR 1062 Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR 1063 Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR 1064 */ 1065 let mpidr_mask = 0xff_00ff_ffff; 1066 let gicc = GicC { 1067 r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, 1068 length: 80, 1069 reserved0: 0, 1070 cpu_interface_number: cpu as u32, 1071 uid: cpu as u32, 1072 flags: 1, 1073 parking_version: 0, 1074 performance_interrupt: 0, 1075 parked_address: 0, 1076 base_address: 0, 1077 gicv_base_address: 0, 1078 gich_base_address: 0, 1079 vgic_interrupt: 0, 1080 gicr_base_address: 0, 1081 mpidr: mpidr & mpidr_mask, 1082 proc_power_effi_class: 0, 1083 reserved1: 0, 1084 spe_overflow_interrupt: 0, 1085 }; 1086 1087 madt.append(gicc); 1088 } 1089 1090 // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. 1091 let gicd = GicD { 1092 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, 1093 length: 24, 1094 reserved0: 0, 1095 gic_id: 0, 1096 base_address: arch::layout::MAPPED_IO_START - 0x0001_0000, 1097 global_irq_base: 0, 1098 version: 3, 1099 reserved1: [0; 3], 1100 }; 1101 madt.append(gicd); 1102 1103 // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. 1104 let gicr_size: u32 = 0x0001_0000 * 2 * (self.config.boot_vcpus as u32); 1105 let gicr_base: u64 = arch::layout::MAPPED_IO_START - 0x0001_0000 - gicr_size as u64; 1106 let gicr = GicR { 1107 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, 1108 length: 16, 1109 reserved: 0, 1110 base_address: gicr_base, 1111 range_length: gicr_size, 1112 }; 1113 madt.append(gicr); 1114 1115 // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. 1116 let gicits = GicIts { 1117 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, 1118 length: 20, 1119 reserved0: 0, 1120 translation_id: 0, 1121 base_address: gicr_base - 2 * 0x0001_0000, 1122 reserved1: 0, 1123 }; 1124 madt.append(gicits); 1125 1126 madt.update_checksum(); 1127 } 1128 1129 madt 1130 } 1131 1132 #[cfg(all(target_arch = "aarch64", feature = "acpi"))] 1133 pub fn create_pptt(&self) -> Sdt { 1134 let pptt_start = 0; 1135 let mut cpus = 0; 1136 let mut uid = 0; 1137 let threads_per_core = self.get_vcpu_topology().unwrap_or_default().0 as u8; 1138 let cores_per_package = self.get_vcpu_topology().unwrap_or_default().1 as u8; 1139 let packages = self.get_vcpu_topology().unwrap_or_default().2 as u8; 1140 1141 let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); 1142 1143 for cluster_idx in 0..packages { 1144 if cpus < self.config.boot_vcpus as usize { 1145 let cluster_offset = pptt.len() - pptt_start; 1146 let cluster_hierarchy_node = ProcessorHierarchyNode { 1147 r#type: 0, 1148 length: 20, 1149 reserved: 0, 1150 flags: 0x2, 1151 parent: 0, 1152 acpi_processor_id: cluster_idx as u32, 1153 num_private_resources: 0, 1154 }; 1155 pptt.append(cluster_hierarchy_node); 1156 1157 for core_idx in 0..cores_per_package { 1158 let core_offset = pptt.len() - pptt_start; 1159 1160 if threads_per_core > 1 { 1161 let core_hierarchy_node = ProcessorHierarchyNode { 1162 r#type: 0, 1163 length: 20, 1164 reserved: 0, 1165 flags: 0x2, 1166 parent: cluster_offset as u32, 1167 acpi_processor_id: core_idx as u32, 1168 num_private_resources: 0, 1169 }; 1170 pptt.append(core_hierarchy_node); 1171 1172 for _thread_idx in 0..threads_per_core { 1173 let thread_hierarchy_node = ProcessorHierarchyNode { 1174 r#type: 0, 1175 length: 20, 1176 reserved: 0, 1177 flags: 0xE, 1178 parent: core_offset as u32, 1179 acpi_processor_id: uid as u32, 1180 num_private_resources: 0, 1181 }; 1182 pptt.append(thread_hierarchy_node); 1183 uid += 1; 1184 } 1185 } else { 1186 let thread_hierarchy_node = ProcessorHierarchyNode { 1187 r#type: 0, 1188 length: 20, 1189 reserved: 0, 1190 flags: 0xA, 1191 parent: cluster_offset as u32, 1192 acpi_processor_id: uid as u32, 1193 num_private_resources: 0, 1194 }; 1195 pptt.append(thread_hierarchy_node); 1196 uid += 1; 1197 } 1198 } 1199 cpus += (cores_per_package * threads_per_core) as usize; 1200 } 1201 } 1202 1203 pptt.update_checksum(); 1204 pptt 1205 } 1206 } 1207 1208 #[cfg(feature = "acpi")] 1209 struct Cpu { 1210 cpu_id: u8, 1211 proximity_domain: u32, 1212 } 1213 1214 #[cfg(all(target_arch = "x86_64", feature = "acpi"))] 1215 const MADT_CPU_ENABLE_FLAG: usize = 0; 1216 1217 #[cfg(feature = "acpi")] 1218 impl Cpu { 1219 #[cfg(target_arch = "x86_64")] 1220 fn generate_mat(&self) -> Vec<u8> { 1221 let lapic = LocalApic { 1222 r#type: 0, 1223 length: 8, 1224 processor_id: self.cpu_id, 1225 apic_id: self.cpu_id, 1226 flags: 1 << MADT_CPU_ENABLE_FLAG, 1227 }; 1228 1229 let mut mat_data: Vec<u8> = Vec::new(); 1230 mat_data.resize(std::mem::size_of_val(&lapic), 0); 1231 unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic }; 1232 1233 mat_data 1234 } 1235 } 1236 1237 #[cfg(feature = "acpi")] 1238 impl Aml for Cpu { 1239 fn to_aml_bytes(&self) -> Vec<u8> { 1240 #[cfg(target_arch = "x86_64")] 1241 let mat_data: Vec<u8> = self.generate_mat(); 1242 1243 aml::Device::new( 1244 format!("C{:03}", self.cpu_id).as_str().into(), 1245 vec![ 1246 &aml::Name::new("_HID".into(), &"ACPI0007"), 1247 &aml::Name::new("_UID".into(), &self.cpu_id), 1248 // Currently, AArch64 cannot support following fields. 1249 /* 1250 _STA return value: 1251 Bit [0] – Set if the device is present. 1252 Bit [1] – Set if the device is enabled and decoding its resources. 1253 Bit [2] – Set if the device should be shown in the UI. 1254 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1255 Bit [4] – Set if the battery is present. 1256 Bits [31:5] – Reserved (must be cleared). 1257 */ 1258 #[cfg(target_arch = "x86_64")] 1259 &aml::Method::new( 1260 "_STA".into(), 1261 0, 1262 false, 1263 // Call into CSTA method which will interrogate device 1264 vec![&aml::Return::new(&aml::MethodCall::new( 1265 "CSTA".into(), 1266 vec![&self.cpu_id], 1267 ))], 1268 ), 1269 &aml::Method::new( 1270 "_PXM".into(), 1271 0, 1272 false, 1273 vec![&aml::Return::new(&self.proximity_domain)], 1274 ), 1275 // The Linux kernel expects every CPU device to have a _MAT entry 1276 // containing the LAPIC for this processor with the enabled bit set 1277 // even it if is disabled in the MADT (non-boot CPU) 1278 #[cfg(target_arch = "x86_64")] 1279 &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)), 1280 // Trigger CPU ejection 1281 #[cfg(target_arch = "x86_64")] 1282 &aml::Method::new( 1283 "_EJ0".into(), 1284 1, 1285 false, 1286 // Call into CEJ0 method which will actually eject device 1287 vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])], 1288 ), 1289 ], 1290 ) 1291 .to_aml_bytes() 1292 } 1293 } 1294 1295 #[cfg(feature = "acpi")] 1296 struct CpuNotify { 1297 cpu_id: u8, 1298 } 1299 1300 #[cfg(feature = "acpi")] 1301 impl Aml for CpuNotify { 1302 fn to_aml_bytes(&self) -> Vec<u8> { 1303 let object = aml::Path::new(&format!("C{:03}", self.cpu_id)); 1304 aml::If::new( 1305 &aml::Equal::new(&aml::Arg(0), &self.cpu_id), 1306 vec![&aml::Notify::new(&object, &aml::Arg(1))], 1307 ) 1308 .to_aml_bytes() 1309 } 1310 } 1311 1312 #[cfg(feature = "acpi")] 1313 struct CpuMethods { 1314 max_vcpus: u8, 1315 } 1316 1317 #[cfg(feature = "acpi")] 1318 impl Aml for CpuMethods { 1319 fn to_aml_bytes(&self) -> Vec<u8> { 1320 let mut bytes = Vec::new(); 1321 bytes.extend_from_slice( 1322 // CPU status method 1323 &aml::Method::new( 1324 "CSTA".into(), 1325 1, 1326 true, 1327 vec![ 1328 // Take lock defined above 1329 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1330 // Write CPU number (in first argument) to I/O port via field 1331 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 1332 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1333 // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning) 1334 &aml::If::new( 1335 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE), 1336 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 1337 ), 1338 // Release lock 1339 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1340 // Return 0 or 0xf 1341 &aml::Return::new(&aml::Local(0)), 1342 ], 1343 ) 1344 .to_aml_bytes(), 1345 ); 1346 1347 let mut cpu_notifies = Vec::new(); 1348 for cpu_id in 0..self.max_vcpus { 1349 cpu_notifies.push(CpuNotify { cpu_id }); 1350 } 1351 1352 let mut cpu_notifies_refs: Vec<&dyn aml::Aml> = Vec::new(); 1353 for cpu_id in 0..self.max_vcpus { 1354 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); 1355 } 1356 1357 bytes.extend_from_slice( 1358 &aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(), 1359 ); 1360 1361 bytes.extend_from_slice( 1362 &aml::Method::new( 1363 "CEJ0".into(), 1364 1, 1365 true, 1366 vec![ 1367 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1368 // Write CPU number (in first argument) to I/O port via field 1369 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), 1370 // Set CEJ0 bit 1371 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), 1372 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1373 ], 1374 ) 1375 .to_aml_bytes(), 1376 ); 1377 1378 bytes.extend_from_slice( 1379 &aml::Method::new( 1380 "CSCN".into(), 1381 0, 1382 true, 1383 vec![ 1384 // Take lock defined above 1385 &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), 1386 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1387 &aml::While::new( 1388 &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), 1389 vec![ 1390 // Write CPU number (in first argument) to I/O port via field 1391 &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), 1392 // Check if CINS bit is set 1393 &aml::If::new( 1394 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), 1395 // Notify device if it is 1396 vec![ 1397 &aml::MethodCall::new( 1398 "CTFY".into(), 1399 vec![&aml::Local(0), &aml::ONE], 1400 ), 1401 // Reset CINS bit 1402 &aml::Store::new( 1403 &aml::Path::new("\\_SB_.PRES.CINS"), 1404 &aml::ONE, 1405 ), 1406 ], 1407 ), 1408 // Check if CRMV bit is set 1409 &aml::If::new( 1410 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), 1411 // Notify device if it is (with the eject constant 0x3) 1412 vec![ 1413 &aml::MethodCall::new( 1414 "CTFY".into(), 1415 vec![&aml::Local(0), &3u8], 1416 ), 1417 // Reset CRMV bit 1418 &aml::Store::new( 1419 &aml::Path::new("\\_SB_.PRES.CRMV"), 1420 &aml::ONE, 1421 ), 1422 ], 1423 ), 1424 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 1425 ], 1426 ), 1427 // Release lock 1428 &aml::Release::new("\\_SB_.PRES.CPLK".into()), 1429 ], 1430 ) 1431 .to_aml_bytes(), 1432 ); 1433 bytes 1434 } 1435 } 1436 1437 #[cfg(feature = "acpi")] 1438 impl Aml for CpuManager { 1439 fn to_aml_bytes(&self) -> Vec<u8> { 1440 let mut bytes = Vec::new(); 1441 // CPU hotplug controller 1442 #[cfg(target_arch = "x86_64")] 1443 bytes.extend_from_slice( 1444 &aml::Device::new( 1445 "_SB_.PRES".into(), 1446 vec![ 1447 &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")), 1448 &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), 1449 // Mutex to protect concurrent access as we write to choose CPU and then read back status 1450 &aml::Mutex::new("CPLK".into(), 0), 1451 &aml::Name::new( 1452 "_CRS".into(), 1453 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 1454 aml::AddressSpaceCachable::NotCacheable, 1455 true, 1456 self.acpi_address.0 as u64, 1457 self.acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, 1458 )]), 1459 ), 1460 // OpRegion and Fields map MMIO range into individual field values 1461 &aml::OpRegion::new( 1462 "PRST".into(), 1463 aml::OpRegionSpace::SystemMemory, 1464 self.acpi_address.0 as usize, 1465 CPU_MANAGER_ACPI_SIZE, 1466 ), 1467 &aml::Field::new( 1468 "PRST".into(), 1469 aml::FieldAccessType::Byte, 1470 aml::FieldUpdateRule::WriteAsZeroes, 1471 vec![ 1472 aml::FieldEntry::Reserved(32), 1473 aml::FieldEntry::Named(*b"CPEN", 1), 1474 aml::FieldEntry::Named(*b"CINS", 1), 1475 aml::FieldEntry::Named(*b"CRMV", 1), 1476 aml::FieldEntry::Named(*b"CEJ0", 1), 1477 aml::FieldEntry::Reserved(4), 1478 aml::FieldEntry::Named(*b"CCMD", 8), 1479 ], 1480 ), 1481 &aml::Field::new( 1482 "PRST".into(), 1483 aml::FieldAccessType::DWord, 1484 aml::FieldUpdateRule::Preserve, 1485 vec![ 1486 aml::FieldEntry::Named(*b"CSEL", 32), 1487 aml::FieldEntry::Reserved(32), 1488 aml::FieldEntry::Named(*b"CDAT", 32), 1489 ], 1490 ), 1491 ], 1492 ) 1493 .to_aml_bytes(), 1494 ); 1495 1496 // CPU devices 1497 let hid = aml::Name::new("_HID".into(), &"ACPI0010"); 1498 let uid = aml::Name::new("_CID".into(), &aml::EisaName::new("PNP0A05")); 1499 // Bundle methods together under a common object 1500 let methods = CpuMethods { 1501 max_vcpus: self.config.max_vcpus, 1502 }; 1503 let mut cpu_data_inner: Vec<&dyn aml::Aml> = vec![&hid, &uid, &methods]; 1504 1505 let mut cpu_devices = Vec::new(); 1506 for cpu_id in 0..self.config.max_vcpus { 1507 let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0); 1508 let cpu_device = Cpu { 1509 cpu_id, 1510 proximity_domain, 1511 }; 1512 1513 cpu_devices.push(cpu_device); 1514 } 1515 1516 for cpu_device in cpu_devices.iter() { 1517 cpu_data_inner.push(cpu_device); 1518 } 1519 1520 bytes.extend_from_slice( 1521 &aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(), 1522 ); 1523 bytes 1524 } 1525 } 1526 1527 impl Pausable for CpuManager { 1528 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 1529 // Tell the vCPUs to pause themselves next time they exit 1530 self.vcpus_pause_signalled.store(true, Ordering::SeqCst); 1531 1532 // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads 1533 // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set 1534 // above. 1535 for state in self.vcpu_states.iter() { 1536 state.signal_thread(); 1537 } 1538 1539 for vcpu in self.vcpus.iter() { 1540 let mut vcpu = vcpu.lock().unwrap(); 1541 vcpu.pause()?; 1542 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 1543 if !self.config.kvm_hyperv { 1544 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { 1545 MigratableError::Pause(anyhow!( 1546 "Could not notify guest it has been paused {:?}", 1547 e 1548 )) 1549 })?; 1550 } 1551 } 1552 1553 Ok(()) 1554 } 1555 1556 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 1557 for vcpu in self.vcpus.iter() { 1558 vcpu.lock().unwrap().resume()?; 1559 } 1560 1561 // Toggle the vCPUs pause boolean 1562 self.vcpus_pause_signalled.store(false, Ordering::SeqCst); 1563 1564 // Unpark all the VCPU threads. 1565 // Once unparked, the next thing they will do is checking for the pause 1566 // boolean. Since it'll be set to false, they will exit their pause loop 1567 // and go back to vmx root. 1568 for state in self.vcpu_states.iter() { 1569 state.unpark_thread(); 1570 } 1571 Ok(()) 1572 } 1573 } 1574 1575 impl Snapshottable for CpuManager { 1576 fn id(&self) -> String { 1577 CPU_MANAGER_SNAPSHOT_ID.to_string() 1578 } 1579 1580 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 1581 let mut cpu_manager_snapshot = Snapshot::new(CPU_MANAGER_SNAPSHOT_ID); 1582 1583 // The CpuManager snapshot is a collection of all vCPUs snapshots. 1584 for vcpu in &self.vcpus { 1585 let cpu_snapshot = vcpu.lock().unwrap().snapshot()?; 1586 cpu_manager_snapshot.add_snapshot(cpu_snapshot); 1587 } 1588 1589 Ok(cpu_manager_snapshot) 1590 } 1591 1592 fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> { 1593 for (cpu_id, snapshot) in snapshot.snapshots.iter() { 1594 debug!("Restoring VCPU {}", cpu_id); 1595 self.create_vcpu(cpu_id.parse::<u8>().unwrap(), None, Some(*snapshot.clone())) 1596 .map_err(|e| MigratableError::Restore(anyhow!("Could not create vCPU {:?}", e)))?; 1597 } 1598 1599 Ok(()) 1600 } 1601 } 1602 1603 impl Transportable for CpuManager {} 1604 impl Migratable for CpuManager {} 1605 1606 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 1607 #[cfg(test)] 1608 mod tests { 1609 use arch::x86_64::interrupts::*; 1610 use arch::x86_64::regs::*; 1611 use hypervisor::x86_64::{FpuState, LapicState, StandardRegisters}; 1612 1613 #[test] 1614 fn test_setlint() { 1615 let hv = hypervisor::new().unwrap(); 1616 let vm = hv.create_vm().expect("new VM fd creation failed"); 1617 assert!(hv.check_required_extensions().is_ok()); 1618 // Calling get_lapic will fail if there is no irqchip before hand. 1619 assert!(vm.create_irq_chip().is_ok()); 1620 let vcpu = vm.create_vcpu(0, None).unwrap(); 1621 let klapic_before: LapicState = vcpu.get_lapic().unwrap(); 1622 1623 // Compute the value that is expected to represent LVT0 and LVT1. 1624 let lint0 = get_klapic_reg(&klapic_before, APIC_LVT0); 1625 let lint1 = get_klapic_reg(&klapic_before, APIC_LVT1); 1626 let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT); 1627 let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI); 1628 1629 set_lint(&vcpu).unwrap(); 1630 1631 // Compute the value that represents LVT0 and LVT1 after set_lint. 1632 let klapic_actual: LapicState = vcpu.get_lapic().unwrap(); 1633 let lint0_mode_actual = get_klapic_reg(&klapic_actual, APIC_LVT0); 1634 let lint1_mode_actual = get_klapic_reg(&klapic_actual, APIC_LVT1); 1635 assert_eq!(lint0_mode_expected, lint0_mode_actual); 1636 assert_eq!(lint1_mode_expected, lint1_mode_actual); 1637 } 1638 1639 #[test] 1640 fn test_setup_fpu() { 1641 let hv = hypervisor::new().unwrap(); 1642 let vm = hv.create_vm().expect("new VM fd creation failed"); 1643 let vcpu = vm.create_vcpu(0, None).unwrap(); 1644 setup_fpu(&vcpu).unwrap(); 1645 1646 let expected_fpu: FpuState = FpuState { 1647 fcw: 0x37f, 1648 mxcsr: 0x1f80, 1649 ..Default::default() 1650 }; 1651 let actual_fpu: FpuState = vcpu.get_fpu().unwrap(); 1652 // TODO: auto-generate kvm related structures with PartialEq on. 1653 assert_eq!(expected_fpu.fcw, actual_fpu.fcw); 1654 // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything. 1655 // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c. 1656 // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should 1657 // remove it at all. 1658 // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr); 1659 } 1660 1661 #[test] 1662 fn test_setup_msrs() { 1663 use hypervisor::arch::x86::msr_index; 1664 use hypervisor::x86_64::{MsrEntries, MsrEntry}; 1665 1666 let hv = hypervisor::new().unwrap(); 1667 let vm = hv.create_vm().expect("new VM fd creation failed"); 1668 let vcpu = vm.create_vcpu(0, None).unwrap(); 1669 setup_msrs(&vcpu).unwrap(); 1670 1671 // This test will check against the last MSR entry configured (the tenth one). 1672 // See create_msr_entries for details. 1673 let mut msrs = MsrEntries::from_entries(&[MsrEntry { 1674 index: msr_index::MSR_IA32_MISC_ENABLE, 1675 ..Default::default() 1676 }]) 1677 .unwrap(); 1678 1679 // get_msrs returns the number of msrs that it succeed in reading. We only want to read 1 1680 // in this test case scenario. 1681 let read_msrs = vcpu.get_msrs(&mut msrs).unwrap(); 1682 assert_eq!(read_msrs, 1); 1683 1684 // Official entries that were setup when we did setup_msrs. We need to assert that the 1685 // tenth one (i.e the one with index msr_index::MSR_IA32_MISC_ENABLE has the data we 1686 // expect. 1687 let entry_vec = hypervisor::x86_64::boot_msr_entries(); 1688 assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]); 1689 } 1690 1691 #[test] 1692 fn test_setup_regs() { 1693 let hv = hypervisor::new().unwrap(); 1694 let vm = hv.create_vm().expect("new VM fd creation failed"); 1695 let vcpu = vm.create_vcpu(0, None).unwrap(); 1696 1697 let expected_regs: StandardRegisters = StandardRegisters { 1698 rflags: 0x0000000000000002u64, 1699 rbx: arch::layout::PVH_INFO_START.0, 1700 rip: 1, 1701 ..Default::default() 1702 }; 1703 1704 setup_regs(&vcpu, expected_regs.rip).unwrap(); 1705 1706 let actual_regs: StandardRegisters = vcpu.get_regs().unwrap(); 1707 assert_eq!(actual_regs, expected_regs); 1708 } 1709 } 1710 1711 #[cfg(target_arch = "aarch64")] 1712 #[cfg(test)] 1713 mod tests { 1714 use crate::GuestMemoryMmap; 1715 use arch::aarch64::layout; 1716 use arch::aarch64::regs::*; 1717 use hypervisor::kvm::aarch64::{is_system_register, MPIDR_EL1}; 1718 use hypervisor::kvm::kvm_bindings::{ 1719 kvm_one_reg, kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, 1720 KVM_REG_ARM_CORE, KVM_REG_SIZE_U64, 1721 }; 1722 use hypervisor::{arm64_core_reg_id, offset__of}; 1723 use std::mem; 1724 use vm_memory::GuestAddress; 1725 1726 #[test] 1727 fn test_setup_regs() { 1728 let hv = hypervisor::new().unwrap(); 1729 let vm = hv.create_vm().unwrap(); 1730 let vcpu = vm.create_vcpu(0, None).unwrap(); 1731 let regions = vec![( 1732 GuestAddress(layout::RAM_64BIT_START), 1733 (layout::FDT_MAX_SIZE + 0x1000) as usize, 1734 )]; 1735 let mem = GuestMemoryMmap::from_ranges(®ions).expect("Cannot initialize memory"); 1736 1737 let res = setup_regs(&vcpu, 0, 0x0, &mem); 1738 // Must fail when vcpu is not initialized yet. 1739 assert!(res.is_err()); 1740 1741 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 1742 vm.get_preferred_target(&mut kvi).unwrap(); 1743 vcpu.vcpu_init(&kvi).unwrap(); 1744 1745 assert!(setup_regs(&vcpu, 0, 0x0, &mem).is_ok()); 1746 } 1747 1748 #[test] 1749 fn test_read_mpidr() { 1750 let hv = hypervisor::new().unwrap(); 1751 let vm = hv.create_vm().unwrap(); 1752 let vcpu = vm.create_vcpu(0, None).unwrap(); 1753 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 1754 vm.get_preferred_target(&mut kvi).unwrap(); 1755 1756 // Must fail when vcpu is not initialized yet. 1757 assert!(vcpu.read_mpidr().is_err()); 1758 1759 vcpu.vcpu_init(&kvi).unwrap(); 1760 assert_eq!(vcpu.read_mpidr().unwrap(), 0x80000000); 1761 } 1762 1763 #[test] 1764 fn test_is_system_register() { 1765 let offset = offset__of!(user_pt_regs, pc); 1766 let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset); 1767 assert!(!is_system_register(regid)); 1768 let regid = KVM_REG_ARM64 as u64 | KVM_REG_SIZE_U64 as u64 | KVM_REG_ARM64_SYSREG as u64; 1769 assert!(is_system_register(regid)); 1770 } 1771 1772 #[test] 1773 fn test_save_restore_core_regs() { 1774 let hv = hypervisor::new().unwrap(); 1775 let vm = hv.create_vm().unwrap(); 1776 let vcpu = vm.create_vcpu(0, None).unwrap(); 1777 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 1778 vm.get_preferred_target(&mut kvi).unwrap(); 1779 1780 // Must fail when vcpu is not initialized yet. 1781 let mut state = kvm_regs::default(); 1782 let res = vcpu.core_registers(&mut state); 1783 assert!(res.is_err()); 1784 assert_eq!( 1785 format!("{}", res.unwrap_err()), 1786 "Failed to get core register: Exec format error (os error 8)" 1787 ); 1788 1789 let res = vcpu.set_core_registers(&state); 1790 assert!(res.is_err()); 1791 assert_eq!( 1792 format!("{}", res.unwrap_err()), 1793 "Failed to set core register: Exec format error (os error 8)" 1794 ); 1795 1796 vcpu.vcpu_init(&kvi).unwrap(); 1797 assert!(vcpu.core_registers(&mut state).is_ok()); 1798 assert_eq!(state.regs.pstate, 0x3C5); 1799 1800 assert!(vcpu.set_core_registers(&state).is_ok()); 1801 let off = offset__of!(user_pt_regs, pstate); 1802 let pstate = vcpu 1803 .get_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1804 .expect("Failed to call kvm get one reg"); 1805 assert_eq!(state.regs.pstate, pstate); 1806 } 1807 1808 #[test] 1809 fn test_save_restore_system_regs() { 1810 let hv = hypervisor::new().unwrap(); 1811 let vm = hv.create_vm().unwrap(); 1812 let vcpu = vm.create_vcpu(0, None).unwrap(); 1813 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 1814 vm.get_preferred_target(&mut kvi).unwrap(); 1815 1816 // Must fail when vcpu is not initialized yet. 1817 let mut state: Vec<kvm_one_reg> = Vec::new(); 1818 let res = vcpu.system_registers(&mut state); 1819 assert!(res.is_err()); 1820 assert_eq!( 1821 format!("{}", res.unwrap_err()), 1822 "Failed to retrieve list of registers: Exec format error (os error 8)" 1823 ); 1824 1825 state.push(kvm_one_reg { 1826 id: MPIDR_EL1, 1827 addr: 0x00, 1828 }); 1829 let res = vcpu.set_system_registers(&state); 1830 assert!(res.is_err()); 1831 assert_eq!( 1832 format!("{}", res.unwrap_err()), 1833 "Failed to set system register: Exec format error (os error 8)" 1834 ); 1835 1836 vcpu.vcpu_init(&kvi).unwrap(); 1837 assert!(vcpu.system_registers(&mut state).is_ok()); 1838 let initial_mpidr: u64 = vcpu.read_mpidr().expect("Fail to read mpidr"); 1839 assert!(state.contains(&kvm_one_reg { 1840 id: MPIDR_EL1, 1841 addr: initial_mpidr 1842 })); 1843 1844 assert!(vcpu.set_system_registers(&state).is_ok()); 1845 let mpidr: u64 = vcpu.read_mpidr().expect("Fail to read mpidr"); 1846 assert_eq!(initial_mpidr, mpidr); 1847 } 1848 1849 #[test] 1850 fn test_get_set_mpstate() { 1851 let hv = hypervisor::new().unwrap(); 1852 let vm = hv.create_vm().unwrap(); 1853 let vcpu = vm.create_vcpu(0, None).unwrap(); 1854 let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); 1855 vm.get_preferred_target(&mut kvi).unwrap(); 1856 1857 let res = vcpu.get_mp_state(); 1858 assert!(res.is_ok()); 1859 assert!(vcpu.set_mp_state(res.unwrap()).is_ok()); 1860 } 1861 } 1862