// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, is_system_register, VcpuInit, VcpuKvmState as CpuState,
    MPIDR_EL1,
};
use crate::cpu;
use crate::device;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, VmmOps};
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use anyhow::anyhow;
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use log::warn;
use serde_derive::{Deserialize, Serialize};
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
use std::os::unix::io::{AsRawFd, RawFd};
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
#[cfg(target_arch = "x86_64")]
use vm_memory::Address;
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::NUM_IOAPIC_PINS;
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::{
    check_required_kvm_extensions, FpuState, SpecialRegisters, StandardRegisters, KVM_TSS_ADDRESS,
};
#[cfg(target_arch = "x86_64")]
pub use x86_64::{
    CpuId, CpuIdEntry, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState as CpuState,
    Xsave, CPUID_FLAG_VALID_INDEX,
};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing, kvm_irq_routing_entry,
    kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
    KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
    KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_clock_data as ClockData, kvm_bindings::kvm_create_device as CreateDevice,
    kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_irq_routing_entry as IrqRoutingEntry, kvm_bindings::kvm_mp_state as MpState,
    kvm_bindings::kvm_userspace_memory_region as MemoryRegion,
    kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd, kvm_ioctls::IoEventAddress,
    kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    #[allow(dead_code)]
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[derive(Clone, Copy, Debug, PartialEq, Deserialize, Serialize)]
pub struct KvmVmState {}

pub use KvmVmState as VmState;

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    state: KvmVmState,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

///
/// Implementation of Vm trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// vm.set/get().unwrap()
///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vmmops: Option<Arc<dyn VmmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vmmops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
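
    // A minimal usage sketch for `create_vcpu`, mirroring the doc example on
    // this impl; the vCPU id and the absent `VmmOps` handler are illustrative:
    //
    //     let vcpu = vm.create_vcpu(0, None).unwrap();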
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }
    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;

        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> MemoryRegion {
        MemoryRegion {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
    }
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
        let mut region = user_memory_region;

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
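
    // A hedged sketch of the two-step region setup above; the slot, addresses
    // and size are hypothetical values, not upstream defaults:
    //
    //     let region = vm.make_user_memory_region(
    //         0,          // slot
    //         0x10_0000,  // guest_phys_addr
    //         0x20_0000,  // memory_size
    //         host_addr,  // userspace_addr (hypothetical host mapping)
    //         false,      // readonly
    //         true,       // log_dirty_pages
    //     );
    //     vm.create_user_memory_region(region).unwrap();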
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
        let mut region = user_memory_region;

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
        let fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        let device = KvmDevice { fd };
        Ok(Arc::new(device))
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Set TSS
        self.fd
            .set_tss_address(KVM_TSS_ADDRESS.raw_value() as usize)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        // Create split irqchip
        // Only the local APIC is emulated in the kernel; the PICs and the
        // IOAPIC are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        self.fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        self.fd
            .set_clock(data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
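
    // Sketch of how the clock accessors above pair up across a pause/resume
    // cycle (a simplified view of what a VMM snapshot path might do):
    //
    //     let clock = vm.get_clock().unwrap();   // while pausing
    //     vm.set_clock(&clock).unwrap();         // while resuming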
    ///
    /// Get the Vm state. Return VM specific data
    ///
    fn state(&self) -> vm::Result<VmState> {
        Ok(self.state)
    }
    ///
    /// Set the VM state
    ///
    fn set_state(&self, _state: VmState) -> vm::Result<()> {
        Ok(())
    }

    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = MemoryRegion {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = MemoryRegion {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &CpuId, max_vcpus: u32) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            reserved: u32,
            attributes: u64,
            cpuid: u64,
        }
        let data = TdxInitVm {
            max_vcpus,
            reserved: 0,
            attributes: 1, // TDX1_TD_ATTRIBUTE_DEBUG,
            cpuid: cpuid.as_fam_struct_ptr() as u64,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            if measure { 1 } else { 0 },
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
}
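
// Dirty-page tracking as implemented above is a three-step protocol. A hedged
// sketch of how a live-migration loop might drive it; the slot number and
// region size are hypothetical:
//
//     vm.start_dirty_log().unwrap();
//     let bitmap: Vec<u64> = vm.get_dirty_log(0, 0, 0x20_0000).unwrap();
//     // ... copy the pages whose bits are set, repeat until converged ...
//     vm.stop_dirty_log().unwrap();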

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}
/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    pub fn new() -> hypervisor::Result<KvmHypervisor> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(KvmHypervisor { kvm: kvm_obj })
    }
}
/// Implementation of Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap(); // 0 is the default platform type
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, the ioctl has been
                        // interrupted and we have to retry, as this can't be
                        // considered a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs = MsrEntries::new(num_msrs).unwrap();
            let indices = msr_list.as_slice();
            let msr_entries = msrs.as_mut_slice();
            for (pos, index) in indices.iter().enumerate() {
                msr_entries[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                state: VmState {},
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                state: VmState {},
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<CpuId> {
        self.kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by KVM.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
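
    // Sketch of the typical consumer of `get_cpuid`: the supported CPUID is
    // fetched once from KVM, optionally patched by the VMM, then applied per
    // vCPU via `set_cpuid2` (defined further down on `KvmVcpu`):
    //
    //     let cpuid = hypervisor.get_cpuid().unwrap();
    //     // ... VMM-specific CPUID patching would happen here ...
    //     vcpu.set_cpuid2(&cpuid).unwrap();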
    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    vmmops: Option<Arc<dyn vm::VmmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of Vcpu trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// vcpu.get/set().unwrap()
///
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        self.fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        self.fd
            .set_regs(regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        self.fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        self.fd
            .set_sregs(sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        self.fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        self.fd
            .set_fpu(fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &CpuId) -> cpu::Result<()> {
        self.fd
            .set_cpuid2(cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable the Hyper-V SynIC.
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
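
    // A hedged sketch of the get/set register pattern above, as a VMM might
    // use it to point a vCPU at its boot code; `entry` and `stack` are
    // hypothetical guest addresses, not values defined in this crate:
    //
    //     let mut regs = vcpu.get_regs().unwrap();
    //     regs.rip = entry;
    //     regs.rsp = stack;
    //     vcpu.set_regs(&regs).unwrap();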
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<CpuId> {
        self.fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        self.fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        self.fd
            .set_lapic(klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
    fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> {
        self.fd
            .get_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Setup the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> {
        self.fd
            .set_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
    fn get_mp_state(&self) -> cpu::Result<MpState> {
        self.fd
            .get_mp_state()
            .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))
    }
    ///
    /// Sets the vcpu's current "multiprocessing state".
    ///
    fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
        self.fd
            .set_mp_state(mp_state)
            .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xsave struct".
    ///
    fn get_xsave(&self) -> cpu::Result<Xsave> {
        self.fd
            .get_xsave()
            .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xsave struct".
    ///
    fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
        self.fd
            .set_xsave(xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }
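
    // Sketch of reading a single MSR through `get_msrs`; the index is
    // IA32_SYSENTER_CS (0x174), chosen purely as an illustration:
    //
    //     let mut msrs = MsrEntries::from_entries(&[kvm_msr_entry {
    //         index: 0x174,
    //         ..Default::default()
    //     }])
    //     .unwrap();
    //     let nread = vcpu.get_msrs(&mut msrs).unwrap();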
    ///
    /// Triggers the running of the current virtual CPU returning an exit reason.
    ///
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioRead(addr, data))
                }
                VcpuExit::MmioWrite(addr, data) => {
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioWrite(addr, data))
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs as well as related
    /// states of the vcpu.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs as well as related states
    /// of the vcpu.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
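
    // The exit mapping in `run` above is what a VMM event loop consumes; a
    // minimal sketch (how each exit is handled is the caller's policy, not
    // this crate's):
    //
    //     loop {
    //         match vcpu.run().unwrap() {
    //             cpu::VmExit::Ignore => {}     // already handled via VmmOps
    //             cpu::VmExit::Reset => break,  // guest asked for a reset
    //             _other => { /* route MMIO/PIO/etc. to device models */ }
    //         }
    //     }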
    #[cfg(target_arch = "x86_64")]
    ///
    /// Let the guest know that it has been paused, which prevents potential
    /// soft lockups when it is resumed.
    ///
    fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
        self.fd
            .kvmclock_ctrl()
            .map_err(|e| cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()))
    }
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
        self.fd
            .vcpu_init(kvi)
            .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
    }
    ///
    /// Sets the value of one register for this vCPU.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
        self.fd
            .set_one_reg(reg_id, data)
            .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
    }
    ///
    /// Gets the value of one register for this vCPU.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(reg_id)
            .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
    }
    ///
    /// Gets a list of the guest registers that are supported for the
    /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
        self.fd
            .get_reg_list(reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
    }
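
    // On AArch64 the canonical bring-up order is: query the preferred target
    // from the VM, then initialize each vCPU with it. A short sketch:
    //
    //     let mut kvi = kvm_bindings::kvm_vcpu_init::default();
    //     vm.get_preferred_target(&mut kvi).unwrap();
    //     vcpu.vcpu_init(&kvi).unwrap();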
    ///
    /// Save the state of the core registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn core_registers(&self, state: &mut StandardRegisters) -> cpu::Result<()> {
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These actually are the general-purpose registers of the Armv8-a
        // architecture (i.e. x0-x30 when used as 64-bit registers, or w0-w30
        // when used as 32-bit registers).
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "Other register" section of the ARMv8-a architecture.
        // First one, stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Next is the processor state.
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The stack pointer associated with EL1
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Exception Link Register for EL1: when taking an exception to EL1,
        // this register holds the address to which to return afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Saved Program Status Registers, there are 5 of them used in the kernel.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to floating point registers which are stored in the user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i][0] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;
        Ok(())
    }
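
    // For reference, `arm64_core_reg_id!` as used above composes ids of the
    // shape KVM_REG_ARM64 | size | KVM_REG_ARM_CORE | (offset in 32-bit
    // words), following the kernel's KVM_REG_ARM_CORE_REG() convention of
    // expressing the byte offset into kvm_regs divided by sizeof(__u32).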
    ///
    /// Restore the state of the core registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_core_registers(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // The function follows the exact same order as `core_registers`. Look
        // there for some additional info on the registers.
        let mut off = offset__of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i][0],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }
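
    // `core_registers`/`set_core_registers` above are symmetric, which is what
    // the aarch64 `state`/`set_state` pair further down relies on; a sketch of
    // the round trip:
    //
    //     let mut regs = kvm_regs::default();
    //     vcpu.core_registers(&mut regs).unwrap();
    //     vcpu.set_core_registers(&regs).unwrap();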
    ///
    /// Save the state of the system registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn system_registers(&self, state: &mut Vec<Register>) -> cpu::Result<()> {
        // Call KVM_GET_REG_LIST to get all registers available to the guest. For ArmV8 there are
        // around 500 registers.
        let mut reg_list = RegList::new(500).unwrap();
        self.fd
            .get_reg_list(&mut reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;

        // At this point reg_list should contain: core registers and system registers.
        // The register list contains the number of registers and their ids. We will need to
        // call KVM_GET_ONE_REG on each id in order to save all of them. We carve out from the list
        // the core registers which are represented in the kernel by the kvm_regs structure and for
        // which we can calculate the id based on the offset in the structure.
        reg_list.retain(|regid| is_system_register(*regid));

        // Now, for the rest of the registers left in the previously fetched register list, we are
        // simply calling KVM_GET_ONE_REG.
        let indices = reg_list.as_slice();
        for index in indices.iter() {
            state.push(kvm_bindings::kvm_one_reg {
                id: *index,
                addr: self
                    .fd
                    .get_one_reg(*index)
                    .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
            });
        }

        Ok(())
    }
    ///
    /// Restore the state of the system registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_system_registers(&self, state: &[Register]) -> cpu::Result<()> {
        for reg in state {
            self.fd
                .set_one_reg(reg.id, reg.addr)
                .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
        }
        Ok(())
    }
    ///
    /// Read the MPIDR - Multiprocessor Affinity Register.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn read_mpidr(&self) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(MPIDR_EL1)
            .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change state of LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. The code looks as
    /// if it might as well be affected by internal state modifications of the
    /// GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?;
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fall back to a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4, 0x400000b5,
                0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr).unwrap();
            }
        }

        let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp =
                MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries.as_slice()[faulty_msr_index].index
                );

                let start_pos = faulty_msr_index + 1;
                let mut sub_msr_entries =
                    MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                for i in 0..num_msrs {
                    msr_entries_tmp
                        .push(sub_msr_entries.as_slice()[i])
                        .map_err(|e| {
                            cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
                                "Failed adding MSR entries: {:?}",
                                e
                            ))
                        })?;
                }

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(CpuState {
            cpuid,
            msrs,
            vcpu_events,
            regs,
            sregs,
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        })
    }
    ///
    /// Get the current AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        let mut state = CpuState {
            mp_state: self.get_mp_state()?,
            mpidr: self.read_mpidr()?,
            ..Default::default()
        };
        self.core_registers(&mut state.core_regs)?;
        self.system_registers(&mut state.sys_regs)?;

        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores it.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state)?;
        self.set_regs(&state.regs)?;
        self.set_sregs(&state.sregs)?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS is different from the
        // expected amount, we fall back to a slower method by setting MSRs
        // by chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs.as_slice()[faulty_msr_index].index
                );

                let start_pos = faulty_msr_index + 1;
                let sub_msr_entries =
                    MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
    ///
    /// Restore the previously saved AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        self.set_core_registers(&state.core_regs)?;
        self.set_system_registers(&state.sys_regs)?;
        self.set_mp_state(state.mp_state)?;

        Ok(())
    }

    ///
    /// Initialize TDX for this CPU
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
            .map_err(cpu::HypervisorCpuError::InitializeTdx)
    }
}

/// Device struct for KVM
pub struct KvmDevice {
    fd: DeviceFd,
}
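
// A hedged sketch of driving a `KvmDevice` through the attribute API below;
// the group/attr values are hypothetical and device-specific (see the kernel
// documentation of KVM_SET_DEVICE_ATTR for real ones):
//
//     let attr = DeviceAttr {
//         group: 0,                        // hypothetical attribute group
//         attr: 0,                         // hypothetical attribute id
//         addr: &val as *const u64 as u64, // userspace buffer KVM reads from
//         ..Default::default()
//     };
//     device.set_device_attr(&attr).unwrap();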
impl device::Device for KvmDevice {
    ///
    /// Set device attribute
    ///
    fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
        self.fd
            .set_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
    }
    ///
    /// Get device attribute
    ///
    fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
        self.fd
            .get_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
    }
}

impl AsRawFd for KvmDevice {
    fn as_raw_fd(&self) -> RawFd {
        self.fd.as_raw_fd()
    }
}