// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, is_system_register, VcpuInit, VcpuKvmState as CpuState,
    MPIDR_EL1,
};
use crate::cpu;
use crate::device;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, VmmOps};
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use serde_derive::{Deserialize, Serialize};
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
use std::os::unix::io::{AsRawFd, RawFd};
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::NUM_IOAPIC_PINS;
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::{check_required_kvm_extensions, FpuState, SpecialRegisters, StandardRegisters};
#[cfg(target_arch = "x86_64")]
pub use x86_64::{
    CpuId, CpuIdEntry, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState as CpuState,
    Xsave, CPUID_FLAG_VALID_INDEX,
};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing, kvm_irq_routing_entry,
    kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
    KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
    KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_clock_data as ClockData, kvm_bindings::kvm_create_device as CreateDevice,
    kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_irq_routing_entry as IrqRoutingEntry, kvm_bindings::kvm_mp_state as MpState,
    kvm_bindings::kvm_userspace_memory_region as MemoryRegion,
    kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd, kvm_ioctls::IoEventAddress,
    kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    #[allow(dead_code)]
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[derive(Clone, Copy, Debug, PartialEq, Deserialize, Serialize)]
pub struct KvmVmState {}

pub use KvmVmState as VmState;

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    state: KvmVmState,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

///
/// Implementation of Vm trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// vm.set/get().unwrap()
///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vmmops: Option<Arc<dyn VmmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vmmops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
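    ///
    /// A minimal sketch of wiring up a doorbell eventfd with a 32-bit
    /// datamatch; the MMIO address and match value below are illustrative
    /// only:
    ///
    /// ```ignore
    /// let evt = vmm_sys_util::eventfd::EventFd::new(libc::EFD_NONBLOCK).unwrap();
    /// let addr = IoEventAddress::Mmio(0xe000_0000);
    /// vm.register_ioevent(&evt, &addr, Some(vm::DataMatch::DataMatch32(0x1)))?;
    /// ```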
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }
    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;

        // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
        // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
        // everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> MemoryRegion {
        MemoryRegion {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
    }
    ///
    /// Creates a guest physical memory region.
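    ///
    /// A minimal sketch (slot number and addresses are illustrative, and
    /// `host_addr` is a hypothetical mmap'ed host address), combining this
    /// with `make_user_memory_region` above:
    ///
    /// ```ignore
    /// let region = vm.make_user_memory_region(0, 0x1_0000, 0x1000, host_addr, false, false);
    /// vm.create_user_memory_region(region)?;
    /// ```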
    ///
    fn create_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
        let mut region = user_memory_region;

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
        let mut region = user_memory_region;

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
        let fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        let device = KvmDevice { fd };
        Ok(Arc::new(device))
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on the underlying host.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create split irqchip
        // Only the local APIC is emulated in kernel, both PICs and IOAPIC
        // are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        self.fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        self.fd
            .set_clock(data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Get the VM state. Returns VM-specific data.
    ///
    fn state(&self) -> vm::Result<VmState> {
        Ok(self.state)
    }
    ///
    /// Set the VM state
    ///
    fn set_state(&self, _state: VmState) -> vm::Result<()> {
        Ok(())
    }

    ///
    /// Start logging dirty pages
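    ///
    /// Illustrative flow: regions created with `log_dirty_pages = true` only
    /// get their `KVM_MEM_LOG_DIRTY_PAGES` flag applied here, so a typical
    /// dirty-page tracking pass looks like (slot, base GPA and size are
    /// placeholders):
    ///
    /// ```ignore
    /// vm.start_dirty_log()?;
    /// let bitmap = vm.get_dirty_log(slot, base_gpa, memory_size)?;
    /// vm.stop_dirty_log()?;
    /// ```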
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = MemoryRegion {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = MemoryRegion {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &CpuId, max_vcpus: u32) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            tsc_khz: u32,
            attributes: u64,
            cpuid: u64,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            reserved: [u64; 43],
        }
        let data = TdxInitVm {
            max_vcpus,
            tsc_khz: 0,
            attributes: 1, // TDX1_TD_ATTRIBUTE_DEBUG,
            cpuid: cpuid.as_fam_struct_ptr() as u64,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            reserved: [0; 43],
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            if measure { 1 } else { 0 },
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}
/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    pub fn new() -> hypervisor::Result<KvmHypervisor> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(KvmHypervisor { kvm: kvm_obj })
    }
}
/// Implementation of Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap()
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, which means the
                        // ioctl has been interrupted, we have to retry as
                        // this can't be considered as a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);
        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs = MsrEntries::new(num_msrs).unwrap();
            let indices = msr_list.as_slice();
            let msr_entries = msrs.as_mut_slice();
            for (pos, index) in indices.iter().enumerate() {
                msr_entries[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                state: VmState {},
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                state: VmState {},
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap()
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<CpuId> {
        self.kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by KVM.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve the AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    vmmops: Option<Arc<dyn vm::VmmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of Vcpu trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// vcpu.get/set().unwrap()
///
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        self.fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        self.fd
            .set_regs(regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Set attribute for vcpu.
    ///
    fn set_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .set_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuAttribute(e.into()))
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Check if vcpu has a certain attribute.
    ///
    fn has_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .has_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::HasVcpuAttribute(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        self.fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        self.fd
            .set_sregs(sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        self.fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        self.fd
            .set_fpu(fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers.
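    ///
    /// Sketch: fetch the host-supported entries and apply them unmodified
    /// (a real VMM would typically filter and patch them first):
    ///
    /// ```ignore
    /// let cpuid = hypervisor.get_cpuid()?;
    /// vcpu.set_cpuid2(&cpuid)?;
    /// ```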
    ///
    fn set_cpuid2(&self, cpuid: &CpuId) -> cpu::Result<()> {
        self.fd
            .set_cpuid2(cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable HyperV SynIC
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<CpuId> {
        self.fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        self.fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        self.fd
            .set_lapic(klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
    fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> {
        self.fd
            .get_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Setup the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> {
        self.fd
            .set_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
    fn get_mp_state(&self) -> cpu::Result<MpState> {
        self.fd
            .get_mp_state()
            .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))
    }
    ///
    /// Sets the vcpu's current "multiprocessing state".
    ///
    fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
        self.fd
            .set_mp_state(mp_state)
            .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xsave struct".
    ///
    fn get_xsave(&self) -> cpu::Result<Xsave> {
        self.fd
            .get_xsave()
            .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xsave struct".
    ///
    fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
        self.fd
            .set_xsave(xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }
    ///
    /// Triggers the running of the current virtual CPU returning an exit reason.
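    ///
    /// Illustrative vCPU loop built on top of this method (error handling
    /// elided and exit coverage incomplete):
    ///
    /// ```ignore
    /// loop {
    ///     match vcpu.run()? {
    ///         cpu::VmExit::Reset => break,
    ///         cpu::VmExit::Ignore => continue,
    ///         _ => { /* handle MMIO/PIO and other exits here */ }
    ///     }
    /// }
    /// ```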
    ///
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioRead(addr, data))
                }
                VcpuExit::MmioWrite(addr, data) => {
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioWrite(addr, data))
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs as well as related
    /// states of the vcpu.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs as well as related states
    /// of the vcpu.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Let the guest know that it has been paused, which prevents potential
    /// soft lockups when it is resumed.
    ///
    fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
        if let Err(e) = self.fd.kvmclock_ctrl() {
            // Linux kernel returns -EINVAL if the PV clock isn't yet initialised
            // which could be because we're still in firmware or the guest doesn't
            // use KVM clock.
            if e.errno() != libc::EINVAL {
                return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
            }
        }

        Ok(())
    }
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
        self.fd
            .vcpu_init(kvi)
            .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
    }
    ///
    /// Sets the value of one register for this vCPU.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
        self.fd
            .set_one_reg(reg_id, data)
            .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
    }
    ///
    /// Gets the value of one register for this vCPU.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(reg_id)
            .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
    }
    ///
    /// Gets a list of the guest registers that are supported for the
    /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
        self.fd
            .get_reg_list(reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
    }
    ///
    /// Save the state of the core registers.
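    ///
    /// Sketch: pair this with `system_registers` to capture a full register
    /// snapshot, as the AArch64 `state()` implementation below does:
    ///
    /// ```ignore
    /// let mut core_regs = StandardRegisters::default();
    /// vcpu.core_registers(&mut core_regs)?;
    /// ```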
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn core_registers(&self, state: &mut StandardRegisters) -> cpu::Result<()> {
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These actually are the general-purpose registers of the Armv8-a
        // architecture (i.e. x0-x30 if used as a 64bit register or w0-w30 when used as a 32bit register).
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "Other register" section of the ARMv8-a architecture.
        // First one, stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Next is the processor state.
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The stack pointer associated with EL1
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Exception Link Register for EL1, when taking an exception to EL1, this register
        // holds the address to which to return afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Saved Program Status Registers, there are 5 of them used in the kernel.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to floating point registers which are stored in the user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .into();
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;
        Ok(())
    }
    ///
    /// Restore the state of the core registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_core_registers(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // This function sets the registers in exactly the same order in which
        // `core_registers` saves them; look there for additional info on the
        // individual registers.
        let mut off = offset__of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i] as u64,
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }
    ///
    /// Save the state of the system registers.
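    ///
    /// Sketch: the registers come back as (id, value) pairs that can later be
    /// replayed through `set_system_registers`:
    ///
    /// ```ignore
    /// let mut sys_regs = Vec::new();
    /// vcpu.system_registers(&mut sys_regs)?;
    /// // ... later, on the restore path:
    /// vcpu.set_system_registers(&sys_regs)?;
    /// ```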
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn system_registers(&self, state: &mut Vec<Register>) -> cpu::Result<()> {
        // Call KVM_GET_REG_LIST to get all registers available to the guest. For ArmV8 there are
        // around 500 registers.
        let mut reg_list = RegList::new(500).unwrap();
        self.fd
            .get_reg_list(&mut reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;

        // At this point reg_list should contain: core registers and system registers.
        // The register list contains the number of registers and their ids. We need to call
        // KVM_GET_ONE_REG on each id in order to save all of them. We carve out from the list
        // the core registers, which are represented in the kernel by the kvm_regs structure and
        // for which we can calculate the id based on the offset in that structure.
        reg_list.retain(|regid| is_system_register(*regid));

        // Now, for the rest of the registers left in the previously fetched register list, we are
        // simply calling KVM_GET_ONE_REG.
        let indices = reg_list.as_slice();
        for index in indices.iter() {
            state.push(kvm_bindings::kvm_one_reg {
                id: *index,
                addr: self
                    .fd
                    .get_one_reg(*index)
                    .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
            });
        }

        Ok(())
    }
    ///
    /// Restore the state of the system registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_system_registers(&self, state: &[Register]) -> cpu::Result<()> {
        for reg in state {
            self.fd
                .set_one_reg(reg.id, reg.addr)
                .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
        }
        Ok(())
    }
    ///
    /// Read the MPIDR - Multiprocessor Affinity Register.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn read_mpidr(&self) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(MPIDR_EL1)
            .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change state of LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save, since it looks like
    /// it might as well be affected by internal state modifications of the
    /// other GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?;
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fall back onto a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
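        // Hypothetical illustration of the fallback: with an MSR list
        // [A, B, C, D] where C is unreadable, GET_MSRS stops after A and B.
        // We keep [A, B], skip C, and retry from D, repeating until the
        // whole list has been walked.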
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr).unwrap();
            }
        }

        let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp =
                MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries.as_slice()[faulty_msr_index].index
                );

                let start_pos = faulty_msr_index + 1;
                let mut sub_msr_entries =
                    MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                for i in 0..num_msrs {
                    msr_entries_tmp
                        .push(sub_msr_entries.as_slice()[i])
                        .map_err(|e| {
                            cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
                                "Failed adding MSR entries: {:?}",
                                e
                            ))
                        })?;
                }

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(CpuState {
            cpuid,
            msrs,
            vcpu_events,
            regs,
            sregs,
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        })
    }
    ///
    /// Get the current AArch64 CPU state
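    ///
    /// Sketch of a snapshot/restore round trip on AArch64:
    ///
    /// ```ignore
    /// let state = vcpu.state()?;
    /// vcpu.set_state(&state)?;
    /// ```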
    ///
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        let mut state = CpuState {
            mp_state: self.get_mp_state()?,
            mpidr: self.read_mpidr()?,
            ..Default::default()
        };
        self.core_registers(&mut state.core_regs)?;
        self.system_registers(&mut state.sys_regs)?;

        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores it.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state)?;
        self.set_regs(&state.regs)?;
        self.set_sregs(&state.sregs)?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS is different from the
        // expected amount, we fall back onto a slower method by setting MSRs
        // by chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs.as_slice()[faulty_msr_index].index
                );

                let start_pos = faulty_msr_index + 1;
                let sub_msr_entries =
                    MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
    ///
    /// Restore the previously saved AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        self.set_core_registers(&state.core_regs)?;
        self.set_system_registers(&state.sys_regs)?;
        self.set_mp_state(state.mp_state)?;

        Ok(())
    }

    ///
    /// Initialize TDX for this CPU
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
            .map_err(cpu::HypervisorCpuError::InitializeTdx)
    }
}

/// Device struct for KVM
pub struct KvmDevice {
    fd: DeviceFd,
}

impl device::Device for KvmDevice {
    ///
    /// Set device attribute
    ///
    fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
        self.fd
            .set_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
    }
    ///
    /// Get device attribute
    ///
    fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
        self.fd
            .get_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
    }
}

impl AsRawFd for KvmDevice {
    fn as_raw_fd(&self) -> RawFd {
        self.fd.as_raw_fd()
    }
}
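
#[cfg(test)]
mod tests {
    // A minimal smoke-test sketch, not part of the original module. It
    // assumes the host running the tests exposes /dev/kvm, so it is
    // illustrative rather than exhaustive.
    use super::*;
    use crate::hypervisor::Hypervisor as _;

    #[test]
    fn test_create_vm_smoke() {
        let hv = KvmHypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        // Irqfd support is one of the extensions the VMM relies on.
        assert!(vm.check_extension(Cap::Irqfd));
    }
}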