// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, is_system_register, VcpuInit, VcpuKvmState as CpuState,
    MPIDR_EL1,
};
use crate::cpu;
use crate::device;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, VmmOps};
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use serde_derive::{Deserialize, Serialize};
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
use std::os::unix::io::{AsRawFd, RawFd};
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::NUM_IOAPIC_PINS;
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::{check_required_kvm_extensions, FpuState, SpecialRegisters, StandardRegisters};
#[cfg(target_arch = "x86_64")]
pub use x86_64::{
    CpuId, CpuIdEntry, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState as CpuState,
    Xsave, CPUID_FLAG_VALID_INDEX,
};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing, kvm_irq_routing_entry,
    kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
    KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
    KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_clock_data as ClockData, kvm_bindings::kvm_create_device as CreateDevice,
    kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_irq_routing_entry as IrqRoutingEntry, kvm_bindings::kvm_mp_state as MpState,
    kvm_bindings::kvm_userspace_memory_region as MemoryRegion,
    kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd, kvm_ioctls::IoEventAddress,
    kvm_ioctls::VcpuExit,
};

// Raw KVM capability number for KVM_CAP_SGX_ATTRIBUTE, hard-coded because it
// is not exported by the kvm-bindings version in use — TODO confirm it still
// matches the kernel's uapi headers when updating kvm-bindings.
#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

// Ioctl number for KVM_MEMORY_ENCRYPT_OP (0xba on the KVMIO group), used by
// `tdx_command` below to issue TDX sub-commands.
#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

/// Sub-command selector passed to the KVM_MEMORY_ENCRYPT_OP ioctl when
/// driving TDX (see `tdx_command`). The discriminant values are part of the
/// kernel ABI, hence `#[repr(u32)]` and the explicit `= 0` start.
#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    #[allow(dead_code)]
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

/// VM-scoped snapshot state for KVM. Currently empty: KVM keeps no
/// VM-level state that needs to be saved/restored by this layer.
#[derive(Clone, Copy, Debug, PartialEq, Deserialize, Serialize)]
pub struct KvmVmState {}

pub use KvmVmState as VmState;

/// Bookkeeping for a guest memory slot that requested dirty-page logging.
/// Recorded in `create_user_memory_region` and replayed by
/// `start_dirty_log`/`stop_dirty_log` to toggle KVM_MEM_LOG_DIRTY_PAGES.
struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
119 pub struct KvmVm { 120 fd: Arc<VmFd>, 121 #[cfg(target_arch = "x86_64")] 122 msrs: MsrEntries, 123 state: KvmVmState, 124 dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>, 125 } 126 127 /// 128 /// Implementation of Vm trait for KVM 129 /// Example: 130 /// #[cfg(feature = "kvm")] 131 /// extern crate hypervisor 132 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap(); 133 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm); 134 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed"); 135 /// vm.set/get().unwrap() 136 /// 137 impl vm::Vm for KvmVm { 138 #[cfg(target_arch = "x86_64")] 139 /// 140 /// Sets the address of the three-page region in the VM's address space. 141 /// 142 fn set_tss_address(&self, offset: usize) -> vm::Result<()> { 143 self.fd 144 .set_tss_address(offset) 145 .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into())) 146 } 147 /// 148 /// Creates an in-kernel interrupt controller. 149 /// 150 fn create_irq_chip(&self) -> vm::Result<()> { 151 self.fd 152 .create_irq_chip() 153 .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into())) 154 } 155 /// 156 /// Registers an event that will, when signaled, trigger the `gsi` IRQ. 157 /// 158 fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> { 159 self.fd 160 .register_irqfd(fd, gsi) 161 .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into())) 162 } 163 /// 164 /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ. 165 /// 166 fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> { 167 self.fd 168 .unregister_irqfd(fd, gsi) 169 .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into())) 170 } 171 /// 172 /// Creates a VcpuFd object from a vcpu RawFd. 
173 /// 174 fn create_vcpu( 175 &self, 176 id: u8, 177 vmmops: Option<Arc<dyn VmmOps>>, 178 ) -> vm::Result<Arc<dyn cpu::Vcpu>> { 179 let vc = self 180 .fd 181 .create_vcpu(id as u64) 182 .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?; 183 let vcpu = KvmVcpu { 184 fd: vc, 185 #[cfg(target_arch = "x86_64")] 186 msrs: self.msrs.clone(), 187 vmmops, 188 #[cfg(target_arch = "x86_64")] 189 hyperv_synic: AtomicBool::new(false), 190 }; 191 Ok(Arc::new(vcpu)) 192 } 193 /// 194 /// Registers an event to be signaled whenever a certain address is written to. 195 /// 196 fn register_ioevent( 197 &self, 198 fd: &EventFd, 199 addr: &IoEventAddress, 200 datamatch: Option<vm::DataMatch>, 201 ) -> vm::Result<()> { 202 if let Some(dm) = datamatch { 203 match dm { 204 vm::DataMatch::DataMatch32(kvm_dm32) => self 205 .fd 206 .register_ioevent(fd, addr, kvm_dm32) 207 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())), 208 vm::DataMatch::DataMatch64(kvm_dm64) => self 209 .fd 210 .register_ioevent(fd, addr, kvm_dm64) 211 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())), 212 } 213 } else { 214 self.fd 215 .register_ioevent(fd, addr, NoDatamatch) 216 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())) 217 } 218 } 219 /// 220 /// Unregisters an event from a certain address it has been previously registered to. 221 /// 222 fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> { 223 self.fd 224 .unregister_ioevent(fd, addr, NoDatamatch) 225 .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into())) 226 } 227 /// 228 /// Sets the GSI routing table entries, overwriting any previously set 229 /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl. 
230 /// 231 fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> { 232 let mut irq_routing = 233 vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len()); 234 irq_routing[0].nr = entries.len() as u32; 235 irq_routing[0].flags = 0; 236 237 // SAFETY: irq_routing initialized with entries.len() and now it is being turned into 238 // entries_slice with entries.len() again. It is guaranteed to be large enough to hold 239 // everything from entries. 240 unsafe { 241 let entries_slice: &mut [kvm_irq_routing_entry] = 242 irq_routing[0].entries.as_mut_slice(entries.len()); 243 entries_slice.copy_from_slice(entries); 244 } 245 246 self.fd 247 .set_gsi_routing(&irq_routing[0]) 248 .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into())) 249 } 250 /// 251 /// Creates a memory region structure that can be used with {create/remove}_user_memory_region 252 /// 253 fn make_user_memory_region( 254 &self, 255 slot: u32, 256 guest_phys_addr: u64, 257 memory_size: u64, 258 userspace_addr: u64, 259 readonly: bool, 260 log_dirty_pages: bool, 261 ) -> MemoryRegion { 262 MemoryRegion { 263 slot, 264 guest_phys_addr, 265 memory_size, 266 userspace_addr, 267 flags: if readonly { KVM_MEM_READONLY } else { 0 } 268 | if log_dirty_pages { 269 KVM_MEM_LOG_DIRTY_PAGES 270 } else { 271 0 272 }, 273 } 274 } 275 /// 276 /// Creates a guest physical memory region. 277 /// 278 fn create_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> { 279 let mut region = user_memory_region; 280 281 if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 { 282 if (region.flags & KVM_MEM_READONLY) != 0 { 283 return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!( 284 "Error creating regions with both 'dirty-pages-log' and 'read-only'." 
285 ))); 286 } 287 288 // Keep track of the regions that need dirty pages log 289 self.dirty_log_slots.write().unwrap().insert( 290 region.slot, 291 KvmDirtyLogSlot { 292 slot: region.slot, 293 guest_phys_addr: region.guest_phys_addr, 294 memory_size: region.memory_size, 295 userspace_addr: region.userspace_addr, 296 }, 297 ); 298 299 // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`. 300 // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`. 301 region.flags = 0; 302 } 303 304 // SAFETY: Safe because guest regions are guaranteed not to overlap. 305 unsafe { 306 self.fd 307 .set_user_memory_region(region) 308 .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into())) 309 } 310 } 311 /// 312 /// Removes a guest physical memory region. 313 /// 314 fn remove_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> { 315 let mut region = user_memory_region; 316 317 // Remove the corresponding entry from "self.dirty_log_slots" if needed 318 self.dirty_log_slots.write().unwrap().remove(®ion.slot); 319 320 // Setting the size to 0 means "remove" 321 region.memory_size = 0; 322 // SAFETY: Safe because guest regions are guaranteed not to overlap. 323 unsafe { 324 self.fd 325 .set_user_memory_region(region) 326 .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into())) 327 } 328 } 329 /// 330 /// Creates an emulated device in the kernel. 331 /// 332 /// See the documentation for `KVM_CREATE_DEVICE`. 333 fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> { 334 let fd = self 335 .fd 336 .create_device(device) 337 .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?; 338 let device = KvmDevice { fd }; 339 Ok(Arc::new(device)) 340 } 341 /// 342 /// Returns the preferred CPU target type which can be emulated by KVM on underlying host. 
343 /// 344 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] 345 fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> { 346 self.fd 347 .get_preferred_target(kvi) 348 .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into())) 349 } 350 #[cfg(target_arch = "x86_64")] 351 fn enable_split_irq(&self) -> vm::Result<()> { 352 // Create split irqchip 353 // Only the local APIC is emulated in kernel, both PICs and IOAPIC 354 // are not. 355 let mut cap = kvm_enable_cap { 356 cap: KVM_CAP_SPLIT_IRQCHIP, 357 ..Default::default() 358 }; 359 cap.args[0] = NUM_IOAPIC_PINS as u64; 360 self.fd 361 .enable_cap(&cap) 362 .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?; 363 Ok(()) 364 } 365 #[cfg(target_arch = "x86_64")] 366 fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> { 367 let mut cap = kvm_enable_cap { 368 cap: KVM_CAP_SGX_ATTRIBUTE, 369 ..Default::default() 370 }; 371 cap.args[0] = file.as_raw_fd() as u64; 372 self.fd 373 .enable_cap(&cap) 374 .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?; 375 Ok(()) 376 } 377 /// Retrieve guest clock. 378 #[cfg(target_arch = "x86_64")] 379 fn get_clock(&self) -> vm::Result<ClockData> { 380 self.fd 381 .get_clock() 382 .map_err(|e| vm::HypervisorVmError::GetClock(e.into())) 383 } 384 /// Set guest clock. 385 #[cfg(target_arch = "x86_64")] 386 fn set_clock(&self, data: &ClockData) -> vm::Result<()> { 387 self.fd 388 .set_clock(data) 389 .map_err(|e| vm::HypervisorVmError::SetClock(e.into())) 390 } 391 /// Checks if a particular `Cap` is available. 
392 fn check_extension(&self, c: Cap) -> bool { 393 self.fd.check_extension(c) 394 } 395 /// Create a device that is used for passthrough 396 fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> { 397 let mut vfio_dev = kvm_create_device { 398 type_: kvm_device_type_KVM_DEV_TYPE_VFIO, 399 fd: 0, 400 flags: 0, 401 }; 402 403 self.create_device(&mut vfio_dev) 404 .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into())) 405 } 406 /// 407 /// Get the Vm state. Return VM specific data 408 /// 409 fn state(&self) -> vm::Result<VmState> { 410 Ok(self.state) 411 } 412 /// 413 /// Set the VM state 414 /// 415 fn set_state(&self, _state: VmState) -> vm::Result<()> { 416 Ok(()) 417 } 418 419 /// 420 /// Start logging dirty pages 421 /// 422 fn start_dirty_log(&self) -> vm::Result<()> { 423 let dirty_log_slots = self.dirty_log_slots.read().unwrap(); 424 for (_, s) in dirty_log_slots.iter() { 425 let region = MemoryRegion { 426 slot: s.slot, 427 guest_phys_addr: s.guest_phys_addr, 428 memory_size: s.memory_size, 429 userspace_addr: s.userspace_addr, 430 flags: KVM_MEM_LOG_DIRTY_PAGES, 431 }; 432 // SAFETY: Safe because guest regions are guaranteed not to overlap. 433 unsafe { 434 self.fd 435 .set_user_memory_region(region) 436 .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?; 437 } 438 } 439 440 Ok(()) 441 } 442 443 /// 444 /// Stop logging dirty pages 445 /// 446 fn stop_dirty_log(&self) -> vm::Result<()> { 447 let dirty_log_slots = self.dirty_log_slots.read().unwrap(); 448 for (_, s) in dirty_log_slots.iter() { 449 let region = MemoryRegion { 450 slot: s.slot, 451 guest_phys_addr: s.guest_phys_addr, 452 memory_size: s.memory_size, 453 userspace_addr: s.userspace_addr, 454 flags: 0, 455 }; 456 // SAFETY: Safe because guest regions are guaranteed not to overlap. 
457 unsafe { 458 self.fd 459 .set_user_memory_region(region) 460 .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?; 461 } 462 } 463 464 Ok(()) 465 } 466 467 /// 468 /// Get dirty pages bitmap (one bit per page) 469 /// 470 fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> { 471 self.fd 472 .get_dirty_log(slot, memory_size as usize) 473 .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into())) 474 } 475 476 /// 477 /// Initialize TDX for this VM 478 /// 479 #[cfg(feature = "tdx")] 480 fn tdx_init(&self, cpuid: &CpuId, max_vcpus: u32) -> vm::Result<()> { 481 #[repr(C)] 482 struct TdxInitVm { 483 max_vcpus: u32, 484 tsc_khz: u32, 485 attributes: u64, 486 cpuid: u64, 487 mrconfigid: [u64; 6], 488 mrowner: [u64; 6], 489 mrownerconfig: [u64; 6], 490 reserved: [u64; 43], 491 } 492 let data = TdxInitVm { 493 max_vcpus, 494 tsc_khz: 0, 495 attributes: 1, // TDX1_TD_ATTRIBUTE_DEBUG, 496 cpuid: cpuid.as_fam_struct_ptr() as u64, 497 mrconfigid: [0; 6], 498 mrowner: [0; 6], 499 mrownerconfig: [0; 6], 500 reserved: [0; 43], 501 }; 502 503 tdx_command( 504 &self.fd.as_raw_fd(), 505 TdxCommand::InitVm, 506 0, 507 &data as *const _ as u64, 508 ) 509 .map_err(vm::HypervisorVmError::InitializeTdx) 510 } 511 512 /// 513 /// Finalize the TDX setup for this VM 514 /// 515 #[cfg(feature = "tdx")] 516 fn tdx_finalize(&self) -> vm::Result<()> { 517 tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0) 518 .map_err(vm::HypervisorVmError::FinalizeTdx) 519 } 520 521 /// 522 /// Initialize memory regions for the TDX VM 523 /// 524 #[cfg(feature = "tdx")] 525 fn tdx_init_memory_region( 526 &self, 527 host_address: u64, 528 guest_address: u64, 529 size: u64, 530 measure: bool, 531 ) -> vm::Result<()> { 532 #[repr(C)] 533 struct TdxInitMemRegion { 534 host_address: u64, 535 guest_address: u64, 536 pages: u64, 537 } 538 let data = TdxInitMemRegion { 539 host_address, 540 guest_address, 541 pages: size / 4096, 542 }; 543 544 
tdx_command( 545 &self.fd.as_raw_fd(), 546 TdxCommand::InitMemRegion, 547 if measure { 1 } else { 0 }, 548 &data as *const _ as u64, 549 ) 550 .map_err(vm::HypervisorVmError::InitMemRegionTdx) 551 } 552 } 553 554 #[cfg(feature = "tdx")] 555 fn tdx_command( 556 fd: &RawFd, 557 command: TdxCommand, 558 metadata: u32, 559 data: u64, 560 ) -> std::result::Result<(), std::io::Error> { 561 #[repr(C)] 562 struct TdxIoctlCmd { 563 command: TdxCommand, 564 metadata: u32, 565 data: u64, 566 } 567 let cmd = TdxIoctlCmd { 568 command, 569 metadata, 570 data, 571 }; 572 // SAFETY: FFI call. All input parameters are valid. 573 let ret = unsafe { 574 ioctl_with_val( 575 fd, 576 KVM_MEMORY_ENCRYPT_OP(), 577 &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong, 578 ) 579 }; 580 581 if ret < 0 { 582 return Err(std::io::Error::last_os_error()); 583 } 584 Ok(()) 585 } 586 587 /// Wrapper over KVM system ioctls. 588 pub struct KvmHypervisor { 589 kvm: Kvm, 590 } 591 /// Enum for KVM related error 592 #[derive(Debug, Error)] 593 pub enum KvmError { 594 #[error("Capability missing: {0:?}")] 595 CapabilityMissing(Cap), 596 } 597 pub type KvmResult<T> = result::Result<T, KvmError>; 598 impl KvmHypervisor { 599 /// Create a hypervisor based on Kvm 600 pub fn new() -> hypervisor::Result<KvmHypervisor> { 601 let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?; 602 let api_version = kvm_obj.get_api_version(); 603 604 if api_version != kvm_bindings::KVM_API_VERSION as i32 { 605 return Err(hypervisor::HypervisorError::IncompatibleApiVersion); 606 } 607 608 Ok(KvmHypervisor { kvm: kvm_obj }) 609 } 610 } 611 /// Implementation of Hypervisor trait for KVM 612 /// Example: 613 /// #[cfg(feature = "kvm")] 614 /// extern crate hypervisor 615 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap(); 616 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm); 617 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed"); 618 /// 619 impl 
hypervisor::Hypervisor for KvmHypervisor { 620 /// Create a KVM vm object of a specific VM type and return the object as Vm trait object 621 /// Example 622 /// # extern crate hypervisor; 623 /// # use hypervisor::KvmHypervisor; 624 /// use hypervisor::KvmVm; 625 /// let hypervisor = KvmHypervisor::new().unwrap(); 626 /// let vm = hypervisor.create_vm_with_type(KvmVmType::LegacyVm).unwrap() 627 /// 628 fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> { 629 let fd: VmFd; 630 loop { 631 match self.kvm.create_vm_with_type(vm_type) { 632 Ok(res) => fd = res, 633 Err(e) => { 634 if e.errno() == libc::EINTR { 635 // If the error returned is EINTR, which means the 636 // ioctl has been interrupted, we have to retry as 637 // this can't be considered as a regular error. 638 continue; 639 } else { 640 return Err(hypervisor::HypervisorError::VmCreate(e.into())); 641 } 642 } 643 } 644 break; 645 } 646 647 let vm_fd = Arc::new(fd); 648 649 #[cfg(target_arch = "x86_64")] 650 { 651 let msr_list = self.get_msr_list()?; 652 let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize; 653 let mut msrs = MsrEntries::new(num_msrs).unwrap(); 654 let indices = msr_list.as_slice(); 655 let msr_entries = msrs.as_mut_slice(); 656 for (pos, index) in indices.iter().enumerate() { 657 msr_entries[pos].index = *index; 658 } 659 660 Ok(Arc::new(KvmVm { 661 fd: vm_fd, 662 msrs, 663 state: VmState {}, 664 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())), 665 })) 666 } 667 668 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] 669 { 670 Ok(Arc::new(KvmVm { 671 fd: vm_fd, 672 state: VmState {}, 673 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())), 674 })) 675 } 676 } 677 678 /// Create a KVM vm object and return the object as Vm trait object 679 /// Example 680 /// # extern crate hypervisor; 681 /// # use hypervisor::KvmHypervisor; 682 /// use hypervisor::KvmVm; 683 /// let hypervisor = KvmHypervisor::new().unwrap(); 684 /// let vm = 
hypervisor.create_vm().unwrap() 685 /// 686 fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> { 687 #[allow(unused_mut)] 688 let mut vm_type: u64 = 0; // Create with default platform type 689 690 // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA 691 // size from the host and use that when creating the VM, which may 692 // avoid unnecessary VM creation failures. 693 #[cfg(target_arch = "aarch64")] 694 if self.kvm.check_extension(Cap::ArmVmIPASize) { 695 vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap(); 696 } 697 698 self.create_vm_with_type(vm_type) 699 } 700 701 fn check_required_extensions(&self) -> hypervisor::Result<()> { 702 check_required_kvm_extensions(&self.kvm) 703 .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into())) 704 } 705 706 #[cfg(target_arch = "x86_64")] 707 /// 708 /// X86 specific call to get the system supported CPUID values. 709 /// 710 fn get_cpuid(&self) -> hypervisor::Result<CpuId> { 711 self.kvm 712 .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES) 713 .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into())) 714 } 715 716 #[cfg(target_arch = "x86_64")] 717 /// 718 /// Retrieve the list of MSRs supported by KVM. 719 /// 720 fn get_msr_list(&self) -> hypervisor::Result<MsrList> { 721 self.kvm 722 .get_msr_index_list() 723 .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into())) 724 } 725 #[cfg(target_arch = "aarch64")] 726 /// 727 /// Retrieve AArch64 host maximum IPA size supported by KVM. 
728 /// 729 fn get_host_ipa_limit(&self) -> i32 { 730 self.kvm.get_host_ipa_limit() 731 } 732 } 733 /// Vcpu struct for KVM 734 pub struct KvmVcpu { 735 fd: VcpuFd, 736 #[cfg(target_arch = "x86_64")] 737 msrs: MsrEntries, 738 vmmops: Option<Arc<dyn vm::VmmOps>>, 739 #[cfg(target_arch = "x86_64")] 740 hyperv_synic: AtomicBool, 741 } 742 /// Implementation of Vcpu trait for KVM 743 /// Example: 744 /// #[cfg(feature = "kvm")] 745 /// extern crate hypervisor 746 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap(); 747 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm); 748 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed"); 749 /// let vcpu = vm.create_vcpu(0, None).unwrap(); 750 /// vcpu.get/set().unwrap() 751 /// 752 impl cpu::Vcpu for KvmVcpu { 753 #[cfg(target_arch = "x86_64")] 754 /// 755 /// Returns the vCPU general purpose registers. 756 /// 757 fn get_regs(&self) -> cpu::Result<StandardRegisters> { 758 self.fd 759 .get_regs() 760 .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into())) 761 } 762 #[cfg(target_arch = "x86_64")] 763 /// 764 /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl. 765 /// 766 fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> { 767 self.fd 768 .set_regs(regs) 769 .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into())) 770 } 771 #[cfg(target_arch = "x86_64")] 772 /// 773 /// Returns the vCPU special registers. 774 /// 775 fn get_sregs(&self) -> cpu::Result<SpecialRegisters> { 776 self.fd 777 .get_sregs() 778 .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into())) 779 } 780 #[cfg(target_arch = "x86_64")] 781 /// 782 /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl. 
783 /// 784 fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> { 785 self.fd 786 .set_sregs(sregs) 787 .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into())) 788 } 789 #[cfg(target_arch = "x86_64")] 790 /// 791 /// Returns the floating point state (FPU) from the vCPU. 792 /// 793 fn get_fpu(&self) -> cpu::Result<FpuState> { 794 self.fd 795 .get_fpu() 796 .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into())) 797 } 798 #[cfg(target_arch = "x86_64")] 799 /// 800 /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioct. 801 /// 802 fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> { 803 self.fd 804 .set_fpu(fpu) 805 .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into())) 806 } 807 #[cfg(target_arch = "x86_64")] 808 /// 809 /// X86 specific call to setup the CPUID registers. 810 /// 811 fn set_cpuid2(&self, cpuid: &CpuId) -> cpu::Result<()> { 812 self.fd 813 .set_cpuid2(cpuid) 814 .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into())) 815 } 816 #[cfg(target_arch = "x86_64")] 817 /// 818 /// X86 specific call to enable HyperV SynIC 819 /// 820 fn enable_hyperv_synic(&self) -> cpu::Result<()> { 821 // Update the information about Hyper-V SynIC being enabled and 822 // emulated as it will influence later which MSRs should be saved. 823 self.hyperv_synic.store(true, Ordering::Release); 824 825 let cap = kvm_enable_cap { 826 cap: KVM_CAP_HYPERV_SYNIC, 827 ..Default::default() 828 }; 829 self.fd 830 .enable_cap(&cap) 831 .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into())) 832 } 833 /// 834 /// X86 specific call to retrieve the CPUID registers. 
835 /// 836 #[cfg(target_arch = "x86_64")] 837 fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<CpuId> { 838 self.fd 839 .get_cpuid2(num_entries) 840 .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into())) 841 } 842 #[cfg(target_arch = "x86_64")] 843 /// 844 /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller). 845 /// 846 fn get_lapic(&self) -> cpu::Result<LapicState> { 847 self.fd 848 .get_lapic() 849 .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into())) 850 } 851 #[cfg(target_arch = "x86_64")] 852 /// 853 /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller). 854 /// 855 fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> { 856 self.fd 857 .set_lapic(klapic) 858 .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into())) 859 } 860 #[cfg(target_arch = "x86_64")] 861 /// 862 /// Returns the model-specific registers (MSR) for this vCPU. 863 /// 864 fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> { 865 self.fd 866 .get_msrs(msrs) 867 .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into())) 868 } 869 #[cfg(target_arch = "x86_64")] 870 /// 871 /// Setup the model-specific registers (MSR) for this vCPU. 872 /// Returns the number of MSR entries actually written. 873 /// 874 fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> { 875 self.fd 876 .set_msrs(msrs) 877 .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into())) 878 } 879 /// 880 /// Returns the vcpu's current "multiprocessing state". 881 /// 882 fn get_mp_state(&self) -> cpu::Result<MpState> { 883 self.fd 884 .get_mp_state() 885 .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into())) 886 } 887 /// 888 /// Sets the vcpu's current "multiprocessing state". 
889 /// 890 fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> { 891 self.fd 892 .set_mp_state(mp_state) 893 .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into())) 894 } 895 #[cfg(target_arch = "x86_64")] 896 /// 897 /// X86 specific call that returns the vcpu's current "xsave struct". 898 /// 899 fn get_xsave(&self) -> cpu::Result<Xsave> { 900 self.fd 901 .get_xsave() 902 .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into())) 903 } 904 #[cfg(target_arch = "x86_64")] 905 /// 906 /// X86 specific call that sets the vcpu's current "xsave struct". 907 /// 908 fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> { 909 self.fd 910 .set_xsave(xsave) 911 .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into())) 912 } 913 #[cfg(target_arch = "x86_64")] 914 /// 915 /// X86 specific call that returns the vcpu's current "xcrs". 916 /// 917 fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> { 918 self.fd 919 .get_xcrs() 920 .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into())) 921 } 922 #[cfg(target_arch = "x86_64")] 923 /// 924 /// X86 specific call that sets the vcpu's current "xcrs". 925 /// 926 fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> { 927 self.fd 928 .set_xcrs(xcrs) 929 .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into())) 930 } 931 /// 932 /// Triggers the running of the current virtual CPU returning an exit reason. 
    ///
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    // When VMM callbacks are present, let the VMM serve the
                    // PIO read directly and swallow the exit (Ignore).
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    // Same pattern as IoIn, but for PIO writes.
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On Aarch64, when the VM is shutdown, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    // MMIO accesses are handled by the VMM when callbacks are
                    // registered; otherwise they are surfaced to the caller.
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioRead(addr, data))
                }
                VcpuExit::MmioWrite(addr, data) => {
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioWrite(addr, data))
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),

                // Any other exit reason is unexpected at this layer and is
                // reported as an error.
                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            // EAGAIN/EINTR mean the KVM_RUN ioctl was interrupted or would
            // block; treat them as benign and let the caller re-enter the
            // guest.
            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs as well as related
    /// states of the vcpu.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs as well as related states
    /// of the vcpu.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Let the guest know that it has been paused, which prevents from
    /// potential soft lockups when being resumed.
    ///
    fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
        self.fd
            .kvmclock_ctrl()
            .map_err(|e| cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()))
    }
    // Initializes the vCPU with the given kvm_vcpu_init configuration
    // (KVM_ARM_VCPU_INIT).
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
        self.fd
            .vcpu_init(kvi)
            .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
    }
    ///
    /// Sets the value of one register for this vCPU.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
        self.fd
            .set_one_reg(reg_id, data)
            .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
    }
    ///
    /// Gets the value of one register for this vCPU.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(reg_id)
            .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
    }
    ///
    /// Gets a list of the guest registers that are supported for the
    /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
        self.fd
            .get_reg_list(reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
    }
    ///
    /// Save the state of the core registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn core_registers(&self, state: &mut StandardRegisters) -> cpu::Result<()> {
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These actually are the general-purpose registers of the Armv8-a
        // architecture (i.e x0-x30 if used as a 64bit register or w0-30 when used as a 32bit register).
        // Each register id is derived from its byte offset inside kvm_regs.
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "Other register" section of the ARMv8-a architecture.
        // First one, stack pointer.
1103 let off = offset__of!(user_pt_regs, sp); 1104 state.regs.sp = self 1105 .fd 1106 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1107 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?; 1108 1109 // Second one, the program counter. 1110 let off = offset__of!(user_pt_regs, pc); 1111 state.regs.pc = self 1112 .fd 1113 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1114 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?; 1115 1116 // Next is the processor state. 1117 let off = offset__of!(user_pt_regs, pstate); 1118 state.regs.pstate = self 1119 .fd 1120 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1121 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?; 1122 1123 // The stack pointer associated with EL1 1124 let off = offset__of!(kvm_regs, sp_el1); 1125 state.sp_el1 = self 1126 .fd 1127 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1128 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?; 1129 1130 // Exception Link Register for EL1, when taking an exception to EL1, this register 1131 // holds the address to which to return afterwards. 1132 let off = offset__of!(kvm_regs, elr_el1); 1133 state.elr_el1 = self 1134 .fd 1135 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1136 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?; 1137 1138 // Saved Program Status Registers, there are 5 of them used in the kernel. 
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to floating point registers which are stored in the user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        // The 32 FP-SIMD vector registers are 128 bits wide each.
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .into();
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;
        Ok(())
    }
    ///
    /// Restore the state of the core registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_core_registers(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // The function follows the exact identical order from `state`. Look there
        // for some additional info on registers.
        // General-purpose registers x0-x30, laid out consecutively in kvm_regs.
        let mut off = offset__of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Program counter.
        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Processor state.
        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Stack pointer associated with EL1.
        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Exception Link Register for EL1.
        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Saved Program Status Registers.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // FP-SIMD vector registers.
        // NOTE(review): set_one_reg() takes a u64 payload, so
        // `state.fp_regs.vregs[i] as u64` keeps only the low 64 bits of each
        // 128-bit vreg — confirm whether the upper half is intentionally dropped.
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i] as u64,
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off +=
                mem::size_of::<u128>();
        }

        // Floating-point Status Register.
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Floating-point Control Register.
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }
    ///
    /// Save the state of the system registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn system_registers(&self, state: &mut Vec<Register>) -> cpu::Result<()> {
        // Call KVM_GET_REG_LIST to get all registers available to the guest. For ArmV8 there are
        // around 500 registers.
        let mut reg_list = RegList::new(500).unwrap();
        self.fd
            .get_reg_list(&mut reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;

        // At this point reg_list should contain: core registers and system registers.
        // The register list contains the number of registers and their ids. We will be needing to
        // call KVM_GET_ONE_REG on each id in order to save all of them. We carve out from the list
        // the core registers which are represented in the kernel by kvm_regs structure and for which
        // we can calculate the id based on the offset in the structure.
        reg_list.retain(|regid| is_system_register(*regid));

        // Now, for the rest of the registers left in the previously fetched register list, we are
        // simply calling KVM_GET_ONE_REG.
1277 let indices = reg_list.as_slice(); 1278 for index in indices.iter() { 1279 state.push(kvm_bindings::kvm_one_reg { 1280 id: *index, 1281 addr: self 1282 .fd 1283 .get_one_reg(*index) 1284 .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?, 1285 }); 1286 } 1287 1288 Ok(()) 1289 } 1290 /// 1291 /// Restore the state of the system registers. 1292 /// 1293 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] 1294 fn set_system_registers(&self, state: &[Register]) -> cpu::Result<()> { 1295 for reg in state { 1296 self.fd 1297 .set_one_reg(reg.id, reg.addr) 1298 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?; 1299 } 1300 Ok(()) 1301 } 1302 /// 1303 /// Read the MPIDR - Multiprocessor Affinity Register. 1304 /// 1305 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] 1306 fn read_mpidr(&self) -> cpu::Result<u64> { 1307 self.fd 1308 .get_one_reg(MPIDR_EL1) 1309 .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into())) 1310 } 1311 #[cfg(target_arch = "x86_64")] 1312 /// 1313 /// Get the current CPU state 1314 /// 1315 /// Ordering requirements: 1316 /// 1317 /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify 1318 /// vCPU/LAPIC state. As such, it must be done before most everything 1319 /// else, otherwise we cannot restore everything and expect it to work. 1320 /// 1321 /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are 1322 /// still running. 1323 /// 1324 /// KVM_GET_LAPIC may change state of LAPIC before returning it. 1325 /// 1326 /// GET_VCPU_EVENTS should probably be last to save. The code looks as 1327 /// it might as well be affected by internal state modifications of the 1328 /// GET ioctls. 1329 /// 1330 /// SREGS saves/restores a pending interrupt, similar to what 1331 /// VCPU_EVENTS also does. 1332 /// 1333 /// GET_MSRS requires a pre-populated data structure to do something 1334 /// meaningful. For SET_MSRS it will then contain good data. 
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        // See the ordering requirements documented above: MP state first,
        // vcpu events last.
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?;
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fallback onto a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            // SynIC control/SIEFP/SIMP/EOM plus the 16 SINT registers and
            // the synthetic timer config/count registers.
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr).unwrap();
            }
        }

        let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            // GET_MSRS stops at the first unsupported MSR: `num_msrs` is the
            // index of the first faulty entry. Keep the good prefix, then skip
            // the faulty entry and retry on the remaining tail, repeatedly.
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp =
                MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries.as_slice()[faulty_msr_index].index
                );

                // Retry starting right after the faulty MSR.
                let start_pos = faulty_msr_index + 1;
                let mut sub_msr_entries =
                    MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                // Accumulate whatever chunk was successfully read.
                for i in 0..num_msrs {
                    msr_entries_tmp
                        .push(sub_msr_entries.as_slice()[i])
                        .map_err(|e| {
                            cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
                                "Failed adding MSR entries: {:?}",
                                e
                            ))
                        })?;
                }

                if num_msrs == expected_num_msrs {
                    break;
                }

                // Index of the next faulty MSR, relative to the full list.
                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(CpuState {
            cpuid,
            msrs,
            vcpu_events,
            regs,
            sregs,
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        })
    }
    ///
    /// Get the current AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        // MP state and MPIDR first, then the core and system registers are
        // filled in place.
        let mut state = CpuState {
            mp_state: self.get_mp_state()?,
            mpidr: self.read_mpidr()?,
            ..Default::default()
        };
        self.core_registers(&mut state.core_regs)?;
        self.system_registers(&mut state.sys_regs)?;

        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores it.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully, when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        // Restore in the order mandated by the requirements documented above.
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state)?;
        self.set_regs(&state.regs)?;
        self.set_sregs(&state.sregs)?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS is different from the
        // expected amount, we fallback onto a slower method by setting MSRs
        // by chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            // SET_MSRS stops at the first unsupported MSR: `num_msrs` is the
            // index of the first faulty entry. Skip it and retry on the rest
            // of the list, repeatedly, until the remaining tail goes through.
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs.as_slice()[faulty_msr_index].index
                );

                let start_pos = faulty_msr_index + 1;
                let sub_msr_entries =
                    MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == expected_num_msrs {
                    break;
                }

                // Index of the next faulty MSR, relative to the full list.
                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
    ///
    /// Restore the previously saved AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        self.set_core_registers(&state.core_regs)?;
        self.set_system_registers(&state.sys_regs)?;
        self.set_mp_state(state.mp_state)?;

        Ok(())
    }

    ///
    /// Initialize TDX for this CPU
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
            .map_err(cpu::HypervisorCpuError::InitializeTdx)
    }
}

/// Device struct for KVM
pub struct KvmDevice {
    // Wrapped KVM device file descriptor.
    fd: DeviceFd,
}

impl device::Device for KvmDevice {
    ///
    /// Set device attribute
    ///
    fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
        self.fd
            .set_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
    }
    ///
    /// Get device attribute
    ///
    fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
        self.fd
            .get_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
    }
}

// Expose the underlying KVM device fd so it can be handed to raw ioctls.
impl AsRawFd for KvmDevice {
    fn as_raw_fd(&self) -> RawFd {
        self.fd.as_raw_fd()
    }
}