// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, is_system_register, VcpuInit, VcpuKvmState as CpuState,
    MPIDR_EL1,
};
use crate::cpu;
use crate::device;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, VmmOps};
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use serde_derive::{Deserialize, Serialize};
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
use std::os::unix::io::{AsRawFd, RawFd};
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
#[cfg(target_arch = "x86_64")]
use vm_memory::Address;
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::NUM_IOAPIC_PINS;
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::{
    check_required_kvm_extensions, FpuState, SpecialRegisters, StandardRegisters, KVM_TSS_ADDRESS,
};
#[cfg(target_arch = "x86_64")]
pub use x86_64::{
    CpuId, CpuIdEntry, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState as CpuState,
    Xsave, CPUID_FLAG_VALID_INDEX,
};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing, kvm_irq_routing_entry,
    kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
    KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
    KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_clock_data as ClockData, kvm_bindings::kvm_create_device as CreateDevice,
    kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_irq_routing_entry as IrqRoutingEntry, kvm_bindings::kvm_mp_state as MpState,
    kvm_bindings::kvm_userspace_memory_region as MemoryRegion,
    kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd, kvm_ioctls::IoEventAddress,
    kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);
#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    #[allow(dead_code)]
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[derive(Clone, Copy, Debug, PartialEq, Deserialize, Serialize)]
pub struct KvmVmState {}

pub use KvmVmState as VmState;

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    state: KvmVmState,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

///
/// Implementation of Vm trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// vm.set/get().unwrap()
///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a new KVM vCPU and returns it as a `cpu::Vcpu` trait object.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vmmops: Option<Arc<dyn VmmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vmmops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
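    // A minimal usage sketch (not part of the trait): wiring an `EventFd` to
    // GSI 4 via `register_irqfd` above, so that writes to the eventfd inject
    // the corresponding guest interrupt. Assumes a `vm` created through
    // `hypervisor.create_vm()` with an irqchip already set up.
    //
    //     let irq_evt = EventFd::new(libc::EFD_NONBLOCK).unwrap();
    //     vm.register_irqfd(&irq_evt, 4).unwrap();
    //     // ... a device signals irq_evt to raise the interrupt ...
    //     vm.unregister_irqfd(&irq_evt, 4).unwrap();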
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }
    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;

        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> MemoryRegion {
        MemoryRegion {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
    }
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
        let mut region = user_memory_region;

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
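    // A minimal usage sketch, assuming `gpa`, `size` and `host_addr` describe
    // a guest RAM range backed by a host mapping: build the region descriptor
    // with dirty-page logging requested, then register it. The actual
    // `KVM_MEM_LOG_DIRTY_PAGES` flag is only applied later by
    // `start_dirty_log` (see above).
    //
    //     let region = vm.make_user_memory_region(0, gpa, size, host_addr, false, true);
    //     vm.create_user_memory_region(region).unwrap();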
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
        let mut region = user_memory_region;

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
        let fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        let device = KvmDevice { fd };
        Ok(Arc::new(device))
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on the underlying host.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Set TSS
        self.fd
            .set_tss_address(KVM_TSS_ADDRESS.raw_value() as usize)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        // Create split irqchip
        // Only the local APIC is emulated in kernel, both PICs and IOAPIC
        // are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        self.fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        self.fd
            .set_clock(data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
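    // A minimal usage sketch: VFIO passthrough relies on a kernel device of
    // type `KVM_DEV_TYPE_VFIO`; the returned device is what the VFIO
    // infrastructure gets attached to afterwards.
    //
    //     let kvm_vfio_dev = vm.create_passthrough_device().unwrap();
    //     // ... hand kvm_vfio_dev over to the VFIO group/container setup ...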
    ///
    /// Get the Vm state. Return VM specific data
    ///
    fn state(&self) -> vm::Result<VmState> {
        Ok(self.state)
    }
    ///
    /// Set the VM state
    ///
    fn set_state(&self, _state: VmState) -> vm::Result<()> {
        Ok(())
    }

    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = MemoryRegion {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = MemoryRegion {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &CpuId, max_vcpus: u32) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            tsc_khz: u32,
            attributes: u64,
            cpuid: u64,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            reserved: [u64; 43],
        }
        let data = TdxInitVm {
            max_vcpus,
            tsc_khz: 0,
            attributes: 1, // TDX1_TD_ATTRIBUTE_DEBUG,
            cpuid: cpuid.as_fam_struct_ptr() as u64,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            reserved: [0; 43],
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            if measure { 1 } else { 0 },
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
}
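// A minimal sketch of the TDX bring-up order suggested by the calls above
// (requires the "tdx" feature and a TDX-capable host; `cpuid`, `hob_address`,
// `host_addr`, `guest_addr` and `size` are placeholders):
//
//     vm.tdx_init(&cpuid, 1).unwrap();                              // INIT_VM
//     vcpu.tdx_init(hob_address).unwrap();                          // INIT_VCPU
//     vm.tdx_init_memory_region(host_addr, guest_addr, size, true)  // INIT_MEM_REGION
//         .unwrap();
//     vm.tdx_finalize().unwrap(); // no further measured changes after this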
#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}
/// Enum for KVM-related errors
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    pub fn new() -> hypervisor::Result<KvmHypervisor> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(KvmHypervisor { kvm: kvm_obj })
    }
}
/// Implementation of Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap(); // 0 is the default platform type
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, the ioctl was
                        // interrupted and must be retried; this is not a
                        // regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs = MsrEntries::new(num_msrs).unwrap();
            let indices = msr_list.as_slice();
            let msr_entries = msrs.as_mut_slice();
            for (pos, index) in indices.iter().enumerate() {
                msr_entries[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                state: VmState {},
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                state: VmState {},
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap()
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<CpuId> {
        self.kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by KVM.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
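    // A minimal sketch of how the MSR list is typically consumed (it mirrors
    // the logic in `create_vm_with_type` above): pre-populate an `MsrEntries`
    // array with every index KVM reports, so a later `get_msrs` call on a
    // vCPU can fill in the values. Assumes `hypervisor` is a `KvmHypervisor`.
    //
    //     let msr_list = hypervisor.get_msr_list().unwrap();
    //     let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
    //     let mut msrs = MsrEntries::new(num_msrs).unwrap();
    //     for (pos, index) in msr_list.as_slice().iter().enumerate() {
    //         msrs.as_mut_slice()[pos].index = *index;
    //     }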
    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    vmmops: Option<Arc<dyn vm::VmmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of Vcpu trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// vcpu.get/set().unwrap()
///
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        self.fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        self.fd
            .set_regs(regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        self.fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        self.fd
            .set_sregs(sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        self.fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        self.fd
            .set_fpu(fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &CpuId) -> cpu::Result<()> {
        self.fd
            .set_cpuid2(cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable HyperV SynIC
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
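    // A minimal usage sketch: enabling the Hyper-V synthetic interrupt
    // controller on a vCPU (x86_64 only). The flag stored above makes the
    // later `state()` call also save the SynIC MSRs. Assumes the VM's
    // irqchip has been configured first, e.g. via `enable_split_irq`.
    //
    //     vm.enable_split_irq().unwrap();
    //     let vcpu = vm.create_vcpu(0, None).unwrap();
    //     vcpu.enable_hyperv_synic().unwrap();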
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<CpuId> {
        self.fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        self.fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        self.fd
            .set_lapic(klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
    fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> {
        self.fd
            .get_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set up the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> {
        self.fd
            .set_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
    fn get_mp_state(&self) -> cpu::Result<MpState> {
        self.fd
            .get_mp_state()
            .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))
    }
    ///
    /// Sets the vcpu's current "multiprocessing state".
    ///
    fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
        self.fd
            .set_mp_state(mp_state)
            .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xsave struct".
    ///
    fn get_xsave(&self) -> cpu::Result<Xsave> {
        self.fd
            .get_xsave()
            .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xsave struct".
    ///
    fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
        self.fd
            .set_xsave(xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }
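    // A minimal usage sketch: `get_msrs` only fills in values for indices
    // already present in the passed entries, so they must be pre-populated.
    // The index 0x174 (IA32_SYSENTER_CS) is purely illustrative.
    //
    //     let mut msrs = MsrEntries::from_entries(&[kvm_msr_entry {
    //         index: 0x174,
    //         ..Default::default()
    //     }])
    //     .unwrap();
    //     let nread = vcpu.get_msrs(&mut msrs).unwrap();
    //     assert_eq!(nread, 1);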
    ///
    /// Runs the current virtual CPU, returning an exit reason.
    ///
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioRead(addr, data))
                }
                VcpuExit::MmioWrite(addr, data) => {
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioWrite(addr, data))
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs as well as related
    /// states of the vcpu.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs as well as related states
    /// of the vcpu.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
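    // A minimal sketch of a vCPU run loop built on `run()`. When `vmmops` is
    // provided, PIO/MMIO exits are already handled internally and surface as
    // `VmExit::Ignore`, so the loop mostly reacts to lifecycle events:
    //
    //     loop {
    //         match vcpu.run().unwrap() {
    //             cpu::VmExit::Reset | cpu::VmExit::Shutdown => break,
    //             cpu::VmExit::Ignore => continue,
    //             _ => { /* forward remaining exits to device emulation */ }
    //         }
    //     }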
    #[cfg(target_arch = "x86_64")]
    ///
    /// Let the guest know that it has been paused, which prevents potential
    /// soft lockups when it is resumed.
    ///
    fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
        self.fd
            .kvmclock_ctrl()
            .map_err(|e| cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()))
    }
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
        self.fd
            .vcpu_init(kvi)
            .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
    }
    ///
    /// Sets the value of one register for this vCPU.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
        self.fd
            .set_one_reg(reg_id, data)
            .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
    }
    ///
    /// Gets the value of one register for this vCPU.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(reg_id)
            .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
    }
    ///
    /// Gets a list of the guest registers that are supported for the
    /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
        self.fd
            .get_reg_list(reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
    }
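    // A minimal sketch: reading a single AArch64 core register (here the PC)
    // through `get_reg`, using the same register-id encoding that
    // `core_registers` below relies on:
    //
    //     let off = offset__of!(user_pt_regs, pc);
    //     let pc = vcpu
    //         .get_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
    //         .unwrap();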
    ///
    /// Save the state of the core registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn core_registers(&self, state: &mut StandardRegisters) -> cpu::Result<()> {
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These actually are the general-purpose registers of the Armv8-a
        // architecture (i.e. x0-x30 when used as 64-bit registers, or w0-w30
        // when used as 32-bit registers).
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "Other register" section of the ARMv8-a architecture.
        // First one, stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Next is the processor state.
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The stack pointer associated with EL1
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Exception Link Register for EL1: when taking an exception to EL1,
        // this register holds the address to return to afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Saved Program Status Registers, there are 5 of them used in the kernel.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to floating point registers which are stored in the user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .into();
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;
        Ok(())
    }
    ///
    /// Restore the state of the core registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_core_registers(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // The function follows the exact same order as `core_registers`. Look
        // there for some additional info on the registers.
        let mut off = offset__of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i] as u64,
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }
    ///
    /// Save the state of the system registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn system_registers(&self, state: &mut Vec<Register>) -> cpu::Result<()> {
        // Call KVM_GET_REG_LIST to get all registers available to the guest.
        // For ArmV8 there are around 500 registers.
        let mut reg_list = RegList::new(500).unwrap();
        self.fd
            .get_reg_list(&mut reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;

        // At this point reg_list should contain: core registers and system
        // registers. The register list contains the number of registers and
        // their ids. We will need to call KVM_GET_ONE_REG on each id in order
        // to save all of them. We carve out from the list the core registers
        // which are represented in the kernel by the kvm_regs structure and
        // for which we can calculate the id based on the offset in the
        // structure.
        reg_list.retain(|regid| is_system_register(*regid));

        // Now, for the rest of the registers left in the previously fetched
        // register list, we are simply calling KVM_GET_ONE_REG.
        let indices = reg_list.as_slice();
        for index in indices.iter() {
            state.push(kvm_bindings::kvm_one_reg {
                id: *index,
                addr: self
                    .fd
                    .get_one_reg(*index)
                    .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
            });
        }

        Ok(())
    }
    ///
    /// Restore the state of the system registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_system_registers(&self, state: &[Register]) -> cpu::Result<()> {
        for reg in state {
            self.fd
                .set_one_reg(reg.id, reg.addr)
                .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
        }
        Ok(())
    }
    ///
    /// Read the MPIDR - Multiprocessor Affinity Register.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn read_mpidr(&self) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(MPIDR_EL1)
            .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
    }
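    // A minimal sketch: saving and later restoring the system registers, as
    // done across a snapshot/restore cycle (AArch64 only):
    //
    //     let mut sys_regs = Vec::new();
    //     vcpu.system_registers(&mut sys_regs).unwrap();
    //     let mpidr = vcpu.read_mpidr().unwrap();
    //     // ... later, on the restored vCPU ...
    //     vcpu.set_system_registers(&sys_regs).unwrap();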
    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before almost everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change state of LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. The code looks as
    /// if it might as well be affected by internal state modifications of the
    /// GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?;
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fall back on a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4, 0x400000b5,
                0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr).unwrap();
            }
        }

        let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp =
                MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries.as_slice()[faulty_msr_index].index
                );

                let start_pos = faulty_msr_index + 1;
                let mut sub_msr_entries =
                    MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                for i in 0..num_msrs {
                    msr_entries_tmp
                        .push(sub_msr_entries.as_slice()[i])
                        .map_err(|e| {
                            cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
                                "Failed adding MSR entries: {:?}",
                                e
                            ))
                        })?;
                }

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(CpuState {
            cpuid,
            msrs,
            vcpu_events,
            regs,
            sregs,
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        })
    }
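    // Worked example of the chunked fallback above, with hypothetical
    // numbers: given 10 MSRs where entry 3 is unsupported, GET_MSRS returns
    // 3, so entries [0..3) are kept, entry 3 is skipped, and the next chunk
    // starts at entry 4. If entries [4..10) then read back completely, the
    // loop exits with 9 of the 10 MSRs saved.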
    ///
    /// Get the current AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        let mut state = CpuState {
            mp_state: self.get_mp_state()?,
            mpidr: self.read_mpidr()?,
            ..Default::default()
        };
        self.core_registers(&mut state.core_regs)?;
        self.system_registers(&mut state.sys_regs)?;

        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores it.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully, when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state)?;
        self.set_regs(&state.regs)?;
        self.set_sregs(&state.sregs)?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS is different from the
        // expected amount, we fall back on a slower method by setting MSRs
        // by chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs.as_slice()[faulty_msr_index].index
                );

                let start_pos = faulty_msr_index + 1;
                let sub_msr_entries =
                    MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
    ///
    /// Restore the previously saved AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        self.set_core_registers(&state.core_regs)?;
        self.set_system_registers(&state.sys_regs)?;
        self.set_mp_state(state.mp_state)?;

        Ok(())
    }

    ///
    /// Initialize TDX for this CPU
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
            .map_err(cpu::HypervisorCpuError::InitializeTdx)
    }
}

/// Device struct for KVM
pub struct KvmDevice {
    fd: DeviceFd,
}
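// A minimal sketch, with hypothetical group/attr values: pointer-based
// device attributes pass the address of the payload through `addr`.
//
//     let val: u64 = 1;
//     let attr = DeviceAttr {
//         group: 0,                        // hypothetical attribute group
//         attr: 0,                         // hypothetical attribute id
//         addr: &val as *const u64 as u64, // pointer to the payload
//         ..Default::default()
//     };
//     device.set_device_attr(&attr).unwrap();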
impl device::Device for KvmDevice {
    ///
    /// Set device attribute
    ///
    fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
        self.fd
            .set_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
    }
    ///
    /// Get device attribute
    ///
    fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
        self.fd
            .get_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
    }
}

impl AsRawFd for KvmDevice {
    fn as_raw_fd(&self) -> RawFd {
        self.fd.as_raw_fd()
    }
}