// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState as CpuState, MPIDR_EL1,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::Vgic;
use crate::cpu;
use crate::device;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, InterruptSourceConfig, VmOps};
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
use std::os::unix::io::{AsRawFd, RawFd};
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
#[cfg(target_arch = "aarch64")]
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::NUM_IOAPIC_PINS;
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_guest_debug, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC,
    KVM_CAP_SPLIT_IRQCHIP, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::{check_required_kvm_extensions, FpuState, SpecialRegisters, StandardRegisters};
#[cfg(target_arch = "x86_64")]
pub use x86_64::{
    CpuId, CpuIdEntry, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState as CpuState,
    Xsave, CPUID_FLAG_VALID_INDEX,
};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing, kvm_irq_routing_entry,
    kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
    KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
    KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_clock_data as ClockData, kvm_bindings::kvm_create_device as CreateDevice,
    kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_irq_routing_entry as IrqRoutingEntry, kvm_bindings::kvm_mp_state as MpState,
    kvm_bindings::kvm_run, kvm_bindings::kvm_userspace_memory_region as MemoryRegion,
    kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd,
    kvm_ioctls::IoEventAddress, kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 35;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize)]
pub struct KvmVmState {}

pub use KvmVmState as VmState;

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    state: KvmVmState,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

///
/// Implementation of Vm trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// vm.set/get().unwrap()
///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
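    ///
    /// A minimal usage sketch (not compiled here; assumes a `vm` handle and an
    /// arbitrary GSI number):
    ///
    /// ```ignore
    /// let evt = vmm_sys_util::eventfd::EventFd::new(libc::EFD_NONBLOCK).unwrap();
    /// vm.register_irqfd(&evt, 30).unwrap();
    /// // Writing to `evt` now injects IRQ 30 into the guest.
    /// evt.write(1).unwrap();
    /// ```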
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(
        &self,
        vcpu_count: u64,
        dist_addr: u64,
        dist_size: u64,
        redist_size: u64,
        msi_size: u64,
        nr_irqs: u32,
    ) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(
            self,
            vcpu_count,
            dist_addr,
            dist_size,
            redist_size,
            msi_size,
            nr_irqs,
        )
        .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(
        &self,
        gsi: u32,
        config: &InterruptSourceConfig,
    ) -> kvm_irq_routing_entry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64 there is a limitation on the range of the
                    // 'devid': it cannot be greater than 65535 (the maximum
                    // of a u16).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // is in the high 16 bits.
                    // The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |     segment     |    bus     |   device   |  function  |
                    //
                    // Since we support only one bus per segment, we can build a
                    // 'devid' by replacing the 'bus' bits with the low 8 bits of
                    // the 'segment' data.
                    // This way we resolve the range-checking problem and give a
                    // different `devid` to each device. The limitation is that
                    // at most 256 segments can be supported.
                    //
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route
            }
        }
    }

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;

        // SAFETY: irq_routing is initialized with entries.len() and now it is being turned into
        // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
        // everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> MemoryRegion {
        MemoryRegion {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
    }
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
        let mut region = user_memory_region;

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need the dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create the guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, the dirty pages log will be turned on in
            // `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
        let mut region = user_memory_region;

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
        let fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        let device = KvmDevice { fd };
        Ok(Arc::new(device))
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on the underlying host.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create a split irqchip:
        // only the local APIC is emulated in kernel, the PICs and the IOAPIC
        // are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        self.fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        self.fd
            .set_clock(data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Get the Vm state.
    /// Return VM specific data.
    ///
    fn state(&self) -> vm::Result<VmState> {
        Ok(self.state)
    }
    ///
    /// Set the VM state
    ///
    fn set_state(&self, _state: VmState) -> vm::Result<()> {
        Ok(())
    }

    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = MemoryRegion {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = MemoryRegion {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get the dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &CpuId, max_vcpus: u32) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            tsc_khz: u32,
            attributes: u64,
            cpuid: u64,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            reserved: [u64; 43],
        }
        let data = TdxInitVm {
            max_vcpus,
            tsc_khz: 0,
            attributes: 0,
            cpuid: cpuid.as_fam_struct_ptr() as u64,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            reserved: [0; 43],
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            if measure { 1 } else { 0 },
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}
/// Enum for KVM related errors
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    pub fn new() -> hypervisor::Result<KvmHypervisor> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(KvmHypervisor { kvm: kvm_obj })
    }
}
/// Implementation of Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(KvmVmType::LegacyVm).unwrap()
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, the ioctl has been
                        // interrupted, and we have to retry, as this can't be
                        // considered a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs = MsrEntries::new(num_msrs).unwrap();
            let indices = msr_list.as_slice();
            let msr_entries = msrs.as_mut_slice();
            for (pos, index) in indices.iter().enumerate() {
                msr_entries[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                state: VmState {},
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                state: VmState {},
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap()
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<CpuId> {
        self.kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by KVM.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve the AArch64 host maximum IPA size supported by KVM.
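    ///
    /// Per the KVM API documentation for `KVM_CAP_ARM_VM_IPA_SIZE`, a return
    /// value of 0 means the capability is not supported, in which case the
    /// legacy default of a 40-bit IPA space applies.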
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of Vcpu trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// vcpu.get/set().unwrap()
///
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        self.fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        self.fd
            .set_regs(regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Set an attribute for the vcpu.
    ///
    fn set_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .set_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuAttribute(e.into()))
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Check if the vcpu has a certain attribute.
    ///
    fn has_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .has_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::HasVcpuAttribute(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        self.fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        self.fd
            .set_sregs(sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        self.fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
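    ///
    /// A save/restore round trip is the typical usage (sketch, not compiled
    /// here; assumes a `vcpu` handle):
    ///
    /// ```ignore
    /// let fpu = vcpu.get_fpu().unwrap();
    /// // ... later, e.g. when restoring a snapshot ...
    /// vcpu.set_fpu(&fpu).unwrap();
    /// ```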
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        self.fd
            .set_fpu(fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &CpuId) -> cpu::Result<()> {
        self.fd
            .set_cpuid2(cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable HyperV SynIC
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<CpuId> {
        self.fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        self.fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        self.fd
            .set_lapic(klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
    fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> {
        self.fd
            .get_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Setup the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> {
        self.fd
            .set_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
    fn get_mp_state(&self) -> cpu::Result<MpState> {
        self.fd
            .get_mp_state()
            .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))
    }
    ///
    /// Sets the vcpu's current "multiprocessing state".
    ///
    fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
        self.fd
            .set_mp_state(mp_state)
            .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xsave struct".
    ///
    fn get_xsave(&self) -> cpu::Result<Xsave> {
        self.fd
            .get_xsave()
            .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xsave struct".
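    ///
    /// Note: the xsave area layout depends on the xsave features enabled on
    /// the host CPU, so state obtained from `get_xsave` is only expected to
    /// restore cleanly on a host with a compatible feature set.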
    ///
    fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
        self.fd
            .set_xsave(xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Translates a guest virtual address to a guest physical address using the
    /// `KVM_TRANSLATE` ioctl.
    ///
    fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
        let tr = self
            .fd
            .translate_gva(gva)
            .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
        // tr.valid is set if the GVA is mapped to a valid GPA.
        match tr.valid {
            0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
                "Invalid GVA: {:#x}",
                gva
            ))),
            _ => Ok((tr.physical_address, 0)),
        }
    }
    ///
    /// Triggers the running of the current virtual CPU returning an exit reason.
    ///
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioRead(addr, data))
                }
                VcpuExit::MmioWrite(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioWrite(addr, data))
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
                #[cfg(feature = "tdx")]
                VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
                VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs as well as related
    /// states of the vcpu.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs as well as related states
    /// of the vcpu.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Let the guest know that it has been paused, which prevents potential
    /// soft lockups when it is resumed.
    ///
    fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
        if let Err(e) = self.fd.kvmclock_ctrl() {
            // The Linux kernel returns -EINVAL if the PV clock isn't yet initialised,
            // which could be because we're still in firmware or the guest doesn't
            // use the KVM clock.
            if e.errno() != libc::EINVAL {
                return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
            }
        }

        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets debug registers to set hardware breakpoints and/or enable single step.
    ///
    fn set_guest_debug(
        &self,
        addrs: &[vm_memory::GuestAddress],
        singlestep: bool,
    ) -> cpu::Result<()> {
        if addrs.len() > 4 {
            return Err(cpu::HypervisorCpuError::SetDebugRegs(anyhow!(
                "At most 4 breakpoints are supported, but {} addresses were passed",
                addrs.len()
            )));
        }

        let mut dbg = kvm_guest_debug {
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
            ..Default::default()
        };
        if singlestep {
            dbg.control |= KVM_GUESTDBG_SINGLESTEP;
        }

        // Set bits 9 and 10.
        // bit 9: GE (global exact breakpoint enable) flag.
        // bit 10: always 1.
        dbg.arch.debugreg[7] = 0x0600;

        for (i, addr) in addrs.iter().enumerate() {
            dbg.arch.debugreg[i] = addr.0;
            // Set the global breakpoint enable flag
            dbg.arch.debugreg[7] |= 2 << (i * 2);
        }

        self.fd
            .set_guest_debug(&dbg)
            .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
    }
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
        self.fd
            .vcpu_init(kvi)
            .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
    }
    ///
    /// Sets the value of one register for this vCPU.
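    ///
    /// Illustrative sketch (not compiled here; assumes an aarch64 `vcpu`
    /// handle): writing x0 by computing its core-register id from its byte
    /// offset in `user_pt_regs`.
    ///
    /// ```ignore
    /// let id = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset__of!(user_pt_regs, regs));
    /// vcpu.set_reg(id, 0x1000).unwrap();
    /// ```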
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
        self.fd
            .set_one_reg(reg_id, data)
            .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
    }
    ///
    /// Gets the value of one register for this vCPU.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(reg_id)
            .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
    }
    ///
    /// Gets a list of the guest registers that are supported for the
    /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
        self.fd
            .get_reg_list(reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
    }
    ///
    /// Save the state of the core registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn core_registers(&self, state: &mut StandardRegisters) -> cpu::Result<()> {
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These are actually the general-purpose registers of the Armv8-A
        // architecture (i.e. x0-x30 when used as 64-bit registers, or w0-w30
        // when used as 32-bit registers).
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "other register" section of the Armv8-A architecture.
        // First one, the stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Next is the processor state.
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The stack pointer associated with EL1.
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The Exception Link Register for EL1; when taking an exception to EL1,
        // this register holds the address to return to afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Saved Program Status Registers; there are 5 of them used in the kernel.
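        // A sketch of the id encoding used by `arm64_core_reg_id!` above: a
        // core register id is KVM_REG_ARM64 | size | KVM_REG_ARM_CORE |
        // (byte_offset / size_of::<u32>()), which is why stepping `off` by 8
        // bytes per u64 register yields consecutive register ids.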
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to the floating point registers, which are stored in the
        // user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .into();
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;
        Ok(())
    }
    ///
    /// Restore the state of the core registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_core_registers(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // The function follows the same order as the fields of `state`; see
        // `core_registers` for some additional info on the registers.
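        // Typical usage is a save/restore round trip (sketch, assuming a
        // `vcpu` handle):
        //   let mut regs = StandardRegisters::default();
        //   vcpu.core_registers(&mut regs)?;
        //   vcpu.set_core_registers(&regs)?;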
        let mut off = offset__of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i] as u64,
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }
    ///
    /// Save the state of the system registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn system_registers(&self, state: &mut Vec<Register>) -> cpu::Result<()> {
        // Call KVM_GET_REG_LIST to get all registers available to the guest.
        // For Armv8 there are around 500 registers.
        let mut reg_list = RegList::new(500).unwrap();
        self.fd
            .get_reg_list(&mut reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;

        // At this point reg_list should contain the core registers and the system registers.
        // The register list contains the number of registers and their ids. We will need to
        // call KVM_GET_ONE_REG on each id in order to save all of them.
        // We carve out from the list the core registers, which are represented in the kernel
        // by the kvm_regs structure and for which we can calculate the id based on the offset
        // in the structure.
        reg_list.retain(|regid| is_system_register(*regid));

        // Now, for the rest of the registers left in the previously fetched register list, we
        // simply call KVM_GET_ONE_REG.
        let indices = reg_list.as_slice();
        for index in indices.iter() {
            state.push(kvm_bindings::kvm_one_reg {
                id: *index,
                addr: self
                    .fd
                    .get_one_reg(*index)
                    .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
            });
        }

        Ok(())
    }
    ///
    /// Restore the state of the system registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_system_registers(&self, state: &[Register]) -> cpu::Result<()> {
        for reg in state {
            self.fd
                .set_one_reg(reg.id, reg.addr)
                .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
        }
        Ok(())
    }
    ///
    /// Read the MPIDR - Multiprocessor Affinity Register.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn read_mpidr(&self) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(MPIDR_EL1)
            .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
    }
    ///
    /// Configure core registers for a given CPU.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
        #[allow(non_upper_case_globals)]
        // PSR (Processor State Register) bits.
        // Taken from arch/arm64/include/uapi/asm/ptrace.h.
        const PSR_MODE_EL1h: u64 = 0x0000_0005;
        const PSR_F_BIT: u64 = 0x0000_0040;
        const PSR_I_BIT: u64 = 0x0000_0080;
        const PSR_A_BIT: u64 = 0x0000_0100;
        const PSR_D_BIT: u64 = 0x0000_0200;
        // Taken from arch/arm64/kvm/inject_fault.c.
        const PSTATE_FAULT_BITS_64: u64 =
            PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;

        let kreg_off = offset__of!(kvm_regs, regs);

        // Get the register index of the PSTATE (Processor State) register.
        let pstate = offset__of!(user_pt_regs, pstate) + kreg_off;
        self.set_reg(
            arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
            PSTATE_FAULT_BITS_64,
        )
        .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Other vCPUs are powered off initially, awaiting PSCI wakeup.
        if cpu_id == 0 {
            // Set the PC (Program Counter) to the current program address (kernel address).
            let pc = offset__of!(user_pt_regs, pc) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip as u64)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

            // The last mandatory thing to set: the address pointing to the FDT (also called DTB).
            // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
            // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
            // We choose to place it at the end of DRAM. See `get_fdt_addr`.
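            // Per the arm64 boot protocol, x0 must hold the physical address
            // of the device tree blob; x1-x3 are reserved and expected to be
            // zero.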
            let regs0 = offset__of!(user_pt_regs, regs) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), fdt_start)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change the state of the LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. The code looks as if
    /// it might as well be affected by internal state modifications of the
    /// GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?;
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fall back on a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
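        // The list below follows the Hyper-V TLFS numbering (a sketch of the
        // mapping, for orientation): 0x40000080..0x40000084 cover the SynIC
        // control and message/event page MSRs (SCONTROL, SVERSION, SIEFP,
        // SIMP, EOM), 0x40000090..0x4000009f the SINT0-15 routing MSRs, and
        // 0x400000b0..0x400000b7 the synthetic timer MSRs.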
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr).unwrap();
            }
        }

        let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp =
                MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries.as_slice()[faulty_msr_index].index
                );

                let start_pos = faulty_msr_index + 1;
                let mut sub_msr_entries =
                    MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                for i in 0..num_msrs {
                    msr_entries_tmp
                        .push(sub_msr_entries.as_slice()[i])
                        .map_err(|e| {
                            cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
                                "Failed adding MSR entries: {:?}",
                                e
                            ))
                        })?;
                }

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(CpuState {
            cpuid,
            msrs,
            vcpu_events,
            regs,
            sregs,
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        })
    }
    ///
    /// Get the current AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        let mut state = CpuState {
            mp_state: self.get_mp_state()?,
            mpidr: self.read_mpidr()?,
            ..Default::default()
        };
        self.core_registers(&mut state.core_regs)?;
        self.system_registers(&mut state.sys_regs)?;

        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores them.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state)?;
        self.set_regs(&state.regs)?;
        self.set_sregs(&state.sregs)?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS is different from the
        // expected amount, we fall back on a slower method by setting MSRs
        // by chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs.as_slice()[faulty_msr_index].index
                );

                let start_pos = faulty_msr_index + 1;
                let sub_msr_entries =
                    MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
    ///
    /// Restore the previously saved AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        self.set_core_registers(&state.core_regs)?;
        self.set_system_registers(&state.sys_regs)?;
        self.set_mp_state(state.mp_state)?;

        Ok(())
    }

    ///
    /// Initialize TDX for this CPU
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
            .map_err(cpu::HypervisorCpuError::InitializeTdx)
    }

    ///
    /// Set the "immediate_exit" state
    ///
    fn set_immediate_exit(&self, exit: bool) {
        self.fd.set_kvm_immediate_exit(exit.into());
    }

    ///
    /// Returns the details about the TDX exit reason
    ///
    #[cfg(feature = "tdx")]
    fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
        let kvm_run = self.fd.get_kvm_run();
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;

        if tdx_vmcall.type_ != 0 {
            return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
        }

        match tdx_vmcall.subfunction {
            TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
            TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
                Ok(TdxExitDetails::SetupEventNotifyInterrupt)
            }
            _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
        }
    }

    ///
    /// Set the status code for the TDX exit
    ///
    #[cfg(feature = "tdx")]
    fn set_tdx_status(&mut self, status: TdxExitStatus) {
        let kvm_run = self.fd.get_kvm_run();
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = match status {
            TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
            TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
        };
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Return the list of initial MSR entries for a VCPU
    ///
    fn boot_msr_entries(&self) -> MsrEntries {
        use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
        use kvm_bindings::kvm_msr_entry as MsrEntry;

        MsrEntries::from_entries(&[
            msr!(msr_index::MSR_IA32_SYSENTER_CS),
            msr!(msr_index::MSR_IA32_SYSENTER_ESP),
            msr!(msr_index::MSR_IA32_SYSENTER_EIP),
            msr!(msr_index::MSR_STAR),
            msr!(msr_index::MSR_CSTAR),
            msr!(msr_index::MSR_LSTAR),
            msr!(msr_index::MSR_KERNEL_GS_BASE),
            msr!(msr_index::MSR_SYSCALL_MASK),
            msr!(msr_index::MSR_IA32_TSC),
            msr_data!(
                msr_index::MSR_IA32_MISC_ENABLE,
                msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
            ),
            msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
        ])
        .unwrap()
    }
}

/// Device struct for KVM
pub struct KvmDevice {
    fd: DeviceFd,
}

impl device::Device for KvmDevice {
    ///
    /// Set device attribute
    ///
    fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
        self.fd
            .set_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
    }
    ///
    /// Get device attribute
    ///
    fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
        self.fd
            .get_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
    }
}

impl AsRawFd for KvmDevice {
    fn as_raw_fd(&self) -> RawFd {
        self.fd.as_raw_fd()
    }
}
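
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test (a sketch): exercises VM and vCPU creation end to end. It
    // needs a host with /dev/kvm, so it is ignored by default and must be
    // run explicitly with `cargo test -- --ignored`.
    #[test]
    #[ignore]
    fn create_vm_and_vcpu() {
        let kvm = KvmHypervisor::new().unwrap();
        let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
        let vm = hv.create_vm().expect("VM creation failed");
        let _vcpu = vm.create_vcpu(0, None).expect("vCPU creation failed");
    }
}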