1 // Copyright © 2019 Intel Corporation 2 // 3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause 4 // 5 // Copyright © 2020, Microsoft Corporation 6 // 7 // Copyright 2018-2019 CrowdStrike, Inc. 8 // 9 // 10 11 #[cfg(target_arch = "aarch64")] 12 use crate::aarch64::gic::KvmGicV3Its; 13 #[cfg(target_arch = "aarch64")] 14 pub use crate::aarch64::{ 15 check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit, 16 VcpuKvmState, 17 }; 18 #[cfg(target_arch = "aarch64")] 19 use crate::arch::aarch64::gic::{Vgic, VgicConfig}; 20 use crate::cpu; 21 use crate::hypervisor; 22 use crate::vec_with_array_field; 23 use crate::vm::{self, InterruptSourceConfig, VmOps}; 24 use crate::HypervisorType; 25 #[cfg(target_arch = "aarch64")] 26 use crate::{arm64_core_reg_id, offset_of}; 27 use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd}; 28 use std::any::Any; 29 use std::collections::HashMap; 30 #[cfg(target_arch = "aarch64")] 31 use std::convert::TryInto; 32 #[cfg(target_arch = "x86_64")] 33 use std::fs::File; 34 #[cfg(target_arch = "x86_64")] 35 use std::os::unix::io::AsRawFd; 36 #[cfg(feature = "tdx")] 37 use std::os::unix::io::RawFd; 38 use std::result; 39 #[cfg(target_arch = "x86_64")] 40 use std::sync::atomic::{AtomicBool, Ordering}; 41 #[cfg(target_arch = "aarch64")] 42 use std::sync::Mutex; 43 use std::sync::{Arc, RwLock}; 44 use vmm_sys_util::eventfd::EventFd; 45 // x86_64 dependencies 46 #[cfg(target_arch = "x86_64")] 47 pub mod x86_64; 48 #[cfg(target_arch = "x86_64")] 49 use crate::arch::x86::{ 50 CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, StandardRegisters, 51 NUM_IOAPIC_PINS, 52 }; 53 #[cfg(target_arch = "x86_64")] 54 use crate::ClockData; 55 use crate::{ 56 CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion, 57 USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE, 58 }; 59 #[cfg(target_arch = "aarch64")] 60 use aarch64::{RegList, Register, StandardRegisters}; 61 #[cfg(target_arch = "x86_64")] 62 use kvm_bindings::{ 63 kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP, 64 KVM_GUESTDBG_USE_HW_BP, 65 }; 66 #[cfg(target_arch = "x86_64")] 67 use x86_64::check_required_kvm_extensions; 68 #[cfg(target_arch = "x86_64")] 69 pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState, Xsave}; 70 // aarch64 dependencies 71 #[cfg(target_arch = "aarch64")] 72 pub mod aarch64; 73 pub use kvm_bindings; 74 #[cfg(feature = "tdx")] 75 use kvm_bindings::KVMIO; 76 pub use kvm_bindings::{ 77 kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug, 78 kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region, 79 KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI, 80 KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID, 81 }; 82 #[cfg(target_arch = "aarch64")] 83 use kvm_bindings::{ 84 kvm_regs, user_fpsimd_state, user_pt_regs, KVM_GUESTDBG_USE_HW, KVM_NR_SPSR, KVM_REG_ARM64, 85 KVM_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK, 86 KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK, 87 KVM_REG_ARM_CORE, KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64, 88 }; 89 pub use kvm_ioctls; 90 pub use kvm_ioctls::{Cap, Kvm}; 91 #[cfg(target_arch = "aarch64")] 92 use std::mem; 93 use thiserror::Error; 94 use vfio_ioctls::VfioDeviceFd; 95 #[cfg(feature = "tdx")] 96 use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr}; 97 /// 98 /// Export generically-named wrappers of kvm-bindings for Unix-based platforms 99 /// 100 pub use { 101 kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr, 102 kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::VcpuExit, 103 }; 104 105 #[cfg(target_arch = "x86_64")] 106 const KVM_CAP_SGX_ATTRIBUTE: u32 = 196; 107 108 #[cfg(feature = "tdx")] 109 const KVM_EXIT_TDX: u32 = 50; 110 #[cfg(feature = "tdx")] 111 const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002; 112 #[cfg(feature = "tdx")] 113 const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004; 114 #[cfg(feature = "tdx")] 115 const TDG_VP_VMCALL_SUCCESS: u64 = 0; 116 #[cfg(feature = "tdx")] 117 const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000; 118 119 #[cfg(feature = "tdx")] 120 ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong); 121 122 #[cfg(feature = "tdx")] 123 #[repr(u32)] 124 enum TdxCommand { 125 Capabilities = 0, 126 InitVm, 127 InitVcpu, 128 InitMemRegion, 129 Finalize, 130 } 131 132 #[cfg(feature = "tdx")] 133 pub enum TdxExitDetails { 134 GetQuote, 135 SetupEventNotifyInterrupt, 136 } 137 138 #[cfg(feature = "tdx")] 139 pub enum TdxExitStatus { 140 Success, 141 InvalidOperand, 142 } 143 144 #[cfg(feature = "tdx")] 145 const TDX_MAX_NR_CPUID_CONFIGS: usize = 6; 146 147 #[cfg(feature = "tdx")] 148 #[repr(C)] 149 #[derive(Debug, Default)] 150 pub struct TdxCpuidConfig { 151 pub leaf: u32, 152 pub sub_leaf: u32, 153 pub eax: u32, 154 pub ebx: u32, 155 pub ecx: u32, 156 pub edx: u32, 157 } 158 159 #[cfg(feature = "tdx")] 160 #[repr(C)] 161 #[derive(Debug, Default)] 162 pub struct TdxCapabilities { 163 pub attrs_fixed0: u64, 164 pub attrs_fixed1: u64, 165 pub xfam_fixed0: u64, 166 pub xfam_fixed1: u64, 167 pub nr_cpuid_configs: u32, 168 pub padding: u32, 169 pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS], 170 } 171 172 impl From<kvm_userspace_memory_region> for UserMemoryRegion { 173 fn from(region: kvm_userspace_memory_region) -> Self { 174 let mut flags = USER_MEMORY_REGION_READ; 175 if region.flags & KVM_MEM_READONLY == 0 { 176 flags |= USER_MEMORY_REGION_WRITE; 177 } 178 if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 { 179 flags |= USER_MEMORY_REGION_LOG_DIRTY; 180 } 181 182 UserMemoryRegion { 183 slot: region.slot, 184 guest_phys_addr: region.guest_phys_addr, 185 memory_size: region.memory_size, 186 userspace_addr: region.userspace_addr, 187 flags, 188 } 189 } 190 } 191 192 impl From<UserMemoryRegion> for kvm_userspace_memory_region { 193 fn from(region: UserMemoryRegion) -> Self { 194 assert!( 195 region.flags & USER_MEMORY_REGION_READ != 0, 196 "KVM mapped memory is always readable" 197 ); 198 199 let mut flags = 0; 200 if region.flags & USER_MEMORY_REGION_WRITE == 0 { 201 flags |= KVM_MEM_READONLY; 202 } 203 if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 { 204 flags |= KVM_MEM_LOG_DIRTY_PAGES; 205 } 206 207 kvm_userspace_memory_region { 208 slot: region.slot, 209 guest_phys_addr: region.guest_phys_addr, 210 memory_size: region.memory_size, 211 userspace_addr: region.userspace_addr, 212 flags, 213 } 214 } 215 } 216 217 impl From<kvm_mp_state> for MpState { 218 fn from(s: kvm_mp_state) -> Self { 219 MpState::Kvm(s) 220 } 221 } 222 223 impl From<MpState> for kvm_mp_state { 224 fn from(ms: MpState) -> Self { 225 match ms { 226 MpState::Kvm(s) => s, 227 /* Needed in case other hypervisors are enabled */ 228 #[allow(unreachable_patterns)] 229 _ => panic!("CpuState is not valid"), 230 } 231 } 232 } 233 234 impl From<kvm_ioctls::IoEventAddress> for IoEventAddress { 235 fn from(a: kvm_ioctls::IoEventAddress) -> Self { 236 match a { 237 kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x), 238 kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x), 239 } 240 } 241 } 242 243 impl From<IoEventAddress> for kvm_ioctls::IoEventAddress { 244 fn from(a: IoEventAddress) -> Self { 245 match a { 246 IoEventAddress::Pio(x) => Self::Pio(x), 247 IoEventAddress::Mmio(x) => Self::Mmio(x), 248 } 249 } 250 } 251 252 impl From<VcpuKvmState> for CpuState { 253 fn from(s: VcpuKvmState) -> Self { 254 CpuState::Kvm(s) 255 } 256 } 257 258 impl From<CpuState> for VcpuKvmState { 259 fn from(s: CpuState) -> Self { 260 match s { 261 CpuState::Kvm(s) => s, 262 /* Needed in case other hypervisors are enabled */ 263 #[allow(unreachable_patterns)] 264 _ => panic!("CpuState is not valid"), 265 } 266 } 267 } 268 269 #[cfg(target_arch = "x86_64")] 270 impl From<kvm_clock_data> for ClockData { 271 fn from(d: kvm_clock_data) -> Self { 272 ClockData::Kvm(d) 273 } 274 } 275 276 #[cfg(target_arch = "x86_64")] 277 impl From<ClockData> for kvm_clock_data { 278 fn from(ms: ClockData) -> Self { 279 match ms { 280 ClockData::Kvm(s) => s, 281 /* Needed in case other hypervisors are enabled */ 282 #[allow(unreachable_patterns)] 283 _ => panic!("CpuState is not valid"), 284 } 285 } 286 } 287 288 impl From<kvm_irq_routing_entry> for IrqRoutingEntry { 289 fn from(s: kvm_irq_routing_entry) -> Self { 290 IrqRoutingEntry::Kvm(s) 291 } 292 } 293 294 impl From<IrqRoutingEntry> for kvm_irq_routing_entry { 295 fn from(e: IrqRoutingEntry) -> Self { 296 match e { 297 IrqRoutingEntry::Kvm(e) => e, 298 /* Needed in case other hypervisors are enabled */ 299 #[allow(unreachable_patterns)] 300 _ => panic!("IrqRoutingEntry is not valid"), 301 } 302 } 303 } 304 305 struct KvmDirtyLogSlot { 306 slot: u32, 307 guest_phys_addr: u64, 308 memory_size: u64, 309 userspace_addr: u64, 310 } 311 312 /// Wrapper over KVM VM ioctls. 313 pub struct KvmVm { 314 fd: Arc<VmFd>, 315 #[cfg(target_arch = "x86_64")] 316 msrs: Vec<MsrEntry>, 317 dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>, 318 } 319 320 impl KvmVm { 321 /// 322 /// Creates an emulated device in the kernel. 323 /// 324 /// See the documentation for `KVM_CREATE_DEVICE`. 325 fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> { 326 let device_fd = self 327 .fd 328 .create_device(device) 329 .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?; 330 Ok(VfioDeviceFd::new_from_kvm(device_fd)) 331 } 332 /// Checks if a particular `Cap` is available. 333 pub fn check_extension(&self, c: Cap) -> bool { 334 self.fd.check_extension(c) 335 } 336 } 337 338 /// Implementation of Vm trait for KVM 339 /// 340 /// # Examples 341 /// 342 /// ``` 343 /// # use hypervisor::kvm::KvmHypervisor; 344 /// # use std::sync::Arc; 345 /// let kvm = KvmHypervisor::new().unwrap(); 346 /// let hypervisor = Arc::new(kvm); 347 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed"); 348 /// ``` 349 impl vm::Vm for KvmVm { 350 #[cfg(target_arch = "x86_64")] 351 /// 352 /// Sets the address of the one-page region in the VM's address space. 353 /// 354 fn set_identity_map_address(&self, address: u64) -> vm::Result<()> { 355 self.fd 356 .set_identity_map_address(address) 357 .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into())) 358 } 359 #[cfg(target_arch = "x86_64")] 360 /// 361 /// Sets the address of the three-page region in the VM's address space. 362 /// 363 fn set_tss_address(&self, offset: usize) -> vm::Result<()> { 364 self.fd 365 .set_tss_address(offset) 366 .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into())) 367 } 368 /// 369 /// Creates an in-kernel interrupt controller. 370 /// 371 fn create_irq_chip(&self) -> vm::Result<()> { 372 self.fd 373 .create_irq_chip() 374 .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into())) 375 } 376 /// 377 /// Registers an event that will, when signaled, trigger the `gsi` IRQ. 378 /// 379 fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> { 380 self.fd 381 .register_irqfd(fd, gsi) 382 .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into())) 383 } 384 /// 385 /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ. 386 /// 387 fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> { 388 self.fd 389 .unregister_irqfd(fd, gsi) 390 .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into())) 391 } 392 /// 393 /// Creates a VcpuFd object from a vcpu RawFd. 394 /// 395 fn create_vcpu( 396 &self, 397 id: u8, 398 vm_ops: Option<Arc<dyn VmOps>>, 399 ) -> vm::Result<Arc<dyn cpu::Vcpu>> { 400 let vc = self 401 .fd 402 .create_vcpu(id as u64) 403 .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?; 404 let vcpu = KvmVcpu { 405 fd: vc, 406 #[cfg(target_arch = "x86_64")] 407 msrs: self.msrs.clone(), 408 vm_ops, 409 #[cfg(target_arch = "x86_64")] 410 hyperv_synic: AtomicBool::new(false), 411 }; 412 Ok(Arc::new(vcpu)) 413 } 414 #[cfg(target_arch = "aarch64")] 415 /// 416 /// Creates a virtual GIC device. 417 /// 418 fn create_vgic(&self, config: VgicConfig) -> vm::Result<Arc<Mutex<dyn Vgic>>> { 419 let gic_device = KvmGicV3Its::new(self, config) 420 .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?; 421 Ok(Arc::new(Mutex::new(gic_device))) 422 } 423 /// 424 /// Registers an event to be signaled whenever a certain address is written to. 425 /// 426 fn register_ioevent( 427 &self, 428 fd: &EventFd, 429 addr: &IoEventAddress, 430 datamatch: Option<vm::DataMatch>, 431 ) -> vm::Result<()> { 432 let addr = &kvm_ioctls::IoEventAddress::from(*addr); 433 if let Some(dm) = datamatch { 434 match dm { 435 vm::DataMatch::DataMatch32(kvm_dm32) => self 436 .fd 437 .register_ioevent(fd, addr, kvm_dm32) 438 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())), 439 vm::DataMatch::DataMatch64(kvm_dm64) => self 440 .fd 441 .register_ioevent(fd, addr, kvm_dm64) 442 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())), 443 } 444 } else { 445 self.fd 446 .register_ioevent(fd, addr, NoDatamatch) 447 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())) 448 } 449 } 450 /// 451 /// Unregisters an event from a certain address it has been previously registered to. 452 /// 453 fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> { 454 let addr = &kvm_ioctls::IoEventAddress::from(*addr); 455 self.fd 456 .unregister_ioevent(fd, addr, NoDatamatch) 457 .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into())) 458 } 459 460 /// 461 /// Constructs a routing entry 462 /// 463 fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry { 464 match &config { 465 InterruptSourceConfig::MsiIrq(cfg) => { 466 let mut kvm_route = kvm_irq_routing_entry { 467 gsi, 468 type_: KVM_IRQ_ROUTING_MSI, 469 ..Default::default() 470 }; 471 472 kvm_route.u.msi.address_lo = cfg.low_addr; 473 kvm_route.u.msi.address_hi = cfg.high_addr; 474 kvm_route.u.msi.data = cfg.data; 475 476 if self.check_extension(crate::kvm::Cap::MsiDevid) { 477 // On AArch64, there is limitation on the range of the 'devid', 478 // it can not be greater than 65536 (the max of u16). 479 // 480 // BDF can not be used directly, because 'segment' is in high 481 // 16 bits. The layout of the u32 BDF is: 482 // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --| 483 // | segment | bus | device | function | 484 // 485 // Now that we support 1 bus only in a segment, we can build a 486 // 'devid' by replacing the 'bus' bits with the low 8 bits of 487 // 'segment' data. 488 // This way we can resolve the range checking problem and give 489 // different `devid` to all the devices. Limitation is that at 490 // most 256 segments can be supported. 491 // 492 let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff; 493 494 kvm_route.flags = KVM_MSI_VALID_DEVID; 495 kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid; 496 } 497 kvm_route.into() 498 } 499 InterruptSourceConfig::LegacyIrq(cfg) => { 500 let mut kvm_route = kvm_irq_routing_entry { 501 gsi, 502 type_: KVM_IRQ_ROUTING_IRQCHIP, 503 ..Default::default() 504 }; 505 kvm_route.u.irqchip.irqchip = cfg.irqchip; 506 kvm_route.u.irqchip.pin = cfg.pin; 507 508 kvm_route.into() 509 } 510 } 511 } 512 513 /// 514 /// Sets the GSI routing table entries, overwriting any previously set 515 /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl. 516 /// 517 fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> { 518 let mut irq_routing = 519 vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len()); 520 irq_routing[0].nr = entries.len() as u32; 521 irq_routing[0].flags = 0; 522 let entries: Vec<kvm_irq_routing_entry> = entries 523 .iter() 524 .map(|entry| match entry { 525 IrqRoutingEntry::Kvm(e) => *e, 526 #[allow(unreachable_patterns)] 527 _ => panic!("IrqRoutingEntry type is wrong"), 528 }) 529 .collect(); 530 531 // SAFETY: irq_routing initialized with entries.len() and now it is being turned into 532 // entries_slice with entries.len() again. It is guaranteed to be large enough to hold 533 // everything from entries. 534 unsafe { 535 let entries_slice: &mut [kvm_irq_routing_entry] = 536 irq_routing[0].entries.as_mut_slice(entries.len()); 537 entries_slice.copy_from_slice(&entries); 538 } 539 540 self.fd 541 .set_gsi_routing(&irq_routing[0]) 542 .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into())) 543 } 544 /// 545 /// Creates a memory region structure that can be used with {create/remove}_user_memory_region 546 /// 547 fn make_user_memory_region( 548 &self, 549 slot: u32, 550 guest_phys_addr: u64, 551 memory_size: u64, 552 userspace_addr: u64, 553 readonly: bool, 554 log_dirty_pages: bool, 555 ) -> UserMemoryRegion { 556 kvm_userspace_memory_region { 557 slot, 558 guest_phys_addr, 559 memory_size, 560 userspace_addr, 561 flags: if readonly { KVM_MEM_READONLY } else { 0 } 562 | if log_dirty_pages { 563 KVM_MEM_LOG_DIRTY_PAGES 564 } else { 565 0 566 }, 567 } 568 .into() 569 } 570 /// 571 /// Creates a guest physical memory region. 572 /// 573 fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> { 574 let mut region: kvm_userspace_memory_region = user_memory_region.into(); 575 576 if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 { 577 if (region.flags & KVM_MEM_READONLY) != 0 { 578 return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!( 579 "Error creating regions with both 'dirty-pages-log' and 'read-only'." 580 ))); 581 } 582 583 // Keep track of the regions that need dirty pages log 584 self.dirty_log_slots.write().unwrap().insert( 585 region.slot, 586 KvmDirtyLogSlot { 587 slot: region.slot, 588 guest_phys_addr: region.guest_phys_addr, 589 memory_size: region.memory_size, 590 userspace_addr: region.userspace_addr, 591 }, 592 ); 593 594 // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`. 595 // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`. 596 region.flags = 0; 597 } 598 599 // SAFETY: Safe because guest regions are guaranteed not to overlap. 600 unsafe { 601 self.fd 602 .set_user_memory_region(region) 603 .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into())) 604 } 605 } 606 /// 607 /// Removes a guest physical memory region. 608 /// 609 fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> { 610 let mut region: kvm_userspace_memory_region = user_memory_region.into(); 611 612 // Remove the corresponding entry from "self.dirty_log_slots" if needed 613 self.dirty_log_slots.write().unwrap().remove(®ion.slot); 614 615 // Setting the size to 0 means "remove" 616 region.memory_size = 0; 617 // SAFETY: Safe because guest regions are guaranteed not to overlap. 618 unsafe { 619 self.fd 620 .set_user_memory_region(region) 621 .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into())) 622 } 623 } 624 /// 625 /// Returns the preferred CPU target type which can be emulated by KVM on underlying host. 626 /// 627 #[cfg(target_arch = "aarch64")] 628 fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> { 629 self.fd 630 .get_preferred_target(kvi) 631 .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into())) 632 } 633 #[cfg(target_arch = "x86_64")] 634 fn enable_split_irq(&self) -> vm::Result<()> { 635 // Create split irqchip 636 // Only the local APIC is emulated in kernel, both PICs and IOAPIC 637 // are not. 638 let mut cap = kvm_enable_cap { 639 cap: KVM_CAP_SPLIT_IRQCHIP, 640 ..Default::default() 641 }; 642 cap.args[0] = NUM_IOAPIC_PINS as u64; 643 self.fd 644 .enable_cap(&cap) 645 .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?; 646 Ok(()) 647 } 648 #[cfg(target_arch = "x86_64")] 649 fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> { 650 let mut cap = kvm_enable_cap { 651 cap: KVM_CAP_SGX_ATTRIBUTE, 652 ..Default::default() 653 }; 654 cap.args[0] = file.as_raw_fd() as u64; 655 self.fd 656 .enable_cap(&cap) 657 .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?; 658 Ok(()) 659 } 660 /// Retrieve guest clock. 661 #[cfg(target_arch = "x86_64")] 662 fn get_clock(&self) -> vm::Result<ClockData> { 663 Ok(self 664 .fd 665 .get_clock() 666 .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))? 667 .into()) 668 } 669 /// Set guest clock. 670 #[cfg(target_arch = "x86_64")] 671 fn set_clock(&self, data: &ClockData) -> vm::Result<()> { 672 let data = (*data).into(); 673 self.fd 674 .set_clock(&data) 675 .map_err(|e| vm::HypervisorVmError::SetClock(e.into())) 676 } 677 /// Create a device that is used for passthrough 678 fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> { 679 let mut vfio_dev = kvm_create_device { 680 type_: kvm_device_type_KVM_DEV_TYPE_VFIO, 681 fd: 0, 682 flags: 0, 683 }; 684 685 self.create_device(&mut vfio_dev) 686 .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into())) 687 } 688 /// 689 /// Start logging dirty pages 690 /// 691 fn start_dirty_log(&self) -> vm::Result<()> { 692 let dirty_log_slots = self.dirty_log_slots.read().unwrap(); 693 for (_, s) in dirty_log_slots.iter() { 694 let region = kvm_userspace_memory_region { 695 slot: s.slot, 696 guest_phys_addr: s.guest_phys_addr, 697 memory_size: s.memory_size, 698 userspace_addr: s.userspace_addr, 699 flags: KVM_MEM_LOG_DIRTY_PAGES, 700 }; 701 // SAFETY: Safe because guest regions are guaranteed not to overlap. 702 unsafe { 703 self.fd 704 .set_user_memory_region(region) 705 .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?; 706 } 707 } 708 709 Ok(()) 710 } 711 712 /// 713 /// Stop logging dirty pages 714 /// 715 fn stop_dirty_log(&self) -> vm::Result<()> { 716 let dirty_log_slots = self.dirty_log_slots.read().unwrap(); 717 for (_, s) in dirty_log_slots.iter() { 718 let region = kvm_userspace_memory_region { 719 slot: s.slot, 720 guest_phys_addr: s.guest_phys_addr, 721 memory_size: s.memory_size, 722 userspace_addr: s.userspace_addr, 723 flags: 0, 724 }; 725 // SAFETY: Safe because guest regions are guaranteed not to overlap. 726 unsafe { 727 self.fd 728 .set_user_memory_region(region) 729 .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?; 730 } 731 } 732 733 Ok(()) 734 } 735 736 /// 737 /// Get dirty pages bitmap (one bit per page) 738 /// 739 fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> { 740 self.fd 741 .get_dirty_log(slot, memory_size as usize) 742 .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into())) 743 } 744 745 /// 746 /// Initialize TDX for this VM 747 /// 748 #[cfg(feature = "tdx")] 749 fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> { 750 const TDX_ATTR_SEPT_VE_DISABLE: usize = 28; 751 752 let mut cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> = 753 cpuid.iter().map(|e| (*e).into()).collect(); 754 cpuid.resize(256, kvm_bindings::kvm_cpuid_entry2::default()); 755 756 #[repr(C)] 757 struct TdxInitVm { 758 attributes: u64, 759 max_vcpus: u32, 760 padding: u32, 761 mrconfigid: [u64; 6], 762 mrowner: [u64; 6], 763 mrownerconfig: [u64; 6], 764 cpuid_nent: u32, 765 cpuid_padding: u32, 766 cpuid_entries: [kvm_bindings::kvm_cpuid_entry2; 256], 767 } 768 let data = TdxInitVm { 769 attributes: 1 << TDX_ATTR_SEPT_VE_DISABLE, 770 max_vcpus, 771 padding: 0, 772 mrconfigid: [0; 6], 773 mrowner: [0; 6], 774 mrownerconfig: [0; 6], 775 cpuid_nent: cpuid.len() as u32, 776 cpuid_padding: 0, 777 cpuid_entries: cpuid.as_slice().try_into().unwrap(), 778 }; 779 780 tdx_command( 781 &self.fd.as_raw_fd(), 782 TdxCommand::InitVm, 783 0, 784 &data as *const _ as u64, 785 ) 786 .map_err(vm::HypervisorVmError::InitializeTdx) 787 } 788 789 /// 790 /// Finalize the TDX setup for this VM 791 /// 792 #[cfg(feature = "tdx")] 793 fn tdx_finalize(&self) -> vm::Result<()> { 794 tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0) 795 .map_err(vm::HypervisorVmError::FinalizeTdx) 796 } 797 798 /// 799 /// Initialize memory regions for the TDX VM 800 /// 801 #[cfg(feature = "tdx")] 802 fn tdx_init_memory_region( 803 &self, 804 host_address: u64, 805 guest_address: u64, 806 size: u64, 807 measure: bool, 808 ) -> vm::Result<()> { 809 #[repr(C)] 810 struct TdxInitMemRegion { 811 host_address: u64, 812 guest_address: u64, 813 pages: u64, 814 } 815 let data = TdxInitMemRegion { 816 host_address, 817 guest_address, 818 pages: size / 4096, 819 }; 820 821 tdx_command( 822 &self.fd.as_raw_fd(), 823 TdxCommand::InitMemRegion, 824 u32::from(measure), 825 &data as *const _ as u64, 826 ) 827 .map_err(vm::HypervisorVmError::InitMemRegionTdx) 828 } 829 /// Downcast to the underlying KvmVm type 830 fn as_any(&self) -> &dyn Any { 831 self 832 } 833 } 834 835 #[cfg(feature = "tdx")] 836 fn tdx_command( 837 fd: &RawFd, 838 command: TdxCommand, 839 flags: u32, 840 data: u64, 841 ) -> std::result::Result<(), std::io::Error> { 842 #[repr(C)] 843 struct TdxIoctlCmd { 844 command: TdxCommand, 845 flags: u32, 846 data: u64, 847 error: u64, 848 unused: u64, 849 } 850 let cmd = TdxIoctlCmd { 851 command, 852 flags, 853 data, 854 error: 0, 855 unused: 0, 856 }; 857 // SAFETY: FFI call. All input parameters are valid. 858 let ret = unsafe { 859 ioctl_with_val( 860 fd, 861 KVM_MEMORY_ENCRYPT_OP(), 862 &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong, 863 ) 864 }; 865 866 if ret < 0 { 867 return Err(std::io::Error::last_os_error()); 868 } 869 Ok(()) 870 } 871 872 /// Wrapper over KVM system ioctls. 873 pub struct KvmHypervisor { 874 kvm: Kvm, 875 } 876 877 impl KvmHypervisor { 878 #[cfg(target_arch = "x86_64")] 879 /// 880 /// Retrieve the list of MSRs supported by the hypervisor. 881 /// 882 fn get_msr_list(&self) -> hypervisor::Result<MsrList> { 883 self.kvm 884 .get_msr_index_list() 885 .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into())) 886 } 887 } 888 889 /// Enum for KVM related error 890 #[derive(Debug, Error)] 891 pub enum KvmError { 892 #[error("Capability missing: {0:?}")] 893 CapabilityMissing(Cap), 894 } 895 pub type KvmResult<T> = result::Result<T, KvmError>; 896 impl KvmHypervisor { 897 /// Create a hypervisor based on Kvm 898 #[allow(clippy::new_ret_no_self)] 899 pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> { 900 let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?; 901 let api_version = kvm_obj.get_api_version(); 902 903 if api_version != kvm_bindings::KVM_API_VERSION as i32 { 904 return Err(hypervisor::HypervisorError::IncompatibleApiVersion); 905 } 906 907 Ok(Arc::new(KvmHypervisor { kvm: kvm_obj })) 908 } 909 /// Check if the hypervisor is available 910 pub fn is_available() -> hypervisor::Result<bool> { 911 match std::fs::metadata("/dev/kvm") { 912 Ok(_) => Ok(true), 913 Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false), 914 Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck( 915 err.into(), 916 )), 917 } 918 } 919 } 920 /// Implementation of Hypervisor trait for KVM 921 /// 922 /// # Examples 923 /// 924 /// ``` 925 /// # use hypervisor::kvm::KvmHypervisor; 926 /// # use std::sync::Arc; 927 /// let kvm = KvmHypervisor::new().unwrap(); 928 /// let hypervisor = Arc::new(kvm); 929 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed"); 930 /// ``` 931 impl hypervisor::Hypervisor for KvmHypervisor { 932 /// 933 /// Returns the type of the hypervisor 934 /// 935 fn hypervisor_type(&self) -> HypervisorType { 936 HypervisorType::Kvm 937 } 938 /// Create a KVM vm object of a specific VM type and return the object as Vm trait object 939 /// 940 /// # Examples 941 /// 942 /// ``` 943 /// # use hypervisor::kvm::KvmHypervisor; 944 /// use hypervisor::kvm::KvmVm; 945 /// let hypervisor = KvmHypervisor::new().unwrap(); 946 /// let vm = hypervisor.create_vm_with_type(0).unwrap(); 947 /// ``` 948 fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> { 949 let fd: VmFd; 950 loop { 951 match self.kvm.create_vm_with_type(vm_type) { 952 Ok(res) => fd = res, 953 Err(e) => { 954 if e.errno() == libc::EINTR { 955 // If the error returned is EINTR, which means the 956 // ioctl has been interrupted, we have to retry as 957 // this can't be considered as a regular error. 958 continue; 959 } else { 960 return Err(hypervisor::HypervisorError::VmCreate(e.into())); 961 } 962 } 963 } 964 break; 965 } 966 967 let vm_fd = Arc::new(fd); 968 969 #[cfg(target_arch = "x86_64")] 970 { 971 let msr_list = self.get_msr_list()?; 972 let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize; 973 let mut msrs: Vec<MsrEntry> = vec![ 974 MsrEntry { 975 ..Default::default() 976 }; 977 num_msrs 978 ]; 979 let indices = msr_list.as_slice(); 980 for (pos, index) in indices.iter().enumerate() { 981 msrs[pos].index = *index; 982 } 983 984 Ok(Arc::new(KvmVm { 985 fd: vm_fd, 986 msrs, 987 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())), 988 })) 989 } 990 991 #[cfg(target_arch = "aarch64")] 992 { 993 Ok(Arc::new(KvmVm { 994 fd: vm_fd, 995 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())), 996 })) 997 } 998 } 999 1000 /// Create a KVM vm object and return the object as Vm trait object 1001 /// 1002 /// # Examples 1003 /// 1004 /// ``` 1005 /// # use hypervisor::kvm::KvmHypervisor; 1006 /// use hypervisor::kvm::KvmVm; 1007 /// let hypervisor = KvmHypervisor::new().unwrap(); 1008 /// let vm = hypervisor.create_vm().unwrap(); 1009 /// ``` 1010 fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> { 1011 #[allow(unused_mut)] 1012 let mut vm_type: u64 = 0; // Create with default platform type 1013 1014 // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA 1015 // size from the host and use that when creating the VM, which may 1016 // avoid unnecessary VM creation failures. 1017 #[cfg(target_arch = "aarch64")] 1018 if self.kvm.check_extension(Cap::ArmVmIPASize) { 1019 vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap(); 1020 } 1021 1022 self.create_vm_with_type(vm_type) 1023 } 1024 1025 fn check_required_extensions(&self) -> hypervisor::Result<()> { 1026 check_required_kvm_extensions(&self.kvm) 1027 .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into())) 1028 } 1029 1030 #[cfg(target_arch = "x86_64")] 1031 /// 1032 /// X86 specific call to get the system supported CPUID values. 1033 /// 1034 fn get_supported_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> { 1035 let kvm_cpuid = self 1036 .kvm 1037 .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES) 1038 .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?; 1039 1040 let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect(); 1041 1042 Ok(v) 1043 } 1044 1045 #[cfg(target_arch = "aarch64")] 1046 /// 1047 /// Retrieve AArch64 host maximum IPA size supported by KVM. 1048 /// 1049 fn get_host_ipa_limit(&self) -> i32 { 1050 self.kvm.get_host_ipa_limit() 1051 } 1052 1053 /// 1054 /// Retrieve TDX capabilities 1055 /// 1056 #[cfg(feature = "tdx")] 1057 fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> { 1058 let data = TdxCapabilities { 1059 nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32, 1060 ..Default::default() 1061 }; 1062 1063 tdx_command( 1064 &self.kvm.as_raw_fd(), 1065 TdxCommand::Capabilities, 1066 0, 1067 &data as *const _ as u64, 1068 ) 1069 .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?; 1070 1071 Ok(data) 1072 } 1073 1074 /// 1075 /// Get the number of supported hardware breakpoints 1076 /// 1077 fn get_guest_debug_hw_bps(&self) -> usize { 1078 #[cfg(target_arch = "x86_64")] 1079 { 1080 4 1081 } 1082 #[cfg(target_arch = "aarch64")] 1083 { 1084 self.kvm.get_guest_debug_hw_bps() as usize 1085 } 1086 } 1087 1088 /// Get maximum number of vCPUs 1089 fn get_max_vcpus(&self) -> u32 { 1090 self.kvm.get_max_vcpus().min(u32::MAX as usize) as u32 1091 } 1092 } 1093 /// Vcpu struct for KVM 1094 pub struct KvmVcpu { 1095 fd: VcpuFd, 1096 #[cfg(target_arch = "x86_64")] 1097 msrs: Vec<MsrEntry>, 1098 vm_ops: Option<Arc<dyn vm::VmOps>>, 1099 #[cfg(target_arch = "x86_64")] 1100 hyperv_synic: AtomicBool, 1101 } 1102 /// Implementation of Vcpu trait for KVM 1103 /// 1104 /// # Examples 1105 /// 1106 /// ``` 1107 /// # use hypervisor::kvm::KvmHypervisor; 1108 /// # use std::sync::Arc; 1109 /// let kvm = KvmHypervisor::new().unwrap(); 1110 /// let hypervisor = Arc::new(kvm); 1111 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed"); 1112 /// let vcpu = vm.create_vcpu(0, None).unwrap(); 1113 /// ``` 1114 impl cpu::Vcpu for KvmVcpu { 1115 #[cfg(target_arch = "x86_64")] 1116 /// 1117 /// Returns the vCPU general purpose registers. 1118 /// 1119 fn get_regs(&self) -> cpu::Result<StandardRegisters> { 1120 Ok(self 1121 .fd 1122 .get_regs() 1123 .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))? 1124 .into()) 1125 } 1126 /// 1127 /// Returns the vCPU general purpose registers. 1128 /// The `KVM_GET_REGS` ioctl is not available on AArch64, `KVM_GET_ONE_REG` 1129 /// is used to get registers one by one. 1130 /// 1131 #[cfg(target_arch = "aarch64")] 1132 fn get_regs(&self) -> cpu::Result<StandardRegisters> { 1133 let mut state: StandardRegisters = kvm_regs::default(); 1134 let mut off = offset_of!(user_pt_regs, regs); 1135 // There are 31 user_pt_regs: 1136 // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72 1137 // These actually are the general-purpose registers of the Armv8-a 1138 // architecture (i.e x0-x30 if used as a 64bit register or w0-30 when used as a 32bit register). 1139 for i in 0..31 { 1140 state.regs.regs[i] = self 1141 .fd 1142 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1143 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1144 .try_into() 1145 .unwrap(); 1146 off += std::mem::size_of::<u64>(); 1147 } 1148 1149 // We are now entering the "Other register" section of the ARMv8-a architecture. 1150 // First one, stack pointer. 1151 let off = offset_of!(user_pt_regs, sp); 1152 state.regs.sp = self 1153 .fd 1154 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1155 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1156 .try_into() 1157 .unwrap(); 1158 1159 // Second one, the program counter. 1160 let off = offset_of!(user_pt_regs, pc); 1161 state.regs.pc = self 1162 .fd 1163 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1164 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1165 .try_into() 1166 .unwrap(); 1167 1168 // Next is the processor state. 1169 let off = offset_of!(user_pt_regs, pstate); 1170 state.regs.pstate = self 1171 .fd 1172 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1173 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1174 .try_into() 1175 .unwrap(); 1176 1177 // The stack pointer associated with EL1 1178 let off = offset_of!(kvm_regs, sp_el1); 1179 state.sp_el1 = self 1180 .fd 1181 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1182 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1183 .try_into() 1184 .unwrap(); 1185 1186 // Exception Link Register for EL1, when taking an exception to EL1, this register 1187 // holds the address to which to return afterwards. 1188 let off = offset_of!(kvm_regs, elr_el1); 1189 state.elr_el1 = self 1190 .fd 1191 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1192 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1193 .try_into() 1194 .unwrap(); 1195 1196 // Saved Program Status Registers, there are 5 of them used in the kernel. 1197 let mut off = offset_of!(kvm_regs, spsr); 1198 for i in 0..KVM_NR_SPSR as usize { 1199 state.spsr[i] = self 1200 .fd 1201 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1202 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1203 .try_into() 1204 .unwrap(); 1205 off += std::mem::size_of::<u64>(); 1206 } 1207 1208 // Now moving on to floating point registers which are stored in the user_fpsimd_state in the kernel: 1209 // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53 1210 let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs); 1211 for i in 0..32 { 1212 state.fp_regs.vregs[i] = self 1213 .fd 1214 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off)) 1215 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?; 1216 off += mem::size_of::<u128>(); 1217 } 1218 1219 // Floating-point Status Register 1220 let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr); 1221 state.fp_regs.fpsr = self 1222 .fd 1223 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off)) 1224 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1225 .try_into() 1226 .unwrap(); 1227 1228 // Floating-point Control Register 1229 let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr); 1230 state.fp_regs.fpcr = self 1231 .fd 1232 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off)) 1233 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1234 .try_into() 1235 .unwrap(); 1236 Ok(state) 1237 } 1238 #[cfg(target_arch = "x86_64")] 1239 /// 1240 /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl. 1241 /// 1242 fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> { 1243 let regs = (*regs).into(); 1244 self.fd 1245 .set_regs(®s) 1246 .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into())) 1247 } 1248 1249 /// 1250 /// Sets the vCPU general purpose registers. 1251 /// The `KVM_SET_REGS` ioctl is not available on AArch64, `KVM_SET_ONE_REG` 1252 /// is used to set registers one by one. 1253 /// 1254 #[cfg(target_arch = "aarch64")] 1255 fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> { 1256 // The function follows the exact identical order from `state`. Look there 1257 // for some additional info on registers. 1258 let mut off = offset_of!(user_pt_regs, regs); 1259 for i in 0..31 { 1260 self.fd 1261 .set_one_reg( 1262 arm64_core_reg_id!(KVM_REG_SIZE_U64, off), 1263 state.regs.regs[i].into(), 1264 ) 1265 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1266 off += std::mem::size_of::<u64>(); 1267 } 1268 1269 let off = offset_of!(user_pt_regs, sp); 1270 self.fd 1271 .set_one_reg( 1272 arm64_core_reg_id!(KVM_REG_SIZE_U64, off), 1273 state.regs.sp.into(), 1274 ) 1275 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1276 1277 let off = offset_of!(user_pt_regs, pc); 1278 self.fd 1279 .set_one_reg( 1280 arm64_core_reg_id!(KVM_REG_SIZE_U64, off), 1281 state.regs.pc.into(), 1282 ) 1283 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1284 1285 let off = offset_of!(user_pt_regs, pstate); 1286 self.fd 1287 .set_one_reg( 1288 arm64_core_reg_id!(KVM_REG_SIZE_U64, off), 1289 state.regs.pstate.into(), 1290 ) 1291 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1292 1293 let off = offset_of!(kvm_regs, sp_el1); 1294 self.fd 1295 .set_one_reg( 1296 arm64_core_reg_id!(KVM_REG_SIZE_U64, off), 1297 state.sp_el1.into(), 1298 ) 1299 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1300 1301 let off = offset_of!(kvm_regs, elr_el1); 1302 self.fd 1303 .set_one_reg( 1304 arm64_core_reg_id!(KVM_REG_SIZE_U64, off), 1305 state.elr_el1.into(), 1306 ) 1307 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1308 1309 let mut off = offset_of!(kvm_regs, spsr); 1310 for i in 0..KVM_NR_SPSR as usize { 1311 self.fd 1312 .set_one_reg( 1313 arm64_core_reg_id!(KVM_REG_SIZE_U64, off), 1314 state.spsr[i].into(), 1315 ) 1316 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1317 off += std::mem::size_of::<u64>(); 1318 } 1319 1320 let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs); 1321 for i in 0..32 { 1322 self.fd 1323 .set_one_reg( 1324 arm64_core_reg_id!(KVM_REG_SIZE_U128, off), 1325 state.fp_regs.vregs[i], 1326 ) 1327 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1328 off += mem::size_of::<u128>(); 1329 } 1330 1331 let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr); 1332 self.fd 1333 .set_one_reg( 1334 arm64_core_reg_id!(KVM_REG_SIZE_U32, off), 1335 state.fp_regs.fpsr.into(), 1336 ) 1337 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1338 1339 let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr); 1340 self.fd 1341 .set_one_reg( 1342 arm64_core_reg_id!(KVM_REG_SIZE_U32, off), 1343 state.fp_regs.fpcr.into(), 1344 ) 1345 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1346 Ok(()) 1347 } 1348 1349 #[cfg(target_arch = "x86_64")] 1350 /// 1351 /// Returns the vCPU special registers. 1352 /// 1353 fn get_sregs(&self) -> cpu::Result<SpecialRegisters> { 1354 Ok(self 1355 .fd 1356 .get_sregs() 1357 .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))? 1358 .into()) 1359 } 1360 #[cfg(target_arch = "x86_64")] 1361 /// 1362 /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl. 1363 /// 1364 fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> { 1365 let sregs = (*sregs).into(); 1366 self.fd 1367 .set_sregs(&sregs) 1368 .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into())) 1369 } 1370 #[cfg(target_arch = "x86_64")] 1371 /// 1372 /// Returns the floating point state (FPU) from the vCPU. 1373 /// 1374 fn get_fpu(&self) -> cpu::Result<FpuState> { 1375 Ok(self 1376 .fd 1377 .get_fpu() 1378 .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))? 1379 .into()) 1380 } 1381 #[cfg(target_arch = "x86_64")] 1382 /// 1383 /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioct. 1384 /// 1385 fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> { 1386 let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into(); 1387 self.fd 1388 .set_fpu(&fpu) 1389 .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into())) 1390 } 1391 #[cfg(target_arch = "x86_64")] 1392 /// 1393 /// X86 specific call to setup the CPUID registers. 1394 /// 1395 fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> { 1396 let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> = 1397 cpuid.iter().map(|e| (*e).into()).collect(); 1398 let kvm_cpuid = <CpuId>::from_entries(&cpuid) 1399 .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?; 1400 1401 self.fd 1402 .set_cpuid2(&kvm_cpuid) 1403 .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into())) 1404 } 1405 #[cfg(target_arch = "x86_64")] 1406 /// 1407 /// X86 specific call to enable HyperV SynIC 1408 /// 1409 fn enable_hyperv_synic(&self) -> cpu::Result<()> { 1410 // Update the information about Hyper-V SynIC being enabled and 1411 // emulated as it will influence later which MSRs should be saved. 1412 self.hyperv_synic.store(true, Ordering::Release); 1413 1414 let cap = kvm_enable_cap { 1415 cap: KVM_CAP_HYPERV_SYNIC, 1416 ..Default::default() 1417 }; 1418 self.fd 1419 .enable_cap(&cap) 1420 .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into())) 1421 } 1422 /// 1423 /// X86 specific call to retrieve the CPUID registers. 1424 /// 1425 #[cfg(target_arch = "x86_64")] 1426 fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> { 1427 let kvm_cpuid = self 1428 .fd 1429 .get_cpuid2(num_entries) 1430 .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?; 1431 1432 let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect(); 1433 1434 Ok(v) 1435 } 1436 #[cfg(target_arch = "x86_64")] 1437 /// 1438 /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller). 1439 /// 1440 fn get_lapic(&self) -> cpu::Result<LapicState> { 1441 Ok(self 1442 .fd 1443 .get_lapic() 1444 .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))? 1445 .into()) 1446 } 1447 #[cfg(target_arch = "x86_64")] 1448 /// 1449 /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller). 1450 /// 1451 fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> { 1452 let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into(); 1453 self.fd 1454 .set_lapic(&klapic) 1455 .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into())) 1456 } 1457 #[cfg(target_arch = "x86_64")] 1458 /// 1459 /// Returns the model-specific registers (MSR) for this vCPU. 1460 /// 1461 fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> { 1462 let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect(); 1463 let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap(); 1464 let succ = self 1465 .fd 1466 .get_msrs(&mut kvm_msrs) 1467 .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?; 1468 1469 msrs[..succ].copy_from_slice( 1470 &kvm_msrs.as_slice()[..succ] 1471 .iter() 1472 .map(|e| (*e).into()) 1473 .collect::<Vec<MsrEntry>>(), 1474 ); 1475 1476 Ok(succ) 1477 } 1478 #[cfg(target_arch = "x86_64")] 1479 /// 1480 /// Setup the model-specific registers (MSR) for this vCPU. 1481 /// Returns the number of MSR entries actually written. 1482 /// 1483 fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> { 1484 let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect(); 1485 let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap(); 1486 self.fd 1487 .set_msrs(&kvm_msrs) 1488 .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into())) 1489 } 1490 /// 1491 /// Returns the vcpu's current "multiprocessing state". 1492 /// 1493 fn get_mp_state(&self) -> cpu::Result<MpState> { 1494 Ok(self 1495 .fd 1496 .get_mp_state() 1497 .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))? 1498 .into()) 1499 } 1500 /// 1501 /// Sets the vcpu's current "multiprocessing state". 1502 /// 1503 fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> { 1504 self.fd 1505 .set_mp_state(mp_state.into()) 1506 .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into())) 1507 } 1508 #[cfg(target_arch = "x86_64")] 1509 /// 1510 /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl. 1511 /// 1512 fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> { 1513 let tr = self 1514 .fd 1515 .translate_gva(gva) 1516 .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?; 1517 // tr.valid is set if the GVA is mapped to valid GPA. 1518 match tr.valid { 1519 0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!( 1520 "Invalid GVA: {:#x}", 1521 gva 1522 ))), 1523 _ => Ok((tr.physical_address, 0)), 1524 } 1525 } 1526 /// 1527 /// Triggers the running of the current virtual CPU returning an exit reason. 1528 /// 1529 fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> { 1530 match self.fd.run() { 1531 Ok(run) => match run { 1532 #[cfg(target_arch = "x86_64")] 1533 VcpuExit::IoIn(addr, data) => { 1534 if let Some(vm_ops) = &self.vm_ops { 1535 return vm_ops 1536 .pio_read(addr.into(), data) 1537 .map(|_| cpu::VmExit::Ignore) 1538 .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())); 1539 } 1540 1541 Ok(cpu::VmExit::IoIn(addr, data)) 1542 } 1543 #[cfg(target_arch = "x86_64")] 1544 VcpuExit::IoOut(addr, data) => { 1545 if let Some(vm_ops) = &self.vm_ops { 1546 return vm_ops 1547 .pio_write(addr.into(), data) 1548 .map(|_| cpu::VmExit::Ignore) 1549 .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())); 1550 } 1551 1552 Ok(cpu::VmExit::IoOut(addr, data)) 1553 } 1554 #[cfg(target_arch = "x86_64")] 1555 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)), 1556 #[cfg(target_arch = "x86_64")] 1557 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset), 1558 1559 #[cfg(target_arch = "aarch64")] 1560 VcpuExit::SystemEvent(event_type, flags) => { 1561 use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN}; 1562 // On Aarch64, when the VM is shutdown, run() returns 1563 // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN 1564 if event_type == KVM_SYSTEM_EVENT_RESET { 1565 Ok(cpu::VmExit::Reset) 1566 } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN { 1567 Ok(cpu::VmExit::Shutdown) 1568 } else { 1569 Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( 1570 "Unexpected system event with type 0x{:x}, flags 0x{:x}", 1571 event_type, 1572 flags 1573 ))) 1574 } 1575 } 1576 1577 VcpuExit::MmioRead(addr, data) => { 1578 if let Some(vm_ops) = &self.vm_ops { 1579 return vm_ops 1580 .mmio_read(addr, data) 1581 .map(|_| cpu::VmExit::Ignore) 1582 .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())); 1583 } 1584 1585 Ok(cpu::VmExit::MmioRead(addr, data)) 1586 } 1587 VcpuExit::MmioWrite(addr, data) => { 1588 if let Some(vm_ops) = &self.vm_ops { 1589 return vm_ops 1590 .mmio_write(addr, data) 1591 .map(|_| cpu::VmExit::Ignore) 1592 .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())); 1593 } 1594 1595 Ok(cpu::VmExit::MmioWrite(addr, data)) 1596 } 1597 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv), 1598 #[cfg(feature = "tdx")] 1599 VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx), 1600 VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug), 1601 1602 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( 1603 "Unexpected exit reason on vcpu run: {:?}", 1604 r 1605 ))), 1606 }, 1607 1608 Err(ref e) => match e.errno() { 1609 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore), 1610 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( 1611 "VCPU error {:?}", 1612 e 1613 ))), 1614 }, 1615 } 1616 } 1617 #[cfg(target_arch = "x86_64")] 1618 /// 1619 /// Let the guest know that it has been paused, which prevents from 1620 /// potential soft lockups when being resumed. 1621 /// 1622 fn notify_guest_clock_paused(&self) -> cpu::Result<()> { 1623 if let Err(e) = self.fd.kvmclock_ctrl() { 1624 // Linux kernel returns -EINVAL if the PV clock isn't yet initialised 1625 // which could be because we're still in firmware or the guest doesn't 1626 // use KVM clock. 1627 if e.errno() != libc::EINVAL { 1628 return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into())); 1629 } 1630 } 1631 1632 Ok(()) 1633 } 1634 /// 1635 /// Sets debug registers to set hardware breakpoints and/or enable single step. 1636 /// 1637 fn set_guest_debug( 1638 &self, 1639 addrs: &[vm_memory::GuestAddress], 1640 singlestep: bool, 1641 ) -> cpu::Result<()> { 1642 let mut dbg = kvm_guest_debug { 1643 #[cfg(target_arch = "x86_64")] 1644 control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP, 1645 #[cfg(target_arch = "aarch64")] 1646 control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW, 1647 ..Default::default() 1648 }; 1649 if singlestep { 1650 dbg.control |= KVM_GUESTDBG_SINGLESTEP; 1651 } 1652 1653 // Set the debug registers. 1654 // Here we assume that the number of addresses do not exceed what 1655 // `Hypervisor::get_guest_debug_hw_bps()` specifies. 1656 #[cfg(target_arch = "x86_64")] 1657 { 1658 // Set bits 9 and 10. 1659 // bit 9: GE (global exact breakpoint enable) flag. 1660 // bit 10: always 1. 1661 dbg.arch.debugreg[7] = 0x0600; 1662 1663 for (i, addr) in addrs.iter().enumerate() { 1664 dbg.arch.debugreg[i] = addr.0; 1665 // Set global breakpoint enable flag 1666 dbg.arch.debugreg[7] |= 2 << (i * 2); 1667 } 1668 } 1669 #[cfg(target_arch = "aarch64")] 1670 { 1671 for (i, addr) in addrs.iter().enumerate() { 1672 // DBGBCR_EL1 (Debug Breakpoint Control Registers, D13.3.2): 1673 // bit 0: 1 (Enabled) 1674 // bit 1~2: 0b11 (PMC = EL1/EL0) 1675 // bit 5~8: 0b1111 (BAS = AArch64) 1676 // others: 0 1677 dbg.arch.dbg_bcr[i] = 0b1u64 | 0b110u64 | 0b1_1110_0000u64; 1678 // DBGBVR_EL1 (Debug Breakpoint Value Registers, D13.3.3): 1679 // bit 2~52: VA[2:52] 1680 dbg.arch.dbg_bvr[i] = (!0u64 >> 11) & addr.0; 1681 } 1682 } 1683 self.fd 1684 .set_guest_debug(&dbg) 1685 .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into())) 1686 } 1687 #[cfg(target_arch = "aarch64")] 1688 fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> { 1689 self.fd 1690 .vcpu_init(kvi) 1691 .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into())) 1692 } 1693 /// 1694 /// Gets a list of the guest registers that are supported for the 1695 /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls. 1696 /// 1697 #[cfg(target_arch = "aarch64")] 1698 fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> { 1699 self.fd 1700 .get_reg_list(reg_list) 1701 .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into())) 1702 } 1703 /// 1704 /// Gets the value of a system register 1705 /// 1706 #[cfg(target_arch = "aarch64")] 1707 fn get_sys_reg(&self, sys_reg: u32) -> cpu::Result<u64> { 1708 // 1709 // Arm Architecture Reference Manual defines the encoding of 1710 // AArch64 system registers, see 1711 // https://developer.arm.com/documentation/ddi0487 (chapter D12). 1712 // While KVM defines another ID for each AArch64 system register, 1713 // which is used in calling `KVM_G/SET_ONE_REG` to access a system 1714 // register of a guest. 1715 // A mapping exists between the Arm standard encoding and the KVM ID. 1716 // This function takes the standard u32 ID as input parameter, converts 1717 // it to the corresponding KVM ID, and call `KVM_GET_ONE_REG` API to 1718 // get the value of the system parameter. 1719 // 1720 let id: u64 = KVM_REG_ARM64 1721 | KVM_REG_SIZE_U64 1722 | KVM_REG_ARM64_SYSREG as u64 1723 | ((((sys_reg) >> 5) 1724 & (KVM_REG_ARM64_SYSREG_OP0_MASK 1725 | KVM_REG_ARM64_SYSREG_OP1_MASK 1726 | KVM_REG_ARM64_SYSREG_CRN_MASK 1727 | KVM_REG_ARM64_SYSREG_CRM_MASK 1728 | KVM_REG_ARM64_SYSREG_OP2_MASK)) as u64); 1729 Ok(self 1730 .fd 1731 .get_one_reg(id) 1732 .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))? 1733 .try_into() 1734 .unwrap()) 1735 } 1736 /// 1737 /// Configure core registers for a given CPU. 1738 /// 1739 #[cfg(target_arch = "aarch64")] 1740 fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> { 1741 #[allow(non_upper_case_globals)] 1742 // PSR (Processor State Register) bits. 1743 // Taken from arch/arm64/include/uapi/asm/ptrace.h. 1744 const PSR_MODE_EL1h: u64 = 0x0000_0005; 1745 const PSR_F_BIT: u64 = 0x0000_0040; 1746 const PSR_I_BIT: u64 = 0x0000_0080; 1747 const PSR_A_BIT: u64 = 0x0000_0100; 1748 const PSR_D_BIT: u64 = 0x0000_0200; 1749 // Taken from arch/arm64/kvm/inject_fault.c. 1750 const PSTATE_FAULT_BITS_64: u64 = 1751 PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT; 1752 1753 let kreg_off = offset_of!(kvm_regs, regs); 1754 1755 // Get the register index of the PSTATE (Processor State) register. 1756 let pstate = offset_of!(user_pt_regs, pstate) + kreg_off; 1757 self.fd 1758 .set_one_reg( 1759 arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate), 1760 PSTATE_FAULT_BITS_64.into(), 1761 ) 1762 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1763 1764 // Other vCPUs are powered off initially awaiting PSCI wakeup. 1765 if cpu_id == 0 { 1766 // Setting the PC (Processor Counter) to the current program address (kernel address). 1767 let pc = offset_of!(user_pt_regs, pc) + kreg_off; 1768 self.fd 1769 .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip.into()) 1770 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1771 1772 // Last mandatory thing to set -> the address pointing to the FDT (also called DTB). 1773 // "The device tree blob (dtb) must be placed on an 8-byte boundary and must 1774 // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt. 1775 // We are choosing to place it the end of DRAM. See `get_fdt_addr`. 1776 let regs0 = offset_of!(user_pt_regs, regs) + kreg_off; 1777 self.fd 1778 .set_one_reg( 1779 arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), 1780 fdt_start.into(), 1781 ) 1782 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1783 } 1784 Ok(()) 1785 } 1786 1787 #[cfg(target_arch = "x86_64")] 1788 /// 1789 /// Get the current CPU state 1790 /// 1791 /// Ordering requirements: 1792 /// 1793 /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify 1794 /// vCPU/LAPIC state. As such, it must be done before most everything 1795 /// else, otherwise we cannot restore everything and expect it to work. 1796 /// 1797 /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are 1798 /// still running. 1799 /// 1800 /// KVM_GET_LAPIC may change state of LAPIC before returning it. 1801 /// 1802 /// GET_VCPU_EVENTS should probably be last to save. The code looks as 1803 /// it might as well be affected by internal state modifications of the 1804 /// GET ioctls. 1805 /// 1806 /// SREGS saves/restores a pending interrupt, similar to what 1807 /// VCPU_EVENTS also does. 1808 /// 1809 /// GET_MSRS requires a pre-populated data structure to do something 1810 /// meaningful. For SET_MSRS it will then contain good data. 1811 /// 1812 /// # Example 1813 /// 1814 /// ```rust 1815 /// # use hypervisor::kvm::KvmHypervisor; 1816 /// # use std::sync::Arc; 1817 /// let kvm = KvmHypervisor::new().unwrap(); 1818 /// let hv = Arc::new(kvm); 1819 /// let vm = hv.create_vm().expect("new VM fd creation failed"); 1820 /// vm.enable_split_irq().unwrap(); 1821 /// let vcpu = vm.create_vcpu(0, None).unwrap(); 1822 /// let state = vcpu.state().unwrap(); 1823 /// ``` 1824 fn state(&self) -> cpu::Result<CpuState> { 1825 let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?; 1826 let mp_state = self.get_mp_state()?.into(); 1827 let regs = self.get_regs()?; 1828 let sregs = self.get_sregs()?; 1829 let xsave = self.get_xsave()?; 1830 let xcrs = self.get_xcrs()?; 1831 let lapic_state = self.get_lapic()?; 1832 let fpu = self.get_fpu()?; 1833 1834 // Try to get all MSRs based on the list previously retrieved from KVM. 1835 // If the number of MSRs obtained from GET_MSRS is different from the 1836 // expected amount, we fallback onto a slower method by getting MSRs 1837 // by chunks. This is the only way to make sure we try to get as many 1838 // MSRs as possible, even if some MSRs are not supported. 1839 let mut msr_entries = self.msrs.clone(); 1840 1841 // Save extra MSRs if the Hyper-V synthetic interrupt controller is 1842 // emulated. 1843 if self.hyperv_synic.load(Ordering::Acquire) { 1844 let hyperv_synic_msrs = vec![ 1845 0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084, 1846 0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096, 1847 0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d, 1848 0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4, 1849 0x400000b5, 0x400000b6, 0x400000b7, 1850 ]; 1851 for index in hyperv_synic_msrs { 1852 let msr = kvm_msr_entry { 1853 index, 1854 ..Default::default() 1855 }; 1856 msr_entries.push(msr.into()); 1857 } 1858 } 1859 1860 let expected_num_msrs = msr_entries.len(); 1861 let num_msrs = self.get_msrs(&mut msr_entries)?; 1862 let msrs = if num_msrs != expected_num_msrs { 1863 let mut faulty_msr_index = num_msrs; 1864 let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec(); 1865 1866 loop { 1867 warn!( 1868 "Detected faulty MSR 0x{:x} while getting MSRs", 1869 msr_entries[faulty_msr_index].index 1870 ); 1871 1872 // Skip the first bad MSR 1873 let start_pos = faulty_msr_index + 1; 1874 1875 let mut sub_msr_entries = msr_entries[start_pos..].to_vec(); 1876 let num_msrs = self.get_msrs(&mut sub_msr_entries)?; 1877 1878 msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]); 1879 1880 if num_msrs == sub_msr_entries.len() { 1881 break; 1882 } 1883 1884 faulty_msr_index = start_pos + num_msrs; 1885 } 1886 1887 msr_entries_tmp 1888 } else { 1889 msr_entries 1890 }; 1891 1892 let vcpu_events = self.get_vcpu_events()?; 1893 let tsc_khz = self.tsc_khz()?; 1894 1895 Ok(VcpuKvmState { 1896 cpuid, 1897 msrs, 1898 vcpu_events, 1899 regs: regs.into(), 1900 sregs: sregs.into(), 1901 fpu, 1902 lapic_state, 1903 xsave, 1904 xcrs, 1905 mp_state, 1906 tsc_khz, 1907 } 1908 .into()) 1909 } 1910 /// 1911 /// Get the current AArch64 CPU state 1912 /// 1913 #[cfg(target_arch = "aarch64")] 1914 fn state(&self) -> cpu::Result<CpuState> { 1915 let mut state = VcpuKvmState { 1916 mp_state: self.get_mp_state()?.into(), 1917 ..Default::default() 1918 }; 1919 // Get core registers 1920 state.core_regs = self.get_regs()?; 1921 1922 // Get systerm register 1923 // Call KVM_GET_REG_LIST to get all registers available to the guest. 1924 // For ArmV8 there are around 500 registers. 1925 let mut sys_regs: Vec<Register> = Vec::new(); 1926 let mut reg_list = RegList::new(500).unwrap(); 1927 self.fd 1928 .get_reg_list(&mut reg_list) 1929 .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?; 1930 1931 // At this point reg_list should contain: core registers and system 1932 // registers. 1933 // The register list contains the number of registers and their ids. We 1934 // will be needing to call KVM_GET_ONE_REG on each id in order to save 1935 // all of them. We carve out from the list the core registers which are 1936 // represented in the kernel by kvm_regs structure and for which we can 1937 // calculate the id based on the offset in the structure. 1938 reg_list.retain(|regid| is_system_register(*regid)); 1939 1940 // Now, for the rest of the registers left in the previously fetched 1941 // register list, we are simply calling KVM_GET_ONE_REG. 1942 let indices = reg_list.as_slice(); 1943 for index in indices.iter() { 1944 sys_regs.push(kvm_bindings::kvm_one_reg { 1945 id: *index, 1946 addr: self 1947 .fd 1948 .get_one_reg(*index) 1949 .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))? 1950 .try_into() 1951 .unwrap(), 1952 }); 1953 } 1954 1955 state.sys_regs = sys_regs; 1956 1957 Ok(state.into()) 1958 } 1959 #[cfg(target_arch = "x86_64")] 1960 /// 1961 /// Restore the previously saved CPU state 1962 /// 1963 /// Ordering requirements: 1964 /// 1965 /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are 1966 /// still running. 1967 /// 1968 /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so 1969 /// if we ever change the BSP, we have to do that before restoring anything. 1970 /// The same seems to be true for CPUID stuff. 1971 /// 1972 /// SREGS saves/restores a pending interrupt, similar to what 1973 /// VCPU_EVENTS also does. 1974 /// 1975 /// SET_REGS clears pending exceptions unconditionally, thus, it must be 1976 /// done before SET_VCPU_EVENTS, which restores it. 1977 /// 1978 /// SET_LAPIC must come after SET_SREGS, because the latter restores 1979 /// the apic base msr. 1980 /// 1981 /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR 1982 /// only restores successfully, when the LAPIC is correctly configured. 1983 /// 1984 /// Arguments: CpuState 1985 /// # Example 1986 /// 1987 /// ```rust 1988 /// # use hypervisor::kvm::KvmHypervisor; 1989 /// # use std::sync::Arc; 1990 /// let kvm = KvmHypervisor::new().unwrap(); 1991 /// let hv = Arc::new(kvm); 1992 /// let vm = hv.create_vm().expect("new VM fd creation failed"); 1993 /// vm.enable_split_irq().unwrap(); 1994 /// let vcpu = vm.create_vcpu(0, None).unwrap(); 1995 /// let state = vcpu.state().unwrap(); 1996 /// vcpu.set_state(&state).unwrap(); 1997 /// ``` 1998 fn set_state(&self, state: &CpuState) -> cpu::Result<()> { 1999 let state: VcpuKvmState = state.clone().into(); 2000 self.set_cpuid2(&state.cpuid)?; 2001 self.set_mp_state(state.mp_state.into())?; 2002 self.set_regs(&state.regs.into())?; 2003 self.set_sregs(&state.sregs.into())?; 2004 self.set_xsave(&state.xsave)?; 2005 self.set_xcrs(&state.xcrs)?; 2006 self.set_lapic(&state.lapic_state)?; 2007 self.set_fpu(&state.fpu)?; 2008 2009 if let Some(freq) = state.tsc_khz { 2010 self.set_tsc_khz(freq)?; 2011 } 2012 2013 // Try to set all MSRs previously stored. 2014 // If the number of MSRs set from SET_MSRS is different from the 2015 // expected amount, we fallback onto a slower method by setting MSRs 2016 // by chunks. This is the only way to make sure we try to set as many 2017 // MSRs as possible, even if some MSRs are not supported. 2018 let expected_num_msrs = state.msrs.len(); 2019 let num_msrs = self.set_msrs(&state.msrs)?; 2020 if num_msrs != expected_num_msrs { 2021 let mut faulty_msr_index = num_msrs; 2022 2023 loop { 2024 warn!( 2025 "Detected faulty MSR 0x{:x} while setting MSRs", 2026 state.msrs[faulty_msr_index].index 2027 ); 2028 2029 // Skip the first bad MSR 2030 let start_pos = faulty_msr_index + 1; 2031 2032 let sub_msr_entries = state.msrs[start_pos..].to_vec(); 2033 2034 let num_msrs = self.set_msrs(&sub_msr_entries)?; 2035 2036 if num_msrs == sub_msr_entries.len() { 2037 break; 2038 } 2039 2040 faulty_msr_index = start_pos + num_msrs; 2041 } 2042 } 2043 2044 self.set_vcpu_events(&state.vcpu_events)?; 2045 2046 Ok(()) 2047 } 2048 /// 2049 /// Restore the previously saved AArch64 CPU state 2050 /// 2051 #[cfg(target_arch = "aarch64")] 2052 fn set_state(&self, state: &CpuState) -> cpu::Result<()> { 2053 let state: VcpuKvmState = state.clone().into(); 2054 // Set core registers 2055 self.set_regs(&state.core_regs)?; 2056 // Set system registers 2057 for reg in &state.sys_regs { 2058 self.fd 2059 .set_one_reg(reg.id, reg.addr.into()) 2060 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?; 2061 } 2062 2063 self.set_mp_state(state.mp_state.into())?; 2064 2065 Ok(()) 2066 } 2067 2068 /// 2069 /// Initialize TDX for this CPU 2070 /// 2071 #[cfg(feature = "tdx")] 2072 fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> { 2073 tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address) 2074 .map_err(cpu::HypervisorCpuError::InitializeTdx) 2075 } 2076 2077 /// 2078 /// Set the "immediate_exit" state 2079 /// 2080 fn set_immediate_exit(&self, exit: bool) { 2081 self.fd.set_kvm_immediate_exit(exit.into()); 2082 } 2083 2084 /// 2085 /// Returns the details about TDX exit reason 2086 /// 2087 #[cfg(feature = "tdx")] 2088 fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> { 2089 let kvm_run = self.fd.get_kvm_run(); 2090 // SAFETY: accessing a union field in a valid structure 2091 let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall }; 2092 2093 tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND; 2094 2095 if tdx_vmcall.type_ != 0 { 2096 return Err(cpu::HypervisorCpuError::UnknownTdxVmCall); 2097 } 2098 2099 match tdx_vmcall.subfunction { 2100 TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote), 2101 TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => { 2102 Ok(TdxExitDetails::SetupEventNotifyInterrupt) 2103 } 2104 _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall), 2105 } 2106 } 2107 2108 /// 2109 /// Set the status code for TDX exit 2110 /// 2111 #[cfg(feature = "tdx")] 2112 fn set_tdx_status(&mut self, status: TdxExitStatus) { 2113 let kvm_run = self.fd.get_kvm_run(); 2114 // SAFETY: accessing a union field in a valid structure 2115 let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall }; 2116 2117 tdx_vmcall.status_code = match status { 2118 TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS, 2119 TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND, 2120 }; 2121 } 2122 #[cfg(target_arch = "x86_64")] 2123 /// 2124 /// Return the list of initial MSR entries for a VCPU 2125 /// 2126 fn boot_msr_entries(&self) -> Vec<MsrEntry> { 2127 use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB}; 2128 2129 [ 2130 msr!(msr_index::MSR_IA32_SYSENTER_CS), 2131 msr!(msr_index::MSR_IA32_SYSENTER_ESP), 2132 msr!(msr_index::MSR_IA32_SYSENTER_EIP), 2133 msr!(msr_index::MSR_STAR), 2134 msr!(msr_index::MSR_CSTAR), 2135 msr!(msr_index::MSR_LSTAR), 2136 msr!(msr_index::MSR_KERNEL_GS_BASE), 2137 msr!(msr_index::MSR_SYSCALL_MASK), 2138 msr!(msr_index::MSR_IA32_TSC), 2139 msr_data!( 2140 msr_index::MSR_IA32_MISC_ENABLE, 2141 msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64 2142 ), 2143 msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB), 2144 ] 2145 .to_vec() 2146 } 2147 #[cfg(target_arch = "aarch64")] 2148 fn has_pmu_support(&self) -> bool { 2149 let cpu_attr = kvm_bindings::kvm_device_attr { 2150 group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL, 2151 attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT), 2152 addr: 0x0, 2153 flags: 0, 2154 }; 2155 self.fd.has_device_attr(&cpu_attr).is_ok() 2156 } 2157 #[cfg(target_arch = "aarch64")] 2158 fn init_pmu(&self, irq: u32) -> cpu::Result<()> { 2159 let cpu_attr = kvm_bindings::kvm_device_attr { 2160 group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL, 2161 attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT), 2162 addr: 0x0, 2163 flags: 0, 2164 }; 2165 let cpu_attr_irq = kvm_bindings::kvm_device_attr { 2166 group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL, 2167 attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ), 2168 addr: &irq as *const u32 as u64, 2169 flags: 0, 2170 }; 2171 self.fd 2172 .set_device_attr(&cpu_attr_irq) 2173 .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?; 2174 self.fd 2175 .set_device_attr(&cpu_attr) 2176 .map_err(|_| cpu::HypervisorCpuError::InitializePmu) 2177 } 2178 2179 #[cfg(target_arch = "x86_64")] 2180 /// 2181 /// Get the frequency of the TSC if available 2182 /// 2183 fn tsc_khz(&self) -> cpu::Result<Option<u32>> { 2184 match self.fd.get_tsc_khz() { 2185 Err(e) => { 2186 if e.errno() == libc::EIO { 2187 Ok(None) 2188 } else { 2189 Err(cpu::HypervisorCpuError::GetTscKhz(e.into())) 2190 } 2191 } 2192 Ok(v) => Ok(Some(v)), 2193 } 2194 } 2195 2196 #[cfg(target_arch = "x86_64")] 2197 /// 2198 /// Set the frequency of the TSC if available 2199 /// 2200 fn set_tsc_khz(&self, freq: u32) -> cpu::Result<()> { 2201 match self.fd.set_tsc_khz(freq) { 2202 Err(e) => { 2203 if e.errno() == libc::EIO { 2204 Ok(()) 2205 } else { 2206 Err(cpu::HypervisorCpuError::SetTscKhz(e.into())) 2207 } 2208 } 2209 Ok(_) => Ok(()), 2210 } 2211 } 2212 } 2213 2214 impl KvmVcpu { 2215 #[cfg(target_arch = "x86_64")] 2216 /// 2217 /// X86 specific call that returns the vcpu's current "xsave struct". 2218 /// 2219 fn get_xsave(&self) -> cpu::Result<Xsave> { 2220 self.fd 2221 .get_xsave() 2222 .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into())) 2223 } 2224 #[cfg(target_arch = "x86_64")] 2225 /// 2226 /// X86 specific call that sets the vcpu's current "xsave struct". 2227 /// 2228 fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> { 2229 self.fd 2230 .set_xsave(xsave) 2231 .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into())) 2232 } 2233 #[cfg(target_arch = "x86_64")] 2234 /// 2235 /// X86 specific call that returns the vcpu's current "xcrs". 2236 /// 2237 fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> { 2238 self.fd 2239 .get_xcrs() 2240 .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into())) 2241 } 2242 #[cfg(target_arch = "x86_64")] 2243 /// 2244 /// X86 specific call that sets the vcpu's current "xcrs". 2245 /// 2246 fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> { 2247 self.fd 2248 .set_xcrs(xcrs) 2249 .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into())) 2250 } 2251 #[cfg(target_arch = "x86_64")] 2252 /// 2253 /// Returns currently pending exceptions, interrupts, and NMIs as well as related 2254 /// states of the vcpu. 2255 /// 2256 fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> { 2257 self.fd 2258 .get_vcpu_events() 2259 .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into())) 2260 } 2261 #[cfg(target_arch = "x86_64")] 2262 /// 2263 /// Sets pending exceptions, interrupts, and NMIs as well as related states 2264 /// of the vcpu. 2265 /// 2266 fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> { 2267 self.fd 2268 .set_vcpu_events(events) 2269 .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into())) 2270 } 2271 } 2272