// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState, MPIDR_EL1,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::Vgic;
use crate::cpu;
use crate::device;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, InterruptSourceConfig, VmOps};
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use std::any::Any;
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
#[cfg(target_arch = "x86_64")]
use std::os::unix::io::AsRawFd;
#[cfg(feature = "tdx")]
use std::os::unix::io::RawFd;
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
#[cfg(target_arch = "aarch64")]
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::{
    CpuIdEntry, FpuState, SpecialRegisters, StandardRegisters, NUM_IOAPIC_PINS,
};
#[cfg(target_arch = "x86_64")]
use crate::ClockData;
use crate::{
    CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
    USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
};
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_guest_debug, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC,
    KVM_CAP_SPLIT_IRQCHIP, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::check_required_kvm_extensions;
#[cfg(target_arch = "x86_64")]
pub use x86_64::{CpuId, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState, Xsave};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing,
    kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP,
    KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
    KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd,
    kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 35;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}
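
// Illustrative sketch (hypothetical values, not part of the API): the two
// conversions above round-trip the access flags, so a read-only KVM region
// maps to a generic region without USER_MEMORY_REGION_WRITE and back:
//
//     let kvm_region = kvm_userspace_memory_region {
//         slot: 0,
//         guest_phys_addr: 0x10_0000,
//         memory_size: 0x1000,
//         userspace_addr: host_addr, // assumed valid host mapping
//         flags: KVM_MEM_READONLY,
//     };
//     let generic: UserMemoryRegion = kvm_region.into();
//     assert_eq!(generic.flags & USER_MEMORY_REGION_WRITE, 0);
//     assert_eq!(kvm_userspace_memory_region::from(generic).flags, KVM_MEM_READONLY);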

impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}

impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    fn from(a: IoEventAddress) -> Self {
        match a {
            IoEventAddress::Pio(x) => Self::Pio(x),
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<VcpuKvmState> for CpuState {
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}

impl From<CpuState> for VcpuKvmState {
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}

#[cfg(target_arch = "x86_64")]
impl From<ClockData> for kvm_clock_data {
    fn from(ms: ClockData) -> Self {
        match ms {
            ClockData::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("ClockData is not valid"),
        }
    }
}

impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
    fn from(s: kvm_irq_routing_entry) -> Self {
        IrqRoutingEntry::Kvm(s)
    }
}

impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
    fn from(e: IrqRoutingEntry) -> Self {
        match e {
            IrqRoutingEntry::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("IrqRoutingEntry is not valid"),
        }
    }
}

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

///
/// Implementation of the Vm trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// vm.set/get().unwrap()
///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
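    // Illustrative usage sketch (assumes an in-kernel irqchip was created via
    // `create_irq_chip`, and that `vm` is a `KvmVm`):
    //
    //     let evt = EventFd::new(libc::EFD_NONBLOCK).unwrap();
    //     vm.register_irqfd(&evt, 4)?; // writing to `evt` now injects GSI 4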
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(
        &self,
        vcpu_count: u64,
        dist_addr: u64,
        dist_size: u64,
        redist_size: u64,
        msi_size: u64,
        nr_irqs: u32,
    ) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(
            self,
            vcpu_count,
            dist_addr,
            dist_size,
            redist_size,
            msi_size,
            nr_irqs,
        )
        .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64 there is a limitation on the range of the
                    // 'devid': it cannot be greater than 65536 (the maximum
                    // of u16).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // sits in its high 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |     segment     |     bus    |   device   |  function  |
                    //
                    // Since we support only one bus per segment, we can build
                    // a 'devid' by replacing the 'bus' bits with the low 8
                    // bits of the 'segment' data.
                    // This way we solve the range-checking problem and still
                    // give a distinct `devid` to every device. The limitation
                    // is that at most 256 segments can be supported.
                    //
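                    // For example (illustrative values): segment 0x0001, bus
                    // 0x00, device 0x02, function 0x3 encode to the BDF
                    // 0x0001_0013; the line below turns that into
                    // (0x0001_0000 >> 8) | 0x13 = 0x113, which fits in a u16.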
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        // SAFETY: irq_routing was initialized with entries.len(), and it is
        // now being turned into entries_slice with entries.len() again. It is
        // guaranteed to be large enough to hold everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(&entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }
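    // Illustrative sketch (hypothetical `gpa`/`size`/`hva` values): a writable
    // region with dirty-page logging carries KVM_MEM_LOG_DIRTY_PAGES, while a
    // read-only region without logging carries KVM_MEM_READONLY:
    //
    //     let region = vm.make_user_memory_region(0, gpa, size, hva, false, true);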
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
        let device_fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        Ok(Arc::new(device_fd))
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on the underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create a split irqchip: only the local APIC is emulated in-kernel;
        // the PICs and the IOAPIC are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
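    // Illustrative sketch (assumed pause/resume flow): the guest clock is
    // typically captured while pausing the VM and restored before resuming it:
    //
    //     let clock = vm.get_clock()?; // while pausing
    //     vm.set_clock(&clock)?;       // before resuming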
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }
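    // Illustrative note (assuming 4 KiB pages): the bitmap returned by
    // `get_dirty_log` packs one bit per page into u64 words, so a 1 MiB slot
    // (256 pages) comes back as 4 u64 entries:
    //
    //     let bitmap = vm.get_dirty_log(slot, base_gpa, 1 << 20)?;
    //     assert_eq!(bitmap.len(), 4);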
    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        use std::io::{Error, ErrorKind};
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = kvm_bindings::CpuId::from_entries(&cpuid).map_err(|_| {
            vm::HypervisorVmError::InitializeTdx(Error::new(
                ErrorKind::Other,
                "failed to allocate CpuId",
            ))
        })?;

        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            tsc_khz: u32,
            attributes: u64,
            cpuid: u64,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            reserved: [u64; 43],
        }
        let data = TdxInitVm {
            max_vcpus,
            tsc_khz: 0,
            attributes: 0,
            cpuid: kvm_cpuid.as_fam_struct_ptr() as u64,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            reserved: [0; 43],
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            if measure { 1 } else { 0 },
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}
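
// Illustrative sketch (requires the `tdx` feature and a TDX-capable host):
// finalizing the TD measurement reduces to a single command with no payload,
// which is exactly what `tdx_finalize` above does:
//
//     tdx_command(&vm_fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)?;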

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}
/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    pub fn new() -> hypervisor::Result<KvmHypervisor> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(KvmHypervisor { kvm: kvm_obj })
    }
}
/// Implementation of the Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    /// Create a KVM vm object of a specific VM type and return the object as a Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(KvmVmType::LegacyVm).unwrap()
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, the ioctl was
                        // interrupted and we have to retry, since this cannot
                        // be considered a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs = MsrEntries::new(num_msrs).unwrap();
            let indices = msr_list.as_slice();
            let msr_entries = msrs.as_mut_slice();
            for (pos, index) in indices.iter().enumerate() {
                msr_entries[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(target_arch = "aarch64")]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as a Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap()
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }
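    // Illustrative note: on AArch64 the `vm_type` passed to KVM_CREATE_VM
    // encodes the IPA size in bits, so a host IPA limit of 40 yields
    // `vm_type == 40`; on other architectures the default type 0 is used.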
    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by KVM.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve the AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of the Vcpu trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// vcpu.get/set().unwrap()
///
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        Ok(self
            .fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
            .into())
    }
    ///
    /// Returns the vCPU general purpose registers.
    /// The `KVM_GET_REGS` ioctl is not available on AArch64, so `KVM_GET_ONE_REG`
    /// is used to get the registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state: StandardRegisters = kvm_regs::default();
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These are actually the general-purpose registers of the Armv8-a
        // architecture (i.e. x0-x30 when used as 64-bit registers, or w0-w30 when
        // used as 32-bit registers).
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "Other register" section of the ARMv8-a architecture.
        // First one, the stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
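        // Illustrative note: `arm64_core_reg_id!` builds the *_ONE_REG id by
        // or-ing KVM_REG_ARM64 | KVM_REG_ARM_CORE and a size flag with the
        // field offset expressed in 32-bit words, e.g. (sketch):
        //
        //     let pc_id = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset__of!(user_pt_regs, pc));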
        // Next is the processor state.
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The stack pointer associated with EL1.
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The Exception Link Register for EL1: when taking an exception to EL1,
        // this register holds the address to return to afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The Saved Program Status Registers: the kernel uses 5 of them.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to the floating point registers, which are stored in the
        // user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .into();
            off += mem::size_of::<u128>();
        }

        // The Floating-point Status Register.
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;

        // The Floating-point Control Register.
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;
        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        let regs = (*regs).into();
        self.fd
            .set_regs(&regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    ///
    /// Sets the vCPU general purpose registers.
    /// The `KVM_SET_REGS` ioctl is not available on AArch64, so `KVM_SET_ONE_REG`
    /// is used to set the registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // This function mirrors `get_regs` and sets the registers in the same
        // order. Look there for additional info on the registers.
        let mut off = offset__of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i] as u64,
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Set an attribute for the vcpu.
    ///
    fn set_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .set_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuAttribute(e.into()))
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Check if the vcpu has a certain attribute.
    ///
    fn has_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .has_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::HasVcpuAttribute(e.into()))
    }
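    // Illustrative sketch (assumed kvm-bindings constants): on AArch64 these
    // attribute calls are used e.g. to probe and initialise the vPMU:
    //
    //     let attr = kvm_bindings::kvm_device_attr {
    //         group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
    //         attr: kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT as u64,
    //         ..Default::default()
    //     };
    //     vcpu.has_vcpu_attr(&attr)?;
    //     vcpu.set_vcpu_attr(&attr)?;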
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        Ok(self
            .fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        let sregs = (*sregs).into();
        self.fd
            .set_sregs(&sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        Ok(self
            .fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
        self.fd
            .set_fpu(&fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = <CpuId>::from_entries(&cpuid)
            .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;

        self.fd
            .set_cpuid2(&kvm_cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable the HyperV SynIC.
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated, as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        self.fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        self.fd
            .set_lapic(klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
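    // Illustrative sketch for the MSR accessors below: `get_msrs` expects the
    // entry indices to be pre-populated and returns how many entries KVM
    // actually filled in (0x10 is IA32_TIME_STAMP_COUNTER, used as an example):
    //
    //     let mut msrs = MsrEntries::from_entries(&[kvm_msr_entry {
    //         index: 0x10,
    //         ..Default::default()
    //     }])
    //     .unwrap();
    //     let nread = vcpu.get_msrs(&mut msrs)?;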
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
    fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> {
        self.fd
            .get_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Setup the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> {
        self.fd
            .set_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
    fn get_mp_state(&self) -> cpu::Result<MpState> {
        Ok(self
            .fd
            .get_mp_state()
            .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
            .into())
    }
    ///
    /// Sets the vcpu's current "multiprocessing state".
    ///
    fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
        self.fd
            .set_mp_state(mp_state.into())
            .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Translates a guest virtual address to a guest physical address using the
    /// `KVM_TRANSLATE` ioctl.
    ///
    fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
        let tr = self
            .fd
            .translate_gva(gva)
            .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
        // tr.valid is set if the GVA is mapped to a valid GPA.
        match tr.valid {
            0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
                "Invalid GVA: {:#x}",
                gva
            ))),
            _ => Ok((tr.physical_address, 0)),
        }
    }
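    // Illustrative sketch (hypothetical address): an unmapped GVA surfaces as
    // a TranslateVirtualAddress error rather than a zero GPA:
    //
    //     let (gpa, _flags) = vcpu.translate_gva(0xffff_8000_0000_0000, 0)?;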
    ///
    /// Runs the current virtual CPU and returns an exit reason.
    ///
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioRead(addr, data))
                }
                VcpuExit::MmioWrite(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioWrite(addr, data))
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
                #[cfg(feature = "tdx")]
                VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
                VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
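    // Illustrative vCPU loop sketch (assumes `vcpu` runs on a dedicated
    // thread and `vm_ops` handles PIO/MMIO, turning those exits into Ignore):
    //
    //     loop {
    //         match vcpu.run()? {
    //             cpu::VmExit::Ignore => {}   // already handled via vm_ops
    //             cpu::VmExit::Reset => break,
    //             exit => todo!("handle {:?}", exit),
    //         }
    //     }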
    #[cfg(target_arch = "x86_64")]
    ///
    /// Let the guest know that it has been paused, which prevents potential
    /// soft lockups when it is resumed.
    ///
    fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
        if let Err(e) = self.fd.kvmclock_ctrl() {
            // The Linux kernel returns -EINVAL if the PV clock isn't yet initialised,
            // which could be because we're still in firmware or because the guest
            // doesn't use the KVM clock.
            if e.errno() != libc::EINVAL {
                return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
            }
        }

        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets debug registers to set hardware breakpoints and/or enable single step.
    ///
    fn set_guest_debug(
        &self,
        addrs: &[vm_memory::GuestAddress],
        singlestep: bool,
    ) -> cpu::Result<()> {
        if addrs.len() > 4 {
            return Err(cpu::HypervisorCpuError::SetDebugRegs(anyhow!(
                "At most 4 breakpoints are supported, but {} addresses were passed",
                addrs.len()
            )));
        }

        let mut dbg = kvm_guest_debug {
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
            ..Default::default()
        };
        if singlestep {
            dbg.control |= KVM_GUESTDBG_SINGLESTEP;
        }

        // Set bits 9 and 10.
        // bit 9: GE (global exact breakpoint enable) flag.
        // bit 10: always 1.
        dbg.arch.debugreg[7] = 0x0600;

        for (i, addr) in addrs.iter().enumerate() {
            dbg.arch.debugreg[i] = addr.0;
            // Set the global breakpoint enable flag
            dbg.arch.debugreg[7] |= 2 << (i * 2);
        }

        self.fd
            .set_guest_debug(&dbg)
            .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
    }
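    // Illustrative note: with two breakpoint addresses, debugreg[7] works out
    // to 0x0600 | (2 << 0) | (2 << 2) = 0x060a, i.e. the GE bit plus the G0
    // and G1 global-enable bits for DR0 and DR1.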
    #[cfg(target_arch = "aarch64")]
    fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
        self.fd
            .vcpu_init(kvi)
            .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
    }
    ///
    /// Sets the value of one register for this vCPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
        self.fd
            .set_one_reg(reg_id, data)
            .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
    }
    ///
    /// Gets the value of one register for this vCPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(reg_id)
            .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
    }
    ///
    /// Gets a list of the guest registers that are supported for the
    /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
        self.fd
            .get_reg_list(reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
    }
    ///
    /// Save the state of the system registers.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_sys_regs(&self) -> cpu::Result<Vec<Register>> {
        // Call KVM_GET_REG_LIST to get all registers available to the guest.
        // For Armv8 there are around 500 registers.
        let mut state: Vec<Register> = Vec::new();
        let mut reg_list = RegList::new(500).unwrap();
        self.fd
            .get_reg_list(&mut reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;

        // At this point reg_list should contain the core registers and the
        // system registers. The register list contains the number of registers
        // and their ids. We need to call KVM_GET_ONE_REG on each id in order
        // to save all of them. We carve out from the list the core registers,
        // which are represented in the kernel by the kvm_regs structure and
        // for which we can calculate the id based on the offset in the
        // structure.
        reg_list.retain(|regid| is_system_register(*regid));

        // For the system registers left in the previously fetched register
        // list, we simply call KVM_GET_ONE_REG.
        let indices = reg_list.as_slice();
        for index in indices.iter() {
            state.push(kvm_bindings::kvm_one_reg {
                id: *index,
                addr: self
                    .fd
                    .get_one_reg(*index)
                    .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
            });
        }

        Ok(state)
    }
    ///
    /// Restore the state of the system registers.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_sys_regs(&self, state: &[Register]) -> cpu::Result<()> {
        for reg in state {
            self.fd
                .set_one_reg(reg.id, reg.addr)
                .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
        }
        Ok(())
    }
    ///
    /// Read the MPIDR - Multiprocessor Affinity Register.
    ///
    #[cfg(target_arch = "aarch64")]
    fn read_mpidr(&self) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(MPIDR_EL1)
            .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
    }
    ///
    /// Configure core registers for a given CPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
        #[allow(non_upper_case_globals)]
        // PSR (Processor State Register) bits.
        // Taken from arch/arm64/include/uapi/asm/ptrace.h.
        const PSR_MODE_EL1h: u64 = 0x0000_0005;
        const PSR_F_BIT: u64 = 0x0000_0040;
        const PSR_I_BIT: u64 = 0x0000_0080;
        const PSR_A_BIT: u64 = 0x0000_0100;
        const PSR_D_BIT: u64 = 0x0000_0200;
        // Taken from arch/arm64/kvm/inject_fault.c.
        const PSTATE_FAULT_BITS_64: u64 =
            PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;

        let kreg_off = offset__of!(kvm_regs, regs);

        // Get the register index of the PSTATE (Processor State) register.
        let pstate = offset__of!(user_pt_regs, pstate) + kreg_off;
        self.set_reg(
            arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
            PSTATE_FAULT_BITS_64,
        )
        .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Other vCPUs are powered off initially, awaiting PSCI wakeup.
        if cpu_id == 0 {
            // Setting the PC (Program Counter) to the current program address (kernel address).
            let pc = offset__of!(user_pt_regs, pc) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip as u64)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

            // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
            // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
            // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
            // We are choosing to place it at the end of DRAM. See `get_fdt_addr`.
            let regs0 = offset__of!(user_pt_regs, regs) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), fdt_start)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        }
        Ok(())
    }
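    // Illustrative note: PSTATE_FAULT_BITS_64 above evaluates to 0x3c5, i.e.
    // EL1h with the D, A, I and F exception bits all masked, matching what
    // arch/arm64/kvm/inject_fault.c programs for a freshly reset vCPU.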
    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state.
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change the state of the LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save: it looks like it
    /// could be affected by the internal state modifications performed by
    /// the other GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?.into();
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fall back to a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr).unwrap();
            }
        }
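        // Illustrative walk-through of the fallback below (assumed numbers):
        // with 100 expected MSRs, if GET_MSRS returns 57, then entry 57 is the
        // faulty one; entries 0..57 are kept and the read is retried from
        // entry 58 onward, repeating until a sub-slice is read back in full.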
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr).unwrap();
            }
        }

        let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.get_msrs(&mut msr_entries)?;
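        // Worked example of the fallback below: with 10 MSRs requested and
        // GET_MSRS returning 4, entries 0..4 were read and entry 4 is the
        // faulty one. Keep entries 0..4, retry from entry 5, and repeat,
        // skipping one unsupported MSR per iteration, until a sub-request
        // returns all of its entries.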
        let msrs = if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp =
                MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries.as_slice()[faulty_msr_index].index
                );

                let start_pos = faulty_msr_index + 1;
                let mut sub_msr_entries =
                    MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                for i in 0..num_msrs {
                    msr_entries_tmp
                        .push(sub_msr_entries.as_slice()[i])
                        .map_err(|e| {
                            cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
                                "Failed adding MSR entries: {:?}",
                                e
                            ))
                        })?;
                }

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(VcpuKvmState {
            cpuid,
            msrs,
            vcpu_events,
            regs: regs.into(),
            sregs: sregs.into(),
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        }
        .into())
    }
    ///
    /// Get the current AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        let mut state = VcpuKvmState {
            mp_state: self.get_mp_state()?.into(),
            mpidr: self.read_mpidr()?,
            ..Default::default()
        };
        state.core_regs = self.get_regs()?;
        state.sys_regs = self.get_sys_regs()?;

        Ok(state.into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores it.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully when the LAPIC is correctly configured.
    ///
    /// # Arguments
    ///
    /// * `state` - The `CpuState` to restore.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state.into())?;
        self.set_regs(&state.regs.into())?;
        self.set_sregs(&state.sregs.into())?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS differs from the expected
        // amount, we fall back to a slower method, setting MSRs in chunks.
        // This is the only way to make sure we try to set as many MSRs as
        // possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs.as_slice()[faulty_msr_index].index
                );

                let start_pos = faulty_msr_index + 1;
                let sub_msr_entries =
                    MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
    ///
    /// Restore the previously saved AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_regs(&state.core_regs)?;
        self.set_sys_regs(&state.sys_regs)?;
        self.set_mp_state(state.mp_state.into())?;

        Ok(())
    }

    ///
    /// Initialize TDX for this CPU
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
            .map_err(cpu::HypervisorCpuError::InitializeTdx)
    }

    ///
    /// Set the "immediate_exit" state
    ///
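    /// # Example
    ///
    /// A minimal sketch mirroring the examples in this module; the typical
    /// caller is a signal handler that forces the next (or current) KVM_RUN
    /// to return to userspace immediately.
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// vcpu.set_immediate_exit(true);
    /// ```
    ///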
    fn set_immediate_exit(&self, exit: bool) {
        self.fd.set_kvm_immediate_exit(exit.into());
    }

    ///
    /// Returns details about the TDX exit reason
    ///
    #[cfg(feature = "tdx")]
    fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
        let kvm_run = self.fd.get_kvm_run();
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;

        if tdx_vmcall.type_ != 0 {
            return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
        }

        match tdx_vmcall.subfunction {
            TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
            TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
                Ok(TdxExitDetails::SetupEventNotifyInterrupt)
            }
            _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
        }
    }

    ///
    /// Set the status code for a TDX exit
    ///
    #[cfg(feature = "tdx")]
    fn set_tdx_status(&mut self, status: TdxExitStatus) {
        let kvm_run = self.fd.get_kvm_run();
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = match status {
            TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
            TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
        };
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Return the list of initial MSR entries for a VCPU
    ///
    fn boot_msr_entries(&self) -> MsrEntries {
        use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
        use kvm_bindings::kvm_msr_entry as MsrEntry;

        MsrEntries::from_entries(&[
            msr!(msr_index::MSR_IA32_SYSENTER_CS),
            msr!(msr_index::MSR_IA32_SYSENTER_ESP),
            msr!(msr_index::MSR_IA32_SYSENTER_EIP),
            msr!(msr_index::MSR_STAR),
            msr!(msr_index::MSR_CSTAR),
            msr!(msr_index::MSR_LSTAR),
            msr!(msr_index::MSR_KERNEL_GS_BASE),
            msr!(msr_index::MSR_SYSCALL_MASK),
            msr!(msr_index::MSR_IA32_TSC),
            msr_data!(
                msr_index::MSR_IA32_MISC_ENABLE,
                msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
            ),
            msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
        ])
        .unwrap()
    }
}
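
// The private helpers below back the x86_64 snapshot path: state() and
// set_state() use them to save and restore vCPU state that KVM only exposes
// through dedicated ioctls (XSAVE, XCRs and vCPU events).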
impl KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xsave struct".
    ///
    fn get_xsave(&self) -> cpu::Result<Xsave> {
        self.fd
            .get_xsave()
            .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xsave struct".
    ///
    fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
        self.fd
            .set_xsave(xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs as well as related
    /// states of the vcpu.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs as well as related states
    /// of the vcpu.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
}

/// Device struct for KVM
pub type KvmDevice = DeviceFd;

impl device::Device for KvmDevice {
    ///
    /// Set device attribute
    ///
    fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
        self.set_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
    }
    ///
    /// Get device attribute
    ///
    fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
        self.get_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
    }
    ///
    /// Cast to the underlying KVM device fd
    ///
    fn as_any(&self) -> &dyn Any {
        self
    }
}
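
// A sketch of driving the `device::Device` impl above. It assumes the
// `vm::Vm` trait exposes a `create_device()` method handing back this
// `KvmDevice` type; the group/attr values and `group_fd` are illustrative
// only, which is why the snippet is kept as a comment:
//
//     let mut vfio_dev = CreateDevice {
//         type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
//         fd: 0,
//         flags: 0,
//     };
//     let device = vm.create_device(&mut vfio_dev).unwrap();
//     let attr = DeviceAttr {
//         flags: 0,
//         group: 1, // e.g. KVM_DEV_VFIO_GROUP
//         attr: 1,  // e.g. KVM_DEV_VFIO_GROUP_ADD
//         addr: &group_fd as *const i32 as u64,
//     };
//     device.set_device_attr(&attr).unwrap();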