// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState, MPIDR_EL1,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::Vgic;
use crate::cpu;
use crate::device;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, InterruptSourceConfig, VmOps};
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use std::any::Any;
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
#[cfg(target_arch = "x86_64")]
use std::os::unix::io::AsRawFd;
#[cfg(feature = "tdx")]
use std::os::unix::io::RawFd;
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
#[cfg(target_arch = "aarch64")]
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::NUM_IOAPIC_PINS;
#[cfg(target_arch = "x86_64")]
use crate::ClockData;
use crate::{
    CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
    USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
};
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_guest_debug, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC,
    KVM_CAP_SPLIT_IRQCHIP, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::{check_required_kvm_extensions, FpuState, SpecialRegisters, StandardRegisters};
#[cfg(target_arch = "x86_64")]
pub use x86_64::{
    CpuId, CpuIdEntry, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState, Xsave,
    CPUID_FLAG_VALID_INDEX,
};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing,
    kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP,
    KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
    KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_create_device as CreateDevice,
    kvm_bindings::kvm_device_attr as DeviceAttr, kvm_bindings::kvm_run,
    kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd, kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 35;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}

impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    fn from(a: IoEventAddress) -> Self {
        match a {
            IoEventAddress::Pio(x) => Self::Pio(x),
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<VcpuKvmState> for CpuState {
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}

impl From<CpuState> for VcpuKvmState {
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}

#[cfg(target_arch = "x86_64")]
impl From<ClockData> for kvm_clock_data {
    fn from(ms: ClockData) -> Self {
        match ms {
            ClockData::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("ClockData is not valid"),
        }
    }
}

impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
    fn from(s: kvm_irq_routing_entry) -> Self {
        IrqRoutingEntry::Kvm(s)
    }
}

impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
    fn from(e: IrqRoutingEntry) -> Self {
        match e {
            IrqRoutingEntry::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("IrqRoutingEntry is not valid"),
        }
    }
}

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

///
/// Implementation of Vm trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// vm.set/get().unwrap()
///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a new KVM vCPU file descriptor and maps it to the `KvmVcpu` wrapper.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(
        &self,
        vcpu_count: u64,
        dist_addr: u64,
        dist_size: u64,
        redist_size: u64,
        msi_size: u64,
        nr_irqs: u32,
    ) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(
            self,
            vcpu_count,
            dist_addr,
            dist_size,
            redist_size,
            msi_size,
            nr_irqs,
        )
        .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64 the range of the 'devid' is limited: it must
                    // fit in 16 bits (the range of a u16).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // occupies the high 16 bits.
                    // The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |     segment     |     bus    |   device   |  function  |
                    //
                    // Since we only support one bus per segment, we can build a
                    // 'devid' by replacing the 'bus' bits with the low 8 bits of
                    // the 'segment' data.
                    // This resolves the range-checking problem and gives a
                    // different 'devid' to every device. The limitation is that
                    // at most 256 segments can be supported.
                    //
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
        // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
        // everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(&entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
        let device_fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        Ok(Arc::new(device_fd))
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on the underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create split irqchip:
        // only the local APIC is emulated in the kernel; the PICs and the
        // IOAPIC are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &CpuId, max_vcpus: u32) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            tsc_khz: u32,
            attributes: u64,
            cpuid: u64,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            reserved: [u64; 43],
        }
        let data = TdxInitVm {
            max_vcpus,
            tsc_khz: 0,
            attributes: 0,
            cpuid: cpuid.as_fam_struct_ptr() as u64,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            reserved: [0; 43],
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };
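
        // Descriptive note: `pages` expresses the region size in 4 KiB pages,
        // and the `measure` flag is forwarded below as the ioctl metadata,
        // which (as we understand the TDX interface) requests that the pages
        // also be folded into the TD measurement.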
        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            if measure { 1 } else { 0 },
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}
/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    pub fn new() -> hypervisor::Result<KvmHypervisor> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(KvmHypervisor { kvm: kvm_obj })
    }
}
/// Implementation of Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(KvmVmType::LegacyVm).unwrap()
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, the ioctl has been
                        // interrupted and we have to retry, as this cannot be
                        // considered a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs = MsrEntries::new(num_msrs).unwrap();
            let indices = msr_list.as_slice();
            let msr_entries = msrs.as_mut_slice();
            for (pos, index) in indices.iter().enumerate() {
                msr_entries[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(target_arch = "aarch64")]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap()
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<CpuId> {
        self.kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by KVM.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of Vcpu trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// vcpu.get/set().unwrap()
///
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        self.fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))
    }
    ///
    /// Returns the vCPU general purpose registers.
    /// The `KVM_GET_REGS` ioctl is not available on AArch64, so `KVM_GET_ONE_REG`
    /// is used to get the registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state: StandardRegisters = kvm_regs::default();
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These actually are the general-purpose registers of the Armv8-a
        // architecture (i.e. x0-x30 when used as 64-bit registers, or w0-w30 when used as 32-bit registers).
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "Other register" section of the ARMv8-a architecture.
        // First one, the stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Next is the processor state.
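        // (PSTATE bundles the condition flags, the DAIF exception masks and
        // the EL/SP selection, saved here as a single 64-bit value.)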
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The stack pointer associated with EL1
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Exception Link Register for EL1: when taking an exception to EL1,
        // this register holds the address to return to afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Saved Program Status Registers; there are 5 of them used in the kernel.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to the floating-point registers, which are stored in the
        // user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .into();
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;
        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        self.fd
            .set_regs(regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    ///
    /// Sets the vCPU general purpose registers.
    /// The `KVM_SET_REGS` ioctl is not available on AArch64, so `KVM_SET_ONE_REG`
    /// is used to set the registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // This function sets the registers in exactly the same order as `get_regs`
        // retrieves them; see that function for additional details on each register.
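        // Core registers are addressed by their byte offset into `kvm_regs`;
        // `arm64_core_reg_id!` turns that offset into the register id expected
        // by KVM_SET_ONE_REG.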
        let mut off = offset__of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i] as u64,
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Set attribute for vcpu.
    ///
    fn set_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .set_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuAttribute(e.into()))
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Check if vcpu has a certain attribute.
    ///
    fn has_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .has_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::HasVcpuAttribute(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        self.fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        self.fd
            .set_sregs(sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        self.fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        self.fd
            .set_fpu(fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &CpuId) -> cpu::Result<()> {
        self.fd
            .set_cpuid2(cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable the Hyper-V SynIC.
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<CpuId> {
        self.fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        self.fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        self.fd
            .set_lapic(klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
    fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> {
        self.fd
            .get_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Setup the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> {
        self.fd
            .set_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
    fn get_mp_state(&self) -> cpu::Result<MpState> {
        Ok(self
            .fd
            .get_mp_state()
            .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
            .into())
    }
    ///
    /// Sets the vcpu's current "multiprocessing state".
    ///
    fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
        self.fd
            .set_mp_state(mp_state.into())
            .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xsave struct".
    ///
    fn get_xsave(&self) -> cpu::Result<Xsave> {
        self.fd
            .get_xsave()
            .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xsave struct".
    ///
    fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
        self.fd
            .set_xsave(xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Translates a guest virtual address to a guest physical address using the `KVM_TRANSLATE` ioctl.
    ///
    fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
        let tr = self
            .fd
            .translate_gva(gva)
            .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
        // tr.valid is set if the GVA is mapped to a valid GPA.
        match tr.valid {
            0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
                "Invalid GVA: {:#x}",
                gva
            ))),
            _ => Ok((tr.physical_address, 0)),
        }
    }
    ///
    /// Triggers the running of the current virtual CPU, returning an exit reason.
    ///
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioRead(addr, data))
                }
                VcpuExit::MmioWrite(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioWrite(addr, data))
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
                #[cfg(feature = "tdx")]
                VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
                VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs as well as related
    /// states of the vcpu.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs as well as related states
    /// of the vcpu.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Lets the guest know that it has been paused, which prevents potential
    /// soft lockups when it is resumed.
    ///
    fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
        if let Err(e) = self.fd.kvmclock_ctrl() {
            // The Linux kernel returns -EINVAL if the PV clock isn't yet initialised,
            // which could be because we're still in firmware or the guest doesn't
            // use the KVM clock.
            if e.errno() != libc::EINVAL {
                return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
            }
        }

        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets debug registers to set hardware breakpoints and/or enable single step.
    ///
    fn set_guest_debug(
        &self,
        addrs: &[vm_memory::GuestAddress],
        singlestep: bool,
    ) -> cpu::Result<()> {
        if addrs.len() > 4 {
            return Err(cpu::HypervisorCpuError::SetDebugRegs(anyhow!(
                "Support 4 breakpoints at most but {} addresses are passed",
                addrs.len()
            )));
        }

        let mut dbg = kvm_guest_debug {
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
            ..Default::default()
        };
        if singlestep {
            dbg.control |= KVM_GUESTDBG_SINGLESTEP;
        }

        // Set bits 9 and 10.
        // bit 9: GE (global exact breakpoint enable) flag.
        // bit 10: always 1.
        dbg.arch.debugreg[7] = 0x0600;

        for (i, addr) in addrs.iter().enumerate() {
            dbg.arch.debugreg[i] = addr.0;
            // Set global breakpoint enable flag
            dbg.arch.debugreg[7] |= 2 << (i * 2);
        }

        self.fd
            .set_guest_debug(&dbg)
            .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
    }
    #[cfg(target_arch = "aarch64")]
    fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
        self.fd
            .vcpu_init(kvi)
            .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
    }
    ///
    /// Sets the value of one register for this vCPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
        self.fd
            .set_one_reg(reg_id, data)
            .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
    }
    ///
    /// Gets the value of one register for this vCPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(reg_id)
            .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
    }
    ///
    /// Gets a list of the guest registers that are supported for the
    /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
        self.fd
            .get_reg_list(reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
    }
    ///
    /// Save the state of the system registers.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_sys_regs(&self) -> cpu::Result<Vec<Register>> {
        // Call KVM_GET_REG_LIST to get all registers available to the guest. For ArmV8 there are
        // around 500 registers.
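        // Note: 500 is only an upper bound on the list size here; KVM_GET_REG_LIST
        // reports E2BIG if the guest actually exposes more registers than the
        // buffer can hold.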
        let mut state: Vec<Register> = Vec::new();
        let mut reg_list = RegList::new(500).unwrap();
        self.fd
            .get_reg_list(&mut reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;

        // At this point reg_list should contain: core registers and system registers.
        // The register list contains the number of registers and their ids. We will need to
        // call KVM_GET_ONE_REG on each id in order to save all of them. We carve out from the
        // list the core registers, which are represented in the kernel by the kvm_regs structure
        // and for which we can calculate the id based on the offset in the structure.
        reg_list.retain(|regid| is_system_register(*regid));

        // Now, for the rest of the registers left in the previously fetched register list, we
        // simply call KVM_GET_ONE_REG.
        let indices = reg_list.as_slice();
        for index in indices.iter() {
            state.push(kvm_bindings::kvm_one_reg {
                id: *index,
                addr: self
                    .fd
                    .get_one_reg(*index)
                    .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
            });
        }

        Ok(state)
    }
    ///
    /// Restore the state of the system registers.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_sys_regs(&self, state: &[Register]) -> cpu::Result<()> {
        for reg in state {
            self.fd
                .set_one_reg(reg.id, reg.addr)
                .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
        }
        Ok(())
    }
    ///
    /// Read the MPIDR - Multiprocessor Affinity Register.
    ///
    #[cfg(target_arch = "aarch64")]
    fn read_mpidr(&self) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(MPIDR_EL1)
            .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
    }
    ///
    /// Configure core registers for a given CPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
        #[allow(non_upper_case_globals)]
        // PSR (Processor State Register) bits.
        // Taken from arch/arm64/include/uapi/asm/ptrace.h.
        const PSR_MODE_EL1h: u64 = 0x0000_0005;
        const PSR_F_BIT: u64 = 0x0000_0040;
        const PSR_I_BIT: u64 = 0x0000_0080;
        const PSR_A_BIT: u64 = 0x0000_0100;
        const PSR_D_BIT: u64 = 0x0000_0200;
        // Taken from arch/arm64/kvm/inject_fault.c.
        const PSTATE_FAULT_BITS_64: u64 =
            PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;

        let kreg_off = offset__of!(kvm_regs, regs);

        // Get the register index of the PSTATE (Processor State) register.
        let pstate = offset__of!(user_pt_regs, pstate) + kreg_off;
        self.set_reg(
            arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
            PSTATE_FAULT_BITS_64,
        )
        .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Other vCPUs are powered off initially, awaiting PSCI wakeup.
        if cpu_id == 0 {
            // Set the PC (Program Counter) to the current program address (kernel address).
            let pc = offset__of!(user_pt_regs, pc) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip as u64)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

            // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
            // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
            // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
            // We choose to place it at the end of DRAM. See `get_fdt_addr`.
            let regs0 = offset__of!(user_pt_regs, regs) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), fdt_start)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before almost everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change the state of the LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save, since it looks as if
    /// it might be affected by internal state modifications of the other GET
    /// ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?.into();
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fall back to a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
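        // (The hardcoded indices below appear to be the Hyper-V reference
        // time/TSC page MSRs (0x40000020/0x40000021), the SynIC MSRs
        // (SCONTROL, SVERSION, SIEFP, SIMP, EOM, SINT0-SINT15) and the
        // synthetic timer MSRs in the 0x400000b0 range.)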
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr).unwrap();
            }
        }

        let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp =
                MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries.as_slice()[faulty_msr_index].index
                );

                let start_pos = faulty_msr_index + 1;
                let mut sub_msr_entries =
                    MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                for i in 0..num_msrs {
                    msr_entries_tmp
                        .push(sub_msr_entries.as_slice()[i])
                        .map_err(|e| {
                            cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
                                "Failed adding MSR entries: {:?}",
                                e
                            ))
                        })?;
                }

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(VcpuKvmState {
            cpuid,
            msrs,
            vcpu_events,
            regs,
            sregs,
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        }
        .into())
    }
    ///
    /// Get the current AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        let mut state = VcpuKvmState {
            mp_state: self.get_mp_state()?.into(),
            mpidr: self.read_mpidr()?,
            ..Default::default()
        };
        state.core_regs = self.get_regs()?;
        state.sys_regs = self.get_sys_regs()?;

        Ok(state.into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores it.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state.into())?;
        self.set_regs(&state.regs)?;
        self.set_sregs(&state.sregs)?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS is different from the
        // expected amount, we fall back to a slower method by setting MSRs
        // by chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs.as_slice()[faulty_msr_index].index
                );

                let start_pos = faulty_msr_index + 1;
                let sub_msr_entries =
                    MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
    ///
    /// Restore the previously saved AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_regs(&state.core_regs)?;
        self.set_sys_regs(&state.sys_regs)?;
        self.set_mp_state(state.mp_state.into())?;

        Ok(())
    }

    ///
    /// Initialize TDX for this CPU
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
            .map_err(cpu::HypervisorCpuError::InitializeTdx)
    }

    ///
    /// Set the "immediate_exit" state
    ///
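    /// # Example
    ///
    /// A minimal sketch: requesting an immediate exit makes the next call to
    /// `run()` on this vCPU return right away.
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// vcpu.set_immediate_exit(true);
    /// ```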
    fn set_immediate_exit(&self, exit: bool) {
        self.fd.set_kvm_immediate_exit(exit.into());
    }

    ///
    /// Returns details about the TDX exit reason
    ///
    #[cfg(feature = "tdx")]
    fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
        let kvm_run = self.fd.get_kvm_run();
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;

        if tdx_vmcall.type_ != 0 {
            return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
        }

        match tdx_vmcall.subfunction {
            TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
            TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
                Ok(TdxExitDetails::SetupEventNotifyInterrupt)
            }
            _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
        }
    }

    ///
    /// Set the status code for TDX exit
    ///
    #[cfg(feature = "tdx")]
    fn set_tdx_status(&mut self, status: TdxExitStatus) {
        let kvm_run = self.fd.get_kvm_run();
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = match status {
            TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
            TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
        };
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Return the list of initial MSR entries for a VCPU
    ///
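    /// # Example
    ///
    /// A minimal sketch; the returned list contains the MSRs this module
    /// programs when a vCPU boots.
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let msrs = vcpu.boot_msr_entries();
    /// assert!(msrs.as_fam_struct_ref().nmsrs > 0);
    /// ```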
    fn boot_msr_entries(&self) -> MsrEntries {
        use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
        use kvm_bindings::kvm_msr_entry as MsrEntry;

        MsrEntries::from_entries(&[
            msr!(msr_index::MSR_IA32_SYSENTER_CS),
            msr!(msr_index::MSR_IA32_SYSENTER_ESP),
            msr!(msr_index::MSR_IA32_SYSENTER_EIP),
            msr!(msr_index::MSR_STAR),
            msr!(msr_index::MSR_CSTAR),
            msr!(msr_index::MSR_LSTAR),
            msr!(msr_index::MSR_KERNEL_GS_BASE),
            msr!(msr_index::MSR_SYSCALL_MASK),
            msr!(msr_index::MSR_IA32_TSC),
            msr_data!(
                msr_index::MSR_IA32_MISC_ENABLE,
                msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
            ),
            msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
        ])
        .unwrap()
    }
}

/// Device struct for KVM
pub type KvmDevice = DeviceFd;

impl device::Device for KvmDevice {
    ///
    /// Set device attribute
    ///
    fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
        self.set_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
    }
    ///
    /// Get device attribute
    ///
    fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
        self.get_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
    }
    ///
    /// Cast to the underlying KVM device fd
    ///
    fn as_any(&self) -> &dyn Any {
        self
    }
}