// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState, MPIDR_EL1,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::Vgic;
use crate::cpu;
use crate::device;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, InterruptSourceConfig, VmOps};
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use std::any::Any;
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
#[cfg(target_arch = "x86_64")]
use std::os::unix::io::AsRawFd;
#[cfg(feature = "tdx")]
use std::os::unix::io::RawFd;
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
#[cfg(target_arch = "aarch64")]
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::{
    CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, StandardRegisters,
    NUM_IOAPIC_PINS,
};
#[cfg(target_arch = "x86_64")]
use crate::ClockData;
use crate::{
    CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
    USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
};
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_guest_debug, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC,
    KVM_CAP_SPLIT_IRQCHIP, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::check_required_kvm_extensions;
#[cfg(target_arch = "x86_64")]
pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState, Xsave};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing,
    kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP,
    KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
    KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd,
    kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 35;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}
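
// The two conversions above are mirror images of each other. Below is a
// minimal illustrative test sketch (the module name and the values are
// hypothetical, not part of the upstream test suite): a writable,
// dirty-logged KVM region gains the generic READ | WRITE | LOG_DIRTY flags,
// and converting back restores the exact KVM flags.
#[cfg(test)]
mod user_memory_region_flag_tests {
    use super::*;

    #[test]
    fn kvm_flags_round_trip() {
        // Hypothetical region: writable (no KVM_MEM_READONLY), dirty-logged.
        let kvm_region = kvm_userspace_memory_region {
            slot: 0,
            guest_phys_addr: 0x10_0000,
            memory_size: 0x20_0000,
            userspace_addr: 0x7f00_dead_0000,
            flags: KVM_MEM_LOG_DIRTY_PAGES,
        };
        let region: UserMemoryRegion = kvm_region.into();
        assert!(region.flags & USER_MEMORY_REGION_READ != 0);
        assert!(region.flags & USER_MEMORY_REGION_WRITE != 0);
        assert!(region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0);

        let back: kvm_userspace_memory_region = region.into();
        assert_eq!(back.flags, KVM_MEM_LOG_DIRTY_PAGES);
    }
}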
impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}

impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    fn from(a: IoEventAddress) -> Self {
        match a {
            IoEventAddress::Pio(x) => Self::Pio(x),
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<VcpuKvmState> for CpuState {
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}

impl From<CpuState> for VcpuKvmState {
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}

#[cfg(target_arch = "x86_64")]
impl From<ClockData> for kvm_clock_data {
    fn from(ms: ClockData) -> Self {
        match ms {
            ClockData::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("ClockData is not valid"),
        }
    }
}

impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
    fn from(s: kvm_irq_routing_entry) -> Self {
        IrqRoutingEntry::Kvm(s)
    }
}

impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
    fn from(e: IrqRoutingEntry) -> Self {
        match e {
            IrqRoutingEntry::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("IrqRoutingEntry is not valid"),
        }
    }
}

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

///
/// Implementation of the Vm trait for KVM
///
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// // The vm.get_*()/set_*() methods can then be called on the returned object.
///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(
        &self,
        vcpu_count: u64,
        dist_addr: u64,
        dist_size: u64,
        redist_size: u64,
        msi_size: u64,
        nr_irqs: u32,
    ) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(
            self,
            vcpu_count,
            dist_addr,
            dist_size,
            redist_size,
            msi_size,
            nr_irqs,
        )
        .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64, there is a limitation on the range of the
                    // 'devid': it cannot exceed 65535 (the maximum of a u16).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // occupies the high 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |     segment     |     bus    |   device   |  function  |
                    //
                    // Since we only support one bus per segment, we can build a
                    // 'devid' by replacing the 'bus' bits with the low 8 bits of
                    // the 'segment' data.
                    // This way we solve the range-checking problem and still give
                    // a distinct 'devid' to every device. The limitation is that
                    // at most 256 segments can be supported.
                    //
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;
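
                    // For example (hypothetical values): segment 0x0001, bus 0x00,
                    // device 0x01, function 0 encode to a BDF of 0x0001_0008, which
                    // the expression above folds to 0x0108: the segment's low byte
                    // lands in the 'bus' field and the device/function bits are
                    // preserved.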
                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        // SAFETY: irq_routing was initialized with entries.len(), and it is now
        // turned into entries_slice with entries.len() again, so it is
        // guaranteed to be large enough to hold everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(&entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }
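
    // Dirty-page logging lifecycle, as a sketch from a hypothetical caller
    // (the bindings on the left are made up; the methods are the ones defined
    // in this impl):
    //
    //     let region = vm.make_user_memory_region(0, gpa, size, host_va, false, true);
    //     vm.create_user_memory_region(region)?;        // registered with flags = 0
    //     vm.start_dirty_log()?;                        // KVM_MEM_LOG_DIRTY_PAGES turned on
    //     let bitmap = vm.get_dirty_log(0, gpa, size)?; // one bit per dirtied page
    //     vm.stop_dirty_log()?;                         // logging flag cleared again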
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need the dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create the guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages logging will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
        let device_fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        Ok(Arc::new(device_fd))
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on the underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create a split irqchip:
        // only the local APIC is emulated in the kernel; neither the PICs
        // nor the IOAPIC are.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get the dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        use std::io::{Error, ErrorKind};
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = kvm_bindings::CpuId::from_entries(&cpuid).map_err(|_| {
            vm::HypervisorVmError::InitializeTdx(Error::new(
                ErrorKind::Other,
                "failed to allocate CpuId",
            ))
        })?;

        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            tsc_khz: u32,
            attributes: u64,
            cpuid: u64,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            reserved: [u64; 43],
        }
        let data = TdxInitVm {
            max_vcpus,
            tsc_khz: 0,
            attributes: 0,
            cpuid: kvm_cpuid.as_fam_struct_ptr() as u64,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            reserved: [0; 43],
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            if measure { 1 } else { 0 },
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}
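
// All TDX entry points above (and the vCPU-level tdx_init further down) funnel
// through tdx_command(). As a reference sketch, finalizing a TD boils down to
// issuing the Finalize sub-command with no metadata and no payload pointer:
//
//     tdx_command(&vm_fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
//
// where `vm_fd` stands for the VM file descriptor held by `KvmVm`.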
/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}

impl KvmHypervisor {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by the hypervisor.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
}

/// Enum for KVM related errors
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on KVM
    pub fn new() -> hypervisor::Result<KvmHypervisor> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(KvmHypervisor { kvm: kvm_obj })
    }
}
/// Implementation of the Hypervisor trait for KVM
///
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    /// Create a KVM vm object of a specific VM type and return the object as a Vm trait object
    ///
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap();
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the returned error is EINTR, the ioctl was
                        // interrupted and we have to retry, since this cannot
                        // be considered a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs: Vec<MsrEntry> = vec![
                MsrEntry {
                    ..Default::default()
                };
                num_msrs
            ];
            let indices = msr_list.as_slice();
            for (pos, index) in indices.iter().enumerate() {
                msrs[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(target_arch = "aarch64")]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as a Vm trait object
    ///
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve the AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of the Vcpu trait for KVM
///
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// // The vcpu.get_*()/set_*() methods can then be called on the returned object.
///
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        Ok(self
            .fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
            .into())
    }
    ///
    /// Returns the vCPU general purpose registers.
    /// The `KVM_GET_REGS` ioctl is not available on AArch64, so `KVM_GET_ONE_REG`
    /// is used to get the registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state: StandardRegisters = kvm_regs::default();
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These are actually the general-purpose registers of the Armv8-a
        // architecture (i.e. x0-x30 when used as 64-bit registers, or w0-w30
        // when used as 32-bit registers).
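        // Each of them is fetched via KVM_GET_ONE_REG, with a register id built
        // by the `arm64_core_reg_id!` macro (defined in this crate), roughly:
        //   KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | offset
        // where the offset indexes into the kernel's kvm_regs layout; see the
        // macro definition for the exact encoding.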
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "other registers" section of the Armv8-a architecture.
        // First one, the stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Next is the processor state.
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The stack pointer associated with EL1.
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The Exception Link Register for EL1; when taking an exception to EL1,
        // this register holds the address to return to afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The Saved Program Status Registers; there are 5 of them used in the kernel.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to the floating-point registers, which are stored in the
        // user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .into();
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;
        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        let regs = (*regs).into();
        self.fd
            .set_regs(&regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    ///
    /// Sets the vCPU general purpose registers.
    /// The `KVM_SET_REGS` ioctl is not available on AArch64, so `KVM_SET_ONE_REG`
    /// is used to set the registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // The function follows exactly the same register order as `get_regs`.
        // Look there for additional info on the registers.
        let mut off = offset__of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i] as u64,
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Set attribute for vcpu.
    ///
    fn set_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .set_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuAttribute(e.into()))
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Check if vcpu has a certain attribute.
    ///
    fn has_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .has_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::HasVcpuAttribute(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        Ok(self
            .fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        let sregs = (*sregs).into();
        self.fd
            .set_sregs(&sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        Ok(self
            .fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
        self.fd
            .set_fpu(&fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to set up the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = <CpuId>::from_entries(&cpuid)
            .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;

        self.fd
            .set_cpuid2(&kvm_cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable the Hyper-V SynIC
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about the Hyper-V SynIC being enabled and
        // emulated, as it will influence which MSRs should be saved later.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        Ok(self
            .fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
        self.fd
            .set_lapic(&klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
    fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        let succ = self
            .fd
            .get_msrs(&mut kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;

        msrs[..succ].copy_from_slice(
            &kvm_msrs.as_slice()[..succ]
                .iter()
                .map(|e| (*e).into())
                .collect::<Vec<MsrEntry>>(),
        );

        Ok(succ)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets up the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        self.fd
            .set_msrs(&kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
    fn get_mp_state(&self) -> cpu::Result<MpState> {
        Ok(self
            .fd
            .get_mp_state()
            .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
            .into())
    }
    ///
    /// Sets the vcpu's current "multiprocessing state".
    ///
    fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
        self.fd
            .set_mp_state(mp_state.into())
            .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Translates a guest virtual address to a guest physical address using the
    /// `KVM_TRANSLATE` ioctl.
    ///
    fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
        let tr = self
            .fd
            .translate_gva(gva)
            .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
        // tr.valid is set if the GVA is mapped to a valid GPA.
        match tr.valid {
            0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
                "Invalid GVA: {:#x}",
                gva
            ))),
            _ => Ok((tr.physical_address, 0)),
        }
    }
    ///
    /// Triggers the running of the current virtual CPU, returning an exit reason.
    ///
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioRead(addr, data))
                }
                VcpuExit::MmioWrite(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioWrite(addr, data))
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
                #[cfg(feature = "tdx")]
                VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
                VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Let the guest know that it has been paused, which prevents potential
    /// soft lockups when it is resumed.
    ///
    fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
        if let Err(e) = self.fd.kvmclock_ctrl() {
            // The Linux kernel returns -EINVAL if the PV clock isn't yet initialised,
            // which could be because we're still in firmware or because the guest
            // doesn't use the KVM clock.
            if e.errno() != libc::EINVAL {
                return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
            }
        }

        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the debug registers to set hardware breakpoints and/or enable single step.
    ///
    fn set_guest_debug(
        &self,
        addrs: &[vm_memory::GuestAddress],
        singlestep: bool,
    ) -> cpu::Result<()> {
        if addrs.len() > 4 {
            return Err(cpu::HypervisorCpuError::SetDebugRegs(anyhow!(
                "At most 4 breakpoints are supported, but {} addresses were passed",
                addrs.len()
            )));
        }

        let mut dbg = kvm_guest_debug {
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
            ..Default::default()
        };
        if singlestep {
            dbg.control |= KVM_GUESTDBG_SINGLESTEP;
        }

        // Set bits 9 and 10.
        // bit 9: GE (global exact breakpoint enable) flag.
        // bit 10: always 1.
        dbg.arch.debugreg[7] = 0x0600;

        for (i, addr) in addrs.iter().enumerate() {
            dbg.arch.debugreg[i] = addr.0;
            // Set the global breakpoint enable flag for this breakpoint.
            dbg.arch.debugreg[7] |= 2 << (i * 2);
        }
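
        // For instance, with two breakpoint addresses, debugreg[7] ends up as
        // 0x0600 | 0b0010 | 0b1000 = 0x060a: bits 9/10 plus the G0 and G1
        // global enable bits.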
        self.fd
            .set_guest_debug(&dbg)
            .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
    }
    #[cfg(target_arch = "aarch64")]
    fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
        self.fd
            .vcpu_init(kvi)
            .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
    }
    ///
    /// Sets the value of one register for this vCPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
        self.fd
            .set_one_reg(reg_id, data)
            .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
    }
    ///
    /// Gets the value of one register for this vCPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(reg_id)
            .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
    }
    ///
    /// Gets a list of the guest registers that are supported for the
    /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
        self.fd
            .get_reg_list(reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
    }
    ///
    /// Save the state of the system registers.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_sys_regs(&self) -> cpu::Result<Vec<Register>> {
        // Call KVM_GET_REG_LIST to get all registers available to the guest.
        // For Armv8 there are around 500 registers.
        let mut state: Vec<Register> = Vec::new();
        let mut reg_list = RegList::new(500).unwrap();
        self.fd
            .get_reg_list(&mut reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;

        // At this point reg_list should contain the core registers and the system
        // registers. The register list contains the number of registers and their ids.
        // We need to call KVM_GET_ONE_REG on each id in order to save all of them.
        // We carve out from the list the core registers, which are represented in the
        // kernel by the kvm_regs structure and for which we can calculate the id based
        // on the offset in the structure.
        reg_list.retain(|regid| is_system_register(*regid));

        // Now, for the rest of the registers left in the previously fetched
        // register list, we simply call KVM_GET_ONE_REG.
        let indices = reg_list.as_slice();
        for index in indices.iter() {
            state.push(kvm_bindings::kvm_one_reg {
                id: *index,
                addr: self
                    .fd
                    .get_one_reg(*index)
                    .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
            });
        }

        Ok(state)
    }
    ///
    /// Restore the state of the system registers.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_sys_regs(&self, state: &[Register]) -> cpu::Result<()> {
        for reg in state {
            self.fd
                .set_one_reg(reg.id, reg.addr)
                .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
        }
        Ok(())
    }
    ///
    /// Read the MPIDR - Multiprocessor Affinity Register.
    ///
    #[cfg(target_arch = "aarch64")]
    fn read_mpidr(&self) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(MPIDR_EL1)
            .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
    }
    ///
    /// Configure core registers for a given CPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
        #[allow(non_upper_case_globals)]
        // PSR (Processor State Register) bits.
        // Taken from arch/arm64/include/uapi/asm/ptrace.h.
        const PSR_MODE_EL1h: u64 = 0x0000_0005;
        const PSR_F_BIT: u64 = 0x0000_0040;
        const PSR_I_BIT: u64 = 0x0000_0080;
        const PSR_A_BIT: u64 = 0x0000_0100;
        const PSR_D_BIT: u64 = 0x0000_0200;
        // Taken from arch/arm64/kvm/inject_fault.c.
        const PSTATE_FAULT_BITS_64: u64 =
            PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;

        let kreg_off = offset__of!(kvm_regs, regs);

        // Get the register index of the PSTATE (Processor State) register.
        let pstate = offset__of!(user_pt_regs, pstate) + kreg_off;
        self.set_reg(
            arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
            PSTATE_FAULT_BITS_64,
        )
        .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Other vCPUs are powered off initially, awaiting PSCI wakeup.
        if cpu_id == 0 {
            // Setting the PC (Program Counter) to the current program address (kernel address).
            let pc = offset__of!(user_pt_regs, pc) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip as u64)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

            // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
            // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
            // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
            // We are choosing to place it at the end of DRAM. See `get_fdt_addr`.
            let regs0 = offset__of!(user_pt_regs, regs) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), fdt_start)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change the state of the LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. The code looks as
    /// if it might as well be affected by internal state modifications of the
    /// GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?.into();
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fall back to a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();
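
        // An illustrative trace of the fallback below (made-up numbers): if
        // KVM refuses entry 40 out of 100, GET_MSRS returns 40; entries
        // [0, 40) are kept, index 40 is skipped, and the loop retries from
        // entry 41 onwards, until a whole remaining chunk succeeds.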
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr.into());
            }
        }

        let expected_num_msrs = msr_entries.len();
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);

                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(VcpuKvmState {
            cpuid,
            msrs,
            vcpu_events,
            regs: regs.into(),
            sregs: sregs.into(),
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        }
        .into())
    }
    ///
    /// Get the current AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        let mut state = VcpuKvmState {
            mp_state: self.get_mp_state()?.into(),
            mpidr: self.read_mpidr()?,
            ..Default::default()
        };
        state.core_regs = self.get_regs()?;
        state.sys_regs = self.get_sys_regs()?;

        Ok(state.into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores them.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the APIC base MSR.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state.into())?;
        self.set_regs(&state.regs.into())?;
        self.set_sregs(&state.sregs.into())?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set by SET_MSRS differs from the expected
        // amount, we fall back to a slower method, setting the MSRs in
        // chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some of them are not supported. The
        // control flow is sketched in the test module at the end of this file.
        let expected_num_msrs = state.msrs.len();
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let sub_msr_entries = state.msrs[start_pos..].to_vec();

                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
    ///
    /// Restore the previously saved AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_regs(&state.core_regs)?;
        self.set_sys_regs(&state.sys_regs)?;
        self.set_mp_state(state.mp_state.into())?;

        Ok(())
    }

    ///
    /// Initialize TDX for this CPU
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
            .map_err(cpu::HypervisorCpuError::InitializeTdx)
    }

    ///
    /// Set the "immediate_exit" state
    ///
    fn set_immediate_exit(&self, exit: bool) {
        self.fd.set_kvm_immediate_exit(exit.into());
    }

    ///
    /// Returns the details about the TDX exit reason
    ///
    #[cfg(feature = "tdx")]
    fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
        let kvm_run = self.fd.get_kvm_run();
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        // Prime the status code with "invalid operand" so an unknown vmcall
        // is reported as such if we bail out early.
        tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;

        if tdx_vmcall.type_ != 0 {
            return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
        }

        match tdx_vmcall.subfunction {
            TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
            TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
                Ok(TdxExitDetails::SetupEventNotifyInterrupt)
            }
            _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
        }
    }

    ///
    /// Set the status code for the TDX exit
    ///
    #[cfg(feature = "tdx")]
    fn set_tdx_status(&mut self, status: TdxExitStatus) {
        let kvm_run = self.fd.get_kvm_run();
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = match status {
            TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
            TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
        };
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Return the list of initial MSR entries for a VCPU
    ///
    fn boot_msr_entries(&self) -> Vec<MsrEntry> {
        use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};

        [
            msr!(msr_index::MSR_IA32_SYSENTER_CS),
            msr!(msr_index::MSR_IA32_SYSENTER_ESP),
            msr!(msr_index::MSR_IA32_SYSENTER_EIP),
            msr!(msr_index::MSR_STAR),
            msr!(msr_index::MSR_CSTAR),
            msr!(msr_index::MSR_LSTAR),
            msr!(msr_index::MSR_KERNEL_GS_BASE),
            msr!(msr_index::MSR_SYSCALL_MASK),
            msr!(msr_index::MSR_IA32_TSC),
            msr_data!(
                msr_index::MSR_IA32_MISC_ENABLE,
                msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
            ),
            msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
        ]
        .to_vec()
    }
}

impl KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86-specific call that returns the vcpu's current "xsave struct".
    ///
    fn get_xsave(&self) -> cpu::Result<Xsave> {
        self.fd
            .get_xsave()
            .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86-specific call that sets the vcpu's current "xsave struct".
    ///
    fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
        self.fd
            .set_xsave(xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86-specific call that returns the vcpu's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86-specific call that sets the vcpu's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs as well as
    /// related states of the vcpu.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs as well as related
    /// states of the vcpu.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
}

/// Device struct for KVM
pub type KvmDevice = DeviceFd;

impl device::Device for KvmDevice {
    ///
    /// Set device attribute
    ///
    fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
        // This resolves to the inherent `DeviceFd::set_device_attr`, not to
        // this trait method: inherent methods take precedence in Rust's
        // method resolution, so there is no recursion here.
        self.set_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
    }
    ///
    /// Get device attribute
    ///
    fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
        // As above, this calls the inherent `DeviceFd::get_device_attr`.
        self.get_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
    }
    ///
    /// Cast to the underlying KVM device fd
    ///
    fn as_any(&self) -> &dyn Any {
        self
    }
}
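
// A minimal sketch of the request/response protocol around
// get_tdx_exit_details() and set_tdx_status() above: read the vmcall
// subfunction, service it, then report a status code back to the guest
// before re-entering it. The handler name is illustrative only, and the
// module is compile-checked but never exercised by tests, since a KvmVcpu
// can only be obtained from a live TD guest.
#[cfg(all(test, feature = "tdx"))]
mod tdx_vmcall_sketch {
    use super::{KvmVcpu, TdxExitDetails, TdxExitStatus};
    use crate::cpu::Vcpu;

    #[allow(dead_code)]
    fn handle_tdx_vmcall(vcpu: &mut KvmVcpu) {
        let status = match vcpu.get_tdx_exit_details() {
            // A real VMM would generate the quote / record the notification
            // vector here before acknowledging the call.
            Ok(TdxExitDetails::GetQuote)
            | Ok(TdxExitDetails::SetupEventNotifyInterrupt) => TdxExitStatus::Success,
            // Unknown vmcall: get_tdx_exit_details() already primed the
            // status field with TDG_VP_VMCALL_INVALID_OPERAND, so there is
            // nothing left to report.
            Err(_) => return,
        };
        vcpu.set_tdx_status(status);
    }
}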
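
// A minimal, self-contained sketch (not part of the public API) of the
// chunked-retry strategy used by `state()` and `set_state()` above: the
// closure stands in for KVM_GET_MSRS/KVM_SET_MSRS, which report how many
// entries they processed before stopping at an unsupported MSR. The helper
// and test names are illustrative only.
#[cfg(test)]
mod chunked_msr_retry_sketch {
    /// Apply `op` to `entries`, skipping one faulty entry each time `op`
    /// stops early, and return the indices that were skipped.
    fn apply_skipping_faulty<T, F>(entries: &[T], mut op: F) -> Vec<usize>
    where
        F: FnMut(&[T]) -> usize,
    {
        let mut skipped = Vec::new();
        let mut start = 0;
        while start < entries.len() {
            let num_done = op(&entries[start..]);
            if start + num_done == entries.len() {
                break;
            }
            // `op` stopped early: record the faulty entry and resume right
            // after it, mirroring the `faulty_msr_index + 1` logic above.
            skipped.push(start + num_done);
            start += num_done + 1;
        }
        skipped
    }

    #[test]
    fn skips_unsupported_entries() {
        // Simulate ten MSR indices of which 3 and 7 are "unsupported".
        let msrs: Vec<u32> = (0..10).collect();
        let skipped = apply_skipping_faulty(&msrs, |chunk| {
            chunk.iter().take_while(|&&i| i != 3 && i != 7).count()
        });
        assert_eq!(skipped, vec![3, 7]);
    }
}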