use std::sync::LazyLock;
use hyperlight_common::outb::VmAction;
#[cfg(gdb)]
use kvm_bindings::kvm_guest_debug;
use kvm_bindings::{
kvm_debugregs, kvm_fpu, kvm_regs, kvm_sregs, kvm_userspace_memory_region, kvm_xsave,
};
use kvm_ioctls::Cap::UserMemory;
use kvm_ioctls::{Kvm, VcpuExit, VcpuFd, VmFd};
use tracing::{Span, instrument};
#[cfg(feature = "trace_guest")]
use tracing_opentelemetry::OpenTelemetrySpanExt;
#[cfg(feature = "hw-interrupts")]
use vmm_sys_util::eventfd::EventFd;
#[cfg(gdb)]
use crate::hypervisor::gdb::{DebugError, DebuggableVm};
use crate::hypervisor::regs::{
CommonDebugRegs, CommonFpu, CommonRegisters, CommonSpecialRegisters, FP_CONTROL_WORD_DEFAULT,
MXCSR_DEFAULT,
};
#[cfg(all(test, not(feature = "nanvix-unstable")))]
use crate::hypervisor::virtual_machine::XSAVE_BUFFER_SIZE;
#[cfg(feature = "hw-interrupts")]
use crate::hypervisor::virtual_machine::x86_64::hw_interrupts::TimerThread;
use crate::hypervisor::virtual_machine::{
CreateVmError, MapMemoryError, RegisterError, RunVcpuError, UnmapMemoryError, VirtualMachine,
VmExit,
};
use crate::mem::memory_region::MemoryRegion;
#[cfg(feature = "trace_guest")]
use crate::sandbox::trace::TraceContext as SandboxTraceContext;
/// CPUID function (leaf) number `0x8000_0008`.
///
/// `KvmVm::new` looks for this leaf in the CPUID table reported by KVM and
/// replaces the low 8 bits of its EAX value with `MAX_GPA.ilog2() + 1` before
/// installing the table on the vCPU.
// NOTE(review): per the vendor CPUID documentation EAX[7:0] of this leaf is the
// physical address width in bits — confirm against the AMD/Intel manuals.
const CPUID_FUNCTION_PROCESSOR_CAPACITY_PARAMETERS_AND_EXTENDED_FEATURE_IDENTIFICATION: u32 =
0x8000_0008;
/// Reports whether KVM can be used on this machine.
///
/// Returns `true` only when all of the following hold:
/// * a KVM handle can be opened via `Kvm::new()`,
/// * the reported API version is exactly 12, and
/// * the `UserMemory` capability is available.
///
/// Each failing condition is logged at `info` level and yields `false`.
#[instrument(skip_all, parent = Span::current(), level = "Trace")]
pub(crate) fn is_hypervisor_present() -> bool {
    let Ok(kvm) = Kvm::new() else {
        tracing::info!("KVM is not available on this system");
        return false;
    };

    let reported = kvm.get_api_version();
    if reported != 12 {
        tracing::info!("KVM GET_API_VERSION returned {}, expected 12", reported);
        return false;
    }

    // The capability is only queried once the API version is known to match.
    if !kvm.check_extension(UserMemory) {
        tracing::info!("KVM does not have KVM_CAP_USER_MEMORY capability");
        return false;
    }

    true
}
/// A virtual machine backed by KVM, holding one VM handle and one vCPU handle.
///
/// Instances are created with `KvmVm::new`, which creates the VM, creates
/// vCPU 0 and installs its CPUID table.
#[derive(Debug)]
pub(crate) struct KvmVm {
/// Handle to the VM; used to map/unmap guest memory regions and, with
/// `hw-interrupts`, to set up the interrupt chip and irqfd.
vm_fd: VmFd,
/// Handle to the single vCPU (index 0); used to run the guest and to read
/// and write its register state.
vcpu_fd: VcpuFd,
/// Event fd registered with the VM as an irqfd (registered with GSI 0 in
/// `setup_irqfd`). Writing to a clone of it is how the timer callback
/// signals the guest.
#[cfg(feature = "hw-interrupts")]
timer_irq_eventfd: EventFd,
/// Timer thread, if one has been configured by the guest. Starts as `None`
/// and is taken and stopped when the guest halts.
#[cfg(feature = "hw-interrupts")]
timer: Option<TimerThread>,
/// Guest-debug configuration that is passed to `set_guest_debug` whenever
/// debugging, single-stepping or hardware breakpoints are changed.
#[cfg(gdb)]
debug_regs: kvm_guest_debug,
}
/// Shared KVM handle, opened lazily on first access.
///
/// The outcome of `Kvm::new()` is stored either way: on failure the
/// `CreateVmError::HypervisorNotAvailable` value is kept, and `KvmVm::new`
/// returns a clone of it on every call.
static KVM: LazyLock<std::result::Result<Kvm, CreateVmError>> =
LazyLock::new(|| Kvm::new().map_err(|e| CreateVmError::HypervisorNotAvailable(e.into())));
#[cfg(feature = "hw-interrupts")]
impl KvmVm {
    /// Creates the VM's interrupt chip and registers a fresh event fd as an
    /// irqfd for GSI 0.
    ///
    /// The steps run in this order: create the irq chip, create the event fd
    /// (initial counter value 0), register the event fd with the VM.
    ///
    /// # Errors
    ///
    /// Every failure is reported as `CreateVmError::InitializeVm`. A failure to
    /// create the event fd is converted to a KVM error carrying the OS errno,
    /// or `EIO` when no errno is available.
    fn setup_irqfd(vm_fd: &VmFd) -> std::result::Result<EventFd, CreateVmError> {
        /// Wraps a KVM error in the error variant used for all steps here.
        fn init_err(e: kvm_ioctls::Error) -> CreateVmError {
            CreateVmError::InitializeVm(e.into())
        }

        vm_fd.create_irq_chip().map_err(init_err)?;

        let eventfd = match EventFd::new(0) {
            Ok(fd) => fd,
            Err(io_err) => {
                let errno = io_err.raw_os_error().unwrap_or(libc::EIO);
                return Err(init_err(kvm_ioctls::Error::new(errno)));
            }
        };

        vm_fd.register_irqfd(&eventfd, 0).map_err(init_err)?;
        Ok(eventfd)
    }
}
impl KvmVm {
    /// Creates a KVM virtual machine with a single vCPU (index 0).
    ///
    /// Steps, in order:
    /// 1. obtain the shared KVM handle (returning a clone of the cached error
    ///    if opening it failed),
    /// 2. create the VM,
    /// 3. with `hw-interrupts`: create the irq chip and the timer irqfd,
    /// 4. create the vCPU,
    /// 5. fetch the CPUID table supported by KVM, overwrite the low 8 bits of
    ///    EAX in leaf `0x8000_0008` with `MAX_GPA.ilog2() + 1`, and install the
    ///    table on the vCPU.
    ///
    /// # Errors
    ///
    /// Returns the `CreateVmError` variant matching the step that failed.
    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
    pub(crate) fn new() -> std::result::Result<Self, CreateVmError> {
        let hv = KVM.as_ref().map_err(Clone::clone)?;

        let vm_fd = hv
            .create_vm_with_type(0)
            .map_err(|e| CreateVmError::CreateVmFd(e.into()))?;

        #[cfg(feature = "hw-interrupts")]
        let timer_irq_eventfd = Self::setup_irqfd(&vm_fd)?;

        let vcpu_fd = vm_fd
            .create_vcpu(0)
            .map_err(|e| CreateVmError::CreateVcpuFd(e.into()))?;

        let mut kvm_cpuid = hv
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| CreateVmError::InitializeVm(e.into()))?;

        // Patch only the capacity-parameters leaf; every other entry is passed
        // through exactly as KVM reported it.
        kvm_cpuid
            .as_mut_slice()
            .iter_mut()
            .filter(|entry| {
                entry.function
                    == CPUID_FUNCTION_PROCESSOR_CAPACITY_PARAMETERS_AND_EXTENDED_FEATURE_IDENTIFICATION
            })
            .for_each(|entry| {
                entry.eax = (entry.eax & !0xff) | (hyperlight_common::layout::MAX_GPA.ilog2() + 1);
            });

        vcpu_fd
            .set_cpuid2(&kvm_cpuid)
            .map_err(|e| CreateVmError::InitializeVm(e.into()))?;

        Ok(Self {
            vm_fd,
            vcpu_fd,
            #[cfg(feature = "hw-interrupts")]
            timer_irq_eventfd,
            #[cfg(feature = "hw-interrupts")]
            timer: None,
            #[cfg(gdb)]
            debug_regs: kvm_guest_debug::default(),
        })
    }

    /// Runs the vCPU until an exit occurs that the caller has to handle.
    ///
    /// Exits handled here, after which the guest is re-entered:
    /// * an OUT to the `VmAction::PvTimerConfig` port (timer is reconfigured),
    /// * an OUT to ports `0x40..=0x43` (ignored),
    /// * `EAGAIN` from the run call.
    ///
    /// An OUT to the `VmAction::Halt` port stops the timer thread (if any) and
    /// returns `VmExit::Halt`. `EINTR` is reported as `VmExit::Cancelled`.
    ///
    /// # Errors
    ///
    /// Any other errno from the run call yields `RunVcpuError::Unknown`.
    #[cfg(feature = "hw-interrupts")]
    fn run_vcpu_hw_interrupts(&mut self) -> std::result::Result<VmExit, RunVcpuError> {
        const HALT_PORT: u16 = VmAction::Halt as u16;
        const PV_TIMER_CONFIG_PORT: u16 = VmAction::PvTimerConfig as u16;

        loop {
            let exit = match self.vcpu_fd.run() {
                Ok(VcpuExit::IoOut(HALT_PORT, _)) => {
                    if let Some(mut running_timer) = self.timer.take() {
                        running_timer.stop();
                    }
                    VmExit::Halt()
                }
                Ok(VcpuExit::IoOut(PV_TIMER_CONFIG_PORT, data)) => {
                    // Copy the payload out first: `data` borrows the vCPU, and
                    // the handler needs `&mut self`.
                    let payload = data.to_vec();
                    self.handle_pv_timer_config(&payload);
                    continue;
                }
                Ok(VcpuExit::IoOut(0x40..=0x43, _)) => continue,
                Ok(VcpuExit::IoOut(port, data)) => VmExit::IoOut(port, data.to_vec()),
                Ok(VcpuExit::MmioRead(addr, _)) => VmExit::MmioRead(addr),
                Ok(VcpuExit::MmioWrite(addr, _)) => VmExit::MmioWrite(addr),
                #[cfg(gdb)]
                Ok(VcpuExit::Debug(debug_exit)) => VmExit::Debug {
                    dr6: debug_exit.dr6,
                    exception: debug_exit.exception,
                },
                Err(e) => match e.errno() {
                    libc::EINTR => VmExit::Cancelled(),
                    libc::EAGAIN => continue,
                    _ => return Err(RunVcpuError::Unknown(e.into())),
                },
                Ok(other) => VmExit::Unknown(format!("Unknown KVM VCPU exit: {:?}", other)),
            };
            return Ok(exit);
        }
    }

    /// Applies a timer configuration written by the guest.
    ///
    /// Clones the timer event fd and hands the shared `handle_pv_timer_config`
    /// helper a callback that writes `1` to the clone. If the event fd cannot
    /// be cloned, a warning is logged and the configuration is dropped.
    #[cfg(feature = "hw-interrupts")]
    fn handle_pv_timer_config(&mut self, data: &[u8]) {
        use super::super::x86_64::hw_interrupts::handle_pv_timer_config;

        match self.timer_irq_eventfd.try_clone() {
            Ok(irq_fd) => {
                handle_pv_timer_config(&mut self.timer, data, move || {
                    // Best effort: a failed write is deliberately ignored.
                    let _ = irq_fd.write(1);
                });
            }
            Err(e) => {
                tracing::warn!("failed to clone eventfd for timer config: {e}");
            }
        }
    }

    /// Runs the vCPU once and translates the KVM exit into a `VmExit`.
    ///
    /// Both a HLT exit and an OUT to the `VmAction::Halt` port are reported as
    /// `VmExit::Halt`. `EINTR` maps to `VmExit::Cancelled` and `EAGAIN` to
    /// `VmExit::Retry`.
    ///
    /// # Errors
    ///
    /// Any other errno from the run call yields `RunVcpuError::Unknown`.
    #[cfg(not(feature = "hw-interrupts"))]
    fn run_vcpu_default(&mut self) -> std::result::Result<VmExit, RunVcpuError> {
        const HALT_PORT: u16 = VmAction::Halt as u16;

        let exit = match self.vcpu_fd.run() {
            Ok(VcpuExit::Hlt | VcpuExit::IoOut(HALT_PORT, _)) => VmExit::Halt(),
            Ok(VcpuExit::IoOut(port, data)) => VmExit::IoOut(port, data.to_vec()),
            Ok(VcpuExit::MmioRead(addr, _)) => VmExit::MmioRead(addr),
            Ok(VcpuExit::MmioWrite(addr, _)) => VmExit::MmioWrite(addr),
            #[cfg(gdb)]
            Ok(VcpuExit::Debug(debug_exit)) => VmExit::Debug {
                dr6: debug_exit.dr6,
                exception: debug_exit.exception,
            },
            Err(e) => match e.errno() {
                libc::EINTR => VmExit::Cancelled(),
                libc::EAGAIN => VmExit::Retry(),
                _ => return Err(RunVcpuError::Unknown(e.into())),
            },
            Ok(other) => VmExit::Unknown(format!("Unknown KVM VCPU exit: {:?}", other)),
        };
        Ok(exit)
    }
}
impl VirtualMachine for KvmVm {
unsafe fn map_memory(
&mut self,
(slot, region): (u32, &MemoryRegion),
) -> std::result::Result<(), MapMemoryError> {
let mut kvm_region: kvm_userspace_memory_region = region.into();
kvm_region.slot = slot;
unsafe { self.vm_fd.set_user_memory_region(kvm_region) }
.map_err(|e| MapMemoryError::Hypervisor(e.into()))
}
fn unmap_memory(
&mut self,
(slot, region): (u32, &MemoryRegion),
) -> std::result::Result<(), UnmapMemoryError> {
let mut kvm_region: kvm_userspace_memory_region = region.into();
kvm_region.slot = slot;
kvm_region.memory_size = 0;
unsafe { self.vm_fd.set_user_memory_region(kvm_region) }
.map_err(|e| UnmapMemoryError::Hypervisor(e.into()))
}
fn run_vcpu(
&mut self,
#[cfg(feature = "trace_guest")] tc: &mut SandboxTraceContext,
) -> std::result::Result<VmExit, RunVcpuError> {
#[cfg(feature = "trace_guest")]
tc.setup_guest_trace(Span::current().context());
#[cfg(feature = "hw-interrupts")]
return self.run_vcpu_hw_interrupts();
#[cfg(not(feature = "hw-interrupts"))]
self.run_vcpu_default()
}
fn regs(&self) -> std::result::Result<CommonRegisters, RegisterError> {
let kvm_regs = self
.vcpu_fd
.get_regs()
.map_err(|e| RegisterError::GetRegs(e.into()))?;
Ok((&kvm_regs).into())
}
fn set_regs(&self, regs: &CommonRegisters) -> std::result::Result<(), RegisterError> {
let kvm_regs: kvm_regs = regs.into();
self.vcpu_fd
.set_regs(&kvm_regs)
.map_err(|e| RegisterError::SetRegs(e.into()))?;
Ok(())
}
fn fpu(&self) -> std::result::Result<CommonFpu, RegisterError> {
let kvm_fpu = self
.vcpu_fd
.get_fpu()
.map_err(|e| RegisterError::GetFpu(e.into()))?;
Ok((&kvm_fpu).into())
}
fn set_fpu(&self, fpu: &CommonFpu) -> std::result::Result<(), RegisterError> {
let kvm_fpu: kvm_fpu = fpu.into();
self.vcpu_fd
.set_fpu(&kvm_fpu)
.map_err(|e| RegisterError::SetFpu(e.into()))?;
Ok(())
}
fn sregs(&self) -> std::result::Result<CommonSpecialRegisters, RegisterError> {
let kvm_sregs = self
.vcpu_fd
.get_sregs()
.map_err(|e| RegisterError::GetSregs(e.into()))?;
Ok((&kvm_sregs).into())
}
fn set_sregs(&self, sregs: &CommonSpecialRegisters) -> std::result::Result<(), RegisterError> {
let kvm_sregs: kvm_sregs = sregs.into();
self.vcpu_fd
.set_sregs(&kvm_sregs)
.map_err(|e| RegisterError::SetSregs(e.into()))?;
Ok(())
}
fn debug_regs(&self) -> std::result::Result<CommonDebugRegs, RegisterError> {
let kvm_debug_regs = self
.vcpu_fd
.get_debug_regs()
.map_err(|e| RegisterError::GetDebugRegs(e.into()))?;
Ok(kvm_debug_regs.into())
}
fn set_debug_regs(&self, drs: &CommonDebugRegs) -> std::result::Result<(), RegisterError> {
let kvm_debug_regs: kvm_debugregs = drs.into();
self.vcpu_fd
.set_debug_regs(&kvm_debug_regs)
.map_err(|e| RegisterError::SetDebugRegs(e.into()))?;
Ok(())
}
#[allow(dead_code)]
fn xsave(&self) -> std::result::Result<Vec<u8>, RegisterError> {
let xsave = self
.vcpu_fd
.get_xsave()
.map_err(|e| RegisterError::GetXsave(e.into()))?;
Ok(xsave
.region
.into_iter()
.flat_map(u32::to_le_bytes)
.collect())
}
fn reset_xsave(&self) -> std::result::Result<(), RegisterError> {
let mut xsave = kvm_xsave::default();
xsave.region[0] = FP_CONTROL_WORD_DEFAULT as u32;
xsave.region[6] = MXCSR_DEFAULT;
xsave.region[128] = 0x3;
unsafe {
self.vcpu_fd
.set_xsave(&xsave)
.map_err(|e| RegisterError::SetXsave(e.into()))?
};
Ok(())
}
#[cfg(test)]
#[cfg(not(feature = "nanvix-unstable"))]
fn set_xsave(&self, xsave: &[u32]) -> std::result::Result<(), RegisterError> {
if std::mem::size_of_val(xsave) != XSAVE_BUFFER_SIZE {
return Err(RegisterError::XsaveSizeMismatch {
expected: XSAVE_BUFFER_SIZE as u32,
actual: std::mem::size_of_val(xsave) as u32,
});
}
let xsave = kvm_xsave {
region: xsave.try_into().expect("xsave slice has correct length"),
..Default::default()
};
unsafe {
self.vcpu_fd
.set_xsave(&xsave)
.map_err(|e| RegisterError::SetXsave(e.into()))?
};
Ok(())
}
}
#[cfg(gdb)]
impl DebuggableVm for KvmVm {
    /// Translates a guest virtual address to a guest physical address using
    /// the vCPU's current translation state.
    ///
    /// # Errors
    ///
    /// Returns `DebugError::TranslateGva` when the translation call fails or
    /// when KVM reports the translation as not valid.
    fn translate_gva(&self, gva: u64) -> std::result::Result<u64, DebugError> {
        let translation = self
            .vcpu_fd
            .translate_gva(gva)
            .map_err(|_| DebugError::TranslateGva(gva))?;
        if translation.valid == 0 {
            Err(DebugError::TranslateGva(gva))
        } else {
            Ok(translation.physical_address)
        }
    }

    /// Enables or disables guest debugging (hardware and software
    /// breakpoints) on the vCPU.
    ///
    /// Only the three debug control flags are touched; other bits in the
    /// stored control word (e.g. single-step) are left as they are.
    fn set_debug(&mut self, enable: bool) -> std::result::Result<(), DebugError> {
        use kvm_bindings::{KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_USE_HW_BP, KVM_GUESTDBG_USE_SW_BP};

        tracing::info!("Setting debug to {}", enable);
        let debug_flags = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP | KVM_GUESTDBG_USE_SW_BP;
        if enable {
            self.debug_regs.control |= debug_flags;
        } else {
            self.debug_regs.control &= !debug_flags;
        }
        self.vcpu_fd
            .set_guest_debug(&self.debug_regs)
            .map_err(|e| RegisterError::SetDebugRegs(e.into()))?;
        Ok(())
    }

    /// Enables or disables single-stepping.
    ///
    /// Updates the `KVM_GUESTDBG_SINGLESTEP` control flag and, in addition,
    /// sets or clears bit 8 of the guest's RFLAGS (the trap flag).
    fn set_single_step(&mut self, enable: bool) -> std::result::Result<(), DebugError> {
        use kvm_bindings::KVM_GUESTDBG_SINGLESTEP;

        tracing::info!("Setting single step to {}", enable);
        if enable {
            self.debug_regs.control |= KVM_GUESTDBG_SINGLESTEP;
        } else {
            self.debug_regs.control &= !KVM_GUESTDBG_SINGLESTEP;
        }
        self.vcpu_fd
            .set_guest_debug(&self.debug_regs)
            .map_err(|e| RegisterError::SetDebugRegs(e.into()))?;

        // Mirror the setting in RFLAGS bit 8 (TF).
        let mut regs = self.regs()?;
        if enable {
            regs.rflags |= 1 << 8;
        } else {
            regs.rflags &= !(1 << 8);
        }
        self.set_regs(&regs)?;
        Ok(())
    }

    /// Arms a hardware breakpoint at `addr`.
    ///
    /// Does nothing if an *enabled* slot already holds `addr`. Otherwise the
    /// first slot whose enable bit in debug register 7 is clear is used.
    /// Slot `i` is enabled via bit `2 * i` of debug register 7.
    ///
    /// A slot only counts as "already set" when its enable bit is set: unused
    /// slots hold 0, so comparing addresses alone would treat a request for
    /// address 0 as a duplicate and never arm it.
    ///
    /// # Errors
    ///
    /// Returns `DebugError::TooManyHwBreakpoints` when every slot is in use.
    fn add_hw_breakpoint(&mut self, addr: u64) -> std::result::Result<(), DebugError> {
        use crate::hypervisor::gdb::arch::MAX_NO_OF_HW_BP;

        let dr7 = self.debug_regs.arch.debugreg[7];

        let already_armed = self.debug_regs.arch.debugreg[..4]
            .iter()
            .enumerate()
            .any(|(slot, &bp)| bp == addr && dr7 & (1 << (slot * 2)) != 0);
        if already_armed {
            return Ok(());
        }

        let i = (0..MAX_NO_OF_HW_BP)
            .position(|i| dr7 & (1 << (i * 2)) == 0)
            .ok_or(DebugError::TooManyHwBreakpoints(MAX_NO_OF_HW_BP))?;

        self.debug_regs.arch.debugreg[i] = addr;
        self.debug_regs.arch.debugreg[7] |= 1 << (i * 2);
        self.vcpu_fd
            .set_guest_debug(&self.debug_regs)
            .map_err(|e| RegisterError::SetDebugRegs(e.into()))?;
        Ok(())
    }

    /// Disarms the hardware breakpoint at `addr`.
    ///
    /// Only enabled slots are considered, so an unused slot (which holds 0)
    /// is never mistaken for a breakpoint at address 0. The slot's address is
    /// reset to 0 and its enable bit in debug register 7 is cleared.
    ///
    /// # Errors
    ///
    /// Returns `DebugError::HwBreakpointNotFound` when no enabled slot holds
    /// `addr`.
    fn remove_hw_breakpoint(&mut self, addr: u64) -> std::result::Result<(), DebugError> {
        let dr7 = self.debug_regs.arch.debugreg[7];

        let index = self.debug_regs.arch.debugreg[..4]
            .iter()
            .enumerate()
            .position(|(slot, &bp)| bp == addr && dr7 & (1 << (slot * 2)) != 0)
            .ok_or(DebugError::HwBreakpointNotFound(addr))?;

        self.debug_regs.arch.debugreg[index] = 0;
        self.debug_regs.arch.debugreg[7] &= !(1 << (index * 2));
        self.vcpu_fd
            .set_guest_debug(&self.debug_regs)
            .map_err(|e| RegisterError::SetDebugRegs(e.into()))?;
        Ok(())
    }
}
#[cfg(test)]
#[cfg(feature = "hw-interrupts")]
mod hw_interrupt_tests {
    use super::*;

    /// Compile-time check that the `VmAction::Halt` port number does not
    /// collide with any of the fixed I/O ports listed below.
    // NOTE(review): these look like the legacy PC PIC (0x20/0x21, 0xA0/0xA1),
    // PIT (0x40..=0x43) and 0x61 ports — confirm the intended device list.
    #[test]
    fn halt_port_is_not_standard_device() {
        const HALT: u16 = VmAction::Halt as u16;
        const RESERVED_PORTS: [u16; 9] = [0x20, 0x21, 0xA0, 0xA1, 0x40, 0x41, 0x42, 0x43, 0x61];
        const _: () = {
            let mut idx = 0;
            while idx < RESERVED_PORTS.len() {
                assert!(HALT != RESERVED_PORTS[idx]);
                idx += 1;
            }
        };
    }
}