#[cfg(gdb)]
use std::fmt::Debug;
#[cfg(feature = "hw-interrupts")]
use std::sync::Arc;
use std::sync::LazyLock;
use hyperlight_common::outb::VmAction;
#[cfg(feature = "hw-interrupts")]
use mshv_bindings::LapicState;
#[cfg(gdb)]
use mshv_bindings::{DebugRegisters, hv_message_type_HVMSG_X64_EXCEPTION_INTERCEPT};
use mshv_bindings::{
FloatingPointUnit, SpecialRegisters, StandardRegisters, XSave, hv_message_type,
hv_message_type_HVMSG_GPA_INTERCEPT, hv_message_type_HVMSG_UNMAPPED_GPA,
hv_message_type_HVMSG_X64_HALT, hv_message_type_HVMSG_X64_IO_PORT_INTERCEPT,
hv_partition_property_code_HV_PARTITION_PROPERTY_SYNTHETIC_PROC_FEATURES,
hv_partition_synthetic_processor_features, hv_register_assoc,
hv_register_name_HV_X64_REGISTER_RIP, hv_register_value, mshv_create_partition_v2,
mshv_user_mem_region,
};
#[cfg(feature = "hw-interrupts")]
use mshv_bindings::{
hv_interrupt_type_HV_X64_INTERRUPT_TYPE_FIXED, hv_register_name_HV_X64_REGISTER_RAX,
};
#[cfg(feature = "hw-interrupts")]
use mshv_ioctls::InterruptRequest;
use mshv_ioctls::{Mshv, VcpuFd, VmFd};
use tracing::{Span, instrument};
#[cfg(feature = "trace_guest")]
use tracing_opentelemetry::OpenTelemetrySpanExt;
#[cfg(gdb)]
use crate::hypervisor::gdb::{DebugError, DebuggableVm};
use crate::hypervisor::regs::{
CommonDebugRegs, CommonFpu, CommonRegisters, CommonSpecialRegisters, FP_CONTROL_WORD_DEFAULT,
MXCSR_DEFAULT,
};
#[cfg(all(test, not(feature = "nanvix-unstable")))]
use crate::hypervisor::virtual_machine::XSAVE_BUFFER_SIZE;
#[cfg(feature = "hw-interrupts")]
use crate::hypervisor::virtual_machine::x86_64::hw_interrupts::TimerThread;
use crate::hypervisor::virtual_machine::{
CreateVmError, MapMemoryError, RegisterError, RunVcpuError, UnmapMemoryError, VirtualMachine,
VmExit, XSAVE_MIN_SIZE,
};
use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags};
#[cfg(feature = "trace_guest")]
use crate::sandbox::trace::TraceContext as SandboxTraceContext;
/// Reports whether the MSHV hypervisor can be opened on this machine.
///
/// Returns `true` when `Mshv::new()` succeeds. On failure an informational
/// message is logged and `false` is returned; the underlying error is
/// discarded.
#[instrument(skip_all, parent = Span::current(), level = "Trace")]
pub(crate) fn is_hypervisor_present() -> bool {
    let available = Mshv::new().is_ok();
    if !available {
        tracing::info!("MSHV is not available on this system");
    }
    available
}
/// A virtual machine backed by MSHV, holding the VM handle and the handle of
/// the one vCPU created for it.
#[derive(Debug)]
pub(crate) struct MshvVm {
/// VM handle. Reference-counted when `hw-interrupts` is enabled so that a
/// clone can be moved into the timer callback that injects interrupts.
#[cfg(feature = "hw-interrupts")]
vm_fd: Arc<VmFd>,
/// VM handle (owned directly when `hw-interrupts` is disabled).
#[cfg(not(feature = "hw-interrupts"))]
vm_fd: VmFd,
/// Handle of the vCPU used to run the guest and access its registers.
vcpu_fd: VcpuFd,
/// Paravirtual timer thread; `None` until the guest configures a timer.
#[cfg(feature = "hw-interrupts")]
timer: Option<TimerThread>,
}
/// Process-wide MSHV handle, opened lazily on first access.
///
/// The outcome of the first `Mshv::new()` call is cached, so a failure is
/// remembered as `CreateVmError::HypervisorNotAvailable` and handed out to
/// every later caller.
static MSHV: LazyLock<std::result::Result<Mshv, CreateVmError>> =
    LazyLock::new(|| match Mshv::new() {
        Ok(handle) => Ok(handle),
        Err(err) => Err(CreateVmError::HypervisorNotAvailable(err.into())),
    });
impl MshvVm {
    /// Creates a new MSHV-backed VM with a single vCPU (index 0).
    ///
    /// The steps run in this order: create the partition, set the synthetic
    /// processor features property, initialize the partition, create the
    /// vCPU and, when the `hw-interrupts` feature is enabled, initialize the
    /// vCPU's LAPIC state.
    ///
    /// # Errors
    ///
    /// Returns the cached `CreateVmError` if the hypervisor could not be
    /// opened, or the `CreateVmError` variant matching the step that failed.
    #[instrument(skip_all, parent = Span::current(), level = "Trace")]
    pub(crate) fn new() -> std::result::Result<Self, CreateVmError> {
        let hypervisor = MSHV.as_ref().map_err(|err| err.clone())?;

        // Only mutated when `hw-interrupts` is enabled.
        #[allow(unused_mut)]
        let mut partition_args: mshv_create_partition_v2 = Default::default();
        #[cfg(feature = "hw-interrupts")]
        {
            use mshv_bindings::MSHV_PT_BIT_LAPIC;
            partition_args.pt_flags = 1u64 << MSHV_PT_BIT_LAPIC;
        }

        let vm_fd = hypervisor
            .create_vm_with_args(&partition_args)
            .map_err(|e| CreateVmError::CreateVmFd(e.into()))?;

        let synthetic_features: hv_partition_synthetic_processor_features = Default::default();
        // SAFETY: `as_uint64` is a plain-integer view of the union, so any
        // initialised bit pattern is a valid value to read.
        // NOTE(review): assumes `Default` fully initialises the union — confirm
        // against the mshv_bindings definition.
        let feature_bits = unsafe { synthetic_features.as_uint64[0] };

        vm_fd
            .set_partition_property(
                hv_partition_property_code_HV_PARTITION_PROPERTY_SYNTHETIC_PROC_FEATURES,
                feature_bits,
            )
            .map_err(|e| CreateVmError::SetPartitionProperty(e.into()))?;
        vm_fd
            .initialize()
            .map_err(|e| CreateVmError::InitializeVm(e.into()))?;
        let vcpu_fd = vm_fd
            .create_vcpu(0)
            .map_err(|e| CreateVmError::CreateVcpuFd(e.into()))?;

        #[cfg(feature = "hw-interrupts")]
        Self::init_lapic(&vcpu_fd)?;

        Ok(Self {
            #[cfg(feature = "hw-interrupts")]
            vm_fd: Arc::new(vm_fd),
            #[cfg(not(feature = "hw-interrupts"))]
            vm_fd,
            vcpu_fd,
            #[cfg(feature = "hw-interrupts")]
            timer: None,
        })
    }
}
impl VirtualMachine for MshvVm {
/// Maps `region` into the guest through the VM handle.
///
/// The slot index is ignored by this backend.
///
/// # Safety
///
/// NOTE(review): the caller obligations are defined by the
/// `VirtualMachine::map_memory` trait method, which is not visible here —
/// confirm against the trait documentation.
///
/// # Errors
///
/// Returns `MapMemoryError::Hypervisor` if the hypervisor rejects the mapping.
unsafe fn map_memory(
    &mut self,
    (_slot, region): (u32, &MemoryRegion),
) -> std::result::Result<(), MapMemoryError> {
    let user_region: mshv_user_mem_region = region.into();
    match self.vm_fd.map_user_memory(user_region) {
        Ok(()) => Ok(()),
        Err(err) => Err(MapMemoryError::Hypervisor(err.into())),
    }
}
/// Removes the guest mapping described by `region` through the VM handle.
///
/// The slot index is ignored by this backend.
///
/// # Errors
///
/// Returns `UnmapMemoryError::Hypervisor` if the hypervisor rejects the
/// request.
fn unmap_memory(
    &mut self,
    (_slot, region): (u32, &MemoryRegion),
) -> std::result::Result<(), UnmapMemoryError> {
    let user_region: mshv_user_mem_region = region.into();
    match self.vm_fd.unmap_user_memory(user_region) {
        Ok(()) => Ok(()),
        Err(err) => Err(UnmapMemoryError::Hypervisor(err.into())),
    }
}
/// Runs the vCPU and translates the resulting MSHV intercept message (or
/// `run` error) into a `VmExit`.
///
/// Without `hw-interrupts` every path returns after a single `run` call
/// (hence the `never_loop` allowance). With `hw-interrupts`, exits that are
/// fully handled here (timer-related halts, PV timer / hardware port I/O,
/// `EAGAIN`) re-enter the guest instead of returning.
///
/// # Errors
///
/// Returns a `RunVcpuError` when an intercept message cannot be decoded, a
/// register update fails, or `run` fails with an errno other than `EINTR` /
/// `EAGAIN`.
#[cfg_attr(not(feature = "hw-interrupts"), allow(clippy::never_loop))]
fn run_vcpu(
&mut self,
#[cfg(feature = "trace_guest")] tc: &mut SandboxTraceContext,
) -> std::result::Result<VmExit, RunVcpuError> {
// Local aliases so the message types can be used as `match` patterns.
const HALT_MESSAGE: hv_message_type = hv_message_type_HVMSG_X64_HALT;
const IO_PORT_INTERCEPT_MESSAGE: hv_message_type =
hv_message_type_HVMSG_X64_IO_PORT_INTERCEPT;
const UNMAPPED_GPA_MESSAGE: hv_message_type = hv_message_type_HVMSG_UNMAPPED_GPA;
const INVALID_GPA_ACCESS_MESSAGE: hv_message_type = hv_message_type_HVMSG_GPA_INTERCEPT;
#[cfg(gdb)]
const EXCEPTION_INTERCEPT: hv_message_type = hv_message_type_HVMSG_X64_EXCEPTION_INTERCEPT;
#[cfg(feature = "trace_guest")]
tc.setup_guest_trace(Span::current().context());
loop {
let exit_reason = self.vcpu_fd.run();
match exit_reason {
Ok(m) => {
let msg_type = m.header.message_type;
match msg_type {
HALT_MESSAGE => {
// While a timer is active, a halt is not reported to the
// caller; the guest is re-entered instead.
#[cfg(feature = "hw-interrupts")]
if self.timer.as_ref().is_some_and(|t| t.is_active()) {
continue;
}
return Ok(VmExit::Halt());
}
IO_PORT_INTERCEPT_MESSAGE => {
let io_message = m
.to_ioport_info()
.map_err(|_| RunVcpuError::DecodeIOMessage(msg_type))?;
let port_number = io_message.port_number;
let rip = io_message.header.rip;
let rax = io_message.rax;
let instruction_length = io_message.header.instruction_length() as u64;
let is_write = io_message.header.intercept_access_type != 0;
// Advance RIP past the intercepted I/O instruction before
// handling it, so the guest resumes at the next instruction.
self.vcpu_fd
.set_reg(&[hv_register_assoc {
name: hv_register_name_HV_X64_REGISTER_RIP,
value: hv_register_value {
reg64: rip + instruction_length,
},
..Default::default()
}])
.map_err(|e| RunVcpuError::IncrementRip(e.into()))?;
// A write to the halt port is an explicit halt request from
// the guest; any running timer is stopped and dropped first.
if is_write && port_number == VmAction::Halt as u16 {
#[cfg(feature = "hw-interrupts")]
{
if let Some(mut t) = self.timer.take() {
t.stop();
}
}
return Ok(VmExit::Halt());
}
#[cfg(feature = "hw-interrupts")]
{
if is_write {
// Port writes consumed by the hardware-interrupt
// emulation do not reach the caller.
let data = rax.to_le_bytes();
if self.handle_hw_io_out(port_number, &data) {
continue;
}
} else if let Some(val) =
super::super::x86_64::hw_interrupts::handle_io_in(port_number)
{
// Port read served by the emulation: the value is
// placed in RAX and the guest is resumed.
self.vcpu_fd
.set_reg(&[hv_register_assoc {
name: hv_register_name_HV_X64_REGISTER_RAX,
value: hv_register_value { reg64: val },
..Default::default()
}])
.map_err(|e| RunVcpuError::Unknown(e.into()))?;
continue;
}
}
// Keeps `is_write` used in every feature configuration.
let _ = is_write;
// Everything else is surfaced as an I/O-out exit carrying the
// little-endian bytes of RAX.
return Ok(VmExit::IoOut(port_number, rax.to_le_bytes().to_vec()));
}
UNMAPPED_GPA_MESSAGE => {
let mimo_message = m
.to_memory_info()
.map_err(|_| RunVcpuError::DecodeIOMessage(msg_type))?;
let addr = mimo_message.guest_physical_address;
// Classify the access as a read or a write from the message's
// access information.
return match MemoryRegionFlags::try_from(mimo_message)
.map_err(|_| RunVcpuError::ParseGpaAccessInfo)?
{
MemoryRegionFlags::READ => Ok(VmExit::MmioRead(addr)),
MemoryRegionFlags::WRITE => Ok(VmExit::MmioWrite(addr)),
_ => Ok(VmExit::Unknown("Unknown MMIO access".to_string())),
};
}
INVALID_GPA_ACCESS_MESSAGE => {
// Same classification as the unmapped-GPA case above.
let mimo_message = m
.to_memory_info()
.map_err(|_| RunVcpuError::DecodeIOMessage(msg_type))?;
let gpa = mimo_message.guest_physical_address;
let access_info = MemoryRegionFlags::try_from(mimo_message)
.map_err(|_| RunVcpuError::ParseGpaAccessInfo)?;
return match access_info {
MemoryRegionFlags::READ => Ok(VmExit::MmioRead(gpa)),
MemoryRegionFlags::WRITE => Ok(VmExit::MmioWrite(gpa)),
_ => Ok(VmExit::Unknown("Unknown MMIO access".to_string())),
};
}
#[cfg(gdb)]
EXCEPTION_INTERCEPT => {
// Report the exception vector together with the current DR6
// value to the debugger.
let ex_info = m
.to_exception_info()
.map_err(|_| RunVcpuError::DecodeIOMessage(msg_type))?;
let DebugRegisters { dr6, .. } = self
.vcpu_fd
.get_debug_regs()
.map_err(|e| RunVcpuError::GetDr6(e.into()))?;
return Ok(VmExit::Debug {
dr6,
exception: ex_info.exception_vector as u32,
});
}
other => {
return Ok(VmExit::Unknown(format!(
"Unknown MSHV VCPU exit: {:?}",
other
)));
}
}
}
Err(e) => match e.errno() {
// `run` was interrupted: reported as a cancellation.
libc::EINTR => {
return Ok(VmExit::Cancelled());
}
// Without `hw-interrupts` the caller is asked to retry; with it,
// the guest is re-entered directly.
libc::EAGAIN => {
#[cfg(not(feature = "hw-interrupts"))]
{
return Ok(VmExit::Retry());
}
#[cfg(feature = "hw-interrupts")]
continue;
}
_ => return Err(RunVcpuError::Unknown(e.into())),
},
}
}
}
fn regs(&self) -> std::result::Result<CommonRegisters, RegisterError> {
let mshv_regs = self
.vcpu_fd
.get_regs()
.map_err(|e| RegisterError::GetRegs(e.into()))?;
Ok((&mshv_regs).into())
}
/// Converts `regs` to MSHV `StandardRegisters` and writes them to the vCPU.
///
/// # Errors
///
/// Returns `RegisterError::SetRegs` if the hypervisor call fails.
fn set_regs(&self, regs: &CommonRegisters) -> std::result::Result<(), RegisterError> {
    let native: StandardRegisters = regs.into();
    match self.vcpu_fd.set_regs(&native) {
        Ok(_) => Ok(()),
        Err(err) => Err(RegisterError::SetRegs(err.into())),
    }
}
fn fpu(&self) -> std::result::Result<CommonFpu, RegisterError> {
let mshv_fpu = self
.vcpu_fd
.get_fpu()
.map_err(|e| RegisterError::GetFpu(e.into()))?;
Ok((&mshv_fpu).into())
}
/// Converts `fpu` to an MSHV `FloatingPointUnit` and writes it to the vCPU.
///
/// # Errors
///
/// Returns `RegisterError::SetFpu` if the hypervisor call fails.
fn set_fpu(&self, fpu: &CommonFpu) -> std::result::Result<(), RegisterError> {
    let native: FloatingPointUnit = fpu.into();
    match self.vcpu_fd.set_fpu(&native) {
        Ok(_) => Ok(()),
        Err(err) => Err(RegisterError::SetFpu(err.into())),
    }
}
fn sregs(&self) -> std::result::Result<CommonSpecialRegisters, RegisterError> {
let mshv_sregs = self
.vcpu_fd
.get_sregs()
.map_err(|e| RegisterError::GetSregs(e.into()))?;
Ok((&mshv_sregs).into())
}
/// Converts `sregs` to MSHV `SpecialRegisters` and writes them to the vCPU.
///
/// # Errors
///
/// Returns `RegisterError::SetSregs` if the hypervisor call fails.
fn set_sregs(&self, sregs: &CommonSpecialRegisters) -> std::result::Result<(), RegisterError> {
    let native: SpecialRegisters = sregs.into();
    match self.vcpu_fd.set_sregs(&native) {
        Ok(_) => Ok(()),
        Err(err) => Err(RegisterError::SetSregs(err.into())),
    }
}
/// Reads the vCPU's debug registers and converts them to `CommonDebugRegs`.
///
/// # Errors
///
/// Returns `RegisterError::GetDebugRegs` if the hypervisor call fails.
fn debug_regs(&self) -> std::result::Result<CommonDebugRegs, RegisterError> {
    match self.vcpu_fd.get_debug_regs() {
        Ok(native) => Ok(native.into()),
        Err(err) => Err(RegisterError::GetDebugRegs(err.into())),
    }
}
/// Converts `drs` to the MSHV debug-register representation and writes it to
/// the vCPU.
///
/// # Errors
///
/// Returns `RegisterError::SetDebugRegs` if the hypervisor call fails.
fn set_debug_regs(&self, drs: &CommonDebugRegs) -> std::result::Result<(), RegisterError> {
    // The target type is inferred from the `set_debug_regs` parameter.
    let native = drs.into();
    match self.vcpu_fd.set_debug_regs(&native) {
        Ok(_) => Ok(()),
        Err(err) => Err(RegisterError::SetDebugRegs(err.into())),
    }
}
/// Returns a copy of the vCPU's XSAVE buffer as raw bytes.
///
/// # Errors
///
/// Returns `RegisterError::GetXsave` if the hypervisor call fails.
#[allow(dead_code)]
fn xsave(&self) -> std::result::Result<Vec<u8>, RegisterError> {
    match self.vcpu_fd.get_xsave() {
        Ok(state) => Ok(state.buffer.to_vec()),
        Err(err) => Err(RegisterError::GetXsave(err.into())),
    }
}
/// Resets the vCPU's extended state to defaults.
///
/// A fresh, default `XSave` buffer is written back with only these fields
/// populated (offsets follow the x86 XSAVE area layout):
///
/// * bytes `0..2`     — x87 control word, set to `FP_CONTROL_WORD_DEFAULT`
/// * bytes `24..28`   — MXCSR, set to `MXCSR_DEFAULT`
/// * bytes `512..520` — XSTATE_BV, set to `0x3` (x87 and SSE components)
/// * bytes `520..528` — XCOMP_BV, preserved from the vCPU's current state
///
/// # Errors
///
/// Returns `RegisterError::GetXsave` / `RegisterError::SetXsave` if the
/// hypervisor calls fail, or `RegisterError::XsaveSizeMismatch` if the
/// current buffer is shorter than `XSAVE_MIN_SIZE`.
fn reset_xsave(&self) -> std::result::Result<(), RegisterError> {
    let current_xsave = self
        .vcpu_fd
        .get_xsave()
        .map_err(|e| RegisterError::GetXsave(e.into()))?;
    // NOTE(review): presumably `XSAVE_MIN_SIZE` is at least 528 so that the
    // slices below are in range — confirm against its definition.
    if current_xsave.buffer.len() < XSAVE_MIN_SIZE {
        return Err(RegisterError::XsaveSizeMismatch {
            expected: XSAVE_MIN_SIZE as u32,
            actual: current_xsave.buffer.len() as u32,
        });
    }
    let mut buf = XSave::default();
    // Carry over the second header word from the live state.
    buf.buffer[520..528].copy_from_slice(&current_xsave.buffer[520..528]);
    buf.buffer[0..2].copy_from_slice(&FP_CONTROL_WORD_DEFAULT.to_le_bytes());
    buf.buffer[24..28].copy_from_slice(&MXCSR_DEFAULT.to_le_bytes());
    buf.buffer[512..520].copy_from_slice(&0x3u64.to_le_bytes());
    self.vcpu_fd
        .set_xsave(&buf)
        .map_err(|e| RegisterError::SetXsave(e.into()))?;
    Ok(())
}
/// Test-only helper that overwrites the vCPU's XSAVE buffer with `xsave`.
///
/// `xsave` is reinterpreted as bytes and must be exactly
/// `XSAVE_BUFFER_SIZE` bytes long.
///
/// # Errors
///
/// Returns `RegisterError::XsaveSizeMismatch` for a wrongly sized input,
/// `RegisterError::InvalidXsaveAlignment` if the byte reinterpretation leaves
/// a prefix or suffix, and `RegisterError::SetXsave` if the hypervisor call
/// fails.
#[cfg(test)]
#[cfg(not(feature = "nanvix-unstable"))]
fn set_xsave(&self, xsave: &[u32]) -> std::result::Result<(), RegisterError> {
    let byte_len = std::mem::size_of_val(xsave);
    if byte_len != XSAVE_BUFFER_SIZE {
        return Err(RegisterError::XsaveSizeMismatch {
            expected: XSAVE_BUFFER_SIZE as u32,
            actual: byte_len as u32,
        });
    }
    // SAFETY: every bit pattern of a `u32` is a valid sequence of `u8`s.
    let (head, body, tail) = unsafe { xsave.align_to::<u8>() };
    if !(head.is_empty() && tail.is_empty()) {
        return Err(RegisterError::InvalidXsaveAlignment);
    }
    let state = XSave {
        buffer: body
            .try_into()
            .expect("xsave slice has correct length and prefix and suffix are empty"),
    };
    match self.vcpu_fd.set_xsave(&state) {
        Ok(_) => Ok(()),
        Err(err) => Err(RegisterError::SetXsave(err.into())),
    }
}
}
#[cfg(gdb)]
impl DebuggableVm for MshvVm {
/// Translates the guest virtual address `gva` through the vCPU, requesting
/// read validation, and returns the translated address.
///
/// # Errors
///
/// Returns `DebugError::TranslateGva(gva)` if the translation fails; the
/// underlying hypervisor error is discarded.
fn translate_gva(&self, gva: u64) -> std::result::Result<u64, DebugError> {
    use mshv_bindings::HV_TRANSLATE_GVA_VALIDATE_READ;

    match self
        .vcpu_fd
        .translate_gva(gva, HV_TRANSLATE_GVA_VALIDATE_READ as u64)
    {
        // The second tuple element is not needed by the debugger.
        Ok((translated, _)) => Ok(translated),
        Err(_) => Err(DebugError::TranslateGva(gva)),
    }
}
/// Installs or removes the exception intercepts used for debugging.
///
/// For both `DB_EX_ID` and `BP_EX_ID` an exception intercept is installed
/// with the execute access mask when `enabled` is `true`, or with the empty
/// access mask when it is `false`.
///
/// # Errors
///
/// Returns `DebugError::Intercept` on the first intercept that fails to
/// install; intercepts processed before the failure are not rolled back.
fn set_debug(&mut self, enabled: bool) -> std::result::Result<(), DebugError> {
    use mshv_bindings::{
        HV_INTERCEPT_ACCESS_MASK_EXECUTE, HV_INTERCEPT_ACCESS_MASK_NONE,
        hv_intercept_parameters, hv_intercept_type_HV_INTERCEPT_TYPE_EXCEPTION,
        mshv_install_intercept,
    };

    use crate::hypervisor::gdb::arch::{BP_EX_ID, DB_EX_ID};

    let access_type_mask = if enabled {
        HV_INTERCEPT_ACCESS_MASK_EXECUTE
    } else {
        HV_INTERCEPT_ACCESS_MASK_NONE
    };

    for vector in [DB_EX_ID, BP_EX_ID] {
        let request = mshv_install_intercept {
            access_type_mask,
            intercept_type: hv_intercept_type_HV_INTERCEPT_TYPE_EXCEPTION,
            intercept_parameter: hv_intercept_parameters {
                exception_vector: vector as u16,
            },
        };
        if let Err(err) = self.vm_fd.install_intercept(request) {
            return Err(DebugError::Intercept {
                enable: enabled,
                inner: err.into(),
            });
        }
    }
    Ok(())
}
fn set_single_step(&mut self, enable: bool) -> std::result::Result<(), DebugError> {
let mut regs = self.regs()?;
if enable {
regs.rflags |= 1 << 8;
} else {
regs.rflags &= !(1 << 8);
}
self.set_regs(®s)?;
Ok(())
}
/// Arms a hardware breakpoint at `addr`.
///
/// If `addr` is already held by one of DR0–DR3 the call is a no-op.
/// Otherwise the first slot whose enable bit in DR7 (bit `2 * i` for slot
/// `i`) is clear receives `addr` and that bit is set.
///
/// # Errors
///
/// Returns `DebugError::TooManyHwBreakpoints` when all `MAX_NO_OF_HW_BP`
/// slots are in use, and propagates failures from reading or writing the
/// debug registers.
fn add_hw_breakpoint(&mut self, addr: u64) -> std::result::Result<(), DebugError> {
    use crate::hypervisor::gdb::arch::MAX_NO_OF_HW_BP;

    let mut regs = self.debug_regs()?;
    // NOTE(review): this check looks only at the address registers, not at
    // the DR7 enable bits, so an address equal to the value left in an unused
    // slot (e.g. 0) is reported as already set — confirm this is intended.
    if [regs.dr0, regs.dr1, regs.dr2, regs.dr3].contains(&addr) {
        return Ok(());
    }
    let i = (0..MAX_NO_OF_HW_BP)
        .position(|i| regs.dr7 & (1 << (i * 2)) == 0)
        .ok_or(DebugError::TooManyHwBreakpoints(MAX_NO_OF_HW_BP))?;
    *[&mut regs.dr0, &mut regs.dr1, &mut regs.dr2, &mut regs.dr3][i] = addr;
    regs.dr7 |= 1 << (i * 2);
    self.set_debug_regs(&regs)?;
    Ok(())
}
fn remove_hw_breakpoint(&mut self, addr: u64) -> std::result::Result<(), DebugError> {
let mut debug_regs = self.debug_regs()?;
let regs = [
&mut debug_regs.dr0,
&mut debug_regs.dr1,
&mut debug_regs.dr2,
&mut debug_regs.dr3,
];
if let Some(i) = regs.iter().position(|&&mut reg| reg == addr) {
*regs[i] = 0;
debug_regs.dr7 &= !(1 << (i * 2));
self.set_debug_regs(&debug_regs)?;
Ok(())
} else {
Err(DebugError::HwBreakpointNotFound(addr))
}
}
}
#[cfg(feature = "hw-interrupts")]
/// Views the LAPIC register page (`c_char` elements) as a byte slice of the
/// same length.
fn lapic_regs_as_u8(regs: &[::std::os::raw::c_char; 1024]) -> &[u8] {
    let len = regs.len();
    // SAFETY: `c_char` is either `i8` or `u8`, both of which have the same
    // size and alignment as `u8` and no invalid bit patterns. The pointer and
    // length come from a live shared borrow whose lifetime the result keeps.
    unsafe { ::std::slice::from_raw_parts(regs.as_ptr().cast::<u8>(), len) }
}
#[cfg(feature = "hw-interrupts")]
/// Views the LAPIC register page (`c_char` elements) as a mutable byte slice
/// of the same length.
fn lapic_regs_as_u8_mut(regs: &mut [::std::os::raw::c_char; 1024]) -> &mut [u8] {
    let len = regs.len();
    // SAFETY: `c_char` is either `i8` or `u8`, both of which have the same
    // size and alignment as `u8` and no invalid bit patterns. The pointer and
    // length come from a live exclusive borrow whose lifetime the result
    // keeps, so no aliasing is introduced.
    unsafe { ::std::slice::from_raw_parts_mut(regs.as_mut_ptr().cast::<u8>(), len) }
}
#[cfg(feature = "hw-interrupts")]
impl MshvVm {
/// Value written to the APIC base register when the guest's APIC is found
/// globally disabled: base address `0xFEE00000` with the BSP flag (bit 8) and
/// the global enable flag (bit 11) set.
const APIC_BASE_DEFAULT: u64 = 0xFEE00900;
/// Initializes the LAPIC state of a freshly created vCPU.
///
/// The current LAPIC state is read, its register page is passed to
/// `init_lapic_registers`, and the result is written back.
///
/// # Errors
///
/// Both the read and the write failure are reported as
/// `CreateVmError::InitializeVm`.
fn init_lapic(vcpu_fd: &VcpuFd) -> std::result::Result<(), CreateVmError> {
    use super::super::x86_64::hw_interrupts::init_lapic_registers;

    let mut state: LapicState = match vcpu_fd.get_lapic() {
        Ok(state) => state,
        Err(err) => return Err(CreateVmError::InitializeVm(err.into())),
    };
    init_lapic_registers(lapic_regs_as_u8_mut(&mut state.regs));
    match vcpu_fd.set_lapic(&state) {
        Ok(_) => Ok(()),
        Err(err) => Err(CreateVmError::InitializeVm(err.into())),
    }
}
/// Applies an end-of-interrupt to the vCPU's LAPIC state.
///
/// Best effort: if the LAPIC state cannot be read nothing happens, and a
/// failure to write it back is ignored.
fn do_lapic_eoi(&self) {
    let Ok(mut lapic) = self.vcpu_fd.get_lapic() else {
        return;
    };
    super::super::x86_64::hw_interrupts::lapic_eoi(lapic_regs_as_u8_mut(&mut lapic.regs));
    let _ = self.vcpu_fd.set_lapic(&lapic);
}
/// Handles a guest port write that may belong to the hardware-interrupt
/// emulation.
///
/// * A write to the `VmAction::PvTimerConfig` port (re)configures the
///   paravirtual timer and always returns `true`. The first time (no timer
///   exists yet) the APIC is enabled on a best-effort basis beforehand.
/// * Any other port is forwarded to `handle_common_io_out`, whose result is
///   returned.
///
/// The `run_vcpu` loop re-enters the guest when this returns `true` and
/// otherwise reports the write to its caller.
fn handle_hw_io_out(&mut self, port: u16, data: &[u8]) -> bool {
    use super::super::x86_64::hw_interrupts as hw;

    if port != VmAction::PvTimerConfig as u16 {
        let timer_active = self.timer.as_ref().is_some_and(|t| t.is_active());
        return hw::handle_common_io_out(port, data, timer_active, || self.do_lapic_eoi());
    }

    if self.timer.is_none() {
        use mshv_bindings::hv_register_name_HV_X64_REGISTER_APIC_BASE;

        // Make sure the APIC is globally enabled (bit 11 of the APIC base
        // register). Failures are ignored on purpose.
        let mut apic_base = [hv_register_assoc {
            name: hv_register_name_HV_X64_REGISTER_APIC_BASE,
            value: hv_register_value { reg64: 0 },
            ..Default::default()
        }];
        if self.vcpu_fd.get_reg(&mut apic_base).is_ok() {
            // SAFETY: `reg64` is a plain-integer view of the register value
            // union, initialised above and overwritten by `get_reg`.
            let current = unsafe { apic_base[0].value.reg64 };
            let globally_enabled = current & (1 << 11) != 0;
            if !globally_enabled {
                let _ = self.vcpu_fd.set_reg(&[hv_register_assoc {
                    name: hv_register_name_HV_X64_REGISTER_APIC_BASE,
                    value: hv_register_value {
                        reg64: Self::APIC_BASE_DEFAULT,
                    },
                    ..Default::default()
                }]);
            }
        }

        // If bit 8 of the register at offset 0xF0 (spurious-interrupt vector
        // register) is clear, write 0x1FF there and zero the register at
        // offset 0x80. Failures are ignored on purpose.
        if let Ok(mut lapic) = self.vcpu_fd.get_lapic() {
            let svr = hw::read_lapic_u32(lapic_regs_as_u8(&lapic.regs), 0xF0);
            let software_enabled = svr & 0x100 != 0;
            if !software_enabled {
                let bytes = lapic_regs_as_u8_mut(&mut lapic.regs);
                hw::write_lapic_u32(bytes, 0xF0, 0x1FF);
                hw::write_lapic_u32(bytes, 0x80, 0);
                let _ = self.vcpu_fd.set_lapic(&lapic);
            }
        }
    }

    // The callback owns its own reference to the VM handle so it can be
    // invoked independently of `self`.
    let vm_fd = Arc::clone(&self.vm_fd);
    let vector = hw::TIMER_VECTOR;
    hw::handle_pv_timer_config(&mut self.timer, data, move || {
        let request = InterruptRequest {
            interrupt_type: hv_interrupt_type_HV_X64_INTERRUPT_TYPE_FIXED,
            apic_id: 0,
            vector,
            level_triggered: false,
            logical_destination_mode: false,
            long_mode: false,
        };
        if let Err(e) = vm_fd.request_virtual_interrupt(&request) {
            tracing::warn!("MSHV request_virtual_interrupt failed: {e}");
        }
    });
    true
}
}
/// Unit tests for the hardware-interrupt helpers that need no hypervisor.
#[cfg(test)]
#[cfg(feature = "hw-interrupts")]
mod hw_interrupt_tests {
    use super::*;

    /// A value written through the mutable byte view must be readable through
    /// the shared byte view of the same register page.
    #[test]
    fn lapic_regs_conversion_roundtrip() {
        let mut regs = [0i8; 1024];
        let bytes = lapic_regs_as_u8_mut(&mut regs);
        super::super::super::x86_64::hw_interrupts::write_lapic_u32(bytes, 0xF0, 0xDEAD_BEEF);
        let bytes = lapic_regs_as_u8(&regs);
        assert_eq!(
            super::super::super::x86_64::hw_interrupts::read_lapic_u32(bytes, 0xF0),
            0xDEAD_BEEF
        );
    }

    /// Pins the flag bits and base address encoded in `APIC_BASE_DEFAULT`.
    #[test]
    fn apic_base_default_value() {
        let base = MshvVm::APIC_BASE_DEFAULT;
        assert_ne!(base & (1 << 8), 0, "BSP flag should be set");
        assert_ne!(base & (1 << 11), 0, "global enable should be set");
        assert_eq!(
            base & 0xFFFFF000,
            0xFEE00000,
            "base address should be 0xFEE00000"
        );
    }
}