use alloc::collections::VecDeque;
use core::{
arch::naked_asm,
fmt::{Debug, Formatter, Result},
mem::size_of,
};
use ax_errno::{AxResult, ax_err, ax_err_type};
use axaddrspace::{
GuestPhysAddr, GuestVirtAddr, HostPhysAddr, NestedPageFaultInfo,
device::{AccessWidth, Port, SysRegAddr, SysRegAddrRange},
};
use axdevice_base::BaseDeviceOps;
use axvcpu::{AxArchVCpu, AxVCpuExitReason};
use axvisor_api::vmm::{VCpuId, VMId};
use bit_field::BitField;
use raw_cpuid::CpuId;
use x86::{
bits64::vmx,
controlregs::Xcr0,
dtables::{self, DescriptorTablePointer},
segmentation::SegmentSelector,
};
use x86_64::registers::control::{Cr0, Cr0Flags, Cr3, Cr4, Cr4Flags, EferFlags};
use x86_vlapic::EmulatedLocalApic;
use super::{
VmxExitInfo, as_axerr,
definitions::VmxExitReason,
structs::{IOBitmap, MsrBitmap, VmxRegion},
vmcs::{
self, ApicAccessExitType, VmcsControl32, VmcsControl64, VmcsControlNW, VmcsGuest16,
VmcsGuest32, VmcsGuest64, VmcsGuestNW, VmcsHost16, VmcsHost32, VmcsHost64, VmcsHostNW,
},
};
use crate::{
ept::GuestPageWalkInfo, msr::Msr, regs::GeneralRegisters, restore_host_interrupt_flag,
xstate::XState,
};
/// Value written to the VMX-preemption timer field at guest setup and again
/// on every preemption-timer VM-exit.
const VMX_PREEMPTION_TIMER_SET_VALUE: u32 = 1_000_000;
/// I/O port that is always intercepted so a guest write can be reported as a
/// shutdown request.
const QEMU_EXIT_PORT: u16 = 0x604;
/// Value of guest RAX that, written as a 16-bit access to `QEMU_EXIT_PORT`,
/// is reported as `SystemDown`.
const QEMU_EXIT_MAGIC: u64 = 0x2000;
/// Operating mode of a virtual CPU, as classified by `VmxVcpu::get_cpu_mode`.
///
/// The enum is a plain tag, so it is `Copy`: callers can keep using a value
/// after comparing or passing it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VmCpuMode {
    /// Long mode inactive and CR0.PE clear.
    Real,
    /// Long mode inactive and CR0.PE set.
    Protected,
    /// Long mode active, code segment without the L bit.
    Compatibility,
    /// Long mode active, code segment with the L bit.
    Mode64,
}
/// Long Mode Active flag (bit 10) of the IA32_EFER MSR.
const MSR_IA32_EFER_LMA_BIT: u64 = 1 << 10;
/// Protection Enable flag (bit 0) of CR0.
const CR0_PE: usize = 1 << 0;
/// State of one virtual CPU running under Intel VMX.
///
/// The struct is `#[repr(C)]` because the VM-entry/VM-exit assembly stubs
/// address the first three fields by fixed offsets from the `VmxVcpu`
/// pointer. Do not reorder `guest_regs`, `host_stack_top` or `host_rflags`.
#[repr(C)]
pub struct VmxVcpu {
// Guest general-purpose registers. Must stay at offset 0: the entry stub
// points RSP at the vCPU itself to pop the guest registers.
guest_regs: GeneralRegisters,
// Host RSP saved by the entry stub at offset `size_of::<GeneralRegisters>()`.
// Its address is also what the host-RSP VMCS field is set to on first launch.
host_stack_top: u64,
// Host RFLAGS saved by the entry stub (offset: previous field + 8) and
// passed to `restore_host_interrupt_flag` after every VM-exit.
host_rflags: u64,
// True once the VMCS has been launched, so later entries use VMRESUME.
launched: bool,
// Guest entry point recorded by `AxArchVCpu::set_entry`.
entry: Option<GuestPhysAddr>,
// EPT root recorded by `AxArchVCpu::set_ept_root`.
ept_root: Option<HostPhysAddr>,
// Backing region of this vCPU's VMCS.
vmcs: VmxRegion,
// I/O bitmap; created in pass-through mode.
io_bitmap: IOBitmap,
// MSR bitmap; created in pass-through mode.
msr_bitmap: MsrBitmap,
// Events waiting for injection, as (vector, optional error code) pairs.
pending_events: VecDeque<(u8, Option<u32>)>,
// Emulated local APIC that serves intercepted x2APIC MSR accesses.
vlapic: EmulatedLocalApic,
// Extended-state bookkeeping switched between host and guest around entries.
xstate: XState,
// Snapshot of `guest_regs` taken right after the last VM-exit; only used to
// log which registers the exit handling modified.
#[cfg(feature = "tracing")]
guest_regs_exiting: GeneralRegisters,
}
impl VmxVcpu {
/// Creates a vCPU that has not been launched yet.
///
/// Allocates the VMCS region and pass-through I/O and MSR bitmaps, and
/// creates the emulated local APIC for `(vm_id, vcpu_id)`.
///
/// # Errors
///
/// Propagates any allocation failure of the VMCS region or the bitmaps.
pub fn new(vm_id: VMId, vcpu_id: VCpuId) -> AxResult<Self> {
    let revision = super::read_vmcs_revision_id();
    // Fallible allocations first, in the same order as the field list.
    let vmcs = VmxRegion::new(revision, false)?;
    let io_bitmap = IOBitmap::passthrough_all()?;
    let msr_bitmap = MsrBitmap::passthrough_all()?;
    let this = Self {
        guest_regs: GeneralRegisters::default(),
        host_stack_top: 0,
        host_rflags: 0,
        launched: false,
        entry: None,
        ept_root: None,
        vmcs,
        io_bitmap,
        msr_bitmap,
        pending_events: VecDeque::with_capacity(8),
        vlapic: EmulatedLocalApic::new(vm_id, vcpu_id),
        xstate: XState::new(),
        #[cfg(feature = "tracing")]
        guest_regs_exiting: GeneralRegisters::default(),
    };
    info!("[HV] created VmxVcpu(vmcs: {:#x})", this.vmcs.phys_addr());
    Ok(this)
}
/// Initializes the VMCS for a guest that starts at `entry` and uses
/// `ept_root` as its nested page table root.
///
/// # Errors
///
/// Returns whatever error `setup_vmcs` reports.
pub fn setup(&mut self, ept_root: HostPhysAddr, entry: GuestPhysAddr) -> AxResult {
    // Note the swapped argument order of the internal helper.
    self.setup_vmcs(entry, ept_root)
}
/// Makes this vCPU's VMCS current on the calling processor (`VMPTRLD`) and
/// refreshes the host-state fields from that processor's registers.
///
/// # Errors
///
/// Fails if `VMPTRLD` fails or if any host-state field cannot be written.
pub fn bind_to_current_processor(&self) -> AxResult {
    let vmcs_paddr = self.vmcs.phys_addr();
    debug!(
        "VmxVcpu bind to current processor vmcs @ {:#x}",
        vmcs_paddr
    );
    let loaded = unsafe { vmx::vmptrld(self.vmcs.phys_addr().as_usize() as u64) };
    loaded.map_err(as_axerr)?;
    self.setup_vmcs_host()
}
/// Releases this vCPU's VMCS from the calling processor with `VMCLEAR`.
///
/// # Errors
///
/// Fails if `VMCLEAR` reports an error.
pub fn unbind_from_current_processor(&self) -> AxResult {
    let vmcs_paddr = self.vmcs.phys_addr();
    debug!(
        "VmxVcpu unbind from current processor vmcs @ {:#x}",
        vmcs_paddr
    );
    let cleared = unsafe { vmx::vmclear(self.vmcs.phys_addr().as_usize() as u64) };
    cleared.map_err(as_axerr)
}
pub fn get_cpu_mode(&self) -> VmCpuMode {
let ia32_efer = Msr::IA32_EFER.read();
let cs_access_right = VmcsGuest32::CS_ACCESS_RIGHTS.read().unwrap();
let cr0 = VmcsGuestNW::CR0.read().unwrap();
if (ia32_efer & MSR_IA32_EFER_LMA_BIT) != 0 {
if (cs_access_right & 0x2000) != 0 {
VmCpuMode::Mode64
} else {
VmCpuMode::Compatibility
}
} else if (cr0 & CR0_PE) != 0 {
VmCpuMode::Protected
} else {
VmCpuMode::Real
}
}
/// Enters the guest once and processes the resulting VM-exit.
///
/// Returns `None` when the exit was fully handled by
/// `builtin_vmexit_handler`, or `Some(exit_info)` when the caller has to
/// handle it.
///
/// # Panics
///
/// Panics if pending events cannot be injected, if the host RSP field or the
/// exit information cannot be accessed, or if a built-in exit handler
/// returns an error.
pub fn inner_run(&mut self) -> Option<VmxExitInfo> {
// Inject at most one queued event, or request an interrupt-window exit.
self.inject_pending_events().unwrap();
self.load_guest_xstate();
#[cfg(feature = "tracing")]
{
// Log what the exit handling changed since the previous VM-exit.
use crate::regs::GeneralRegistersDiff;
let diff = GeneralRegistersDiff::new(self.guest_regs_exiting, self.guest_regs);
if !diff.is_same() {
debug!("VCpu registers changed during handling VM-exit: {diff:#x?}");
} else {
debug!("VCpu registers unchanged during handling VM-exit");
}
}
unsafe {
if self.launched {
self.vmx_resume();
} else {
self.launched = true;
// First entry only: make the host RSP loaded on VM-exit point at
// `host_stack_top`, i.e. just past `guest_regs` in this struct.
VmcsHostNW::RSP
.write(&self.host_stack_top as *const _ as usize)
.unwrap();
self.vmx_launch();
}
}
// Execution continues here after the VM-exit stub returns.
self.load_host_xstate();
restore_host_interrupt_flag(self.host_rflags);
#[cfg(feature = "tracing")]
{
self.guest_regs_exiting = self.guest_regs;
}
let exit_info = self.exit_info().unwrap();
match self.builtin_vmexit_handler(&exit_info) {
Some(result) => match result {
Ok(()) => None,
Err(err) => {
panic!(
"VmxVcpu failed to handle a VM-exit that should be handled by itself: \
{:?}, error {:?}, vcpu: {:#x?}",
exit_info.exit_reason, err, self
);
}
},
// Not a built-in exit: hand it to the caller.
None => Some(exit_info),
}
}
/// Returns the basic VM-exit information; forwards to `vmcs::exit_info`.
pub fn exit_info(&self) -> AxResult<vmcs::VmxExitInfo> {
vmcs::exit_info()
}
/// Returns the raw 32-bit interruption information of the last exit;
/// forwards to `vmcs::raw_interrupt_exit_info`.
pub fn raw_interrupt_exit_info(&self) -> AxResult<u32> {
vmcs::raw_interrupt_exit_info()
}
/// Returns the decoded interruption information of the last exit; forwards
/// to `vmcs::interrupt_exit_info`.
pub fn interrupt_exit_info(&self) -> AxResult<vmcs::VmxInterruptInfo> {
vmcs::interrupt_exit_info()
}
/// Returns the I/O-instruction exit details; forwards to
/// `vmcs::io_exit_info`.
pub fn io_exit_info(&self) -> AxResult<vmcs::VmxIoExitInfo> {
vmcs::io_exit_info()
}
/// Returns the EPT-violation details as a nested page fault description;
/// forwards to `vmcs::ept_violation_info`.
pub fn nested_page_fault_info(&self) -> AxResult<NestedPageFaultInfo> {
vmcs::ept_violation_info()
}
/// Returns the APIC-access exit details; forwards to
/// `vmcs::apic_access_exit_info`.
pub fn apic_access_exit_info(&self) -> AxResult<vmcs::ApicAccessExitInfo> {
vmcs::apic_access_exit_info()
}
/// Shared access to the saved guest general-purpose registers.
pub fn regs(&self) -> &GeneralRegisters {
&self.guest_regs
}
/// Mutable access to the saved guest general-purpose registers.
pub fn regs_mut(&mut self) -> &mut GeneralRegisters {
&mut self.guest_regs
}
/// Reads the guest RSP from the VMCS.
///
/// # Panics
///
/// Panics if the VMCS field cannot be read.
pub fn stack_pointer(&self) -> usize {
VmcsGuestNW::RSP.read().unwrap()
}
/// Writes the guest RSP into the VMCS.
///
/// # Panics
///
/// Panics if the VMCS field cannot be written.
pub fn set_stack_pointer(&mut self, rsp: usize) {
VmcsGuestNW::RSP.write(rsp).unwrap()
}
/// Adds the code-segment base to `guest_rip`.
///
/// In 64-bit mode the base is taken as zero; in every other mode it is read
/// from the guest CS base field of the VMCS.
///
/// # Panics
///
/// Panics if the VMCS fields needed for the decision cannot be read.
pub fn gla2gva(&self, guest_rip: GuestVirtAddr) -> GuestVirtAddr {
    let cs_base = match self.get_cpu_mode() {
        VmCpuMode::Mode64 => 0,
        VmCpuMode::Real | VmCpuMode::Protected | VmCpuMode::Compatibility => {
            VmcsGuestNW::CS_BASE.read().unwrap()
        }
    };
    guest_rip + cs_base
}
pub fn get_ptw_info(&self) -> GuestPageWalkInfo {
let top_entry = VmcsGuestNW::CR3.read().unwrap();
let level = self.get_paging_level();
let is_write_access = false;
let is_inst_fetch = false;
let is_user_mode_access = ((VmcsGuest32::SS_ACCESS_RIGHTS.read().unwrap() >> 5) & 0x3) == 3;
let mut pse = true;
let mut nxe =
(VmcsGuest64::IA32_EFER.read().unwrap() & EferFlags::NO_EXECUTE_ENABLE.bits()) != 0;
let wp = (VmcsGuestNW::CR0.read().unwrap() & Cr0Flags::WRITE_PROTECT.bits() as usize) != 0;
let is_smap_on = (VmcsGuestNW::CR4.read().unwrap()
& Cr4Flags::SUPERVISOR_MODE_ACCESS_PREVENTION.bits() as usize)
!= 0;
let is_smep_on = (VmcsGuestNW::CR4.read().unwrap()
& Cr4Flags::SUPERVISOR_MODE_EXECUTION_PROTECTION.bits() as usize)
!= 0;
let width: u32;
if level == 4 || level == 3 {
width = 9;
} else if level == 2 {
width = 10;
pse = VmcsGuestNW::CR4.read().unwrap() & Cr4Flags::PAGE_SIZE_EXTENSION.bits() as usize
!= 0;
nxe = false;
} else {
width = 0;
}
GuestPageWalkInfo {
top_entry,
level,
width,
is_user_mode_access,
is_write_access,
is_inst_fetch,
pse,
wp,
nxe,
is_smap_on,
is_smep_on,
}
}
/// Reads the guest RIP from the VMCS.
///
/// # Panics
///
/// Panics if the VMCS field cannot be read.
pub fn rip(&self) -> usize {
VmcsGuestNW::RIP.read().unwrap()
}
/// Reads the guest CS selector from the VMCS.
///
/// # Panics
///
/// Panics if the VMCS field cannot be read.
pub fn cs(&self) -> u16 {
VmcsGuest16::CS_SELECTOR.read().unwrap()
}
/// Moves the guest RIP forward by `instr_len` bytes.
///
/// # Errors
///
/// Fails if the guest RIP field cannot be read or written.
pub fn advance_rip(&mut self, instr_len: u8) -> AxResult {
    let current = VmcsGuestNW::RIP.read()?;
    let next = current + usize::from(instr_len);
    VmcsGuestNW::RIP.write(next)
}
/// Appends an event (vector plus optional error code) to the queue that
/// `inject_pending_events` drains before the next VM-entry.
pub fn queue_event(&mut self, vector: u8, err_code: Option<u32>) {
self.pending_events.push_back((vector, err_code));
}
/// Sets or clears the interrupt-window exiting bit in the primary
/// processor-based execution controls.
///
/// # Errors
///
/// Fails if the control field cannot be read or written.
pub fn set_interrupt_window(&mut self, enable: bool) -> AxResult {
    let window_bit = vmcs::controls::PrimaryControls::INTERRUPT_WINDOW_EXITING.bits();
    let current = VmcsControl32::PRIMARY_PROCBASED_EXEC_CONTROLS.read()?;
    let updated = if enable {
        current | window_bit
    } else {
        current & !window_bit
    };
    VmcsControl32::PRIMARY_PROCBASED_EXEC_CONTROLS.write(updated)
}
/// Enables or disables interception for `count` I/O ports starting at
/// `port_base`; forwards to the I/O bitmap.
pub fn set_io_intercept_of_range(&mut self, port_base: u32, count: u32, intercept: bool) {
self.io_bitmap
.set_intercept_of_range(port_base, count, intercept)
}
/// Enables or disables interception of both reads and writes of `msr`.
///
/// Despite the name, this affects the single MSR given, not a range.
pub fn set_msr_intercept_of_range(&mut self, msr: u32, intercept: bool) {
self.msr_bitmap.set_read_intercept(msr, intercept);
self.msr_bitmap.set_write_intercept(msr, intercept);
}
}
impl VmxVcpu {
/// Marks the single port `QEMU_EXIT_PORT` as intercepted in the I/O bitmap.
fn setup_io_bitmap(&mut self) -> AxResult {
    let first_port = u32::from(QEMU_EXIT_PORT);
    // Only the exit port itself is intercepted.
    let port_count: u32 = 1;
    self.io_bitmap
        .set_intercept_of_range(first_port, port_count, true);
    Ok(())
}
/// Enables read and write interception for `IA32_UMWAIT_CONTROL` (0xe1) and
/// for every MSR in `0x800..=0x83f`.
#[allow(dead_code)]
fn setup_msr_bitmap(&mut self) -> AxResult {
    const IA32_UMWAIT_CONTROL: u32 = 0xe1;

    let bitmap = &mut self.msr_bitmap;
    bitmap.set_write_intercept(IA32_UMWAIT_CONTROL, true);
    bitmap.set_read_intercept(IA32_UMWAIT_CONTROL, true);
    (0x800u32..=0x83f).for_each(|msr| {
        bitmap.set_read_intercept(msr, true);
        bitmap.set_write_intercept(msr, true);
    });
    Ok(())
}
/// Programs the whole VMCS for a fresh guest.
///
/// The VMCS is cleared, made current on this processor, filled with MSR
/// bitmap, guest-state and control settings, and finally released again.
///
/// # Errors
///
/// Stops at, and returns, the first failing step.
fn setup_vmcs(&mut self, entry: GuestPhysAddr, ept_root: HostPhysAddr) -> AxResult {
    let vmcs_paddr = self.vmcs.phys_addr().as_usize() as u64;
    let cleared = unsafe { vmx::vmclear(vmcs_paddr) };
    cleared.map_err(as_axerr)?;

    self.bind_to_current_processor()?;
    self.setup_msr_bitmap()?;
    self.setup_vmcs_guest(entry)?;
    self.setup_vmcs_control(ept_root, true)?;
    self.unbind_from_current_processor()
}
/// Fills the host-state area of the current VMCS from the registers of the
/// processor this code is running on.
///
/// # Errors
///
/// Returns the first VMCS write error.
fn setup_vmcs_host(&self) -> AxResult {
// MSRs and control registers: copy the live host values.
VmcsHost64::IA32_PAT.write(Msr::IA32_PAT.read())?;
VmcsHost64::IA32_EFER.write(Msr::IA32_EFER.read())?;
VmcsHostNW::CR0.write(Cr0::read_raw() as _)?;
VmcsHostNW::CR3.write(Cr3::read_raw().0.start_address().as_u64() as _)?;
VmcsHostNW::CR4.write(Cr4::read_raw() as _)?;
// Segment selectors and the FS/GS bases.
VmcsHost16::ES_SELECTOR.write(x86::segmentation::es().bits())?;
VmcsHost16::CS_SELECTOR.write(x86::segmentation::cs().bits())?;
VmcsHost16::SS_SELECTOR.write(x86::segmentation::ss().bits())?;
VmcsHost16::DS_SELECTOR.write(x86::segmentation::ds().bits())?;
VmcsHost16::FS_SELECTOR.write(x86::segmentation::fs().bits())?;
VmcsHost16::GS_SELECTOR.write(x86::segmentation::gs().bits())?;
VmcsHostNW::FS_BASE.write(Msr::IA32_FS_BASE.read() as _)?;
VmcsHostNW::GS_BASE.write(Msr::IA32_GS_BASE.read() as _)?;
// Task register and descriptor tables; the TR base is looked up in the GDT.
let tr = unsafe { x86::task::tr() };
let mut gdtp = DescriptorTablePointer::<u64>::default();
let mut idtp = DescriptorTablePointer::<u64>::default();
unsafe {
dtables::sgdt(&mut gdtp);
dtables::sidt(&mut idtp);
}
VmcsHost16::TR_SELECTOR.write(tr.bits())?;
VmcsHostNW::TR_BASE.write(get_tr_base(tr, &gdtp) as _)?;
VmcsHostNW::GDTR_BASE.write(gdtp.base as _)?;
VmcsHostNW::IDTR_BASE.write(idtp.base as _)?;
// Every VM-exit resumes host execution at the `vmx_exit` stub.
VmcsHostNW::RIP.write(Self::vmx_exit as *const () as usize)?;
// The SYSENTER MSRs are zeroed for the host.
VmcsHostNW::IA32_SYSENTER_ESP.write(0)?;
VmcsHostNW::IA32_SYSENTER_EIP.write(0)?;
VmcsHost32::IA32_SYSENTER_CS.write(0)?;
Ok(())
}
/// Fills the guest-state area of the current VMCS with a reset-like state
/// whose RIP is `entry`.
///
/// All segments get selector 0, base 0 and limit 0xffff; CR0 is requested
/// with only NW, CD and ET set, and CR4 with 0 (both adjusted by `set_cr`).
///
/// # Errors
///
/// Returns the first VMCS write error.
///
/// # Panics
///
/// Panics if `set_cr` fails to write a control register.
fn setup_vmcs_guest(&mut self, entry: GuestPhysAddr) -> AxResult {
let cr0_val: Cr0Flags =
Cr0Flags::NOT_WRITE_THROUGH | Cr0Flags::CACHE_DISABLE | Cr0Flags::EXTENSION_TYPE;
self.set_cr(0, cr0_val.bits());
self.set_cr(4, 0);
// Writes selector, base, limit and access rights of one segment register;
// the field names are built from the segment name with `paste`.
macro_rules! set_guest_segment {
($seg:ident, $access_rights:expr) => {{
use VmcsGuest16::*;
use VmcsGuest32::*;
use VmcsGuestNW::*;
paste::paste! {
[<$seg _SELECTOR>].write(0)?;
[<$seg _BASE>].write(0)?;
[<$seg _LIMIT>].write(0xffff)?;
[<$seg _ACCESS_RIGHTS>].write($access_rights)?;
}
}};
}
// Access rights: 0x93 for the data segments, 0x9b for CS, 0x8b for TR and
// 0x82 for LDTR.
set_guest_segment!(ES, 0x93); set_guest_segment!(CS, 0x9b); set_guest_segment!(SS, 0x93);
set_guest_segment!(DS, 0x93);
set_guest_segment!(FS, 0x93);
set_guest_segment!(GS, 0x93);
set_guest_segment!(TR, 0x8b); set_guest_segment!(LDTR, 0x82);
VmcsGuestNW::GDTR_BASE.write(0)?;
VmcsGuest32::GDTR_LIMIT.write(0xffff)?;
VmcsGuestNW::IDTR_BASE.write(0)?;
VmcsGuest32::IDTR_LIMIT.write(0xffff)?;
VmcsGuestNW::CR3.write(0)?;
VmcsGuestNW::DR7.write(0x400)?;
VmcsGuestNW::RSP.write(0)?;
// Execution starts at the requested entry address.
VmcsGuestNW::RIP.write(entry.as_usize())?;
VmcsGuestNW::RFLAGS.write(0x2)?;
VmcsGuestNW::PENDING_DBG_EXCEPTIONS.write(0)?;
VmcsGuestNW::IA32_SYSENTER_ESP.write(0)?;
VmcsGuestNW::IA32_SYSENTER_EIP.write(0)?;
VmcsGuest32::IA32_SYSENTER_CS.write(0)?;
VmcsGuest32::INTERRUPTIBILITY_STATE.write(0)?;
VmcsGuest32::ACTIVITY_STATE.write(0)?;
VmcsGuest32::VMX_PREEMPTION_TIMER_VALUE.write(VMX_PREEMPTION_TIMER_SET_VALUE)?;
// The link pointer is set to all ones.
VmcsGuest64::LINK_PTR.write(u64::MAX)?; VmcsGuest64::IA32_DEBUGCTL.write(0)?;
// The guest starts with the host's PAT and an all-zero EFER.
VmcsGuest64::IA32_PAT.write(Msr::IA32_PAT.read())?;
VmcsGuest64::IA32_EFER.write(0)?;
Ok(())
}
/// Programs the VM-execution, VM-exit and VM-entry controls of the current
/// VMCS, the EPT pointer, the exception bitmap and the bitmap addresses.
///
/// `is_guest == false` additionally sets the "IA-32e mode guest" entry
/// control; `setup_vmcs` always passes `true`.
///
/// # Errors
///
/// Returns the first error reported while programming a control or field.
fn setup_vmcs_control(&mut self, ept_root: HostPhysAddr, is_guest: bool) -> AxResult {
use PinbasedControls as PinCtrl;
use super::vmcs::controls::*;
let raw_cpuid = CpuId::new();
// Pin-based controls: exit on NMIs and external interrupts.
vmcs::set_control(
VmcsControl32::PINBASED_EXEC_CONTROLS,
Msr::IA32_VMX_TRUE_PINBASED_CTLS,
Msr::IA32_VMX_PINBASED_CTLS.read() as u32,
(PinCtrl::NMI_EXITING | PinCtrl::EXTERNAL_INTERRUPT_EXITING).bits(),
0,
)?;
use PrimaryControls as CpuCtrl;
// Primary controls: request I/O and MSR bitmaps plus the secondary
// controls; the CR3/CR8 load/store exiting bits are passed as the last
// argument (presumably the bits to clear; confirm against
// `vmcs::set_control`).
vmcs::set_control(
VmcsControl32::PRIMARY_PROCBASED_EXEC_CONTROLS,
Msr::IA32_VMX_TRUE_PROCBASED_CTLS,
Msr::IA32_VMX_PROCBASED_CTLS.read() as u32,
(CpuCtrl::USE_IO_BITMAPS | CpuCtrl::USE_MSR_BITMAPS | CpuCtrl::SECONDARY_CONTROLS)
.bits(),
(CpuCtrl::CR3_LOAD_EXITING
| CpuCtrl::CR3_STORE_EXITING
| CpuCtrl::CR8_LOAD_EXITING
| CpuCtrl::CR8_STORE_EXITING)
.bits(),
)?;
use SecondaryControls as CpuCtrl2;
// Secondary controls: EPT and unrestricted guest always; RDTSCP, INVPCID
// and XSAVES/XRSTORS only when CPUID reports host support.
let mut val =
CpuCtrl2::ENABLE_EPT | CpuCtrl2::UNRESTRICTED_GUEST;
if let Some(features) = raw_cpuid.get_extended_processor_and_feature_identifiers()
&& features.has_rdtscp()
{
val |= CpuCtrl2::ENABLE_RDTSCP;
}
if let Some(features) = raw_cpuid.get_extended_feature_info()
&& features.has_invpcid()
{
val |= CpuCtrl2::ENABLE_INVPCID;
}
if let Some(features) = raw_cpuid.get_extended_state_info()
&& features.has_xsaves_xrstors()
{
val |= CpuCtrl2::ENABLE_XSAVES_XRSTORS;
}
vmcs::set_control(
VmcsControl32::SECONDARY_PROCBASED_EXEC_CONTROLS,
Msr::IA32_VMX_PROCBASED_CTLS2,
Msr::IA32_VMX_PROCBASED_CTLS2.read() as u32,
val.bits(),
0,
)?;
use ExitControls as ExitCtrl;
// VM-exit controls: 64-bit host, acknowledge interrupts on exit, and
// save/load PAT and EFER.
vmcs::set_control(
VmcsControl32::VMEXIT_CONTROLS,
Msr::IA32_VMX_TRUE_EXIT_CTLS,
Msr::IA32_VMX_EXIT_CTLS.read() as u32,
(ExitCtrl::HOST_ADDRESS_SPACE_SIZE
| ExitCtrl::ACK_INTERRUPT_ON_EXIT
| ExitCtrl::SAVE_IA32_PAT
| ExitCtrl::LOAD_IA32_PAT
| ExitCtrl::SAVE_IA32_EFER
| ExitCtrl::LOAD_IA32_EFER)
.bits(),
0,
)?;
// VM-entry controls: load the guest PAT and EFER on entry.
let mut val = EntryCtrl::LOAD_IA32_PAT | EntryCtrl::LOAD_IA32_EFER;
if !is_guest {
val |= EntryCtrl::IA32E_MODE_GUEST;
}
// `use` items are visible throughout the block, so `EntryCtrl` above
// resolves even though the alias is declared here.
use EntryControls as EntryCtrl;
vmcs::set_control(
VmcsControl32::VMENTRY_CONTROLS,
Msr::IA32_VMX_TRUE_ENTRY_CTLS,
Msr::IA32_VMX_ENTRY_CTLS.read() as u32,
val.bits(),
0,
)?;
vmcs::set_ept_pointer(ept_root)?;
// No MSR load/store lists and no CR3 target values are used.
VmcsControl32::VMEXIT_MSR_STORE_COUNT.write(0)?;
VmcsControl32::VMEXIT_MSR_LOAD_COUNT.write(0)?;
VmcsControl32::VMENTRY_MSR_LOAD_COUNT.write(0)?;
VmcsControl32::CR3_TARGET_COUNT.write(0)?;
// Only exception vector 6 (#UD, invalid opcode) causes a VM-exit.
let exception_bitmap: u32 = 1 << 6;
self.setup_io_bitmap()?;
VmcsControl32::EXCEPTION_BITMAP.write(exception_bitmap)?;
VmcsControl64::IO_BITMAP_A_ADDR.write(self.io_bitmap.phys_addr().0.as_usize() as _)?;
VmcsControl64::IO_BITMAP_B_ADDR.write(self.io_bitmap.phys_addr().1.as_usize() as _)?;
VmcsControl64::MSR_BITMAPS_ADDR.write(self.msr_bitmap.phys_addr().as_usize() as _)?;
Ok(())
}
/// Returns the number of guest page-table levels.
///
/// The result is 0 when CR0.PG is clear, 2 without CR4.PAE, 3 with PAE but
/// without EFER.LMA, and 4 with PAE and EFER.LMA. All three registers are
/// read from the guest-state area of the VMCS.
///
/// # Panics
///
/// Panics if any of the VMCS fields cannot be read.
fn get_paging_level(&self) -> usize {
    let cr0 = VmcsGuestNW::CR0.read().unwrap();
    let cr4 = VmcsGuestNW::CR4.read().unwrap();
    let efer = VmcsGuest64::IA32_EFER.read().unwrap();

    let paging = (cr0 & Cr0Flags::PAGING.bits() as usize) != 0;
    let pae = (cr4 & Cr4Flags::PHYSICAL_ADDRESS_EXTENSION.bits() as usize) != 0;
    let long_mode = (efer & EferFlags::LONG_MODE_ACTIVE.bits()) != 0;

    match (paging, pae, long_mode) {
        (false, _, _) => 0,
        (true, false, _) => 2,
        (true, true, false) => 3,
        (true, true, true) => 4,
    }
}
}
impl VmxVcpu {
/// Sets guest control register `cr_idx` (0, 3 or 4) to `val`.
///
/// For CR0 and CR4 the value the guest asked for goes into the read shadow,
/// while the real guest register is adjusted to satisfy the VMX fixed-bit
/// MSRs. The guest/host mask is set to every bit that is forced to either
/// value, so the guest sees the shadow for those bits.
///
/// # Panics
///
/// Panics if a VMCS write fails, or if `cr_idx` is not 0, 3 or 4.
fn set_cr(&mut self, cr_idx: usize, val: u64) {
(|| -> AxResult {
match cr_idx {
0 => {
// `must0`: bits that may be 1 (FIXED1), with NW and CD removed so
// they are always cleared in the real CR0.
let must0 = Msr::IA32_VMX_CR0_FIXED1.read()
& !(Cr0Flags::NOT_WRITE_THROUGH | Cr0Flags::CACHE_DISABLE).bits();
// `must1`: bits that must be 1 (FIXED0), except PG and PE, which are
// left under guest control.
let must1 = Msr::IA32_VMX_CR0_FIXED0.read()
& !(Cr0Flags::PAGING | Cr0Flags::PROTECTED_MODE_ENABLE).bits();
VmcsGuestNW::CR0.write(((val & must0) | must1) as _)?;
VmcsControlNW::CR0_READ_SHADOW.write(val as _)?;
VmcsControlNW::CR0_GUEST_HOST_MASK.write((must1 | !must0) as _)?;
}
3 => VmcsGuestNW::CR3.write(val as _)?,
4 => {
let must0 = Msr::IA32_VMX_CR4_FIXED1.read();
let must1 = Msr::IA32_VMX_CR4_FIXED0.read();
// VMXE is added to the requested value before it is written to both
// the real CR4 and the read shadow.
let val = val | Cr4Flags::VIRTUAL_MACHINE_EXTENSIONS.bits();
VmcsGuestNW::CR4.write(((val & must0) | must1) as _)?;
VmcsControlNW::CR4_READ_SHADOW.write(val as _)?;
VmcsControlNW::CR4_GUEST_HOST_MASK.write((must1 | !must0) as _)?;
}
_ => unreachable!(),
};
Ok(())
})()
.expect("Failed to write guest control register")
}
/// Reads guest control register `cr_idx` (0, 3 or 4).
///
/// CR0 and CR3 are returned as stored in the VMCS. For CR4 the result takes
/// the read shadow for bits covered by the guest/host mask and the stored
/// guest CR4 for all other bits.
///
/// # Panics
///
/// Panics if a VMCS read fails, or if `cr_idx` is not 0, 3 or 4.
#[allow(dead_code)]
fn cr(&self, cr_idx: usize) -> usize {
    let read_register = || -> AxResult<usize> {
        let value = match cr_idx {
            0 => VmcsGuestNW::CR0.read()?,
            3 => VmcsGuestNW::CR3.read()?,
            4 => {
                let host_owned = VmcsControlNW::CR4_GUEST_HOST_MASK.read()?;
                let shadow = VmcsControlNW::CR4_READ_SHADOW.read()?;
                let stored = VmcsGuestNW::CR4.read()?;
                (shadow & host_owned) | (stored & !host_owned)
            }
            _ => unreachable!(),
        };
        Ok(value)
    };
    read_register().expect("Failed to read guest control register")
}
}
// Body of the naked VM-entry functions; `$instr` is the entry instruction
// (`vmlaunch` or `vmresume`). RDI holds the `VmxVcpu` pointer.
//
// Steps, in order:
//   1. store host RFLAGS into `host_rflags`;
//   2. `save_regs_to_stack!()` (presumably pushes the host registers);
//   3. store the resulting RSP into `host_stack_top`;
//   4. point RSP at the vCPU, i.e. at `guest_regs`, and run
//      `restore_regs_from_stack!()` (presumably pops the guest registers);
//   5. execute `$instr`; falling through means the entry failed, so jump to
//      `vmx_entry_failed`.
//
// The two offsets rely on the `#[repr(C)]` field order of `VmxVcpu`.
macro_rules! vmx_entry_with {
($instr:literal) => {
naked_asm!(
"pushfq", "pop qword ptr [rdi + {host_rflags}]",
save_regs_to_stack!(), "mov [rdi + {host_stack_size}], rsp", "mov rsp, rdi", restore_regs_from_stack!(), $instr, "jmp {failed}",
host_stack_size = const size_of::<GeneralRegisters>(),
host_rflags = const size_of::<GeneralRegisters>() + size_of::<u64>(),
failed = sym Self::vmx_entry_failed,
)
}
}
impl VmxVcpu {
/// Enters the guest with `vmlaunch`; see `vmx_entry_with!` for the sequence.
#[unsafe(naked)]
unsafe extern "C" fn vmx_launch(&mut self) -> usize {
vmx_entry_with!("vmlaunch")
}
/// Enters the guest with `vmresume`; see `vmx_entry_with!` for the sequence.
#[unsafe(naked)]
unsafe extern "C" fn vmx_resume(&mut self) -> usize {
vmx_entry_with!("vmresume")
}
/// Host entry point after a VM-exit (its address is written to the host RIP
/// field).
///
/// Disables interrupts, runs `save_regs_to_stack!()` (presumably storing the
/// guest registers into `guest_regs`), reloads RSP from `host_stack_top`,
/// runs `restore_regs_from_stack!()` and returns on the host stack.
#[unsafe(naked)]
unsafe extern "C" fn vmx_exit(&mut self) -> usize {
naked_asm!(
"cli", save_regs_to_stack!(), "mov rsp, [rsp + {host_stack_top}]", restore_regs_from_stack!(), "ret",
host_stack_top = const size_of::<GeneralRegisters>(),
);
}
/// Reached when the VM-entry instruction falls through; panics with the
/// VM-instruction error text.
fn vmx_entry_failed() -> ! {
panic!("{}", vmcs::instruction_error().as_str())
}
/// Reports whether the guest can take an external interrupt right now:
/// guest RFLAGS.IF is set and the interruptibility state is zero.
///
/// # Panics
///
/// Panics if either VMCS field cannot be read.
fn allow_interrupt(&self) -> bool {
    use x86_64::registers::rflags::RFlags;

    let guest_rflags = VmcsGuestNW::RFLAGS.read().unwrap() as u64;
    let interruptibility = VmcsGuest32::INTERRUPTIBILITY_STATE.read().unwrap();
    let interrupts_enabled = (guest_rflags & RFlags::INTERRUPT_FLAG.bits()) != 0;
    interrupts_enabled && interruptibility == 0
}
/// Tries to inject the event at the front of the queue.
///
/// Vectors below 32 are injected unconditionally. Other vectors are injected
/// only when `allow_interrupt` holds; otherwise the event stays queued and
/// interrupt-window exiting is enabled.
///
/// # Errors
///
/// Fails if the injection or the control update fails.
fn inject_pending_events(&mut self) -> AxResult {
    let Some(&(vector, err_code)) = self.pending_events.front() else {
        return Ok(());
    };
    let is_exception = vector < 32;
    if is_exception || self.allow_interrupt() {
        vmcs::inject_event(vector, err_code)?;
        self.pending_events.pop_front();
        Ok(())
    } else {
        self.set_interrupt_window(true)
    }
}
/// Handles the VM-exits that never need to leave this module.
///
/// Returns `Some(result)` for interrupt-window, preemption-timer, XSETBV,
/// CR-access, CPUID and APIC-access exits, and for MSR reads/writes whose
/// index (guest ECX) lies in `0x800..=0x8ff`. Returns `None` for every other
/// exit.
fn builtin_vmexit_handler(&mut self, exit_info: &VmxExitInfo) -> Option<AxResult> {
    const X2APIC_MSR_BASE: u32 = 0x800;
    const X2APIC_MSR_END: u32 = 0x8ff;

    // Only meaningful for MSR exits, but cheap to compute up front.
    let msr = self.regs().rcx as u32;
    let targets_x2apic = (X2APIC_MSR_BASE..=X2APIC_MSR_END).contains(&msr);

    let outcome = match exit_info.exit_reason {
        VmxExitReason::INTERRUPT_WINDOW => self.set_interrupt_window(false),
        VmxExitReason::PREEMPTION_TIMER => self.handle_vmx_preemption_timer(),
        VmxExitReason::XSETBV => self.handle_xsetbv(),
        VmxExitReason::CR_ACCESS => self.handle_cr(),
        VmxExitReason::CPUID => self.handle_cpuid(),
        VmxExitReason::MSR_READ if targets_x2apic => self.handle_apic_msr_access(false, msr),
        VmxExitReason::MSR_WRITE if targets_x2apic => self.handle_apic_msr_access(true, msr),
        VmxExitReason::APIC_ACCESS => self.handle_apic_access(exit_info),
        _ => return None,
    };
    Some(outcome)
}
/// Combines the low 32 bits of guest RDX (high half) and RAX (low half) into
/// one 64-bit value.
fn read_edx_eax(&self) -> u64 {
    let regs = self.regs();
    let low = regs.rax & 0xffff_ffff;
    let high = regs.rdx & 0xffff_ffff;
    (high << 32) | low
}
/// Splits `val` across guest RAX (low 32 bits) and RDX (high 32 bits); the
/// upper halves of both registers become zero.
fn write_edx_eax(&mut self, val: u64) {
    let regs = self.regs_mut();
    regs.rdx = val >> 32;
    regs.rax = val & 0xffff_ffff;
}
/// Forwards an intercepted RDMSR/WRMSR on `msr` to the emulated local APIC.
///
/// The guest RIP is advanced by two bytes before the access is emulated. For
/// a write the value comes from EDX:EAX; for a read the result is stored
/// into EDX:EAX.
///
/// # Errors
///
/// Fails if RIP cannot be advanced or if the emulated APIC rejects the
/// access.
fn handle_apic_msr_access(&mut self, write: bool, msr: u32) -> AxResult {
    const VMEXIT_INSTR_LEN_RDMSR_WRMSR: u8 = 2;
    self.advance_rip(VMEXIT_INSTR_LEN_RDMSR_WRMSR)?;
    let msr = msr as _;

    if !write {
        let value = <EmulatedLocalApic as BaseDeviceOps<SysRegAddrRange>>::handle_read(
            &self.vlapic,
            SysRegAddr::new(msr),
            AccessWidth::Qword,
        )? as u64;
        trace!("handle_vlapic_msr_read: msr={msr:#x}, value={value:#x}");
        self.write_edx_eax(value);
        return Ok(());
    }

    let value = self.read_edx_eax() as usize;
    trace!("handle_vlapic_msr_write: msr={msr:#x}, value={value:#x}");
    <EmulatedLocalApic as BaseDeviceOps<SysRegAddrRange>>::handle_write(
        &self.vlapic,
        SysRegAddr::new(msr),
        AccessWidth::Qword,
        value,
    )
}
/// Placeholder for memory-mapped APIC access emulation.
///
/// Only linear data reads and writes are accepted; any other access type is
/// logged and rejected with `BadState`. For accepted accesses the guest RIP
/// is advanced and the function then hits `unimplemented!`.
///
/// # Panics
///
/// Always panics for linear data reads and writes (not implemented yet).
fn handle_apic_access(&mut self, exit_info: &VmxExitInfo) -> AxResult {
    let info = self.apic_access_exit_info()?;
    let is_write = matches!(info.access_type, ApicAccessExitType::LinearDataWrite);
    let is_read = matches!(info.access_type, ApicAccessExitType::LinearDataRead);
    if !(is_write || is_read) {
        warn!("Unsupported APIC access type: {:?}", info.access_type);
        return ax_err!(BadState, "Unsupported APIC access type");
    }
    // The direction is not consumed yet.
    let _ = is_write;
    self.advance_rip(exit_info.exit_instruction_length as _)?;
    unimplemented!("apic access");
}
/// Re-arms the VMX-preemption timer with `VMX_PREEMPTION_TIMER_SET_VALUE`.
///
/// # Errors
///
/// Fails if the timer field cannot be written.
fn handle_vmx_preemption_timer(&mut self) -> AxResult {
    VmcsGuest32::VMX_PREEMPTION_TIMER_VALUE.write(VMX_PREEMPTION_TIMER_SET_VALUE)
}
/// Emulates a guest `mov` to CR0 or CR4.
///
/// The source value comes from the guest general-purpose register named in
/// the exit qualification (register 4 is RSP, which lives in the VMCS). The
/// new value is applied through `set_cr`; enabling paging through CR0 also
/// triggers `vmcs::update_efer`.
///
/// The guest RIP is advanced by the instruction length reported for this
/// VM-exit, not by a fixed size: `mov crN, r8`..`r15` needs a REX prefix and
/// is one byte longer than the form using the legacy registers.
///
/// # Errors
///
/// Fails if the exit information cannot be read or RIP cannot be advanced.
///
/// # Panics
///
/// Panics for every other kind of control-register access.
// The single-arm `match` is kept so further access types can be added.
#[allow(clippy::single_match)]
fn handle_cr(&mut self) -> AxResult {
    let cr_access_info = vmcs::cr_access_info()?;
    let reg = cr_access_info.gpr;
    let cr = cr_access_info.cr_number;
    match cr_access_info.access_type {
        // Access type 0: move to control register.
        0 => {
            let val = if reg == 4 {
                self.stack_pointer() as u64
            } else {
                self.guest_regs.get_reg_of_index(reg)
            };
            if cr == 0 || cr == 4 {
                let instr_len = self.exit_info()?.exit_instruction_length;
                self.advance_rip(instr_len as _)?;
                self.set_cr(cr as usize, val);
                if cr == 0 && Cr0Flags::from_bits_truncate(val).contains(Cr0Flags::PAGING) {
                    vmcs::update_efer()?;
                }
                return Ok(());
            }
        }
        _ => {}
    };
    panic!(
        "Guest's access to cr not allowed: {:#x?}, {:#x?}",
        self, cr_access_info
    );
}
/// Emulates the guest's `CPUID` instruction.
///
/// Most leaves are answered by executing `CPUID` on the host with the
/// guest's EAX/ECX. The following leaves are adjusted:
///
/// * `0x1`: VMX (ECX bit 5) is hidden, the hypervisor-present bit (ECX
///   bit 31) is set, and MCE (EDX bit 7) is hidden.
/// * `0x7`, sub-leaf 0: ECX bits 5 and 16 are cleared.
/// * `0xd`: queried while the guest's extended state is loaded.
/// * `0x16`: if the host reports 0 in EAX, a default of 3000 MHz is used.
/// * `0x4000_0000` / `0x4000_0001`: hypervisor vendor `"RVMRVMRVMRVM"` and
///   an all-zero feature leaf.
///
/// The result is written to guest RAX/RBX/RCX/RDX and the guest RIP is
/// advanced past the two-byte instruction.
///
/// # Errors
///
/// Fails if the guest RIP cannot be advanced.
fn handle_cpuid(&mut self) -> AxResult {
    use raw_cpuid::{CpuIdResult, cpuid};
    const VM_EXIT_INSTR_LEN_CPUID: u8 = 2;
    const LEAF_FEATURE_INFO: u32 = 0x1;
    const LEAF_STRUCTURED_EXTENDED_FEATURE_FLAGS_ENUMERATION: u32 = 0x7;
    const LEAF_PROCESSOR_EXTENDED_STATE_ENUMERATION: u32 = 0xd;
    const EAX_FREQUENCY_INFO: u32 = 0x16;
    const LEAF_HYPERVISOR_INFO: u32 = 0x4000_0000;
    const LEAF_HYPERVISOR_FEATURE: u32 = 0x4000_0001;
    const VENDOR_STR: &[u8; 12] = b"RVMRVMRVMRVM";

    // Assemble the three vendor registers byte by byte. Reinterpreting the
    // byte array as `[u32; 3]` would require 4-byte alignment that a
    // `[u8; 12]` does not guarantee.
    let vendor_regs: [u32; 3] = core::array::from_fn(|i| {
        let at = i * 4;
        u32::from_le_bytes([
            VENDOR_STR[at],
            VENDOR_STR[at + 1],
            VENDOR_STR[at + 2],
            VENDOR_STR[at + 3],
        ])
    });

    let regs_clone = *self.regs();
    let function = regs_clone.rax as u32;
    let res = match function {
        LEAF_FEATURE_INFO => {
            // ECX feature bits.
            const FEATURE_VMX: u32 = 1 << 5;
            const FEATURE_HYPERVISOR: u32 = 1 << 31;
            // EDX feature bit.
            const FEATURE_MCE: u32 = 1 << 7;
            let mut res = cpuid!(regs_clone.rax, regs_clone.rcx);
            res.ecx &= !FEATURE_VMX;
            res.ecx |= FEATURE_HYPERVISOR;
            // MCE is reported in EDX; EAX holds the family/model/stepping
            // signature and must be passed through untouched.
            res.edx &= !FEATURE_MCE;
            res
        }
        LEAF_STRUCTURED_EXTENDED_FEATURE_FLAGS_ENUMERATION => {
            let mut res = cpuid!(regs_clone.rax, regs_clone.rcx);
            if regs_clone.rcx == 0 {
                res.ecx.set_bit(5, false);
                res.ecx.set_bit(16, false);
            }
            res
        }
        LEAF_PROCESSOR_EXTENDED_STATE_ENUMERATION => {
            // Query with the guest's extended state loaded so that the
            // reported values reflect the guest's configuration.
            self.load_guest_xstate();
            let res = cpuid!(regs_clone.rax, regs_clone.rcx);
            self.load_host_xstate();
            res
        }
        LEAF_HYPERVISOR_INFO => CpuIdResult {
            eax: LEAF_HYPERVISOR_FEATURE,
            ebx: vendor_regs[0],
            ecx: vendor_regs[1],
            edx: vendor_regs[2],
        },
        LEAF_HYPERVISOR_FEATURE => CpuIdResult {
            eax: 0,
            ebx: 0,
            ecx: 0,
            edx: 0,
        },
        EAX_FREQUENCY_INFO => {
            const TIMER_FREQUENCY_MHZ: u32 = 3_000;
            let mut res = cpuid!(regs_clone.rax, regs_clone.rcx);
            if res.eax == 0 {
                warn!(
                    "handle_cpuid: Failed to get TSC frequency by CPUID, default to \
                     {TIMER_FREQUENCY_MHZ} MHz"
                );
                res.eax = TIMER_FREQUENCY_MHZ;
            }
            res
        }
        _ => cpuid!(regs_clone.rax, regs_clone.rcx),
    };
    trace!(
        "VM exit: CPUID({:#x}, {:#x}): {:?}",
        regs_clone.rax, regs_clone.rcx, res
    );
    let regs = self.regs_mut();
    regs.rax = res.eax as _;
    regs.rbx = res.ebx as _;
    regs.rcx = res.ecx as _;
    regs.rdx = res.edx as _;
    self.advance_rip(VM_EXIT_INSTR_LEN_CPUID)?;
    Ok(())
}
/// Emulates the guest's `XSETBV` instruction.
///
/// Only XCR0 (ECX == 0) is supported. The new value, taken from EDX:EAX, is
/// accepted when all of the following hold:
///
/// * it contains only bits known to `Xcr0`;
/// * the x87 bit is set;
/// * AVX is set only together with SSE;
/// * BNDREG and BNDCSR are either both set or both clear;
/// * if any AVX-512 component (OPMASK, ZMM_Hi256, Hi16_ZMM) is set, then AVX
///   and all three AVX-512 components are set.
///
/// An accepted value is stored as the guest XCR0 and the guest RIP is
/// advanced past the three-byte instruction.
///
/// # Errors
///
/// * `InvalidInput` if the value violates one of the rules above.
/// * `Unsupported` if a register other than XCR0 is addressed.
/// * Any error from advancing the guest RIP.
fn handle_xsetbv(&mut self) -> AxResult {
    const XCR_XCR0: u64 = 0;
    const VM_EXIT_INSTR_LEN_XSETBV: u8 = 3;
    let index = self.guest_regs.rcx.get_bits(0..32);
    let value = self.guest_regs.rdx.get_bits(0..32) << 32 | self.guest_regs.rax.get_bits(0..32);
    if index == XCR_XCR0 {
        Xcr0::from_bits(value)
            .and_then(|x| {
                if !x.contains(Xcr0::XCR0_FPU_MMX_STATE) {
                    return None;
                }
                if x.contains(Xcr0::XCR0_AVX_STATE) && !x.contains(Xcr0::XCR0_SSE_STATE) {
                    return None;
                }
                if x.contains(Xcr0::XCR0_BNDCSR_STATE) ^ x.contains(Xcr0::XCR0_BNDREG_STATE) {
                    return None;
                }
                // The AVX-512 components are all-or-nothing and depend on
                // AVX. A value that uses none of them is valid as well.
                let any_avx512 = x.contains(Xcr0::XCR0_OPMASK_STATE)
                    || x.contains(Xcr0::XCR0_ZMM_HI256_STATE)
                    || x.contains(Xcr0::XCR0_HI16_ZMM_STATE);
                let complete_avx512 = x.contains(Xcr0::XCR0_AVX_STATE)
                    && x.contains(Xcr0::XCR0_OPMASK_STATE)
                    && x.contains(Xcr0::XCR0_ZMM_HI256_STATE)
                    && x.contains(Xcr0::XCR0_HI16_ZMM_STATE);
                if any_avx512 && !complete_avx512 {
                    return None;
                }
                Some(x)
            })
            .ok_or(ax_err_type!(InvalidInput))
            .and_then(|x| {
                self.xstate.guest_xcr0 = x.bits();
                self.advance_rip(VM_EXIT_INSTR_LEN_XSETBV)
            })
    } else {
        ax_err!(Unsupported, "only xcr0 is supported")
    }
}
/// Switches the extended state to the guest's; forwards to
/// `XState::switch_to_guest`.
fn load_guest_xstate(&mut self) {
self.xstate.switch_to_guest();
}
/// Switches the extended state back to the host's; forwards to
/// `XState::switch_to_host`.
fn load_host_xstate(&mut self) {
self.xstate.switch_to_host();
}
}
impl Drop for VmxVcpu {
    /// Clears the VMCS with `VMCLEAR` before the vCPU's memory is released.
    ///
    /// # Panics
    ///
    /// Panics if `VMCLEAR` fails.
    fn drop(&mut self) {
        let vmcs_paddr = self.vmcs.phys_addr().as_usize() as u64;
        let cleared = unsafe { vmx::vmclear(vmcs_paddr) };
        cleared.unwrap();
        info!("[HV] dropped VmxVcpu(vmcs: {:#x})", self.vmcs.phys_addr());
    }
}
/// Extracts the 64-bit base address of the task-state segment selected by
/// `tr` from the descriptor table described by `gdt`.
///
/// The descriptor spans two consecutive 8-byte entries. The first holds base
/// bits 0..24 (entry bits 16..40) and base bits 24..32 (entry bits 56..64);
/// the low 32 bits of the second hold base bits 32..64. Returns 0 when the
/// present bit (bit 47) is clear.
///
/// # Panics
///
/// Panics if the selector index, or the entry after it, lies outside the
/// table limit.
fn get_tr_base(tr: SegmentSelector, gdt: &DescriptorTablePointer<u64>) -> u64 {
    const PRESENT: u64 = 1 << 47;

    let slot = tr.index() as usize;
    let entry_count = (gdt.limit as usize + 1) / core::mem::size_of::<u64>();
    // The pointer and limit are used exactly as given in `gdt`.
    let descriptors = unsafe { core::slice::from_raw_parts(gdt.base, entry_count) };

    let low_half = descriptors[slot];
    if (low_half & PRESENT) == 0 {
        return 0;
    }
    let base_31_0 = low_half.get_bits(16..40) | (low_half.get_bits(56..64) << 24);
    let base_63_32 = descriptors[slot + 1] & 0xffff_ffff;
    base_31_0 | (base_63_32 << 32)
}
/// Debug output: the saved general-purpose registers plus a selection of
/// guest-state fields read from the VMCS.
impl Debug for VmxVcpu {
// NOTE: panics (via `unwrap`) if any of the VMCS reads fails.
fn fmt(&self, f: &mut Formatter) -> Result {
// The closure lets the VMCS reads use `?`; its `AxResult` wraps the
// formatter's own result.
(|| -> AxResult<Result> {
Ok(f.debug_struct("VmxVcpu")
.field("guest_regs", &self.guest_regs)
.field("rip", &VmcsGuestNW::RIP.read()?)
.field("rsp", &VmcsGuestNW::RSP.read()?)
.field("rflags", &VmcsGuestNW::RFLAGS.read()?)
.field("cr0", &VmcsGuestNW::CR0.read()?)
.field("cr3", &VmcsGuestNW::CR3.read()?)
.field("cr4", &VmcsGuestNW::CR4.read()?)
.field("cs", &VmcsGuest16::CS_SELECTOR.read()?)
.field("fs_base", &VmcsGuestNW::FS_BASE.read()?)
.field("gs_base", &VmcsGuestNW::GS_BASE.read()?)
.field("tss", &VmcsGuest16::TR_SELECTOR.read()?)
.finish())
})()
.unwrap()
}
}
impl AxArchVCpu for VmxVcpu {
// Neither creation nor setup takes extra configuration.
type CreateConfig = ();
type SetupConfig = ();
/// Creates the vCPU; forwards to the inherent `VmxVcpu::new`.
fn new(vm_id: VMId, vcpu_id: VCpuId, _config: Self::CreateConfig) -> AxResult<Self> {
Self::new(vm_id, vcpu_id)
}
/// Records the guest entry address used later by `setup`.
fn set_entry(&mut self, entry: GuestPhysAddr) -> AxResult {
self.entry = Some(entry);
Ok(())
}
/// Records the EPT root used later by `setup`.
fn set_ept_root(&mut self, ept_root: HostPhysAddr) -> AxResult {
self.ept_root = Some(ept_root);
Ok(())
}
/// Programs the VMCS from the recorded entry address and EPT root.
///
/// # Panics
///
/// Panics if `set_entry` or `set_ept_root` has not been called before.
fn setup(&mut self, _config: Self::SetupConfig) -> AxResult {
self.setup_vmcs(self.entry.unwrap(), self.ept_root.unwrap())
}
/// Runs the guest until the next VM-exit and translates that exit into an
/// architecture-independent `AxVCpuExitReason`.
///
/// Exits handled inside `inner_run` are reported as `Nothing`. Exits this
/// function does not recognize, string/repeated I/O and I/O with an invalid
/// access size are logged and reported as `Halt`.
///
/// # Errors
///
/// Fails if the guest RIP cannot be advanced or the interrupt information
/// cannot be read.
///
/// # Panics
///
/// Panics if the I/O exit information cannot be read, or if an
/// external-interrupt exit carries invalid interrupt information.
fn run(&mut self) -> AxResult<AxVCpuExitReason> {
match self.inner_run() {
Some(exit_info) => Ok(if exit_info.entry_failure {
AxVCpuExitReason::FailEntry {
hardware_entry_failure_reason: 0,
}
} else {
match exit_info.exit_reason {
VmxExitReason::VMCALL => {
// Skip the VMCALL, then report RAX as the hypercall number and
// RDI, RSI, RDX, RCX, R8, R9 as its arguments.
self.advance_rip(exit_info.exit_instruction_length as _)?;
AxVCpuExitReason::Hypercall {
nr: self.regs().rax,
args: [
self.regs().rdi,
self.regs().rsi,
self.regs().rdx,
self.regs().rcx,
self.regs().r8,
self.regs().r9,
],
}
}
VmxExitReason::IO_INSTRUCTION => {
let io_info = self.io_exit_info().unwrap();
// RIP is advanced before the access is classified.
self.advance_rip(exit_info.exit_instruction_length as _)?;
let port = io_info.port;
if io_info.is_repeat || io_info.is_string {
warn!("VMX unsupported IO-Exit: {io_info:#x?} of {exit_info:#x?}");
warn!("VCpu {self:#x?}");
AxVCpuExitReason::Halt
} else {
let width = match AccessWidth::try_from(io_info.access_size as usize) {
Ok(width) => width,
Err(_) => {
warn!("VMX invalid IO-Exit: {io_info:#x?} of {exit_info:#x?}");
warn!("VCpu {self:#x?}");
return Ok(AxVCpuExitReason::Halt);
}
};
if io_info.is_in {
AxVCpuExitReason::IoRead {
port: Port(port),
width,
}
// A 16-bit write of the magic value to the exit port is a shutdown
// request rather than an ordinary port write.
} else if port == QEMU_EXIT_PORT
&& width == AccessWidth::Word
&& self.regs().rax == QEMU_EXIT_MAGIC
{
AxVCpuExitReason::SystemDown
} else {
AxVCpuExitReason::IoWrite {
port: Port(port),
width,
data: self.regs().rax.get_bits(width.bits_range()),
}
}
}
}
VmxExitReason::EXTERNAL_INTERRUPT => {
let int_info = self.interrupt_exit_info()?;
assert!(int_info.valid);
AxVCpuExitReason::ExternalInterrupt {
vector: int_info.vector as _,
}
}
// MSR exits that were not consumed by the built-in x2APIC handling.
// RIP is not advanced here.
VmxExitReason::MSR_READ => {
AxVCpuExitReason::SysRegRead {
addr: SysRegAddr::new(self.regs().rcx as _),
reg: 0,
}
}
VmxExitReason::MSR_WRITE => {
// The value to write is EDX:EAX.
let value = (self.regs().rax & 0xffff_ffff)
| ((self.regs().rdx & 0xffff_ffff) << 32);
AxVCpuExitReason::SysRegWrite {
addr: SysRegAddr::new(self.regs().rcx as _),
value,
}
}
_ => {
warn!("VMX unsupported VM-Exit: {exit_info:#x?}");
warn!("VCpu {self:#x?}");
AxVCpuExitReason::Halt
}
}
}),
None => Ok(AxVCpuExitReason::Nothing),
}
}
/// Binds the vCPU to the calling processor; forwards to
/// `bind_to_current_processor`.
fn bind(&mut self) -> AxResult {
self.bind_to_current_processor()
}
/// Unbinds the vCPU from the calling processor.
///
/// `launched` is reset first, so the next entry uses `vmlaunch` again after
/// the VMCS has been cleared.
fn unbind(&mut self) -> AxResult {
self.launched = false;
self.unbind_from_current_processor()
}
/// Sets the guest general-purpose register with index `reg` to `val`.
///
/// `reg` is narrowed to `u8` with `as`, so only its low 8 bits are used.
fn set_gpr(&mut self, reg: usize, val: usize) {
self.regs_mut().set_reg_of_index(reg as u8, val as u64);
}
fn inject_interrupt(&mut self, vector: usize) -> AxResult {
if vector != 0 {
} else {
warn!("interrupt queued in inject_interrupt: vector 0");
panic!()
}
self.queue_event(vector as u8, None);
Ok(())
}
/// Stores `val` in guest RAX.
fn set_return_value(&mut self, val: usize) {
self.regs_mut().rax = val as u64;
}
}
// Unit tests that need no VMX hardware. They cover plain data types and
// constants, and re-state some of the bit arithmetic used above on local
// values; none of them constructs a `VmxVcpu` or touches a VMCS.
#[cfg(test)]
mod tests {
use alloc::format;
use super::*;
// `VmCpuMode` variants compare unequal and print their name.
#[test]
fn test_vm_cpu_mode_enum() {
assert_ne!(VmCpuMode::Real, VmCpuMode::Protected);
assert_ne!(VmCpuMode::Protected, VmCpuMode::Compatibility);
assert_ne!(VmCpuMode::Compatibility, VmCpuMode::Mode64);
let debug_str = format!("{:?}", VmCpuMode::Mode64);
assert!(debug_str.contains("Mode64"));
}
// Field access and index-based access of `GeneralRegisters`.
#[test]
fn test_general_registers_operations() {
let mut regs = GeneralRegisters::default();
assert_eq!(regs.rax, 0);
assert_eq!(regs.rbx, 0);
regs.rax = 0x1234567890abcdef;
regs.rbx = 0xfedcba0987654321;
assert_eq!(regs.rax, 0x1234567890abcdef);
assert_eq!(regs.rbx, 0xfedcba0987654321);
regs.set_reg_of_index(0, 0x1111111111111111); assert_eq!(regs.get_reg_of_index(0), 0x1111111111111111);
regs.set_reg_of_index(1, 0x2222222222222222); assert_eq!(regs.get_reg_of_index(1), 0x2222222222222222);
}
// Pins the values of the module-level constants.
#[test]
fn test_constants() {
assert_eq!(VMX_PREEMPTION_TIMER_SET_VALUE, 1_000_000);
assert_eq!(QEMU_EXIT_PORT, 0x604);
assert_eq!(QEMU_EXIT_MAGIC, 0x2000);
assert_eq!(MSR_IA32_EFER_LMA_BIT, 1 << 10);
assert_eq!(CR0_PE, 1 << 0);
}
// `BitField::set_bits` / `get_bits` round trip on a `u64`.
#[test]
fn test_bit_operations() {
use bit_field::BitField;
let mut value = 0u64;
value.set_bits(0..32, 0x12345678);
value.set_bits(32..64, 0xabcdef00);
assert_eq!(value.get_bits(0..32), 0x12345678);
assert_eq!(value.get_bits(32..64), 0xabcdef00);
}
mod vmx_vcpu_tests {
use super::*;
// Register set with distinct values in RAX..RDX.
fn create_test_vcpu_regs() -> GeneralRegisters {
let mut regs = GeneralRegisters::default();
regs.rax = 0x1000;
regs.rbx = 0x2000;
regs.rcx = 0x3000;
regs.rdx = 0x4000;
regs
}
// A clone of the register set carries the same values.
#[test]
fn test_general_registers_clone() {
let regs = create_test_vcpu_regs();
let cloned_regs = regs.clone();
assert_eq!(regs.rax, cloned_regs.rax);
assert_eq!(regs.rbx, cloned_regs.rbx);
assert_eq!(regs.rcx, cloned_regs.rcx);
assert_eq!(regs.rdx, cloned_regs.rdx);
}
// Re-states the EDX:EAX combine/split arithmetic on local values; it does
// not call `read_edx_eax` / `write_edx_eax`.
#[test]
fn test_edx_eax_operations() {
let rax = 0x12345678u64;
let rdx = 0xabcdef00u64;
let combined = ((rdx & 0xffff_ffff) << 32) | (rax & 0xffff_ffff);
assert_eq!(combined, 0xabcdef0012345678);
let val = 0xfedcba0987654321u64;
let new_rax = val & 0xffff_ffff;
let new_rdx = val >> 32;
assert_eq!(new_rax, 0x87654321);
assert_eq!(new_rdx, 0xfedcba09);
}
// Bit-range updates on individual register fields.
#[test]
fn test_register_bit_operations() {
let mut regs = GeneralRegisters::default();
regs.rcx = 0;
regs.rcx.set_bits(0..32, 0x12345678);
assert_eq!(regs.rcx.get_bits(0..32), 0x12345678);
regs.rdx = 0xffffffffffffffff;
regs.rdx.set_bits(32..64, 0);
assert_eq!(regs.rdx.get_bits(32..64), 0);
assert_eq!(regs.rdx.get_bits(0..32), 0xffffffff);
}
// Re-states the "address + segment base" addition on plain integers; it
// does not call `gla2gva`.
#[test]
fn test_gla2gva_logic() {
let guest_rip = 0x1000usize;
let seg_base_64bit = 0; let seg_base_other = 0x10000;
let gva_64bit = guest_rip + seg_base_64bit;
assert_eq!(gva_64bit, 0x1000);
let gva_other = guest_rip + seg_base_other;
assert_eq!(gva_other, 0x11000);
}
// Checks the numeric boundary (32) between exception and interrupt
// vectors on local values only.
#[test]
fn test_interrupt_vector_validation() {
let valid_exception = 6; let valid_interrupt = 0x20;
let invalid_vector = 0;
assert!(valid_exception < 32); assert!(valid_interrupt >= 32); assert_eq!(invalid_vector, 0); }
// `GuestPageWalkInfo` can be built field by field and read back.
#[test]
fn test_page_walk_info_struct() {
let ptw_info = GuestPageWalkInfo {
top_entry: 0x1000,
level: 4,
width: 9,
is_user_mode_access: false,
is_write_access: false,
is_inst_fetch: false,
pse: true,
wp: true,
nxe: true,
is_smap_on: false,
is_smep_on: false,
};
assert_eq!(ptw_info.level, 4);
assert_eq!(ptw_info.width, 9);
assert_eq!(ptw_info.top_entry, 0x1000);
}
// Local copies of the CPUID constants; this does not reference the ones
// declared inside `handle_cpuid`.
#[test]
fn test_cpuid_constants() {
const LEAF_FEATURE_INFO: u32 = 0x1;
const LEAF_HYPERVISOR_INFO: u32 = 0x4000_0000;
const FEATURE_VMX: u32 = 1 << 5;
const FEATURE_HYPERVISOR: u32 = 1 << 31;
assert_eq!(LEAF_FEATURE_INFO, 1);
assert_eq!(LEAF_HYPERVISOR_INFO, 0x40000000);
assert_eq!(FEATURE_VMX, 32);
assert_eq!(FEATURE_HYPERVISOR, 0x80000000);
}
// Flag-set composition of `Cr0Flags` and `Cr4Flags`.
#[test]
fn test_cr_flags_operations() {
use x86_64::registers::control::{Cr0Flags, Cr4Flags};
let cr0_flags = Cr0Flags::PAGING | Cr0Flags::PROTECTED_MODE_ENABLE;
assert!(cr0_flags.contains(Cr0Flags::PAGING));
assert!(cr0_flags.contains(Cr0Flags::PROTECTED_MODE_ENABLE));
assert!(!cr0_flags.contains(Cr0Flags::CACHE_DISABLE));
let cr4_flags = Cr4Flags::VIRTUAL_MACHINE_EXTENSIONS | Cr4Flags::PAGE_SIZE_EXTENSION;
assert!(cr4_flags.contains(Cr4Flags::VIRTUAL_MACHINE_EXTENSIONS));
assert!(cr4_flags.contains(Cr4Flags::PAGE_SIZE_EXTENSION));
}
// `AccessWidth` discriminants and its conversion from a byte count.
#[test]
fn test_access_width_operations() {
use axaddrspace::device::AccessWidth;
assert_eq!(AccessWidth::Byte as usize, 0);
assert_eq!(AccessWidth::Word as usize, 1);
assert_eq!(AccessWidth::Dword as usize, 2);
assert_eq!(AccessWidth::Qword as usize, 3);
assert_eq!(AccessWidth::try_from(1), Ok(AccessWidth::Byte));
assert_eq!(AccessWidth::try_from(2), Ok(AccessWidth::Word));
assert_eq!(AccessWidth::try_from(4), Ok(AccessWidth::Dword));
assert_eq!(AccessWidth::try_from(8), Ok(AccessWidth::Qword));
}
}
// Re-states the descriptor base extraction on a hand-built entry; it does
// not call `get_tr_base`.
#[test]
fn test_get_tr_base_logic() {
let mut test_entry = 0u64;
test_entry |= 1u64 << 47; test_entry |= (0x1000u64 & 0xFFFFFF) << 16;
let present = test_entry & (1 << 47) != 0;
assert!(present);
let base_low = (test_entry >> 16) & 0xFFFFFF;
let base_high = (test_entry >> 56) & 0xFF;
let base_addr = base_low | (base_high << 24);
assert_eq!(base_addr, 0x1000);
}
// `VmxExitReason` values can be used as match patterns.
#[test]
fn test_vmx_exit_reason_enum() {
let test_reason = VmxExitReason::VMCALL;
match test_reason {
VmxExitReason::VMCALL => assert!(true),
_ => assert!(false),
}
}
// `Debug` output of `VmCpuMode` and `GeneralRegisters` is non-empty.
#[test]
fn test_debug_implementations() {
let cpu_mode = VmCpuMode::Mode64;
let debug_str = format!("{:?}", cpu_mode);
assert!(!debug_str.is_empty());
let regs = GeneralRegisters::default();
let debug_str = format!("{:?}", regs);
assert!(!debug_str.is_empty());
}
}