use std::cell::RefCell;
use std::collections::HashMap;
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, AtomicUsize, Ordering};
use std::sync::{Arc, Mutex, Once, OnceLock};
use kvm_bindings::{
kvm_clock_data, kvm_debugregs, kvm_fpu, kvm_irqchip, kvm_lapic_state, kvm_mp_state,
kvm_msr_entry, kvm_pit_state2, kvm_regs, kvm_segment, kvm_sregs, kvm_userspace_memory_region,
kvm_vcpu_events, kvm_xcrs, CpuId, Msrs, KVM_MAX_CPUID_ENTRIES, KVM_MEM_READONLY,
KVM_MP_STATE_UNINITIALIZED,
};
use kvm_ioctls::{IoEventAddress, Kvm, NoDatamatch, VcpuExit as KvmExit, VcpuFd, VmFd};
use vmm_sys_util::eventfd::EventFd;
use crate::hypervisor::{CoreReg, HypervisorVcpu, HypervisorVm, SysReg, VcpuExit, VcpuHandle};
pub mod run;
/// Error type of the KVM backend: a single human-readable message.
#[derive(Debug)]
pub struct KvmError(String);

impl KvmError {
    /// Builds the error returned when `what` is not available on this backend.
    fn unsupported(what: &str) -> Self {
        let message = format!("kvm: unsupported ({what})");
        Self(message)
    }
}
impl From<kvm_ioctls::Error> for KvmError {
fn from(e: kvm_ioctls::Error) -> Self {
KvmError(format!("kvm: {e}"))
}
}
impl From<std::io::Error> for KvmError {
fn from(e: std::io::Error) -> Self {
KvmError(format!("kvm io: {e}"))
}
}
impl From<crate::arch::x86_64::boot::BootError> for KvmError {
fn from(e: crate::arch::x86_64::boot::BootError) -> Self {
KvmError(format!("kvm boot: {e}"))
}
}
impl std::fmt::Display for KvmError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(&self.0)
}
}
// Marker impl: lets `KvmError` be used wherever a `std::error::Error` is expected.
impl std::error::Error for KvmError {}
/// Lets backend-agnostic code build a `KvmError` from a free-form message.
impl crate::hypervisor::BackendError for KvmError {
    /// Stores `msg` unchanged (no prefix is added).
    fn other(msg: &str) -> Self {
        Self(msg.to_owned())
    }
}
/// A KVM virtual machine plus the bookkeeping needed to add memory and vCPUs to it.
pub struct KvmVm {
    /// The KVM system handle; not read after `create`, only kept alongside the VM.
    _kvm: Kvm,
    /// The VM file descriptor, shared with the closures returned by `irq_line`.
    vm: Arc<VmFd>,
    /// Next memory-slot number handed out by `map_ram`.
    next_slot: AtomicU32,
    /// Next vCPU index handed out by `create_vcpu`.
    next_vcpu: AtomicU64,
    /// CPUID table (already filtered in `create`) applied to every new vCPU.
    supported_cpuid: CpuId,
    /// Guest-physical base address -> memory slot, so `unmap_ram` can find the slot.
    slots: Mutex<HashMap<u64, u32>>,
}
impl HypervisorVm for KvmVm {
type Error = KvmError;
type Vcpu = KvmVcpu;
fn create() -> Result<Self, KvmError> {
let kvm = Kvm::new()?;
let vm = kvm.create_vm()?;
vm.set_tss_address(0xfffb_d000)?;
vm.create_irq_chip()?;
let mut supported_cpuid = kvm.get_supported_cpuid(KVM_MAX_CPUID_ENTRIES)?;
for e in supported_cpuid.as_mut_slice() {
if e.function == 7 && e.index == 0 {
e.ecx &= !(1 << 7);
e.edx &= !(1 << 20);
}
}
Ok(KvmVm {
_kvm: kvm,
vm: Arc::new(vm),
next_slot: AtomicU32::new(0),
next_vcpu: AtomicU64::new(0),
supported_cpuid,
slots: Mutex::new(HashMap::new()),
})
}
/// Registers `len` bytes of host memory at `host_ptr` as guest RAM at `gpa`.
///
/// A fresh memory slot is taken for every call. The region is marked read-only
/// for the guest unless `prot` contains `prot::WRITE`. On success the slot is
/// remembered under `gpa` so `unmap_ram` can remove it.
///
/// # Safety
/// `host_ptr` is passed to KVM as-is; the caller is expected to keep
/// `host_ptr..host_ptr + len` valid for as long as the region stays registered.
///
/// # Errors
/// Returns the ioctl error if KVM rejects the region.
unsafe fn map_ram(
    &self,
    host_ptr: *mut u8,
    gpa: u64,
    len: usize,
    prot: u64,
) -> Result<(), KvmError> {
    let slot = self.next_slot.fetch_add(1, Ordering::Relaxed);
    let writable = prot & crate::hypervisor::prot::WRITE != 0;
    let flags = if writable { 0 } else { KVM_MEM_READONLY };
    let region = kvm_userspace_memory_region {
        slot,
        flags,
        guest_phys_addr: gpa,
        memory_size: len as u64,
        userspace_addr: host_ptr as u64,
    };
    // SAFETY: validity of the host range is the caller's obligation (see `# Safety`).
    unsafe { self.vm.set_user_memory_region(region)? };
    self.slots.lock().unwrap().insert(gpa, slot);
    Ok(())
}
/// Removes the guest RAM region that was mapped at `gpa`, if there is one.
///
/// The slot recorded by `map_ram` is re-registered with a size of zero. An
/// unknown `gpa` is not an error. `_len` is ignored.
///
/// # Safety
/// Declared `unsafe` to match the trait; NOTE(review): the caller presumably must
/// ensure no vCPU still relies on the region — confirm against the trait contract.
///
/// # Errors
/// Returns the ioctl error if KVM rejects the removal.
unsafe fn unmap_ram(&self, gpa: u64, _len: usize) -> Result<(), KvmError> {
    match self.slots.lock().unwrap().remove(&gpa) {
        None => Ok(()),
        Some(slot) => {
            // memory_size == 0 is what turns this registration into a removal.
            let removal = kvm_userspace_memory_region {
                slot,
                guest_phys_addr: gpa,
                memory_size: 0,
                userspace_addr: 0,
                flags: 0,
            };
            // SAFETY: the region carries no host address, so no host memory is exposed.
            unsafe { self.vm.set_user_memory_region(removal)? };
            Ok(())
        }
    }
}
/// Creates the next vCPU, applies the VM's CPUID table, and registers it for
/// `force_exit`.
///
/// The registry entry stores the address of the `immediate_exit` field of the
/// vCPU's `kvm_run` area so that `force_exit` can set it from another thread.
///
/// # Errors
/// Returns the ioctl error if vCPU creation or the CPUID call fails.
fn create_vcpu(&self) -> Result<KvmVcpu, KvmError> {
    let index = self.next_vcpu.fetch_add(1, Ordering::Relaxed);
    let mut fd = self.vm.create_vcpu(index)?;
    fd.set_cpuid2(&self.supported_cpuid)?;

    install_force_exit_signal();

    let id = NEXT_REG_ID.fetch_add(1, Ordering::Relaxed);
    let run_flag_addr = std::ptr::addr_of_mut!(fd.get_kvm_run().immediate_exit) as usize;
    let reg = Arc::new(VcpuReg {
        tid: AtomicU64::new(0),
        exit: AtomicBool::new(false),
        immediate_exit_ptr: AtomicUsize::new(run_flag_addr),
    });
    registry().lock().unwrap().insert(id, Arc::clone(&reg));

    Ok(KvmVcpu {
        vcpu: RefCell::new(fd),
        id,
        reg,
        bound: AtomicBool::new(false),
    })
}
fn set_irq(&self, intid: u32, level: bool) -> Result<(), KvmError> {
self.vm.set_irq_line(intid, level)?;
Ok(())
}
/// Returns a shareable callback that drives an interrupt line of this VM.
///
/// The callback owns its own reference to the VM descriptor and deliberately
/// discards any error from the underlying call.
fn irq_line(&self) -> Arc<dyn Fn(u32, bool) + Send + Sync> {
    let vm_fd = Arc::clone(&self.vm);
    let drive = move |intid: u32, level: bool| {
        let _ = vm_fd.set_irq_line(intid, level);
    };
    Arc::new(drive)
}
/// Serialises the in-kernel device state into one byte blob.
///
/// Layout: PIT, then each irqchip in array order, then the clock — every field
/// written by `push_pod` (length prefix + raw bytes). `restore_intc` reads the
/// same order back.
///
/// # Errors
/// Propagates any failure from `capture_devices`.
fn capture_intc(&self) -> Result<Vec<u8>, KvmError> {
    let state = self.capture_devices()?;
    let mut blob = Vec::new();
    push_pod(&mut blob, &state.pit);
    state
        .irqchips
        .iter()
        .for_each(|chip| push_pod(&mut blob, chip));
    push_pod(&mut blob, &state.clock);
    Ok(blob)
}
/// Restores device state from a blob produced by `capture_intc`.
///
/// Fields are read in the order they were written: PIT, three irqchips, clock.
///
/// # Errors
/// Fails if the blob is truncated or a field has an unexpected size, or if
/// `restore_devices` fails.
fn restore_intc(&self, blob: &[u8]) -> Result<(), KvmError> {
    let mut cursor = 0usize;
    let pit = read_pod(blob, &mut cursor)?;
    let chip0 = read_pod(blob, &mut cursor)?;
    let chip1 = read_pod(blob, &mut cursor)?;
    let chip2 = read_pod(blob, &mut cursor)?;
    let clock = read_pod(blob, &mut cursor)?;

    let state = KvmDeviceState {
        pit,
        irqchips: [chip0, chip1, chip2],
        clock,
    };
    self.restore_devices(&state)
}
/// Builds the DAX mapper for this VM by delegating to `run::kvm_dax_mapper`,
/// handing it its own reference to the VM.
fn dax_mapper(self: &Arc<Self>) -> Arc<dyn crate::fuse::HvfMapper> {
    let vm = Arc::clone(self);
    crate::kvm::run::kvm_dax_mapper(vm)
}
/// Reads `CLOCK_MONOTONIC` and returns it as `seconds * 1_000_000_000 + nanoseconds`.
///
/// The arithmetic wraps instead of panicking. The return value of
/// `clock_gettime` is not checked; on failure the zero-initialised timespec
/// would yield 0.
fn host_monotonic_ticks() -> u64 {
    let mut now = libc::timespec {
        tv_sec: 0,
        tv_nsec: 0,
    };
    // SAFETY: `now` is a valid, writable timespec for the duration of the call.
    unsafe { libc::clock_gettime(libc::CLOCK_MONOTONIC, &mut now) };
    let whole_seconds_ns = (now.tv_sec as u64).wrapping_mul(1_000_000_000);
    whole_seconds_ns.wrapping_add(now.tv_nsec as u64)
}
/// Loads a Linux kernel into guest memory and points `vcpu` at its entry.
///
/// Translates `cfg` into the x86_64 `BootConfig`, lets `boot::setup_boot` lay
/// out `mem`, then applies the returned register set to the vCPU.
///
/// # Errors
/// Returns a boot error from `setup_boot` or an ioctl error from
/// `apply_boot_regs`.
fn boot_linux(
    &self,
    vcpu: &KvmVcpu,
    mem: &mut [u8],
    cfg: &crate::hypervisor::LinuxBootConfig,
) -> Result<(), KvmError> {
    use crate::arch::x86_64::boot::{self, BootConfig};

    let boot_cfg = BootConfig {
        mem_size: cfg.ram_size,
        cmdline: cfg.cmdline,
        bzimage: cfg.kernel,
        initrd: cfg.initrd,
    };
    let regs = boot::setup_boot(mem, &boot_cfg)?;
    // The source had the mis-encoded token `®s` here; `&regs` is what is meant.
    vcpu.apply_boot_regs(&regs)
}
}
/// Appends `v` to `out` as a 4-byte little-endian length followed by the raw
/// in-memory bytes of `v`.
///
/// Intended for plain-old-data values; the bytes are copied exactly as they sit
/// in memory (native layout and endianness).
fn push_pod<T>(out: &mut Vec<u8>, v: &T) {
    let size = std::mem::size_of::<T>();
    let start = (v as *const T).cast::<u8>();
    // SAFETY: `v` is a live reference, so `size` bytes starting at it are readable.
    let raw = unsafe { std::slice::from_raw_parts(start, size) };
    let prefix = (size as u32).to_le_bytes();
    out.extend_from_slice(&prefix);
    out.extend_from_slice(raw);
}
/// Reads one length-prefixed plain-old-data value of type `T` from `b` at `*p`.
///
/// The field must be a 4-byte little-endian length equal to `size_of::<T>()`
/// followed by that many raw bytes (the format written by `push_pod`). On
/// success `*p` is advanced past the field; on failure `*p` is left untouched.
///
/// All offset arithmetic is checked, so an out-of-range cursor or an oversized
/// length yields an error instead of an overflow or an out-of-bounds panic.
///
/// `T` must be valid for any bit pattern, because the bytes come from the blob.
///
/// # Errors
/// Returns `KvmError` if the prefix or the payload does not fit in `b`, or if
/// the stored length differs from `size_of::<T>()`.
fn read_pod<T: Copy>(b: &[u8], p: &mut usize) -> Result<T, KvmError> {
    let size = std::mem::size_of::<T>();

    let payload_start = p
        .checked_add(4)
        .filter(|&end| end <= b.len())
        .ok_or_else(|| KvmError("intc blob truncated (length prefix)".to_string()))?;
    let mut prefix = [0u8; 4];
    prefix.copy_from_slice(&b[*p..payload_start]);
    let len = u32::from_le_bytes(prefix) as usize;

    let payload_end = payload_start
        .checked_add(len)
        .filter(|&end| len == size && end <= b.len())
        .ok_or_else(|| KvmError("intc blob field size mismatch".to_string()))?;
    let payload = &b[payload_start..payload_end];

    let mut v = std::mem::MaybeUninit::<T>::uninit();
    // SAFETY: `payload` holds exactly `size_of::<T>()` bytes and `v` has room for
    // exactly that many; the two cannot overlap because `v` is a fresh local.
    unsafe {
        std::ptr::copy_nonoverlapping(payload.as_ptr(), v.as_mut_ptr().cast::<u8>(), size);
    }
    *p = payload_end;
    // SAFETY: every byte of `v` was written by the copy above.
    Ok(unsafe { v.assume_init() })
}
/// Writes `v` to `w` as a 4-byte little-endian length followed by the raw
/// in-memory bytes of `v` (native layout and endianness).
///
/// # Errors
/// Returns the first error reported by `w`.
fn write_pod_to<T>(w: &mut dyn std::io::Write, v: &T) -> std::io::Result<()> {
    let size = std::mem::size_of::<T>();
    // SAFETY: `v` is a live reference, so `size` bytes starting at it are readable.
    let raw = unsafe { std::slice::from_raw_parts((v as *const T).cast::<u8>(), size) };
    let prefix = (size as u32).to_le_bytes();
    w.write_all(&prefix)?;
    w.write_all(raw)?;
    Ok(())
}
/// Reads one length-prefixed plain-old-data value of type `T` from `r`.
///
/// Expects a 4-byte little-endian length equal to `size_of::<T>()` followed by
/// that many raw bytes (the format written by `write_pod_to`).
///
/// The destination is zero-initialised before it is exposed as a byte slice:
/// handing a reader a `&mut [u8]` over uninitialised memory is not sound, since
/// a `Read` implementation is allowed to inspect the buffer.
///
/// `T` must be valid for any bit pattern, because the bytes come from the reader.
///
/// # Errors
/// Returns `InvalidData` if the stored length differs from `size_of::<T>()`,
/// or the reader's own error (e.g. `UnexpectedEof`) if the input is short.
fn read_pod_from<T: Copy>(r: &mut dyn std::io::Read) -> std::io::Result<T> {
    let size = std::mem::size_of::<T>();

    let mut prefix = [0u8; 4];
    r.read_exact(&mut prefix)?;
    if u32::from_le_bytes(prefix) as usize != size {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "snapshot-state field size mismatch",
        ));
    }

    let mut v = std::mem::MaybeUninit::<T>::zeroed();
    // SAFETY: `v` owns `size` bytes, all initialised by `zeroed`, and nothing else
    // refers to them while `buf` is alive.
    let buf = unsafe { std::slice::from_raw_parts_mut(v.as_mut_ptr().cast::<u8>(), size) };
    r.read_exact(buf)?;
    // SAFETY: every byte is initialised; validity for arbitrary bytes is the
    // documented requirement on `T`.
    Ok(unsafe { v.assume_init() })
}
impl KvmVm {
pub fn create_pit(&self) -> Result<(), KvmError> {
self.vm
.create_pit2(kvm_bindings::kvm_pit_config::default())?;
Ok(())
}
pub fn set_irq_line(&self, irq: u32, level: bool) -> Result<(), KvmError> {
self.vm.set_irq_line(irq, level)?;
Ok(())
}
pub fn register_mmio_ioevent(&self, fd: &EventFd, addr: u64) -> Result<(), KvmError> {
self.vm
.register_ioevent(fd, &IoEventAddress::Mmio(addr), NoDatamatch)?;
Ok(())
}
pub fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> Result<(), KvmError> {
self.vm.register_irqfd(fd, gsi)?;
Ok(())
}
/// Reads the in-kernel device state: PIT, the irqchips with ids 0, 1 and 2, and
/// the KVM clock, in that order.
///
/// # Errors
/// Returns the first failing ioctl.
pub fn capture_devices(&self) -> Result<KvmDeviceState, KvmError> {
    let pit = self.vm.get_pit2()?;

    let mut irqchips = [
        kvm_irqchip::default(),
        kvm_irqchip::default(),
        kvm_irqchip::default(),
    ];
    // NOTE(review): in the KVM API chip ids 0/1/2 are PIC master, PIC slave and
    // IOAPIC — confirm against the kvm-bindings constants.
    for (chip_id, chip) in (0u32..).zip(irqchips.iter_mut()) {
        // The id selects which chip the call fills in.
        chip.chip_id = chip_id;
        self.vm.get_irqchip(chip)?;
    }

    let clock = self.vm.get_clock()?;
    Ok(KvmDeviceState {
        pit,
        irqchips,
        clock,
    })
}
/// Writes a previously captured device state back: PIT, each irqchip, then the
/// clock.
///
/// The clock is restored with its `flags` field cleared.
/// NOTE(review): presumably because the flags reported on capture are not all
/// accepted on restore — confirm against the KVM API documentation.
///
/// # Errors
/// Returns the first failing ioctl.
pub fn restore_devices(&self, s: &KvmDeviceState) -> Result<(), KvmError> {
    self.vm.set_pit2(&s.pit)?;
    s.irqchips
        .iter()
        .try_for_each(|chip| self.vm.set_irqchip(chip))?;

    let mut clock_to_set = s.clock;
    clock_to_set.flags = 0;
    self.vm.set_clock(&clock_to_set)?;
    Ok(())
}
}
/// In-kernel device state captured by `KvmVm::capture_devices`.
#[derive(Clone)]
pub struct KvmDeviceState {
    /// PIT state.
    pit: kvm_pit_state2,
    /// State of the irqchips with ids 0, 1 and 2, in that order.
    irqchips: [kvm_irqchip; 3],
    /// KVM clock reading.
    clock: kvm_clock_data,
}
/// Per-vCPU record shared between the vCPU and `KvmVcpuHandle::force_exit`.
struct VcpuReg {
    /// pthread id of the thread recorded by `bind_thread`; 0 until then.
    tid: AtomicU64,
    /// Set by `force_exit`; read and cleared through `should_exit` / `clear_exit`.
    exit: AtomicBool,
    /// Address of the `immediate_exit` byte in the vCPU's `kvm_run` area
    /// (0 is treated as "no address").
    immediate_exit_ptr: AtomicUsize,
}
// Process-wide counter that hands out the registry id of each new vCPU.
static NEXT_REG_ID: AtomicU64 = AtomicU64::new(0);
/// Returns the process-wide table of live vCPUs, keyed by registry id.
///
/// The table is created empty on first use.
fn registry() -> &'static Mutex<HashMap<u64, Arc<VcpuReg>>> {
    static TABLE: OnceLock<Mutex<HashMap<u64, Arc<VcpuReg>>>> = OnceLock::new();
    TABLE.get_or_init(Default::default)
}
// SIGUSR1 handler that does nothing; installed by `install_force_exit_signal`.
extern "C" fn sigusr1_noop(_sig: libc::c_int) {}
/// Installs the no-op SIGUSR1 handler, at most once per process.
///
/// `sa_flags` is 0, so `SA_RESTART` is not set. The result of `sigaction` is
/// not checked.
fn install_force_exit_signal() {
    static INSTALLED: Once = Once::new();
    INSTALLED.call_once(|| {
        // SAFETY: `action` is fully initialised (zeroed, then filled in) before it
        // is passed to `sigaction`, and the handler is a valid `extern "C"` fn.
        unsafe {
            let mut action: libc::sigaction = std::mem::zeroed();
            action.sa_sigaction = sigusr1_noop as usize;
            libc::sigemptyset(&mut action.sa_mask);
            action.sa_flags = 0;
            libc::sigaction(libc::SIGUSR1, &action, std::ptr::null_mut());
        }
    });
}
/// Copyable token naming one vCPU, for use with `force_exit`.
#[derive(Clone, Copy)]
pub struct KvmVcpuHandle {
    /// Registry id of the vCPU this handle refers to.
    vcpu_id: u64,
}
impl VcpuHandle for KvmVcpuHandle {
    /// Asks every vCPU named by `handles` to leave (or not enter) guest mode.
    ///
    /// For each handle still present in the registry, in this order:
    /// 1. the `exit` flag is set;
    /// 2. the `immediate_exit` byte of the vCPU's `kvm_run` area is set to 1;
    /// 3. if a thread has been recorded by `bind_thread`, it is sent SIGUSR1.
    ///
    /// Handles of vCPUs that no longer exist are skipped. The registry lock is
    /// held for the whole loop.
    fn force_exit(handles: &[Self]) {
        install_force_exit_signal();
        let table = registry().lock().unwrap();
        let live = handles.iter().filter_map(|h| table.get(&h.vcpu_id));
        for entry in live {
            entry.exit.store(true, Ordering::SeqCst);

            let flag_addr = entry.immediate_exit_ptr.load(Ordering::SeqCst);
            if flag_addr != 0 {
                // SAFETY: the address was taken from the vCPU's `kvm_run` area when
                // the entry was registered, and `Drop for KvmVcpu` removes the entry
                // under the same lock that is held here.
                unsafe { std::ptr::write_volatile(flag_addr as *mut u8, 1u8) };
            }

            let tid = entry.tid.load(Ordering::SeqCst);
            if tid != 0 {
                // SAFETY: `tid` was obtained from `pthread_self` in `bind_thread`.
                // NOTE(review): assumes that thread is still alive — confirm.
                unsafe {
                    libc::pthread_kill(tid as libc::pthread_t, libc::SIGUSR1);
                }
            }
        }
    }
}
/// MSR indices saved and restored with every vCPU snapshot.
///
/// NOTE(review): names below follow the Intel SDM / AMD APM register numbering —
/// confirm against the kvm-bindings MSR constants.
const SNAPSHOT_MSRS: &[u32] = &[
    0x0000_0010, // IA32_TIME_STAMP_COUNTER
    0x0000_0174, // IA32_SYSENTER_CS
    0x0000_0175, // IA32_SYSENTER_ESP
    0x0000_0176, // IA32_SYSENTER_EIP
    0xc000_0081, // STAR
    0xc000_0082, // LSTAR
    0xc000_0083, // CSTAR
    0xc000_0084, // SFMASK
    0xc000_0102, // KERNEL_GS_BASE
    0xc000_0103, // TSC_AUX
];
/// Everything captured from one vCPU by `capture_snapshot_locked`.
#[derive(Clone)]
pub struct KvmSnapshotState {
    /// General-purpose registers.
    regs: kvm_regs,
    /// Special registers (segments, control registers, EFER, ...).
    sregs: kvm_sregs,
    /// FPU state.
    fpu: kvm_fpu,
    /// Extended control registers.
    xcrs: kvm_xcrs,
    /// Pending vCPU events.
    events: kvm_vcpu_events,
    /// Multiprocessing state.
    mp_state: kvm_mp_state,
    /// Debug registers.
    debug_regs: kvm_debugregs,
    /// Local APIC state.
    lapic: kvm_lapic_state,
    /// Values of the MSRs listed in `SNAPSHOT_MSRS` (or whatever a snapshot file held).
    msrs: Msrs,
}
/// One KVM vCPU together with its `force_exit` registration.
pub struct KvmVcpu {
    /// The vCPU file descriptor; borrowed mutably while the vCPU runs.
    vcpu: RefCell<VcpuFd>,
    /// Registry id, also carried by the handles returned from `exit_token`.
    id: u64,
    /// Record shared with the registry (thread id, exit flag, immediate-exit address).
    reg: Arc<VcpuReg>,
    /// Set once `bind_thread` has run.
    bound: AtomicBool,
}
/// Unregisters the vCPU so `force_exit` can no longer reach it.
impl Drop for KvmVcpu {
    fn drop(&mut self) {
        // Runs before the fields (including the vCPU descriptor) are dropped, and
        // takes the same lock `force_exit` holds while it touches the run area.
        let mut table = registry().lock().unwrap();
        table.remove(&self.id);
    }
}
impl HypervisorVcpu for KvmVcpu {
type Error = KvmError;
type Handle = KvmVcpuHandle;
type SnapshotState = KvmSnapshotState;
/// Returns a copyable handle that names this vCPU for `force_exit`.
fn exit_token(&self) -> KvmVcpuHandle {
    let vcpu_id = self.id;
    KvmVcpuHandle { vcpu_id }
}
/// Captures this vCPU's state; see `capture_snapshot_locked`.
///
/// Borrows the vCPU descriptor immutably for the duration of the call.
fn capture_snapshot(&self) -> Result<KvmSnapshotState, KvmError> {
    let fd = self.vcpu.borrow();
    self.capture_snapshot_locked(&fd)
}
/// Restores this vCPU's state from `s`; see `restore_snapshot_locked`.
///
/// Borrows the vCPU descriptor immutably for the duration of the call.
fn restore_snapshot(&self, s: &KvmSnapshotState) -> Result<(), KvmError> {
    let fd = self.vcpu.borrow();
    self.restore_snapshot_locked(&fd, s)
}
/// Always returns 0: this backend derives no clock reference from the snapshot
/// or the host time (both arguments are ignored).
fn capture_clock_ref(_state: &Self::SnapshotState, _host_now: u64) -> u64 {
0
}
/// No-op that always returns `Ok(0)`; both arguments are ignored.
fn restore_clock(&self, _captured_ref: u64, _host_now: u64) -> Result<u64, KvmError> {
Ok(0)
}
/// Serialises a vCPU snapshot to `w`.
///
/// Layout: the eight fixed-size register blocks in declaration order, each
/// written by `write_pod_to`; then a little-endian `u32` MSR count followed by
/// `(u32 index, u64 value)` pairs, all little-endian.
///
/// # Errors
/// Returns the first error reported by `w`.
fn write_snapshot_state(
    s: &KvmSnapshotState,
    w: &mut dyn std::io::Write,
) -> std::io::Result<()> {
    write_pod_to(w, &s.regs)?;
    write_pod_to(w, &s.sregs)?;
    write_pod_to(w, &s.fpu)?;
    write_pod_to(w, &s.xcrs)?;
    write_pod_to(w, &s.events)?;
    write_pod_to(w, &s.mp_state)?;
    write_pod_to(w, &s.debug_regs)?;
    write_pod_to(w, &s.lapic)?;

    let msr_entries = s.msrs.as_slice();
    let msr_count = msr_entries.len() as u32;
    w.write_all(&msr_count.to_le_bytes())?;
    for entry in msr_entries {
        let index = entry.index.to_le_bytes();
        let value = entry.data.to_le_bytes();
        w.write_all(&index)?;
        w.write_all(&value)?;
    }
    Ok(())
}
/// Deserialises a vCPU snapshot written by `write_snapshot_state`.
///
/// The initial capacity reserved for the MSR list is capped at 4096 entries so
/// that a bogus count cannot trigger a huge allocation up front; a count larger
/// than the available input simply fails on the next read.
///
/// # Errors
/// Returns `InvalidData` for a size mismatch in a fixed block or an MSR list the
/// `Msrs` wrapper rejects, and the reader's own error for short input.
fn read_snapshot_state(r: &mut dyn std::io::Read) -> std::io::Result<KvmSnapshotState> {
    let regs = read_pod_from(r)?;
    let sregs = read_pod_from(r)?;
    let fpu = read_pod_from(r)?;
    let xcrs = read_pod_from(r)?;
    let events = read_pod_from(r)?;
    let mp_state = read_pod_from(r)?;
    let debug_regs = read_pod_from(r)?;
    let lapic = read_pod_from(r)?;

    let mut count_bytes = [0u8; 4];
    r.read_exact(&mut count_bytes)?;
    let msr_count = u32::from_le_bytes(count_bytes);

    let mut entries = Vec::with_capacity((msr_count as usize).min(4096));
    for _ in 0..msr_count {
        let mut index_bytes = [0u8; 4];
        r.read_exact(&mut index_bytes)?;
        let mut value_bytes = [0u8; 8];
        r.read_exact(&mut value_bytes)?;
        entries.push(kvm_msr_entry {
            index: u32::from_le_bytes(index_bytes),
            data: u64::from_le_bytes(value_bytes),
            ..Default::default()
        });
    }

    let msrs = match Msrs::from_entries(&entries) {
        Ok(msrs) => msrs,
        Err(e) => {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                format!("msrs: {e:?}"),
            ));
        }
    };

    Ok(KvmSnapshotState {
        regs,
        sregs,
        fpu,
        xcrs,
        events,
        mp_state,
        debug_regs,
        lapic,
        msrs,
    })
}
/// Not available on this backend: always returns an "unsupported" error.
fn get_core(&self, _reg: CoreReg) -> Result<u64, KvmError> {
    let reason = "aarch64 CoreReg on x86";
    Err(KvmError::unsupported(reason))
}
/// Not available on this backend: always returns an "unsupported" error.
fn set_core(&self, _reg: CoreReg, _value: u64) -> Result<(), KvmError> {
    let reason = "aarch64 CoreReg on x86";
    Err(KvmError::unsupported(reason))
}
/// Not available on this backend: always returns an "unsupported" error.
fn get_sys(&self, _reg: SysReg) -> Result<u64, KvmError> {
    let reason = "aarch64 SysReg on x86";
    Err(KvmError::unsupported(reason))
}
/// Not available on this backend: always returns an "unsupported" error.
fn set_sys(&self, _reg: SysReg, _value: u64) -> Result<(), KvmError> {
    let reason = "aarch64 SysReg on x86";
    Err(KvmError::unsupported(reason))
}
/// Runs the vCPU until its next exit and translates that exit.
///
/// The calling thread is bound on first use (see `bind_thread`). Mapping:
/// - `EINTR` from the run call, or an `Intr` exit -> `VcpuExit::Canceled`;
/// - port I/O and MMIO -> `VcpuExit::Io` / `VcpuExit::Mmio`; writes carry the
///   written bytes packed little-endian, reads are reported with `data: 0`;
/// - `Hlt` and `Shutdown` -> `VcpuExit::Halt`;
/// - anything else -> `VcpuExit::Unknown(0)`.
///
/// # Errors
/// Returns a bind failure or any run error other than `EINTR`.
fn step(&self) -> Result<VcpuExit, KvmError> {
    self.bind_thread()?;
    let mut fd = self.vcpu.borrow_mut();

    let raw_exit = match fd.run() {
        Ok(raw_exit) => raw_exit,
        Err(err) => {
            return if err.errno() == libc::EINTR {
                Ok(VcpuExit::Canceled)
            } else {
                Err(err.into())
            };
        }
    };

    let translated = match raw_exit {
        KvmExit::IoOut(port, data) => VcpuExit::Io {
            port,
            write: true,
            size: data.len() as u8,
            data: pack_u32(data),
        },
        KvmExit::IoIn(port, data) => VcpuExit::Io {
            port,
            write: false,
            size: data.len() as u8,
            data: 0,
        },
        KvmExit::MmioWrite(addr, data) => VcpuExit::Mmio {
            phys_addr: addr,
            write: true,
            len: data.len() as u8,
            data: pack_u64(data),
        },
        KvmExit::MmioRead(addr, data) => VcpuExit::Mmio {
            phys_addr: addr,
            write: false,
            len: data.len() as u8,
            data: 0,
        },
        KvmExit::Hlt | KvmExit::Shutdown => VcpuExit::Halt,
        KvmExit::Intr => VcpuExit::Canceled,
        _ => VcpuExit::Unknown(0),
    };
    Ok(translated)
}
}
impl KvmVcpu {
/// Reports whether `force_exit` has flagged this vCPU since the last `clear_exit`.
pub fn should_exit(&self) -> bool {
    let flag = &self.reg.exit;
    flag.load(Ordering::SeqCst)
}
/// Clears the flag reported by `should_exit`.
///
/// Only the flag is reset; the `immediate_exit` byte written by `force_exit`
/// is not touched here.
pub fn clear_exit(&self) {
    let flag = &self.reg.exit;
    flag.store(false, Ordering::SeqCst);
}
/// Writes snapshot `s` into the vCPU behind `v`.
///
/// Order: special registers, general registers, FPU, XCRs, LAPIC, MSRs, vCPU
/// events, debug registers, and finally the multiprocessing state.
///
/// The MSR write is checked against the number of entries the snapshot
/// actually carries. A snapshot read from a file may hold a different number of
/// MSRs than the current `SNAPSHOT_MSRS` list, so comparing against that list
/// could reject a fully applied snapshot or accept a partially applied one.
///
/// # Errors
/// Returns the first failing ioctl, or an error if KVM applied fewer MSRs than
/// the snapshot contains.
pub(crate) fn restore_snapshot_locked(
    &self,
    v: &VcpuFd,
    s: &KvmSnapshotState,
) -> Result<(), KvmError> {
    v.set_sregs(&s.sregs)?;
    v.set_regs(&s.regs)?;
    v.set_fpu(&s.fpu)?;
    v.set_xcrs(&s.xcrs)?;
    v.set_lapic(&s.lapic)?;

    let expected = s.msrs.as_slice().len();
    let written = v.set_msrs(&s.msrs)?;
    if written != expected {
        return Err(KvmError(format!(
            "kvm: set_msrs wrote {written}/{expected} entries"
        )));
    }

    v.set_vcpu_events(&s.events)?;
    v.set_debug_regs(&s.debug_regs)?;
    v.set_mp_state(s.mp_state)?;
    Ok(())
}
/// Reads the full snapshot state from the vCPU behind `v`.
///
/// The MSRs listed in `SNAPSHOT_MSRS` are read first; then the general
/// registers, special registers, FPU, XCRs, vCPU events, multiprocessing state,
/// debug registers and LAPIC, in that order.
///
/// # Errors
/// Returns the first failing ioctl, an error if the MSR request list cannot be
/// built, or an error if KVM returned fewer MSRs than requested.
pub(crate) fn capture_snapshot_locked(&self, v: &VcpuFd) -> Result<KvmSnapshotState, KvmError> {
    let wanted = SNAPSHOT_MSRS.len();

    let mut requests: Vec<kvm_msr_entry> = Vec::with_capacity(wanted);
    for &index in SNAPSHOT_MSRS {
        requests.push(kvm_msr_entry {
            index,
            data: 0,
            ..Default::default()
        });
    }
    let mut msrs = match Msrs::from_entries(&requests) {
        Ok(msrs) => msrs,
        Err(e) => return Err(KvmError(format!("kvm: msrs fam: {e:?}"))),
    };

    let got = v.get_msrs(&mut msrs)?;
    if got != wanted {
        return Err(KvmError(format!(
            "kvm: get_msrs read {got}/{wanted} entries"
        )));
    }

    let regs = v.get_regs()?;
    let sregs = v.get_sregs()?;
    let fpu = v.get_fpu()?;
    let xcrs = v.get_xcrs()?;
    let events = v.get_vcpu_events()?;
    let mp_state = v.get_mp_state()?;
    let debug_regs = v.get_debug_regs()?;
    let lapic = v.get_lapic()?;

    Ok(KvmSnapshotState {
        regs,
        sregs,
        fpu,
        xcrs,
        events,
        mp_state,
        debug_regs,
        lapic,
        msrs,
    })
}
/// Binds this vCPU to the calling thread so `force_exit` can signal it.
///
/// On the first call this installs the SIGUSR1 handler, blocks SIGUSR1 on the
/// calling thread, gives KVM an empty signal mask for the vCPU, and records the
/// thread id in the registry entry. Later calls return immediately.
///
/// If the KVM call fails, the "bound" marker is rolled back so a later call
/// tries again. Without the rollback, a failed first attempt would make every
/// later call a silent no-op and the vCPU would run without its signal mask or
/// recorded thread id.
///
/// # Errors
/// Returns the OS error if setting the KVM signal mask fails.
pub fn bind_thread(&self) -> Result<(), KvmError> {
    if self.bound.swap(true, Ordering::SeqCst) {
        return Ok(());
    }
    install_force_exit_signal();

    // SAFETY: `set` is zeroed and then initialised with `sigemptyset` before it
    // is used; `pthread_sigmask` only reads it.
    unsafe {
        let mut set: libc::sigset_t = std::mem::zeroed();
        libc::sigemptyset(&mut set);
        libc::sigaddset(&mut set, libc::SIGUSR1);
        libc::pthread_sigmask(libc::SIG_BLOCK, &set, std::ptr::null_mut());
    }

    if let Err(err) = self.set_kvm_signal_mask_empty() {
        // Binding did not complete: allow the next call to retry.
        self.bound.store(false, Ordering::SeqCst);
        return Err(err);
    }

    // SAFETY: `pthread_self` has no preconditions.
    let tid = unsafe { libc::pthread_self() } as u64;
    self.reg.tid.store(tid, Ordering::SeqCst);
    Ok(())
}
fn set_kvm_signal_mask_empty(&self) -> Result<(), KvmError> {
use std::os::unix::io::AsRawFd;
const KVM_SET_SIGNAL_MASK: libc::c_ulong = 0x4004_ae8b;
let mut buf = [0u8; 4 + 8];
buf[0..4].copy_from_slice(&8u32.to_le_bytes()); let fd = self.vcpu.borrow().as_raw_fd();
let r = unsafe { libc::ioctl(fd, KVM_SET_SIGNAL_MASK, buf.as_ptr()) };
if r != 0 {
return Err(KvmError::from(std::io::Error::last_os_error()));
}
Ok(())
}
/// Puts the vCPU directly into 64-bit long mode, ready to execute at `entry`.
///
/// Installs flat code and data segments, loads `cr3` as the page-table root,
/// enables paging with PAE and long mode, then sets `rip = entry` and
/// `rflags = 0x2`. Other register values are left as read from the vCPU.
///
/// # Errors
/// Returns the first failing ioctl.
pub fn enter_long_mode(&self, entry: u64, cr3: u64) -> Result<(), KvmError> {
    let vcpu = self.vcpu.borrow();
    let mut sregs = vcpu.get_sregs()?;

    // 64-bit code segment: type 0b1011, `l = 1`, `db = 0`.
    let code = kvm_segment {
        base: 0,
        limit: 0xffff_ffff,
        selector: 0x08,
        type_: 0b1011,
        present: 1,
        dpl: 0,
        db: 0,
        s: 1,
        l: 1,
        g: 1,
        ..Default::default()
    };
    // Flat data segment: type 0b0011, shared by every data selector.
    let data = kvm_segment {
        base: 0,
        limit: 0xffff_ffff,
        selector: 0x10,
        type_: 0b0011,
        present: 1,
        dpl: 0,
        db: 1,
        s: 1,
        l: 0,
        g: 1,
        ..Default::default()
    };
    sregs.cs = code;
    sregs.ds = data;
    sregs.es = data;
    sregs.fs = data;
    sregs.gs = data;
    sregs.ss = data;

    sregs.cr3 = cr3;
    sregs.cr0 = 0x8000_0001; // PG | PE
    sregs.cr4 = 0x0000_0020; // PAE
    sregs.efer = 0x0000_0500; // LME | LMA
    vcpu.set_sregs(&sregs)?;

    let mut regs = vcpu.get_regs()?;
    regs.rip = entry;
    regs.rflags = 0x2; // only the always-set reserved bit
    // The source had the mis-encoded token `®s` here; `&regs` is what is meant.
    vcpu.set_regs(&regs)?;
    Ok(())
}
pub fn park_for_sipi(&self) -> Result<(), KvmError> {
let st = kvm_mp_state {
mp_state: KVM_MP_STATE_UNINITIALIZED,
};
self.vcpu.borrow().set_mp_state(st)?;
Ok(())
}
/// Loads the register set produced by the boot setup into the vCPU.
///
/// `b.cs` becomes the code segment and `b.ds` is used for every data selector
/// (`ds`, `es`, `ss`, `fs`, `gs`). The GDT location, `cr0`, `cr3`, `cr4` and
/// `efer` are copied, followed by `rip`, `rsi` and `rflags`. Everything else
/// keeps the value read from the vCPU.
///
/// # Errors
/// Returns the first failing ioctl.
pub fn apply_boot_regs(&self, b: &crate::arch::x86_64::boot::BootRegs) -> Result<(), KvmError> {
    let vcpu = self.vcpu.borrow();

    let mut sregs = vcpu.get_sregs()?;
    let code = seg_to_kvm(&b.cs);
    let data = seg_to_kvm(&b.ds);
    sregs.cs = code;
    sregs.ds = data;
    sregs.es = data;
    sregs.ss = data;
    sregs.fs = data;
    sregs.gs = data;
    sregs.gdt.base = b.gdt_base;
    sregs.gdt.limit = b.gdt_limit;
    sregs.cr0 = b.cr0;
    sregs.cr3 = b.cr3;
    sregs.cr4 = b.cr4;
    sregs.efer = b.efer;
    vcpu.set_sregs(&sregs)?;

    let mut regs = vcpu.get_regs()?;
    regs.rip = b.rip;
    regs.rsi = b.rsi;
    regs.rflags = b.rflags;
    // The source had the mis-encoded token `®s` here; `&regs` is what is meant.
    vcpu.set_regs(&regs)?;
    Ok(())
}
}
fn seg_to_kvm(s: &crate::arch::x86_64::boot::Segment) -> kvm_segment {
kvm_segment {
base: s.base,
limit: s.limit,
selector: s.selector,
type_: s.type_,
present: s.present,
dpl: s.dpl,
s: s.s,
l: s.l,
db: s.db,
g: s.g,
..Default::default()
}
}
/// Packs up to the first four bytes of `data` into a little-endian `u32`.
///
/// Missing bytes count as zero; bytes beyond the fourth are ignored.
fn pack_u32(data: &[u8]) -> u32 {
    data.iter()
        .take(4)
        .enumerate()
        .fold(0u32, |acc, (i, &byte)| acc | (u32::from(byte) << (8 * i)))
}
/// Packs up to the first eight bytes of `data` into a little-endian `u64`.
///
/// Missing bytes count as zero; bytes beyond the eighth are ignored.
fn pack_u64(data: &[u8]) -> u64 {
    let head = &data[..data.len().min(8)];
    // Fold from the most significant byte down so each step shifts by one byte.
    head.iter()
        .rev()
        .fold(0u64, |acc, &byte| (acc << 8) | u64::from(byte))
}
#[cfg(test)]
mod tests {
use super::*;
use crate::hypervisor::prot;
/// `push_pod` / `read_pod` round-trip two fields and reject malformed blobs.
#[test]
fn pod_blob_round_trips_and_rejects_corruption() {
    // Two length-prefixed fields back to back.
    let mut blob = Vec::new();
    push_pod(&mut blob, &0x1122_3344_5566_7788u64);
    push_pod(&mut blob, &[0xAAu8; 8]);

    let mut cursor = 0;
    let first: u64 = read_pod(&blob, &mut cursor).unwrap();
    let second: [u8; 8] = read_pod(&blob, &mut cursor).unwrap();
    assert_eq!(first, 0x1122_3344_5566_7788);
    assert_eq!(second, [0xAA; 8]);
    assert_eq!(cursor, blob.len(), "consumed the whole blob");

    // Too short to hold even the length prefix.
    let mut cursor = 0;
    assert!(read_pod::<u64>(&blob[..2], &mut cursor).is_err());

    // The prefix says 8 bytes but a 4-byte type is requested.
    let mut cursor = 0;
    assert!(read_pod::<u32>(&blob, &mut cursor).is_err());

    // The prefix promises 8 bytes but only 4 follow.
    let mut short = 8u32.to_le_bytes().to_vec();
    short.extend_from_slice(&[1, 2, 3, 4]);
    let mut cursor = 0;
    assert!(read_pod::<u64>(&short, &mut cursor).is_err());
}
/// A guest that writes one byte to port 0x3f8 surfaces as a `VcpuExit::Io` write.
#[test]
fn floor_long_mode_io_exit_through_seam() {
    const MEM: usize = 0x20_0000;
    // mov dx, 0x3f8; mov al, 'K'; out dx, al; jmp $
    let stub = [0x66u8, 0xba, 0xf8, 0x03, 0xb0, b'K', 0xee, 0xeb, 0xfe];

    let vm = KvmVm::create().expect("create VM");
    let ram = unsafe {
        libc::mmap(
            std::ptr::null_mut(),
            MEM,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE,
            -1,
            0,
        )
    };
    assert!(ram != libc::MAP_FAILED, "mmap");
    let ram = ram as *mut u8;

    // Three-level page table at 0x1000/0x2000/0x3000 whose last entry (0x83) is
    // a large page at physical 0, identity-mapping the guest's low memory.
    for &(gpa, entry) in &[
        (0x1000usize, 0x2000u64 | 0x3),
        (0x2000, 0x3000 | 0x3),
        (0x3000, 0x83),
    ] {
        unsafe { std::ptr::write(ram.add(gpa) as *mut u64, entry.to_le()) };
    }
    unsafe { std::ptr::copy_nonoverlapping(stub.as_ptr(), ram, stub.len()) };
    unsafe { vm.map_ram(ram, 0, MEM, prot::RWX).expect("map_ram") };

    let vcpu = vm.create_vcpu().expect("create_vcpu");
    vcpu.enter_long_mode(0x0, 0x1000).expect("enter_long_mode");

    let exit = vcpu.step().expect("step");
    if let VcpuExit::Io {
        port, write, data, ..
    } = exit
    {
        assert_eq!(port, 0x3f8, "serial port");
        assert!(write, "OUT direction");
        assert_eq!(data & 0xff, u32::from(b'K'), "serial byte");
    } else {
        panic!("unexpected exit: {exit:?}");
    }

    unsafe { libc::munmap(ram as *mut libc::c_void, MEM) };
}
/// `apply_boot_regs` loads the register set from `setup_boot` into the vCPU.
#[test]
fn apply_boot_regs_configures_long_mode_entry() {
    use crate::arch::x86_64::boot::{setup_boot, BootConfig};

    let vm = KvmVm::create().expect("create VM");
    let vcpu = vm.create_vcpu().expect("create_vcpu");

    // Minimal fake bzImage: only the setup-sector count at offset 0x1f1 is filled
    // in, followed by one 512-byte sector of (zeroed) protected-mode code.
    let setup_sects = 4u8;
    let pm_off = (setup_sects as usize + 1) * 512;
    let mut bz = vec![0u8; pm_off + 512];
    bz[0x1f1] = setup_sects;

    let mem_size = 2 * 1024 * 1024;
    let mut mem = vec![0u8; mem_size];
    let cfg = BootConfig {
        mem_size,
        cmdline: "console=ttyS0",
        bzimage: &bz,
        initrd: None,
    };
    let regs = setup_boot(&mut mem, &cfg).expect("setup_boot");
    // The source had the mis-encoded token `®s` here; `&regs` is what is meant.
    vcpu.apply_boot_regs(&regs).expect("apply_boot_regs");

    let sregs = vcpu.vcpu.borrow().get_sregs().expect("get_sregs");
    assert_eq!(sregs.cr0, 0x8000_0001, "PG|PE");
    assert_eq!(sregs.cr4, 0x20, "PAE");
    assert_eq!(sregs.efer & 0x500, 0x500, "LME|LMA active");
    assert_eq!(sregs.cr3, 0x1000, "PML4 root");
    assert_eq!(sregs.cs.selector, 0x10, "__BOOT_CS");
    assert_eq!(sregs.cs.l, 1, "64-bit code segment");
    assert_eq!(sregs.ds.selector, 0x18, "__BOOT_DS");
    assert_eq!(sregs.gdt.base, 0x4000, "GDT base");
    assert_eq!(sregs.gdt.limit, 31, "GDT limit");

    let r = vcpu.vcpu.borrow().get_regs().expect("get_regs");
    assert_eq!(r.rip, 0x10_0200, "64-bit entry = load+0x200");
    assert_eq!(r.rsi, 0x1_0000, "RSI = boot_params");
}
/// `force_exit` cancels a vCPU that is spinning in guest mode on another thread.
#[test]
fn force_exit_breaks_a_spinning_vcpu() {
    use std::sync::mpsc;
    use std::time::Duration;
    const MEM: usize = 0x20_0000;
    // jmp $ — the guest spins forever.
    let stub = [0xebu8, 0xfe];

    let vm = KvmVm::create().expect("create VM");
    let ram = unsafe {
        libc::mmap(
            std::ptr::null_mut(),
            MEM,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE,
            -1,
            0,
        )
    };
    assert!(ram != libc::MAP_FAILED, "mmap");
    let ram = ram as *mut u8;

    // Identity page tables for the guest's low memory.
    for &(gpa, entry) in &[
        (0x1000usize, 0x2000u64 | 0x3),
        (0x2000, 0x3000 | 0x3),
        (0x3000, 0x83),
    ] {
        unsafe { std::ptr::write(ram.add(gpa) as *mut u64, entry.to_le()) };
    }
    unsafe { std::ptr::copy_nonoverlapping(stub.as_ptr(), ram, stub.len()) };
    unsafe { vm.map_ram(ram, 0, MEM, prot::RWX).expect("map_ram") };

    let vcpu = vm.create_vcpu().expect("create_vcpu");
    vcpu.enter_long_mode(0x0, 0x1000).expect("enter_long_mode");
    let handle = vcpu.exit_token();

    let (ready_tx, ready_rx) = mpsc::channel::<()>();
    let (tx, rx) = mpsc::channel::<&'static str>();
    let runner = std::thread::spawn(move || {
        vcpu.bind_thread().expect("bind_thread");
        let _ = ready_tx.send(());
        // Keep stepping until the vCPU is cancelled or fails.
        let outcome = loop {
            match vcpu.step() {
                Ok(VcpuExit::Canceled) => break "canceled",
                Ok(_) => {}
                Err(_) => break "err",
            }
        };
        let _ = tx.send(outcome);
    });

    ready_rx
        .recv_timeout(Duration::from_secs(2))
        .expect("vcpu ready");
    KvmVcpuHandle::force_exit(&[handle]);

    let got = rx.recv_timeout(Duration::from_secs(5));
    assert_eq!(
        got.ok(),
        Some("canceled"),
        "force_exit did not break the spinning vCPU out of KVM_RUN"
    );
    runner.join().expect("join runner");
    unsafe { libc::munmap(ram as *mut libc::c_void, MEM) };
}
/// A `force_exit` issued before the vCPU has ever run still cancels its first step.
#[test]
fn force_exit_before_bind_still_stops_vcpu() {
    use std::sync::mpsc;
    use std::time::Duration;
    const MEM: usize = 0x20_0000;
    // jmp $ — the guest would spin forever if it were entered.
    let stub = [0xebu8, 0xfe];

    let vm = KvmVm::create().expect("create VM");
    let ram = unsafe {
        libc::mmap(
            std::ptr::null_mut(),
            MEM,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE,
            -1,
            0,
        )
    };
    assert!(ram != libc::MAP_FAILED, "mmap");
    let ram = ram as *mut u8;

    // Identity page tables for the guest's low memory.
    for &(gpa, entry) in &[
        (0x1000usize, 0x2000u64 | 0x3),
        (0x2000, 0x3000 | 0x3),
        (0x3000, 0x83),
    ] {
        unsafe { std::ptr::write(ram.add(gpa) as *mut u64, entry.to_le()) };
    }
    unsafe { std::ptr::copy_nonoverlapping(stub.as_ptr(), ram, stub.len()) };
    unsafe { vm.map_ram(ram, 0, MEM, prot::RWX).expect("map_ram") };

    let vcpu = vm.create_vcpu().expect("create_vcpu");
    vcpu.enter_long_mode(0x0, 0x1000).expect("enter_long_mode");
    let handle = vcpu.exit_token();

    // Kick first: no thread is bound yet, so only the flags can take effect.
    KvmVcpuHandle::force_exit(&[handle]);

    let (tx, rx) = mpsc::channel::<&'static str>();
    let runner = std::thread::spawn(move || {
        let outcome = match vcpu.step() {
            Ok(VcpuExit::Canceled) => "canceled",
            Ok(_) => "other",
            Err(_) => "err",
        };
        let _ = tx.send(outcome);
    });

    let got = rx.recv_timeout(Duration::from_secs(5));
    assert_eq!(
        got.ok(),
        Some("canceled"),
        "immediate_exit did not gate the first KVM_RUN entry for a pre-bind force_exit"
    );
    runner.join().expect("join runner");
    unsafe { libc::munmap(ram as *mut libc::c_void, MEM) };
}
/// One `force_exit` call with two handles cancels both spinning vCPUs.
#[test]
fn force_exit_stops_all_vcpus() {
    use std::sync::mpsc;
    use std::time::Duration;
    const MEM: usize = 0x20_0000;
    // jmp $ — every vCPU spins forever.
    let stub = [0xebu8, 0xfe];

    let vm = KvmVm::create().expect("create VM");
    let ram = unsafe {
        libc::mmap(
            std::ptr::null_mut(),
            MEM,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE,
            -1,
            0,
        )
    };
    assert!(ram != libc::MAP_FAILED, "mmap");
    let ram = ram as *mut u8;

    // Identity page tables for the guest's low memory.
    for &(gpa, entry) in &[
        (0x1000usize, 0x2000u64 | 0x3),
        (0x2000, 0x3000 | 0x3),
        (0x3000, 0x83),
    ] {
        unsafe { std::ptr::write(ram.add(gpa) as *mut u64, entry.to_le()) };
    }
    unsafe { std::ptr::copy_nonoverlapping(stub.as_ptr(), ram, stub.len()) };
    unsafe { vm.map_ram(ram, 0, MEM, prot::RWX).expect("map_ram") };

    let (ready_tx, ready_rx) = mpsc::channel::<()>();
    let (done_tx, done_rx) = mpsc::channel::<&'static str>();
    let mut handles = Vec::new();
    let mut runners = Vec::new();
    for _ in 0..2 {
        let vcpu = vm.create_vcpu().expect("create_vcpu");
        vcpu.enter_long_mode(0x0, 0x1000).expect("enter_long_mode");
        handles.push(vcpu.exit_token());

        let ready = ready_tx.clone();
        let done = done_tx.clone();
        let runner = std::thread::spawn(move || {
            vcpu.bind_thread().expect("bind_thread");
            let _ = ready.send(());
            // Keep stepping until the vCPU is cancelled or fails.
            let outcome = loop {
                match vcpu.step() {
                    Ok(VcpuExit::Canceled) => break "canceled",
                    Ok(_) => {}
                    Err(_) => break "err",
                }
            };
            let _ = done.send(outcome);
        });
        runners.push(runner);
    }

    // Wait until both runner threads are bound before kicking.
    ready_rx
        .recv_timeout(Duration::from_secs(2))
        .expect("vcpu0");
    ready_rx
        .recv_timeout(Duration::from_secs(2))
        .expect("vcpu1");
    KvmVcpuHandle::force_exit(&handles);

    for _ in 0..2 {
        let outcome = done_rx.recv_timeout(Duration::from_secs(5)).ok();
        assert_eq!(
            outcome,
            Some("canceled"),
            "a vCPU was not stopped by the shared force_exit"
        );
    }
    for runner in runners {
        runner.join().expect("join runner");
    }
    unsafe { libc::munmap(ram as *mut libc::c_void, MEM) };
}
/// Capturing a snapshot, clobbering registers, and restoring brings them back.
#[test]
fn snapshot_round_trips_vcpu_state() {
    const MEM: usize = 0x20_0000;
    const RAX_MARK: u64 = 0x1234_5678_9abc_def0;
    const RBX_MARK: u64 = 0x00ca_feba_be00_1357;

    let vm = KvmVm::create().expect("create VM");
    let ram = unsafe {
        libc::mmap(
            std::ptr::null_mut(),
            MEM,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE,
            -1,
            0,
        )
    };
    assert!(ram != libc::MAP_FAILED, "mmap");
    let ram = ram as *mut u8;

    // Identity page tables for the guest's low memory.
    for &(gpa, entry) in &[
        (0x1000usize, 0x2000u64 | 0x3),
        (0x2000, 0x3000 | 0x3),
        (0x3000, 0x83),
    ] {
        unsafe { std::ptr::write(ram.add(gpa) as *mut u64, entry.to_le()) };
    }
    unsafe { vm.map_ram(ram, 0, MEM, prot::RWX).expect("map_ram") };

    let vcpu = vm.create_vcpu().expect("create_vcpu");
    vcpu.enter_long_mode(0x0, 0x1000).expect("enter_long_mode");

    // Plant recognisable values, then snapshot.
    {
        let fd = vcpu.vcpu.borrow();
        let mut planted = fd.get_regs().expect("get_regs");
        planted.rax = RAX_MARK;
        planted.rbx = RBX_MARK;
        planted.rip = 0x0;
        fd.set_regs(&planted).expect("set_regs");
    }
    let snap = vcpu.capture_snapshot().expect("capture_snapshot");

    // Clobber the same registers so a restore is observable.
    {
        let fd = vcpu.vcpu.borrow();
        let mut clobbered = fd.get_regs().expect("get_regs");
        clobbered.rax = 0;
        clobbered.rbx = 0;
        clobbered.rip = 0x1000;
        fd.set_regs(&clobbered).expect("set_regs");
    }
    vcpu.restore_snapshot(&snap).expect("restore_snapshot");

    let restored = vcpu.vcpu.borrow().get_regs().expect("get_regs");
    assert_eq!(restored.rax, 0x1234_5678_9abc_def0, "rax restored");
    assert_eq!(restored.rbx, 0x00ca_feba_be00_1357, "rbx restored");
    assert_eq!(restored.rip, 0x0, "rip restored");

    unsafe { libc::munmap(ram as *mut libc::c_void, MEM) };
}
/// A snapshot taken from a running guest resumes in a brand-new VM and vCPU.
#[test]
fn snapshot_resumes_running_guest_in_fresh_vm() {
    use std::time::Duration;
    const MEM: usize = 0x20_0000;
    // inc rax; jmp back to the inc — rax counts loop iterations.
    let stub = [0x48u8, 0xff, 0xc0, 0xeb, 0xfb];

    /// Steps `vcpu` until it is cancelled, then returns its snapshot and `rax`.
    fn run_until_kicked(vcpu: KvmVcpu) -> (KvmSnapshotState, u64) {
        vcpu.bind_thread().expect("bind_thread");
        loop {
            match vcpu.step() {
                Ok(VcpuExit::Canceled) => break,
                Ok(_) => continue,
                Err(e) => panic!("step: {e}"),
            }
        }
        let rax = vcpu.vcpu.borrow().get_regs().expect("get_regs").rax;
        (vcpu.capture_snapshot().expect("capture"), rax)
    }

    // Builds a VM with identity page tables and the stub loaded at address 0.
    let build = || -> (KvmVm, *mut u8) {
        let vm = KvmVm::create().expect("create VM");
        let ram = unsafe {
            libc::mmap(
                std::ptr::null_mut(),
                MEM,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_NORESERVE,
                -1,
                0,
            )
        } as *mut u8;
        assert!(ram as *mut libc::c_void != libc::MAP_FAILED, "mmap");
        for &(gpa, entry) in &[
            (0x1000usize, 0x2000u64 | 0x3),
            (0x2000, 0x3000 | 0x3),
            (0x3000, 0x83),
        ] {
            unsafe { std::ptr::write(ram.add(gpa) as *mut u64, entry.to_le()) };
        }
        unsafe { std::ptr::copy_nonoverlapping(stub.as_ptr(), ram, stub.len()) };
        unsafe { vm.map_ram(ram, 0, MEM, prot::RWX).expect("map_ram") };
        (vm, ram)
    };

    // First VM: let the guest run for a while, then kick it and snapshot.
    let (vm1, ram1) = build();
    let vcpu1 = vm1.create_vcpu().expect("create_vcpu");
    vcpu1.enter_long_mode(0x0, 0x1000).expect("enter_long_mode");
    let kick1 = vcpu1.exit_token();
    let first_run = std::thread::spawn(move || run_until_kicked(vcpu1));
    std::thread::sleep(Duration::from_millis(50));
    KvmVcpuHandle::force_exit(&[kick1]);
    let (snap, rax_at_snap) = first_run.join().expect("join t1");
    assert!(rax_at_snap > 0, "guest should have incremented rax");

    // Carry the guest RAM over to the second VM.
    let mut saved_ram = vec![0u8; MEM];
    unsafe { std::ptr::copy_nonoverlapping(ram1, saved_ram.as_mut_ptr(), MEM) };
    unsafe { libc::munmap(ram1 as *mut libc::c_void, MEM) };

    let (vm2, ram2) = build();
    unsafe { std::ptr::copy_nonoverlapping(saved_ram.as_ptr(), ram2, MEM) };
    let vcpu2 = vm2.create_vcpu().expect("create_vcpu");
    vcpu2.restore_snapshot(&snap).expect("restore_snapshot");
    let rax_restored = vcpu2.vcpu.borrow().get_regs().expect("get_regs").rax;
    assert_eq!(rax_restored, rax_at_snap, "rax not transferred to fresh VM");

    // Second VM: the counter must keep growing from the restored value.
    let kick2 = vcpu2.exit_token();
    let second_run = std::thread::spawn(move || run_until_kicked(vcpu2));
    std::thread::sleep(Duration::from_millis(50));
    KvmVcpuHandle::force_exit(&[kick2]);
    let (_snap2, rax_after) = second_run.join().expect("join t2");
    assert!(
        rax_after > rax_at_snap,
        "restored guest did not resume: rax {rax_after} <= snapshot {rax_at_snap}"
    );
    unsafe { libc::munmap(ram2 as *mut libc::c_void, MEM) };
}
/// Device state can be captured, restored, and captured again without error.
#[test]
fn device_state_captures_and_restores() {
    let vm = KvmVm::create().expect("create VM");
    vm.create_pit().expect("create_pit");

    let captured = vm.capture_devices().expect("capture_devices");
    vm.restore_devices(&captured).expect("restore_devices");

    // A second capture proves the VM is still usable after the restore.
    let _recaptured = vm.capture_devices().expect("recapture_devices");
}
}