use anyhow::{Context, Result};
use kvm_bindings::{
KVM_CAP_HALT_POLL, KVM_CAP_SPLIT_IRQCHIP, KVM_CAP_X2APIC_API, KVM_CAP_X86_DISABLE_EXITS,
KVM_CLOCK_TSC_STABLE, KVM_IRQ_ROUTING_MSI, KVM_PIT_SPEAKER_DUMMY,
KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK, KVM_X2APIC_API_USE_32BIT_IDS,
KVM_X86_DISABLE_EXITS_HLT, KVM_X86_DISABLE_EXITS_PAUSE, KvmIrqRouting, kvm_enable_cap,
kvm_irq_routing, kvm_irq_routing_entry, kvm_irq_routing_entry__bindgen_ty_1,
kvm_irq_routing_msi, kvm_irq_routing_msi__bindgen_ty_1, kvm_pit_config,
};
use kvm_ioctls::{Cap, Kvm, VcpuFd, VmFd};
use std::mem::ManuallyDrop;
use std::sync::Arc;
use vm_memory::{GuestAddress, GuestMemoryMmap};
use super::ioapic::{IOAPIC_BASE, IOAPIC_SIZE, Ioapic, MsiRoute};
use super::topology::{apic_id, generate_cpuid, max_apic_id};
use crate::vmm::numa_mem::{NumaMemoryLayout, ReservationGuard};
use crate::vmm::pi_mutex::PiMutex;
use crate::vmm::topology::Topology;
// Guest-physical layout, device placement and KVM tuning constants.
// Only some of these are consumed in this file; where a description is
// inferred from the name alone it is marked as an assumption to confirm.

/// Presumably the guest-physical address the kernel image is loaded at (1 MiB) — TODO confirm against the loader.
pub(crate) const KERNEL_LOAD_ADDR: u64 = 0x100000;
/// Presumably where the boot parameters structure is written — TODO confirm against the boot code.
pub(crate) const BOOT_PARAMS_ADDR: u64 = 0x7000;
/// Presumably where the kernel command line is written — TODO confirm against the boot code.
pub(crate) const CMDLINE_ADDR: u64 = 0x20000;
/// Presumably the maximum command-line length in bytes — TODO confirm against the boot code.
pub(crate) const CMDLINE_MAX: usize = 4096;
/// Presumably the start of the Extended BIOS Data Area — TODO confirm.
pub(crate) const EBDA_START: u64 = 0x9FC00;
/// Same numeric value as `KERNEL_LOAD_ADDR` (1 MiB); presumably the start of high memory — TODO confirm.
pub(crate) const HIMEM_START: u64 = 0x10_0000;
/// Lower bound of the MMIO gap passed to `NumaMemoryLayout::compute`.
pub(crate) const MMIO_GAP_START: u64 = 0xC000_0000;
/// Upper bound of the MMIO gap passed to `NumaMemoryLayout::compute` (4 GiB).
pub(crate) const MMIO_GAP_END: u64 = 0x1_0000_0000;
// The virtio MMIO bases are laid out at 0x1000 steps from the start of the MMIO gap.
pub(crate) const VIRTIO_CONSOLE_MMIO_BASE: u64 = MMIO_GAP_START;
pub(crate) const VIRTIO_BLK_MMIO_BASE: u64 = MMIO_GAP_START + 0x1000;
pub(crate) const VIRTIO_NET_MMIO_BASE: u64 = MMIO_GAP_START + 0x2000;
// Interrupt numbers for the virtio devices (not consumed in this file).
pub(crate) const VIRTIO_CONSOLE_IRQ: u32 = 5;
pub(crate) const VIRTIO_BLK_IRQ: u32 = 6;
pub(crate) const VIRTIO_NET_IRQ: u32 = 7;
/// Presumably the e820 entry type for usable RAM — TODO confirm against the e820 builder.
pub(crate) const E820_RAM: u32 = 1;
/// Presumably the offset of the 64-bit entry point from the kernel load address — TODO confirm.
pub(crate) const STARTUP64_OFFSET: u64 = 0x200;
/// Address handed to `set_tss_address` during VM construction.
const KVM_TSS_ADDRESS: u64 = 0xfffb_d000;
/// Address handed to `set_identity_map_address`: three 4 KiB pages above the TSS address.
const KVM_IDENTITY_MAP_ADDRESS: u64 = KVM_TSS_ADDRESS + 3 * 4096;
/// Passed as `args[0]` when enabling `KVM_CAP_SPLIT_IRQCHIP`.
const NUM_IOAPIC_PINS: u64 = 24;
/// Topologies whose maximum APIC ID exceeds this value use the split IRQ chip.
pub(crate) const MAX_XAPIC_ID: u32 = 254;
/// Value passed to `KVM_CAP_HALT_POLL` outside performance mode when the
/// guest's vCPUs fit within the host's online CPUs (0 is passed otherwise).
const HALT_POLL_NS: u64 = 200_000;
/// KVM capabilities that must all be reported by `check_extension` before a
/// VM is created; `new_inner` fails with an error naming the first one that
/// is missing.
const REQUIRED_CAPS: &[Cap] = &[
Cap::Irqchip,
Cap::Ioeventfd,
Cap::Irqfd,
Cap::UserMemory,
Cap::SetTssAddr,
Cap::Pit2,
Cap::PitState2,
Cap::AdjustClock,
Cap::Debugregs,
Cap::MpState,
Cap::VcpuEvents,
Cap::Xcrs,
Cap::Xsave,
Cap::ExtCpuid,
];
/// Owns the KVM handles, vCPU fds and guest memory of one VM.
///
/// `kvm`, `vm_fd` and `guest_mem` are wrapped in `ManuallyDrop` because the
/// `Drop` impl releases them explicitly, in a fixed order.
#[allow(dead_code)] pub struct KtstrKvm {
/// Handle returned by `Kvm::new()`; released last by the `Drop` impl.
pub kvm: ManuallyDrop<Kvm>,
/// VM file descriptor; released by the `Drop` impl right after the vCPUs.
pub vm_fd: ManuallyDrop<VmFd>,
/// One vCPU fd per topology CPU, each created with that CPU's APIC ID as its KVM vCPU id.
pub vcpus: Vec<VcpuFd>,
/// Guest memory. Holds a 4 KiB placeholder when the VM was built without a
/// memory size, until `allocate_and_register_memory` replaces it.
pub guest_mem: ManuallyDrop<GuestMemoryMmap>,
/// Topology the VM was built for.
pub topology: Topology,
/// Memory layout; `None` until guest memory has actually been allocated.
pub(crate) numa_layout: Option<NumaMemoryLayout>,
/// Result of `check_extension(Cap::ImmediateExit)` at construction time.
pub has_immediate_exit: bool,
/// True when the topology's maximum APIC ID exceeds `MAX_XAPIC_ID`.
pub(crate) split_irqchip: bool,
/// Userspace IOAPIC model; `Some` only when `split_irqchip` is true.
pub(crate) ioapic: Option<Arc<PiMutex<Ioapic>>>,
// Remembered so a later `allocate_and_register_memory` call uses the same settings.
use_hugepages: bool,
performance_mode: bool,
// Reservation returned alongside the allocated guest memory; released after
// the guest memory in the `Drop` impl.
_reservation: Option<ReservationGuard>,
/// Starts empty; released after the reservation in the `Drop` impl.
pub(crate) cow_overlay_guards: Vec<crate::vmm::initramfs::CowOverlayGuard>,
}
impl Drop for KtstrKvm {
    /// Releases the VM's resources in a fixed order: vCPU fds, the VM fd,
    /// guest memory, the memory reservation, the CoW overlay guards and
    /// finally the KVM handle.
    fn drop(&mut self) {
        // vCPU fds are released before the VM fd they were created from.
        drop(std::mem::take(&mut self.vcpus));

        // SAFETY: `vm_fd` holds a live value and is not accessed again after
        // this point; this is the only place it is dropped.
        unsafe { ManuallyDrop::drop(&mut self.vm_fd) };

        // SAFETY: `guest_mem` always holds a live value (it is re-initialised
        // immediately whenever it is replaced) and is not accessed again.
        unsafe { ManuallyDrop::drop(&mut self.guest_mem) };

        drop(self._reservation.take());
        drop(std::mem::take(&mut self.cow_overlay_guards));

        // SAFETY: `kvm` holds a live value and this is the final use of `self`.
        unsafe { ManuallyDrop::drop(&mut self.kvm) };
    }
}
impl KtstrKvm {
/// Builds a VM for `topo` with `memory_mib` of guest memory allocated up
/// front and hugepages not requested.
pub fn new(topo: Topology, memory_mib: u32, performance_mode: bool) -> Result<Self> {
    let use_hugepages = false;
    let memory = Some(memory_mib);
    Self::new_inner(topo, memory, use_hugepages, performance_mode)
}
/// Same as [`KtstrKvm::new`], but requests hugepages for the guest memory.
pub fn new_with_hugepages(
    topo: Topology,
    memory_mib: u32,
    performance_mode: bool,
) -> Result<Self> {
    let use_hugepages = true;
    let memory = Some(memory_mib);
    Self::new_inner(topo, memory, use_hugepages, performance_mode)
}
/// Builds a VM without allocating its guest memory yet; a placeholder is
/// installed until `allocate_and_register_memory` is called.
pub fn new_deferred(
    topo: Topology,
    use_hugepages: bool,
    performance_mode: bool,
) -> Result<Self> {
    let no_memory_yet: Option<u32> = None;
    Self::new_inner(topo, no_memory_yet, use_hugepages, performance_mode)
}
/// Computes a memory layout for `memory_mib`, allocates and registers it
/// with the VM, and swaps it in for the current guest memory.
///
/// # Errors
/// Returns the error from layout computation or from allocation and
/// registration; in both cases the existing guest memory is left untouched.
pub fn allocate_and_register_memory(&mut self, memory_mib: u32) -> Result<()> {
    let mmio_gap = Some((MMIO_GAP_START, MMIO_GAP_END));
    let layout = NumaMemoryLayout::compute(&self.topology, memory_mib, 0, mmio_gap)?;

    let hugepages = self.use_hugepages;
    let perf_mode = self.performance_mode;
    let alloc = layout.allocate_and_register(&self.vm_fd, hugepages, perf_mode)?;

    // SAFETY: `guest_mem` holds a live value here and is overwritten on the
    // very next statement, so the dropped value is never observed again.
    unsafe { ManuallyDrop::drop(&mut self.guest_mem) };
    self.guest_mem = ManuallyDrop::new(alloc.guest_mem);

    // Assigning here releases any previous reservation after the old guest
    // memory has already been dropped.
    self._reservation = Some(alloc.reservation);
    self.numa_layout = Some(layout);
    Ok(())
}
/// Shared constructor behind `new`, `new_with_hugepages` and `new_deferred`.
///
/// Opens KVM, creates the VM, configures the interrupt controller and
/// exit/halt-poll tuning, allocates guest memory (or a placeholder when
/// `memory_mib` is `None`), validates the topology against host limits and
/// creates one vCPU per topology CPU with its CPUID installed.
///
/// # Errors
/// Fails when a required capability is missing, when any mandatory KVM call
/// fails, or with a `ResourceContention` error when the guest RAM top exceeds
/// the physical-address width or the topology exceeds the host's vCPU
/// count / vCPU id limits. Failures of the optional tuning capabilities only
/// print a warning.
fn new_inner(
topo: Topology,
memory_mib: Option<u32>,
use_hugepages: bool,
performance_mode: bool,
) -> Result<Self> {
// Open /dev/kvm and refuse to continue unless every capability in
// REQUIRED_CAPS is reported.
let kvm = Kvm::new().context("open /dev/kvm")?;
for &cap in REQUIRED_CAPS {
anyhow::ensure!(
kvm.check_extension(cap),
"KVM missing required capability: {:?}",
cap
);
}
// Optional capability: recorded for later use, not required.
let has_immediate_exit = kvm.check_extension(Cap::ImmediateExit);
let vm_fd = crate::vmm::create_vm_with_retry(&kvm)?;
vm_fd
.set_tss_address(KVM_TSS_ADDRESS as usize)
.map_err(|e| crate::vmm::map_transient_to_contention(e, "set TSS"))?;
vm_fd
.set_identity_map_address(KVM_IDENTITY_MAP_ADDRESS)
.map_err(|e| crate::vmm::map_transient_to_contention(e, "set identity map address"))?;
// Interrupt controller selection: topologies whose maximum APIC ID exceeds
// MAX_XAPIC_ID get the split IRQ chip plus the x2APIC API with 32-bit IDs;
// everything else gets the in-kernel IRQ chip and a PIT.
let max_apic_id = max_apic_id(&topo);
let split_irqchip = max_apic_id > MAX_XAPIC_ID;
if split_irqchip {
let mut cap = kvm_enable_cap {
cap: KVM_CAP_SPLIT_IRQCHIP,
..Default::default()
};
cap.args[0] = NUM_IOAPIC_PINS;
vm_fd
.enable_cap(&cap)
.map_err(|e| crate::vmm::map_transient_to_contention(e, "enable split IRQ chip"))?;
let mut cap = kvm_enable_cap {
cap: KVM_CAP_X2APIC_API,
..Default::default()
};
cap.args[0] =
(KVM_X2APIC_API_USE_32BIT_IDS | KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) as u64;
vm_fd.enable_cap(&cap).context("enable x2APIC API")?;
} else {
vm_fd
.create_irq_chip()
.map_err(|e| crate::vmm::map_transient_to_contention(e, "create IRQ chip"))?;
let pit_config = kvm_pit_config {
flags: KVM_PIT_SPEAKER_DUMMY,
..Default::default()
};
vm_fd
.create_pit2(pit_config)
.map_err(|e| crate::vmm::map_transient_to_contention(e, "create PIT"))?;
}
// The userspace IOAPIC model exists only in the split-irqchip configuration.
let ioapic = split_irqchip.then(|| Arc::new(PiMutex::new(Ioapic::new())));
// Performance mode: try to disable PAUSE exits and then HLT exits. Each is
// attempted separately and a failure only produces a warning.
if performance_mode {
let mut cap = kvm_enable_cap {
cap: KVM_CAP_X86_DISABLE_EXITS,
..Default::default()
};
cap.args[0] = KVM_X86_DISABLE_EXITS_PAUSE as u64;
if let Err(e) = vm_fd.enable_cap(&cap) {
eprintln!(
"performance_mode: WARNING: \
KVM_CAP_X86_DISABLE_EXITS (PAUSE) not supported: {e}"
);
}
cap.args[0] = KVM_X86_DISABLE_EXITS_HLT as u64;
if let Err(e) = vm_fd.enable_cap(&cap) {
eprintln!(
"performance_mode: WARNING: \
KVM_CAP_X86_DISABLE_EXITS (HLT) rejected: {e}"
);
}
}
// Outside performance mode: set the halt-poll interval to HALT_POLL_NS when
// the guest's vCPUs fit within the host's online CPUs, otherwise to 0.
// A failure here only produces a warning.
if !performance_mode {
let host_cpus = unsafe { libc::sysconf(libc::_SC_NPROCESSORS_ONLN) };
let poll_ns: u64 = if host_cpus > 0 && topo.total_cpus() <= host_cpus as u32 {
HALT_POLL_NS
} else {
0
};
let mut cap = kvm_enable_cap {
cap: KVM_CAP_HALT_POLL,
..Default::default()
};
cap.args[0] = poll_ns;
if let Err(e) = vm_fd.enable_cap(&cap) {
eprintln!(
"kvm: WARNING: KVM_CAP_HALT_POLL not supported ({e}), using kernel default"
);
}
}
// Guest memory: allocate and register it now when a size was given,
// otherwise install a single-page placeholder with no layout or reservation.
let (guest_mem, numa_layout, reservation) = match memory_mib {
Some(mb) => {
let layout =
NumaMemoryLayout::compute(&topo, mb, 0, Some((MMIO_GAP_START, MMIO_GAP_END)))?;
let alloc =
layout.allocate_and_register(&vm_fd, use_hugepages, performance_mode)?;
(alloc.guest_mem, Some(layout), Some(alloc.reservation))
}
None => {
let placeholder = GuestMemoryMmap::<()>::from_ranges(&[(GuestAddress(0), 4096)])
.context("allocate placeholder guest memory")?;
(placeholder, None, None)
}
};
let base_cpuid = kvm
.get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
.context("get_supported_cpuid")?;
// When memory was laid out, check its top against the physical-address
// width taken from the low byte of EAX in CPUID leaf 0x8000_0008
// (36 is assumed when that leaf is absent).
if let Some(layout) = &numa_layout {
let phys_bits = base_cpuid
.as_slice()
.iter()
.find(|e| e.function == 0x8000_0008)
.map(|e| e.eax & 0xff)
.unwrap_or(36);
if let Some(top) = layout.ram_top_exceeds_phys_bits(phys_bits) {
return Err(anyhow::Error::new(
crate::vmm::host_topology::ResourceContention {
reason: format!(
"guest RAM top {top:#x} exceeds the guest MAXPHYADDR \
(1<<{phys_bits}); this host's physical-address \
width cannot back a VM this large without the guest \
silently truncating RAM"
),
},
));
}
}
// Host limit checks: total vCPU count, then the largest vCPU id (the APIC ID).
let total = topo.total_cpus();
let max_vcpus = kvm.get_max_vcpus();
if total as usize > max_vcpus {
return Err(anyhow::Error::new(
crate::vmm::host_topology::ResourceContention {
reason: format!(
"topology requires {total} vCPUs but this host's \
KVM_CAP_MAX_VCPUS is {max_vcpus}; cannot run a VM this wide"
),
},
));
}
let max_vcpu_id = kvm.get_max_vcpu_id();
if (max_apic_id as usize) >= max_vcpu_id {
return Err(anyhow::Error::new(
crate::vmm::host_topology::ResourceContention {
reason: format!(
"topology's max APIC ID {max_apic_id} (the KVM vcpu_id) is \
>= this host's KVM_CAP_MAX_VCPU_ID {max_vcpu_id}; cannot \
create a vCPU at that ID"
),
},
));
}
// Create each vCPU with its APIC ID as the KVM vCPU id and install the
// CPUID generated for that CPU from the host-supported base.
let mut vcpus = Vec::with_capacity(total as usize);
for cpu_id in 0..total {
let aid = apic_id(&topo, cpu_id);
let vcpu = vm_fd.create_vcpu(aid as u64).map_err(|e| {
crate::vmm::map_transient_to_contention(
e,
format!("create vCPU cpu_id={cpu_id} apic_id={aid}"),
)
})?;
let cpuid_entries =
generate_cpuid(base_cpuid.as_slice(), &topo, cpu_id, performance_mode);
let cpuid = kvm_bindings::CpuId::from_entries(&cpuid_entries).context("build CpuId")?;
vcpu.set_cpuid2(&cpuid)
.with_context(|| format!("set CPUID for vCPU {cpu_id}"))?;
vcpus.push(vcpu);
}
// Performance mode only: probe TSC stability by reading the VM clock,
// writing it back with the flags cleared, re-reading it and checking
// KVM_CLOCK_TSC_STABLE. Every failure on this path is a warning, not an error.
if performance_mode {
match vm_fd.get_clock() {
Ok(clock) => {
let mut set_data = clock;
set_data.flags = 0;
if let Err(e) = vm_fd.set_clock(&set_data) {
eprintln!(
"performance_mode: WARNING: KVM_SET_CLOCK failed ({e}), \
cannot check TSC stability"
);
} else {
match vm_fd.get_clock() {
Ok(clock2) => {
if clock2.flags & KVM_CLOCK_TSC_STABLE == 0 {
eprintln!(
"performance_mode: WARNING: TSC not stable \
(KVM_CLOCK_TSC_STABLE not set), \
timing measurements may have higher variance \
(nested virt?)."
);
}
}
Err(e) => {
eprintln!(
"performance_mode: WARNING: KVM_GET_CLOCK failed ({e}), \
cannot check TSC stability"
);
}
}
}
}
Err(e) => {
eprintln!(
"performance_mode: WARNING: KVM_GET_CLOCK failed ({e}), \
cannot check TSC stability"
);
}
}
}
Ok(KtstrKvm {
kvm: ManuallyDrop::new(kvm),
vm_fd: ManuallyDrop::new(vm_fd),
vcpus,
guest_mem: ManuallyDrop::new(guest_mem),
topology: topo,
numa_layout,
has_immediate_exit,
split_irqchip,
ioapic,
use_hugepages,
performance_mode,
_reservation: reservation,
cow_overlay_guards: Vec::new(),
})
}
}
/// Issues `KVM_GET_CLOCK` directly on a raw file descriptor and returns the
/// clock data the kernel filled in.
///
/// # Errors
/// Returns the OS error when the ioctl reports a negative result.
pub(crate) fn kvm_get_clock_via_raw_fd(
    vm_fd: i32,
) -> std::io::Result<kvm_bindings::kvm_clock_data> {
    const KVM_GET_CLOCK_IOCTL: libc::c_ulong = 0x8030_ae7c;
    // The hard-coded request number is only valid for a 48-byte payload.
    const _: () = assert!(std::mem::size_of::<kvm_bindings::kvm_clock_data>() == 48);

    let mut out = kvm_bindings::kvm_clock_data::default();
    let out_ptr: *mut kvm_bindings::kvm_clock_data = &mut out;
    // SAFETY: `out_ptr` points at a live, properly sized local that outlives
    // the call. NOTE(review): `vm_fd` is assumed to be a KVM VM fd — the
    // caller is responsible for that.
    let status = unsafe { libc::ioctl(vm_fd, KVM_GET_CLOCK_IOCTL, out_ptr) };
    if status >= 0 {
        Ok(out)
    } else {
        Err(std::io::Error::last_os_error())
    }
}
/// Issues `KVM_SET_CLOCK` directly on a raw file descriptor with the given
/// clock data.
///
/// # Errors
/// Returns the OS error when the ioctl reports a negative result.
pub(crate) fn kvm_set_clock_via_raw_fd(
    vm_fd: i32,
    clock: &kvm_bindings::kvm_clock_data,
) -> std::io::Result<()> {
    // The request number below encodes a 48-byte payload (0x30 in its size
    // field). Guard it at compile time, as the GET and GSI-routing helpers do,
    // so a change in the bindings' struct size cannot silently make the
    // kernel read a mis-sized buffer.
    const _: () = assert!(std::mem::size_of::<kvm_bindings::kvm_clock_data>() == 48);
    const KVM_SET_CLOCK_IOCTL: libc::c_ulong = 0x4030_ae7b;
    // SAFETY: the pointer is derived from a shared reference that is valid
    // for the duration of the call, and the payload size matches the request
    // number (asserted above). NOTE(review): `vm_fd` is assumed to be a KVM
    // VM fd — the caller is responsible for that.
    let rc = unsafe {
        libc::ioctl(
            vm_fd,
            KVM_SET_CLOCK_IOCTL,
            clock as *const kvm_bindings::kvm_clock_data,
        )
    };
    if rc < 0 {
        Err(std::io::Error::last_os_error())
    } else {
        Ok(())
    }
}
/// Issues `KVM_SET_GSI_ROUTING` directly on a raw file descriptor, passing
/// the routing table's flexible-array header.
///
/// # Errors
/// Returns the OS error when the ioctl reports a negative result.
pub(crate) fn kvm_set_gsi_routing_via_raw_fd(
    vm_fd: i32,
    routing: &KvmIrqRouting,
) -> std::io::Result<()> {
    const KVM_SET_GSI_ROUTING_IOCTL: libc::c_ulong = 0x4008_AE6A;
    // The hard-coded request number is only valid for an 8-byte header.
    const _: () = assert!(std::mem::size_of::<kvm_irq_routing>() == 8);

    let table_ptr = routing.as_fam_struct_ref() as *const kvm_irq_routing;
    // SAFETY: `table_ptr` comes from a reference into `routing`, which is
    // borrowed for the whole call. NOTE(review): `vm_fd` is assumed to be a
    // KVM VM fd — the caller is responsible for that.
    let status = unsafe { libc::ioctl(vm_fd, KVM_SET_GSI_ROUTING_IOCTL, table_ptr) };
    if status >= 0 {
        Ok(())
    } else {
        Err(std::io::Error::last_os_error())
    }
}
/// Builds a KVM routing table with one MSI entry per `(gsi, route)` pair,
/// in the same order as `routes`.
///
/// # Errors
/// Fails when the routing table cannot be allocated for `routes.len()` entries.
fn build_device_msi_routing(routes: &[(u32, MsiRoute)]) -> Result<KvmIrqRouting> {
    let count = routes.len();
    let mut table = KvmIrqRouting::new(count).map_err(|e| {
        anyhow::anyhow!(
            "allocate kvm_irq_routing for {} routes: {e:?}",
            count
        )
    })?;

    // The table was sized from `routes`, so the two sequences pair up one-to-one.
    for (entry, (gsi, route)) in table.as_mut_slice().iter_mut().zip(routes) {
        let msi = kvm_irq_routing_msi {
            address_lo: route.address_lo,
            address_hi: route.address_hi,
            data: route.data,
            __bindgen_anon_1: kvm_irq_routing_msi__bindgen_ty_1 { pad: 0 },
        };
        *entry = kvm_irq_routing_entry {
            gsi: *gsi,
            type_: KVM_IRQ_ROUTING_MSI,
            flags: 0,
            pad: 0,
            u: kvm_irq_routing_entry__bindgen_ty_1 { msi },
        };
    }
    Ok(table)
}
/// Couples the shared IOAPIC model with the raw VM fd so that register
/// writes which change routing can be pushed to KVM.
pub(crate) struct IoapicHandle {
// Shared IOAPIC model that MMIO accesses and EOIs are forwarded to.
ioapic: Arc<PiMutex<Ioapic>>,
// Raw VM file descriptor used for the KVM_SET_GSI_ROUTING ioctl.
vm_fd_raw: i32,
// Number of routing installs that returned an error.
routing_failures: std::sync::atomic::AtomicU64,
// Route set most recently installed successfully; `None` until the first
// successful install. Used to skip installs that would change nothing.
last_installed: PiMutex<Option<Vec<(u32, MsiRoute)>>>,
}
impl IoapicHandle {
pub(crate) fn new(ioapic: Arc<PiMutex<Ioapic>>, vm_fd_raw: i32) -> Self {
IoapicHandle {
ioapic,
vm_fd_raw,
routing_failures: std::sync::atomic::AtomicU64::new(0),
last_installed: PiMutex::new(None),
}
}
/// Forwards an MMIO read at `offset` to the IOAPIC model, filling `data`.
pub(crate) fn mmio_read(&self, offset: u64, data: &mut [u8]) {
    // The lock is held only for the duration of this one call.
    let out = data;
    self.ioapic.lock().mmio_read(offset, out);
}
pub(crate) fn mmio_write(&self, offset: u64, data: &[u8]) -> Result<()> {
let fd = self.vm_fd_raw;
self.mmio_write_with(offset, data, move |routing| {
kvm_set_gsi_routing_via_raw_fd(fd, routing)
})
}
/// Applies an MMIO write to the IOAPIC model and, when the model reports
/// that the write needs a routing update, hands the rebuilt table to `install`.
///
/// The install is skipped when the resulting route set equals the one most
/// recently installed successfully. That remembered set is updated only
/// after `install` succeeds, so a failed install is retried by the next
/// identical write.
///
/// # Errors
/// Fails when the routing table cannot be built, or when `install` returns
/// an error (which also increments the failure counter).
fn mmio_write_with(
    &self,
    offset: u64,
    data: &[u8],
    install: impl FnOnce(&KvmIrqRouting) -> std::io::Result<()>,
) -> Result<()> {
    // Hold the IOAPIC lock only long enough to apply the write and snapshot
    // the routes.
    let snapshot = {
        let mut device = self.ioapic.lock();
        device
            .mmio_write(offset, data)
            .then(|| device.gsi_routes())
    };
    let Some(routes) = snapshot else {
        return Ok(());
    };

    let mut cached = self.last_installed.lock();
    if cached.as_deref() == Some(routes.as_slice()) {
        return Ok(());
    }

    let table = build_device_msi_routing(&routes)?;
    install(&table).map_err(|e| {
        self.routing_failures
            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        anyhow::anyhow!("KVM_SET_GSI_ROUTING: {e}")
    })?;

    *cached = Some(routes);
    Ok(())
}
/// Forwards an end-of-interrupt for `vector` to the IOAPIC model.
///
/// The model's return value is not acted on; debug builds assert that it is
/// empty.
pub(crate) fn eoi(&self, vector: u8) {
    let needs_reinjection = self.ioapic.lock().end_of_interrupt(vector);
    let count = needs_reinjection.len();
    debug_assert!(
        needs_reinjection.is_empty(),
        "v0 IOAPIC is edge-only but EOI returned {} pin(s) needing level \
         re-injection — a level-triggered device was added without \
         completing level support (re-injection is dropped here)",
        count
    );
}
/// Returns how many routing installs have failed so far.
pub(crate) fn routing_failures(&self) -> u64 {
    let counter = &self.routing_failures;
    counter.load(std::sync::atomic::Ordering::Relaxed)
}
/// Returns the offset of `addr` from `IOAPIC_BASE` when it falls inside the
/// `IOAPIC_SIZE`-byte window starting there, and `None` otherwise.
pub(crate) fn in_range(&self, addr: u64) -> Option<u64> {
    // Addresses below the base fail the subtraction; addresses at or past
    // the end of the window fail the filter.
    addr.checked_sub(IOAPIC_BASE)
        .filter(|offset| *offset < IOAPIC_SIZE)
}
}
#[cfg(test)]
mod tests {
use super::*;
/// Drives register writes through `mmio_write_with` with a counting fake
/// installer and checks that identical route sets are installed once, and
/// that a failed install is counted and does not suppress the retry.
#[test]
fn ioapic_handle_dedups_install_and_caches_on_success_only() {
    use crate::vmm::x86_64::ioapic::{IOREGSEL, IOWIN, REG_REDTBL_BASE};
    use std::cell::Cell;

    let device = std::sync::Arc::new(PiMutex::new(Ioapic::new()));
    let handle = IoapicHandle::new(device, -1);
    // Register index written to IOREGSEL before each IOWIN data write.
    let rte_reg = (REG_REDTBL_BASE + 2 * 6) as u8;
    let install_calls = Cell::new(0u32);

    let write = |reg: u64, bytes: &[u8], succeed: bool| -> Result<()> {
        handle.mmio_write_with(reg, bytes, |_table| {
            install_calls.set(install_calls.get() + 1);
            if succeed {
                Ok(())
            } else {
                Err(std::io::Error::other("injected install failure"))
            }
        })
    };
    // Select the register, then write `value` through the data window.
    let program = |value: u32, succeed: bool| -> Result<()> {
        write(IOREGSEL, &[rte_reg], true).unwrap();
        write(IOWIN, &value.to_le_bytes(), succeed)
    };

    program(0x40, true).unwrap();
    assert_eq!(
        install_calls.get(),
        1,
        "programming an unmasked RTE installs once"
    );

    program(0x40, true).unwrap();
    assert_eq!(
        install_calls.get(),
        1,
        "a redundant RTE rewrite must dedup (no second install)"
    );

    assert!(
        program(0x50, false).is_err(),
        "an injected install failure propagates as an error"
    );
    assert_eq!(install_calls.get(), 2, "the changed RTE attempts an install");
    assert_eq!(
        handle.routing_failures(),
        1,
        "the failed install is counted"
    );

    program(0x50, true).unwrap();
    assert_eq!(
        install_calls.get(),
        3,
        "a failed install must not poison the cache — the identical retry re-installs"
    );
}
/// Checks that each input route becomes one MSI entry carrying the same GSI,
/// address and data, in input order.
#[test]
fn build_device_msi_routing_lays_out_fam_entries() {
    let first = MsiRoute {
        address_lo: 0xFEE0_1004,
        address_hi: 0x0000_0100,
        data: 0x0000_8030,
    };
    let second = MsiRoute {
        address_lo: 0xFEE0_2000,
        address_hi: 0x0000_0000,
        data: 0x0000_0040,
    };
    let routes = vec![(4u32, first), (6u32, second)];

    let mut table = build_device_msi_routing(&routes).expect("build routing");
    let entries = table.as_mut_slice();
    assert_eq!(entries.len(), 2, "one FAM entry per route");

    for (i, (entry, (gsi, expected))) in entries.iter().zip(routes.iter()).enumerate() {
        assert_eq!(entry.gsi, *gsi, "entry {i} gsi");
        assert_eq!(entry.type_, KVM_IRQ_ROUTING_MSI, "entry {i} type is MSI");
        assert_eq!(entry.flags, 0, "entry {i} flags");
        // SAFETY: the entry was written through the `msi` member of the union.
        let actual = unsafe { entry.u.msi };
        assert_eq!(actual.address_lo, expected.address_lo, "entry {i} address_lo");
        assert_eq!(actual.address_hi, expected.address_hi, "entry {i} address_hi");
        assert_eq!(actual.data, expected.data, "entry {i} data");
    }
}
/// An empty route list must still produce a routing table, with no entries.
#[test]
fn build_device_msi_routing_empty_is_valid() {
    let no_routes: &[(u32, MsiRoute)] = &[];
    let mut table = build_device_msi_routing(no_routes).expect("empty routing");
    let entry_count = table.as_mut_slice().len();
    assert_eq!(entry_count, 0, "no entries for empty set");
}
use std::os::fd::AsRawFd;
use vm_memory::GuestMemory;
/// A 1-LLC, 2-core, 1-thread topology yields a VM with two vCPUs.
#[test]
fn create_vm_basic() {
    let two_cores = Topology {
        llcs: 1,
        cores_per_llc: 2,
        threads_per_core: 1,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    let created = KtstrKvm::new(two_cores, 128, false);
    assert!(created.is_ok(), "VM creation failed: {:?}", created.err());
    let machine = created.unwrap();
    assert_eq!(machine.vcpus.len(), 2);
}
/// A 2-LLC, 2-core, 2-thread topology yields a VM with eight vCPUs.
#[test]
fn create_vm_multi_llc() {
    let two_llcs = Topology {
        llcs: 2,
        cores_per_llc: 2,
        threads_per_core: 2,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    let created = KtstrKvm::new(two_llcs, 256, false);
    assert!(
        created.is_ok(),
        "multi-LLC VM creation failed: {:?}",
        created.err()
    );
    let machine = created.unwrap();
    assert_eq!(machine.vcpus.len(), 8);
}
/// A single-CPU topology yields a VM with exactly one vCPU.
#[test]
fn create_vm_single_cpu() {
    let topo = Topology {
        llcs: 1,
        cores_per_llc: 1,
        threads_per_core: 1,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    let vm = KtstrKvm::new(topo, 64, false);
    // Report the underlying error on failure, like the sibling creation tests do.
    assert!(
        vm.is_ok(),
        "single-CPU VM creation failed: {:?}",
        vm.err()
    );
    assert_eq!(vm.unwrap().vcpus.len(), 1);
}
/// A 4-LLC, 4-core, 2-thread topology yields a VM with 32 vCPUs.
#[test]
fn create_vm_large_topology() {
    let wide = Topology {
        llcs: 4,
        cores_per_llc: 4,
        threads_per_core: 2,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    let created = KtstrKvm::new(wide, 512, false);
    assert!(
        created.is_ok(),
        "large topology failed: {:?}",
        created.err()
    );
    let vcpu_count = created.unwrap().vcpus.len();
    assert_eq!(vcpu_count, 32);
}
/// A 3-LLC, 3-core, 1-thread topology yields a VM with nine vCPUs.
#[test]
fn create_vm_odd_topology() {
    let three_by_three = Topology {
        llcs: 3,
        cores_per_llc: 3,
        threads_per_core: 1,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    let created = KtstrKvm::new(three_by_three, 128, false);
    assert!(
        created.is_ok(),
        "odd topology failed: {:?}",
        created.err()
    );
    let vcpu_count = created.unwrap().vcpus.len();
    assert_eq!(vcpu_count, 9);
}
/// The lengths of all guest memory regions add up to the requested 256 MiB.
#[test]
fn memory_size_correct() {
    use vm_memory::GuestMemoryRegion;
    let single_cpu = Topology {
        llcs: 1,
        cores_per_llc: 1,
        threads_per_core: 1,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    let machine = KtstrKvm::new(single_cpu, 256, false).unwrap();
    let region_bytes: u64 = machine
        .guest_mem
        .iter()
        .map(|region| region.len())
        .sum();
    assert_eq!(region_bytes, 256 << 20);
}
/// Pins the TSS address constant to its expected value.
#[test]
fn tss_address_matches_firecracker() {
    let expected: u64 = 0xfffb_d000;
    assert_eq!(KVM_TSS_ADDRESS, expected);
}
/// The identity map address sits three 4 KiB pages above the TSS address.
#[test]
fn identity_map_follows_tss() {
    let three_pages_above_tss = KVM_TSS_ADDRESS + 3 * 4096;
    assert_eq!(KVM_IDENTITY_MAP_ADDRESS, three_pages_above_tss);
    let absolute: u64 = 0xfffc_0000;
    assert_eq!(KVM_IDENTITY_MAP_ADDRESS, absolute);
}
/// The required-capability list is non-empty and has at least 14 entries.
#[test]
fn required_caps_non_empty() {
    let caps = REQUIRED_CAPS;
    assert!(!caps.is_empty());
    assert!(caps.len() >= 14);
}
/// A topology whose maximum APIC ID stays within `MAX_XAPIC_ID` does not
/// select the split IRQ chip.
#[test]
fn small_topology_uses_full_irqchip() {
    let small = Topology {
        llcs: 2,
        cores_per_llc: 4,
        threads_per_core: 2,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    let highest_apic_id = max_apic_id(&small);
    assert!(highest_apic_id <= MAX_XAPIC_ID);
    let machine = KtstrKvm::new(small, 256, false).unwrap();
    assert!(
        !machine.split_irqchip,
        "small topology should use full IRQ chip"
    );
}
/// A topology whose maximum APIC ID exceeds `MAX_XAPIC_ID` selects the split
/// IRQ chip; the test is skipped when the host cannot create such a VM.
#[test]
fn large_topology_uses_split_irqchip() {
    let huge = Topology {
        llcs: 14,
        cores_per_llc: 9,
        threads_per_core: 2,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    assert!(
        max_apic_id(&huge) > MAX_XAPIC_ID,
        "max APIC ID {} should exceed {}",
        max_apic_id(&huge),
        MAX_XAPIC_ID,
    );
    let machine = match KtstrKvm::new(huge, 4096, false) {
        Err(e) => {
            skip!("large_topology VM creation: {e:#}");
        }
        Ok(created) => created,
    };
    assert!(
        machine.split_irqchip,
        "large topology should use split IRQ chip"
    );
    assert_eq!(machine.vcpus.len(), 252);
}
/// Two topologies whose maximum APIC IDs stay within `MAX_XAPIC_ID` both
/// keep the full (non-split) IRQ chip.
#[test]
fn split_irqchip_boundary() {
    let eight_llcs = Topology {
        llcs: 8,
        cores_per_llc: 8,
        threads_per_core: 2,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    assert!(
        max_apic_id(&eight_llcs) <= MAX_XAPIC_ID,
        "8l/8c/2t max APIC ID {} should be <= 254",
        max_apic_id(&eight_llcs),
    );
    let first_vm = KtstrKvm::new(eight_llcs, 2048, false).unwrap();
    assert!(!first_vm.split_irqchip);

    let fifteen_llcs = Topology {
        llcs: 15,
        cores_per_llc: 8,
        threads_per_core: 2,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    assert!(
        max_apic_id(&fifteen_llcs) <= MAX_XAPIC_ID,
        "15l/8c/2t max APIC ID {} should be <= 254",
        max_apic_id(&fifteen_llcs),
    );
    let second_vm = KtstrKvm::new(fifteen_llcs, 4096, false).unwrap();
    assert!(!second_vm.split_irqchip);
}
/// The VM built by the shared single-vCPU helper reports the immediate-exit
/// capability.
#[test]
fn immediate_exit_cap_detected() {
    use crate::vmm::x86_64::test_helpers::single_vcpu_kvm;
    let machine = single_vcpu_kvm();
    let detected = machine.has_immediate_exit;
    assert!(detected);
}
/// VM creation succeeds with performance mode enabled.
#[test]
fn performance_mode_succeeds() {
    let two_cores = Topology {
        llcs: 1,
        cores_per_llc: 2,
        threads_per_core: 1,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    let created = KtstrKvm::new(two_cores, 128, true);
    assert!(
        created.is_ok(),
        "performance_mode VM creation failed: {:?}",
        created.err()
    );
}
/// The same topology produces the same number of vCPUs with and without
/// performance mode.
#[test]
fn performance_mode_does_not_affect_vcpu_count() {
    let shared_topology = Topology {
        llcs: 2,
        cores_per_llc: 2,
        threads_per_core: 2,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    let baseline = KtstrKvm::new(shared_topology, 256, false).unwrap();
    let tuned = KtstrKvm::new(shared_topology, 256, true).unwrap();
    assert_eq!(baseline.vcpus.len(), tuned.vcpus.len());
}
/// Pins the halt-poll interval constant to its expected value.
#[test]
fn halt_poll_ns_constant() {
    let expected: u64 = 200_000;
    assert_eq!(HALT_POLL_NS, expected);
}
/// VM creation succeeds on the non-performance path, which configures halt
/// polling.
#[test]
fn non_perf_mode_succeeds_with_halt_poll() {
    let two_cores = Topology {
        llcs: 1,
        cores_per_llc: 2,
        threads_per_core: 1,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    let created = KtstrKvm::new(two_cores, 128, false);
    assert!(
        created.is_ok(),
        "non-perf VM with halt poll failed: {:?}",
        created.err()
    );
}
/// Pins the numeric value of the HLT disable-exits flag from the bindings.
#[test]
fn disable_exits_hlt_bit_value() {
    let hlt_flag = KVM_X86_DISABLE_EXITS_HLT;
    assert_eq!(hlt_flag, 2);
}
/// The PAUSE and HLT disable-exits flags are different values that share no
/// bits.
#[test]
fn disable_exits_pause_and_hlt_no_overlap() {
    let pause = KVM_X86_DISABLE_EXITS_PAUSE;
    let hlt = KVM_X86_DISABLE_EXITS_HLT;
    assert_ne!(pause, hlt, "PAUSE and HLT bits must be distinct");
    assert_eq!(pause & hlt, 0, "PAUSE and HLT bits must not overlap");
}
/// Repeats the get / set-with-cleared-flags / get clock sequence on a
/// performance-mode VM and requires every call to succeed. The stability
/// flag itself is read but not asserted.
#[test]
fn tsc_stability_check_roundtrip() {
    let two_cores = Topology {
        llcs: 1,
        cores_per_llc: 2,
        threads_per_core: 1,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    let machine = KtstrKvm::new(two_cores, 64, true).unwrap();

    let mut request = machine.vm_fd.get_clock().unwrap();
    request.flags = 0;
    machine.vm_fd.set_clock(&request).unwrap();

    let reread = machine.vm_fd.get_clock().unwrap();
    let _ = reread.flags & KVM_CLOCK_TSC_STABLE;
}
/// Every field of a default-constructed `kvm_clock_data` is zero.
#[test]
fn kvm_clock_data_default_is_zeroed() {
    let zeroed = kvm_bindings::kvm_clock_data::default();
    assert_eq!(zeroed.clock, 0);
    assert_eq!(zeroed.flags, 0);
    assert_eq!(zeroed.pad0, 0);
    assert_eq!(zeroed.realtime, 0);
    assert_eq!(zeroed.host_tsc, 0);
    let empty_pad = [0u32; 4];
    assert_eq!(zeroed.pad, empty_pad);
}
/// `kvm_clock_data` is 48 bytes, the size the hard-coded ioctl request
/// numbers assume.
#[test]
fn kvm_clock_data_size_matches_ioctl_encoding() {
    let actual = std::mem::size_of::<kvm_bindings::kvm_clock_data>();
    assert_eq!(actual, 48);
}
/// The raw-fd `GET_CLOCK` helper, called right after the safe wrapper, reads
/// a clock that is not earlier and is less than one second later.
#[test]
fn raw_fd_get_clock_matches_safe_wrapper() {
    let single_cpu = Topology {
        llcs: 1,
        cores_per_llc: 1,
        threads_per_core: 1,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    let machine = KtstrKvm::new(single_cpu, 64, false).unwrap();
    let fd = machine.vm_fd.as_raw_fd();

    let via_safe = machine.vm_fd.get_clock().expect("safe GET_CLOCK");
    let via_raw = super::kvm_get_clock_via_raw_fd(fd).expect("raw GET_CLOCK");

    assert!(
        via_raw.clock >= via_safe.clock,
        "raw-fd GET regressed below safe GET (raw={}, safe={}) — ioctl number drift",
        via_raw.clock,
        via_safe.clock,
    );
    assert!(
        via_raw.clock - via_safe.clock < 1_000_000_000,
        "raw-fd vs safe GET differ by >1s (raw={}, safe={}) — likely different kernel state",
        via_raw.clock,
        via_safe.clock,
    );
}
/// Reads the clock through the raw-fd helper, writes it back with cleared
/// flags, and checks that a subsequent read has not gone backwards.
#[test]
fn raw_fd_set_clock_roundtrip_with_flags_zero() {
    let single_cpu = Topology {
        llcs: 1,
        cores_per_llc: 1,
        threads_per_core: 1,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    let machine = KtstrKvm::new(single_cpu, 64, false).unwrap();
    let fd = machine.vm_fd.as_raw_fd();

    let mut written = super::kvm_get_clock_via_raw_fd(fd).expect("raw GET_CLOCK");
    written.flags = 0;
    super::kvm_set_clock_via_raw_fd(fd, &written).expect("raw SET_CLOCK");

    let reread = super::kvm_get_clock_via_raw_fd(fd).expect("raw GET_CLOCK after");
    assert!(reread.clock >= written.clock);
}
/// VM creation succeeds in performance mode, the path that attempts to
/// disable HLT exits.
#[test]
fn performance_mode_with_hlt_disable_succeeds() {
    let two_cores = Topology {
        llcs: 1,
        cores_per_llc: 2,
        threads_per_core: 1,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    };
    let created = KtstrKvm::new(two_cores, 128, true);
    assert!(
        created.is_ok(),
        "performance_mode with HLT disable failed: {:?}",
        created.err()
    );
}
}