pub mod cgroup_sandbox;
pub mod console;
pub mod disk_config;
pub mod disk_template;
pub mod host_topology;
pub mod initramfs;
pub(crate) mod kvm_stats;
pub mod topology;
pub(crate) mod builder;
pub(crate) mod capture_numa;
pub(crate) mod capture_scx;
pub(crate) mod capture_tasks;
pub(crate) mod contention;
pub(crate) mod exit_dispatch;
pub(crate) mod freeze_coord;
pub(crate) mod initramfs_cache;
pub(crate) mod net_config;
pub(crate) mod numa_mem;
pub(crate) mod result;
pub(crate) mod rust_init;
pub(crate) mod setup;
pub(crate) mod vcpu;
pub(crate) mod virtio_blk;
pub(crate) mod virtio_console;
pub(crate) mod virtio_net;
pub(crate) mod bulk;
pub(crate) mod guest_comms;
pub(crate) mod host_comms;
pub mod wire;
mod memory_budget;
mod pi_mutex;
mod terminal;
mod vcpu_panic;
mod vmlinux;
#[allow(unused_imports)]
pub use net_config::NetConfig;
pub use virtio_blk::VirtioBlkCounters;
#[allow(unused_imports)]
pub use virtio_net::VirtioNetCounters;
pub use builder::KtstrVmBuilder;
#[allow(unused_imports)]
pub use result::KVM_INTERESTING_STATS;
pub use result::{KvmStatsTotals, VmResult};
pub(crate) use contention::{
create_vm_with_retry, host_resource_snapshot, map_transient_to_contention,
};
pub(crate) use pi_mutex::PiMutex;
pub(crate) use terminal::TerminalRawGuard;
pub(crate) use vcpu::{
BpfMapWriteParams, ImmediateExitHandle, register_vcpu_signal_handler, set_thread_cpumask,
vcpu_signal,
};
pub(crate) use vmlinux::find_vmlinux;
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
#[allow(unused_imports)]
pub use x86_64::acpi;
#[cfg(target_arch = "x86_64")]
#[allow(unused_imports)]
pub use x86_64::boot;
#[cfg(target_arch = "x86_64")]
pub use x86_64::kvm;
#[cfg(target_arch = "x86_64")]
#[allow(unused_imports)]
pub use x86_64::mptable;
#[cfg(target_arch = "aarch64")]
#[allow(unused_imports)]
pub use aarch64::boot;
#[cfg(target_arch = "aarch64")]
pub use aarch64::kvm;
pub use topology::Topology;
use anyhow::{Context, Result};
use std::path::PathBuf;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicI32, Ordering};
use std::thread::JoinHandle;
use std::time::{Duration, Instant};
// Guest-physical address where DRAM starts. x86_64 guests map RAM from 0;
// aarch64 uses the arch-defined kvm::DRAM_START base instead.
#[cfg(target_arch = "x86_64")]
const DRAM_BASE: u64 = 0;
#[cfg(target_arch = "aarch64")]
const DRAM_BASE: u64 = kvm::DRAM_START;
/// A fully-configured micro-VM, ready to be launched with [`KtstrVm::run`]
/// (batch mode) or [`KtstrVm::run_interactive`] (terminal attached to the
/// guest's virtio-console). Construct via [`KtstrVm::builder`].
pub struct KtstrVm {
    /// Kernel image to boot.
    pub(crate) kernel: PathBuf,
    /// Optional custom init binary bundled into the initramfs.
    pub(crate) init_binary: Option<PathBuf>,
    /// Optional sched_ext scheduler binary bundled into the guest.
    pub(crate) scheduler_binary: Option<PathBuf>,
    /// Arguments forwarded to the guest workload (see `suffix_params`).
    pub(crate) run_args: Vec<String>,
    /// Arguments forwarded to the scheduler binary.
    pub(crate) sched_args: Vec<String>,
    /// Virtual CPU/cache/NUMA topology presented to the guest.
    pub(crate) topology: Topology,
    /// Guest RAM in MiB; `None` presumably selects a derived default — see builder.
    pub(crate) memory_mb: Option<u32>,
    /// Lower bound on guest RAM in MiB.
    pub(crate) memory_min_mb: u32,
    /// Extra text appended to the kernel command line.
    pub(crate) cmdline_extra: String,
    /// Wall-clock cap for a batch run (interactive runs use a 24 h cap).
    pub(crate) timeout: Duration,
    /// Thresholds for the in-host monitor, when enabled.
    pub(crate) monitor_thresholds: Option<crate::monitor::MonitorThresholds>,
    /// Override written into the guest's sched_ext watchdog (see tests).
    pub(crate) watchdog_timeout: Option<Duration>,
    /// BPF map writes to apply to the running guest scheduler.
    pub(crate) bpf_map_writes: Vec<BpfMapWriteParams>,
    // performance_mode and no_perf_mode select which host-resource locking
    // strategy acquire_run_locks() uses; they are mutually exclusive by
    // construction in the builder (not enforced here).
    pub(crate) performance_mode: bool,
    pub(crate) no_perf_mode: bool,
    /// Host CPU pinning plan used in performance mode (exclusive LLC locks).
    pub(crate) pinning_plan: Option<host_topology::PinningPlan>,
    /// Per-region host NUMA nodes for guest memory binding.
    pub(crate) mbind_node_map: Vec<Vec<usize>>,
    #[allow(dead_code)]
    /// Plan used in no-perf mode (shared LLC locks + a default CPU mask).
    pub(crate) no_perf_plan: Option<host_topology::LlcPlan>,
    /// Host topology snapshot, when it could be read from sysfs.
    pub(crate) host_topo: Option<host_topology::HostTopology>,
    /// Commands run in the guest around scheduler enable/disable.
    pub(crate) sched_enable_cmds: Vec<String>,
    pub(crate) sched_disable_cmds: Vec<String>,
    /// (guest path, host path) pairs added to the initramfs — TODO confirm ordering.
    pub(crate) include_files: Vec<(String, PathBuf)>,
    /// virtio-blk disks to attach.
    pub(crate) disks: Vec<disk_config::DiskConfig>,
    /// Optional virtio-net configuration.
    pub(crate) network: Option<net_config::NetConfig>,
    /// Staging image used by the disk-template flow, if any.
    pub(crate) template_staging_image: Option<PathBuf>,
    /// Whether to bundle busybox into the guest image.
    pub(crate) busybox: bool,
    /// Mirror the guest kernel console (COM1) to host stderr live
    /// during interactive runs instead of dumping it at the end.
    pub(crate) dmesg: bool,
    /// When set, run_interactive executes this command instead of a shell.
    pub(crate) exec_cmd: Option<String>,
    // Auxiliary binaries bundled for allocation-related tests — presumably
    // consumed by the initramfs builder; confirm there.
    pub(crate) jemalloc_probe_binary: Option<PathBuf>,
    pub(crate) jemalloc_alloc_worker_binary: Option<PathBuf>,
    /// Where to dump state on failure, if requested.
    pub(crate) failure_dump_path: Option<PathBuf>,
    /// Enable the dual-snapshot capture mode.
    pub(crate) dual_snapshot: bool,
}
/// Host-resource locks held for the duration of one VM run.
struct RunLocks {
    // Held purely for their Drop side effect (releasing the lock fds when
    // the run finishes); never read.
    #[allow(dead_code)]
    locks: Vec<std::os::fd::OwnedFd>,
    /// In the default (non-pinned) mode, the host CPUs this run was granted;
    /// passed to run_vm as the default affinity mask.
    default_cpu_mask: Option<Vec<usize>>,
}
impl KtstrVm {
pub fn builder() -> KtstrVmBuilder {
KtstrVmBuilder::default()
}
fn suffix_params(&self) -> initramfs::SuffixParams<'_> {
initramfs::SuffixParams {
args: &self.run_args,
sched_args: &self.sched_args,
sched_enable: &self.sched_enable_cmds,
sched_disable: &self.sched_disable_cmds,
exec_cmd: self.exec_cmd.as_deref(),
}
}
    /// Boot the VM, run it to completion (or timeout) and collect results.
    ///
    /// Setup is overlapped: the initramfs is resolved on a background
    /// thread while the VM and kernel are being created.
    pub fn run(&self) -> Result<VmResult> {
        let start = Instant::now();
        let initramfs_handle = self.spawn_initramfs_resolve();
        let (mut vm, kernel_result) = self.create_vm_and_load_kernel()?;
        // Arch-specific guest memory layout + vCPU register setup.
        #[cfg(target_arch = "x86_64")]
        let _kernel_result = {
            let kr = self.setup_memory(&mut vm, kernel_result, initramfs_handle)?;
            self.setup_vcpus(&vm, kr.entry)?;
            kr
        };
        #[cfg(target_arch = "aarch64")]
        let _kernel_result = {
            let kr = self.setup_memory_aarch64(&mut vm, kernel_result, initramfs_handle)?;
            self.setup_vcpus_aarch64(&vm, kr.entry)?;
            kr
        };
        // Binary stats fds are optional (KVM_GET_STATS_FD needs a recent kernel);
        // a None context just disables stats collection.
        let stats_ctx = kvm_stats::open_stats_context(&vm.vcpus);
        if stats_ctx.is_none() {
            tracing::debug!("KVM_GET_STATS_FD not supported, skipping stats collection");
        }
        tracing::debug!(elapsed_us = start.elapsed().as_micros(), "total_setup");
        let run_start = Instant::now();
        // Host-resource locks are held across the whole guest execution and
        // explicitly dropped before result collection.
        let run_locks = self.acquire_run_locks()?;
        let run = self.run_vm(run_start, vm, run_locks.default_cpu_mask.as_deref())?;
        drop(run_locks);
        let mut result = self.collect_results(start, run)?;
        // Read stats last so totals cover the full guest lifetime.
        if let Some(ctx) = stats_ctx {
            result.kvm_stats = Some(ctx.read_stats());
        }
        Ok(result)
    }
fn acquire_run_locks(&self) -> Result<RunLocks> {
if self.no_perf_mode {
if let Some(ref plan) = self.no_perf_plan {
let stub = host_topology::PinningPlan {
assignments: Vec::new(),
service_cpu: None,
llc_indices: plan.locked_llcs.clone(),
locks: Vec::new(),
};
match host_topology::acquire_resource_locks(
&stub,
&stub.llc_indices,
host_topology::LlcLockMode::Shared,
)? {
host_topology::LockOutcome::Acquired { locks, .. } => Ok(RunLocks {
locks,
default_cpu_mask: None,
}),
host_topology::LockOutcome::Unavailable(reason) => {
Err(anyhow::Error::new(host_topology::ResourceContention {
reason,
}))
}
}
} else {
Ok(RunLocks {
locks: Vec::new(),
default_cpu_mask: None,
})
}
} else if self.performance_mode {
if let Some(ref plan) = self.pinning_plan {
match host_topology::acquire_resource_locks(
plan,
&plan.llc_indices,
host_topology::LlcLockMode::Exclusive,
)? {
host_topology::LockOutcome::Acquired { locks, .. } => Ok(RunLocks {
locks,
default_cpu_mask: None,
}),
host_topology::LockOutcome::Unavailable(reason) => {
Err(anyhow::Error::new(host_topology::ResourceContention {
reason,
}))
}
}
} else {
Ok(RunLocks {
locks: Vec::new(),
default_cpu_mask: None,
})
}
} else {
let total_cpus = self.topology.total_cpus() as usize;
let host_cpus = self
.host_topo
.as_ref()
.map(|h| h.total_cpus())
.unwrap_or(total_cpus);
let result =
host_topology::acquire_cpu_locks(total_cpus, host_cpus, self.host_topo.as_ref())?;
Ok(RunLocks {
locks: result.locks,
default_cpu_mask: Some(result.cpus),
})
}
}
    /// Boot the VM and attach the host terminal to the guest's
    /// virtio-console, either as an interactive shell or to run
    /// `exec_cmd` non-interactively.
    ///
    /// Spawns helper threads: stdin reader (with a Ctrl-A x escape to
    /// terminate), stdout writer draining the virtio-console, and — when
    /// `dmesg` is set — a thread mirroring COM1 to stderr. The calling
    /// thread becomes the BSP vCPU loop.
    pub fn run_interactive(&self) -> Result<()> {
        let start = Instant::now();
        let initramfs_handle = self.spawn_initramfs_resolve();
        let (mut vm, kernel_result) = self.create_vm_and_load_kernel()?;
        #[cfg(target_arch = "x86_64")]
        {
            let kr = self.setup_memory(&mut vm, kernel_result, initramfs_handle)?;
            self.setup_vcpus(&vm, kr.entry)?;
        }
        #[cfg(target_arch = "aarch64")]
        {
            let kr = self.setup_memory_aarch64(&mut vm, kernel_result, initramfs_handle)?;
            self.setup_vcpus_aarch64(&vm, kr.entry)?;
        }
        // Two emulated UARTs (COM1: kernel console, COM2: app output) plus a
        // virtio-console carrying the interactive byte stream.
        let com1 = Arc::new(PiMutex::new(console::Serial::new(console::COM1_BASE)));
        let com2 = Arc::new(PiMutex::new(console::Serial::new(console::COM2_BASE)));
        let mut vc = virtio_console::VirtioConsole::new();
        vc.set_mem((*vm.guest_mem).clone());
        let virtio_con = Arc::new(PiMutex::new(vc));
        // Interactive mode needs irqfd delivery; split-irqchip has no IOAPIC
        // to route those interrupts, so bail out early with an explanation.
        #[cfg(target_arch = "x86_64")]
        if vm.split_irqchip {
            anyhow::bail!(
                "interactive shell requires irqfd; split-irqchip mode \
                 has no IOAPIC and the guest's serial / virtio-console \
                 drivers have no polling fallback — reduce topology \
                 so all APIC IDs are at or below 254 (MAX_XAPIC_ID)",
            );
        }
        #[cfg(target_arch = "x86_64")]
        {
            vm.vm_fd
                .register_irqfd(com1.lock().irq_evt(), console::COM1_IRQ)
                .context("register COM1 irqfd")?;
            vm.vm_fd
                .register_irqfd(com2.lock().irq_evt(), console::COM2_IRQ)
                .context("register COM2 irqfd")?;
            vm.vm_fd
                .register_irqfd(virtio_con.lock().irq_evt(), kvm::VIRTIO_CONSOLE_IRQ)
                .context("register virtio-console irqfd")?;
        }
        #[cfg(target_arch = "aarch64")]
        {
            vm.vm_fd
                .register_irqfd(com1.lock().irq_evt(), kvm::SERIAL_IRQ)
                .context("register serial irqfd")?;
            vm.vm_fd
                .register_irqfd(com2.lock().irq_evt(), kvm::SERIAL2_IRQ)
                .context("register serial2 irqfd")?;
            vm.vm_fd
                .register_irqfd(virtio_con.lock().irq_evt(), kvm::VIRTIO_CONSOLE_IRQ)
                .context("register virtio-console irqfd")?;
        }
        let virtio_blk = self.init_virtio_blk(&vm)?;
        let virtio_net = self.init_virtio_net(&vm)?;
        let exec_mode = self.exec_cmd.is_some();
        // Raw terminal mode only makes sense for a real interactive shell.
        if !exec_mode {
            use std::os::unix::io::AsRawFd;
            let stdin_fd = std::io::stdin().as_raw_fd();
            let borrowed = unsafe { std::os::unix::io::BorrowedFd::borrow_raw(stdin_fd) };
            anyhow::ensure!(
                nix::unistd::isatty(borrowed).unwrap_or(false),
                "stdin must be a terminal for interactive shell mode",
            );
        }
        let _raw_guard = if exec_mode {
            None
        } else {
            Some(TerminalRawGuard::enter().context("failed to set terminal to raw mode")?)
        };
        // Pipe used to wake the stdin thread out of poll() at shutdown.
        let (wakeup_r, wakeup_w) = nix::unistd::pipe().context("create stdin wakeup pipe")?;
        let kill = Arc::new(AtomicBool::new(false));
        let kill_evt = Arc::new(
            vmm_sys_util::eventfd::EventFd::new(libc::EFD_NONBLOCK)
                .context("create shell kill eventfd")?,
        );
        let freeze = Arc::new(AtomicBool::new(false));
        let watchpoint =
            Arc::new(vcpu::WatchpointArm::new().context("create WatchpointArm.hit_evt EventFd")?);
        let bsp_parked = Arc::new(AtomicBool::new(false));
        let bsp_regs: Arc<std::sync::Mutex<Option<exit_dispatch::VcpuRegSnapshot>>> =
            Arc::new(std::sync::Mutex::new(None));
        let has_immediate_exit = vm.has_immediate_exit;
        // vCPU 0 (BSP) runs on this thread; the rest (APs) get their own threads.
        let mut vcpus = std::mem::take(&mut vm.vcpus);
        let mut bsp = vcpus.remove(0);
        let ap_pins = vec![None; vcpus.len()];
        let _run_locks = self.acquire_run_locks()?;
        let no_perf_mask: Option<&[usize]> = self.no_perf_plan.as_ref().map(|p| p.cpus.as_slice());
        let n_aps = vcpus.len();
        let ap_tid_slots: Vec<(Arc<AtomicI32>, Arc<crate::sync::Latch>)> = (0..n_aps)
            .map(|_| {
                (
                    Arc::new(AtomicI32::new(0)),
                    Arc::new(crate::sync::Latch::new()),
                )
            })
            .collect();
        let (ap_threads, _ap_freeze) = self.spawn_ap_threads(
            vcpus,
            has_immediate_exit,
            &com1,
            &com2,
            Some(&virtio_con),
            virtio_blk.as_ref(),
            virtio_net.as_ref(),
            &kill,
            &kill_evt,
            &freeze,
            &watchpoint,
            &ap_pins,
            no_perf_mask,
            &ap_tid_slots,
            None,
            None,
        )?;
        // Handle the stdin thread uses to force the BSP out of KVM_RUN on
        // Ctrl-A x (only available when KVM_CAP_IMMEDIATE_EXIT is supported).
        let bsp_ie_for_stdin = if has_immediate_exit {
            Some(ImmediateExitHandle::from_vcpu(&mut bsp))
        } else {
            None
        };
        let bsp_tid = unsafe { libc::pthread_self() };
        let vc_for_stdin = virtio_con.clone();
        let kill_for_stdin = kill.clone();
        // stdin reader: forwards keyboard bytes to the virtio-console, with a
        // screen-style Ctrl-A escape. Ctrl-A x/X terminates; Ctrl-A <other>
        // forwards the literal Ctrl-A followed by the other byte.
        let stdin_thread = std::thread::Builder::new()
            .name("interactive-stdin".into())
            .spawn(move || {
                use std::io::Read;
                use std::os::unix::io::{AsFd, AsRawFd};
                let wakeup_fd = wakeup_r;
                let stdin_fd = std::io::stdin().as_raw_fd();
                let mut buf = [0u8; 4096];
                // True when the previous byte (possibly in an earlier read) was Ctrl-A.
                let mut saw_ctrl_a = false;
                loop {
                    if kill_for_stdin.load(Ordering::Acquire) {
                        break;
                    }
                    let stdin_borrowed =
                        unsafe { std::os::unix::io::BorrowedFd::borrow_raw(stdin_fd) };
                    let wakeup_borrowed = wakeup_fd.as_fd();
                    let mut fds = [
                        nix::poll::PollFd::new(stdin_borrowed, nix::poll::PollFlags::POLLIN),
                        nix::poll::PollFd::new(wakeup_borrowed, nix::poll::PollFlags::POLLIN),
                    ];
                    // 100 ms timeout so the kill flag is re-checked periodically.
                    match nix::poll::poll(&mut fds, 100u16) {
                        Ok(0) => continue,
                        Err(nix::errno::Errno::EINTR) => continue,
                        Err(_) => break,
                        Ok(_) => {}
                    }
                    // Wakeup pipe readable => shutdown requested.
                    if fds[1]
                        .revents()
                        .is_some_and(|r| r.intersects(nix::poll::PollFlags::POLLIN))
                    {
                        break;
                    }
                    if fds[0]
                        .revents()
                        .is_some_and(|r| r.intersects(nix::poll::PollFlags::POLLIN))
                    {
                        let mut stdin = std::io::stdin().lock();
                        match stdin.read(&mut buf) {
                            Ok(0) => break, // EOF on stdin
                            Ok(n) => {
                                // forward_start marks the beginning of the run of
                                // bytes not yet queued to the guest.
                                let mut forward_start = 0usize;
                                for i in 0..n {
                                    if saw_ctrl_a {
                                        saw_ctrl_a = false;
                                        if buf[i] == b'x' || buf[i] == b'X' {
                                            // Ctrl-A x: request shutdown and kick
                                            // the BSP out of guest mode.
                                            eprintln!("\r\nTerminated.");
                                            kill_for_stdin.store(true, Ordering::Release);
                                            if let Some(ref ie) = bsp_ie_for_stdin {
                                                ie.set(1);
                                                std::sync::atomic::fence(Ordering::Release);
                                            }
                                            unsafe {
                                                libc::pthread_kill(bsp_tid, vcpu_signal());
                                            }
                                            return;
                                        }
                                        // Not an escape: flush what came before,
                                        // then forward the swallowed Ctrl-A itself.
                                        if forward_start < i {
                                            vc_for_stdin.lock().queue_input(&buf[forward_start..i]);
                                            forward_start = i;
                                        }
                                        vc_for_stdin.lock().queue_input(&[0x01]);
                                    }
                                    if buf[i] == 0x01 {
                                        // Hold back the Ctrl-A until the next byte
                                        // decides whether it is an escape.
                                        if forward_start < i {
                                            vc_for_stdin.lock().queue_input(&buf[forward_start..i]);
                                        }
                                        saw_ctrl_a = true;
                                        forward_start = i + 1;
                                        continue;
                                    }
                                }
                                if forward_start < n {
                                    vc_for_stdin.lock().queue_input(&buf[forward_start..n]);
                                }
                            }
                            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
                            Err(_) => break,
                        }
                    }
                }
            })
            .context("spawn stdin reader thread")?;
        let vc_for_stdout = virtio_con.clone();
        let kill_for_stdout = kill.clone();
        // stdout writer: waits on the console's tx eventfd and drains guest
        // output to the host terminal. Returns whether anything was written,
        // so exec mode can fall back to COM2 output when the console was silent.
        let stdout_thread: JoinHandle<bool> = std::thread::Builder::new()
            .name("interactive-stdout".into())
            .spawn(move || {
                use std::io::Write;
                let mut wrote_any = false;
                let tx_evt_raw_fd = {
                    let guard = vc_for_stdout.lock();
                    std::os::unix::io::AsRawFd::as_raw_fd(guard.tx_evt())
                };
                let mut stdout = std::io::stdout().lock();
                loop {
                    if kill_for_stdout.load(Ordering::Acquire) {
                        break;
                    }
                    let borrowed =
                        unsafe { std::os::unix::io::BorrowedFd::borrow_raw(tx_evt_raw_fd) };
                    let mut fds = [nix::poll::PollFd::new(
                        borrowed,
                        nix::poll::PollFlags::POLLIN,
                    )];
                    match nix::poll::poll(&mut fds, 50u16) {
                        Ok(0) => continue,
                        Err(nix::errno::Errno::EINTR) => continue,
                        Err(_) => break,
                        Ok(_) => {
                            let _ = vc_for_stdout.lock().tx_evt().read();
                        }
                    }
                    if kill_for_stdout.load(Ordering::Acquire) {
                        break;
                    }
                    let data = vc_for_stdout.lock().drain_output();
                    if !data.is_empty() {
                        // Only emit the UTF-8-valid prefix; a partial multi-byte
                        // sequence at the tail stays in the console buffer.
                        // NOTE(review): bytes past valid_up_to() are dropped from
                        // this drain, not re-queued — presumably re-delivered by
                        // drain_output on the next pass; confirm in virtio_console.
                        let valid_len = match std::str::from_utf8(&data) {
                            Ok(_) => data.len(),
                            Err(e) => e.valid_up_to(),
                        };
                        if valid_len > 0 {
                            if stdout.write_all(&data[..valid_len]).is_err()
                                || stdout.flush().is_err()
                            {
                                kill_for_stdout.store(true, Ordering::Release);
                                break;
                            }
                            wrote_any = true;
                        }
                    }
                }
                // Final drain after the kill flag: flush whatever is left.
                let data = vc_for_stdout.lock().drain_output();
                if !data.is_empty() {
                    let valid_len = match std::str::from_utf8(&data) {
                        Ok(_) => data.len(),
                        Err(e) => e.valid_up_to(),
                    };
                    if valid_len > 0 {
                        let _ = stdout.write_all(&data[..valid_len]);
                        let _ = stdout.flush();
                        wrote_any = true;
                    }
                }
                wrote_any
            })
            .context("spawn stdout writer thread")?;
        // Optional dmesg mirror: epoll on COM1's data eventfd plus a wakeup
        // eventfd, streaming kernel console bytes to stderr as they arrive.
        let (dmesg_thread, dmesg_wakeup_evt) = if self.dmesg {
            use std::os::unix::io::AsRawFd;
            use vmm_sys_util::epoll::{ControlOperation, Epoll, EpollEvent, EventSet};
            use vmm_sys_util::eventfd::{EFD_NONBLOCK, EventFd};
            let data_evt = com1
                .lock()
                .install_data_evt()
                .context("install COM1 dmesg data eventfd")?;
            let wakeup_evt =
                Arc::new(EventFd::new(EFD_NONBLOCK).context("create dmesg wakeup eventfd")?);
            let com1_for_dmesg = com1.clone();
            let kill_for_dmesg = kill.clone();
            let wakeup_for_thread = wakeup_evt.clone();
            const DATA_TOKEN: u64 = 0;
            const WAKEUP_TOKEN: u64 = 1;
            let handle = std::thread::Builder::new()
                .name("interactive-dmesg".into())
                .spawn(move || {
                    use std::io::Write;
                    let epoll = match Epoll::new() {
                        Ok(e) => e,
                        Err(e) => {
                            tracing::warn!(%e, "interactive-dmesg: epoll_create1 failed");
                            return;
                        }
                    };
                    if let Err(e) = epoll.ctl(
                        ControlOperation::Add,
                        data_evt.as_raw_fd(),
                        EpollEvent::new(EventSet::IN, DATA_TOKEN),
                    ) {
                        tracing::warn!(%e, "interactive-dmesg: add data_evt to epoll");
                        return;
                    }
                    if let Err(e) = epoll.ctl(
                        ControlOperation::Add,
                        wakeup_for_thread.as_raw_fd(),
                        EpollEvent::new(EventSet::IN, WAKEUP_TOKEN),
                    ) {
                        tracing::warn!(%e, "interactive-dmesg: add wakeup to epoll");
                        return;
                    }
                    let mut events = [EpollEvent::default(); 2];
                    loop {
                        if kill_for_dmesg.load(Ordering::Acquire) {
                            break;
                        }
                        // Block indefinitely; the wakeup eventfd breaks us out.
                        match epoll.wait(-1, &mut events) {
                            Ok(_) => {}
                            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
                            Err(e) => {
                                tracing::warn!(%e, "interactive-dmesg: epoll_wait failed");
                                break;
                            }
                        }
                        // Clear both eventfds regardless of which one fired.
                        let _ = data_evt.read();
                        let _ = wakeup_for_thread.read();
                        let data = com1_for_dmesg.lock().drain_output();
                        if !data.is_empty() {
                            let mut stderr = std::io::stderr().lock();
                            let _ = stderr.write_all(&data);
                            let _ = stderr.flush();
                        }
                    }
                    // Final drain after shutdown.
                    let data = com1_for_dmesg.lock().drain_output();
                    if !data.is_empty() {
                        let mut stderr = std::io::stderr().lock();
                        let _ = stderr.write_all(&data);
                        let _ = stderr.flush();
                    }
                })
                .context("spawn dmesg thread")?;
            (Some(handle), Some(wakeup_evt))
        } else {
            (None, None)
        };
        if let Some(mask) = self.no_perf_plan.as_ref().map(|p| p.cpus.as_slice()) {
            set_thread_cpumask(mask, "BSP (shell)");
        }
        register_vcpu_signal_handler();
        // Interactive sessions get an effectively-unbounded 24 h timeout.
        let interactive_timeout = Duration::from_secs(24 * 60 * 60);
        self.run_bsp_loop(
            &mut bsp,
            &com1,
            &com2,
            Some(&virtio_con),
            virtio_blk.as_ref(),
            virtio_net.as_ref(),
            &kill,
            &freeze,
            &watchpoint,
            &bsp_parked,
            &bsp_regs,
            has_immediate_exit,
            start,
            interactive_timeout,
            None,
            None,
            None,
            None,
            &std::sync::Arc::new(std::sync::atomic::AtomicU64::new(0)),
        );
        // Teardown: raise the kill flag, then wake every helper thread.
        kill.store(true, Ordering::Release);
        let _ = nix::unistd::write(&wakeup_w, &[0u8]);
        drop(wakeup_w);
        if let Some(ref evt) = dmesg_wakeup_evt {
            let _ = evt.write(1);
        }
        for vt in &ap_threads {
            if !vt.exited.load(Ordering::Acquire) {
                vt.kick();
            }
        }
        for vt in ap_threads {
            vt.wait_for_exit(Duration::from_secs(5));
            let _ = vt.handle.join();
        }
        let stdout_wrote = stdout_thread.join().unwrap_or(false);
        let _ = stdin_thread.join();
        if let Some(dt) = dmesg_thread {
            let _ = dt.join();
        }
        drop(dmesg_wakeup_evt);
        // Restore the terminal before printing any final output.
        drop(_raw_guard);
        // exec mode fallback: if the virtio-console never produced output,
        // surface the application's COM2 output instead.
        if exec_mode && !stdout_wrote {
            let app_output = com2.lock().output();
            if !app_output.is_empty() {
                use std::io::Write;
                let mut stdout = std::io::stdout().lock();
                let _ = stdout.write_all(app_output.as_bytes());
                let _ = stdout.flush();
            }
        }
        // Without live dmesg mirroring, dump the kernel console at the end.
        if !self.dmesg {
            let console_output = com1.lock().output();
            if !console_output.is_empty() {
                eprintln!("{console_output}");
            }
        }
        if !exec_mode {
            eprintln!("Connection to VM closed.");
        }
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // APs must come up in KVM's UNINITIALIZED mp_state so the guest's
    // INIT/SIPI sequence — not the host — starts them.
    #[test]
    #[cfg(target_arch = "x86_64")]
    fn ap_mp_state_set_correctly() {
        let topo = Topology {
            llcs: 2,
            cores_per_llc: 2,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm = kvm::KtstrKvm::new(topo, 128, false).unwrap();
        for vcpu in &vm.vcpus[1..] {
            let state = vcpu.get_mp_state().unwrap();
            assert_eq!(
                state.mp_state,
                kvm_bindings::KVM_MP_STATE_UNINITIALIZED,
                "AP should default to UNINITIALIZED"
            );
        }
    }

    // Smoke test: a single-CPU guest boots far enough to print to the console.
    #[test]
    fn boot_kernel_produces_output() {
        let kernel = crate::test_support::require_kernel();
        let vm = skip_on_contention!(
            KtstrVm::builder()
                .kernel(&kernel)
                .topology(1, 1, 1, 1)
                .memory_mb(256)
                .timeout(Duration::from_secs(10))
                .cmdline("loglevel=7")
                .build()
        );
        let result = vm.run().unwrap();
        assert!(
            result.stderr.contains("Linux") || result.stderr.contains("Booting"),
            "kernel console should contain boot messages"
        );
    }

    // Same smoke test with a 4-CPU SMP topology.
    #[test]
    fn boot_kernel_smp_topology() {
        let kernel = crate::test_support::require_kernel();
        let vm = skip_on_contention!(
            KtstrVm::builder()
                .kernel(&kernel)
                .topology(1, 2, 2, 1)
                .memory_mb(256)
                .timeout(Duration::from_secs(10))
                .cmdline("loglevel=7")
                .build()
        );
        let result = vm.run().unwrap();
        assert!(!result.stderr.is_empty(), "no console output from SMP boot");
    }

    // Not an assertion test: prints setup/boot/wall timings for two topologies.
    #[test]
    fn bench_boot_time() {
        let kernel = crate::test_support::require_kernel();
        for (label, llcs, cores, threads, mem) in [("1cpu", 1, 1, 1, 256), ("4cpu", 2, 2, 1, 512)] {
            let start = Instant::now();
            let vm = match KtstrVm::builder()
                .kernel(&kernel)
                .topology(1, llcs, cores, threads)
                .memory_mb(mem)
                .timeout(Duration::from_secs(10))
                .build()
            {
                Ok(vm) => vm,
                Err(e)
                    if e.downcast_ref::<host_topology::ResourceContention>()
                        .is_some() =>
                {
                    crate::report::test_skip(format_args!("{label}: resource contention: {e}"));
                    continue;
                }
                Err(e) => panic!("{e:#}"),
            };
            let setup = start.elapsed();
            let result = vm.run().unwrap();
            // Parse the guest-side boot time from the timestamp of the final
            // "Kernel panic" console line ("[  1.234567] ..." → ms).
            let boot_ms = result
                .stderr
                .lines()
                .rev()
                .find(|l| l.contains("Kernel panic") || l.contains("end Kernel panic"))
                .and_then(|l| {
                    l.trim()
                        .strip_prefix('[')
                        .and_then(|s| s.split(']').next())
                        .and_then(|s| s.trim().parse::<f64>().ok())
                })
                .map(|s| (s * 1000.0) as u64)
                .unwrap_or(0);
            eprintln!(
                "BENCH {label}: setup={:.0}ms kernel_boot={boot_ms}ms wall={:.0}ms timed_out={}",
                setup.as_millis(),
                result.duration.as_millis(),
                result.timed_out,
            );
        }
    }

    // The immediate-exit capability is assumed by the vCPU kick paths.
    #[test]
    fn kvm_has_immediate_exit_cap() {
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 1,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let vm = kvm::KtstrKvm::new(topo, 64, false).unwrap();
        assert!(
            vm.has_immediate_exit,
            "KVM_CAP_IMMEDIATE_EXIT should be available on modern kernels"
        );
    }

    // End-to-end monitor sanity: samples exist, runqueue clocks look sane,
    // watchdog roundtrip matches, and event counters stay unset without a
    // loaded scheduler.
    #[test]
    fn boot_kernel_with_monitor() {
        let kernel = crate::test_support::require_kernel();
        let _vmlinux = crate::test_support::require_vmlinux(&kernel);
        let vm = skip_on_contention!(
            KtstrVm::builder()
                .kernel(&kernel)
                .topology(1, 1, 2, 1)
                .memory_mb(256)
                .timeout(Duration::from_secs(5))
                .watchdog_timeout(Duration::from_secs(2))
                .build()
        );
        let result = vm.run().unwrap();
        let Some(ref report) = result.monitor else {
            return;
        };
        assert!(
            report.summary.total_samples > 0,
            "monitor should have collected at least one sample"
        );
        // rq_clock > 1ms marks a sample taken after the runqueues went live.
        let populated = report
            .samples
            .iter()
            .rev()
            .find(|s| s.cpus.iter().any(|c| c.rq_clock > 1_000_000))
            .expect(
                "no monitor sample showed populated runqueue data — every sample \
                 had all CPUs at rq_clock <= 1ms, \
                 or the monitor is reading the wrong rq offsets",
            );
        assert_eq!(
            populated.cpus.len(),
            2,
            "topology requested 2 CPUs but monitor saw {}",
            populated.cpus.len()
        );
        for (i, cpu) in populated.cpus.iter().enumerate() {
            if cpu.rq_clock <= 1_000_000 {
                continue;
            }
            assert!(
                cpu.rq_clock < 300_000_000_000,
                "cpu {i}: rq_clock must be < 300s (ns), got {}",
                cpu.rq_clock
            );
        }
        if let Some(ref obs) = report.watchdog_observation {
            assert_eq!(
                obs.expected_jiffies, obs.observed_jiffies,
                "watchdog write/read roundtrip mismatch: expected={} observed={}",
                obs.expected_jiffies, obs.observed_jiffies
            );
        }
        for (i, cpu) in populated.cpus.iter().enumerate() {
            assert!(
                cpu.event_counters.is_none(),
                "cpu {i}: event_counters must be None when no scheduler is loaded"
            );
        }
    }

    // The DATA_VALID latch should capture a live, canonical, page-aligned
    // PAGE_OFFSET value during the run.
    #[test]
    fn monitor_data_valid_latch_records_live_page_offset() {
        let kernel = crate::test_support::require_kernel();
        let _vmlinux = crate::test_support::require_vmlinux(&kernel);
        let vm = skip_on_contention!(
            KtstrVm::builder()
                .kernel(&kernel)
                .topology(1, 1, 2, 1)
                .memory_mb(256)
                .timeout(Duration::from_secs(5))
                .watchdog_timeout(Duration::from_secs(2))
                .build()
        );
        let result = vm.run().unwrap();
        let Some(ref report) = result.monitor else {
            return;
        };
        assert!(
            report.summary.total_samples > 0,
            "monitor produced no samples — DATA_VALID latch \
             observability cannot be evaluated"
        );
        assert_ne!(
            report.page_offset, 0,
            "DATA_VALID latch never fired during the run — \
             monitor.page_offset stayed at the initial 0 sentinel. \
             page_offset_base was never resolved or \
             __per_cpu_offset[0] never became non-zero before the \
             run closed",
        );
        assert!(
            report.page_offset & (1u64 << 63) != 0,
            "monitor.page_offset {:#x} is not in the canonical \
             upper half — page_offset_resolved gate accepted a \
             user-space address",
            report.page_offset,
        );
        assert_eq!(
            report.page_offset & 0xFFF,
            0,
            "monitor.page_offset {:#x} is not 4 KiB aligned",
            report.page_offset,
        );
    }

    // sys_rdy must release the monitor's boot wait well before its internal
    // 5 s fallback ceiling.
    #[test]
    fn sys_rdy_releases_monitor_before_5s_timeout() {
        let kernel = crate::test_support::require_kernel();
        let _vmlinux = crate::test_support::require_vmlinux(&kernel);
        let vm = skip_on_contention!(
            KtstrVm::builder()
                .kernel(&kernel)
                .topology(1, 1, 2, 1)
                .memory_mb(256)
                .timeout(Duration::from_secs(5))
                .build()
        );
        let result = vm.run().unwrap();
        let Some(ref report) = result.monitor else {
            return;
        };
        assert!(
            report.summary.total_samples > 0,
            "monitor produced no samples within 5 s — sys_rdy never \
             unblocked the boot wait, or the boot wait never woke on \
             kill_evt either. Run wall time: {:?}",
            result.duration,
        );
        let first = report
            .samples
            .first()
            .expect("total_samples > 0 but samples list empty");
        assert!(
            first.elapsed_ms < 4_000,
            "first monitor sample landed at {} ms — that is past the \
             4 s budget and within the in-monitor 5 s sys_rdy \
             timeout window. The sys_rdy eventfd is not actually \
             unblocking the boot wait; the loop fell through on the \
             5 s ceiling. Total samples: {}, run duration: {:?}",
            first.elapsed_ms,
            report.summary.total_samples,
            result.duration,
        );
    }

    // A guest panic before sys_rdy must still wake the monitor's boot wait
    // (via kill_evt) so the whole run exits promptly.
    #[test]
    fn monitor_exits_cleanly_when_guest_panics_before_sys_rdy() {
        let kernel = crate::test_support::require_kernel();
        let _vmlinux = crate::test_support::require_vmlinux(&kernel);
        let vm = skip_on_contention!(
            KtstrVm::builder()
                .kernel(&kernel)
                .topology(1, 1, 2, 1)
                .memory_mb(256)
                .timeout(Duration::from_secs(10))
                .cmdline("init=/nonexistent panic=-1")
                .build()
        );
        let result = vm.run().unwrap();
        assert!(
            !result.timed_out,
            "guest never panicked / rebooted within 10 s — the test's \
             premise (panic-before-sys_rdy → kernel reboot → VM exit) \
             is not holding. Stderr tail: {:?}",
            result.stderr.lines().rev().take(5).collect::<Vec<_>>(),
        );
        assert!(
            result.duration < Duration::from_secs(8),
            "VM ran for {:?} — past the 8 s budget. The monitor's \
             boot wait did not wake on kill_evt; the loop sat on the \
             sys_rdy ceiling instead. timed_out={}, exit_code={}",
            result.duration,
            result.timed_out,
            result.exit_code,
        );
    }

    // Because sys_rdy gates sampling, even the FIRST sample must already see
    // populated runqueue clocks.
    #[test]
    fn first_sample_has_valid_rq_clock_thanks_to_sys_rdy() {
        let kernel = crate::test_support::require_kernel();
        let _vmlinux = crate::test_support::require_vmlinux(&kernel);
        let vm = skip_on_contention!(
            KtstrVm::builder()
                .kernel(&kernel)
                .topology(1, 1, 2, 1)
                .memory_mb(256)
                .timeout(Duration::from_secs(5))
                .watchdog_timeout(Duration::from_secs(2))
                .build()
        );
        let result = vm.run().unwrap();
        let Some(ref report) = result.monitor else {
            return;
        };
        assert!(
            report.summary.total_samples > 0,
            "monitor produced no samples — cannot evaluate \
             FIRST-sample semantics"
        );
        let first = report
            .samples
            .first()
            .expect("total_samples > 0 but samples list empty");
        let any_populated = first.cpus.iter().any(|c| c.rq_clock > 1_000_000);
        assert!(
            any_populated,
            "FIRST monitor sample at elapsed_ms={} had every CPU at \
             rq_clock <= 1ms — SYS_RDY did not actually wait for the \
             guest's runqueue fields to be populated, or the \
             per-iteration refresh ran against pre-boot zeros. \
             cpus.rq_clock: {:?}, total_samples: {}, run duration: {:?}",
            first.elapsed_ms,
            first.cpus.iter().map(|c| c.rq_clock).collect::<Vec<_>>(),
            report.summary.total_samples,
            result.duration,
        );
    }

    // The host-side watchdog_timeout override must land in guest memory as
    // timeout_secs * HZ jiffies, verified via the monitor's read-back.
    #[test]
    fn watchdog_timeout_override_lands_in_guest_memory() {
        let kernel = crate::test_support::require_kernel();
        let vmlinux = crate::test_support::require_vmlinux(&kernel);
        let syms = crate::test_support::require_kernel_symbols(&vmlinux);
        if syms.scx_root.is_none() {
            skip!("scx_root not present (needs Linux 6.16+ with sched_ext enabled)");
        }
        let offsets = crate::test_support::require_kernel_offsets(&vmlinux);
        if offsets.watchdog_offsets.is_none() {
            skip!(
                "scx_sched.watchdog_timeout field not in BTF \
                 (needs Linux 7.1+; pre-7.1 exposes watchdog timeout as a file-scope \
                 scx_watchdog_timeout symbol handled separately)"
            );
        }
        const TIMEOUT_SECS: u64 = 2;
        let hz = crate::monitor::guest_kernel_hz(Some(&kernel));
        let expected_jiffies = TIMEOUT_SECS * hz;
        let sched_bin = crate::test_support::require_binary("scx-ktstr");
        let vm = skip_on_contention!(
            KtstrVm::builder()
                .kernel(&kernel)
                .topology(1, 1, 1, 1)
                .memory_mb(256)
                .timeout(Duration::from_secs(5))
                .scheduler_binary(&sched_bin)
                .watchdog_timeout(Duration::from_secs(TIMEOUT_SECS))
                .build()
        );
        let result = vm.run().unwrap();
        let report = result.monitor.as_ref().expect(
            "ktstr: monitor report missing — require_kernel_offsets, scx_root, and \
             watchdog_offsets all resolved at setup, so monitor initialization must \
             have succeeded. A None report here is a bug in monitor startup",
        );
        let Some(obs) = &report.watchdog_observation else {
            skip!(
                "watchdog observation missing — the scheduler did not attach \
                 (scx_root remained null throughout the run)"
            );
        };
        assert_eq!(
            obs.expected_jiffies, expected_jiffies,
            "expected_jiffies recorded by monitor ({}) does not match {} * HZ {} = {}",
            obs.expected_jiffies, TIMEOUT_SECS, hz, expected_jiffies,
        );
        assert_eq!(
            obs.observed_jiffies, obs.expected_jiffies,
            "host wrote {} jiffies to scx_sched.watchdog_timeout but guest memory holds {} — host-write mechanism broken",
            obs.expected_jiffies, obs.observed_jiffies,
        );
    }

    // With a generous 300 s watchdog the scheduler must survive a 30 s run:
    // no crash, no lifecycle death/detach events, sched_ext never disabled.
    #[test]
    fn watchdog_override_prevents_stall_exit() {
        let kernel = crate::test_support::require_kernel();
        let _vmlinux = crate::test_support::require_vmlinux(&kernel);
        let sched_bin = crate::test_support::require_binary("scx-ktstr");
        let vm = skip_on_contention!(
            KtstrVm::builder()
                .kernel(&kernel)
                .topology(1, 1, 2, 1)
                .memory_mb(256)
                .timeout(Duration::from_secs(30))
                .scheduler_binary(&sched_bin)
                .watchdog_timeout(Duration::from_secs(300))
                .build()
        );
        let result = vm.run().unwrap();
        assert!(
            result.crash_message.is_none(),
            "no crash expected with 300s watchdog: {:?}",
            result.crash_message
        );
        let output = &result.output;
        let stderr = &result.stderr;
        // True when any CRC-valid lifecycle message in the guest drain
        // carries the given phase byte.
        let lifecycle_phase_seen = |phase: crate::vmm::wire::LifecyclePhase| -> bool {
            let Some(ref drain) = result.guest_messages else {
                return false;
            };
            drain.entries.iter().any(|e| {
                e.msg_type == crate::vmm::wire::MSG_TYPE_LIFECYCLE
                    && e.crc_ok
                    && !e.payload.is_empty()
                    && crate::vmm::wire::LifecyclePhase::from_wire(e.payload[0]) == Some(phase)
            })
        };
        assert!(
            !lifecycle_phase_seen(crate::vmm::wire::LifecyclePhase::SchedulerDied),
            "scheduler no longer running after 15s — either the watchdog fired or the \
             scheduler exited for another reason. output: {output:?}, stderr: {stderr:?}",
        );
        assert!(
            !lifecycle_phase_seen(crate::vmm::wire::LifecyclePhase::SchedulerNotAttached),
            "scheduler did not attach — no watchdog override to evaluate. \
             output: {output:?}, stderr: {stderr:?}",
        );
        assert!(
            !output.contains("sched_ext: disabled") && !stderr.contains("sched_ext: disabled"),
            "kernel disabled sched_ext during run — a watchdog stall or ops \
             error fired. output: {output:?}, stderr: {stderr:?}",
        );
        if let Some(ref report) = result.monitor
            && let Some(ref obs) = report.watchdog_observation
        {
            let hz = crate::monitor::guest_kernel_hz(Some(&kernel));
            let expected_jiffies = 300 * hz;
            assert_eq!(
                obs.expected_jiffies, expected_jiffies,
                "watchdog override should be 300s * HZ={hz}"
            );
            assert_eq!(
                obs.observed_jiffies, obs.expected_jiffies,
                "write/read roundtrip mismatch"
            );
        }
    }

    // The monitor's sched_domain capture must produce a well-formed domain
    // tree: strictly increasing levels, sane span weights.
    #[test]
    fn sched_domain_data_populated() {
        let kernel = crate::test_support::require_kernel();
        let vmlinux = crate::test_support::require_vmlinux(&kernel);
        let offsets = crate::test_support::require_kernel_offsets(&vmlinux);
        if offsets.sched_domain_offsets.is_none() {
            skip!(
                "sched_domain BTF fields not found (likely CONFIG_SMP=n; \
                 struct sched_domain is absent or incomplete in BTF on UP kernels, \
                 and on pre-6.17 kernels the rq.sd field is also compiled out)"
            );
        }
        let vm = skip_on_contention!(
            KtstrVm::builder()
                .kernel(&kernel)
                .topology(1, 1, 2, 1)
                .memory_mb(256)
                .timeout(Duration::from_secs(5))
                .watchdog_timeout(Duration::from_secs(2))
                .build()
        );
        let result = vm.run().unwrap();
        let report = result.monitor.as_ref().expect(
            "ktstr: monitor report missing — require_kernel_offsets and \
             sched_domain_offsets resolved at setup, so monitor initialization \
             must have succeeded. A None report here is a bug in monitor startup",
        );
        assert!(
            report.summary.total_samples > 0,
            "monitor should have collected at least one sample"
        );
        // Latest sample where any CPU reported a non-empty domain list.
        let populated = report
            .samples
            .iter()
            .rev()
            .find(|s| {
                s.cpus.iter().any(|c| {
                    c.sched_domains
                        .as_ref()
                        .is_some_and(|doms| !doms.is_empty())
                })
            })
            .unwrap_or_else(|| {
                panic!(
                    "no sample had any CPU with non-empty sched_domains across \
                     {} collected samples — monitor samples may be racing boot-time \
                     kernel thread that builds the domain tree, or `rq.sd` offsets \
                     are wrong",
                    report.samples.len(),
                );
            });
        for cpu in &populated.cpus {
            if let Some(ref doms) = cpu.sched_domains {
                if doms.is_empty() {
                    continue;
                }
                for w in doms.windows(2) {
                    assert!(
                        w[1].level > w[0].level,
                        "domain levels must be strictly increasing: {} -> {}",
                        w[0].level,
                        w[1].level
                    );
                }
                assert!(
                    doms[0].span_weight >= 2,
                    "lowest domain span_weight must be >= 2 for a 2-CPU topology, got {}",
                    doms[0].span_weight
                );
                for dom in doms {
                    assert!(
                        dom.span_weight > 0,
                        "domain level {} span_weight must be > 0",
                        dom.level
                    );
                }
            }
        }
    }

    // Builder tests below use the current test executable as a stand-in
    // "kernel" path: only path existence matters for build-time validation.

    #[test]
    fn builder_performance_mode_false_no_validation() {
        let exe = crate::resolve_current_exe().unwrap();
        let result = KtstrVmBuilder::default()
            .kernel(&exe)
            .topology(1, 1, 1, 1)
            .performance_mode(false)
            .build();
        match result {
            Ok(_) => {}
            Err(e)
                if e.downcast_ref::<host_topology::ResourceContention>()
                    .is_some() =>
            {
                skip!("resource contention: {e}");
            }
            Err(e) => panic!("performance_mode=false should not validate host topology: {e:#}",),
        }
    }

    #[test]
    fn builder_performance_mode_oversubscribed_fails() {
        let exe = crate::resolve_current_exe().unwrap();
        let host_topo = host_topology::HostTopology::from_sysfs().unwrap();
        // One more vCPU than the host has — must be rejected at build time.
        let too_many = host_topo.total_cpus() as u32 + 1;
        let result = KtstrVmBuilder::default()
            .kernel(&exe)
            .topology(1, 1, too_many, 1)
            .performance_mode(true)
            .build();
        match result {
            Ok(_) => panic!("oversubscribed topology should fail"),
            Err(e) => {
                let msg = format!("{e}");
                assert!(
                    msg.contains("performance_mode"),
                    "error should mention performance_mode: {msg}",
                );
            }
        }
    }

    #[test]
    fn builder_performance_mode_too_many_llcs_fails() {
        let exe = crate::resolve_current_exe().unwrap();
        let host_topo = host_topology::HostTopology::from_sysfs().unwrap();
        let too_many_llcs = host_topo.llc_groups.len() as u32 + 1;
        // Only meaningful when the CPU count itself still fits on the host;
        // otherwise the failure would be for the wrong reason.
        if (too_many_llcs as usize + 1) <= host_topo.total_cpus() {
            let result = KtstrVmBuilder::default()
                .kernel(&exe)
                .topology(1, too_many_llcs, 1, 1)
                .performance_mode(true)
                .build();
            assert!(
                result.is_err(),
                "more virtual LLCs than host LLCs should fail",
            );
        }
    }

    #[test]
    fn builder_performance_mode_valid_succeeds() {
        let exe = crate::resolve_current_exe().unwrap();
        let host_topo = host_topology::HostTopology::from_sysfs().unwrap();
        if host_topo.total_cpus() < 3 {
            skip!("need >= 3 host CPUs for performance_mode test");
        }
        let result = KtstrVmBuilder::default()
            .kernel(&exe)
            .topology(1, 1, 2, 1)
            .performance_mode(true)
            .build();
        match result {
            Ok(_) => {}
            Err(e)
                if e.downcast_ref::<host_topology::ResourceContention>()
                    .is_some() =>
            {
                skip!("resource contention: {e}");
            }
            Err(e) => panic!("valid topology with performance_mode should build: {e:#}",),
        }
    }

    #[test]
    fn builder_performance_mode_preserves_in_vm() {
        let exe = crate::resolve_current_exe().unwrap();
        let host_topo = host_topology::HostTopology::from_sysfs().unwrap();
        if host_topo.total_cpus() < 3 {
            skip!("need >= 3 host CPUs for performance_mode test");
        }
        let vm = skip_on_contention!(
            KtstrVmBuilder::default()
                .kernel(&exe)
                .topology(1, 1, 2, 1)
                .performance_mode(true)
                .build()
        );
        assert!(vm.performance_mode);
    }

    #[test]
    fn builder_performance_mode_false_preserves_in_vm() {
        let exe = crate::resolve_current_exe().unwrap();
        let vm = skip_on_contention!(
            KtstrVmBuilder::default()
                .kernel(&exe)
                .topology(1, 1, 1, 1)
                .performance_mode(false)
                .build()
        );
        assert!(!vm.performance_mode);
    }

    #[test]
    fn builder_performance_mode_mbind_nodes_populated() {
        let exe = crate::resolve_current_exe().unwrap();
        let host_topo = host_topology::HostTopology::from_sysfs().unwrap();
        if host_topo.total_cpus() < 3 {
            skip!("need >= 3 host CPUs for performance_mode test");
        }
        let vm = KtstrVmBuilder::default()
            .kernel(&exe)
            .topology(1, 1, 2, 1)
            .performance_mode(true)
            .build();
        // Contention makes build fail here; only assert when it succeeded.
        if let Ok(vm) = vm {
            assert!(
                !vm.mbind_node_map.is_empty(),
                "mbind_node_map should be populated for performance_mode",
            );
        }
    }
}