use anyhow::Result;
use std::time::Duration;
use super::host_topology;
/// Backoff schedule used when `KVM_CREATE_VM` fails with `EINTR`.
///
/// `create_vm_with_retry` sleeps for entry `i` after the `i`-th `EINTR`
/// (0-based) and then retries, so the array length is also the maximum number
/// of retries. The entries sum to 862.6 ms.
pub(crate) const KVM_CREATE_VM_EINTR_DELAYS: [Duration; 8] = [
Duration::from_micros(100),
Duration::from_micros(500),
Duration::from_millis(2),
Duration::from_millis(10),
Duration::from_millis(50),
Duration::from_millis(100),
Duration::from_millis(200),
Duration::from_millis(500),
];
/// Errno values that `map_transient_to_contention` treats as transient
/// host-resource failures and reports as `host_topology::ResourceContention`
/// rather than as a plain error.
///
/// `EINTR` is not listed here; `create_vm_with_retry` handles it with its own
/// retry loop.
pub(crate) const TRANSIENT_HOST_ERRNOS: &[i32] = &[
libc::ENOMEM,
libc::EMFILE,
libc::ENFILE,
libc::EBUSY,
libc::EAGAIN,
];
/// Point-in-time view of this process's resource usage, as gathered by
/// `host_resource_snapshot`.
///
/// The `Display` form is a single line of `key=value` pairs:
/// `fds=…, vmrss=…, threads=…, near_limit=…`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct HostResourceSnapshot {
    /// Number of entries found in `/proc/self/fd` (`0` if it could not be read).
    pub fd_count: usize,
    /// Text following `VmRSS:` in `/proc/self/status`, or `"<unknown>"`.
    pub vm_rss: String,
    /// Text following `Threads:` in `/proc/self/status`, or `"<unknown>"`.
    pub threads: String,
    /// `true` when the fd count or the thread count has reached 90% of the
    /// corresponding value parsed from `/proc/self/limits`.
    pub near_limit: bool,
}

impl std::fmt::Display for HostResourceSnapshot {
    /// Renders the snapshot as one comma-separated `key=value` line.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Bind the fields under the names used by the banner keys so the
        // format string can capture them directly.
        let Self {
            fd_count: fds,
            vm_rss: vmrss,
            threads,
            near_limit,
        } = self;
        write!(
            f,
            "fds={fds}, vmrss={vmrss}, threads={threads}, near_limit={near_limit}",
        )
    }
}
/// Captures a best-effort picture of this process's resource usage from procfs.
///
/// Reads `/proc/self/status`, `/proc/self/limits` and `/proc/self/fd`. No read
/// failure is propagated: an unreadable file behaves like empty input, so the
/// text fields fall back to `"<unknown>"`, the fd count to `0`, and
/// `near_limit` to `false`.
///
/// `near_limit` is `true` when the fd count has reached 90% of the first value
/// on the `Max open files` line, or the `Threads:` count has reached 90% of
/// the first value on the `Max processes` line. A limit line that is missing,
/// or whose first value does not parse as `u64`, never trips the flag.
pub(crate) fn host_resource_snapshot() -> HostResourceSnapshot {
    let status = std::fs::read_to_string("/proc/self/status").unwrap_or_default();
    let limits = std::fs::read_to_string("/proc/self/limits").unwrap_or_default();
    let fd_count: usize = match std::fs::read_dir("/proc/self/fd") {
        Ok(entries) => entries.filter_map(|entry| entry.ok()).count(),
        Err(_) => 0,
    };

    // Trimmed remainder of the first `status` line starting with `prefix`.
    let status_field = |prefix: &str| -> String {
        status
            .lines()
            .find_map(|line| line.strip_prefix(prefix))
            .map(|rest| rest.trim().to_string())
            .unwrap_or_else(|| "<unknown>".into())
    };
    // First whitespace-separated token after `key` on the first matching
    // `limits` line, parsed as an integer.
    let limit_value = |key: &str| -> Option<u64> {
        limits
            .lines()
            .find_map(|line| line.strip_prefix(key))?
            .split_whitespace()
            .next()?
            .parse::<u64>()
            .ok()
    };

    let vm_rss = status_field("VmRSS:");
    let threads = status_field("Threads:");
    let thread_count = threads.parse::<u64>().unwrap_or(0);

    // Integer form of `used / cap >= 0.9`; saturating so large values cannot wrap.
    let at_ninety_percent =
        |used: u64, cap: u64| used.saturating_mul(10) >= cap.saturating_mul(9);
    let usage_by_limit = [
        ("Max open files", fd_count as u64),
        ("Max processes", thread_count),
    ];
    let near_limit = usage_by_limit.iter().any(|&(key, used)| {
        limit_value(key)
            .map(|cap| at_ninety_percent(used, cap))
            .unwrap_or(false)
    });

    HostResourceSnapshot {
        fd_count,
        vm_rss,
        threads,
        near_limit,
    }
}
pub(crate) fn map_transient_to_contention(
e: kvm_ioctls::Error,
context: impl Into<String>,
) -> anyhow::Error {
let context = context.into();
let errno = e.errno();
if TRANSIENT_HOST_ERRNOS.contains(&errno) {
let snapshot = host_resource_snapshot();
let bypass_requested =
std::env::var("KTSTR_CONTENTION_BYPASS").ok().as_deref() == Some("1");
if bypass_requested && !snapshot.near_limit {
return anyhow::Error::new(e).context(format!(
"{context}: KVM errno {errno} ({errno_name}) — errno looks transient \
but host is NOT near limits ({snapshot}). KTSTR_CONTENTION_BYPASS=1 \
routed this through as a hard error: likely a kernel-side bug \
(leak / stuck device / cgroup-exhausted state) rather than peer \
contention. Check `dmesg` for the affected subsystem.",
errno_name = errno_name(errno),
));
}
anyhow::Error::new(host_topology::ResourceContention {
reason: format!(
"{context}: transient KVM errno {errno} ({}): host resources: {snapshot}\n \
hint: KVM ioctl failed with a host-resource errno; another peer may be \
holding the budget. nextest will not retry; the SKIP banner records this \
attempt for stats tooling.\n \
hint: if `near_limit=false` in the snapshot above and SKIPs persist \
across runs, the errno is likely a kernel-side regression (leak / stuck \
device / cgroup-exhausted state) — check `dmesg` for the affected \
subsystem rather than retrying the test, or set \
`KTSTR_CONTENTION_BYPASS=1` to surface such failures as hard errors.",
errno_name(errno)
),
})
} else {
anyhow::Error::new(e).context(context)
}
}
/// Returns the symbolic name of `errno` for use in diagnostics.
///
/// Errnos in the table below yield a borrowed static name such as `"ENOMEM"`.
/// Any other value yields an owned string of the form `errno=N`, where `N` is
/// the raw number.
pub(crate) fn errno_name(errno: i32) -> std::borrow::Cow<'static, str> {
    use std::borrow::Cow;
    let known: &'static str = match errno {
        libc::ENOMEM => "ENOMEM",
        libc::EMFILE => "EMFILE",
        libc::ENFILE => "ENFILE",
        libc::EBUSY => "EBUSY",
        libc::EAGAIN => "EAGAIN",
        libc::EINTR => "EINTR",
        libc::EINVAL => "EINVAL",
        libc::ENOSYS => "ENOSYS",
        libc::EPERM => "EPERM",
        libc::EACCES => "EACCES",
        libc::ENOSPC => "ENOSPC",
        libc::ENODEV => "ENODEV",
        libc::ENOTSUP => "ENOTSUP",
        libc::EFAULT => "EFAULT",
        libc::EIO => "EIO",
        libc::EBADF => "EBADF",
        libc::ESRCH => "ESRCH",
        libc::ENOENT => "ENOENT",
        libc::ECHILD => "ECHILD",
        libc::EEXIST => "EEXIST",
        libc::EOVERFLOW => "EOVERFLOW",
        libc::ETIMEDOUT => "ETIMEDOUT",
        libc::ENOTTY => "ENOTTY",
        // Not in the table: render the raw number so it stays greppable.
        other => return Cow::Owned(format!("errno={other}")),
    };
    Cow::Borrowed(known)
}
fn eintr_exhausted_contention() -> anyhow::Error {
let snapshot = host_resource_snapshot();
let total_delay: std::time::Duration = KVM_CREATE_VM_EINTR_DELAYS.iter().copied().sum();
anyhow::Error::new(host_topology::ResourceContention {
reason: format!(
"create VM: KVM_CREATE_VM kept returning EINTR after \
{n} retries totalling {total_ms} ms — sustained \
signal pressure on the host. host resources: {snapshot}\n \
hint: a peer process is firing realtime / SIGRTMIN \
signals at a rate that out-paces the EINTR backoff \
schedule. nextest will not retry; the SKIP banner \
records this attempt for stats tooling.",
n = KVM_CREATE_VM_EINTR_DELAYS.len(),
total_ms = total_delay.as_millis(),
),
})
}
/// Creates a VM through `kvm`, retrying when `KVM_CREATE_VM` fails with `EINTR`.
///
/// After each `EINTR` the next delay from `KVM_CREATE_VM_EINTR_DELAYS` is
/// slept (with a warning logged) before trying again.
///
/// # Errors
///
/// * `EINTR` after every delay has been used: the error built by
///   `eintr_exhausted_contention`.
/// * Any other errno: the result of `map_transient_to_contention` with the
///   context `"create VM"`.
pub(crate) fn create_vm_with_retry(kvm: &kvm_ioctls::Kvm) -> Result<kvm_ioctls::VmFd> {
    // Each `EINTR` consumes one (index, delay) pair; running out ends the retries.
    let mut backoff = KVM_CREATE_VM_EINTR_DELAYS.iter().copied().enumerate();
    loop {
        let err = match kvm.create_vm() {
            Ok(vm) => return Ok(vm),
            Err(err) => err,
        };
        if err.errno() != libc::EINTR {
            return Err(map_transient_to_contention(err, "create VM"));
        }
        let (retry_index, delay) = match backoff.next() {
            Some(step) => step,
            None => return Err(eintr_exhausted_contention()),
        };
        tracing::warn!(
            attempt = retry_index,
            delay_us = delay.as_micros() as u64,
            "KVM_CREATE_VM EINTR; retrying"
        );
        std::thread::sleep(delay);
    }
}
// Unit tests for the snapshot rendering, errno classification, errno naming
// and the EINTR backoff table defined in this module.
#[cfg(test)]
mod tests {
use super::*;
// A live snapshot must render all four keys on a single line.
#[test]
fn host_resource_snapshot_emits_all_keys() {
let s = format!("{}", host_resource_snapshot());
assert!(s.contains("fds="), "snapshot missing fds=: {s}");
assert!(s.contains("vmrss="), "snapshot missing vmrss=: {s}");
assert!(s.contains("threads="), "snapshot missing threads=: {s}");
assert!(
s.contains("near_limit="),
"snapshot missing near_limit=: {s}"
);
assert!(
!s.contains('\n'),
"snapshot must be single-line for banner formatting: {s:?}"
);
}
// The rendered snapshot must not contain `max_files=` / `max_procs=` keys,
// and the `fds=` field must not use a `count/cap` slash form.
#[test]
fn host_resource_snapshot_does_not_leak_raw_rlimits() {
let s = format!("{}", host_resource_snapshot());
assert!(
!s.contains("max_files="),
"host_resource_snapshot must not leak the RLIMIT_NOFILE soft cap; got: {s}",
);
assert!(
!s.contains("max_procs="),
"host_resource_snapshot must not leak the RLIMIT_NPROC soft cap; got: {s}",
);
// Isolate the text between `fds=` and the next comma.
if let Some(rest) = s
.strip_prefix("fds=")
.or_else(|| s.split_once("fds=").map(|(_, r)| r))
{
let fds_field = rest.split(',').next().unwrap_or(rest);
assert!(
!fds_field.contains('/'),
"snapshot's fds= field must not include the cap (slash form); got: {s}",
);
}
}
// `near_limit` must render as the literal `true` or `false`.
#[test]
fn host_resource_snapshot_near_limit_is_boolean() {
let s = format!("{}", host_resource_snapshot());
assert!(
s.contains("near_limit=true") || s.contains("near_limit=false"),
"near_limit must be boolean; got: {s}",
);
}
// The `near_limit` struct field and its rendered form must agree for the
// same snapshot value.
#[test]
fn host_resource_snapshot_typed_field_agrees_with_rendered_banner() {
let snapshot = host_resource_snapshot();
let rendered = format!("{snapshot}");
let expected_token = if snapshot.near_limit {
"near_limit=true"
} else {
"near_limit=false"
};
assert!(
rendered.contains(expected_token),
"rendered banner must agree with typed field; \
snapshot.near_limit={field_val}, rendered={rendered}",
field_val = snapshot.near_limit,
);
}
// Pins the exact `Display` output for two hand-built snapshots.
#[test]
fn host_resource_snapshot_display_format_is_pinned() {
let snapshot = HostResourceSnapshot {
fd_count: 64,
vm_rss: "24 kB".into(),
threads: "8".into(),
near_limit: false,
};
assert_eq!(
format!("{snapshot}"),
"fds=64, vmrss=24 kB, threads=8, near_limit=false",
);
let snapshot_at_limit = HostResourceSnapshot {
fd_count: 1023,
vm_rss: "999 mB".into(),
threads: "256".into(),
near_limit: true,
};
assert_eq!(
format!("{snapshot_at_limit}"),
"fds=1023, vmrss=999 mB, threads=256, near_limit=true",
);
}
// With the bypass variable removed, every errno in `TRANSIENT_HOST_ERRNOS`
// (not only ENOMEM) must map to `ResourceContention`, and the rendered error
// must carry the context, the errno name and the snapshot marker.
#[test]
fn map_transient_to_contention_classifies_enomem() {
// The env lock is taken before the variable guard; both are held until
// the end of the test.
let _lock = crate::test_support::test_helpers::lock_env();
let _bypass_off =
crate::test_support::test_helpers::EnvVarGuard::remove("KTSTR_CONTENTION_BYPASS");
for &errno in TRANSIENT_HOST_ERRNOS {
let kvm_err = kvm_ioctls::Error::new(errno);
let mapped = map_transient_to_contention(kvm_err, "create VM");
assert!(
mapped
.downcast_ref::<host_topology::ResourceContention>()
.is_some(),
"errno {errno} ({}): expected ResourceContention, got {mapped:#}",
errno_name(errno),
);
let rendered = format!("{mapped:#}");
assert!(
rendered.contains("create VM"),
"errno {errno} banner missing context: {rendered}"
);
assert!(
rendered.contains(&*errno_name(errno)),
"errno {errno} banner missing errno name: {rendered}"
);
assert!(
rendered.contains("host resources:"),
"errno {errno} banner missing host-resource snapshot: {rendered}"
);
}
}
// Errnos outside the transient set must not become `ResourceContention`,
// and the supplied context must still appear in the rendered error.
#[test]
fn map_transient_to_contention_passes_through_hard_errors() {
for &errno in &[libc::EINVAL, libc::ENOSYS, libc::EPERM, libc::EACCES] {
let kvm_err = kvm_ioctls::Error::new(errno);
let mapped = map_transient_to_contention(kvm_err, "set TSS");
assert!(
mapped
.downcast_ref::<host_topology::ResourceContention>()
.is_none(),
"errno {errno} ({}): hard fault must NOT be classified as \
ResourceContention; got {mapped:#}",
errno_name(errno),
);
let rendered = format!("{mapped:#}");
assert!(
rendered.contains("set TSS"),
"errno {errno} banner missing context: {rendered}"
);
}
}
// A single EINTR passed to `map_transient_to_contention` must not be
// classified as `ResourceContention`.
#[test]
fn map_transient_to_contention_does_not_classify_eintr() {
let kvm_err = kvm_ioctls::Error::new(libc::EINTR);
let mapped = map_transient_to_contention(kvm_err, "create VM");
assert!(
mapped
.downcast_ref::<host_topology::ResourceContention>()
.is_none(),
"EINTR must NOT classify as ResourceContention via \
map_transient_to_contention — the retry loop handles \
single EINTR; only EXHAUSTED EINTR is contention. \
got: {mapped:#}",
);
}
// Same classification checks using the `set_user_memory_region` context
// string: transient errnos map to contention, EINVAL does not.
#[test]
fn set_user_memory_region_routing_via_map_transient() {
let _lock = crate::test_support::test_helpers::lock_env();
let _bypass_off =
crate::test_support::test_helpers::EnvVarGuard::remove("KTSTR_CONTENTION_BYPASS");
for &errno in TRANSIENT_HOST_ERRNOS {
let kvm_err = kvm_ioctls::Error::new(errno);
let mapped = map_transient_to_contention(kvm_err, "set_user_memory_region");
assert!(
mapped
.downcast_ref::<host_topology::ResourceContention>()
.is_some(),
"errno {errno} ({}): set_user_memory_region routing must \
classify as ResourceContention; got {mapped:#}",
errno_name(errno),
);
let rendered = format!("{mapped:#}");
assert!(
rendered.contains("set_user_memory_region"),
"errno {errno} banner missing memslot-install context tag: {rendered}"
);
assert!(
rendered.contains(&*errno_name(errno)),
"errno {errno} banner missing errno name: {rendered}"
);
}
let kvm_err = kvm_ioctls::Error::new(libc::EINVAL);
let mapped = map_transient_to_contention(kvm_err, "set_user_memory_region");
assert!(
mapped
.downcast_ref::<host_topology::ResourceContention>()
.is_none(),
"EINVAL from set_user_memory_region must NOT classify as \
ResourceContention — bad memslot layout is a programming \
fault that SKIP-skipping would hide; got: {mapped:#}",
);
}
// With `KTSTR_CONTENTION_BYPASS=1` and a snapshot that is not near its
// limits, a transient errno must surface as a hard error whose text names
// the variable and the reason.
#[test]
fn map_transient_to_contention_bypass_when_near_limit_false() {
let _lock = crate::test_support::test_helpers::lock_env();
let snapshot = host_resource_snapshot();
// The bypass branch only fires when `near_limit` is false, so there is
// nothing to check when this process is already near a limit.
if snapshot.near_limit {
return;
}
let _bypass_on =
crate::test_support::test_helpers::EnvVarGuard::set("KTSTR_CONTENTION_BYPASS", "1");
let kvm_err = kvm_ioctls::Error::new(libc::ENOMEM);
let mapped = map_transient_to_contention(kvm_err, "set_user_memory_region");
assert!(
mapped
.downcast_ref::<host_topology::ResourceContention>()
.is_none(),
"with KTSTR_CONTENTION_BYPASS=1 and near_limit=false, ENOMEM must \
surface as a hard error rather than ResourceContention; got: {mapped:#}",
);
let rendered = format!("{mapped:#}");
assert!(
rendered.contains("KTSTR_CONTENTION_BYPASS=1"),
"bypass diagnostic must mention the env var so the operator can \
see why this surfaced as a hard error; got: {rendered}",
);
assert!(
rendered.contains("NOT near limits"),
"bypass diagnostic must explain the `near_limit=false` rationale; \
got: {rendered}",
);
}
// The EINTR-exhausted error must be a `ResourceContention` whose text
// includes the context tag, "EINTR", the snapshot marker and a signal hint.
#[test]
fn eintr_exhausted_contention_format() {
let err = eintr_exhausted_contention();
assert!(
err.downcast_ref::<host_topology::ResourceContention>()
.is_some(),
"EINTR-exhausted error must downcast to ResourceContention; got: {err:#}",
);
let rendered = format!("{err:#}");
assert!(
rendered.contains("create VM"),
"banner missing context tag: {rendered}",
);
assert!(
rendered.contains("EINTR"),
"banner missing EINTR-specific classification: {rendered}",
);
assert!(
rendered.contains("host resources:"),
"banner missing host-resource snapshot: {rendered}",
);
assert!(
rendered.contains("signal"),
"banner missing signal-storm hint: {rendered}",
);
}
// Exercises `create_vm_with_retry` against a real `Kvm` handle. The test
// panics if `Kvm::new()` fails, and accepts either success or a
// `ResourceContention` error from the retry helper.
#[test]
fn create_vm_with_retry_succeeds_under_no_signal_pressure() {
let kvm = match kvm_ioctls::Kvm::new() {
Ok(k) => k,
Err(e) => {
panic!(
"Kvm::new() failed: {e}; cannot exercise \
create_vm_with_retry on this host"
);
}
};
let vm = create_vm_with_retry(&kvm);
match vm {
Ok(_) => {
// VM created: nothing further to check.
}
Err(e)
if e.downcast_ref::<host_topology::ResourceContention>()
.is_some() =>
{
// ResourceContention is tolerated rather than treated as a failure.
}
Err(e) => {
panic!(
"create_vm_with_retry returned an unexpected \
non-contention error on a no-signal-pressure host: {e:#}"
);
}
}
}
// An errno outside the name table must render as `errno=N`.
#[test]
fn errno_name_fallthrough_renders_raw_value() {
let rendered = errno_name(9999);
assert!(
rendered.contains("errno=9999"),
"fallthrough must include the exact `errno=N` format string \
so callers grepping for it find the integer reliably; got: {rendered}",
);
assert!(
!rendered.contains("<other>"),
"fallthrough must not collapse to the placeholder <other>; got: {rendered}",
);
}
// Spot-checks that known errnos render as their symbolic names.
#[test]
fn errno_name_maps_canonical_names() {
for (errno, expected) in [
(libc::ENOMEM, "ENOMEM"),
(libc::EBUSY, "EBUSY"),
(libc::EAGAIN, "EAGAIN"),
(libc::EINVAL, "EINVAL"),
(libc::ENOSYS, "ENOSYS"),
(libc::EPERM, "EPERM"),
(libc::EACCES, "EACCES"),
] {
let rendered = errno_name(errno);
assert_eq!(
&*rendered, expected,
"errno {errno} must render as {expected}; got {rendered}",
);
}
}
// The backoff table must be non-decreasing and its sum must lie between
// 800 ms and 2 s.
#[test]
fn kvm_create_vm_eintr_delays_total_budget() {
let mut prev = Duration::ZERO;
let mut total = Duration::ZERO;
for &d in &KVM_CREATE_VM_EINTR_DELAYS {
assert!(
d >= prev,
"EINTR delays must be monotonic: {prev:?} → {d:?}"
);
prev = d;
total += d;
}
assert!(
total >= Duration::from_millis(800),
"total EINTR budget must be ≥ 800 ms to absorb signal storms; got {total:?}"
);
assert!(
total <= Duration::from_secs(2),
"total EINTR budget must be ≤ 2 s to stay within the \
freeze rendezvous window; got {total:?}"
);
}
}