use super::*;
/// Verdict of the startup liveness gate evaluated by [`poll_startup`].
#[derive(Debug)]
pub(crate) enum StartupStatus {
/// The child was found to be gone: its pidfd became readable, or the
/// `/proc`-based checks no longer saw the pid.
Died,
/// The child was still considered running when the gate's timeout elapsed.
Alive,
}
/// Result of waiting for a sched_ext scheduler to show up in sysfs, as
/// produced by `poll_scx_attached`.
#[derive(Debug, PartialEq, Eq)]
enum ScxAttachStatus {
    /// The ops file was read successfully and contained non-whitespace text.
    Attached,
    /// The deadline passed; the ops file was readable at least once but never
    /// held non-whitespace text.
    Timeout,
    /// The ops file could never be read before the deadline, or no evented
    /// wait source could be set up at all.
    SysfsAbsent,
}

impl ScxAttachStatus {
    /// Returns `true` only for [`ScxAttachStatus::Attached`].
    fn is_attached(&self) -> bool {
        *self == Self::Attached
    }
}
/// Waits for a sched_ext scheduler to register itself in sysfs.
///
/// The wait itself is delegated to `kernfs_evented_wait`; this function
/// supplies the "are we done?" probe, which reads `SYSFS_SCHED_EXT_ROOT_OPS`
/// and reports success once the file holds any non-whitespace text.
///
/// * `interval` — forwarded unchanged to `kernfs_evented_wait`
///   (NOTE(review): presumably its re-check cadence — confirm at the callee).
/// * `timeout` — added to the entry timestamp to form the wait deadline.
///
/// Returns [`ScxAttachStatus::Attached`] when the probe succeeds,
/// [`ScxAttachStatus::SysfsAbsent`] when no evented source could be set up or
/// the ops file was never readable, and [`ScxAttachStatus::Timeout`] when the
/// file was readable at least once but stayed empty until the deadline.
fn poll_scx_attached(
    interval: std::time::Duration,
    timeout: std::time::Duration,
) -> ScxAttachStatus {
    use crate::vmm::freeze_coord::evented_wait::{KernfsWaitOutcome, kernfs_evented_wait};
    use nix::sys::inotify::AddWatchFlags;

    let start = std::time::Instant::now();
    let deadline = start + timeout;

    // Tracks whether the ops file was ever readable; this is what separates a
    // plain timeout from "sysfs node never existed" once the deadline passes.
    let mut ever_read_ok = false;
    // One buffer reused across probes so repeated checks do not reallocate.
    let mut contents = String::with_capacity(64);

    let probe = || -> Option<()> {
        use std::io::Read;
        contents.clear();
        let mut ops_file = std::fs::File::open(SYSFS_SCHED_EXT_ROOT_OPS).ok()?;
        ops_file.read_to_string(&mut contents).ok()?;
        ever_read_ok = true;
        if contents.trim().is_empty() {
            None
        } else {
            Some(())
        }
    };

    let wait_outcome = kernfs_evented_wait(
        "/sys/kernel/sched_ext/",
        AddWatchFlags::IN_CREATE | AddWatchFlags::IN_MOVED_TO,
        Some("/sys/kernel/sched_ext/root/ops"),
        interval,
        deadline,
        probe,
    );

    match wait_outcome {
        KernfsWaitOutcome::Timeout => {
            let verdict = match ever_read_ok {
                true => ScxAttachStatus::Timeout,
                false => ScxAttachStatus::SysfsAbsent,
            };
            tracing::warn!(
                elapsed_s = start.elapsed().as_secs_f64(),
                timeout_s = timeout.as_secs_f64(),
                ever_read_ok,
                status = ?verdict,
                "poll_scx_attached: timeout — sched_ext attach not observed \
                 within deadline"
            );
            verdict
        }
        KernfsWaitOutcome::NoEventedSource => {
            tracing::warn!(
                "poll_scx_attached: both attr-fd open (/sys/kernel/sched_ext/root/ops) \
                 AND inotify_add_watch (/sys/kernel/sched_ext/) failed; surfacing \
                 SysfsAbsent. Diagnose: zcat /proc/config.gz | grep -E \
                 'CONFIG_SCHED_CLASS_EXT|CONFIG_INOTIFY_USER' — both must be =y"
            );
            ScxAttachStatus::SysfsAbsent
        }
        KernfsWaitOutcome::Done(()) => ScxAttachStatus::Attached,
    }
}
/// Startup liveness gate: watches `child` for up to `timeout` and reports
/// whether it exited in that window.
///
/// The preferred mechanism is a pidfd obtained via `pidfd_open(2)`, which
/// becomes readable when the process terminates. If no pidfd can be opened
/// the verdict is delegated to `poll_proc_pid_absent(pid, interval, timeout)`.
///
/// * `child` — the process to watch; only its pid is read here, the child is
///   **not** reaped by this function.
/// * `interval` — cadence for the `/proc` fallback, and the back-off applied
///   when `poll(2)` fails or wakes without `POLLIN`.
/// * `timeout` — length of the liveness window.
///
/// Returns [`StartupStatus::Died`] as soon as the pidfd signals termination;
/// otherwise, once the deadline passes, the result of `proc_pid_alive(pid)`
/// decides between [`StartupStatus::Alive`] and [`StartupStatus::Died`].
pub(crate) fn poll_startup(
    child: &mut Child,
    interval: std::time::Duration,
    timeout: std::time::Duration,
) -> StartupStatus {
    let pid = child.id();
    // SAFETY: pidfd_open takes two plain integers (pid, flags) and no
    // pointers; a failure is reported through a negative return value.
    let pidfd =
        unsafe { libc::syscall(libc::SYS_pidfd_open, pid as libc::c_int, 0u32) as libc::c_int };
    if pidfd < 0 {
        return if poll_proc_pid_absent(pid, interval, timeout) {
            StartupStatus::Died
        } else {
            StartupStatus::Alive
        };
    }

    let deadline = std::time::Instant::now() + timeout;
    let result = loop {
        let remaining = deadline.saturating_duration_since(std::time::Instant::now());
        if remaining.is_zero() {
            break if proc_pid_alive(pid) {
                StartupStatus::Alive
            } else {
                StartupStatus::Died
            };
        }

        // Round *up* to whole milliseconds. Truncating would turn the final
        // sub-millisecond of the window into `poll(…, 0)` calls that return
        // immediately and spin until the deadline.
        let remaining_ms = (remaining.as_nanos().saturating_add(999_999) / 1_000_000)
            .min(i32::MAX as u128) as i32;

        let mut pfd = libc::pollfd {
            fd: pidfd,
            events: libc::POLLIN,
            revents: 0,
        };
        // SAFETY: `pfd` is a live, exclusively borrowed pollfd and the count
        // passed alongside it is exactly 1.
        let rc = unsafe { libc::poll(&mut pfd, 1, remaining_ms) };
        // Capture errno right away, before any other call can overwrite it.
        let poll_err = (rc < 0).then(std::io::Error::last_os_error);

        if rc > 0 && pfd.revents & libc::POLLIN != 0 {
            break StartupStatus::Died;
        }
        if rc == 0 {
            // Timed out; the deadline check at the top of the loop decides.
            continue;
        }
        let interrupted = poll_err
            .as_ref()
            .is_some_and(|e| e.kind() == std::io::ErrorKind::Interrupted);
        if !interrupted {
            // poll failed outright, or woke without POLLIN. Re-polling at
            // once would burn a CPU until the deadline, so back off by
            // `interval` (clamped to the time left). The final verdict is
            // still taken at the deadline exactly as before.
            let left = deadline.saturating_duration_since(std::time::Instant::now());
            std::thread::sleep(interval.min(left));
        }
    };

    // SAFETY: `pidfd` was opened above, is owned by this function, and is
    // not used after this point.
    unsafe {
        libc::close(pidfd);
    }
    result
}
/// Handles used by `drain_probe_pipeline` to quiesce the probe pipeline:
/// it raises `stop` and then blocks on `output_done`.
pub(crate) struct ProbeDrain {
/// Stop flag; `drain_probe_pipeline` stores `true` into it with `Release`
/// ordering. NOTE(review): presumably observed by the probe side — confirm
/// at the consumer.
pub(crate) stop: Arc<AtomicBool>,
/// Latch that `drain_probe_pipeline` waits on after raising `stop`.
/// NOTE(review): presumably released once probe output is fully emitted —
/// confirm at the producer.
pub(crate) output_done: Arc<crate::sync::Latch>,
}
fn drain_probe_pipeline(drain: Option<&ProbeDrain>) {
let Some(d) = drain else { return };
d.stop.store(true, Ordering::Release);
d.output_done.wait();
}
/// Tries to reap `child`, waiting at most `timeout` for it to exit.
///
/// Returns `true` when an exit status was collected (or the pidfd wait
/// reported the process as exited), and `false` when the child could not be
/// confirmed as exited within the bound.
pub(crate) fn reap_child_bounded(
    child: &mut std::process::Child,
    timeout: std::time::Duration,
) -> bool {
    // Fast path: the child may already have exited.
    if matches!(child.try_wait(), Ok(Some(_))) {
        return true;
    }

    let pid = child.id() as libc::pid_t;
    match crate::sync::pidfd_poll_exited(pid, timeout) {
        crate::sync::PidfdWait::TimedOut => false,
        // No pidfd-based wait was available: settle for one more
        // non-blocking check rather than blocking without a bound.
        crate::sync::PidfdWait::NoPidfd => matches!(child.try_wait(), Ok(Some(_))),
        crate::sync::PidfdWait::Exited => {
            // The process is reported as exited; collect its status.
            let _ = child.wait();
            true
        }
    }
}
/// Spawns the scheduler from its fixed guest paths.
///
/// Thin wrapper around [`spawn_scheduler_from_paths`] using `/scheduler` as
/// the binary, `/sched_args` as the argument file and `/tmp/sched.log` as the
/// log destination; `probe_drain` is passed through unchanged.
#[tracing::instrument(skip(probe_drain))]
pub(crate) fn start_scheduler(probe_drain: Option<ProbeDrain>) -> (Option<Child>, Option<String>) {
    const SCHEDULER_BINARY: &str = "/scheduler";
    const SCHEDULER_ARGS_FILE: &str = "/sched_args";
    const SCHEDULER_LOG_FILE: &str = "/tmp/sched.log";

    spawn_scheduler_from_paths(
        SCHEDULER_BINARY,
        SCHEDULER_ARGS_FILE,
        SCHEDULER_LOG_FILE,
        probe_drain,
    )
}
/// Ways in which bringing up the scheduler process can fail.
#[derive(Debug)]
pub(crate) enum SpawnSchedulerError {
    /// The process could not be spawned at all; carries the underlying
    /// I/O error.
    SpawnFailed(std::io::Error),
    /// The process was spawned but was reported dead by the startup
    /// liveness gate.
    StartupDied {
        /// Path of the file the scheduler's output was captured to.
        log_path: String,
    },
    /// The process stayed alive but sched_ext attach was not observed.
    NotAttached {
        /// Short machine-stable cause, interpolated into the message.
        reason: &'static str,
        /// Path of the file the scheduler's output was captured to.
        log_path: String,
    },
}

impl std::fmt::Display for SpawnSchedulerError {
    /// Renders an operator-facing explanation, including likely causes and
    /// the location of the captured scheduler log.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            SpawnSchedulerError::SpawnFailed(e) => write!(f, "Command::spawn failed: {e}"),
            SpawnSchedulerError::StartupDied { log_path } => write!(
                f,
                "scheduler exited before passing the 1-second liveness gate \
                 (framework waits for the scheduler binary to remain alive at \
                 least 1 s before checking for sched_ext bind via /sys/kernel/\
                 sched_ext/root/ops). Common causes: BPF verifier rejection \
                 (look for 'libbpf' / 'verifier' lines in the log), missing \
                 CONFIG_SCHED_CLASS_EXT, scheduler binary segfault at init, \
                 argv validation failure. Log content rendered below as part \
                 of the failure dump (log captured at {log_path}); the process \
                 was reaped and SCHED_PID cleared before this error surfaced."
            ),
            SpawnSchedulerError::NotAttached { reason, log_path } => write!(
                f,
                "scheduler alive but did not bind to sched_ext within the \
                 attach window: {reason} (framework polls /sys/kernel/sched_ext/\
                 root/ops for the BPF scheduler attach marker after the \
                 scheduler binary's liveness gate; this variant surfaces when \
                 the binary stayed alive but never wrote the bind marker). \
                 Common causes for 'timeout': BPF program load stalled on a \
                 slow CI runner past the 10s window, verifier ran long but \
                 succeeded eventually (bump the window or warm the BPF cache). \
                 Common causes for 'sched_ext sysfs absent': kernel built \
                 without CONFIG_SCHED_CLASS_EXT (rebuild with that config). \
                 Log content rendered below as part of the failure dump (log \
                 captured at {log_path}); the framework SIGKILLed and reaped \
                 the orphan + cleared SCHED_PID before this error surfaced."
            ),
        }
    }
}

impl std::error::Error for SpawnSchedulerError {}
/// Spawns the scheduler binary and waits until it is both alive and attached
/// to sched_ext.
///
/// * `binary_path` — scheduler executable; if it does not exist the function
///   returns `Ok(None)` without doing anything else.
/// * `args_path` — file holding whitespace-separated arguments; an unreadable
///   file is treated as "no arguments".
/// * `log_path` — file receiving the child's stdout and stderr; if it cannot
///   be created, both streams are discarded instead.
///
/// On success returns the running child together with `log_path`.
///
/// # Errors
///
/// * [`SpawnSchedulerError::SpawnFailed`] — the process could not be started.
/// * [`SpawnSchedulerError::StartupDied`] — the child was reported dead by the
///   1 s liveness gate. `SCHED_PID` is cleared and the child is reaped.
/// * [`SpawnSchedulerError::NotAttached`] — the child stayed alive but attach
///   was not observed within 10 s. The child is SIGKILLed and reaped, and
///   `SCHED_PID` is cleared.
pub(crate) fn try_spawn_scheduler(
    binary_path: &str,
    args_path: &str,
    log_path: &str,
) -> Result<Option<(Child, String)>, SpawnSchedulerError> {
    if !Path::new(binary_path).exists() {
        return Ok(None);
    }

    // `split_whitespace` already ignores leading/trailing whitespace and
    // yields nothing for an empty string, so no separate trim/empty check
    // is needed.
    let sched_args = fs::read_to_string(args_path).unwrap_or_default();
    let args: Vec<&str> = sched_args.split_whitespace().collect();

    // Best effort: stdout and stderr share one log file; without it both go
    // to /dev/null rather than failing the spawn.
    let log_file = fs::File::create(log_path).ok();
    let stdout = match log_file.as_ref().and_then(|f| f.try_clone().ok()) {
        Some(f) => Stdio::from(f),
        None => Stdio::null(),
    };
    let stderr = match log_file {
        Some(f) => Stdio::from(f),
        None => Stdio::null(),
    };

    // Append the libbpf_logger directive to whatever filter is inherited,
    // defaulting the rest to `info`.
    let sched_rust_log = match std::env::var("RUST_LOG") {
        Ok(existing) => format!("{existing},scx_utils::libbpf_logger=warn"),
        Err(_) => "info,scx_utils::libbpf_logger=warn".to_string(),
    };

    let mut child = Command::new(binary_path)
        .args(&args)
        .env("RUST_LOG", &sched_rust_log)
        .stdout(stdout)
        .stderr(stderr)
        .spawn()
        .map_err(SpawnSchedulerError::SpawnFailed)?;
    SCHED_PID.store(child.id() as i32, Ordering::Release);

    match poll_startup(
        &mut child,
        std::time::Duration::from_millis(50),
        std::time::Duration::from_secs(1),
    ) {
        StartupStatus::Died => {
            // Clear the published pid first so it never names a pid that
            // the reap below has made available for reuse.
            SCHED_PID.store(0, Ordering::Release);
            // Dropping a `Child` does not wait on it, so an exited child
            // would remain a zombie. `try_wait` collects the status without
            // blocking, which keeps this path safe even if the liveness
            // verdict was wrong and the process is in fact still running.
            let _ = child.try_wait();
            return Err(SpawnSchedulerError::StartupDied {
                log_path: log_path.to_string(),
            });
        }
        StartupStatus::Alive => {}
    }

    let reason = match poll_scx_attached(
        std::time::Duration::from_millis(50),
        std::time::Duration::from_secs(10),
    ) {
        ScxAttachStatus::Attached => return Ok(Some((child, log_path.to_string()))),
        ScxAttachStatus::Timeout => "timeout",
        ScxAttachStatus::SysfsAbsent => "sched_ext sysfs absent",
    };

    // The scheduler is alive but never attached: kill it and collect it so
    // no orphan is left behind when the error surfaces.
    let pid = child.id() as libc::pid_t;
    // SAFETY: `kill` takes two plain integers; `pid` is our own child, which
    // has not been waited on yet, so the pid cannot have been recycled.
    unsafe {
        let _ = libc::kill(pid, libc::SIGKILL);
    }
    let _ = child.wait();
    SCHED_PID.store(0, Ordering::Release);
    Err(SpawnSchedulerError::NotAttached {
        reason,
        log_path: log_path.to_string(),
    })
}
/// Spawns the scheduler via [`try_spawn_scheduler`]; any failure is reported
/// to the host and ends in `force_reboot`.
///
/// Returns `(None, None)` when there is no scheduler binary at `binary_path`
/// and `(Some(child), Some(log_path))` when the scheduler came up.
///
/// On any [`SpawnSchedulerError`] this function does not return to the
/// caller: it forwards the failure output, sends a lifecycle event and exit
/// code 1, drains the probe pipeline described by `probe_drain`, and then
/// calls `force_reboot`.
#[tracing::instrument(skip(probe_drain), fields(binary = %binary_path))]
pub(crate) fn spawn_scheduler_from_paths(
    binary_path: &str,
    args_path: &str,
    log_path: &str,
    probe_drain: Option<ProbeDrain>,
) -> (Option<Child>, Option<String>) {
    let failure = match try_spawn_scheduler(binary_path, args_path, log_path) {
        Ok(Some((child, log))) => return (Some(child), Some(log)),
        Ok(None) => return (None, None),
        Err(failure) => failure,
    };

    // Step 1: forward whatever output describes the failure, and pick the
    // lifecycle event that goes with it.
    let (phase, detail) = match failure {
        SpawnSchedulerError::SpawnFailed(e) => {
            tracing::error!(err = %e, "ktstr-init: spawn scheduler failed");
            // There is no scheduler log to dump, so emit a synthetic one
            // framed by the same start/end markers.
            crate::vmm::guest_comms::send_sched_log(crate::verifier::SCHED_OUTPUT_START.as_bytes());
            send_sched_log_text(&format!("failed to spawn: {e}"));
            crate::vmm::guest_comms::send_sched_log(crate::verifier::SCHED_OUTPUT_END.as_bytes());
            (crate::vmm::wire::LifecyclePhase::SchedulerDied, "")
        }
        SpawnSchedulerError::StartupDied { log_path } => {
            dump_sched_output(&log_path);
            (crate::vmm::wire::LifecyclePhase::SchedulerDied, "")
        }
        SpawnSchedulerError::NotAttached { reason, log_path } => {
            dump_sched_output(&log_path);
            (
                crate::vmm::wire::LifecyclePhase::SchedulerNotAttached,
                reason,
            )
        }
    };

    // Step 2: the shutdown sequence shared by every failure.
    crate::vmm::guest_comms::send_lifecycle(phase, detail);
    crate::vmm::guest_comms::send_exit(1);
    drain_probe_pipeline(probe_drain.as_ref());
    force_reboot()
}