use anyhow::Result;
use std::collections::BTreeMap;
use std::time::Duration;
use super::super::config::SchedPolicy;
/// Reads the schedstat counters for the current process or one of its threads.
///
/// With `tid == None` the file read is `/proc/self/schedstat`; with `Some(t)`
/// it is `/proc/self/task/{t}/schedstat`. The file contents are handed to
/// `parse_schedstat_line` and its result is returned unchanged.
///
/// Returns `None` when the file cannot be read (after triggering the one-time
/// "schedstat unavailable" warning) or when the contents fail to parse.
pub(super) fn read_schedstat(tid: Option<libc::pid_t>) -> Option<(u64, u64, u64)> {
    use std::borrow::Cow;

    // Only the per-thread path needs an allocation; the process-wide path is a static str.
    let path: Cow<'static, str> = tid.map_or(Cow::Borrowed("/proc/self/schedstat"), |t| {
        Cow::Owned(format!("/proc/self/task/{t}/schedstat"))
    });

    match std::fs::read_to_string(&*path) {
        Ok(contents) => parse_schedstat_line(&contents),
        Err(_) => {
            warn_schedstat_unavailable_once();
            None
        }
    }
}
/// Parses the first three whitespace-separated fields of a schedstat line.
///
/// The fields are returned in file order as `(cpu_time, run_delay,
/// timeslices)`. Tokens after the third are ignored. Returns `None` when fewer
/// than three tokens are present or when any of the first three is not a valid
/// `u64`.
pub(super) fn parse_schedstat_line(data: &str) -> Option<(u64, u64, u64)> {
    let mut tokens = data.split_whitespace();
    // Pull the next token and parse it; a missing token and a malformed one both yield `None`.
    let mut next_field = || -> Option<u64> { tokens.next()?.parse::<u64>().ok() };

    let cpu_time = next_field()?;
    let run_delay = next_field()?;
    let timeslices = next_field()?;
    Some((cpu_time, run_delay, timeslices))
}
/// Prints the "schedstat unavailable" notice to stderr at most once per process.
///
/// The call that claims the function-local `Once` prints the message; every
/// other call returns without output.
pub(super) fn warn_schedstat_unavailable_once() {
    static WARNED: std::sync::Once = std::sync::Once::new();

    // Someone else already claimed the slot: nothing to do.
    if !claim_warn_slot(&WARNED) {
        return;
    }
    eprintln!(
        "workload: /proc/self/schedstat unavailable (CONFIG_SCHEDSTATS off?); \
         schedstat_* fields in WorkerReport will be zero"
    );
}
/// Reports whether this call is the one that completed `once`.
///
/// Returns `true` for exactly one call per `Once` — the call whose closure
/// actually ran — and `false` for every other call.
pub(super) fn claim_warn_slot(once: &std::sync::Once) -> bool {
    // Already completed: `call_once` would skip the closure, so the answer is `false`.
    if once.is_completed() {
        return false;
    }
    let mut claimed = false;
    once.call_once(|| claimed = true);
    claimed
}
/// Sums per-node page counts across every entry of `/proc/self/numa_maps`.
///
/// Returns a map from node id to the total of the page counts that
/// `crate::assert::parse_numa_maps` reports for that node. The map is empty
/// when the file cannot be read.
pub(super) fn read_numa_maps_pages() -> BTreeMap<usize, u64> {
    let mut per_node: BTreeMap<usize, u64> = BTreeMap::new();

    // An unreadable file is treated as "no data" rather than as an error.
    if let Ok(text) = std::fs::read_to_string("/proc/self/numa_maps") {
        let mappings = crate::assert::parse_numa_maps(&text);
        for mapping in &mappings {
            for (&node, &pages) in &mapping.node_pages {
                *per_node.entry(node).or_default() += pages;
            }
        }
    }
    per_node
}
/// Sums per-node page counts for the `/proc/self/numa_maps` entries whose
/// address equals `region_addr`.
///
/// Returns an empty map when the file cannot be read or when no entry matches.
pub(super) fn read_numa_maps_region_pages(region_addr: u64) -> BTreeMap<usize, u64> {
    match std::fs::read_to_string("/proc/self/numa_maps") {
        Ok(text) => {
            let parsed = crate::assert::parse_numa_maps(&text);
            sum_region_node_pages(&parsed, region_addr)
        }
        // Unreadable file: report no pages rather than failing.
        Err(_) => BTreeMap::new(),
    }
}
/// Accumulates node → page-count totals over the entries whose `addr` equals
/// `region_addr`.
///
/// Entries with any other address are ignored. If several entries share the
/// address, their counts are added together. Returns an empty map when nothing
/// matches.
fn sum_region_node_pages(
    entries: &[crate::assert::NumaMapsEntry],
    region_addr: u64,
) -> BTreeMap<usize, u64> {
    let mut per_node: BTreeMap<usize, u64> = BTreeMap::new();
    let in_region = entries.iter().filter(|e| e.addr == region_addr);
    for matching in in_region {
        for (&node, &pages) in &matching.node_pages {
            *per_node.entry(node).or_default() += pages;
        }
    }
    per_node
}
/// Reads `/proc/vmstat` and returns the value extracted by
/// `crate::assert::parse_vmstat_numa_pages_migrated`.
///
/// Falls back to `0` when the file cannot be read or when the parser yields no
/// value.
pub(super) fn read_vmstat_numa_pages_migrated() -> u64 {
    std::fs::read_to_string("/proc/vmstat").map_or(0, |text| {
        crate::assert::parse_vmstat_numa_pages_migrated(&text).unwrap_or(0)
    })
}
/// Samples clock `clk` via `clock_gettime` and returns the reading in nanoseconds.
///
/// The result is `tv_sec * 1_000_000_000 + tv_nsec`. Returns `None` when the
/// call reports failure, after routing the failure through
/// `warn_clock_gettime_failed_once`.
pub(super) fn clock_gettime_ns(clk: libc::clockid_t) -> Option<u64> {
    let mut now = libc::timespec {
        tv_sec: 0,
        tv_nsec: 0,
    };
    // SAFETY: `now` is a live, initialised `timespec` and the pointer derived
    // from `&mut now` is valid for writes for the duration of the call.
    let status = unsafe { libc::clock_gettime(clk, &mut now) };
    if status == 0 {
        let secs = now.tv_sec as u64;
        let nanos = now.tv_nsec as u64;
        Some(secs * 1_000_000_000 + nanos)
    } else {
        warn_clock_gettime_failed_once(clk);
        None
    }
}
/// Prints a one-time stderr warning that `clock_gettime` failed for `clk`.
///
/// `CLOCK_THREAD_CPUTIME_ID` and `CLOCK_MONOTONIC` each have their own
/// once-guard, so a failure of one does not suppress the warning for the
/// other. Every other clock id shares a single fallback guard: the first such
/// failure is reported and later ones (for any other clock id) are silent.
/// This function never panics on an unrecognised clock id — a diagnostic
/// helper must not abort the process it is reporting on.
///
/// Must be called immediately after the failing `clock_gettime` so that the
/// thread's errno still describes that failure.
pub(super) fn warn_clock_gettime_failed_once(clk: libc::clockid_t) {
    static WARNED_THREAD: std::sync::Once = std::sync::Once::new();
    static WARNED_MONO: std::sync::Once = std::sync::Once::new();
    static WARNED_OTHER: std::sync::Once = std::sync::Once::new();

    // Read errno before anything else runs so no intervening call (including
    // the `Once` machinery below) can overwrite it.
    let errno = std::io::Error::last_os_error();

    let once = match clk {
        libc::CLOCK_THREAD_CPUTIME_ID => &WARNED_THREAD,
        libc::CLOCK_MONOTONIC => &WARNED_MONO,
        _ => &WARNED_OTHER,
    };
    once.call_once(|| {
        eprintln!(
            "workload: clock_gettime(clk={clk}) failed: {errno}; affected samples will be zero or skipped"
        );
    });
}
/// Returns the `CLOCK_THREAD_CPUTIME_ID` reading in nanoseconds, or `0` when
/// the clock could not be read.
pub(super) fn thread_cpu_time_ns() -> u64 {
    // `u64::default()` is 0, which is the documented fallback for a failed read.
    clock_gettime_ns(libc::CLOCK_THREAD_CPUTIME_ID).unwrap_or_default()
}
/// Converts `d` to a nanosecond count that fits in 63 bits.
///
/// `field` is used only to name the offending parameter in the error message.
///
/// # Errors
///
/// Fails when the duration's nanosecond count exceeds `i64::MAX`.
pub(super) fn duration_to_kernel_ns(d: Duration, field: &str) -> Result<u64> {
    let ns_u128 = d.as_nanos();
    // A successful conversion to `i64` is exactly the "fits in 63 bits" check;
    // the value is non-negative, so widening back to `u64` is lossless.
    match i64::try_from(ns_u128) {
        Ok(ns) => Ok(ns as u64),
        Err(_) => anyhow::bail!(
            "sched_setattr: {field} duration ({ns_u128} ns) exceeds i64::MAX — \
             nanosecond count must fit in 63 bits (kernel reserves bit 63)"
        ),
    }
}
/// Applies `policy` to the task identified by `pid`.
///
/// * `SchedPolicy::Normal` is a no-op: the function returns `Ok(())` without
///   issuing any syscall.
/// * `Batch` and `Idle` go through `sched_setscheduler` with priority 0.
/// * `Fifo(p)` and `RoundRobin(p)` go through `sched_setscheduler` with `p`
///   clamped into `1..=99`.
/// * `Deadline { runtime, deadline, period }` is validated locally and then
///   applied with the raw `sched_setattr` syscall.
///
/// # Errors
///
/// * `pid <= 0` is rejected up front.
/// * For `Deadline`: a zero `deadline`; any duration whose nanosecond count
///   exceeds `i64::MAX`; `runtime` below 1024 ns; `runtime > deadline`; or a
///   non-zero `period` smaller than `deadline`.
/// * Any non-zero return from `sched_setattr` / `sched_setscheduler`, reported
///   with the OS error.
pub(super) fn set_sched_policy(pid: libc::pid_t, policy: SchedPolicy) -> Result<()> {
    if pid <= 0 {
        anyhow::bail!("sched_setscheduler: invalid pid {pid} (must be > 0)");
    }
    let (pol, prio) = match policy {
        SchedPolicy::Normal => return Ok(()),
        SchedPolicy::Batch => (libc::SCHED_BATCH, 0),
        SchedPolicy::Idle => (libc::SCHED_IDLE, 0),
        SchedPolicy::Fifo(p) => (libc::SCHED_FIFO, p.clamp(1, 99) as i32),
        SchedPolicy::RoundRobin(p) => (libc::SCHED_RR, p.clamp(1, 99) as i32),
        SchedPolicy::Deadline {
            runtime,
            deadline,
            period,
        } => {
            // Validate locally so the caller gets a specific message instead of
            // a bare error from the syscall.
            if deadline.is_zero() {
                anyhow::bail!(
                    "sched_setattr: deadline must be > 0 (kernel `__checkparam_dl` rejects zero deadline)"
                );
            }
            let runtime_ns = duration_to_kernel_ns(runtime, "runtime")?;
            let deadline_ns = duration_to_kernel_ns(deadline, "deadline")?;
            let period_ns = duration_to_kernel_ns(period, "period")?;
            if runtime_ns < 1024 {
                anyhow::bail!(
                    "sched_setattr: runtime ({runtime_ns} ns) below kernel DL_SCALE floor (1024 ns)"
                );
            }
            if runtime_ns > deadline_ns {
                anyhow::bail!(
                    "sched_setattr: runtime ({runtime_ns} ns) > deadline ({deadline_ns} ns)"
                );
            }
            // A zero period is passed through unchecked; only a non-zero period
            // is compared against the deadline.
            if period_ns != 0 && deadline_ns > period_ns {
                anyhow::bail!(
                    "sched_setattr: deadline ({deadline_ns} ns) > period ({period_ns} ns)"
                );
            }
            // SAFETY: `sched_attr` is a plain C struct of integer fields, for
            // which the all-zero bit pattern is a valid value.
            let mut attr: libc::sched_attr = unsafe { std::mem::zeroed() };
            attr.size = std::mem::size_of::<libc::sched_attr>() as u32;
            attr.sched_policy = libc::SCHED_DEADLINE as u32;
            attr.sched_runtime = runtime_ns;
            attr.sched_deadline = deadline_ns;
            attr.sched_period = period_ns;
            // SAFETY: `attr` is fully initialised, `attr.size` matches the
            // struct being passed, and the pointer stays valid for the call.
            let ret = unsafe {
                libc::syscall(
                    libc::SYS_sched_setattr,
                    pid,
                    &attr as *const libc::sched_attr,
                    0u32,
                )
            };
            if ret != 0 {
                anyhow::bail!("sched_setattr: {}", std::io::Error::last_os_error());
            }
            return Ok(());
        }
    };
    let param = libc::sched_param {
        sched_priority: prio,
    };
    // SAFETY: `param` is a live, initialised `sched_param`; the reference is
    // valid for reads for the duration of the call.
    if unsafe { libc::sched_setscheduler(pid, pol, &param) } != 0 {
        anyhow::bail!("sched_setscheduler: {}", std::io::Error::last_os_error());
    }
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::assert::parse_numa_maps;

    /// Only the entry whose address equals the requested region contributes;
    /// the unrelated file-backed entry must not leak into the totals.
    #[test]
    fn region_pages_scopes_to_matching_vma() {
        let numa_maps = "\
            00400000 default file=/usr/bin/x mapped=300 N0=200 N1=100\n\
            7f0000000000 default anon=16384 dirty=16384 N0=16384\n";
        let parsed = parse_numa_maps(numa_maps);
        let per_node = sum_region_node_pages(&parsed, 0x7f00_0000_0000);

        assert_eq!(per_node.get(&0), Some(&16384), "region node-0 pages summed");
        assert_eq!(per_node.get(&1), None, "region has no node-1 pages");
        assert_eq!(
            per_node.values().sum::<u64>(),
            16384,
            "decoy file VMA (N0=200,N1=100) must be excluded from the region scope"
        );
    }

    /// A region address that matches no entry produces an empty map.
    #[test]
    fn region_pages_empty_when_no_vma_matches() {
        let parsed = parse_numa_maps("00400000 default anon=10 N0=10\n");
        let per_node = sum_region_node_pages(&parsed, 0xdead_beef);
        assert!(per_node.is_empty());
    }
}