ktstr 0.17.0

Test harness for Linux process schedulers
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
//! Scheduler / clock / metric helpers used by `worker_main`. Holds
//! the schedstat parser and reader, NUMA / vmstat readers, the
//! `clock_gettime_ns` wrapper, and `set_sched_policy` (which lowers
//! [`SchedPolicy`] to `sched_setattr` / `sched_setscheduler`).
//! Extracted from `worker/mod.rs` so the production file stays
//! under the per-file line budget.

use anyhow::Result;
use std::collections::BTreeMap;
use std::time::Duration;

use super::super::config::SchedPolicy;

/// Fetch one task's scheduler statistics from `/proc` as
/// `(cpu_time_ns, run_delay_ns, timeslices)`.
///
/// The `tid` argument picks the file that is read:
/// - `None` reads `/proc/self/schedstat`. `/proc/self` resolves to
///   `/proc/<TGID>`, i.e. the thread-group leader. That is the right
///   task for `CloneMode::Fork` workers, each of which is its own
///   thread-group leader (`gettid() == getpid()`).
/// - `Some(tid)` reads `/proc/self/task/<tid>/schedstat`. This is
///   needed for `CloneMode::Thread` workers: every thread of the
///   parent tgid sees the same `/proc/self/schedstat` (the leader's
///   numbers), so a thread worker reading it would report the test
///   runner's stats instead of its own. The per-task file exposes
///   field 1 = `task->se.sum_exec_runtime` (on-CPU ns) and fields
///   2/3 = `task->sched_info.run_delay` / `pcount`.
///
/// # Returns
///
/// `None` when the file cannot be read (for example a kernel built
/// without `CONFIG_SCHEDSTATS`, or no usable `/proc`) or when one of
/// the first three whitespace-separated fields is absent or is not a
/// valid `u64`. `None` is deliberately distinct from `Some((0, 0, 0))`:
/// a zero tuple on failure would conflate "schedstats disabled",
/// "I/O error" and "worker has not run yet", and starvation-style
/// assertions would then check the wrong invariant.
///
/// # Side effects
///
/// A read failure triggers `warn_schedstat_unavailable_once`, which
/// prints to stderr at most once per process so the log names the
/// cause without repeating it for every worker. Parse failures are
/// silent: a schedstat file that opens but does not parse points at
/// kernel-ABI drift, which is something for a maintainer to
/// investigate rather than per-run log noise.
pub(super) fn read_schedstat(tid: Option<libc::pid_t>) -> Option<(u64, u64, u64)> {
    use std::borrow::Cow;

    // Only the per-thread path needs an allocation; the leader path
    // is a static string.
    let schedstat_path: Cow<'static, str> = tid.map_or(
        Cow::Borrowed("/proc/self/schedstat"),
        |t| Cow::Owned(format!("/proc/self/task/{t}/schedstat")),
    );
    match std::fs::read_to_string(&*schedstat_path) {
        Ok(contents) => parse_schedstat_line(&contents),
        Err(_) => {
            warn_schedstat_unavailable_once();
            None
        }
    }
}

/// I/O-free half of [`read_schedstat`], kept separate so it can be
/// unit-tested with synthetic input.
///
/// Interprets the first three whitespace-separated tokens of `data`
/// as `(cpu_time_ns, run_delay_ns, timeslices)`. Anything after the
/// third token is ignored. Returns `None`, without logging, as soon
/// as one of the three tokens is missing or does not parse as `u64`
/// (empty input, truncated line, non-numeric or out-of-range token).
pub(super) fn parse_schedstat_line(data: &str) -> Option<(u64, u64, u64)> {
    let mut tokens = data.split_whitespace();
    let mut fields = [0u64; 3];
    // Fill the slots left to right; the first missing or malformed
    // token aborts the whole parse.
    for slot in fields.iter_mut() {
        *slot = tokens.next()?.parse::<u64>().ok()?;
    }
    let [cpu_time, run_delay, timeslices] = fields;
    Some((cpu_time, run_delay, timeslices))
}

/// Emit the "schedstat unavailable" notice on stderr at most once
/// per process.
///
/// Every worker that fails to read its schedstat file ends up here,
/// so without a gate a kernel lacking `CONFIG_SCHEDSTATS` would
/// produce one duplicate line per failed read. The gate is a
/// function-local `Once` claimed through `claim_warn_slot`; only the
/// caller that wins the claim prints.
pub(super) fn warn_schedstat_unavailable_once() {
    static GATE: std::sync::Once = std::sync::Once::new();
    if !claim_warn_slot(&GATE) {
        // Somebody already reported it.
        return;
    }
    eprintln!(
        "workload: /proc/self/schedstat unavailable (CONFIG_SCHEDSTATS off?); \
         schedstat_* fields in WorkerReport will be zero"
    );
}

/// Try to take the single warning slot represented by `once`.
///
/// Returns `true` for exactly one call over the lifetime of a given
/// `Once` (the first), and `false` for every later call. Callers
/// print their warning only on `true`, which makes this the testable
/// core of the "warn at most once" behavior: if the gate were ever
/// dropped, repeated calls would keep returning `true` and the unit
/// test would catch it. The decision is delegated to
/// [`std::sync::Once::call_once`], so it stays race-free when several
/// threads share one process-wide `Once`.
pub(super) fn claim_warn_slot(once: &std::sync::Once) -> bool {
    // The closure runs only for the winning caller, so the flag is
    // flipped for that caller alone.
    let won = std::cell::Cell::new(false);
    once.call_once(|| won.set(true));
    won.get()
}

/// Total the per-node page counts of every VMA listed in
/// `/proc/self/numa_maps`, keyed by NUMA node id.
///
/// Best-effort: if the file cannot be read the result is an empty
/// map rather than an error.
pub(super) fn read_numa_maps_pages() -> BTreeMap<usize, u64> {
    std::fs::read_to_string("/proc/self/numa_maps")
        .map(|text| {
            let vmas = crate::assert::parse_numa_maps(&text);
            let mut per_node = BTreeMap::<usize, u64>::new();
            // Flatten all VMAs into one stream of (node, pages) pairs.
            for (&node, &pages) in vmas.iter().flat_map(|vma| &vma.node_pages) {
                *per_node.entry(node).or_insert(0) += pages;
            }
            per_node
        })
        .unwrap_or_default()
}

/// Per-node page counts restricted to ONE VMA: the `/proc/self/numa_maps`
/// entry whose start address is exactly `region_addr`. The map is empty when
/// the file cannot be read or when no entry starts at that address (region
/// unmapped, never faulted, or a wrong address).
///
/// The whole-process figure from [`read_numa_maps_pages`] is dominated by
/// binary/libc/stack pages that were COW-inherited from the parent BEFORE the
/// worker called `set_mempolicy`; the policy never placed those pages because
/// `set_mempolicy` only governs future faults. A `MemPolicy` locality test
/// has to measure the policy-governed allocation alone, so the count is
/// scoped to the worker's own working-set region (an `mmap` first touched
/// under the policy). The region must still be mapped when this is called.
pub(super) fn read_numa_maps_region_pages(region_addr: u64) -> BTreeMap<usize, u64> {
    std::fs::read_to_string("/proc/self/numa_maps")
        .map(|text| sum_region_node_pages(&crate::assert::parse_numa_maps(&text), region_addr))
        .unwrap_or_default()
}

/// Add up per-node page counts over the parsed numa_maps entries whose VMA
/// start address is exactly `region_addr`; all other entries are ignored.
///
/// This is the `/proc`-free core of [`read_numa_maps_region_pages`], split
/// out so the address-based VMA selection can be unit-tested on synthetic
/// input. Returns an empty map when nothing matches.
fn sum_region_node_pages(
    entries: &[crate::assert::NumaMapsEntry],
    region_addr: u64,
) -> BTreeMap<usize, u64> {
    entries
        .iter()
        .filter(|vma| vma.addr == region_addr)
        .flat_map(|vma| &vma.node_pages)
        .fold(
            BTreeMap::<usize, u64>::new(),
            |mut per_node, (&node, &pages)| {
                *per_node.entry(node).or_insert(0) += pages;
                per_node
            },
        )
}

/// Look up the `numa_pages_migrated` counter in `/proc/vmstat`.
///
/// Best-effort: yields 0 both when the file cannot be read and when
/// the parser does not produce a value, so a 0 result does not by
/// itself prove that no pages were migrated.
pub(super) fn read_vmstat_numa_pages_migrated() -> u64 {
    std::fs::read_to_string("/proc/vmstat")
        .map(|text| crate::assert::parse_vmstat_numa_pages_migrated(&text).unwrap_or(0))
        .unwrap_or(0)
}

/// Sample clock `clk` with `clock_gettime` and flatten the timespec
/// into nanoseconds (`tv_sec * 1e9 + tv_nsec`).
///
/// What the number means depends on the clock: for `CLOCK_MONOTONIC`
/// it is nanoseconds since an unspecified boot epoch, for
/// `CLOCK_THREAD_CPUTIME_ID` it is CPU time charged to the calling
/// thread.
///
/// Returns `None` when the syscall reports failure, after routing the
/// failure through `warn_clock_gettime_failed_once`. Keeping the
/// error check here means callers neither consume a garbage timespec
/// (which previously fed wake-latency reservoirs) nor receive a 0
/// sentinel that cannot be told apart from a genuine zero reading.
pub(super) fn clock_gettime_ns(clk: libc::clockid_t) -> Option<u64> {
    let mut now = libc::timespec {
        tv_sec: 0,
        tv_nsec: 0,
    };
    // SAFETY: `now` is an initialized stack local that is exclusively
    // borrowed for the duration of the call; `clock_gettime` only
    // writes through the pointer it is handed.
    if unsafe { libc::clock_gettime(clk, &mut now) } != 0 {
        warn_clock_gettime_failed_once(clk);
        return None;
    }
    let whole_seconds_ns = (now.tv_sec as u64) * 1_000_000_000;
    let sub_second_ns = now.tv_nsec as u64;
    Some(whole_seconds_ns + sub_second_ns)
}

/// Report a `clock_gettime` failure on stderr at most once per clock
/// id for the lifetime of the process.
///
/// The motivation matches `warn_schedstat_unavailable_once`: on a
/// misconfigured host many workers hit the same failure, and one
/// line per clock is enough. Each supported clock has its own
/// function-local `Once`.
///
/// # Panics
///
/// Only `CLOCK_THREAD_CPUTIME_ID` and `CLOCK_MONOTONIC` are
/// supported. Any other clock id is treated as a programming error
/// and hits `unreachable!`, in preference to a speculative catch-all
/// gate.
pub(super) fn warn_clock_gettime_failed_once(clk: libc::clockid_t) {
    static WARNED_THREAD: std::sync::Once = std::sync::Once::new();
    static WARNED_MONO: std::sync::Once = std::sync::Once::new();
    let gate = if clk == libc::CLOCK_THREAD_CPUTIME_ID {
        &WARNED_THREAD
    } else if clk == libc::CLOCK_MONOTONIC {
        &WARNED_MONO
    } else {
        unreachable!("unexpected clockid {clk}")
    };
    gate.call_once(|| {
        // errno is fetched inside the closure on purpose: once the
        // gate has fired the closure never runs again, so later
        // callers do not pay for an error value they would discard.
        let errno = std::io::Error::last_os_error();
        eprintln!(
            "workload: clock_gettime(clk={clk}) failed: {errno}; affected samples will be zero or skipped"
        );
    });
}

/// CPU time consumed so far by the calling thread, in nanoseconds.
///
/// Thin wrapper over `clock_gettime_ns(CLOCK_THREAD_CPUTIME_ID)` that
/// folds a failed read into 0 (the one-shot stderr warning is issued
/// by `clock_gettime_ns`). Callers use the value as a cumulative
/// per-thread counter and have no practical way to separate "zero
/// ns" from "clock failed" at nanosecond granularity, so the 0
/// fallback is an accepted compromise. Failure is expected to be
/// near-impossible on Linux, where `CLOCK_THREAD_CPUTIME_ID` has been
/// available by default since 2.6.12. Code that must cope with a
/// real failure should call `clock_gettime_ns` directly and handle
/// `None`.
pub(super) fn thread_cpu_time_ns() -> u64 {
    let reading = clock_gettime_ns(libc::CLOCK_THREAD_CPUTIME_ID);
    // `u64::default()` is 0, the documented fallback.
    reading.unwrap_or_default()
}

/// Turn a [`Duration`] into the `u64` nanosecond count that
/// `sched_setattr(2)` expects, refusing any value that would set
/// bit 63 (the constraint `__checkparam_dl` places on
/// `sched_deadline` and `sched_period`).
///
/// `Duration::as_nanos()` yields a `u128` while the UAPI fields are
/// `u64`. A duration above `i64::MAX` ns (~292 years) would either
/// set the reserved top bit of the truncated `u64` or wrap on the
/// cast. Both cases are rejected here so the caller gets an error
/// naming the field instead of a bare kernel `EINVAL` after a silent
/// truncation.
///
/// `field` is the human-readable label ("runtime", "deadline",
/// "period") interpolated into the error message.
///
/// # Errors
///
/// Fails when the duration is longer than `i64::MAX` nanoseconds.
pub(super) fn duration_to_kernel_ns(d: Duration, field: &str) -> Result<u64> {
    let ns_u128 = d.as_nanos();
    let ceiling = i64::MAX as u128;
    if ns_u128 <= ceiling {
        // Fits in 63 bits, so the narrowing cast is lossless.
        return Ok(ns_u128 as u64);
    }
    anyhow::bail!(
        "sched_setattr: {field} duration ({ns_u128} ns) exceeds i64::MAX — \
         nanosecond count must fit in 63 bits (kernel reserves bit 63)"
    )
}

/// Lower a [`SchedPolicy`] to a `sched_setscheduler` / `sched_setattr`
/// syscall and apply it to `pid`. `Normal` is a no-op (the kernel
/// default); `Batch`/`Idle` use `SCHED_BATCH`/`SCHED_IDLE` with prio
/// 0; `Fifo`/`RoundRobin` clamp the static priority to `1..=99` and
/// use `sched_setscheduler`; `Deadline` runs the kernel's
/// `__checkparam_dl` structural checks (zero-deadline rejection,
/// `runtime <= deadline <= period` ordering when period is
/// non-zero, DL_SCALE 1024 ns floor, bit-63-clear overflow guard
/// via `duration_to_kernel_ns`) before issuing
/// `syscall(SYS_sched_setattr, ...)` directly because glibc does
/// not wrap that syscall.
///
/// The `pid > 0` check runs before the policy is inspected, so even
/// `Normal` is rejected for a non-positive `pid`.
///
/// # Errors
///
/// Returns `Err` when:
/// - `pid <= 0`;
/// - a `Deadline` parameter fails a pre-flight check (the message
///   names the offending field and value);
/// - the kernel rejects the request, in which case the message is
///   the syscall name followed by the OS error (e.g. EINVAL/EPERM).
///
/// The caller treats failure as fatal (worker bails to its cleanup
/// tail).
pub(super) fn set_sched_policy(pid: libc::pid_t, policy: SchedPolicy) -> Result<()> {
    // Reject pid <= 0: pid 0 would make the scheduling syscalls act
    // on the caller instead of a worker, and negative values are
    // never a real worker pid (in `kill(2)`-style APIs they select
    // broadcast / process-group targets). None are valid inputs from
    // within this crate, which only ever stores real worker pids.
    // Mirrors `process_alive` in scenario/mod.rs.
    if pid <= 0 {
        anyhow::bail!("sched_setscheduler: invalid pid {pid} (must be > 0)");
    }
    // Every arm either yields a `(policy, priority)` pair for the
    // shared `sched_setscheduler` tail below or returns early
    // (`Normal`, `Deadline`).
    let (pol, prio) = match policy {
        SchedPolicy::Normal => return Ok(()),
        SchedPolicy::Batch => (libc::SCHED_BATCH, 0),
        SchedPolicy::Idle => (libc::SCHED_IDLE, 0),
        SchedPolicy::Fifo(p) => (libc::SCHED_FIFO, p.clamp(1, 99) as i32),
        SchedPolicy::RoundRobin(p) => (libc::SCHED_RR, p.clamp(1, 99) as i32),
        SchedPolicy::Deadline {
            runtime,
            deadline,
            period,
        } => {
            // SCHED_DEADLINE has no `sched_param` representation —
            // the kernel only accepts it through `sched_setattr(2)`.
            // glibc does not wrap that syscall, so we issue it
            // directly via `syscall(SYS_sched_setattr, ...)`.
            //
            // `__checkparam_dl` (kernel/sched/deadline.c) rejects
            // anything that violates `sched_deadline != 0`,
            // `runtime >= 1024 ns`, the bit-63-clear requirement on
            // `deadline`/`period`, the `runtime <= deadline <=
            // effective_period` ordering (where `effective_period`
            // is `sched_deadline` when `sched_period == 0`), and
            // the sysctl-controlled period bounds. The sysctl
            // values are runtime-tunable via
            // `/proc/sys/kernel/sched_deadline_period_{min,max}_us`,
            // so this pre-validation only mirrors the structural
            // checks (zero-deadline, ordering, top-bit, DL_SCALE
            // floor) — the sysctl bound check happens kernel-side
            // and surfaces as a syscall EINVAL.
            //
            // The Duration → u64 ns conversions ALSO enforce the
            // kernel's bit-63-clear constraint as a single
            // i64::MAX overflow check in `duration_to_kernel_ns`:
            // `Duration::as_nanos()` returns `u128`, and a value
            // exceeding `i64::MAX` would either flip bit 63 of the
            // truncated u64 (kernel reserved) or wrap on the cast
            // entirely. Doing the conversion here keeps the
            // top-bit check and the syscall arg in lockstep.
            if deadline.is_zero() {
                anyhow::bail!(
                    "sched_setattr: deadline must be > 0 (kernel `__checkparam_dl` rejects zero deadline)"
                );
            }
            let runtime_ns = duration_to_kernel_ns(runtime, "runtime")?;
            let deadline_ns = duration_to_kernel_ns(deadline, "deadline")?;
            let period_ns = duration_to_kernel_ns(period, "period")?;
            if runtime_ns < 1024 {
                anyhow::bail!(
                    "sched_setattr: runtime ({runtime_ns} ns) below kernel DL_SCALE floor (1024 ns)"
                );
            }
            if runtime_ns > deadline_ns {
                anyhow::bail!(
                    "sched_setattr: runtime ({runtime_ns} ns) > deadline ({deadline_ns} ns)"
                );
            }
            // `period == Duration::ZERO` is legal: the kernel
            // substitutes `sched_deadline` for the period in that
            // case (see `if (!period) period = attr->sched_deadline;`
            // in `__checkparam_dl`). Only enforce `deadline <=
            // period` when period is non-zero.
            if period_ns != 0 && deadline_ns > period_ns {
                anyhow::bail!(
                    "sched_setattr: deadline ({deadline_ns} ns) > period ({period_ns} ns)"
                );
            }
            // SAFETY: `sched_attr` is a UAPI struct of plain
            // integer fields (no padding bytes affect kernel
            // behavior; the kernel reads `size` and treats unknown
            // tail as zero). Zero-initializing is the canonical
            // way to construct it because libc's `s!` macro
            // derives only `Clone, Copy, Debug` — no `Default`.
            let mut attr: libc::sched_attr = unsafe { std::mem::zeroed() };
            attr.size = std::mem::size_of::<libc::sched_attr>() as u32;
            attr.sched_policy = libc::SCHED_DEADLINE as u32;
            attr.sched_runtime = runtime_ns;
            attr.sched_deadline = deadline_ns;
            attr.sched_period = period_ns;
            // sched_setattr(pid_t pid, struct sched_attr *attr,
            //               unsigned int flags). flags=0 — the
            // kernel reserves them for future use.
            //
            // SAFETY:
            // - `pid` is validated > 0 at the top of
            //   `set_sched_policy`, so the kernel cannot interpret
            //   it as the broadcast / process-group target encoded
            //   by 0 / negative pid_t values.
            // - `&attr` is a borrow of a stack local that lives
            //   for the entire syscall — we do not move or drop
            //   `attr` between the borrow and the syscall return.
            //   `libc::sched_attr` is `#[repr(C)]` (UAPI) and was
            //   zeroed via `std::mem::zeroed()` then field-
            //   initialized, so the bytes the kernel reads are
            //   either the values explicitly set above or zero
            //   (the kernel-defined unset value for every
            //   remaining field).
            // - `attr.size` is the actual `size_of::<libc::sched_attr>()`
            //   the kernel ABI expects for `sched_setattr(2)`'s
            //   forward-compat protocol: the kernel uses `size`
            //   to gate which fields it reads and ignores tail
            //   bytes beyond its own struct definition. Sending
            //   our struct's size and zeroing the body cleanly
            //   covers older AND newer kernels.
            // - `flags = 0u32` is the only currently-defined
            //   value; the kernel rejects unknown flag bits with
            //   EINVAL.
            // - The kernel copies `attr` into kernel space inside
            //   the syscall (`copy_struct_from_user` in
            //   kernel/sched/syscalls.c) and does not retain a
            //   reference to our stack memory after the syscall
            //   returns, so the borrow only needs to outlive the
            //   single syscall.
            let ret = unsafe {
                libc::syscall(
                    libc::SYS_sched_setattr,
                    pid,
                    &attr as *const libc::sched_attr,
                    0u32,
                )
            };
            if ret != 0 {
                anyhow::bail!("sched_setattr: {}", std::io::Error::last_os_error());
            }
            return Ok(());
        }
    };
    // Shared tail for the `sched_param`-based policies (Batch, Idle,
    // Fifo, RoundRobin).
    let param = libc::sched_param {
        sched_priority: prio,
    };
    // SAFETY: `pid` was validated > 0 above, and `&param` borrows a
    // fully initialized stack local that outlives the call.
    if unsafe { libc::sched_setscheduler(pid, pol, &param) } != 0 {
        anyhow::bail!("sched_setscheduler: {}", std::io::Error::last_os_error());
    }
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::assert::parse_numa_maps;

    /// Only the VMA that starts at `region_addr` may contribute `N<node>=`
    /// counts; every other (COW-inherited) VMA must be left out. That scoping
    /// is what lets `page_locality` describe the policy-governed allocation
    /// instead of the whole-process RSS.
    #[test]
    fn region_pages_scopes_to_matching_vma() {
        // Fixture: an inherited file-backed decoy at 0x400000 followed by the
        // policy-governed anonymous region at 0x7f0000000000.
        let content = "\
00400000 default file=/usr/bin/x mapped=300 N0=200 N1=100\n\
7f0000000000 default anon=16384 dirty=16384 N0=16384\n";
        let region_addr: u64 = 0x7f00_0000_0000;
        let region = sum_region_node_pages(&parse_numa_maps(content), region_addr);

        let node0 = region.get(&0).copied();
        assert_eq!(node0, Some(16384), "region node-0 pages summed");
        assert!(!region.contains_key(&1), "region has no node-1 pages");

        let total = region.values().sum::<u64>();
        assert_eq!(
            total, 16384,
            "decoy file VMA (N0=200,N1=100) must be excluded from the region scope"
        );
    }

    /// When no VMA starts at `region_addr` the result is an empty map. The
    /// caller turns that into total=0 -> page_locality 0.0 -> a hard FAIL,
    /// never a vacuous pass.
    #[test]
    fn region_pages_empty_when_no_vma_matches() {
        let entries = parse_numa_maps("00400000 default anon=10 N0=10\n");
        let region = sum_region_node_pages(&entries, 0xdead_beef);
        assert!(region.is_empty());
    }
}