ktstr 0.10.0

Test harness for Linux process schedulers
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
//! End-to-end exercise of the scheduler-lifecycle Ops
//! ([`Op::AttachScheduler`](ktstr::scenario::ops::Op::AttachScheduler),
//! [`Op::DetachScheduler`](ktstr::scenario::ops::Op::DetachScheduler),
//! [`Op::RestartScheduler`](ktstr::scenario::ops::Op::RestartScheduler),
//! [`Op::ReplaceScheduler`](ktstr::scenario::ops::Op::ReplaceScheduler))
//! against a real VM. Boots with scx-ktstr as the boot scheduler,
//! stages a second scx-ktstr-backed entry under a distinct name,
//! and runs Op::ReplaceScheduler mid-experiment to validate the
//! swap path end-to-end.
//!
//! The two `Scheduler` constants both resolve to the same scx-ktstr
//! binary via `SchedulerSpec::Discover("scx-ktstr")` — the staging
//! pipeline keys on `Scheduler.name` not on the binary path, so
//! distinct names land at distinct
//! `/staging/schedulers/<name>/scheduler` archive entries even when
//! the resolved binaries match. This keeps the test focused on the
//! lifecycle wire-up (kill prev / spawn new / SCHED_PID swap)
//! rather than requiring a second in-tree scheduler binary.

use anyhow::Result;
use ktstr::assert::AssertResult;
use ktstr::ktstr_test;
use ktstr::prelude::{SampleSeries, VmResult};
use ktstr::scenario::Ctx;
use ktstr::test_support::{Scheduler, SchedulerSpec};

/// Boot scheduler for the replace, restart and broken-binary tests in
/// this file: the entry named `lifecycle_primary`, backed by whatever
/// `SchedulerSpec::Discover("scx-ktstr")` resolves to.
const PRIMARY_SCHED: Scheduler =
    Scheduler::named("lifecycle_primary").binary(SchedulerSpec::Discover("scx-ktstr"));

/// Staged replacement target for the mid-experiment swap test. Uses the
/// same `scx-ktstr` discovery spec as [`PRIMARY_SCHED`] but carries the
/// distinct name `lifecycle_alt`; per the module docs, staging keys on
/// the name, so it lands in its own archive entry.
const STAGED_ALT_SCHED: Scheduler =
    Scheduler::named("lifecycle_alt").binary(SchedulerSpec::Discover("scx-ktstr"));

/// Boot baseline for `scheduler_attach_from_cold_start_succeeds`: the
/// kernel's default scheduler (EEVDF on current kernels), so no scx_*
/// userspace binary runs at boot. The test's `Op::AttachScheduler` step
/// then performs the first scx attach mid-scenario.
const COLD_START_BOOT: Scheduler = Scheduler::named("cold_start_boot").binary(SchedulerSpec::Eevdf);

/// Staged scheduler for the cold-start attach test: the entry named
/// `cold_start_alt`, backed by the discovered `scx-ktstr` binary. It is
/// the target of that test's `Op::attach_scheduler` step.
const COLD_START_ALT_SCHED: Scheduler =
    Scheduler::named("cold_start_alt").binary(SchedulerSpec::Discover("scx-ktstr"));

/// Scheduler entry launched with `--stall-after 1`, so it stops
/// dispatching one second into the run. The kernel's scx watchdog is
/// then expected to detect the stall and exit the scheduler with
/// `SCX_EXIT_ERROR_STALL`; the host sees that as a scheduler death,
/// which the dispatch-loop hold should observe and truncate on.
///
/// Used only by `dispatch_hold_truncates_when_scheduler_dies_midstep`.
const STALL_AFTER_1S_SCHED: Scheduler = Scheduler::named("stall_after_1s")
    .binary(SchedulerSpec::Discover("scx-ktstr"))
    .sched_args(&["--stall-after", "1"]);

/// Host-side `post_vm` check shared by the three scheduler-lifecycle
/// tests (attach / replace / restart).
///
/// Each of those tests runs its lifecycle Op followed by a short,
/// workload-free settle. On its own that only shows the Op chain did
/// not error — NOT that the post-op scheduler actually schedules.
/// scx-ktstr runs in FULL mode (no `SCX_OPS_SWITCH_PARTIAL`), so every
/// runnable fair-class guest task — init, kworkers, RCU kthreads, the
/// runner — flows through `ktstr_dispatch`, which bumps `nr_dispatched`
/// after `scx_bpf_dsq_move_to_local`. A non-zero `nr_dispatched` in a
/// periodic sample therefore shows the scheduler ran its dispatch path
/// (past the crash/stall/degrade/slow gates) on system traffic alone,
/// with no dedicated workload.
///
/// # Errors
///
/// Fails when:
/// - the bridge holds no periodic samples at all;
/// - no periodic sample yielded a readable `nr_dispatched` value (a
///   capture/read problem, reported separately so it is not mistaken
///   for a scheduling regression);
/// - every readable sample reported 0 (bind-without-dispatch: the Op
///   succeeded but the scheduler never scheduled).
///
/// NOTE(review): the check accepts a non-zero reading from ANY periodic
/// sample. In the replace/restart tests the boot scheduler is also
/// scx-ktstr, so a sample captured before the lifecycle op would
/// presumably satisfy it as well — confirm sample timing / counter
/// reset semantics before treating a pass as proof of post-op dispatch.
fn assert_post_op_dispatch(result: &VmResult) -> Result<()> {
    let series = SampleSeries::from_drained_typed(
        result.snapshot_bridge.drain_ordered_with_stats(),
        result.monitor.clone(),
    )
    .periodic_only();
    anyhow::ensure!(
        !series.is_empty(),
        "no periodic samples on the bridge — the freeze coordinator never \
         fired (periodic_target={}, periodic_fired={}); cannot prove the \
         scheduler dispatched after the lifecycle op",
        result.periodic_target,
        result.periodic_fired,
    );
    let bpf_dispatched = series.bpf("nr_dispatched", |snap| snap.var("nr_dispatched").as_u64());

    // Tally readable vs. unreadable slots in one pass so the failure
    // message can tell "counter stayed at 0" apart from "counter could
    // not be read at all".
    let mut readable = 0usize;
    let mut unreadable = 0usize;
    let mut any_progress = false;
    for (_, _, slot) in bpf_dispatched.iter_full() {
        match slot {
            Ok(v) => {
                readable += 1;
                any_progress |= *v > 0;
            }
            Err(_) => unreadable += 1,
        }
    }

    // `any_progress` implies `readable > 0`, so this only fires when no
    // slot produced a value.
    anyhow::ensure!(
        readable > 0,
        "scx-ktstr nr_dispatched was not readable in any periodic sample \
         ({} unreadable slot(s)) — dispatch progress cannot be judged. This \
         is a capture/read failure, not evidence of a bind-without-dispatch \
         regression.",
        unreadable,
    );
    anyhow::ensure!(
        any_progress,
        "scx-ktstr nr_dispatched read 0 across every periodic sample — the \
         scheduler bound to sched_ext (the lifecycle op succeeded) but never \
         ran its dispatch path. Bind-without-dispatch regression: the post-op \
         scheduler attached but isn't scheduling.",
    );
    Ok(())
}

/// Swaps the running scheduler for a staged one in the middle of a run.
///
/// The VM boots with `lifecycle_primary`; `lifecycle_alt` is staged
/// under `/staging/schedulers/lifecycle_alt/`. The middle step issues
/// Op::ReplaceScheduler, which on success:
///
/// 1. reads SCHED_PID (the boot scheduler's pid, set by start_scheduler);
/// 2. terminates that process through kill_scheduler_process
///    (SIGTERM, escalating to SIGKILL);
/// 3. zeroes SCHED_PID via set_sched_pid(0);
/// 4. spawns the staged binary from its per-name archive path;
/// 5. lets spawn_scheduler_from_paths store the new child's pid back
///    into SCHED_PID through its internal Release store.
///
/// Every failure mode (missing staged binary, kill failure, spawn
/// failure, attach failure) comes back as an actionable error on the
/// apply_ops error path, fails the step, and reaches the test verdict.
///
/// The body itself passes when the Op chain finishes without error; the
/// framework's scheduler-attached watchdog plus scx-ktstr's
/// `sched_ext_dump` tracepoint confirm the replacement bound to
/// sched_ext (the same path the boot scheduler goes through). On top of
/// that, `post_vm = assert_post_op_dispatch` inspects the periodic
/// samples host-side.
#[ktstr_test(
    scheduler = PRIMARY_SCHED,
    staged_schedulers = [STAGED_ALT_SCHED],
    llcs = 1,
    cores = 2,
    threads = 1,
    memory_mib = 512,
    duration_s = 5,
    cleanup_budget_ms = 5000,
    num_snapshots = 3,
    post_vm = assert_post_op_dispatch,
)]
fn scheduler_replace_mid_experiment_swaps_via_staged_pack(ctx: &Ctx) -> Result<AssertResult> {
    use ktstr::scenario::ops::{HoldSpec, Op, Step, execute_steps};
    use std::time::Duration;

    // Every step holds for the same half-second window.
    const SETTLE: Duration = Duration::from_millis(500);

    // Boot scheduler runs alone first, so the pid change caused by the
    // swap is unambiguous.
    let before_swap = Step::new(vec![], HoldSpec::fixed(SETTLE));

    // The swap itself: the boot scheduler is killed, STAGED_ALT_SCHED's
    // binary is spawned from the staged archive path, and SCHED_PID is
    // re-published for the new child. Missing binary / kill failure /
    // attach failure all surface through the apply_ops error path.
    let swap = Step::new(
        vec![Op::replace_scheduler(&STAGED_ALT_SCHED)],
        HoldSpec::fixed(SETTLE),
    );

    // The bind to sched_ext is already verified by the attach poll in
    // spawn_scheduler_from_paths; this trailing hold only leaves a
    // window for metric capture to see the new scheduler running
    // workload-free without panicking.
    let after_swap = Step::new(vec![], HoldSpec::fixed(SETTLE));

    execute_steps(ctx, vec![before_swap, swap, after_swap])
}

/// Covers the `Op::AttachScheduler` cold-start dispatch — the one
/// scheduler-lifecycle path that otherwise has no e2e coverage.
///
/// The Restart and Replace tests go through kill-then-attach with a
/// primary scheduler already running. Attaching when nothing is
/// attached is a different path: no SCHED_PID to clear, no scx_disable
/// to wait for, and a fresh slab allocation for the new scheduler's
/// scx_sched.
///
/// The VM boots under `SchedulerSpec::Eevdf`, so no scx_* binary runs
/// at boot. Mid-scenario, `Op::AttachScheduler(COLD_START_ALT_SCHED)`
/// performs the first scx attach. On success:
///
/// 1. spawn_scheduler_from_paths spawns the staged scx-ktstr binary;
/// 2. the spawn helper's `poll_scx_attached` confirms the BPF scheduler
///    bound to `/sys/kernel/sched_ext/root/ops`;
/// 3. SCHED_PID is populated by the spawn helper's Release store;
/// 4. the framework's scheduler-attached watchdog sees the new
///    scheduler and resets its deadline.
///
/// Staging mis-pack, spawn failure and attach timeout all bail through
/// the apply_ops error path. `post_vm = assert_post_op_dispatch` then
/// inspects the periodic samples host-side.
#[ktstr_test(
    scheduler = COLD_START_BOOT,
    staged_schedulers = [COLD_START_ALT_SCHED],
    llcs = 1,
    cores = 2,
    threads = 1,
    memory_mib = 512,
    duration_s = 5,
    cleanup_budget_ms = 5000,
    num_snapshots = 3,
    post_vm = assert_post_op_dispatch,
)]
fn scheduler_attach_from_cold_start_succeeds(ctx: &Ctx) -> Result<AssertResult> {
    use ktstr::scenario::ops::{HoldSpec, Op, Step, execute_steps};
    use std::time::Duration;

    // Fresh half-second hold for each step.
    let settle = || HoldSpec::fixed(Duration::from_millis(500));

    // No scx scheduler yet — EEVDF does the scheduling. Shows the test
    // infrastructure copes with having no boot scheduler process.
    let idle_on_eevdf = Step::new(vec![], settle());

    // Cold-start attach: spawn COLD_START_ALT_SCHED's staged binary,
    // poll for the sched_ext bind, publish SCHED_PID. With no prior
    // scheduler there is no detach work — this is the first
    // scx_alloc_and_add_sched on the VM.
    let attach = Step::new(vec![Op::attach_scheduler(&COLD_START_ALT_SCHED)], settle());

    // Let the freshly attached scheduler run workload-free so the live
    // SCHED_PID monitor can confirm it stays bound without panicking.
    let run_attached = Step::new(vec![], settle());

    execute_steps(ctx, vec![idle_on_eevdf, attach, run_attached])
}

/// Covers `hold_or_sched_died`'s detection of a scheduler death in the
/// middle of a hold, and the early truncation that follows.
///
/// Without early truncation a step's hold would block for its whole
/// configured duration even though the scheduler died around second 1.
/// The per-step `sched_died_during_hold` flag would still be right, but
/// the inflated wall-clock time would hide the real death-detection
/// latency.
///
/// Mechanism: the scheduler is launched with `--stall-after 1`. The BPF
/// `stall` flag flips at t≈1s, `ktstr_dispatch` stops moving tasks to
/// the shared DSQ, and the kernel scx watchdog notices the stalled
/// runnable tasks within a few seconds and exits the scheduler with
/// `SCX_EXIT_ERROR_STALL`. The single step holds for 15 s — well past
/// the watchdog deadline — so the hold MUST end early once the
/// scheduler is gone.
///
/// The check lives in the test body (there is no `post_vm` hook on this
/// test): wall-clock time around `execute_steps` is measured and a run
/// of 12 s or more is reported as a failure. Otherwise the result of
/// `execute_steps` is returned unchanged.
#[ktstr_test(
    scheduler = STALL_AFTER_1S_SCHED,
    llcs = 1,
    cores = 2,
    threads = 1,
    memory_mib = 512,
    duration_s = 20,
    watchdog_timeout_s = 30,
    auto_repro = false,
    expect_err = true,
    cleanup_budget_ms = 5000,
)]
fn dispatch_hold_truncates_when_scheduler_dies_midstep(ctx: &Ctx) -> Result<AssertResult> {
    use ktstr::scenario::ops::{HoldSpec, Step, execute_steps};
    use std::time::{Duration, Instant};

    let started = Instant::now();
    let long_hold = Step::new(vec![], HoldSpec::fixed(Duration::from_secs(15)));
    let outcome = execute_steps(ctx, vec![long_hold]);
    let elapsed = started.elapsed();

    // 12 s sits well under the 15 s hold and above any plausible
    // watchdog deadline (the kernel scx watchdog is typically 2-10 s
    // depending on the build), with the stall itself starting at
    // roughly t=1 s. Reaching the ceiling therefore means
    // `hold_or_sched_died` no longer notices scheduler death mid-step.
    let ceiling = Duration::from_secs(12);
    if elapsed < ceiling {
        // Truncated in time: hand back the inner result untouched so the
        // `expect_err = true` / `auto_repro = false` machinery still
        // sees the real SCX_EXIT_ERROR_STALL failure coming through
        // apply_ops.
        return outcome;
    }

    Ok(AssertResult::fail_msg(format!(
        "dispatch loop did not truncate after scheduler-stall death: \
         configured hold = 15 s, scheduler stalled at t≈1 s + watchdog, \
         actual elapsed = {elapsed:?} (≥ {ceiling:?} ceiling). \
         hold_or_sched_died's mid-hold scheduler-death observation is \
         broken — the per-step hold ran to completion despite the \
         scheduler dying. Check pidfd_wait_exit + the dispatch loop's \
         death-observation branch in src/scenario/ops/mod.rs."
    )))
}

/// `/bin/false` staged as a "scheduler": it exits non-zero right away,
/// well inside the spawn helper's 1 s liveness window.
///
/// Used only by `replace_with_broken_binary_surfaces_startup_died` to
/// drive `SpawnSchedulerError::StartupDied` end-to-end. `/bin/false`
/// was picked over passing an invalid CLI argument so the test does not
/// depend on scx-ktstr's clap parser staying the same.
const BROKEN_BINARY_SCHED: Scheduler =
    Scheduler::named("broken_binary").binary(SchedulerSpec::Path("/bin/false"));

/// Drives `try_spawn_scheduler`'s `StartupDied` path, together with the
/// SCHED_PID cleanup and `SpawnSchedulerError::Display` content.
///
/// `Op::ReplaceScheduler` is pointed at a staged `/bin/false`. The
/// spawn helper's `poll_startup` sees the immediate non-zero exit and
/// returns `SpawnSchedulerError::StartupDied`; the `apply_ops` error
/// path turns that into a typed step failure, which the framework's
/// `expect_err = true` machinery accepts.
///
/// What this is meant to pin:
///   - The error CLASS reaching the operator (StartupDied rather than
///     NotAttached or a generic spawn failure).
///   - The post-mortem state guarantee — SCHED_PID cleared to 0 and the
///     process reaped. This is implicit: the framework's per-VM
///     isolation makes it hard to assert directly, but a regression
///     that leaves SCHED_PID dangling would show up as downstream
///     flake (a later test in the same VM session seeing a stale dead
///     pid).
///
/// NOTE(review): the body only rejects a *passing* outcome; it does not
/// inspect the error value, so the specific StartupDied class is not
/// asserted here — confirm whether that is checked elsewhere.
#[ktstr_test(
    scheduler = PRIMARY_SCHED,
    staged_schedulers = [BROKEN_BINARY_SCHED],
    llcs = 1,
    cores = 2,
    threads = 1,
    memory_mib = 512,
    duration_s = 5,
    auto_repro = false,
    expect_err = true,
    cleanup_budget_ms = 5000,
)]
fn replace_with_broken_binary_surfaces_startup_died(ctx: &Ctx) -> Result<AssertResult> {
    use ktstr::scenario::ops::{HoldSpec, Op, Step, execute_steps};
    use std::time::Duration;

    const SETTLE: Duration = Duration::from_millis(500);

    // Give the primary scheduler time to be firmly attached before the
    // doomed replace starts.
    let settle_primary = Step::new(vec![], HoldSpec::fixed(SETTLE));

    // The doomed replace: kill_current_scheduler takes down the
    // primary, spawn_scheduler_for_op exec's /bin/false as the
    // "scheduler", it exits non-zero at once, poll_startup observes
    // that, and SpawnSchedulerError::StartupDied travels back through
    // apply_ops as a typed Op failure — execute_steps returns Err.
    let doomed_replace = Step::new(
        vec![Op::replace_scheduler(&BROKEN_BINARY_SCHED)],
        HoldSpec::fixed(SETTLE),
    );

    let outcome = execute_steps(ctx, vec![settle_primary, doomed_replace]);

    // A passing outcome means the StartupDied path was never exercised
    // and the test would be silently wrong, so turn it into a failure.
    let unexpectedly_passed = matches!(&outcome, Ok(verdict) if verdict.is_pass());
    if unexpectedly_passed {
        return Ok(AssertResult::fail_msg(String::from(
            "Op::ReplaceScheduler with /bin/false as the scheduler binary DID \
             NOT FAIL — try_spawn_scheduler's StartupDied path is no longer being \
             exercised (poll_startup is missing the immediate exit, or the spawn \
             helper started swallowing exit codes).",
        )));
    }
    outcome
}

/// Exercises the hot-restart path of
/// [`Op::RestartScheduler`](ktstr::scenario::ops::Op::RestartScheduler):
/// the currently attached scheduler is killed and the BOOT scheduler is
/// spawned again from `/scheduler` + `/sched_args` + `/tmp/sched.log`
/// (the canonical boot paths the wrapper hands to
/// `spawn_scheduler_from_paths`). A successful restart means:
///
/// 1. The SCHED_PID atomic holds a NEW pid afterwards, different from
///    the boot pid. The Op handler reads SCHED_PID before the kill,
///    sends SIGTERM, waits for sched_ext state to reach `disabled`,
///    spawns, and the spawn helper re-publishes SCHED_PID via
///    [`set_sched_pid`](ktstr::vmm::rust_init::set_sched_pid).
/// 2. The restarted scheduler binds to sched_ext — checked inside
///    `spawn_scheduler_from_paths` by `poll_scx_attached` against
///    `/sys/kernel/sched_ext/root/ops`.
/// 3. The host-side scheduler liveness monitor does NOT report the kill
///    as "scheduler died unexpectedly": `SCHED_EXIT_SUPPRESS` stops the
///    guest's sched_exit_monitor from sending the SchedExit message
///    that would otherwise be promoted into the run-wide kill flag.
///
/// State-preservation note: scheduler BPF state is deliberately RESET
/// by an Op::RestartScheduler (kernel teardown plus a fresh prog load
/// drops per-CPU and arena state). This test is about the restart
/// MECHANICS — tearing down and re-attaching cleanly without leaving
/// the guest stuck. Continuity of state is outside
/// Op::RestartScheduler's contract.
///
/// `post_vm = assert_post_op_dispatch` then inspects the periodic
/// samples host-side.
#[ktstr_test(
    scheduler = PRIMARY_SCHED,
    llcs = 1,
    cores = 2,
    threads = 1,
    memory_mib = 512,
    duration_s = 5,
    cleanup_budget_ms = 5000,
    num_snapshots = 3,
    post_vm = assert_post_op_dispatch,
)]
fn scheduler_restart_mid_experiment_reattaches_cleanly(ctx: &Ctx) -> Result<AssertResult> {
    use ktstr::scenario::ops::{HoldSpec, Op, Step, execute_steps};
    use std::time::Duration;

    // Shared half-second hold used by all three steps.
    const SETTLE: Duration = Duration::from_millis(500);

    // Boot scheduler runs alone before anything is restarted.
    let before_restart = Step::new(vec![], HoldSpec::fixed(SETTLE));

    // The restart: the boot scheduler is killed through the same
    // SIGTERM + sysrq-S + wait_for_scx_disabled path ReplaceScheduler
    // uses, then the BOOT binary at /scheduler + /sched_args is spawned
    // again. Kill timeout, stuck scx state and attach failure all
    // surface through the apply_ops error path.
    let restart = Step::new(vec![Op::restart_scheduler()], HoldSpec::fixed(SETTLE));

    // The spawn helper's attach poll has already verified the bind;
    // this hold gives the live SCHED_PID monitor a window to confirm
    // the restarted scheduler runs workload-free without panicking.
    let after_restart = Step::new(vec![], HoldSpec::fixed(SETTLE));

    execute_steps(ctx, vec![before_restart, restart, after_restart])
}