cellos-host-firecracker 0.5.1

Firecracker microVM backend for CellOS — jailer integration, warm pool with snapshot/restore, KVM nested-virtualisation aware.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
//! K5 / FC-53 — vCPU quota enforcement window.
//!
//! Acceptance gate (from [Plans/firecracker-release-readiness.md] L115):
//!
//! > FC-53: vCPU quota is enforced. Acceptance: e2e workload spinning N+1
//! > threads where N = derived vCPU count consumes ≤ N vCPUs of host time
//! > over a 10-second window, measured via `/proc/<pid>/stat` deltas of
//! > the firecracker process.
//!
//! # Two-tier coverage
//!
//! 1. **Pure-Rust unit tier (always-on):** the parser/arithmetic helpers
//!    that turn two `/proc/<pid>/stat` snapshots into a CPU-seconds delta
//!    and grade it against the (vcpu_count, wall_clock, tolerance) budget.
//!    These run on every CI leg that builds this test binary. On non-Linux
//!    hosts, where the surrounding crate fails to compile, the binary is
//!    simply not built and this tier is skipped along with it.
//!
//! 2. **Linux integration tier (`#[ignore]`'d):** the real e2e — measure a
//!    Firecracker microVM running with `vcpu_count=1` (the
//!    `DEFAULT_VCPU_COUNT` derived by `derive_vcpu_count` when no
//!    `cpu.max` is declared, lib.rs L84) while two `yes > /dev/null`
//!    loops compete for that single vCPU alongside a `sleep 10`. The
//!    surrounding e2e workflow spawns the microVM and exports its PID via
//!    `CELLOS_FIRECRACKER_FC53_PID`; the test then samples
//!    `/proc/<firecracker-pid>/stat` before and after the window and
//!    asserts the quota holds. Marked `#[ignore]` because it requires a
//!    real firecracker binary, kernel, rootfs, and elevated privileges;
//!    run explicitly with `cargo test -- --ignored`.
//!
//! # `/proc/<pid>/stat` parsing notes
//!
//! The format is documented in `proc(5)`. Fields are space-separated, but
//! field 2 (`comm`) is the executable basename wrapped in parentheses and
//! **may itself contain spaces** (the kernel does not escape them — a
//! process named `multi word` legitimately appears as `(multi word)`).
//! The robust split is therefore:
//!
//!   1. find the LAST `)` in the line — this terminates `comm`,
//!   2. split the remainder by ASCII whitespace.
//!
//! After that split, the tail starts at field 3 (`state`) and is
//! zero-indexed — fields 1 and 2 (`pid` and `comm`) sit on the left of the
//! `)` we discarded — so a field's tail index is its `proc(5)` field number
//! minus 3: field 14 (`utime`) is index 11 and field 15 (`stime`) is
//! index 12.

use std::num::ParseIntError;

// ── Pure-Rust unit tier ─────────────────────────────────────────────────────

/// Failure modes for `parse_proc_stat_cpu_jiffies`.
///
/// Each variant names the first check that failed, in the order the parser
/// performs them: locate the `comm` terminator, count the tail fields, then
/// parse `utime` followed by `stime`.
///
/// `Clone`, `PartialEq` and `Eq` are derived so callers and tests can compare
/// errors directly (e.g. with `assert_eq!`) instead of pattern-matching only.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseError {
    /// Line did not contain a `)` — `comm` field terminator missing.
    NoCommTerminator,
    /// The post-comm tail had fewer than 13 whitespace-separated fields,
    /// so `utime` (offset 11) and `stime` (offset 12) cannot be read.
    /// `got` is the number of fields that were actually present.
    TooFewFields { got: usize },
    /// `utime` failed to parse as `u64`; carries the underlying parse error.
    UtimeNotU64(ParseIntError),
    /// `stime` failed to parse as `u64`; carries the underlying parse error.
    StimeNotU64(ParseIntError),
}

impl std::fmt::Display for ParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ParseError::NoCommTerminator => {
                write!(
                    f,
                    "no `)` in /proc/<pid>/stat line — comm field unterminated"
                )
            }
            ParseError::TooFewFields { got } => write!(
                f,
                "post-comm tail has only {got} whitespace-separated fields; \
                 need at least 13 to reach utime + stime"
            ),
            ParseError::UtimeNotU64(e) => write!(f, "utime not parseable as u64: {e}"),
            ParseError::StimeNotU64(e) => write!(f, "stime not parseable as u64: {e}"),
        }
    }
}

// Marker impl: opts `ParseError` into `std::error::Error` without overriding
// any trait method, so the trait's default behaviour applies.
impl std::error::Error for ParseError {}

/// Extract `(utime, stime)`, in clock ticks, from one `/proc/<pid>/stat`
/// line. The module docs explain why the split anchors on the last `)`.
///
/// # Errors
///
/// * `ParseError::NoCommTerminator` — the line contains no `)`.
/// * `ParseError::TooFewFields` — fewer than 13 whitespace-separated fields
///   follow the last `)`; `got` reports how many were found.
/// * `ParseError::UtimeNotU64` / `ParseError::StimeNotU64` — the respective
///   field is not a valid `u64`. `utime` is checked first.
pub fn parse_proc_stat_cpu_jiffies(stat: &str) -> Result<(u64, u64), ParseError> {
    // The tail begins at field 3 (`state`), so field 14 (`utime`) sits at
    // post-comm offset 11 and field 15 (`stime`) directly after it.
    const UTIME_OFFSET: usize = 11;

    // Anchor on the final ')' so spaces or parentheses inside `comm` cannot
    // shift the field positions.
    let after_comm = match stat.rfind(')') {
        Some(idx) => &stat[idx + 1..],
        None => return Err(ParseError::NoCommTerminator),
    };

    let mut counters = after_comm.split_ascii_whitespace().skip(UTIME_OFFSET);
    match (counters.next(), counters.next()) {
        (Some(raw_utime), Some(raw_stime)) => {
            let utime = raw_utime.parse::<u64>().map_err(ParseError::UtimeNotU64)?;
            let stime = raw_stime.parse::<u64>().map_err(ParseError::StimeNotU64)?;
            Ok((utime, stime))
        }
        // At least one of the two counters is missing: report how many
        // fields the tail really had.
        _ => Err(ParseError::TooFewFields {
            got: after_comm.split_ascii_whitespace().count(),
        }),
    }
}

/// Turn a pair of `(utime, stime)` jiffy snapshots into the CPU seconds
/// consumed between them.
///
/// `clock_ticks_per_sec` is the host's `sysconf(_SC_CLK_TCK)` value
/// (typically 100 on x86_64 Linux).
///
/// Each counter's delta is computed with saturating subtraction: if the
/// "after" snapshot is behind the "before" one (for example because the
/// arguments were swapped), that counter contributes 0 instead of wrapping.
/// The sum of the two deltas saturates as well.
///
/// # Panics
///
/// In debug builds, panics when `clock_ticks_per_sec` is 0. Release builds
/// perform no such check.
pub fn compute_cpu_seconds_used(
    stat_before: (u64, u64),
    stat_after: (u64, u64),
    clock_ticks_per_sec: u64,
) -> f64 {
    debug_assert!(
        clock_ticks_per_sec > 0,
        "clock_ticks_per_sec must be > 0 (sysconf(_SC_CLK_TCK))"
    );

    let (utime_before, stime_before) = stat_before;
    let (utime_after, stime_after) = stat_after;

    // User-mode delta plus kernel-mode delta, each clamped at zero.
    let elapsed_ticks = utime_after
        .saturating_sub(utime_before)
        .saturating_add(stime_after.saturating_sub(stime_before));

    let ticks = elapsed_ticks as f64;
    let hz = clock_ticks_per_sec as f64;
    ticks / hz
}

/// FC-53 quota predicate.
///
/// Reports whether `cpu_seconds_used` fits inside the CPU-time budget of
/// `vcpu_count` vCPUs running flat out for `wall_clock_seconds`, widened by
/// `tolerance_pct` percent to absorb measurement noise (clock-tick
/// rounding, scheduler jitter, the firecracker VMM's own bookkeeping
/// threads).
///
/// ```text
/// budget = vcpu_count * wall_clock_seconds * (1 + tolerance_pct/100)
/// pass   = (cpu_seconds_used <= budget)
/// ```
///
/// The comparison is inclusive: consuming exactly the budget passes.
pub fn quota_enforced(
    cpu_seconds_used: f64,
    vcpu_count: u32,
    wall_clock_seconds: f64,
    tolerance_pct: f64,
) -> bool {
    // CPU-seconds available if every vCPU were busy for the whole window.
    let full_utilisation = f64::from(vcpu_count) * wall_clock_seconds;
    // Multiplicative slack, e.g. 1.10 for a 10% tolerance.
    let slack_multiplier = 1.0 + tolerance_pct / 100.0;
    cpu_seconds_used <= full_utilisation * slack_multiplier
}

// ── Unit tests ─────────────────────────────────────────────────────────────

#[cfg(test)]
mod parser {
    //! Unit tests for `parse_proc_stat_cpu_jiffies`: well-formed lines,
    //! awkward `comm` values, and each `ParseError` variant.

    use super::*;

    /// Build a synthetic `/proc/<pid>/stat` line with a chosen comm and
    /// (utime, stime). Every other field is filler. The line carries 20
    /// post-comm fields — not the full `proc(5)` layout, but comfortably
    /// past the 13 the parser needs in order to reach `stime`.
    fn synth(comm: &str, utime: u64, stime: u64) -> String {
        // Layout after `)` (post-comm offsets):
        //   0:state 1:ppid 2:pgrp 3:session 4:tty_nr 5:tpgid 6:flags
        //   7:minflt 8:cminflt 9:majflt 10:cmajflt 11:utime 12:stime
        //   13:cutime ...
        format!("1 ({comm}) S 0 1 1 0 -1 4194304 100 0 0 0 {utime} {stime} 0 0 20 0 1 0 0")
    }

    #[test]
    fn typical_firecracker_line_parses() {
        let line = synth("firecracker", 123, 45);
        assert_eq!(parse_proc_stat_cpu_jiffies(&line).unwrap(), (123, 45));
    }

    #[test]
    fn comm_with_embedded_space_parses() {
        // Kernel does not escape spaces inside `comm`. The split must use
        // the LAST `)` so we don't get fooled by spaces inside the parens.
        let line = synth("multi word", 200, 50);
        assert_eq!(parse_proc_stat_cpu_jiffies(&line).unwrap(), (200, 50));
    }

    #[test]
    fn comm_with_embedded_paren_picks_last_close() {
        // A pathological comm that contains its own `)`: the algorithm picks
        // the LAST `)`, so the trailing fields parse cleanly. Everything up
        // to that last `)` — including the earlier `)` — is treated as part
        // of comm; that's by design, the reader scans back from end-of-line.
        let line = "42 (weird) name) R 0 1 1 0 -1 4 0 0 0 0 7 11 0 0 20 0";
        // post-comm tail = "R 0 1 1 0 -1 4 0 0 0 0 7 11 0 0 20 0"
        // utime is index 11 = 7, stime is index 12 = 11
        assert_eq!(parse_proc_stat_cpu_jiffies(line).unwrap(), (7, 11));
    }

    #[test]
    fn missing_close_paren_is_error() {
        let line = "1 firecracker S 0 1 1 0 -1 4 0 0 0 0 1 2 0 0 20 0";
        let err = parse_proc_stat_cpu_jiffies(line).expect_err("must error");
        assert!(matches!(err, ParseError::NoCommTerminator), "got: {err}");
    }

    #[test]
    fn too_few_fields_is_error() {
        // `)` is present but there are not enough tail fields to reach
        // utime/stime offsets. The tail "R 0 1 1 0" has exactly 5 fields,
        // and the error must report that count.
        let line = "1 (fc) R 0 1 1 0";
        let err = parse_proc_stat_cpu_jiffies(line).expect_err("must error");
        assert!(
            matches!(err, ParseError::TooFewFields { got: 5 }),
            "got: {err}"
        );
    }

    #[test]
    fn non_numeric_utime_is_error() {
        let line = "1 (fc) S 0 1 1 0 -1 4 0 0 0 0 NOPE 5 0 0 20 0 1 0 0";
        let err = parse_proc_stat_cpu_jiffies(line).expect_err("must error");
        assert!(matches!(err, ParseError::UtimeNotU64(_)), "got: {err}");
    }

    #[test]
    fn non_numeric_stime_is_error() {
        let line = "1 (fc) S 0 1 1 0 -1 4 0 0 0 0 7 NOPE 0 0 20 0 1 0 0";
        let err = parse_proc_stat_cpu_jiffies(line).expect_err("must error");
        assert!(matches!(err, ParseError::StimeNotU64(_)), "got: {err}");
    }
}

#[cfg(test)]
mod arithmetic {
    //! Unit tests for `compute_cpu_seconds_used`.

    use super::*;

    /// Tick rate shared by every case in this module.
    const TICKS_PER_SEC: u64 = 100;

    /// `true` when `actual` is within 1e-9 of `expected`.
    fn close_to(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < 1e-9
    }

    #[test]
    fn cpu_seconds_simple_delta() {
        // utime 100 → 200 and stime 50 → 80 is 100 + 30 = 130 ticks,
        // which is 1.30 s at 100 Hz.
        let before = (100, 50);
        let after = (200, 80);
        let s = compute_cpu_seconds_used(before, after, TICKS_PER_SEC);
        assert!(close_to(s, 1.30), "got {s}");
    }

    #[test]
    fn cpu_seconds_zero_delta() {
        // Identical snapshots: nothing was consumed.
        let snapshot = (42, 7);
        assert_eq!(
            compute_cpu_seconds_used(snapshot, snapshot, TICKS_PER_SEC),
            0.0
        );
    }

    #[test]
    fn cpu_seconds_swapped_snapshots_saturate_to_zero() {
        // Arguments passed as (after, before) by mistake: both deltas would
        // be negative, so saturating subtraction clamps each to 0 and the
        // result is 0.0 rather than a wrapped-around value.
        let later = (200, 80);
        let earlier = (100, 50);
        assert_eq!(
            compute_cpu_seconds_used(later, earlier, TICKS_PER_SEC),
            0.0
        );
    }

    #[test]
    fn cpu_seconds_typical_kernel_clock_tck_100() {
        // 700 + 300 = 1000 ticks at 100 Hz is 10.0 s of CPU.
        let s = compute_cpu_seconds_used((0, 0), (700, 300), TICKS_PER_SEC);
        assert!(close_to(s, 10.0), "got {s}");
    }
}

#[cfg(test)]
mod typed_reason_contract {
    //! FC-50-T53 follow-up: pins the wire string of the typed
    //! `LifecycleReason::SignalKilled` variant from inside the FC-53 test
    //! file.
    //!
    //! Rationale: FC-53's primary tests exercise only pure helpers
    //! (`/proc/<pid>/stat` parsing, jiffy arithmetic, the quota predicate)
    //! and never mention `cellos_core::LifecycleReason`. The tester wave-8
    //! audit (audits/tester-wave-8-2026-05-07.md) pointed out that renaming
    //! `LifecycleReason::SignalKilled` would therefore trip no FC-53 test,
    //! even though this file's narrative is "vCPU quota broken →
    //! supervisor SIGKILLs the firecracker process".
    //!
    //! That forced-termination path — the supervisor kills the firecracker
    //! process because its threads escaped the cgroup cpu.max — surfaces on
    //! the lifecycle-destroyed event with
    //! `reason = LifecycleReason::SignalKilled.as_wire_str()` once the
    //! FC-50 wiring lands. Naming the variant by path turns a typed rename
    //! in `cellos-core` into a compile error in this file, and comparing
    //! against the `"signal_killed"` literal turns a changed value into a
    //! unit-test failure. `fc52_oom_enforcement.rs` holds the FC-52 sibling
    //! assertion of the same shape.

    #[test]
    fn fc53_typed_reason_locks_signal_killed_wire_string() {
        // The variant is referenced by its full path on purpose: that is
        // what makes a rename fail at compile time.
        assert_eq!(
            cellos_core::LifecycleReason::SignalKilled.as_wire_str(),
            "signal_killed",
            "FC-50 typed enum's wire form for SignalKilled must equal the \
             canonical `signal_killed` token. A rename of the variant or a \
             serde-shape drift surfaces here as a unit failure rather than \
             going unnoticed until a supervisor-emission integration leg \
             flips the wire bytes. FC-53's vCPU-quota busted path \
             (supervisor SIGKILLs the firecracker process whose threads \
             escaped the cgroup cpu.max budget) lands as \
             `reason = signal_killed` once FC-50 wiring reaches this code \
             path — see the FC-52 sibling assertion for the OOM equivalent."
        );
    }
}

#[cfg(test)]
mod quota {
    //! Unit tests for the `quota_enforced` predicate.

    use super::*;

    /// The 10% measurement slack used by most cases.
    const SLACK_10_PCT: f64 = 10.0;
    /// No slack at all: the budget is exactly vCPUs × wall-clock seconds.
    const NO_SLACK: f64 = 0.0;

    #[test]
    fn one_vcpu_under_budget_is_ok() {
        // 4.5 CPU-seconds on 1 vCPU over 5 wall-seconds fits.
        let within = quota_enforced(4.5, 1, 5.0, SLACK_10_PCT);
        assert!(within);
    }

    #[test]
    fn one_vcpu_at_budget_is_ok() {
        // Budget is 1 vCPU * 10 s * 1.10 = 11.0; consuming exactly 11.0
        // still passes because the comparison is inclusive.
        let within = quota_enforced(11.0, 1, 10.0, SLACK_10_PCT);
        assert!(within);
    }

    #[test]
    fn one_vcpu_just_over_tolerance_breaks_quota() {
        // Same 11.0 budget, but 11.001 used is over it.
        let within = quota_enforced(11.001, 1, 10.0, SLACK_10_PCT);
        assert!(!within);
    }

    #[test]
    fn one_vcpu_used_eight_breaks_quota_under_strict_tolerance() {
        // 1 vCPU over 5 s with no slack gives a 5.0 budget; 8.0 used breaks
        // it (this is the canonical "two yes loops escaped" failure mode).
        let within = quota_enforced(8.0, 1, 5.0, NO_SLACK);
        assert!(!within);
    }

    #[test]
    fn two_vcpus_used_eight_under_ten_seconds_is_ok() {
        // 2 vCPU * 10 s = 20.0 budget; 8.0 used is comfortably below it.
        let within = quota_enforced(8.0, 2, 10.0, NO_SLACK);
        assert!(within);
    }

    #[test]
    fn fc53_canonical_n_plus_one_threads_one_vcpu_pass() {
        // Passing leg of the plan's exact scenario: N=1 vCPU, 10 s window,
        // two competing yes loops. With the quota enforced the firecracker
        // process must NOT exceed ~10 CPU-seconds; a healthy run is
        // simulated as 9.5 CPU-seconds consumed.
        let within = quota_enforced(9.5, 1, 10.0, SLACK_10_PCT);
        assert!(within);
    }

    #[test]
    fn fc53_canonical_n_plus_one_threads_one_vcpu_fail() {
        // Failing leg of the same scenario: the quota was NOT enforced
        // (e.g. the VMM was misconfigured and both yes loops ran on real
        // host cores). 18 CPU-seconds in a 10 s window is ~1.8 cores, which
        // breaks the 1-vCPU budget even with 10% slack (budget = 11.0).
        let within = quota_enforced(18.0, 1, 10.0, SLACK_10_PCT);
        assert!(!within);
    }

    #[test]
    fn zero_vcpu_zero_budget() {
        // Degenerate edge case: 0 vCPU means a zero budget, so any positive
        // CPU consumption breaks quota. (Not a real config —
        // derive_vcpu_count clamps to >= 1 — but the helper must not
        // divide-by-zero or panic.)
        let idle_ok = quota_enforced(0.0, 0, 10.0, SLACK_10_PCT);
        let busy_ok = quota_enforced(0.001, 0, 10.0, SLACK_10_PCT);
        assert!(idle_ok);
        assert!(!busy_ok);
    }
}

// ── Linux integration tier (`#[ignore]`'d) ─────────────────────────────────
//
// The real e2e: spawn a Firecracker microVM with vcpu_count=1 running two
// `yes > /dev/null` loops plus a `sleep 10`, sample
// `/proc/<firecracker-pid>/stat` before and after the 10-second window,
// and grade against the FC-53 budget.
//
// `#[ignore]` because the test requires:
//   * a real `firecracker` binary on PATH (or in `CELLOS_FIRECRACKER_BIN`),
//   * a kernel image and rootfs whose digests match the manifest,
//   * permissions to spawn microVMs (KVM access, possibly jailer setup).
//
// Run with:
//   cargo test -p cellos-host-firecracker --test fc53_vcpu_quota -- --ignored
//
// On non-Linux hosts the surrounding crate is unix-only so this module
// would not compile against it; the `cfg(target_os = "linux")` gate keeps
// the file harmless on macOS/Windows CI legs that still run `cargo check`.

/// FC-53 e2e measurement: samples the CPU time of an already-running
/// firecracker process over a 10-second window and asserts it stays within
/// the 1-vCPU budget (plus tolerance).
///
/// The test does not boot the microVM itself; it reads the firecracker PID
/// from `CELLOS_FIRECRACKER_FC53_PID` and the tick rate from
/// `getconf CLK_TCK`.
///
/// # Panics
///
/// Panics (failing the test) when the environment variable is missing or is
/// not a positive integer, when `getconf` cannot be run, exits unsuccessfully
/// or reports an unusable tick rate, when `/proc/<pid>/stat` cannot be read
/// or parsed, or when the quota is exceeded.
#[cfg(target_os = "linux")]
#[test]
#[ignore = "requires firecracker binary, kernel, rootfs, and KVM; run with --ignored"]
fn fc53_vcpu_quota_one_vcpu_n_plus_one_threads_10s_window() {
    use std::fs;
    use std::process::Command;
    use std::time::{Duration, Instant};

    // The plan specifies a 10-second window. We allow 10% measurement
    // tolerance to absorb the firecracker VMM's own overhead (vcpu thread
    // scheduling, vsock plumbing, the small number of housekeeping
    // threads firecracker keeps regardless of guest workload).
    const WALL_CLOCK_SECONDS: f64 = 10.0;
    const TOLERANCE_PCT: f64 = 10.0;
    const VCPU_COUNT: u32 = 1; // matches DEFAULT_VCPU_COUNT in lib.rs L84.

    // The harness expects the e2e workflow to have:
    //   1. spawned the cell with vcpu_count=1 and argv =
    //      ["/bin/sh", "-c", "yes > /dev/null & yes > /dev/null & sleep 10"],
    //   2. exported the firecracker process PID via
    //      CELLOS_FIRECRACKER_FC53_PID, and
    //   3. dropped the test into this `--ignored` invocation.
    //
    // Driving the VM boot from within `cargo test` would require linking
    // the supervisor's full admission pipeline which is not appropriate
    // for a backend-crate integration test; the workflow split mirrors
    // the FC-14 e2e harness in this directory.
    let pid_str = std::env::var("CELLOS_FIRECRACKER_FC53_PID").expect(
        "CELLOS_FIRECRACKER_FC53_PID must be the firecracker process PID \
         when running this test with --ignored. The e2e workflow exports \
         it after spawning the cell with vcpu_count=1 and \
         argv=[/bin/sh,-c,yes > /dev/null & yes > /dev/null & sleep 10].",
    );
    let pid: u32 = pid_str
        .trim()
        .parse()
        .expect("CELLOS_FIRECRACKER_FC53_PID must be a positive integer");
    // `0` parses as a u32 but is not a usable PID; reject it here so the
    // failure names the real problem instead of a missing /proc entry.
    assert_ne!(
        pid, 0,
        "CELLOS_FIRECRACKER_FC53_PID must be a positive integer"
    );

    let stat_path = format!("/proc/{pid}/stat");

    // Determine the kernel's clock tick rate. It is 100 on every mainstream
    // x86_64 Linux distro, but asking the host is the durable answer. We
    // shell out to `getconf` rather than pulling libc in as a dep just for
    // this — `getconf` is in every base Linux image and the workflow
    // already shells out similarly for FC-14.
    let clk_tck: u64 = {
        let out = Command::new("getconf")
            .arg("CLK_TCK")
            .output()
            .expect("getconf CLK_TCK failed; required for FC-53 measurement");
        // A non-zero exit leaves stdout empty or partial; report that
        // directly rather than as a confusing integer-parse failure.
        assert!(
            out.status.success(),
            "getconf CLK_TCK exited unsuccessfully ({}); stderr: {}",
            out.status,
            String::from_utf8_lossy(&out.stderr).trim()
        );
        let ticks = String::from_utf8_lossy(&out.stdout)
            .trim()
            .parse::<u64>()
            .expect("getconf CLK_TCK output is not a u64");
        // A zero rate would make the CPU-seconds conversion meaningless.
        assert!(ticks > 0, "getconf CLK_TCK reported 0 ticks per second");
        ticks
    };

    // Read and parse the current (utime, stime) of the firecracker process.
    let read_stat = || -> (u64, u64) {
        let s = fs::read_to_string(&stat_path).unwrap_or_else(|e| {
            panic!(
                "FC-53: failed to read {stat_path}: {e} \
                 (firecracker process likely exited before window closed)"
            )
        });
        parse_proc_stat_cpu_jiffies(&s)
            .unwrap_or_else(|e| panic!("FC-53: malformed {stat_path}: {e}\nline: {s}"))
    };

    // Grade against the measured wall-clock time rather than the nominal
    // 10 s, so sleep overshoot does not count against the workload.
    let started = Instant::now();
    let before = read_stat();
    std::thread::sleep(Duration::from_secs_f64(WALL_CLOCK_SECONDS));
    let after = read_stat();
    let elapsed = started.elapsed().as_secs_f64();

    let used = compute_cpu_seconds_used(before, after, clk_tck);
    let pass = quota_enforced(used, VCPU_COUNT, elapsed, TOLERANCE_PCT);
    // Same formula `quota_enforced` applies; computed here only so the
    // failure message can show the number.
    let budget = f64::from(VCPU_COUNT) * elapsed * (1.0 + TOLERANCE_PCT / 100.0);

    assert!(
        pass,
        "FC-53 violation: firecracker pid {pid} consumed {used:.3} CPU-seconds \
         over a {elapsed:.3} s wall-clock window with vcpu_count={VCPU_COUNT} \
         (tolerance {TOLERANCE_PCT}%). Budget was \
         {budget:.3} s. Two-yes-loops workload escaped the vCPU quota — \
         the VMM's MachineConfig.vcpu_count is not being honoured by the \
         host scheduler, or the cgroup cpu.max is missing."
    );
}