cellos-supervisor 0.5.1

CellOS execution-cell runner — boots cells in Firecracker microVMs or gVisor, enforces narrow typed authority, emits signed CloudEvents.
Documentation
//! Supervisor fails fast when `spec.run` exceeds the configured wall-clock timeout.

#[cfg(unix)]
mod unix {
    use std::fs::File;
    use std::io::{BufRead, BufReader, Write};
    use std::path::{Path, PathBuf};
    use std::process::Command;
    use std::time::{Duration, Instant};

    use serde_json::Value;

    /// Locates the `cellos-supervisor` binary that the tests spawn.
    ///
    /// Resolution order:
    /// 1. The runtime environment variable `CARGO_BIN_EXE_cellos_supervisor`
    ///    (explicit override, e.g. from a wrapper script).
    /// 2. The compile-time variable `CARGO_BIN_EXE_cellos-supervisor`, which
    ///    Cargo defines while building integration tests of a package that
    ///    owns a binary target of that name. It already reflects the active
    ///    profile and target directory, so it is preferred over guessing.
    /// 3. A guessed path `<root>/target/<profile>/cellos-supervisor`, where
    ///    `<root>` is two directories above `CARGO_MANIFEST_DIR` and
    ///    `<profile>` is the runtime `PROFILE` variable or `"debug"`.
    ///
    /// # Panics
    ///
    /// Panics if the fallback is reached and `CARGO_MANIFEST_DIR` has fewer
    /// than two ancestors.
    fn supervisor_exe() -> PathBuf {
        if let Some(p) = std::env::var_os("CARGO_BIN_EXE_cellos_supervisor") {
            return PathBuf::from(p);
        }
        // `option_env!` yields `None` when Cargo did not define the variable
        // for this build, so the guessed path below remains the last resort.
        if let Some(p) = option_env!("CARGO_BIN_EXE_cellos-supervisor") {
            return PathBuf::from(p);
        }
        let root = Path::new(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .and_then(|p| p.parent())
            .expect("cellos-supervisor crate under workspace root");
        let profile = std::env::var("PROFILE").unwrap_or_else(|_| "debug".into());
        root.join("target").join(profile).join("cellos-supervisor")
    }

    /// Reads the JSON Lines file at `path` and returns one parsed
    /// [`Value`] per line, in file order.
    ///
    /// # Panics
    ///
    /// Panics if the file cannot be opened, a line cannot be read, or a line
    /// is not valid JSON.
    fn read_jsonl_events(path: &Path) -> Vec<Value> {
        let reader = BufReader::new(File::open(path).expect("open jsonl"));
        let mut events = Vec::new();
        for raw in reader.lines() {
            let text = raw.expect("read jsonl line");
            let parsed: Value = serde_json::from_str(&text).expect("parse jsonl line");
            events.push(parsed);
        }
        events
    }

    /// Env-cap path: with `CELLOS_RUN_TIMEOUT_MS=100` and a `sleep 2`
    /// workload, the supervisor must exit unsuccessfully in under 1.5 s and
    /// record a `command.v1.completed` event carrying `exitCode` -1 and the
    /// env-cap-attributed `spawnError`.
    #[test]
    fn command_timeout_fails_supervisor_and_emits_spawn_error() {
        let workdir = tempfile::tempdir().expect("tempdir");
        let spec_path = workdir.path().join("spec.json");
        let jsonl_path = workdir.path().join("events.jsonl");
        let spec_json = r#"{
          "apiVersion":"cellos.io/v1",
          "kind":"ExecutionCell",
          "spec":{
            "id":"t-timeout",
            "authority":{"secretRefs":[]},
            "lifetime":{"ttlSeconds":60},
            "run":{
"secretDelivery": "env","argv":["/bin/sh","-c","sleep 2"]}
          }
        }"#;
        {
            // The handle is closed at the end of this scope, before the
            // supervisor is launched.
            let mut spec_file = File::create(&spec_path).expect("create spec");
            spec_file.write_all(spec_json.as_bytes()).expect("write spec");
        }

        let exe = supervisor_exe();
        let clock = Instant::now();
        let mut supervisor = Command::new(exe);
        supervisor
            .envs([
                ("CELLOS_DEPLOYMENT_PROFILE", "portable"),
                ("CELL_OS_USE_NOOP_SINK", "1"),
                ("CELLOS_CELL_BACKEND", "stub"),
            ])
            .env("CELL_OS_JSONL_EVENTS", &jsonl_path)
            .env("CELLOS_RUN_TIMEOUT_MS", "100")
            .current_dir(env!("CARGO_MANIFEST_DIR"))
            .arg(&spec_path);
        let status = supervisor.status().expect("spawn cellos-supervisor");
        let elapsed = clock.elapsed();

        assert!(
            !status.success(),
            "supervisor should fail on command timeout"
        );
        let deadline = Duration::from_millis(1500);
        assert!(
            elapsed < deadline,
            "timeout should stop the run well before the 2s sleep: {elapsed:?}"
        );

        let events = read_jsonl_events(&jsonl_path);
        let completed_type = "dev.cellos.events.cell.command.v1.completed";
        let completed = events
            .iter()
            .find(|ev| ev["type"] == completed_type)
            .expect("command completed event");
        let data = &completed["data"];
        assert_eq!(data["exitCode"], -1);
        // I2: the message ends with the name of the limit that fired, so
        // operators can tell CELLOS_RUN_TIMEOUT_MS apart from
        // spec.run.timeoutMs and the TTL.
        assert_eq!(
            data["spawnError"],
            "command timed out after 100 ms (CELLOS_RUN_TIMEOUT_MS)"
        );
    }

    /// I2 critical path: TTL as the sole wall-clock bound on `spec.run`.
    ///
    /// The spec sets `ttlSeconds: 1` with no `timeoutMs`, and the
    /// `CELLOS_RUN_TIMEOUT_MS` env cap is removed from the child environment,
    /// so only the TTL can stop the `sleep 5` workload. This covers the
    /// pure-TTL fallback of `effective_run_timeout` (supervisor.rs, near
    /// line 3091). Expected outcome:
    ///
    /// * the supervisor exits unsuccessfully within 2.5 s,
    /// * `command.v1.completed` carries `exitCode` -1 and the TTL-derived
    ///   `spawnError`,
    /// * a `cell.lifecycle.v1.destroyed` event is emitted.
    #[test]
    fn ttl_alone_acts_as_run_timeout_watchdog() {
        let workdir = tempfile::tempdir().expect("tempdir");
        let spec_path = workdir.path().join("spec.json");
        let jsonl_path = workdir.path().join("events.jsonl");
        // `timeoutMs` is deliberately omitted; `ttlSeconds: 1` is the only limit.
        let spec_json = r#"{
          "apiVersion":"cellos.io/v1",
          "kind":"ExecutionCell",
          "spec":{
            "id":"t-ttl-watchdog",
            "authority":{"secretRefs":[]},
            "lifetime":{"ttlSeconds":1},
            "run":{
"secretDelivery": "env","argv":["/bin/sh","-c","sleep 5"]}
          }
        }"#;
        {
            // The handle is closed at the end of this scope, before the
            // supervisor is launched.
            let mut spec_file = File::create(&spec_path).expect("create spec");
            spec_file.write_all(spec_json.as_bytes()).expect("write spec");
        }

        let exe = supervisor_exe();
        let clock = Instant::now();
        let mut supervisor = Command::new(exe);
        supervisor
            .envs([
                ("CELLOS_DEPLOYMENT_PROFILE", "portable"),
                ("CELL_OS_USE_NOOP_SINK", "1"),
                ("CELLOS_CELL_BACKEND", "stub"),
            ])
            .env("CELL_OS_JSONL_EVENTS", &jsonl_path)
            // The env cap must not leak in from the parent environment,
            // otherwise the TTL would no longer be the only ceiling.
            .env_remove("CELLOS_RUN_TIMEOUT_MS")
            .current_dir(env!("CARGO_MANIFEST_DIR"))
            .arg(&spec_path);
        let status = supervisor.status().expect("spawn cellos-supervisor");
        let elapsed = clock.elapsed();

        assert!(
            !status.success(),
            "supervisor should fail when TTL ceiling fires"
        );
        // Budget: the 1 s TTL plus roughly 1.5 s of slack for spawn/teardown.
        let budget = Duration::from_millis(2500);
        assert!(
            elapsed < budget,
            "TTL watchdog should stop the run well before the 5s sleep: {elapsed:?}"
        );

        let events = read_jsonl_events(&jsonl_path);
        let completed_type = "dev.cellos.events.cell.command.v1.completed";
        let completed = events
            .iter()
            .find(|ev| ev["type"] == completed_type)
            .expect("command completed event");
        let data = &completed["data"];
        assert_eq!(data["exitCode"], -1);
        // I2: the TTL fallback reports a watchdog-attributed message, which
        // lets operators see that the supervisor watchdog (not a workload
        // soft cap) ended the cell; `ttl_seconds` is echoed back.
        assert_eq!(
            data["spawnError"],
            "cell killed by TTL watchdog after 1000 ms (ttl_seconds=1)",
            "TTL=1s must surface as the watchdog-attributed 1000ms wall-clock cap"
        );

        // TTL is a destroy-by contract, not merely a kill: the destroyed
        // lifecycle event has to be present as well.
        let destroyed_type = "dev.cellos.events.cell.lifecycle.v1.destroyed";
        let destroyed = events.iter().any(|ev| ev["type"] == destroyed_type);
        assert!(
            destroyed,
            "cell.lifecycle.v1.destroyed must fire after TTL-watchdog kill"
        );
    }

    /// Spec-cap path: with `spec.run.timeoutMs` set to 120 and the
    /// `CELLOS_RUN_TIMEOUT_MS` env cap removed, the supervisor must exit
    /// unsuccessfully and record a `command.v1.completed` event whose
    /// `spawnError` names `spec.run.timeoutMs` as the limit that fired.
    #[test]
    fn spec_timeout_without_env_cap_fails_supervisor() {
        let workdir = tempfile::tempdir().expect("tempdir");
        let spec_path = workdir.path().join("spec.json");
        let jsonl_path = workdir.path().join("events.jsonl");
        let spec_json = r#"{
          "apiVersion":"cellos.io/v1",
          "kind":"ExecutionCell",
          "spec":{
            "id":"t-timeout-spec",
            "authority":{"secretRefs":[]},
            "lifetime":{"ttlSeconds":60},
            "run":{
"secretDelivery": "env","argv":["/bin/sh","-c","sleep 2"],"timeoutMs":120}
          }
        }"#;
        {
            // The handle is closed at the end of this scope, before the
            // supervisor is launched.
            let mut spec_file = File::create(&spec_path).expect("create spec");
            spec_file.write_all(spec_json.as_bytes()).expect("write spec");
        }

        let exe = supervisor_exe();
        let mut supervisor = Command::new(exe);
        supervisor
            .envs([
                ("CELLOS_DEPLOYMENT_PROFILE", "portable"),
                ("CELL_OS_USE_NOOP_SINK", "1"),
                ("CELLOS_CELL_BACKEND", "stub"),
            ])
            .env("CELL_OS_JSONL_EVENTS", &jsonl_path)
            .env_remove("CELLOS_RUN_TIMEOUT_MS")
            .current_dir(env!("CARGO_MANIFEST_DIR"))
            .arg(&spec_path);
        let status = supervisor.status().expect("spawn cellos-supervisor");

        assert!(
            !status.success(),
            "supervisor should fail on spec-defined timeout"
        );

        let events = read_jsonl_events(&jsonl_path);
        let completed_type = "dev.cellos.events.cell.command.v1.completed";
        let completed = events
            .iter()
            .find(|ev| ev["type"] == completed_type)
            .expect("command completed event");
        // I2: the message ends with the name of the limit that fired, so
        // operators can tell spec.run.timeoutMs apart from the env cap and
        // the TTL watchdog.
        assert_eq!(
            completed["data"]["spawnError"],
            "command timed out after 120 ms (spec.run.timeoutMs)"
        );
    }
}