cellos-supervisor 0.5.1

CellOS execution-cell runner — boots cells in Firecracker microVMs or gVisor, enforces narrow typed authority, emits signed CloudEvents.
Documentation
//! End-to-end Firecracker backend test.
//!
//! Drives the real path: supervisor → Firecracker VMM → cellos-init → /bin/true
//! → vsock exit-code bridge → clean teardown. The test is gated on KVM and on
//! the full set of Firecracker environment variables; if any precondition is
//! missing the test prints the reason and returns successfully (skipped).
//!
//! In CI this runs from `.github/workflows/firecracker-e2e.yml`, which is the
//! only place the prerequisites are reliably available. Locally, set the
//! variables documented in `docs/firecracker-runner.md` plus `/dev/kvm`
//! permissions to exercise the test.

#[cfg(unix)]
mod unix {
    use std::ffi::OsString;
    use std::fs::{self, File};
    use std::io::Write;
    use std::path::{Path, PathBuf};
    use std::process::{Command, Stdio};
    use std::time::{Duration, Instant};

    /// Environment variables the Firecracker end-to-end test needs.
    ///
    /// A missing variable is a skip, not a failure — local dev machines won't
    /// have them, CI does. The test checks presence with `std::env::var_os`,
    /// so a variable that is set to an empty value still counts as present.
    const REQUIRED_ENV: &[&str] = &[
        "CELLOS_FIRECRACKER_BINARY",
        "CELLOS_FIRECRACKER_KERNEL_IMAGE",
        // Lane B / runner doc canonical name is *_ROOTFS_IMAGE. The shorter
        // *_ROOTFS form described by the lane-D task spec is also accepted:
        // the test bridges it to this name before the required-variable check
        // runs and before the supervisor is spawned.
        // See: handle_rootfs_alias() below.
        "CELLOS_FIRECRACKER_ROOTFS_IMAGE",
        "CELLOS_FIRECRACKER_SOCKET_DIR",
    ];

    /// Resolves the path of the supervisor binary under test.
    ///
    /// Lookup order (first hit wins):
    ///
    /// 1. `CELLOS_SUPERVISOR_BIN` at runtime — explicit override so the
    ///    standalone smoke script can point this test at its own binary.
    /// 2. `CARGO_BIN_EXE_cellos-supervisor` / `CARGO_BIN_EXE_cellos_supervisor`
    ///    at runtime, for harnesses that export the variable themselves.
    /// 3. `CARGO_BIN_EXE_cellos-supervisor` captured at compile time. Cargo
    ///    documents this variable as being set while *building* an
    ///    integration test or benchmark, keyed by the binary target's name
    ///    exactly as written (hyphens included), so a runtime lookup alone
    ///    can miss it. `option_env!` is used instead of `env!` so the file
    ///    still compiles when it is not built as an integration test.
    /// 4. `<workspace>/target/<PROFILE>/cellos-supervisor`, where the
    ///    workspace root is two directories above this crate's manifest and
    ///    `PROFILE` falls back to `release` when unset.
    ///
    /// The returned path is not checked for existence; the caller decides
    /// what a missing binary means.
    ///
    /// # Panics
    ///
    /// Panics if the crate's manifest directory has fewer than two ancestors.
    fn supervisor_exe() -> PathBuf {
        if let Some(explicit) = std::env::var_os("CELLOS_SUPERVISOR_BIN") {
            return PathBuf::from(explicit);
        }

        // Runtime spellings are kept first so any environment that already
        // resolved the binary this way keeps resolving to the same path.
        let runtime_keys = [
            "CARGO_BIN_EXE_cellos-supervisor",
            "CARGO_BIN_EXE_cellos_supervisor",
        ];
        if let Some(exported) = runtime_keys.iter().find_map(|key| std::env::var_os(key)) {
            return PathBuf::from(exported);
        }

        // Compile-time value baked in by Cargo when this file is built as an
        // integration test of the package that owns the binary.
        if let Some(built) = option_env!("CARGO_BIN_EXE_cellos-supervisor") {
            return PathBuf::from(built);
        }

        // Fallback: workspace target dir.
        let crate_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
        let workspace = crate_dir
            .parent()
            .and_then(Path::parent)
            .expect("cellos-supervisor sits two levels under workspace root");
        let profile = std::env::var("PROFILE").unwrap_or_else(|_| "release".into());
        workspace
            .join("target")
            .join(profile)
            .join("cellos-supervisor")
    }

    /// Keeps the two spellings of the rootfs variable in sync.
    ///
    /// `CELLOS_FIRECRACKER_ROOTFS_IMAGE` is the long form and
    /// `CELLOS_FIRECRACKER_ROOTFS` is the shorthand some host backends
    /// accept. When exactly one of the two is set, its value is copied into
    /// the other so a consumer reading either name finds it. When both are
    /// set, or neither is, nothing changes — an existing value is never
    /// overwritten, even if the two disagree.
    ///
    /// This mutates the environment of the current process, so it must run
    /// before the variables are inspected or handed to a child process.
    fn handle_rootfs_alias() {
        const LONG: &str = "CELLOS_FIRECRACKER_ROOTFS_IMAGE";
        const SHORT: &str = "CELLOS_FIRECRACKER_ROOTFS";

        match (std::env::var_os(LONG), std::env::var_os(SHORT)) {
            // Only the shorthand is present: publish it under the long name.
            (None, Some(short)) => std::env::set_var(LONG, short),
            // Only the long name is present: mirror it for consumers that
            // still read the shorthand.
            (Some(long), None) => std::env::set_var(SHORT, long),
            // Both set or both unset: nothing to bridge.
            (Some(_), Some(_)) | (None, None) => {}
        }
    }

    /// Writes a one-line notice to stderr saying the test is being skipped,
    /// followed by the given reason. It does not abort anything itself.
    fn skip(reason: &str) {
        let notice = format!("firecracker_e2e: skipping — {reason}");
        eprintln!("{notice}");
    }

    /// Waits for `child` to exit, polling every 200 ms for at most `limit`.
    ///
    /// Returns the exit status, or `None` when the deadline passed first. In
    /// the `None` case the child has already been killed and reaped.
    ///
    /// # Panics
    ///
    /// Panics if polling the child's status fails.
    fn wait_with_deadline(
        child: &mut std::process::Child,
        limit: Duration,
    ) -> Option<std::process::ExitStatus> {
        let deadline = Instant::now() + limit;
        loop {
            if let Some(status) = child.try_wait().expect("try_wait") {
                return Some(status);
            }
            if Instant::now() >= deadline {
                // Best effort: the process may have exited between the poll
                // and the kill, in which case both calls can fail harmlessly.
                let _ = child.kill();
                let _ = child.wait();
                return None;
            }
            std::thread::sleep(Duration::from_millis(200));
        }
    }

    /// Reads a captured log file as text for diagnostics.
    ///
    /// Invalid UTF-8 is replaced rather than rejected, and an unreadable file
    /// yields an empty string, so this never hides the real test failure
    /// behind a secondary error.
    fn read_log_lossy(path: &Path) -> String {
        fs::read(path)
            .map(|bytes| String::from_utf8_lossy(&bytes).into_owned())
            .unwrap_or_default()
    }

    /// Returns `true` if any file with the `jsonl` extension exists anywhere
    /// under `root`. Directories that cannot be listed are ignored.
    fn tree_contains_jsonl(root: &Path) -> bool {
        let mut pending = vec![root.to_path_buf()];
        while let Some(dir) = pending.pop() {
            let entries = match fs::read_dir(&dir) {
                Ok(entries) => entries,
                Err(_) => continue,
            };
            for entry in entries.flatten() {
                let path = entry.path();
                if path.is_dir() {
                    pending.push(path);
                } else if path.extension().and_then(|ext| ext.to_str()) == Some("jsonl") {
                    return true;
                }
            }
        }
        false
    }

    /// Boots a cell that runs `/bin/true` through the Firecracker backend and
    /// checks that the supervisor exits successfully and exports at least one
    /// event JSONL file.
    ///
    /// Every unmet precondition (no `/dev/kvm`, missing environment variable,
    /// missing file, uncreatable socket dir, unbuilt supervisor) prints a skip
    /// notice and returns, which the test harness records as a pass.
    #[test]
    fn supervisor_runs_true_in_firecracker_microvm() {
        // Precondition 1: KVM device.
        if !Path::new("/dev/kvm").exists() {
            skip("/dev/kvm not present (no KVM on this host)");
            return;
        }

        // Bridge ROOTFS aliases before checking required vars.
        handle_rootfs_alias();

        // Precondition 2: required env vars.
        let missing: Vec<&str> = REQUIRED_ENV
            .iter()
            .copied()
            .filter(|k| std::env::var_os(k).is_none())
            .collect();
        if !missing.is_empty() {
            skip(&format!("missing env: {}", missing.join(", ")));
            return;
        }

        // Precondition 3: required files exist on disk. Bad paths in env will
        // produce confusing errors deep inside the VMM; check up front.
        // `var_os` is used (not `var`) because presence was established with
        // `var_os` and a path need not be valid UTF-8.
        for key in [
            "CELLOS_FIRECRACKER_BINARY",
            "CELLOS_FIRECRACKER_KERNEL_IMAGE",
            "CELLOS_FIRECRACKER_ROOTFS_IMAGE",
        ] {
            let path = PathBuf::from(std::env::var_os(key).expect("checked above"));
            if !path.exists() {
                skip(&format!("{key}={} does not exist on disk", path.display()));
                return;
            }
        }

        // Precondition 4: socket dir exists, or can be created.
        let sock_dir =
            PathBuf::from(std::env::var_os("CELLOS_FIRECRACKER_SOCKET_DIR").expect("checked"));
        if !sock_dir.is_dir() && fs::create_dir_all(&sock_dir).is_err() {
            skip(&format!("socket dir {} not creatable", sock_dir.display()));
            return;
        }

        // Precondition 5: supervisor binary is built.
        let exe = supervisor_exe();
        if !exe.is_file() {
            skip(&format!(
                "supervisor binary missing at {} — run `cargo build -p cellos-supervisor --release`",
                exe.display()
            ));
            return;
        }

        // Build the cell spec: /bin/true, 64 MiB RAM, 30s TTL, no egress.
        // The path is absolute because the guest rootfs may not have `true`
        // on PATH, and `/bin/true` is the canonical Alpine location.
        let tmp = tempfile::tempdir().expect("tempdir");
        let spec_path = tmp.path().join("cell.json");
        let spec_json = r#"{
  "apiVersion": "cellos.io/v1",
  "kind": "ExecutionCell",
  "spec": {
    "id": "fc-e2e-true",
    "authority": { "secretRefs": [], "egressRules": [] },
    "lifetime": { "ttlSeconds": 30 },
    "run": {
      "secretDelivery": "env",
      "argv": ["/bin/true"],
      "timeoutMs": 20000,
      "limits": { "memoryMaxBytes": 67108864 }
    }
  }
}"#;
        File::create(&spec_path)
            .and_then(|mut f| f.write_all(spec_json.as_bytes()))
            .expect("write cell spec");

        // Per-run export dir so we can assert at least one event JSONL was
        // emitted by the supervisor.
        let export_dir = tmp.path().join("events");
        fs::create_dir_all(&export_dir).expect("mkdir export dir");

        // Send the supervisor's stdout/stderr to files instead of pipes. A
        // pipe that nobody drains while we poll for exit fills up, blocks the
        // child on write, and turns verbose logging into a bogus timeout.
        let stdout_log = tmp.path().join("supervisor.stdout.log");
        let stderr_log = tmp.path().join("supervisor.stderr.log");
        let stdout_file = File::create(&stdout_log).expect("create supervisor stdout log");
        let stderr_file = File::create(&stderr_log).expect("create supervisor stderr log");

        // Every CELLOS_FIRECRACKER_* var the harness set up. The child
        // inherits the parent environment anyway; forwarding them explicitly
        // keeps the dependency visible and survives a future `env_clear()`.
        let forwarded: Vec<(OsString, OsString)> = std::env::vars_os()
            .filter(|(key, _)| key.to_string_lossy().starts_with("CELLOS_FIRECRACKER_"))
            .collect();

        // Build the command: backend selector, export dir, and the spec path
        // as the only argument.
        let mut cmd = Command::new(&exe);
        cmd.env("CELL_OS_USE_NOOP_SINK", "1") // disable NATS sink
            .env("CELLOS_CELL_BACKEND", "firecracker")
            .env("CELLOS_EXPORT_DIR", &export_dir)
            .env("RUST_BACKTRACE", "1")
            .envs(forwarded)
            .arg(&spec_path)
            .stdout(Stdio::from(stdout_file))
            .stderr(Stdio::from(stderr_file));

        eprintln!("firecracker_e2e: spawning supervisor {}", exe.display());
        let mut child = cmd.spawn().expect("spawn supervisor");

        // 30s wait with a poll loop. Firecracker boot + /bin/true + teardown
        // should complete well under 10s on a healthy runner; 30s is the
        // documented worst-case.
        let outcome = wait_with_deadline(&mut child, Duration::from_secs(30));

        // Collect whatever the supervisor wrote, for diagnostics on failure.
        let stderr_buf = read_log_lossy(&stderr_log);
        let stdout_buf = read_log_lossy(&stdout_log);

        let status = match outcome {
            Some(status) => status,
            None => panic!(
                "supervisor did not exit within 30s\n--- stderr ---\n{}\n--- stdout ---\n{}",
                stderr_buf, stdout_buf
            ),
        };

        assert!(
            status.success(),
            "supervisor exited non-zero: {status:?}\n--- stderr ---\n{stderr_buf}\n--- stdout ---\n{stdout_buf}"
        );

        // Assert at least one .jsonl exists under the export dir tree.
        assert!(
            tree_contains_jsonl(&export_dir),
            "expected at least one event JSONL file under {}",
            export_dir.display()
        );

        // Drop tmpdir last so we keep artifacts available through the asserts.
        drop(tmp);
    }
}