cellos-host-gvisor 0.5.1

gVisor runsc backend for CellOS — runs cells in user-space syscall-emulated sandboxes for environments without KVM.
Documentation
//! Linux-only `runsc` shell-out implementation of [`CellBackend`].
//!
//! L2-06-5 skeleton. We do NOT depend on `runsc` being installed at build
//! time; failure to locate or invoke the binary surfaces as a
//! [`CellosError::Host`] at `create()` time and the supervisor degrades
//! to whatever fallback the operator wired (typically the stub backend on
//! non-prod hosts).
//!
//! Behaviour:
//!
//! - `create(spec)`:
//!   1. Generate the bundle (`config.json` + empty `rootfs/`) via
//!      [`generate_bundle_config`] into a temporary directory keyed by the
//!      cell id.
//!   2. Spawn `runsc run --bundle <dir> <cell-id>` without waiting for it.
//!      The child process handle is tracked in an in-memory map keyed by
//!      cell id so `wait_for_in_vm_exit` can join it later.
//! - `wait_for_in_vm_exit(cell_id)`:
//!   - Waits for the tracked child and returns `Some(Ok(code))` once it has
//!     exited; returns `None` immediately if the cell id is not tracked by
//!     this backend (the supervisor's host-subprocess path takes over).
//! - `destroy(handle)`:
//!   - Best-effort `runsc kill <cell-id> SIGKILL`, then `runsc delete
//!     <cell-id>`. Errors are logged but do not block the teardown
//!     report — gVisor's own state is the source of truth.

use std::collections::HashMap;
use std::path::PathBuf;
use std::process::Stdio;
use std::sync::Arc;

use async_trait::async_trait;
use tokio::process::{Child, Command};
use tokio::sync::Mutex;
use tracing::{instrument, warn};

use cellos_core::ports::{CellBackend, CellHandle, TeardownReport};
use cellos_core::{CellosError, ExecutionCellDocument};

use crate::bundle::generate_bundle_config;

/// Name of the environment variable that overrides the `runsc` binary path.
///
/// When the variable cannot be read, the bare name `runsc` is used and
/// resolved through `$PATH`. The variable is consulted on every `create()`
/// and every `destroy()`, so tests can inject a fake binary without
/// rebuilding.
const RUNSC_BIN_ENV: &str = "CELLOS_GVISOR_RUNSC_BIN";

/// Name of the environment variable that overrides the bundle staging root.
///
/// Defaults to `${TMPDIR:-/tmp}/cellos-gvisor`. `create()` stages each
/// cell's bundle in a subdirectory of this root named after the cell id.
const BUNDLE_ROOT_ENV: &str = "CELLOS_GVISOR_BUNDLE_ROOT";

/// Per-cell bookkeeping stored in the backend's in-memory map.
///
/// An entry is inserted by `create()` and taken out again by whichever of
/// `wait_for_in_vm_exit()` or `destroy()` reaches it first.
struct TrackedCell {
    /// Bundle directory staged by `create()`. `destroy()` removes it when
    /// the entry is still present in the map at that point.
    bundle_dir: PathBuf,
    /// Handle to the spawned `runsc run` process; `wait_for_in_vm_exit`
    /// awaits it to obtain the exit status.
    child: Child,
}

/// gVisor-backed [`CellBackend`].
///
/// All real `runsc` invocations live in `create()` / `destroy()` /
/// `wait_for_in_vm_exit()`. The struct holds only an in-memory
/// cell id → [`TrackedCell`] map; nothing is persisted, so a freshly
/// constructed backend knows about no cells.
pub struct GVisorCellBackend {
    /// Cells spawned by this backend instance, keyed by cell id. Guarded by
    /// an async (tokio) mutex and wrapped in an `Arc`.
    tracked: Arc<Mutex<HashMap<String, TrackedCell>>>,
}

impl Default for GVisorCellBackend {
    fn default() -> Self {
        Self::new()
    }
}

impl GVisorCellBackend {
    /// Create a backend that tracks no cells yet.
    pub fn new() -> Self {
        Self {
            tracked: Arc::new(Mutex::new(HashMap::new())),
        }
    }

    /// Resolve the `runsc` executable to invoke.
    ///
    /// Returns the value of the `RUNSC_BIN_ENV` variable when it is set to
    /// a non-empty Unicode string, otherwise the bare name `runsc` (which
    /// the process spawner resolves through `$PATH`). An empty override is
    /// treated as unset so it cannot turn into an attempt to execute `""`.
    fn runsc_bin() -> String {
        std::env::var(RUNSC_BIN_ENV)
            .ok()
            .filter(|bin| !bin.is_empty())
            .unwrap_or_else(|| "runsc".to_string())
    }

    /// Resolve the directory under which per-cell bundles are staged.
    ///
    /// Precedence: the `BUNDLE_ROOT_ENV` override, then
    /// `$TMPDIR/cellos-gvisor`, then `/tmp/cellos-gvisor`. A variable that
    /// is set but empty counts as unset, matching the documented
    /// `${TMPDIR:-/tmp}` shell semantics and avoiding an accidental path
    /// relative to the current working directory.
    fn bundle_root() -> PathBuf {
        // `var_os` rather than `var`: these are filesystem paths, which are
        // not required to be valid Unicode.
        let non_empty = |key: &str| std::env::var_os(key).filter(|value| !value.is_empty());

        if let Some(root) = non_empty(BUNDLE_ROOT_ENV) {
            return PathBuf::from(root);
        }
        non_empty("TMPDIR")
            .map_or_else(|| PathBuf::from("/tmp"), PathBuf::from)
            .join("cellos-gvisor")
    }
}

#[async_trait]
impl CellBackend for GVisorCellBackend {
    #[instrument(skip(self, spec), fields(cell_id = %spec.spec.id))]
    async fn create(&self, spec: &ExecutionCellDocument) -> Result<CellHandle, CellosError> {
        let cfg = generate_bundle_config(spec)
            .map_err(|e| CellosError::InvalidSpec(format!("gvisor bundle: {e}")))?;

        let cell_id = spec.spec.id.clone();
        let bundle_dir = Self::bundle_root().join(&cell_id);
        let rootfs_dir = bundle_dir.join("rootfs");

        // Bundle layout: <bundle_dir>/config.json + <bundle_dir>/rootfs/
        std::fs::create_dir_all(&rootfs_dir).map_err(|e| {
            CellosError::Host(format!("gvisor: create bundle dir {bundle_dir:?}: {e}"))
        })?;
        let config_path = bundle_dir.join("config.json");
        let json = serde_json::to_vec_pretty(&cfg)
            .map_err(|e| CellosError::Host(format!("gvisor: serialize config.json: {e}")))?;
        std::fs::write(&config_path, json)
            .map_err(|e| CellosError::Host(format!("gvisor: write {config_path:?}: {e}")))?;

        let mut cmd = Command::new(Self::runsc_bin());
        cmd.arg("run")
            .arg("--bundle")
            .arg(&bundle_dir)
            .arg(&cell_id)
            .stdin(Stdio::null())
            .stdout(Stdio::null())
            .stderr(Stdio::null());

        let child = cmd.spawn().map_err(|e| {
            CellosError::Host(format!(
                "gvisor: spawn `runsc run --bundle {bundle_dir:?} {cell_id}` failed: {e}"
            ))
        })?;

        self.tracked.lock().await.insert(
            cell_id.clone(),
            TrackedCell {
                bundle_dir: bundle_dir.clone(),
                child,
            },
        );

        Ok(CellHandle {
            cell_id,
            cgroup_path: None,
            // gVisor manages its own network namespace; we do not apply
            // host-side nftables in this skeleton. The supervisor's
            // host-subprocess fallback surfaces the signal when the spec
            // declares egress.
            nft_rules_applied: None,
            kernel_digest_sha256: None,
            rootfs_digest_sha256: None,
            firecracker_digest_sha256: None,
        })
    }

    /// Wait for the `runsc run` child of `cell_id` and report its exit code.
    ///
    /// Returns `None` when `cell_id` has no entry in the tracking map.
    /// Otherwise the entry is taken out of the map and its child awaited:
    /// `Some(Ok(code))` carries the exit code, with `-1` substituted when
    /// the status has no code, and `Some(Err(CellosError::Host))` reports a
    /// failure of the wait itself.
    ///
    /// The bundle directory is deliberately left on disk here so it can be
    /// inspected after the cell exits. Note that the map entry (and with it
    /// the recorded bundle path) is gone once this function has run.
    #[instrument(skip(self))]
    async fn wait_for_in_vm_exit(&self, cell_id: &str) -> Option<Result<i32, CellosError>> {
        // The lock guard is a temporary of this statement, so the map is
        // unlocked again before the child is awaited below.
        let removed = self.tracked.lock().await.remove(cell_id);
        let TrackedCell {
            bundle_dir: _,
            mut child,
        } = removed?;

        let outcome = child
            .wait()
            .await
            .map(|status| status.code().unwrap_or(-1))
            .map_err(|e| {
                CellosError::Host(format!("gvisor: wait for runsc child of {cell_id}: {e}"))
            });
        Some(outcome)
    }

    #[instrument(skip(self, handle), fields(cell_id = %handle.cell_id))]
    async fn destroy(&self, handle: &CellHandle) -> Result<TeardownReport, CellosError> {
        let mut tracked = self.tracked.lock().await;
        let entry = tracked.remove(&handle.cell_id);
        let still_tracked = tracked.len();
        drop(tracked);

        // Best-effort: kill the container, then delete it. We never error
        // out the teardown report on these — gVisor's own state is the
        // authority on whether the cell exists.
        let runsc = Self::runsc_bin();
        for (sub, args) in [
            ("kill", vec!["kill", &handle.cell_id, "SIGKILL"]),
            ("delete", vec!["delete", &handle.cell_id]),
        ] {
            let res = Command::new(&runsc)
                .args(&args)
                .stdin(Stdio::null())
                .stdout(Stdio::null())
                .stderr(Stdio::null())
                .status()
                .await;
            if let Err(e) = res {
                warn!(error = %e, "gvisor: `runsc {sub} {}` failed (continuing)", handle.cell_id);
            }
        }

        // Best-effort bundle cleanup.
        if let Some(t) = entry {
            if let Err(e) = std::fs::remove_dir_all(&t.bundle_dir) {
                warn!(error = %e, bundle = ?t.bundle_dir, "gvisor: bundle cleanup failed");
            }
        }

        Ok(TeardownReport {
            cell_id: handle.cell_id.clone(),
            destroyed: true,
            peers_tracked_after: still_tracked,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes every test in this module that touches the process
    /// environment. The test harness runs tests on multiple threads of one
    /// process, and the environment is process-global, so unsynchronized
    /// `set_var` / `remove_var` calls would race with each other.
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// Sets an environment variable for the lifetime of the guard and
    /// restores the previous state on drop, including when an assertion
    /// panics. Holds `ENV_LOCK` for as long as the override is active.
    struct EnvOverride {
        key: &'static str,
        previous: Option<std::ffi::OsString>,
        _serialized: std::sync::MutexGuard<'static, ()>,
    }

    impl EnvOverride {
        fn set(key: &'static str, value: &str) -> Self {
            // A poisoned lock only means another env test failed; the lock
            // protects no data, so it is safe to keep going.
            let serialized = ENV_LOCK
                .lock()
                .unwrap_or_else(|poisoned| poisoned.into_inner());
            let previous = std::env::var_os(key);
            std::env::set_var(key, value);
            Self {
                key,
                previous,
                _serialized: serialized,
            }
        }
    }

    impl Drop for EnvOverride {
        fn drop(&mut self) {
            // Runs before the lock guard field is released, so the restore
            // is still covered by the lock.
            match self.previous.take() {
                Some(value) => std::env::set_var(self.key, value),
                None => std::env::remove_var(self.key),
            }
        }
    }

    #[test]
    fn runsc_bin_respects_env_override() {
        let _env = EnvOverride::set(RUNSC_BIN_ENV, "/usr/local/bin/my-runsc");
        assert_eq!(GVisorCellBackend::runsc_bin(), "/usr/local/bin/my-runsc");
    }

    #[test]
    fn bundle_root_respects_env_override() {
        let _env = EnvOverride::set(BUNDLE_ROOT_ENV, "/var/lib/cellos-gvisor-test");
        assert_eq!(
            GVisorCellBackend::bundle_root(),
            PathBuf::from("/var/lib/cellos-gvisor-test")
        );
    }
}