cellos-host-cellos 0.5.1

Recursive CellOS-in-CellOS backend — runs CellOS cells as nested supervisors. Used for federated and self-hosting topologies.
Documentation
//! Contract tests for `cellos-host-cellos` (HCELLOS-COV).
//!
//! Goal: pin the observable contract of [`ProprietaryCellBackend`] against the
//! `CellBackend` trait so accidental drift between the simulated proprietary
//! host and the other backends (stub, Firecracker) is caught here.
//!
//! Surface notes (locked here, not invented):
//!
//! - `cellos_core::ports::CellBackend` exposes only `create`, `destroy`, and a
//!   `wait_for_in_vm_exit` hook. There is **no** `start` / `stop` method on the
//!   trait — workload start/stop is the supervisor's concern, not the host
//!   backend's. The "stop-without-start" check below therefore locks the
//!   trait surface (compile-time + behavior when destroying a handle that was
//!   never created), not a runtime stop call.
//! - CloudEvent emission is owned by the supervisor via the `EventSink` port,
//!   **not** by `CellBackend` impls. The simulated proprietary backend
//!   deliberately does not emit lifecycle events; the started → completed →
//!   destroyed CloudEvent ordering is exercised in the supervisor crate. Here
//!   we instead lock the **state-machine** ordering observable through the
//!   public probes (`tracked_cell_count`, `has_tracked_state`) plus the
//!   `(destroyed, peers_tracked_after)` shape that `destroy` returns.
//! - This backend is **stateful** (unlike the stub): it keeps a
//!   `HashMap<cell_id, CellRecord>` and removes the entry on `destroy`. That
//!   means repeat-destroy of the same handle is a typed `CellosError::Host`
//!   ("unknown or already destroyed"), and concurrent destroy must elect
//!   exactly one winner. Both properties are pinned below.

use std::sync::Arc;

use cellos_core::ports::{CellBackend, CellHandle};
use cellos_core::{CellosError, ExecutionCellDocument};
use cellos_host_cellos::ProprietaryCellBackend;

/// Builds the minimal `ExecutionCell` document used by every test in this
/// file, with `spec.id` set to `id`.
///
/// Besides the id, the document carries only an empty `authority.secretRefs`
/// list and a 60-second `lifetime.ttlSeconds`. Panics (via `expect`) if the
/// JSON does not deserialize into an [`ExecutionCellDocument`].
fn doc(id: &str) -> ExecutionCellDocument {
    let raw = serde_json::json!({
        "apiVersion": "cellos.io/v1",
        "kind": "ExecutionCell",
        "spec": {
            "id": id,
            "authority": { "secretRefs": [] },
            "lifetime": { "ttlSeconds": 60 }
        }
    });

    serde_json::from_value::<ExecutionCellDocument>(raw).expect("contract: spec must parse")
}

/// Pins the `create` → `destroy` state progression as seen through the host's
/// public probes.
///
/// Expected sequence for a single cell:
///
/// 1. fresh host — `tracked_cell_count == 0`, `has_tracked_state == false`;
/// 2. after `create` — count is 1, the id is tracked, and the returned handle
///    carries `nft_rules_applied == None`;
/// 3. after `destroy` — the report has `destroyed == true` and
///    `peers_tracked_after == 0`, and both probes read empty again.
///
/// This is the host-cellos counterpart of the supervisor-side
/// started → destroyed event ordering, restricted to the surface this backend
/// owns.
#[tokio::test]
async fn contract_create_then_destroy_lifecycle_ordering() {
    const CELL_ID: &str = "contract-lifecycle-1";

    let backend = ProprietaryCellBackend::new();
    let cell_doc = doc(CELL_ID);

    // Step 1: nothing is tracked before the first create.
    let tracked_before = backend.tracked_cell_count().await;
    assert_eq!(tracked_before, 0);
    let known_before = backend.has_tracked_state(CELL_ID).await;
    assert!(!known_before);

    // Step 2: once create returns, the probes already see the cell.
    let handle = backend
        .create(&cell_doc)
        .await
        .expect("create must succeed");
    assert_eq!(handle.cell_id, CELL_ID);
    let tracked_live = backend.tracked_cell_count().await;
    assert_eq!(tracked_live, 1);
    let known_live = backend.has_tracked_state(CELL_ID).await;
    assert!(known_live);
    // nftables enforcement is not this backend's job (the supervisor reports
    // it through the host-subprocess path), so the handle must say `None`.
    let nft_unclaimed = handle.nft_rules_applied.is_none();
    assert!(nft_unclaimed, "host-cellos must not claim nft enforcement");

    // Step 3: the teardown report and the cleared probes must agree — the
    // cell is never reported destroyed while still being tracked.
    let report = backend
        .destroy(&handle)
        .await
        .expect("destroy must succeed");
    assert_eq!(report.cell_id, CELL_ID);
    assert!(
        report.destroyed,
        "destroy of a live cell reports destroyed=true"
    );
    assert_eq!(
        report.peers_tracked_after, 0,
        "single-cell host: no peer residue after destroy"
    );
    let tracked_after = backend.tracked_cell_count().await;
    assert_eq!(tracked_after, 0);
    let known_after = backend.has_tracked_state(CELL_ID).await;
    assert!(!known_after);
}

/// Pins the `wait_for_in_vm_exit` hook to `None` for this backend.
///
/// The backend is a simulated host and does not run workloads inside a VM.
/// Returning `None` is the signal (per the trait doc on
/// `wait_for_in_vm_exit`) that lets the supervisor fall back to launching
/// `spec.run.argv` as a host-side subprocess.
#[tokio::test]
async fn contract_wait_for_in_vm_exit_is_none() {
    let backend = ProprietaryCellBackend::new();

    let in_vm_exit = backend.wait_for_in_vm_exit("contract-no-vm").await;

    assert!(
        in_vm_exit.is_none(),
        "host-cellos must not claim ownership of in-VM execution"
    );
}

/// Repeat-destroy contract for a **stateful** backend.
///
/// Where the stub backend accepts a second destroy as a no-op success, this
/// backend is expected to reject it with a typed `CellosError::Host`, so a
/// buggy supervisor cannot double-tear-down a cell without noticing. The
/// "idempotency" locked here is therefore:
///
/// - the first destroy succeeds (`destroyed == true`, no peers left);
/// - the second destroy returns `Err(CellosError::Host(_))` instead of
///   panicking;
/// - the probes read the same (empty) state after the failed second call.
///
/// This is in line with the `TeardownReport::destroyed` field doc, which
/// allows real backends to report a double-destroy as `Err` rather than as
/// `destroyed: false`.
#[tokio::test]
async fn contract_repeat_destroy_is_typed_error_not_panic() {
    const CELL_ID: &str = "contract-idempotent-1";

    let backend = ProprietaryCellBackend::new();
    let cell_doc = doc(CELL_ID);
    let handle = backend
        .create(&cell_doc)
        .await
        .expect("create must succeed");

    // First teardown: the cell is live, so this one must go through.
    let first_report = backend
        .destroy(&handle)
        .await
        .expect("first destroy must succeed");
    assert!(first_report.destroyed);
    assert_eq!(first_report.peers_tracked_after, 0);

    // Second teardown with the very same handle: typed error, no panic.
    let err = backend
        .destroy(&handle)
        .await
        .expect_err("second destroy must error, not panic");
    let is_host_error = matches!(err, CellosError::Host(_));
    assert!(
        is_host_error,
        "expected CellosError::Host on repeat-destroy, got: {err:?}"
    );

    // The rejected call must leave the host exactly as the first one left it.
    let tracked = backend.tracked_cell_count().await;
    assert_eq!(tracked, 0);
    let still_known = backend.has_tracked_state(CELL_ID).await;
    assert!(!still_known);
}

/// "Stop-without-start", expressed on the surface that actually exists.
///
/// `CellBackend` has no `stop` method, so a literal `stop()` call would not
/// even compile. The closest runtime equivalent is tearing down a cell that
/// was never created, i.e. calling `destroy` with a hand-built handle whose
/// id the backend has never seen. The behavior documented for this backend
/// (see `tests/smoke.rs::smoke_backend_destroy_unknown_errors`) is a typed
/// `CellosError::Host` whose message contains the cell id; both the variant
/// and the id-in-message property are checked here, along with the absence
/// of any tracked state afterwards.
#[tokio::test]
async fn contract_destroy_unknown_handle_is_typed_error() {
    const NEVER_CREATED: &str = "contract-never-created";

    let backend = ProprietaryCellBackend::new();

    // Synthetic handle: only the id is set, every optional field is empty.
    let ghost = CellHandle {
        cell_id: NEVER_CREATED.into(),
        cgroup_path: None,
        nft_rules_applied: None,
        kernel_digest_sha256: None,
        rootfs_digest_sha256: None,
        firecracker_digest_sha256: None,
    };

    let err = backend
        .destroy(&ghost)
        .await
        .expect_err("destroy of unknown handle must error on host-cellos");

    if let CellosError::Host(msg) = &err {
        assert!(
            msg.contains(NEVER_CREATED),
            "Host error must name the unknown cell id, got: {msg}"
        );
    } else {
        panic!("expected CellosError::Host, got: {err:?}");
    }

    // The failed destroy must not have registered anything as a side effect.
    let tracked = backend.tracked_cell_count().await;
    assert_eq!(tracked, 0);
}

/// Two `destroy()` calls racing on the same handle from separate tasks.
///
/// Properties locked here:
///
/// - both spawned tasks join cleanly (no panic, no aborted task);
/// - exactly one call returns `Ok` and exactly one returns `Err`;
/// - the losing call surfaces `CellosError::Host`;
/// - whichever task won, the host tracks nothing afterwards.
///
/// The "exactly one winner" outcome is what a stateful backend that
/// serializes teardown is expected to produce: one racer sees the live cell,
/// the other sees it already removed.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn contract_concurrent_destroy_does_not_double_fail() {
    const CELL_ID: &str = "contract-concurrent-1";

    let backend = Arc::new(ProprietaryCellBackend::new());
    let handle = backend
        .create(&doc(CELL_ID))
        .await
        .expect("create must succeed");

    // Each racer owns its own backend reference and handle copy.
    let first_task = {
        let backend = Arc::clone(&backend);
        let handle = handle.clone();
        tokio::spawn(async move { backend.destroy(&handle).await })
    };
    let second_task = {
        let backend = Arc::clone(&backend);
        let handle = handle.clone();
        tokio::spawn(async move { backend.destroy(&handle).await })
    };

    // A join error would mean a task panicked or was aborted; that is the
    // failure mode "does not double-fail" rules out. The inner results are
    // allowed to differ.
    let first_outcome = first_task.await.expect("task1 join must not panic");
    let second_outcome = second_task.await.expect("task2 join must not panic");

    // One Ok and one Err, in either order.
    let outcomes = [&first_outcome, &second_outcome];
    let oks = outcomes.iter().filter(|outcome| outcome.is_ok()).count();
    let errs = outcomes.len() - oks;
    assert_eq!(
        oks, 1,
        "exactly one racer must win destroy: r1={r1:?} r2={r2:?}",
        r1 = first_outcome,
        r2 = second_outcome
    );
    assert_eq!(
        errs, 1,
        "exactly one racer must lose destroy: r1={r1:?} r2={r2:?}",
        r1 = first_outcome,
        r2 = second_outcome
    );

    // The loser has to report the typed Host variant.
    let losing_err = match (first_outcome, second_outcome) {
        (Err(first_err), _) => first_err,
        (Ok(_), second) => second.expect_err("loser must be Err by construction"),
    };
    let is_host_error = matches!(losing_err, CellosError::Host(_));
    assert!(
        is_host_error,
        "loser must surface CellosError::Host, got: {losing_err:?}"
    );

    // Regardless of the winner, nothing may remain tracked.
    let tracked = backend.tracked_cell_count().await;
    assert_eq!(tracked, 0);
    let still_known = backend.has_tracked_state(CELL_ID).await;
    assert!(!still_known);
}

/// Invalid-input contract: `create` with an empty `spec.id` must fail with
/// `CellosError::InvalidSpec`, not with a generic backend error.
///
/// Fixing the variant lets callers (supervisor, control plane) match on it
/// instead of parsing message strings, and keeps this backend aligned with
/// the stub's HSTUB-COV contract. The test also checks that the rejected
/// create left no tracked state behind.
#[tokio::test]
async fn contract_create_empty_id_returns_invalid_spec() {
    let backend = ProprietaryCellBackend::new();
    let empty_id_doc = doc("");

    let err = backend
        .create(&empty_id_doc)
        .await
        .expect_err("empty id must be rejected");

    let is_invalid_spec = matches!(err, CellosError::InvalidSpec(_));
    assert!(
        is_invalid_spec,
        "expected CellosError::InvalidSpec, got: {err:?}"
    );

    // Rejection has to happen before anything is recorded.
    let tracked = backend.tracked_cell_count().await;
    assert_eq!(tracked, 0);
}

/// Duplicate-create contract, followed by id reuse.
///
/// While a cell with a given id is live, a second `create` with the same id
/// must be rejected with `CellosError::Host`. Once that cell has been
/// destroyed, the same id must be accepted again, showing that the slot was
/// fully released. This mirrors the inline unit test
/// `destroy_removes_tracked_state_same_id_can_run_again`, exercised here at
/// the `CellBackend` trait-method level.
#[tokio::test]
async fn contract_duplicate_create_rejected_then_id_reusable_after_destroy() {
    const CELL_ID: &str = "contract-dup-1";

    let backend = ProprietaryCellBackend::new();

    // First create claims the id.
    let first = backend.create(&doc(CELL_ID)).await.expect("create 1");

    // Same id while the first cell is still live: typed rejection.
    let err = backend
        .create(&doc(CELL_ID))
        .await
        .expect_err("duplicate live id must be rejected");
    let is_host_error = matches!(err, CellosError::Host(_));
    assert!(
        is_host_error,
        "expected CellosError::Host on duplicate live id, got: {err:?}"
    );

    // Releasing the first cell frees the id for a new create.
    backend
        .destroy(&first)
        .await
        .expect("destroy clears the slot");
    let second = backend
        .create(&doc(CELL_ID))
        .await
        .expect("id is reusable after destroy");
    assert_eq!(second.cell_id, CELL_ID);

    // Leave the host empty.
    backend.destroy(&second).await.expect("teardown");
}