supermachine 0.4.21

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
//! 100/100 reliability gate for multi-vCPU snapshot/restore.
//!
//! THIS IS THE SHIP-GATE for the smpark.ko approach. Bakes a
//! 4-vCPU snapshot with smpark.ko staged + auto-loaded by
//! init-oci, then drives 100 acquire+exec cycles. Expects zero
//! failures. Prior userspace-only attempt (rendezvous + LR drain
//! at HVF level) topped out at 8/10 and was rejected.
//!
//! Why 4 vCPUs: the failure rate scales with secondary count.
//! 1 vCPU: 100/100 trivially. 2 vCPUs: ~95/100 baseline (LR
//! drain is mostly correct). 4 vCPUs: ~80/100 baseline — this
//! is where the failure class becomes visible. 8/16 vCPUs go
//! lower still but few real workloads use that many.
//!
//! Run:
//!   cargo run --release --example _smpark_reliability_gate
//!
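//! Optional env vars:
//!   SMPARK_GATE_CYCLES=100 — number of acquire+exec cycles
//!   SMPARK_GATE_VCPUS=4    — vCPUs for the bake
//!   SMPARK_GATE_KO=docs/design/extras/smpark/smpark.ko
//!     — path to the kernel module
//!   SMPARK_GATE_FRESH_RESTORE=1 — restore the worker between
//!     cycles; set to 0 to reuse the same restored worker
//!   SMPARK_GATE_MAX=1      — maximum number of pool workers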
//!
//! Exit code:
//!   0  — all cycles succeeded (ship gate passed)
//!   1  — at least one cycle failed (ship gate failed)
//!   2  — bake failed (gate inconclusive)

use std::path::PathBuf;
use std::time::{Duration, Instant};
use supermachine::Image;

/// Exit status: every cycle succeeded (ship gate passed).
const EXIT_PASS: i32 = 0;
/// Exit status: at least one cycle failed (ship gate failed).
const EXIT_FAIL: i32 = 1;
/// Exit status: the gate could not be evaluated (missing module, bake or
/// pool setup failure, zero cycles requested).
const EXIT_INCONCLUSIVE: i32 = 2;

/// Reads the environment variable `name` and parses it as `T`.
///
/// Returns `default` when the variable is unset (or not valid Unicode).
/// When it is set but does not parse, a warning is printed to stderr and
/// `default` is used, so a typo cannot silently change what the gate tests.
fn env_parse_or<T: std::str::FromStr>(name: &str, default: T) -> T {
    match std::env::var(name) {
        Ok(raw) => match raw.parse() {
            Ok(value) => value,
            Err(_) => {
                eprintln!("[gate] warning: ignoring unparsable {name}={raw:?}; using the default");
                default
            }
        },
        Err(_) => default,
    }
}

/// Bakes the multi-vCPU snapshot, drives the acquire+exec cycles and prints
/// a summary to stderr.
///
/// Returns the process exit status instead of calling `std::process::exit`
/// itself, so every local (image, pool) is dropped before the process ends.
///
/// Returns:
/// - `EXIT_PASS` when all cycles succeeded,
/// - `EXIT_FAIL` when at least one cycle failed,
/// - `EXIT_INCONCLUSIVE` when the gate could not be run at all.
fn run_gate() -> i32 {
    let cycles: usize = env_parse_or("SMPARK_GATE_CYCLES", 100);
    let vcpus: u32 = env_parse_or("SMPARK_GATE_VCPUS", 4);
    let smpark_ko = PathBuf::from(
        std::env::var("SMPARK_GATE_KO")
            .unwrap_or_else(|_| "docs/design/extras/smpark/smpark.ko".to_string()),
    );

    // Zero cycles would report PASS without exercising a single restore;
    // refuse up front rather than hand out a vacuous green result.
    if cycles == 0 {
        eprintln!("[gate] SMPARK_GATE_CYCLES=0 — no cycles to run, gate inconclusive");
        return EXIT_INCONCLUSIVE;
    }

    if !smpark_ko.is_file() {
        eprintln!(
            "[gate] smpark.ko not found at {} — set SMPARK_GATE_KO or run from repo root",
            smpark_ko.display()
        );
        return EXIT_INCONCLUSIVE;
    }

    eprintln!(
        "[gate] baking rust:1-slim with {} vCPUs + smpark.ko staged (pipelined path)...",
        vcpus
    );
    let bake_t0 = Instant::now();
    // No-op warmup callback forces the pipelined-bake path,
    // which drives `smpark_park` / `smpark_unpark` from the host
    // around `SNAPSHOT_ASYNC` and the warm `SNAPSHOT` calls. The
    // sequential bake (no warmup) uses the worker's internal
    // heartbeat trigger which can't issue host-driven RPCs from
    // within take_snapshot — vCPU 0 is the dispatcher and would
    // deadlock waiting for the agent it's supposed to be running.
    let image = match Image::builder("rust:1-slim")
        .with_name(format!("rust_1_slim_smpark_gate_{}vcpu", vcpus))
        .with_memory_mib(512)
        .with_vcpus(vcpus)
        .with_extra_file(&smpark_ko, "/supermachine-smpark.ko")
        .with_warmup_tag("smpark_gate")
        .with_warmup(|_vm| Ok(()))
        .build()
    {
        Ok(i) => i,
        Err(e) => {
            eprintln!("[gate] bake failed: {e}");
            return EXIT_INCONCLUSIVE;
        }
    };
    let bake_ms = bake_t0.elapsed().as_millis();
    eprintln!("[gate] bake done in {} ms", bake_ms);

    // Mode toggle:
    //   SMPARK_GATE_FRESH_RESTORE=1 (default) — `restore_on_release(true)`,
    //     pool restores the worker between cycles. Tests that
    //     restore-from-snapshot is reliable (the actual ship gate).
    //   SMPARK_GATE_FRESH_RESTORE=0 — `restore_on_release(false)`,
    //     same restored worker reused across cycles. Tests that
    //     a single restored multi-vCPU guest stays healthy under
    //     repeated exec — useful for isolating restore-time
    //     issues vs in-flight-execution issues.
    let fresh_restore = std::env::var("SMPARK_GATE_FRESH_RESTORE")
        .ok()
        .as_deref()
        != Some("0");
    let max_workers: usize = env_parse_or("SMPARK_GATE_MAX", 1);
    let pool = match image
        .pool()
        .min(0)
        .max(max_workers)
        .restore_on_release(fresh_restore)
        // 30s per acquire — long enough to spawn+restore on a busy
        // host, short enough to surface a hang as FAIL rather than
        // wedging the gate. Default would be 60s.
        .acquire_timeout(Duration::from_secs(30))
        .build()
    {
        Ok(p) => p,
        Err(e) => {
            // Not a cycle failure: no cycle ran, so the gate is inconclusive
            // (exit 2), not failed (exit 1).
            eprintln!("[gate] pool setup failed: {e}");
            return EXIT_INCONCLUSIVE;
        }
    };
    eprintln!("[gate] pool max={max_workers}");
    eprintln!(
        "[gate] mode: fresh_restore={fresh_restore} (set SMPARK_GATE_FRESH_RESTORE=0 to reuse)"
    );

    // One acquire+exec cycle. The guest command prints "ok" and then checks
    // that a module matching "smpark" is listed in /proc/modules; both the
    // exit status and the "ok" marker must be present for the cycle to pass.
    let run_cycle = || -> Result<(), String> {
        let vm = pool.acquire().map_err(|e| format!("acquire: {e}"))?;
        let out = vm
            .exec_builder()
            .argv(["sh", "-c", "echo ok && cat /proc/modules | grep -q smpark"])
            .timeout(Duration::from_secs(10))
            .output()
            .map_err(|e| format!("exec: {e}"))?;
        let stdout = String::from_utf8_lossy(&out.stdout);
        let exit_code = out.status.code().unwrap_or(-1);
        if exit_code != 0 || !stdout.contains("ok") {
            return Err(format!(
                "exit={exit_code} stdout={stdout:?} stderr={:?}",
                String::from_utf8_lossy(&out.stderr)
            ));
        }
        Ok(())
    };

    let mut successes = 0usize;
    let mut failures = 0usize;
    // Cycle number (1-based) and message of the first failing cycle, kept
    // for the summary so it is not lost in the per-cycle log.
    let mut first_fail: Option<(usize, String)> = None;
    let cycles_t0 = Instant::now();
    for i in 0..cycles {
        let t0 = Instant::now();
        let outcome = run_cycle();
        let cycle_ms = t0.elapsed().as_millis();
        match outcome {
            Ok(()) => {
                successes += 1;
                eprintln!(
                    "[gate] cycle {:3}/{cycles} OK in {} ms (total OK={successes} FAIL={failures})",
                    i + 1,
                    cycle_ms
                );
            }
            Err(e) => {
                failures += 1;
                eprintln!(
                    "[gate] cycle {:3}/{cycles} FAIL in {} ms: {e}",
                    i + 1,
                    cycle_ms
                );
                if first_fail.is_none() {
                    first_fail = Some((i + 1, e));
                }
            }
        }
    }
    let total_ms = cycles_t0.elapsed().as_millis();

    eprintln!();
    eprintln!("[gate] === SUMMARY ===");
    eprintln!("[gate] cycles      : {cycles}");
    eprintln!("[gate] successes   : {successes}");
    eprintln!("[gate] failures    : {failures}");
    eprintln!("[gate] total ms    : {total_ms}");
    if let Some((n, e)) = &first_fail {
        eprintln!("[gate] first fail  : cycle {n}: {e}");
    }
    if failures == 0 {
        eprintln!("[gate] PASS — multi-vCPU snapshot reliability gate met");
        EXIT_PASS
    } else {
        eprintln!("[gate] FAIL — {failures} cycle(s) failed");
        EXIT_FAIL
    }
}

/// Entry point: runs the gate and maps its outcome onto the process exit
/// status (0 = pass, 1 = at least one cycle failed, 2 = inconclusive).
///
/// `std::process::exit` does not run destructors, so it is only called
/// here, after `run_gate` has returned and dropped the pool and image.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let code = run_gate();
    if code != EXIT_PASS {
        std::process::exit(code);
    }
    Ok(())
}