car-builder 0.32.1

Natural-language → validated car-workflow manifest builder for Common Agent Runtime
//! Pure loader + validator for the NL→workflow eval fixtures.
//!
//! This is the FIXTURES-FIRST deliverable of
//! `docs/proposals/h2-builder-discovery-acceptance.md` (Part 1). It validates
//! the *shape and bookkeeping* of `eval/nl2workflow.jsonl` so the fixture set
//! can't silently drift out of the acceptance bands — WITHOUT calling any model.
//! The live pass-rate harness that actually runs generation lands with the
//! implementation, in a separate commit, so acceptance can't be self-graded.

use serde::Deserialize;
use std::collections::HashSet;

/// One eval case — mirrors the acceptance spec's per-case schema.
///
/// Deserialized from a single JSON line of the fixture file. Fields marked
/// `#[serde(default)]` may be omitted from the JSON.
#[derive(Debug, Deserialize)]
struct EvalCase {
    /// Case identifier; `ids_are_unique` requires it to be unique across the fixture.
    id: String,
    /// Difficulty band name; must be one of the names listed in `BANDS`.
    band: String,
    /// The natural-language request. Must be non-blank unless `refusal_ok` is
    /// `Some(true)`.
    nl_request: String,
    /// Assertions the built workflow must satisfy (shape-checked only in this file).
    must_have: MustHave,
    /// Explicit refusal label. Required on every `REFUSAL_BAND` case and
    /// forbidden on cases in any other band; `None` when the key is absent.
    #[serde(default)]
    refusal_ok: Option<bool>,
    /// `Some(true)` marks the case as repair-forcing; `None` when the key is absent.
    #[serde(default)]
    repair_forcing: Option<bool>,
    /// Human note; for repair-forcing cases this states HOW the case forces a
    /// first-pass failure. `at_least_five_repair_forcing_cases` asserts that
    /// the note contains the phrase "HOW IT FORCES FAILURE" (compared after
    /// upper-casing the note). Defaults to the empty string when absent.
    #[serde(default)]
    notes: String,
}

/// Semantic assertions the built workflow must satisfy (checked by the live
/// harness later — here we only validate that the assertion shape is well-formed).
#[derive(Debug, Deserialize)]
struct MustHave {
    /// Inclusive [min, max] stage-count range. The shape test requires
    /// `0 <= min <= max`.
    stage_count: [i64; 2],
    /// Tool assertions; the list may be empty, but every entry must be a
    /// non-blank string.
    required_tools: Vec<String>,
    /// Edge-condition assertions; the list may be empty, but every entry must
    /// be a non-blank string.
    required_edge_conditions: Vec<String>,
}

/// Raw text of the eval fixture, embedded into the test binary at compile time.
/// `load_cases` parses one JSON eval case from each non-blank line.
const FIXTURE: &str = include_str!("../eval/nl2workflow.jsonl");

/// The three difficulty bands, each with exactly this many cases.
///
/// Doubles as the allow-list of band names: `band_counts_match_acceptance_spec`
/// rejects any case whose `band` is not one of the names listed here.
const BANDS: &[(&str, usize)] = &[
    ("single_stage", 10),
    ("multi_stage", 10),
    ("adversarial", 10),
];

/// Minimum number of repair-forcing cases (exercises the repair loop).
/// A case counts only when its `repair_forcing` field is exactly `Some(true)`.
const MIN_REPAIR_FORCING: usize = 5;

/// The band in which `refusal_ok` labels are permitted (the adversarial band).
/// Every case in this band must set `refusal_ok`; cases in any other band must
/// leave it unset.
const REFUSAL_BAND: &str = "adversarial";

/// Parses every non-blank line of `FIXTURE` into an `EvalCase`.
///
/// Blank (whitespace-only) lines are skipped.
///
/// # Panics
///
/// Panics on the first line that does not deserialize into an `EvalCase`.
/// The message carries the 1-based line number as it appears in the fixture
/// file (blank lines are counted), the serde error, and the offending line.
fn load_cases() -> Vec<EvalCase> {
    FIXTURE
        .lines()
        // Number the lines BEFORE dropping blank ones: enumerating after the
        // filter would under-report the position of any line that follows a
        // blank line, sending the reader to the wrong place in the file.
        .enumerate()
        .filter(|(_, line)| !line.trim().is_empty())
        .map(|(idx, line)| {
            serde_json::from_str::<EvalCase>(line).unwrap_or_else(|e| {
                panic!("line {} is not a valid eval case: {e}\n  {line}", idx + 1)
            })
        })
        .collect()
}

/// Every fixture line must deserialize, and the total number of cases must
/// equal the sum of the per-band quotas declared in `BANDS`.
#[test]
fn every_line_parses() {
    let cases = load_cases();
    assert!(!cases.is_empty(), "fixture file is empty");
    // Derive the total from BANDS instead of repeating a literal, so this
    // check cannot drift out of sync when a band quota changes.
    let expected_total: usize = BANDS.iter().map(|&(_, per_band)| per_band).sum();
    assert_eq!(
        cases.len(),
        expected_total,
        "expected exactly {expected_total} eval cases"
    );
}

/// Each band declared in `BANDS` must hold exactly its quota of cases, and no
/// case may name a band that `BANDS` does not declare.
#[test]
fn band_counts_match_acceptance_spec() {
    let cases = load_cases();

    // Quota check, one declared band at a time.
    for &(band, expected) in BANDS {
        let mut n = 0usize;
        for case in &cases {
            if case.band == band {
                n += 1;
            }
        }
        assert_eq!(
            n, expected,
            "band '{band}' must have exactly {expected} cases, found {n}"
        );
    }

    // Allow-list check: every case's band must be a declared one.
    let allowed: HashSet<&str> = BANDS.iter().map(|&(name, _)| name).collect();
    cases.iter().for_each(|case| {
        assert!(
            allowed.contains(case.band.as_str()),
            "case '{}' has unknown band '{}'",
            case.id,
            case.band
        );
    });
}

/// No two eval cases may share an `id`.
#[test]
fn ids_are_unique() {
    let cases = load_cases();
    // Borrowed keys are enough here: `cases` outlives the set, so there is no
    // need to clone each id just to test membership.
    let mut seen: HashSet<&str> = HashSet::with_capacity(cases.len());
    for c in &cases {
        assert!(seen.insert(c.id.as_str()), "duplicate case id '{}'", c.id);
    }
}

/// `refusal_ok` must be set on every case in `REFUSAL_BAND` and on no case
/// outside it.
#[test]
fn refusal_ok_only_in_adversarial_band() {
    let cases = load_cases();

    // Direction 1: a label on a case outside the refusal band is a
    // bookkeeping error.
    cases
        .iter()
        .filter(|case| case.refusal_ok.is_some())
        .for_each(|case| {
            assert_eq!(
                case.band, REFUSAL_BAND,
                "case '{}' carries refusal_ok but is in band '{}', not '{REFUSAL_BAND}'",
                case.id, case.band
            );
        });

    // Direction 2: every case inside the refusal band must carry an explicit
    // label — the live harness's refusal-cap enforcement relies on each such
    // case declaring whether a typed refusal passes.
    for case in &cases {
        if case.band != REFUSAL_BAND {
            continue;
        }
        assert!(
            case.refusal_ok.is_some(),
            "adversarial case '{}' must set refusal_ok explicitly (true|false)",
            case.id
        );
    }
}

/// The fixture must contain at least `MIN_REPAIR_FORCING` cases flagged
/// `repair_forcing: true`, and each of them must explain its failure mechanism.
#[test]
fn at_least_five_repair_forcing_cases() {
    let cases = load_cases();

    // Gather the cases explicitly flagged as repair-forcing.
    let mut flagged: Vec<&EvalCase> = Vec::new();
    for case in &cases {
        if matches!(case.repair_forcing, Some(true)) {
            flagged.push(case);
        }
    }

    let found = flagged.len();
    assert!(
        found >= MIN_REPAIR_FORCING,
        "need >= {MIN_REPAIR_FORCING} repair_forcing cases, found {}",
        found
    );

    // A repair-forcing case is only meaningful if its notes say how the
    // first pass is made to fail; the marker phrase is matched after
    // upper-casing the notes.
    for case in flagged {
        let shouted = case.notes.to_uppercase();
        assert!(
            shouted.contains("HOW IT FORCES FAILURE"),
            "repair_forcing case '{}' must state HOW IT FORCES FAILURE in notes",
            case.id
        );
    }
}

/// Every case's `must_have` block must be structurally sane: a non-negative,
/// ordered stage-count range and no blank tool or edge-condition entries.
#[test]
fn must_have_shapes_are_well_formed() {
    let cases = load_cases();
    for case in &cases {
        let spec = &case.must_have;

        // Range sanity: 0 <= min <= max.
        let (min, max) = (spec.stage_count[0], spec.stage_count[1]);
        assert!(
            0 <= min && min <= max,
            "case '{}' has an invalid stage_count range [{min}, {max}]",
            case.id
        );

        // The lists may be empty, but any entry present must be non-blank.
        assert!(
            spec.required_tools.iter().all(|tool| !tool.trim().is_empty()),
            "case '{}' has an empty required_tools entry",
            case.id
        );
        assert!(
            spec.required_edge_conditions
                .iter()
                .all(|cond| !cond.trim().is_empty()),
            "case '{}' has an empty required_edge_conditions entry",
            case.id
        );
    }
}

/// Any case not labeled `refusal_ok: true` must carry a non-blank request,
/// since the builder is expected to build something from it. Cases labeled
/// `refusal_ok: true` are exempt and may legitimately be empty.
#[test]
fn non_refusal_cases_have_a_non_empty_request() {
    let cases = load_cases();
    let must_build = cases
        .iter()
        .filter(|case| !matches!(case.refusal_ok, Some(true)));
    for case in must_build {
        let request = case.nl_request.trim();
        assert!(
            !request.is_empty(),
            "non-refusal case '{}' must have a non-empty nl_request",
            case.id
        );
    }
}