use serde::Deserialize;
use std::collections::HashSet;
/// One record of the eval fixture, deserialized from a single JSON line.
#[derive(Debug, Deserialize)]
struct EvalCase {
    /// Case identifier; the tests in this file require it to be unique.
    id: String,
    /// Band the case belongs to; the tests require it to be one of the names
    /// listed in `BANDS`.
    band: String,
    /// Request text for the case (presumably natural language, going by the
    /// field name — confirm against the fixture). The tests require it to be
    /// non-blank unless `refusal_ok` is `Some(true)`.
    nl_request: String,
    /// Structural expectations attached to the case.
    must_have: MustHave,
    /// Optional flag; absent in the JSON means `None`. The tests require it
    /// to be present exactly on cases whose band equals `REFUSAL_BAND`.
    #[serde(default)]
    refusal_ok: Option<bool>,
    /// Optional flag; cases with `Some(true)` are counted against
    /// `MIN_REPAIR_FORCING`.
    #[serde(default)]
    repair_forcing: Option<bool>,
    /// Free-form notes; defaults to the empty string when absent. For cases
    /// with `repair_forcing == Some(true)` the tests require the uppercased
    /// notes to contain "HOW IT FORCES FAILURE".
    #[serde(default)]
    notes: String,
}
/// Expectations attached to an eval case under its `must_have` key.
#[derive(Debug, Deserialize)]
struct MustHave {
    /// `[min, max]` pair; the tests require `min >= 0` and `max >= min`.
    /// NOTE(review): presumably bounds on the number of workflow stages —
    /// whether the bounds are inclusive is not visible here; confirm.
    stage_count: [i64; 2],
    /// Tool names; the tests reject entries that are empty or whitespace-only.
    required_tools: Vec<String>,
    /// Edge conditions; the tests reject entries that are empty or
    /// whitespace-only.
    required_edge_conditions: Vec<String>,
}
/// Contents of `../eval/nl2workflow.jsonl`, embedded at compile time.
const FIXTURE: &str = include_str!("../eval/nl2workflow.jsonl");
/// Each band name paired with the exact number of cases that band must hold.
const BANDS: &[(&str, usize)] = &[
    ("single_stage", 10),
    ("multi_stage", 10),
    ("adversarial", 10),
];
/// Minimum number of cases that must have `repair_forcing` set to `true`.
const MIN_REPAIR_FORCING: usize = 5;
/// The band whose cases must set `refusal_ok`; no other band may set it.
const REFUSAL_BAND: &str = "adversarial";
/// Parses every non-blank line of `FIXTURE` into an `EvalCase`.
///
/// Blank (empty or whitespace-only) lines are skipped.
///
/// # Panics
///
/// Panics if a non-blank line is not a valid JSON `EvalCase`. The message
/// carries the 1-based line number within the fixture file (blank lines are
/// counted), the parse error, and the offending line.
fn load_cases() -> Vec<EvalCase> {
    FIXTURE
        .lines()
        // Number the lines *before* dropping blank ones so the reported
        // line number matches the position in the fixture file.
        .enumerate()
        .filter(|(_, line)| !line.trim().is_empty())
        .map(|(i, line)| {
            serde_json::from_str::<EvalCase>(line).unwrap_or_else(|e| {
                panic!("line {} is not a valid eval case: {e}\n {line}", i + 1)
            })
        })
        .collect()
}
/// Verifies that the fixture parses and holds the expected number of cases.
///
/// The expected total is the sum of the per-band quotas in `BANDS`, so this
/// test cannot drift out of sync with the band table.
#[test]
fn every_line_parses() {
    // `load_cases` panics on the first line that fails to deserialize.
    let cases = load_cases();
    assert!(!cases.is_empty(), "fixture file is empty");

    let expected_total: usize = BANDS.iter().map(|(_, quota)| quota).sum();
    assert_eq!(
        cases.len(),
        expected_total,
        "expected exactly {expected_total} eval cases"
    );
}
/// Checks the fixture against the per-band quotas in `BANDS`.
///
/// Every listed band must contain exactly its configured number of cases,
/// and no case may name a band that is missing from `BANDS`.
#[test]
fn band_counts_match_acceptance_spec() {
    let cases = load_cases();

    // Quota check: tally each configured band and compare with its target.
    for &(band, expected) in BANDS {
        let n = cases.iter().filter(|case| case.band == band).count();
        assert_eq!(
            n, expected,
            "band '{band}' must have exactly {expected} cases, found {n}"
        );
    }

    // Membership check: report the first case whose band is not configured.
    let known: HashSet<&str> = BANDS.iter().map(|&(name, _)| name).collect();
    if let Some(c) = cases.iter().find(|c| !known.contains(c.band.as_str())) {
        panic!("case '{}' has unknown band '{}'", c.id, c.band);
    }
}
/// Verifies that no two eval cases share the same `id`.
#[test]
fn ids_are_unique() {
    let cases = load_cases();
    // Borrow the ids instead of cloning them: `cases` outlives the set, so
    // `&str` keys suffice and no per-case allocation is needed.
    let mut seen: HashSet<&str> = HashSet::with_capacity(cases.len());
    for c in &cases {
        // `insert` returns `false` when the id was already present.
        assert!(
            seen.insert(c.id.as_str()),
            "duplicate case id '{}'",
            c.id
        );
    }
}
/// Verifies that `refusal_ok` is set on exactly the `REFUSAL_BAND` cases.
///
/// First pass: any case carrying `refusal_ok` must be in `REFUSAL_BAND`.
/// Second pass: every case in `REFUSAL_BAND` must carry `refusal_ok`.
#[test]
fn refusal_ok_only_in_adversarial_band() {
    let cases = load_cases();

    // Pass 1: the flag must not appear outside the refusal band.
    for c in cases.iter().filter(|c| c.refusal_ok.is_some()) {
        assert_eq!(
            c.band, REFUSAL_BAND,
            "case '{}' carries refusal_ok but is in band '{}', not '{REFUSAL_BAND}'",
            c.id, c.band
        );
    }

    // Pass 2: the flag must be explicit inside the refusal band.
    for c in &cases {
        if c.band != REFUSAL_BAND {
            continue;
        }
        assert!(
            c.refusal_ok.is_some(),
            "adversarial case '{}' must set refusal_ok explicitly (true|false)",
            c.id
        );
    }
}
/// Verifies the repair-forcing quota and its documentation requirement.
///
/// At least `MIN_REPAIR_FORCING` cases must have `repair_forcing` set to
/// `true`, and each of them must mention "HOW IT FORCES FAILURE" in its
/// notes (compared after uppercasing the notes).
#[test]
fn at_least_five_repair_forcing_cases() {
    let cases = load_cases();

    let mut repair: Vec<&EvalCase> = Vec::new();
    for c in &cases {
        if matches!(c.repair_forcing, Some(true)) {
            repair.push(c);
        }
    }

    assert!(
        MIN_REPAIR_FORCING <= repair.len(),
        "need >= {MIN_REPAIR_FORCING} repair_forcing cases, found {}",
        repair.len()
    );

    for c in repair {
        let shouted = c.notes.to_uppercase();
        assert!(
            shouted.contains("HOW IT FORCES FAILURE"),
            "repair_forcing case '{}' must state HOW IT FORCES FAILURE in notes",
            c.id
        );
    }
}
/// Verifies the `must_have` section of every case.
///
/// The `stage_count` pair must satisfy `0 <= min <= max`, and neither
/// `required_tools` nor `required_edge_conditions` may contain an entry that
/// is empty or whitespace-only.
#[test]
fn must_have_shapes_are_well_formed() {
    let cases = load_cases();
    for c in &cases {
        let shape = &c.must_have;

        let [min, max] = shape.stage_count;
        assert!(
            0 <= min && min <= max,
            "case '{}' has an invalid stage_count range [{min}, {max}]",
            c.id
        );

        let tools_ok = shape.required_tools.iter().all(|t| !t.trim().is_empty());
        assert!(
            tools_ok,
            "case '{}' has an empty required_tools entry",
            c.id
        );

        let edges_ok = shape
            .required_edge_conditions
            .iter()
            .all(|e| !e.trim().is_empty());
        assert!(
            edges_ok,
            "case '{}' has an empty required_edge_conditions entry",
            c.id
        );
    }
}
/// Verifies that every case not marked `refusal_ok == Some(true)` has a
/// request that is neither empty nor whitespace-only.
#[test]
fn non_refusal_cases_have_a_non_empty_request() {
    let cases = load_cases();
    // Cases with `refusal_ok` unset or `Some(false)` are all checked.
    let must_have_request = cases.iter().filter(|c| c.refusal_ok != Some(true));
    for c in must_have_request {
        let request = c.nl_request.trim();
        assert!(
            !request.is_empty(),
            "non-refusal case '{}' must have a non-empty nl_request",
            c.id
        );
    }
}