#![cfg(feature = "generation")]
use std::sync::Arc;
use swink_agent_eval::JudgeVerdict;
use swink_agent_eval::generation::{ExperimentGenerator, GenerationRequest, ToolDef, TopicPlanner};
use swink_agent_eval::testing::MockJudge;
fn verdict_json(body: &str) -> JudgeVerdict {
JudgeVerdict {
score: 1.0,
pass: true,
reason: Some(body.to_string()),
label: None,
}
}
fn topic_list_verdict(topics: &[&str]) -> JudgeVerdict {
let body = serde_json::to_string(&topics.to_vec()).unwrap();
verdict_json(&body)
}
fn case_body(name: &str, tool: Option<&str>) -> String {
let trajectory = tool
.map(|t| format!(r#","expected_trajectory":[{{"tool_name":"{t}"}}]"#))
.unwrap_or_default();
format!(
r#"{{"name":"{name}","system_prompt":"you are helpful","user_messages":["hi"],"expected_response":"okay","expected_assertion":"goal met"{trajectory}}}"#
)
}
#[tokio::test]
async fn twenty_cases_across_five_topics_have_even_distribution() {
let mut verdicts = vec![topic_list_verdict(&[
"alpha", "beta", "gamma", "delta", "epsilon",
])];
for i in 0..20 {
verdicts.push(verdict_json(&case_body(&format!("case-{i}"), None)));
}
let judge = Arc::new(MockJudge::with_verdicts(verdicts));
let planner = Arc::new(TopicPlanner::new(judge.clone(), "planner-model"));
let generator = ExperimentGenerator::new(judge, "gen-model", planner);
let request = GenerationRequest {
context: "multi-tool agent".into(),
task: "help the user".into(),
desired_count: 20,
num_topics: 5,
include_expected_output: true,
include_expected_trajectory: false,
include_expected_interactions: false,
include_metadata: false,
agent_tools: None,
};
let set = generator.generate(request).await.unwrap();
assert_eq!(
set.cases.len(),
20,
"expected 20 cases, got {}",
set.cases.len()
);
let mut counts = std::collections::HashMap::<String, u32>::new();
for case in &set.cases {
let topic = case
.id
.split("::")
.next()
.map(String::from)
.unwrap_or_default();
*counts.entry(topic).or_default() += 1;
}
assert_eq!(counts.len(), 5);
for (topic, count) in counts {
assert_eq!(count, 4, "topic {topic} had {count} cases");
}
}
#[tokio::test]
async fn trajectories_are_scoped_to_provided_tools() {
let mut verdicts = vec![topic_list_verdict(&["only"])];
let body = r#"{"name":"scoped","system_prompt":"p","user_messages":["m"],"expected_trajectory":[{"tool_name":"allowed_tool"},{"tool_name":"forbidden_tool"}]}"#;
verdicts.push(verdict_json(body));
let judge = Arc::new(MockJudge::with_verdicts(verdicts));
let planner = Arc::new(TopicPlanner::new(judge.clone(), "planner-model"));
let generator = ExperimentGenerator::new(judge, "gen-model", planner);
let request = GenerationRequest {
context: "ctx".into(),
task: "t".into(),
desired_count: 1,
num_topics: 1,
include_expected_output: false,
include_expected_trajectory: true,
include_expected_interactions: false,
include_metadata: false,
agent_tools: Some(vec![ToolDef::new("allowed_tool", "the allowed tool")]),
};
let set = generator.generate(request).await.unwrap();
assert_eq!(set.cases.len(), 1);
let trajectory = set.cases[0]
.expected_trajectory
.as_ref()
.expect("trajectory emitted");
assert_eq!(trajectory.len(), 1);
assert_eq!(trajectory[0].tool_name, "allowed_tool");
}
#[tokio::test]
async fn generator_retries_past_malformed_judge_output() {
let mut verdicts = vec![topic_list_verdict(&["only"])];
verdicts.push(verdict_json("not json"));
verdicts.push(verdict_json("{incomplete"));
verdicts.push(verdict_json(&case_body("recovered", None)));
let judge = Arc::new(MockJudge::with_verdicts(verdicts));
let planner = Arc::new(TopicPlanner::new(judge.clone(), "planner-model"));
let generator = ExperimentGenerator::new(judge, "gen-model", planner).with_retry_cap(3);
let request = GenerationRequest {
desired_count: 1,
num_topics: 1,
..GenerationRequest::default()
};
let set = generator.generate(request).await.unwrap();
assert_eq!(set.cases.len(), 1);
assert!(set.cases[0].name.contains("recovered"));
}
#[tokio::test]
async fn generator_validates_every_emitted_case() {
let mut verdicts = vec![topic_list_verdict(&["only"])];
for _ in 0..5 {
verdicts.push(verdict_json(
r#"{"name":"no-messages","system_prompt":"p"}"#,
));
}
let judge = Arc::new(MockJudge::with_verdicts(verdicts));
let planner = Arc::new(TopicPlanner::new(judge.clone(), "planner-model"));
let generator = ExperimentGenerator::new(judge, "gen-model", planner).with_retry_cap(3);
let request = GenerationRequest {
desired_count: 1,
num_topics: 1,
..GenerationRequest::default()
};
let set = generator.generate(request).await.unwrap();
assert_eq!(set.cases.len(), 0, "malformed draft must not appear");
}