use agentcarousel_core::{
annotate_run_cost, CaseId, CaseResult, CaseStatus, FixtureFile, Message, Role,
};
use agentcarousel_fixtures::load_fixture;
use agentcarousel_reporters::persist_run;
use agentcarousel_runner::{call_llm, run_eval, EvalConfig, GenerationMode, RunnerConfig};
use clap::Parser;
use console::style;
use similar::{ChangeTag, TextDiff};
use std::collections::HashMap;
use std::path::PathBuf;
use super::config::ResolvedConfig;
use super::exit_codes::ExitCode;
use super::fixture_utils::collect_fixture_paths;
use super::GlobalOptions;
// Command-line arguments for `agc optimize`.
// NOTE: plain `//` comments are used on purpose; `///` doc comments on a clap
// derive struct would be picked up as help text and change the CLI output.
#[derive(Debug, Parser)]
#[command(
long_about = "Automated prompt optimization loop.\n\nEach iteration: (1) eval to find failing cases, (2) analyze failures by rubric dimension, (3) synthesize 3 prompt candidates via LLM, (4) score each candidate, (5) apply the best one. Writes the winning prompt to prompt.md and saves an optimization report.\n\nRequires ANTHROPIC_API_KEY (or the key for your chosen model).",
after_help = "Examples:\n agc optimize fixtures/my-skill/\n agc optimize fixtures/my-skill/ --target-score 0.95 --max-iter 5 --budget 15\n agc optimize fixtures/my-skill/ --model claude-opus-4-7 --judge-model claude-opus-4-7"
)]
pub struct OptimizeArgs {
// Positional fixture path; mutually exclusive with `--skill`.
#[arg(value_name = "PATH", conflicts_with = "skill")]
path: Option<PathBuf>,
// Skill name, resolved by the command to `fixtures/<NAME>`; mutually exclusive with PATH.
#[arg(long, value_name = "NAME", conflicts_with = "path")]
skill: Option<String>,
// Pass rate at which the loop stops; compared against the eval summary's pass rate.
#[arg(long, default_value_t = 0.9)]
target_score: f32,
// Spending cap compared against the accumulated eval cost in USD.
#[arg(long, default_value_t = 10.0)]
budget: f64,
// Maximum number of loop iterations (also accepted as `--max-iter`).
#[arg(long, alias = "max-iter", default_value_t = 5)]
iterations: u32,
// Generator model override; the command falls back to the configured generator model.
#[arg(long)]
model: Option<String>,
// Judge model override; the command falls back to the configured judge model.
#[arg(long)]
judge_model: Option<String>,
// Config file path. Not read in this file — presumably consumed when the
// ResolvedConfig is built by the caller; TODO confirm.
#[arg(long)]
pub config: Option<PathBuf>,
// Where to write the JSON report; defaults to a timestamped file name.
#[arg(long)]
output: Option<PathBuf>,
// Print what would be optimized and exit before any eval is run.
#[arg(long)]
dry_run: bool,
// Generator endpoint override; the command falls back to the configured generator endpoint.
#[arg(long, value_name = "URL")]
generator_endpoint: Option<String>,
// Judge endpoint override; used as given, with no config fallback in this file.
#[arg(long, value_name = "URL")]
judge_endpoint: Option<String>,
}
/// Final result of an optimization run; serialized to JSON as the report file.
#[derive(serde::Serialize, Debug)]
struct OptimizeReport {
/// Skill/agent name taken from the first loaded fixture.
skill: String,
/// Full-suite pass rate of the unmodified prompt.
baseline_score: f32,
/// Last entry of `score_trajectory` (the baseline score if nothing was recorded).
final_score: f32,
/// Target pass rate requested on the command line.
target_score: f32,
/// Whether `final_score >= target_score`.
target_reached: bool,
/// Number of iteration records produced.
iterations_run: u32,
/// Baseline score followed by the score recorded after each iteration.
score_trajectory: Vec<f32>,
/// Accumulated eval cost in USD as reported by the run summaries.
total_cost_usd: f64,
/// Wall-clock duration of the whole loop, in seconds.
total_duration_secs: f64,
/// Prompt text the run started from.
baseline_prompt: String,
/// Prompt text after the last applied edit.
final_prompt: String,
/// Line diff between baseline and final prompt; empty when they are equal.
prompt_diff: String,
/// One record per completed iteration.
iterations: Vec<IterationRecord>,
}
/// LLM-produced explanation for one failure cluster.
#[derive(serde::Serialize, Debug, Clone)]
pub struct FailureAnalysis {
/// Rubric id of the cluster this analysis belongs to.
pub cluster_id: String,
/// Per-case explanation text, keyed by the representative case id.
pub representative_cases: Vec<(CaseId, String)>,
/// Summary of the per-case explanations for the whole cluster.
pub synthesis: String,
}
/// Everything recorded about a single optimization iteration.
#[derive(serde::Serialize, Debug)]
struct IterationRecord {
/// 1-based iteration number.
iteration: u32,
/// Intended to hold the full-suite pass rate before this iteration's edit.
score_before: f32,
/// Number of failing cases analysed in this iteration.
failure_count: usize,
/// Rubric id of the first (largest) failure cluster, if any.
top_rubric_failure: Option<String>,
/// Failure clusters computed for this iteration.
failure_clusters: Vec<FailureCluster>,
/// Per-cluster LLM analyses.
failure_analyses: Vec<FailureAnalysis>,
/// Edits proposed by the synthesis step.
edit_candidates: Vec<PromptEditCandidate>,
/// Scoring results for the proposed edits.
candidates: Vec<CandidateRecord>,
/// 1-based index of the applied edit, or `None` when nothing was applied.
applied: Option<usize>,
/// Score recorded at the end of the iteration.
score_after: f32,
/// Cost attributed to this iteration, in USD.
cost_usd: f64,
/// Wall-clock duration of this iteration, in seconds.
duration_secs: f64,
}
/// A single proposed prompt edit, deserialized from the synthesis LLM's JSON.
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
pub struct PromptEditCandidate {
/// Identifier supplied by the LLM (the request asks for `edit-1`, `edit-2`, ...).
pub id: String,
/// Text to replace in the prompt; an empty string means "append at the end".
pub original_text: String,
/// Text that replaces `original_text` (or is appended).
pub replacement_text: String,
/// Explanation of which failure the edit addresses.
pub rationale: String,
/// Id of the failure cluster the edit targets.
pub targets_cluster: String,
}
impl PromptEditCandidate {
    /// Returns `prompt` with this edit applied.
    ///
    /// With a non-empty `original_text`, only its first occurrence is replaced
    /// (the prompt comes back unchanged when it does not occur). With an empty
    /// `original_text`, the replacement is appended on a new line after the
    /// prompt's trailing whitespace has been trimmed.
    pub fn apply(&self, prompt: &str) -> String {
        if !self.original_text.is_empty() {
            return prompt.replacen(self.original_text.as_str(), &self.replacement_text, 1);
        }
        let mut appended = String::from(prompt.trim_end());
        appended.push('\n');
        appended.push_str(&self.replacement_text);
        appended
    }
}
/// Scoring result for one edit candidate within an iteration.
#[derive(serde::Serialize, Debug)]
struct CandidateRecord {
/// 1-based position of the edit among the iteration's candidates.
index: usize,
/// Pass rate of the edited prompt on the targeted failing cases.
score: f32,
/// `score` minus the current full-suite score at scoring time.
delta: f32,
/// Change in pass rate on the targeted failing cases versus the unedited prompt.
delta_failing: f32,
/// Drop in pass rate on the sampled passing cases, clamped at zero.
regression: f32,
/// `delta_failing - regression`; the candidate with the highest value is applied.
selection_score: f32,
/// `id` of the scored edit.
edit_id: String,
/// Cluster the edit claims to target.
targets_cluster: String,
/// Rationale text copied from the edit.
rationale: String,
}
/// Entry point for `agc optimize`.
///
/// Resolves model/endpoint overrides, loads the fixtures and the skill's
/// `prompt.md`, runs the optimization loop on a Tokio runtime, prints a
/// summary and writes the JSON report.
///
/// Returns a process exit code: `Ok` when the final score reaches the target
/// (or in dry-run mode), `Failed` when it does not, and `ConfigError`,
/// `NotFound` or `RuntimeError` for setup problems.
pub fn run_optimize_command(
    args: OptimizeArgs,
    config: &ResolvedConfig,
    globals: &GlobalOptions,
) -> i32 {
    let model = args
        .model
        .clone()
        .unwrap_or_else(|| config.generator.model.clone());
    let judge_model = args
        .judge_model
        .clone()
        .unwrap_or_else(|| config.judge.model.clone());
    let generator_endpoint = args
        .generator_endpoint
        .clone()
        .or_else(|| config.generator.endpoint.clone());
    // NOTE(review): unlike the generator endpoint, this has no config fallback.
    let judge_endpoint = args.judge_endpoint.clone();

    let fixture_root = match (&args.skill, &args.path) {
        (Some(name), _) => PathBuf::from("fixtures").join(name),
        (_, Some(p)) => p.clone(),
        (None, None) => {
            eprintln!("error: provide either a PATH argument or --skill <name>");
            return ExitCode::ConfigError.as_i32();
        }
    };
    let fixture_paths = collect_fixture_paths(std::slice::from_ref(&fixture_root));
    if fixture_paths.is_empty() {
        eprintln!(
            "error: no fixture files found at '{}'",
            fixture_root.display()
        );
        return ExitCode::NotFound.as_i32();
    }

    let mut fixtures: Vec<FixtureFile> = Vec::new();
    for path in &fixture_paths {
        match load_fixture(path) {
            Ok(f) => fixtures.push(f),
            Err(e) => {
                eprintln!("error: failed to load '{}': {e}", path.display());
                return ExitCode::NotFound.as_i32();
            }
        }
    }
    let total_cases: usize = fixtures.iter().map(|f| f.cases.len()).sum();
    if total_cases == 0 {
        eprintln!("error: fixtures contain no cases");
        return ExitCode::NotFound.as_i32();
    }

    // The prompt is always looked up under `fixtures/<skill>/`, using the skill
    // name declared by the first fixture.
    // NOTE(review): this ignores a custom PATH argument — confirm that is intended.
    let skill = fixtures[0].skill_or_agent.clone();
    let prompt_path = PathBuf::from("fixtures").join(&skill).join("prompt.md");
    if !prompt_path.exists() {
        eprintln!(
            "error: prompt.md not found at '{}'\n hint: agc optimize requires a skill directory with a prompt.md",
            prompt_path.display()
        );
        return ExitCode::NotFound.as_i32();
    }
    let baseline_prompt = match std::fs::read_to_string(&prompt_path) {
        Ok(s) => s,
        Err(e) => {
            eprintln!("error: could not read '{}': {e}", prompt_path.display());
            return ExitCode::RuntimeError.as_i32();
        }
    };

    if !globals.quiet {
        println!(
            "\n{} {} · {} cases · target={:.0}% · budget=${:.0} · max_iter={}",
            style("agc optimize").bold(),
            style(&skill).cyan(),
            total_cases,
            args.target_score * 100.0,
            args.budget,
            args.iterations,
        );
        println!(" prompt: {}", prompt_path.display());
        println!(" model: {} judge: {}\n", model, judge_model);
    }

    if args.dry_run {
        println!(
            "{} dry-run mode — no API calls will be made",
            style("note:").yellow().bold()
        );
        println!(
            " Would optimize: {} cases across {} fixtures",
            total_cases,
            fixtures.len()
        );
        println!(" Prompt path: {}", prompt_path.display());
        return ExitCode::Ok.as_i32();
    }

    // Building the runtime can fail for OS-level reasons (e.g. thread limits);
    // report that as a runtime error instead of panicking.
    let runtime = match tokio::runtime::Builder::new_multi_thread()
        .enable_io()
        .enable_time()
        .build()
    {
        Ok(rt) => rt,
        Err(e) => {
            eprintln!("error: could not start async runtime: {e}");
            return ExitCode::RuntimeError.as_i32();
        }
    };

    let report = runtime.block_on(optimize_loop(
        fixtures,
        fixture_paths
            .iter()
            .map(|p| p.to_string_lossy().into_owned())
            .collect(),
        baseline_prompt,
        prompt_path.clone(),
        &model,
        &judge_model,
        generator_endpoint.as_deref(),
        judge_endpoint.as_deref(),
        &args,
        config,
        globals,
    ));
    print_optimize_summary(&report, globals);

    // Failing to persist the report is only a warning; the exit code reflects the score.
    let report_path = args.output.unwrap_or_else(|| {
        PathBuf::from(format!(
            "optimize-report-{}.json",
            chrono::Utc::now().format("%Y%m%d-%H%M%S")
        ))
    });
    match serde_json::to_string_pretty(&report) {
        Ok(json) => {
            if let Err(e) = std::fs::write(&report_path, json) {
                eprintln!(
                    "warn: could not write report to '{}': {e}",
                    report_path.display()
                );
            } else {
                println!(" report: {}", report_path.display());
            }
        }
        Err(e) => eprintln!("warn: could not serialize report: {e}"),
    }

    if report.final_score >= args.target_score {
        ExitCode::Ok.as_i32()
    } else {
        ExitCode::Failed.as_i32()
    }
}
#[allow(clippy::too_many_arguments)]
async fn optimize_loop(
fixtures: Vec<FixtureFile>,
_fixture_paths: Vec<String>,
baseline_prompt: String,
prompt_path: PathBuf,
model: &str,
judge_model: &str,
generator_endpoint: Option<&str>,
judge_endpoint: Option<&str>,
args: &OptimizeArgs,
config: &ResolvedConfig,
globals: &GlobalOptions,
) -> OptimizeReport {
let loop_start = std::time::Instant::now();
let skill = fixtures[0].skill_or_agent.clone();
let rubric_lookup = build_rubric_lookup(&fixtures);
let mut current_prompt = baseline_prompt.clone();
let mut total_cost: f64 = 0.0;
let mut score_trajectory: Vec<f32> = Vec::new();
let mut iteration_records: Vec<IterationRecord> = Vec::new();
if !globals.quiet {
println!(
" {} baseline eval ({} cases)…",
style("⟳").dim(),
fixtures.iter().map(|f| f.cases.len()).sum::<usize>()
);
}
let baseline_run = eval_with_prompt(
fixtures.clone(),
¤t_prompt,
model,
judge_model,
generator_endpoint,
judge_endpoint,
config,
globals,
)
.await;
let _ = persist_run(&baseline_run);
let baseline_score = baseline_run.summary.pass_rate;
total_cost += baseline_run.summary.total_cost_usd.unwrap_or(0.0);
score_trajectory.push(baseline_score);
if !globals.quiet {
let failures = count_failures(&baseline_run);
println!(
" Baseline: {:.0}% pass rate ({}/{} passing, {} failing)",
baseline_score * 100.0,
baseline_run.summary.passed,
baseline_run.summary.total,
failures,
);
}
if baseline_score >= args.target_score {
if !globals.quiet {
println!(
" {} already at target — no optimization needed",
style("✓").green()
);
}
return OptimizeReport {
skill,
baseline_score,
final_score: baseline_score,
target_score: args.target_score,
target_reached: true,
iterations_run: 0,
score_trajectory,
total_cost_usd: total_cost,
total_duration_secs: loop_start.elapsed().as_secs_f64(),
prompt_diff: String::new(),
baseline_prompt: baseline_prompt.clone(),
final_prompt: current_prompt,
iterations: iteration_records,
};
}
let mut current_score = baseline_score;
let mut no_improvement_streak: u32 = 0;
for iter in 0..args.iterations {
if current_score >= args.target_score {
break;
}
if total_cost >= args.budget {
if !globals.quiet {
eprintln!(
" {} budget ${:.2} exhausted (spent ${:.4})",
style("warn:").yellow(),
args.budget,
total_cost
);
}
break;
}
if no_improvement_streak >= 2 {
if !globals.quiet {
eprintln!(
" {} 2 consecutive iterations with no improvement — stopping early",
style("warn:").yellow()
);
}
break;
}
let iter_start = std::time::Instant::now();
if !globals.quiet {
println!(
"\n {} Iteration {}/{} (current score: {:.0}%)",
style("→").bold(),
iter + 1,
args.iterations,
current_score * 100.0
);
}
let current_run = if iter == 0 {
&baseline_run
} else {
&baseline_run };
let failing_cases = collect_failures(current_run);
if failing_cases.is_empty() {
break;
}
let clusters = cluster_failures(&failing_cases, &rubric_lookup);
let top_rubric = clusters.first().map(|c| c.rubric_id.clone());
if !globals.quiet {
let msg = top_rubric.as_deref().unwrap_or("(no rubric data)");
println!(
" Failures: {} top dimension: {} ({} cluster(s))",
failing_cases.len(),
msg,
clusters.len(),
);
}
let case_map: HashMap<CaseId, &CaseResult> = failing_cases
.iter()
.map(|c| (c.case_id.clone(), *c))
.collect();
let analyses =
analyze_clusters_llm(&clusters, &case_map, judge_model, judge_endpoint).await;
let feedback = if analyses.is_empty() {
let failure_summary = format_failure_summary(&failing_cases, ¤t_prompt);
match analyze_failures_llm(
&failure_summary,
¤t_prompt,
judge_model,
judge_endpoint,
)
.await
{
Ok(f) => f,
Err(e) => {
eprintln!(" warn: analysis LLM call failed: {e} — skipping iteration");
continue;
}
}
} else {
analyses
.iter()
.map(|a| format!("[{}] {}", a.cluster_id, a.synthesis))
.collect::<Vec<_>>()
.join("\n")
};
if !globals.quiet {
let first_line = feedback.lines().next().unwrap_or("(no feedback)");
println!(
" Analysis: {}",
first_line.chars().take(120).collect::<String>()
);
}
let edit_candidates = match synthesize_edit_candidates_llm(
¤t_prompt,
&analyses,
&feedback,
model,
generator_endpoint,
)
.await
{
Ok(c) if !c.is_empty() => c,
Ok(_) => {
eprintln!(" warn: synthesis returned no edit candidates — skipping iteration");
continue;
}
Err(e) => {
eprintln!(" warn: synthesis LLM call failed: {e} — skipping iteration");
continue;
}
};
let top_cluster = clusters.first();
let failing_target_ids: Vec<CaseId> = top_cluster
.map(|c| c.representative.clone())
.unwrap_or_default();
let passing_target_ids: Vec<CaseId> = top_cluster
.map(|c| passing_cases_for_dimension(&baseline_run, &c.rubric_id, 5))
.unwrap_or_default();
let failing_fixtures = filter_fixtures_to_cases(&fixtures, &failing_target_ids);
let passing_fixtures = filter_fixtures_to_cases(&fixtures, &passing_target_ids);
let baseline_failing_rate = if failing_fixtures.is_empty() {
0.0_f32
} else {
let r = eval_with_prompt(
failing_fixtures.clone(),
¤t_prompt,
model,
judge_model,
generator_endpoint,
judge_endpoint,
config,
globals,
)
.await;
total_cost += r.summary.total_cost_usd.unwrap_or(0.0);
r.summary.pass_rate
};
let baseline_passing_rate = if passing_fixtures.is_empty() {
1.0_f32
} else {
let r = eval_with_prompt(
passing_fixtures.clone(),
¤t_prompt,
model,
judge_model,
generator_endpoint,
judge_endpoint,
config,
globals,
)
.await;
total_cost += r.summary.total_cost_usd.unwrap_or(0.0);
r.summary.pass_rate
};
if !globals.quiet {
println!(
" Scoring {} edit candidates (targeted: {} failing + {} regression)…",
edit_candidates.len(),
failing_target_ids.len(),
passing_target_ids.len(),
);
}
let mut candidate_records: Vec<CandidateRecord> = Vec::new();
let mut best_selection: f32 = 0.0;
let mut best_idx: Option<usize> = None;
let mut best_candidate_prompt: Option<String> = None;
for (i, edit) in edit_candidates.iter().enumerate() {
let applied_prompt = edit.apply(¤t_prompt);
let (delta_failing, score_on_failing) = if !failing_fixtures.is_empty() {
let r = eval_with_prompt(
failing_fixtures.clone(),
&applied_prompt,
model,
judge_model,
generator_endpoint,
judge_endpoint,
config,
globals,
)
.await;
let _ = persist_run(&r);
total_cost += r.summary.total_cost_usd.unwrap_or(0.0);
let rate = r.summary.pass_rate;
(rate - baseline_failing_rate, rate)
} else {
(0.0, 0.0)
};
let regression = if !passing_fixtures.is_empty() {
let r = eval_with_prompt(
passing_fixtures.clone(),
&applied_prompt,
model,
judge_model,
generator_endpoint,
judge_endpoint,
config,
globals,
)
.await;
total_cost += r.summary.total_cost_usd.unwrap_or(0.0);
(baseline_passing_rate - r.summary.pass_rate).max(0.0)
} else {
0.0
};
let selection_score = delta_failing - regression;
let score = score_on_failing;
let delta = score - current_score;
if !globals.quiet {
let arrow = if delta_failing > 0.0 {
style(format!("+{:.0}% failing", delta_failing * 100.0))
.green()
.to_string()
} else if delta_failing < 0.0 {
style(format!("{:.0}% failing", delta_failing * 100.0))
.red()
.to_string()
} else {
style("±0%".to_string()).dim().to_string()
};
println!(
" edit {}: {} regression={:.0}% sel={:.2} [→ {}]",
i + 1,
arrow,
regression * 100.0,
selection_score,
edit.targets_cluster,
);
}
candidate_records.push(CandidateRecord {
index: i + 1,
score,
delta,
delta_failing,
regression,
selection_score,
edit_id: edit.id.clone(),
targets_cluster: edit.targets_cluster.clone(),
rationale: edit.rationale.clone(),
});
if delta_failing > 0.0 && selection_score > best_selection {
best_selection = selection_score;
best_idx = Some(i + 1);
best_candidate_prompt = Some(applied_prompt);
}
}
let score_after;
let applied;
if let Some(new_prompt) = best_candidate_prompt {
current_prompt = new_prompt;
let bak_path = prompt_path.with_extension("md.bak");
if let Err(e) = std::fs::copy(&prompt_path, &bak_path) {
eprintln!(" warn: could not write prompt.md.bak: {e}");
}
if let Err(e) = std::fs::write(&prompt_path, ¤t_prompt) {
eprintln!(" warn: could not write prompt.md: {e}");
}
no_improvement_streak = 0;
let reeval = eval_with_prompt(
fixtures.clone(),
¤t_prompt,
model,
judge_model,
generator_endpoint,
judge_endpoint,
config,
globals,
)
.await;
let _ = persist_run(&reeval);
total_cost += reeval.summary.total_cost_usd.unwrap_or(0.0);
score_after = reeval.summary.pass_rate;
applied = best_idx;
current_score = score_after;
if !globals.quiet {
println!(
" {} Applied edit {} ({:.0}% → {:.0}% on full suite)",
style("✓").green(),
best_idx.unwrap_or(0),
current_score * 100.0,
score_after * 100.0,
);
}
} else {
score_after = current_score;
applied = None;
no_improvement_streak += 1;
if !globals.quiet {
println!(
" {} No edit improved failing cases without regression — holding (streak: {})",
style("→").dim(),
no_improvement_streak,
);
}
}
score_trajectory.push(score_after);
let iter_cost = total_cost - iteration_records.iter().map(|r| r.cost_usd).sum::<f64>();
let iter_duration = iter_start.elapsed().as_secs_f64();
iteration_records.push(IterationRecord {
iteration: iter + 1,
score_before: current_score - (score_after - current_score),
failure_count: failing_cases.len(),
top_rubric_failure: top_rubric,
failure_clusters: clusters,
failure_analyses: analyses,
edit_candidates,
candidates: candidate_records,
applied,
score_after,
cost_usd: iter_cost,
duration_secs: iter_duration,
});
}
let final_score = *score_trajectory.last().unwrap_or(&baseline_score);
let total_duration = loop_start.elapsed().as_secs_f64();
let final_prompt = current_prompt;
let prompt_diff = unified_diff(&baseline_prompt, &final_prompt);
OptimizeReport {
skill,
baseline_score,
final_score,
target_score: args.target_score,
target_reached: final_score >= args.target_score,
iterations_run: iteration_records.len() as u32,
score_trajectory,
total_cost_usd: total_cost,
total_duration_secs: total_duration,
baseline_prompt,
final_prompt,
prompt_diff,
iterations: iteration_records,
}
}
/// Prints the end-of-run summary: totals, one line per iteration, the final
/// score against the target and, unless `globals.quiet` is set, a coloured
/// prompt diff.
fn print_optimize_summary(report: &OptimizeReport, globals: &GlobalOptions) {
    println!(
        "\nOptimization complete: {} iteration(s), ${:.4} spent, {:.1}s",
        report.iterations_run, report.total_cost_usd, report.total_duration_secs,
    );

    for rec in &report.iterations {
        let change = rec.score_after - rec.score_before;
        // Anything that is neither strictly positive nor strictly negative
        // falls through to the neutral label.
        let change_label = match change.partial_cmp(&0.0) {
            Some(std::cmp::Ordering::Greater) => style(format!("(+{:.0}%)", change * 100.0))
                .green()
                .to_string(),
            Some(std::cmp::Ordering::Less) => {
                style(format!("({:.0}%)", change * 100.0)).red().to_string()
            }
            _ => style("(±0%)".to_string()).dim().to_string(),
        };

        // `applied` is 1-based; convert it back to an index into `edit_candidates`.
        let applied_edit = rec
            .applied
            .and_then(|idx| rec.edit_candidates.get(idx.saturating_sub(1)));
        let edit_label = match applied_edit {
            Some(edit) => format!(
                "Edit: {}",
                edit.rationale.chars().take(60).collect::<String>()
            ),
            None => "no edit applied".to_string(),
        };

        println!(
            " Iteration {}: {:.0}% → {:.0}% {} {}",
            rec.iteration,
            rec.score_before * 100.0,
            rec.score_after * 100.0,
            change_label,
            edit_label,
        );
    }

    let (relation, mark) = if report.target_reached {
        ("≥", style("✓").green().bold().to_string())
    } else {
        ("<", style("✗").yellow().to_string())
    };
    let score_line = format!(
        "Score: {:.2} {} target {:.2} {}",
        report.final_score, relation, report.target_score, mark,
    );
    println!("{score_line}");

    if report.prompt_diff.is_empty() || globals.quiet {
        return;
    }
    println!("\nPrompt diff:");
    for line in report.prompt_diff.lines() {
        match line.chars().next() {
            Some('+') => println!(" {}", style(line).green()),
            Some('-') => println!(" {}", style(line).red()),
            _ => println!(" {line}"),
        }
    }
}
/// Renders a line-by-line diff of `old` against `new`.
///
/// Every line is prefixed with `-` (removed), `+` (added) or a space
/// (unchanged) and is newline-terminated. Identical inputs yield an empty
/// string.
fn unified_diff(old: &str, new: &str) -> String {
    let mut rendered = String::new();
    if old != new {
        let diff = TextDiff::from_lines(old, new);
        for change in diff.iter_all_changes() {
            let marker = match change.tag() {
                ChangeTag::Delete => '-',
                ChangeTag::Insert => '+',
                ChangeTag::Equal => ' ',
            };
            let text = change.value();
            rendered.push(marker);
            rendered.push_str(text);
            // The last line of an input may lack its terminator; add one.
            if !text.ends_with('\n') {
                rendered.push('\n');
            }
        }
    }
    rendered
}
/// Evaluates `fixtures` live with `system_prompt` injected as the system
/// message of every case, then annotates the resulting run with cost
/// information for the generator and judge models.
// Too many arguments: mirrors the settings resolved by the optimize command.
#[allow(clippy::too_many_arguments)]
async fn eval_with_prompt(
    fixtures: Vec<FixtureFile>,
    system_prompt: &str,
    model: &str,
    judge_model: &str,
    generator_endpoint: Option<&str>,
    judge_endpoint: Option<&str>,
    config: &ResolvedConfig,
    globals: &GlobalOptions,
) -> agentcarousel_core::Run {
    let prompted_fixtures = inject_system_prompt(fixtures, system_prompt);

    let settings = EvalConfig {
        runner: RunnerConfig {
            concurrency: config.runner.concurrency.unwrap_or(4),
            timeout_secs: config.runner.timeout_secs,
            offline: false,
            mock_dir: config.runner.mock_dir.clone(),
            generation_mode: GenerationMode::Live,
            generator_model: Some(model.to_owned()),
            generator_max_tokens: config.generator.max_tokens,
            generator_endpoint: generator_endpoint.map(str::to_owned),
            fail_fast: false,
            mock_strict: false,
            command: String::from("optimize-eval"),
            agentcarousel_version: String::from(env!("CARGO_PKG_VERSION")),
            config_hash: String::from("optimize"),
            run_id: None,
            batch_collect_id: None,
        },
        runs: 1,
        seed: 0,
        evaluator: config.eval.default_evaluator.clone(),
        judge: true,
        judge_model: Some(judge_model.to_owned()),
        judge_endpoint: judge_endpoint.map(str::to_owned),
        effectiveness_threshold: config.eval.effectiveness_threshold,
        judge_max_tokens: config.judge.max_tokens,
        // Progress output follows the global quiet flag.
        progress: !globals.quiet,
    };

    let mut outcome = run_eval(prompted_fixtures, settings).await;
    annotate_run_cost(&mut outcome, model, Some(judge_model));
    outcome
}
/// Returns `fixtures` with each case's system messages replaced by a single
/// system message containing `system_prompt`, placed first in the list.
fn inject_system_prompt(fixtures: Vec<FixtureFile>, system_prompt: &str) -> Vec<FixtureFile> {
    let mut prompted = fixtures;
    for fixture in prompted.iter_mut() {
        for case in fixture.cases.iter_mut() {
            let messages = &mut case.input.messages;
            // Drop any system message already in the fixture before adding ours.
            messages.retain(|message| message.role != Role::System);
            messages.insert(
                0,
                Message {
                    role: Role::System,
                    content: system_prompt.to_string(),
                },
            );
        }
    }
    prompted
}
/// Counts the cases in `run` whose status is anything other than `Passed`.
fn count_failures(run: &agentcarousel_core::Run) -> usize {
    let mut not_passed = 0;
    for case in run.cases.iter() {
        if case.status != CaseStatus::Passed {
            not_passed += 1;
        }
    }
    not_passed
}
/// Borrows every case in `run` whose status is anything other than `Passed`,
/// in run order.
fn collect_failures(run: &agentcarousel_core::Run) -> Vec<&CaseResult> {
    let mut not_passed = Vec::new();
    for case in run.cases.iter() {
        if case.status != CaseStatus::Passed {
            not_passed.push(case);
        }
    }
    not_passed
}
/// Clones `fixtures`, keeping only the cases whose id appears in `keep`.
/// Fixtures left without any case are omitted from the result.
fn filter_fixtures_to_cases(fixtures: &[FixtureFile], keep: &[CaseId]) -> Vec<FixtureFile> {
    let wanted: std::collections::HashSet<&CaseId> = keep.iter().collect();
    let mut selected = Vec::new();
    for fixture in fixtures {
        let kept: Vec<_> = fixture
            .cases
            .iter()
            .filter(|case| wanted.contains(&case.id))
            .cloned()
            .collect();
        if kept.is_empty() {
            continue;
        }
        let mut trimmed = fixture.clone();
        trimmed.cases = kept;
        selected.push(trimmed);
    }
    selected
}
/// Returns the ids of up to `limit` passed cases in `run` that carry a score
/// for rubric `rubric_id`, in run order.
fn passing_cases_for_dimension(
    run: &agentcarousel_core::Run,
    rubric_id: &str,
    limit: usize,
) -> Vec<CaseId> {
    let mut ids = Vec::new();
    for case in run.cases.iter() {
        if ids.len() >= limit {
            break;
        }
        if case.status != CaseStatus::Passed {
            continue;
        }
        let scored_on_dimension = match case.eval_scores.as_ref() {
            Some(scores) => scores
                .rubric_scores
                .iter()
                .any(|rs| rs.rubric_id == rubric_id),
            None => false,
        };
        if scored_on_dimension {
            ids.push(case.case_id.clone());
        }
    }
    ids
}
/// Failing cases grouped by the rubric dimension they fail most heavily.
#[derive(serde::Serialize, Debug, Clone)]
pub struct FailureCluster {
/// Rubric id shared by the cluster; `"rules_failure"` when no rubric score applies.
pub rubric_id: String,
/// Human-readable description of the rubric dimension.
pub rubric_description: String,
/// Ids of all cases in the cluster.
pub case_ids: Vec<CaseId>,
/// Subset of `case_ids` used for analysis and targeted scoring.
pub representative: Vec<CaseId>,
}
/// Builds a rubric-id → description map from every rubric item declared in
/// `fixtures`. When an id appears more than once, the first description wins.
pub fn build_rubric_lookup(fixtures: &[FixtureFile]) -> HashMap<String, String> {
    let mut descriptions = HashMap::new();
    let rubric_items = fixtures
        .iter()
        .flat_map(|fixture| fixture.cases.iter())
        .filter_map(|case| case.expected.rubric.as_ref())
        .flatten();
    for item in rubric_items {
        descriptions
            .entry(item.id.clone())
            .or_insert_with(|| item.description.clone());
    }
    descriptions
}
/// Groups failing cases by their primary failing rubric dimension.
///
/// A case's primary dimension is the rubric score below 1.0 with the largest
/// `weight * (1 - score)`. Cases without any such score are grouped under
/// `"rules_failure"`.
///
/// Within a cluster, case ids are sorted and the first three become the
/// representatives. Clusters are returned largest first; clusters of equal
/// size are ordered by `rubric_id`, so the result (and in particular the
/// first, "top" cluster) does not depend on hash-map iteration order.
pub fn cluster_failures(
    failures: &[&CaseResult],
    rubric_lookup: &HashMap<String, String>,
) -> Vec<FailureCluster> {
    let mut groups: HashMap<String, Vec<CaseId>> = HashMap::new();
    for case in failures {
        let primary = match &case.eval_scores {
            None => "rules_failure".to_string(),
            Some(scores) if scores.rubric_scores.is_empty() => "rules_failure".to_string(),
            Some(scores) => scores
                .rubric_scores
                .iter()
                .filter(|rs| rs.score < 1.0)
                .max_by(|a, b| {
                    let da = a.weight * (1.0 - a.score);
                    let db = b.weight * (1.0 - b.score);
                    da.partial_cmp(&db).unwrap_or(std::cmp::Ordering::Equal)
                })
                .map(|rs| rs.rubric_id.clone())
                .unwrap_or_else(|| "rules_failure".to_string()),
        };
        groups
            .entry(primary)
            .or_default()
            .push(case.case_id.clone());
    }

    let mut clusters: Vec<FailureCluster> = groups
        .into_iter()
        .map(|(rubric_id, mut case_ids)| {
            case_ids.sort_by(|a, b| a.0.cmp(&b.0));
            let description = if rubric_id == "rules_failure" {
                "Rules-based failure (no rubric scores available)".to_string()
            } else {
                rubric_lookup
                    .get(&rubric_id)
                    .cloned()
                    .unwrap_or_else(|| rubric_id.clone())
            };
            let representative = case_ids.iter().take(3).cloned().collect();
            FailureCluster {
                rubric_id,
                rubric_description: description,
                case_ids,
                representative,
            }
        })
        .collect();

    // Largest cluster first; the id tie-break keeps equal-sized clusters in a
    // stable order instead of whatever order the hash map happened to yield.
    clusters.sort_by(|a, b| {
        b.case_ids
            .len()
            .cmp(&a.case_ids.len())
            .then_with(|| a.rubric_id.cmp(&b.rubric_id))
    });
    clusters
}
/// Builds the plain-text failure digest sent to the analysis LLM: the current
/// prompt, the failure count, and for up to ten cases their low-scoring
/// rubrics (score below 0.5), error text and output, each truncated.
fn format_failure_summary(failures: &[&CaseResult], current_prompt: &str) -> String {
    const MAX_LISTED: usize = 10;

    let mut report: Vec<String> = Vec::new();
    report.push(format!("Current system prompt:\n{}\n", current_prompt));
    report.push(format!("{} failing cases:\n", failures.len()));

    for (position, case) in failures.iter().enumerate().take(MAX_LISTED) {
        report.push(format!(" Case {}: {}", position + 1, case.case_id.0));

        if let Some(scores) = &case.eval_scores {
            for rs in scores.rubric_scores.iter().filter(|rs| rs.score < 0.5) {
                let why = rs.rationale.as_deref().unwrap_or("no rationale");
                report.push(format!(
                    " - {}: score={:.2} — {}",
                    rs.rubric_id, rs.score, why
                ));
            }
        }

        if let Some(err) = &case.error {
            let clipped: String = err.chars().take(200).collect();
            report.push(format!(" - error: {}", clipped));
        }

        if let Some(output) = &case.trace.final_output {
            let clipped: String = output.chars().take(300).collect();
            report.push(format!(" - output: {}", clipped));
        }
    }

    let unlisted = failures.len().saturating_sub(MAX_LISTED);
    if unlisted > 0 {
        report.push(format!(" … and {} more", unlisted));
    }
    report.join("\n")
}
/// Asks the judge model to explain each failure cluster.
///
/// For every representative case of a cluster the model is asked what the
/// system prompt is missing; the per-case answers are then condensed into one
/// cluster-level `synthesis`. LLM errors never abort the analysis: a failed
/// per-case call is recorded as `"(analysis failed: ...)"`, and a failed
/// synthesis call falls back to the first per-case explanation.
///
/// Returns one [`FailureAnalysis`] per input cluster, in the same order.
pub async fn analyze_clusters_llm(
    clusters: &[FailureCluster],
    case_map: &HashMap<CaseId, &CaseResult>,
    judge_model: &str,
    judge_endpoint: Option<&str>,
) -> Vec<FailureAnalysis> {
    let mut analyses = Vec::new();
    for cluster in clusters {
        let mut representative_cases: Vec<(CaseId, String)> = Vec::new();
        for case_id in &cluster.representative {
            let case = case_map.get(case_id).copied();
            let output = case
                .and_then(|c| c.trace.final_output.as_deref())
                .unwrap_or("(no output recorded)");
            let error = case.and_then(|c| c.error.as_deref()).unwrap_or("");
            // Truncate by characters, not bytes: slicing a `str` at an arbitrary
            // byte offset panics when it lands inside a multi-byte character.
            let agent_result = if !error.is_empty() {
                format!("Error: {}", error.chars().take(300).collect::<String>())
            } else {
                output.chars().take(400).collect::<String>()
            };
            let prompt = format!(
                "The agent failed this test case.\nThe case tests: {rubric_description}\nThe agent produced: {agent_result}\n\nIn exactly 2 sentences, explain what the system prompt is missing that would have caused the agent to succeed. Be specific and actionable.",
                rubric_description = cluster.rubric_description,
                agent_result = agent_result,
            );
            match call_llm(judge_model, &prompt, Some(256), judge_endpoint).await {
                Ok(r) => representative_cases.push((case_id.clone(), r.output)),
                Err(e) => {
                    representative_cases.push((case_id.clone(), format!("(analysis failed: {e})")))
                }
            }
        }

        let synthesis = if representative_cases.is_empty() {
            format!(
                "No representative cases found for cluster '{}'.",
                cluster.rubric_id
            )
        } else {
            let explanations = representative_cases
                .iter()
                .enumerate()
                .map(|(i, (id, exp))| format!("Case {} ({}): {}", i + 1, id.0, exp))
                .collect::<Vec<_>>()
                .join("\n");
            let synth_prompt = format!(
                "These are per-case explanations of why an AI agent failed the rubric dimension '{rubric_id}' ({rubric_description}):\n\n{explanations}\n\nSummarize in exactly ONE actionable sentence the single most important change to the system prompt that would fix these failures.",
                rubric_id = cluster.rubric_id,
                rubric_description = cluster.rubric_description,
                explanations = explanations,
            );
            match call_llm(judge_model, &synth_prompt, Some(200), judge_endpoint).await {
                Ok(r) => r.output,
                // Best effort: reuse the first per-case explanation.
                Err(_) => representative_cases
                    .first()
                    .map(|(_, s)| s.clone())
                    .unwrap_or_default(),
            }
        };

        analyses.push(FailureAnalysis {
            cluster_id: cluster.rubric_id.clone(),
            representative_cases,
            synthesis,
        });
    }
    analyses
}
/// Asks the judge model for a short root-cause analysis of `failure_summary`.
///
/// `_current_prompt` is accepted but not used here. Returns the model's
/// output text, or the error reported by the LLM call.
async fn analyze_failures_llm(
    failure_summary: &str,
    _current_prompt: &str,
    judge_model: &str,
    judge_endpoint: Option<&str>,
) -> Result<String, String> {
    let request = format!(
        r#"You are an expert prompt engineer. Analyze these AI agent test failures and identify the root cause.
{failure_summary}
Respond concisely with:
1. The primary failure pattern (one sentence)
2. The specific prompt weakness causing it (one sentence)
3. The single most impactful change to make (one sentence, concrete and actionable)
Keep your response under 200 words."#
    );
    let reply = call_llm(judge_model, &request, Some(512), judge_endpoint).await?;
    let analysis = reply.output;
    Ok(analysis)
}
/// Asks the generator model for prompt edits addressing the observed failures.
///
/// The failure description is built from `analyses` when there are any,
/// otherwise from `feedback`. The reply is expected to be JSON with an
/// `edits` array; a leading Markdown code fence is stripped, and a reply that
/// is not valid JSON is scanned for individually parseable edit objects.
/// Edits with an empty `replacement_text` are discarded.
///
/// Returns an error when the LLM call fails, when valid JSON lacks an `edits`
/// array, or when no usable edit remains.
async fn synthesize_edit_candidates_llm(
    current_prompt: &str,
    analyses: &[FailureAnalysis],
    feedback: &str,
    model: &str,
    generator_endpoint: Option<&str>,
) -> Result<Vec<PromptEditCandidate>, String> {
    let analysis_block = if !analyses.is_empty() {
        let bullets: Vec<String> = analyses
            .iter()
            .map(|a| format!("- [{}] {}", a.cluster_id, a.synthesis))
            .collect();
        bullets.join("\n")
    } else {
        feedback.to_string()
    };

    let prompt = format!(
        r#"You are an expert prompt engineer. Here is an AI agent system prompt and the failure patterns observed when evaluating it.
CURRENT SYSTEM PROMPT:
{current_prompt}
FAILURE ANALYSIS:
{analysis_block}
Generate exactly 3 specific, minimal edits to improve the prompt. Each edit must:
1. Quote the EXACT text to replace from the current prompt (or use empty string to append at end)
2. Provide the replacement text
3. Explain in one sentence which failure it addresses
4. Name the cluster it targets
Respond with valid JSON only — no markdown, no explanation:
{{"edits": [{{"id": "edit-1", "original_text": "<exact text from prompt or empty>", "replacement_text": "<new text>", "rationale": "<one sentence>", "targets_cluster": "<cluster id>"}}, {{"id": "edit-2", ...}}, {{"id": "edit-3", ...}}]}}"#
    );

    let reply = call_llm(model, &prompt, Some(8192), generator_endpoint).await?;
    let trimmed = reply.output.trim();

    // Models sometimes wrap the JSON in a Markdown fence despite the request:
    // drop the opening fence line and everything from the closing fence on.
    let json_text = if !trimmed.starts_with("```") {
        trimmed.to_string()
    } else {
        let inner: Vec<&str> = trimmed
            .lines()
            .skip(1)
            .take_while(|line| !line.starts_with("```"))
            .collect();
        inner.join("\n")
    };

    let edits = if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&json_text) {
        let items = parsed["edits"]
            .as_array()
            .ok_or("response missing 'edits' array")?;
        let mut usable = Vec::new();
        for item in items {
            // Entries that do not match the expected shape are silently dropped.
            if let Ok(edit) = serde_json::from_value::<PromptEditCandidate>(item.clone()) {
                if !edit.replacement_text.is_empty() {
                    usable.push(edit);
                }
            }
        }
        usable
    } else {
        extract_partial_edits(&json_text)
    };

    if edits.is_empty() {
        return Err(format!(
            "edits array was empty or unparseable — raw: {:.200}",
            json_text
        ));
    }
    Ok(edits)
}
/// Salvages edit objects from a reply that is not valid JSON as a whole
/// (for example, one that was cut off mid-array).
///
/// Scans `text` for objects starting with `{"id":`, finds each object's
/// closing brace and parses it on its own. Objects that fail to parse, or
/// whose `replacement_text` is empty, are skipped. Scanning stops at the
/// first object that is never closed.
fn extract_partial_edits(text: &str) -> Vec<PromptEditCandidate> {
    let mut edits = Vec::new();
    let mut search_from = 0;
    while let Some(rel) = text
        .get(search_from..)
        .and_then(|rest| rest.find("{\"id\":"))
    {
        let start = search_from + rel;
        let Some(end) = json_object_end(&text[start..]).map(|offset| start + offset) else {
            break;
        };
        if let Ok(edit) = serde_json::from_str::<PromptEditCandidate>(&text[start..=end]) {
            if !edit.replacement_text.is_empty() {
                edits.push(edit);
            }
        }
        search_from = end + 1;
    }
    edits
}

/// Returns the byte offset of the `}` that closes the JSON object starting at
/// the beginning of `object`, or `None` if the object is never closed.
///
/// Braces inside string literals are ignored, and backslash escapes are
/// honoured, so values containing `{`, `}` or `\"` do not upset the nesting
/// depth.
fn json_object_end(object: &str) -> Option<usize> {
    let mut depth: usize = 0;
    let mut in_string = false;
    let mut escaped = false;
    for (offset, ch) in object.char_indices() {
        if in_string {
            if escaped {
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == '"' {
                in_string = false;
            }
            continue;
        }
        match ch {
            '"' => in_string = true,
            '{' => depth += 1,
            '}' => {
                // A closing brace with nothing open means malformed input.
                depth = depth.checked_sub(1)?;
                if depth == 0 {
                    return Some(offset);
                }
            }
            _ => {}
        }
    }
    None
}