use std::path::Path;
use std::sync::Arc;
use async_trait::async_trait;
use nexo_driver_types::{AcceptanceCriterion, AcceptanceFailure, AcceptanceVerdict};
use serde::{Deserialize, Serialize};
use crate::acceptance::AcceptanceEvaluator;
use crate::error::DriverError;
/// Binary outcome reported by the judge for a single criterion.
///
/// Serde representation: internally tagged under the key `verdict`, with
/// variant names lowercased, i.e. `{"verdict":"pass"}` / `{"verdict":"fail"}`.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "verdict", rename_all = "lowercase")]
pub enum JudgeVerdict {
/// The judge considers the criterion satisfied.
Pass,
/// The judge considers the criterion not satisfied.
Fail,
}
/// Structured reply expected from the judge, e.g.
/// `{"verdict":"fail","reasons":["missing null check"]}`.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct JudgeResponse {
/// Pass/fail outcome. Flattened so the `verdict` tag sits at the top level
/// of the JSON object, next to `reasons`.
#[serde(flatten)]
pub verdict: JudgeVerdict,
/// Free-form explanations supplied by the judge. Defaults to an empty list
/// when the key is absent from the JSON.
#[serde(default)]
pub reasons: Vec<String>,
}
/// Errors a [`JudgeBackend`] can return from `ask`.
///
/// The `Display` text of each variant comes from its `#[error(...)]` attribute.
#[derive(Debug, thiserror::Error)]
pub enum JudgeError {
/// The underlying LLM call failed; the payload is interpolated into the
/// error message.
#[error("judge LLM call failed: {0}")]
LlmFailure(String),
/// The judge did not answer in time.
#[error("judge timed out")]
Timeout,
/// The judge answered, but the output was not usable; the payload is
/// interpolated into the error message.
#[error("judge produced malformed output: {0}")]
Malformed(String),
}
/// Abstraction over whatever produces the judge's answer.
///
/// Implementations must be `Send + Sync + 'static` so they can be shared
/// behind an `Arc<dyn JudgeBackend>`.
#[async_trait]
pub trait JudgeBackend: Send + Sync + 'static {
/// Asks the judge about one criterion.
///
/// Receives the criterion text together with the worker output and the
/// transcript summary, and returns the judge's raw textual reply. The
/// evaluator in this file feeds that reply to `parse_judge_response`, so it
/// is expected to contain a JSON-encoded `JudgeResponse`.
///
/// # Errors
///
/// Returns a [`JudgeError`] when no reply could be obtained.
async fn ask(
&self,
criterion_text: &str,
worker_output: &str,
transcript_summary: &str,
) -> Result<String, JudgeError>;
}
/// Acceptance evaluator that sends `LlmJudge` criteria to a [`JudgeBackend`]
/// and hands every other criterion to a fallback evaluator.
pub struct LlmJudgeEvaluator {
/// Backend queried once per `LlmJudge` criterion.
pub backend: Arc<dyn JudgeBackend>,
/// Evaluator that receives all criteria that are not `LlmJudge`.
pub fallback: Arc<dyn AcceptanceEvaluator>,
/// Worker output passed unchanged to every backend call.
pub worker_output: String,
/// Transcript summary passed unchanged to every backend call.
pub transcript_summary: String,
}
impl LlmJudgeEvaluator {
    /// Builds an evaluator from a judge backend, a fallback evaluator for
    /// non-judge criteria, and the worker output / transcript summary that
    /// accompany every judge request.
    pub fn new(
        backend: Arc<dyn JudgeBackend>,
        fallback: Arc<dyn AcceptanceEvaluator>,
        worker_output: impl Into<String>,
        transcript_summary: impl Into<String>,
    ) -> Self {
        let worker_output = worker_output.into();
        let transcript_summary = transcript_summary.into();
        Self {
            backend,
            fallback,
            worker_output,
            transcript_summary,
        }
    }

    /// Judges a single criterion.
    ///
    /// Returns `Ok(None)` when the judge answers `pass`, and
    /// `Ok(Some(failure))` in every other situation: an explicit `fail`, a
    /// backend error, or a reply that cannot be parsed. Anything other than
    /// an explicit pass is therefore reported as a failure.
    async fn evaluate_one(
        &self,
        idx: usize,
        criterion_text: &str,
    ) -> Result<Option<AcceptanceFailure>, DriverError> {
        let reply = self
            .backend
            .ask(criterion_text, &self.worker_output, &self.transcript_summary)
            .await;

        // Reduce each non-pass outcome to (message, evidence); the failure
        // record itself is assembled once, below.
        let (message, evidence) = match reply {
            // No reply at all, so there is no raw text to attach as evidence.
            Err(err) => (format!("judge unavailable: {err}"), None),
            Ok(raw) => match parse_judge_response(&raw) {
                Some(JudgeResponse {
                    verdict: JudgeVerdict::Pass,
                    ..
                }) => return Ok(None),
                Some(JudgeResponse {
                    verdict: JudgeVerdict::Fail,
                    reasons,
                }) => {
                    let summary = if reasons.is_empty() {
                        "judge returned `fail` without reasons".to_string()
                    } else {
                        reasons.join("; ")
                    };
                    (summary, Some(truncate(&raw, 512)))
                }
                None => (
                    "judge produced malformed JSON — treating as fail".to_string(),
                    Some(truncate(&raw, 512)),
                ),
            },
        };

        Ok(Some(AcceptanceFailure {
            criterion_index: idx,
            criterion_label: "llm_judge".to_string(),
            message,
            evidence,
        }))
    }
}
#[async_trait]
impl AcceptanceEvaluator for LlmJudgeEvaluator {
    /// Evaluates all `criteria`.
    ///
    /// `LlmJudge` criteria are judged one at a time, in order, through the
    /// backend. All remaining criteria are forwarded in a single call to the
    /// fallback evaluator, and the indices of its failures are translated
    /// back to positions in the original `criteria` slice. Judge failures
    /// are listed before fallback failures in the resulting verdict.
    ///
    /// # Errors
    ///
    /// Propagates any `DriverError` returned by the fallback evaluator.
    async fn evaluate(
        &self,
        criteria: &[AcceptanceCriterion],
        workspace: &Path,
    ) -> Result<AcceptanceVerdict, DriverError> {
        let clock = std::time::Instant::now();
        let mut failures: Vec<AcceptanceFailure> = Vec::new();

        // Criteria handed to the fallback, plus each one's position in the
        // caller's slice (parallel vectors, same length).
        let mut delegated: Vec<AcceptanceCriterion> = Vec::with_capacity(criteria.len());
        let mut original_positions: Vec<usize> = Vec::with_capacity(criteria.len());

        for (position, criterion) in criteria.iter().enumerate() {
            if let AcceptanceCriterion::LlmJudge { criterion_text } = criterion {
                // `Option` is iterable: pushes the failure if there is one.
                failures.extend(self.evaluate_one(position, criterion_text).await?);
            } else {
                delegated.push(criterion.clone());
                original_positions.push(position);
            }
        }

        if !delegated.is_empty() {
            let fallback_verdict = self.fallback.evaluate(&delegated, workspace).await?;
            failures.extend(fallback_verdict.failures.into_iter().map(|mut failure| {
                // The fallback indexes into `delegated`; map back to the
                // caller's index. An out-of-range index is left untouched.
                if let Some(&position) = original_positions.get(failure.criterion_index) {
                    failure.criterion_index = position;
                }
                failure
            }));
        }

        Ok(AcceptanceVerdict {
            met: failures.is_empty(),
            failures,
            evaluated_at: chrono::Utc::now(),
            elapsed_ms: clock.elapsed().as_millis() as u64,
        })
    }
}
/// Extracts a [`JudgeResponse`] from the judge's raw reply.
///
/// Two strategies are tried in order:
///
/// 1. Parse the whole (trimmed) reply as JSON.
/// 2. Otherwise, locate the first `{` and the `}` that closes it, and parse
///    just that slice. This recovers replies wrapped in prose or in a
///    Markdown code fence.
///
/// The brace matching is aware of JSON strings: braces that appear inside a
/// string literal (for example a reason such as `"missing } in output"`) and
/// escaped quotes (`\"`) do not affect nesting depth.
///
/// Returns `None` when the reply contains no `{`, when the first object is
/// never closed, or when the extracted slice does not deserialize into a
/// `JudgeResponse`.
pub fn parse_judge_response(raw: &str) -> Option<JudgeResponse> {
    // Fast path: the reply is exactly the JSON object.
    if let Ok(parsed) = serde_json::from_str::<JudgeResponse>(raw.trim()) {
        return Some(parsed);
    }

    let start = raw.find('{')?;
    let mut depth = 0_i32;
    let mut in_string = false;
    let mut escaped = false;

    for (offset, ch) in raw[start..].char_indices() {
        if in_string {
            // Inside a string only an unescaped quote is significant.
            if escaped {
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == '"' {
                in_string = false;
            }
            continue;
        }
        match ch {
            '"' => in_string = true,
            '{' => depth += 1,
            '}' => {
                depth -= 1;
                if depth == 0 {
                    // `}` is a single byte, so the slice ends one past it.
                    let end = start + offset + 1;
                    return serde_json::from_str(&raw[start..end]).ok();
                }
            }
            _ => {}
        }
    }

    // The first object was never closed.
    None
}
/// Limits `s` to at most `max_chars` characters (Unicode scalar values).
///
/// Strings that already fit are returned unchanged; longer ones are cut
/// after `max_chars` characters and an ellipsis (`…`) is appended.
fn truncate(s: &str, max_chars: usize) -> String {
    // The byte offset of the character at position `max_chars` exists only
    // when the string is longer than the limit, and it is always a valid
    // char boundary to slice on.
    match s.char_indices().nth(max_chars) {
        None => s.to_owned(),
        Some((cut, _)) => {
            let mut shortened = String::with_capacity(cut + '…'.len_utf8());
            shortened.push_str(&s[..cut]);
            shortened.push('…');
            shortened
        }
    }
}
// Unit tests for the judge-response parser and for `LlmJudgeEvaluator`,
// driven by a scripted in-memory backend.
#[cfg(test)]
mod tests {
use super::*;
use crate::acceptance::NoopAcceptanceEvaluator;
use std::sync::Mutex;
// Test double for `JudgeBackend`: hands out pre-scripted results in order,
// one per `ask` call, ignoring the arguments.
struct ScriptedBackend {
// Remaining scripted results; the front element is returned next.
responses: Mutex<Vec<Result<String, JudgeError>>>,
}
impl ScriptedBackend {
// Wraps the scripted results so they can be consumed through `&self`.
fn new(responses: Vec<Result<String, JudgeError>>) -> Self {
Self {
responses: Mutex::new(responses),
}
}
}
#[async_trait]
impl JudgeBackend for ScriptedBackend {
async fn ask(
&self,
_criterion_text: &str,
_worker_output: &str,
_transcript_summary: &str,
) -> Result<String, JudgeError> {
let mut g = self.responses.lock().unwrap();
// Pops the next scripted result; `Vec::remove(0)` panics if the backend
// is asked more often than results were scripted.
g.remove(0)
}
}
// Builds an evaluator backed by `ScriptedBackend` with a no-op fallback and
// fixed worker output / transcript summary.
fn evaluator(scripted: Vec<Result<String, JudgeError>>) -> LlmJudgeEvaluator {
LlmJudgeEvaluator::new(
Arc::new(ScriptedBackend::new(scripted)),
Arc::new(NoopAcceptanceEvaluator),
"worker did the thing",
"summary: 3 turns, 2 file edits",
)
}
// A plain `pass` object parses, with an empty reasons list.
#[test]
fn parse_pass_verdict() {
let raw = r#"{"verdict":"pass","reasons":[]}"#;
let r = parse_judge_response(raw).expect("pass parse");
assert_eq!(r.verdict, JudgeVerdict::Pass);
assert!(r.reasons.is_empty());
}
// A `fail` object parses and keeps all of its reasons.
#[test]
fn parse_fail_verdict_with_reasons() {
let raw = r#"{"verdict":"fail","reasons":["missing null check","no test added"]}"#;
let r = parse_judge_response(raw).expect("fail parse");
assert_eq!(r.verdict, JudgeVerdict::Fail);
assert_eq!(r.reasons.len(), 2);
}
// JSON wrapped in a Markdown code fence is recovered by the fallback path.
#[test]
fn parse_recovers_from_fenced_json() {
let raw = "```json\n{\"verdict\":\"pass\",\"reasons\":[]}\n```";
let r = parse_judge_response(raw).expect("fenced parse");
assert_eq!(r.verdict, JudgeVerdict::Pass);
}
// Prose, empty input, and an unterminated object all yield `None`.
#[test]
fn parse_returns_none_for_garbage() {
assert!(parse_judge_response("yes it passes").is_none());
assert!(parse_judge_response("").is_none());
assert!(parse_judge_response("{not_json").is_none());
}
// `reasons` is optional in the JSON and defaults to an empty list.
#[test]
fn parse_missing_reasons_defaults_to_empty() {
let raw = r#"{"verdict":"pass"}"#;
let r = parse_judge_response(raw).expect("parse");
assert!(r.reasons.is_empty());
}
// A `pass` reply produces a met verdict with no failures.
#[tokio::test]
async fn evaluator_pass_yields_met_verdict() {
let e = evaluator(vec![Ok(r#"{"verdict":"pass","reasons":[]}"#.to_string())]);
let crit = vec![AcceptanceCriterion::llm_judge("must add null check")];
let v = e
.evaluate(&crit, std::path::Path::new("."))
.await
.expect("evaluate");
assert!(v.met);
assert!(v.failures.is_empty());
}
// A `fail` reply produces one `llm_judge` failure whose message carries
// every reason.
#[tokio::test]
async fn evaluator_fail_yields_failure_with_reasons() {
let e = evaluator(vec![Ok(
r#"{"verdict":"fail","reasons":["null check absent","no regression test"]}"#
.to_string(),
)]);
let crit = vec![AcceptanceCriterion::llm_judge("must add null check")];
let v = e
.evaluate(&crit, std::path::Path::new("."))
.await
.expect("evaluate");
assert!(!v.met);
assert_eq!(v.failures.len(), 1);
assert_eq!(v.failures[0].criterion_label, "llm_judge");
assert!(v.failures[0].message.contains("null check absent"));
assert!(v.failures[0].message.contains("no regression test"));
}
// A reply that is not JSON is reported as a failure, never as a pass.
#[tokio::test]
async fn evaluator_malformed_response_treated_as_fail() {
let e = evaluator(vec![Ok("yeah looks good to me".to_string())]);
let crit = vec![AcceptanceCriterion::llm_judge("must add null check")];
let v = e
.evaluate(&crit, std::path::Path::new("."))
.await
.expect("evaluate");
assert!(!v.met);
assert_eq!(v.failures.len(), 1);
assert!(v.failures[0].message.contains("malformed"));
}
// A backend error becomes a failure whose message includes the error text.
#[tokio::test]
async fn evaluator_timeout_treated_as_fail() {
let e = evaluator(vec![Err(JudgeError::Timeout)]);
let crit = vec![AcceptanceCriterion::llm_judge("rule")];
let v = e
.evaluate(&crit, std::path::Path::new("."))
.await
.expect("evaluate");
assert!(!v.met);
assert!(v.failures[0].message.contains("timed out"));
}
// Only one result is scripted for two criteria: if the shell criterion were
// also sent to the backend, the second `ask` would panic. The final assert
// relies on the no-op fallback reporting no failure for the shell criterion.
#[tokio::test]
async fn evaluator_routes_only_llm_judge_to_backend() {
let e = evaluator(vec![Ok(r#"{"verdict":"pass","reasons":[]}"#.to_string())]);
let crit = vec![
AcceptanceCriterion::llm_judge("must add null check"),
AcceptanceCriterion::shell("cargo test"),
];
let v = e
.evaluate(&crit, std::path::Path::new("."))
.await
.expect("evaluate");
assert!(v.met);
}
// A `fail` reply without reasons still yields a descriptive default message.
#[tokio::test]
async fn evaluator_fail_without_reasons_surfaces_default_message() {
let e = evaluator(vec![Ok(r#"{"verdict":"fail"}"#.to_string())]);
let crit = vec![AcceptanceCriterion::llm_judge("rule")];
let v = e
.evaluate(&crit, std::path::Path::new("."))
.await
.expect("evaluate");
assert!(!v.met);
assert!(v.failures[0].message.contains("without reasons"));
}
}