// NOTE(review): within this file `Escalate` is only constructed by the test-only
// `parse_evaluation_response`, which is presumably why dead-code warnings are
// silenced outside test builds — confirm against other users of this type.
/// Verdict produced when a task output is evaluated.
///
/// Every variant carries a `score`. NOTE(review): the scale appears to be 0–100
/// (the heuristic evaluator in this file caps its result with `.min(100)`) —
/// confirm before relying on it.
#[cfg_attr(not(test), allow(dead_code))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub(super) enum OutputVerdict {
/// No check objected to the output.
Pass { score: u8 },
/// A check failed; `feedback` carries the human-readable reason.
Retry { feedback: String, score: u8 },
/// Produced when a parsed evaluation response has verdict `"escalate"`; `reason`
/// holds that response's feedback text. NOTE(review): how callers act on an
/// escalation is not visible in this file.
Escalate { reason: String, score: u8 },
}
/// Scans `output_trimmed` for tell-tale placeholder text.
///
/// Matching is ASCII-case-insensitive. Strong markers are searched as substrings
/// in declaration order and the first one present is returned. Failing that,
/// `"todo"` is returned when any line (after trimming) starts with `todo:` or
/// ends with `todo` / `todo:`. Returns `None` when nothing matches.
fn contains_placeholder_content(output_trimmed: &str) -> Option<&'static str> {
    // Substrings that mark unfinished text wherever they appear.
    const STRONG_MARKERS: [&str; 5] = [
        "[insert",
        "[your ",
        "lorem ipsum",
        "placeholder",
        "example here",
    ];

    let haystack = output_trimmed.to_ascii_lowercase();

    STRONG_MARKERS
        .iter()
        .copied()
        .find(|marker| haystack.contains(*marker))
        .or_else(|| {
            // "todo" only counts at the edges of a line, so prose that mentions
            // TODO mid-sentence is not flagged. A bare "todo" line is covered by
            // `ends_with("todo")` and a bare "todo:" line by `starts_with("todo:")`.
            let dangling_todo = haystack.lines().map(str::trim).any(|line| {
                line.starts_with("todo:") || line.ends_with("todo") || line.ends_with("todo:")
            });
            if dangling_todo {
                Some("todo")
            } else {
                None
            }
        })
}
/// Scores `output` with cheap textual heuristics.
///
/// The checks run in this order and the first one that fails produces an
/// `OutputVerdict::Retry`:
///
/// 1. trimmed output shorter than 20 bytes — score 10;
/// 2. placeholder text reported by `contains_placeholder_content` — score 20;
/// 3. first line contains a stock preamble phrase and the trimmed output is
///    shorter than 200 bytes — score 30;
/// 4. `task_description` has at least 15 words while the output has fewer
///    than 30 — score 35.
///
/// Otherwise the result is `OutputVerdict::Pass`, scored 50/60/75/85 by
/// word-count tier, plus 10 when the output shows any structure (a newline, a
/// code fence, `"- "` or `"1."`).
pub(super) fn evaluate_output_heuristic(task_description: &str, output: &str) -> OutputVerdict {
    // Stock openers that promise an answer without delivering one.
    const HOLLOW_LEADS: [&str; 5] = [
        "i'll help you with",
        "i can help with",
        "here's what i found",
        "let me explain",
        "great question",
    ];
    // Substrings whose presence counts as structure (a newline is checked separately).
    const STRUCTURE_MARKERS: [&str; 3] = ["```", "- ", "1."];

    let body = output.trim();

    if body.len() < 20 {
        return OutputVerdict::Retry {
            feedback: String::from("Output is too short to be substantive"),
            score: 10,
        };
    }

    if let Some(pattern) = contains_placeholder_content(body) {
        return OutputVerdict::Retry {
            feedback: format!("Output contains placeholder pattern: '{pattern}'"),
            score: 20,
        };
    }

    // Only the opening line is inspected for preamble phrases.
    let opening = body.lines().next().unwrap_or("").to_lowercase();
    let opens_hollow = HOLLOW_LEADS.iter().any(|lead| opening.contains(*lead));
    if opens_hollow && body.len() < 200 {
        return OutputVerdict::Retry {
            feedback: String::from("Output appears to be a preamble without substantive content"),
            score: 30,
        };
    }

    let output_words = body.split_whitespace().count();
    let task_words = task_description.split_whitespace().count();
    if task_words >= 15 && output_words < 30 {
        return OutputVerdict::Retry {
            feedback: String::from(
                "Output is disproportionately short relative to the task complexity",
            ),
            score: 35,
        };
    }

    let length_score: u8 = match output_words {
        n if n > 200 => 85,
        n if n > 80 => 75,
        n if n > 40 => 60,
        _ => 50,
    };
    let structured =
        body.contains('\n') || STRUCTURE_MARKERS.iter().any(|marker| body.contains(*marker));
    let structure_bonus: u8 = if structured { 10 } else { 0 };

    OutputVerdict::Pass {
        score: (length_score + structure_bonus).min(100),
    }
}
/// Parses a JSON evaluation response into an [`OutputVerdict`].
///
/// Expected shape: `{"verdict": "pass" | "retry" | "escalate", "score": <uint>,
/// "feedback": <string>}`. The parser is lenient:
///
/// * input that is not valid JSON yields `Pass { score: 50 }`;
/// * a missing or non-string `verdict`, or any unrecognised value, is treated as a pass;
/// * a missing `score`, or one that is not an unsigned integer, defaults to 50;
/// * a `score` above 100 is clamped to 100 rather than wrapped by a narrowing cast;
/// * a missing or non-string `feedback` becomes the empty string.
#[cfg(test)]
pub(super) fn parse_evaluation_response(response: &str) -> OutputVerdict {
    // Same ceiling `evaluate_output_heuristic` applies to its own scores.
    const MAX_SCORE: u64 = 100;
    const DEFAULT_SCORE: u8 = 50;

    let parsed = match serde_json::from_str::<serde_json::Value>(response) {
        Ok(value) => value,
        Err(_) => {
            return OutputVerdict::Pass {
                score: DEFAULT_SCORE,
            }
        }
    };

    let verdict = parsed
        .get("verdict")
        .and_then(|v| v.as_str())
        .unwrap_or("pass");
    let score = parsed
        .get("score")
        .and_then(|v| v.as_u64())
        // Clamping first makes the cast lossless: a bare `as u8` would turn
        // 256 into 0 and 300 into 44.
        .map_or(DEFAULT_SCORE, |raw| raw.min(MAX_SCORE) as u8);
    let feedback = parsed
        .get("feedback")
        .and_then(|v| v.as_str())
        .unwrap_or("")
        .to_string();

    match verdict {
        "retry" => OutputVerdict::Retry { feedback, score },
        "escalate" => OutputVerdict::Escalate {
            reason: feedback,
            score,
        },
        _ => OutputVerdict::Pass { score },
    }
}
/// Unit tests for the heuristic evaluator and the JSON verdict parser.
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the score of a `Retry` verdict and fails the test for any other
    /// variant. Each heuristic uses a distinct retry score, so asserting on it
    /// pins which check actually fired.
    fn retry_score(verdict: OutputVerdict) -> u8 {
        match verdict {
            OutputVerdict::Retry { score, .. } => score,
            other => panic!("expected Retry, got {other:?}"),
        }
    }

    #[test]
    fn empty_output_triggers_retry() {
        // Truly empty, whitespace-only and trivially short outputs all hit the length guard.
        for output in ["", "   \n\t  ", "OK"] {
            let verdict = evaluate_output_heuristic("Analyze the codebase", output);
            assert_eq!(retry_score(verdict), 10, "output: {output:?}");
        }
    }

    #[test]
    fn placeholder_triggers_retry() {
        let verdict = evaluate_output_heuristic(
            "Write a report",
            "Here is the report:\n\n[insert your analysis here]\n\nConclusion: TODO",
        );
        match verdict {
            OutputVerdict::Retry { feedback, score } => {
                assert_eq!(score, 20);
                assert!(feedback.contains("[insert"), "feedback was: {feedback}");
            }
            other => panic!("expected Retry, got {other:?}"),
        }
    }

    #[test]
    fn line_ending_in_todo_triggers_placeholder_retry() {
        let verdict = evaluate_output_heuristic(
            "Write a report",
            "Summary of the findings follows below.\nConclusion: TODO",
        );
        assert_eq!(retry_score(verdict), 20);
    }

    #[test]
    fn discussing_real_todo_markers_does_not_trigger_placeholder_retry() {
        let verdict = evaluate_output_heuristic(
            "Review the codebase and explain unfinished work markers",
            "The audit found three real TODO markers in the auth module. Two are in test code and one is in production request validation.",
        );
        assert!(
            matches!(verdict, OutputVerdict::Pass { .. }),
            "legitimate discussion of TODO markers should not be treated as a placeholder"
        );
    }

    #[test]
    fn hollow_preamble_without_substance_triggers_retry() {
        let verdict = evaluate_output_heuristic(
            "Analyze the authentication system for vulnerabilities",
            "I'll help you with that! Let me look into it.",
        );
        assert_eq!(retry_score(verdict), 30);
    }

    #[test]
    fn substantive_output_passes() {
        let output = "The authentication system uses JWT tokens with RS256 signing. \
                      Key findings:\n\
                      1. Token expiration is set to 24 hours, which is longer than recommended.\n\
                      2. The refresh token rotation is properly implemented.\n\
                      3. Session storage uses httpOnly cookies, preventing XSS token theft.\n\
                      4. Rate limiting on the login endpoint prevents brute force attacks.\n\
                      5. However, the password reset flow lacks email verification.\n\n\
                      Recommendations:\n\
                      - Reduce JWT expiration to 1 hour\n\
                      - Add email verification to password reset\n\
                      - Implement CSRF tokens for state-changing operations";
        let verdict = evaluate_output_heuristic("Analyze the authentication system", output);
        match verdict {
            OutputVerdict::Pass { score } => assert!(score >= 70),
            other => panic!("expected Pass, got {other:?}"),
        }
    }

    #[test]
    fn short_output_for_complex_task_triggers_retry() {
        let verdict = evaluate_output_heuristic(
            "Perform a comprehensive analysis of the microservices architecture including \
             service boundaries, communication patterns, failure modes, and scaling characteristics",
            "The architecture looks good overall.",
        );
        assert_eq!(retry_score(verdict), 35);
    }

    #[test]
    fn parse_pass_verdict() {
        let response = r#"{"verdict": "pass", "score": 85, "feedback": ""}"#;
        let verdict = parse_evaluation_response(response);
        assert!(matches!(verdict, OutputVerdict::Pass { score: 85 }));
    }

    #[test]
    fn parse_retry_verdict() {
        let response = r#"{"verdict": "retry", "score": 30, "feedback": "Output is too generic"}"#;
        let verdict = parse_evaluation_response(response);
        match verdict {
            OutputVerdict::Retry { feedback, score } => {
                assert_eq!(score, 30);
                assert!(feedback.contains("generic"));
            }
            _ => panic!("expected Retry verdict"),
        }
    }

    #[test]
    fn parse_escalate_verdict() {
        let response =
            r#"{"verdict": "escalate", "score": 10, "feedback": "Completely off-topic"}"#;
        let verdict = parse_evaluation_response(response);
        match verdict {
            OutputVerdict::Escalate { reason, score } => {
                assert_eq!(score, 10);
                assert!(reason.contains("off-topic"));
            }
            _ => panic!("expected Escalate verdict"),
        }
    }

    #[test]
    fn parse_malformed_defaults_to_pass() {
        let response = "this is not json";
        let verdict = parse_evaluation_response(response);
        assert!(matches!(verdict, OutputVerdict::Pass { score: 50 }));
    }

    #[test]
    fn parse_missing_verdict_defaults_to_pass() {
        let response = r#"{"score": 70}"#;
        let verdict = parse_evaluation_response(response);
        assert!(matches!(verdict, OutputVerdict::Pass { .. }));
    }
}