nous_judge/
task_completion.rs1use std::sync::Arc;
7
8use nous_core::{EvalContext, EvalLayer, EvalScore, EvalTiming, NousEvaluator, NousResult};
9
10use crate::judge_provider::{JudgeProvider, parse_judge_scores};
11
/// System prompt sent to the judge: it asks for a 0.0–1.0 completion score
/// returned as a bare JSON object with a single `completion` key.
const SYSTEM_PROMPT: &str = concat!(
    "You are an expert evaluator assessing whether an AI agent successfully completed its task. ",
    "Given the objective and the agent's final answer, score task completion from 0.0 (not completed at all) to 1.0 (perfectly completed). ",
    "Respond with ONLY a JSON object, no other text:\n",
    "{\"completion\": 0.0}",
);
18
19pub struct TaskCompletion {
21 provider: Arc<dyn JudgeProvider>,
22}
23
24impl TaskCompletion {
25 pub fn new(provider: Arc<dyn JudgeProvider>) -> Self {
27 Self { provider }
28 }
29}
30
31impl NousEvaluator for TaskCompletion {
32 fn name(&self) -> &str {
33 "task_completion"
34 }
35
36 fn layer(&self) -> EvalLayer {
37 EvalLayer::Reasoning
38 }
39
40 fn timing(&self) -> EvalTiming {
41 EvalTiming::Async
42 }
43
44 fn evaluate(&self, ctx: &EvalContext) -> NousResult<Vec<EvalScore>> {
45 let objective = match ctx.metadata.get("objective") {
47 Some(obj) if !obj.is_empty() => obj,
48 _ => return Ok(vec![]),
49 };
50 let final_answer = match ctx.metadata.get("final_answer") {
51 Some(ans) if !ans.is_empty() => ans,
52 _ => return Ok(vec![]),
53 };
54
55 let prompt = format!(
56 "Objective:\n{}\n\nAgent's final answer:\n{}",
57 objective, final_answer
58 );
59
60 let response = self.provider.judge(SYSTEM_PROMPT, &prompt)?;
61
62 let scores = if let Some(parsed) = parse_judge_scores(&response) {
63 if let Some(value) = parsed.get("completion").and_then(serde_json::Value::as_f64) {
64 let clamped = value.clamp(0.0, 1.0);
65 let score = EvalScore::new(
66 "task_completion",
67 clamped,
68 EvalLayer::Reasoning,
69 EvalTiming::Async,
70 &ctx.session_id,
71 )?;
72 vec![score]
73 } else {
74 extract_first_numeric_value(&parsed, &ctx.session_id)?
76 }
77 } else {
78 extract_fallback_score(&response, &ctx.session_id)?
80 };
81
82 Ok(scores)
83 }
84}
85
86fn extract_first_numeric_value(
88 value: &serde_json::Value,
89 session_id: &str,
90) -> NousResult<Vec<EvalScore>> {
91 if let Some(obj) = value.as_object() {
92 for (_key, v) in obj {
93 if let Some(num) = v.as_f64() {
94 let clamped = num.clamp(0.0, 1.0);
95 let score = EvalScore::new(
96 "task_completion",
97 clamped,
98 EvalLayer::Reasoning,
99 EvalTiming::Async,
100 session_id,
101 )?;
102 return Ok(vec![score]);
103 }
104 }
105 }
106 Ok(vec![])
107}
108
109fn extract_fallback_score(response: &str, session_id: &str) -> NousResult<Vec<EvalScore>> {
111 for word in response.split_whitespace() {
112 let cleaned = word.trim_matches(|c: char| !c.is_ascii_digit() && c != '.');
113 if let Ok(value) = cleaned.parse::<f64>() {
114 if (0.0..=1.0).contains(&value) {
115 let score = EvalScore::new(
116 "task_completion",
117 value,
118 EvalLayer::Reasoning,
119 EvalTiming::Async,
120 session_id,
121 )?;
122 return Ok(vec![score]);
123 }
124 }
125 }
126 Ok(vec![])
127}
128
#[cfg(test)]
mod tests {
    use super::*;
    use crate::judge_provider::MockJudgeProvider;

    /// Builds an evaluator whose mock judge always answers with `reply`.
    fn evaluator_replying(reply: &str) -> TaskCompletion {
        let provider = Arc::new(MockJudgeProvider {
            response: reply.into(),
        });
        TaskCompletion::new(provider)
    }

    /// Builds a context for `test-session` carrying the given metadata entries.
    fn ctx_with_metadata(entries: &[(&'static str, &str)]) -> EvalContext {
        let mut ctx = EvalContext::new("test-session");
        for &(key, value) in entries {
            ctx.metadata.insert(key.into(), value.into());
        }
        ctx
    }

    /// Builds a context carrying both an objective and a final answer.
    fn make_ctx_with_task(objective: &str, final_answer: &str) -> EvalContext {
        ctx_with_metadata(&[("objective", objective), ("final_answer", final_answer)])
    }

    /// Asserts that exactly one score was produced and that it equals `expected`.
    fn assert_single_value(scores: &[EvalScore], expected: f64) {
        assert_eq!(scores.len(), 1);
        assert!((scores[0].value - expected).abs() < f64::EPSILON);
    }

    #[test]
    fn valid_json_response_produces_score() {
        let eval = evaluator_replying(r#"{"completion": 0.85}"#);
        let ctx = make_ctx_with_task("Write a hello world program", "print('Hello, World!')");
        let scores = eval.evaluate(&ctx).unwrap();

        assert_single_value(&scores, 0.85);
        let score = &scores[0];
        assert_eq!(score.evaluator, "task_completion");
        assert_eq!(score.layer, EvalLayer::Reasoning);
        assert_eq!(score.timing, EvalTiming::Async);
    }

    #[test]
    fn malformed_response_returns_empty() {
        let eval = evaluator_replying("I'm not sure how to evaluate this.");
        let ctx = make_ctx_with_task("Do something", "I did it");

        assert!(eval.evaluate(&ctx).unwrap().is_empty());
    }

    #[test]
    fn missing_objective_returns_empty() {
        let eval = evaluator_replying(r#"{"completion": 0.9}"#);
        let ctx = ctx_with_metadata(&[("final_answer", "some answer")]);

        assert!(eval.evaluate(&ctx).unwrap().is_empty());
    }

    #[test]
    fn missing_final_answer_returns_empty() {
        let eval = evaluator_replying(r#"{"completion": 0.9}"#);
        let ctx = ctx_with_metadata(&[("objective", "some objective")]);

        assert!(eval.evaluate(&ctx).unwrap().is_empty());
    }

    #[test]
    fn empty_objective_returns_empty() {
        let eval = evaluator_replying(r#"{"completion": 0.9}"#);
        let ctx = ctx_with_metadata(&[("objective", ""), ("final_answer", "some answer")]);

        assert!(eval.evaluate(&ctx).unwrap().is_empty());
    }

    #[test]
    fn json_in_markdown_extracted_correctly() {
        let eval = evaluator_replying("My assessment:\n```\n{\"completion\": 0.92}\n```");
        let ctx = make_ctx_with_task("Build a REST API", "Here is the API implementation...");

        assert_single_value(&eval.evaluate(&ctx).unwrap(), 0.92);
    }

    #[test]
    fn fallback_numeric_extraction() {
        let eval = evaluator_replying("Task completion: 0.8 — mostly done.");
        let ctx = make_ctx_with_task("Write tests", "Added 5 tests");

        assert_single_value(&eval.evaluate(&ctx).unwrap(), 0.8);
    }

    #[test]
    fn score_clamped_to_valid_range() {
        let eval = evaluator_replying(r#"{"completion": 1.5}"#);
        let ctx = make_ctx_with_task("Objective", "Answer");

        assert_single_value(&eval.evaluate(&ctx).unwrap(), 1.0);
    }

    #[test]
    fn json_with_alternative_key_name() {
        let eval = evaluator_replying(r#"{"score": 0.6}"#);
        let ctx = make_ctx_with_task("Objective", "Answer");

        assert_single_value(&eval.evaluate(&ctx).unwrap(), 0.6);
    }

    #[test]
    fn evaluator_metadata() {
        let eval = evaluator_replying("");

        assert_eq!(eval.name(), "task_completion");
        assert_eq!(eval.layer(), EvalLayer::Reasoning);
        assert_eq!(eval.timing(), EvalTiming::Async);
    }
}