1use crate::error::{EvalError, Result};
11use adk_core::{Content, GenerateContentConfig, Llm, LlmRequest};
12use futures::StreamExt;
13use serde::{Deserialize, Serialize};
14use std::sync::Arc;
15
16#[derive(Debug, Clone, Serialize, Deserialize)]
18pub struct StructuredVerdict {
19 pub score: f64,
21 pub reasoning: String,
23 pub verdict: Verdict,
25}
26
27#[derive(Debug, Clone, Serialize, Deserialize)]
29#[serde(rename_all = "snake_case")]
30pub enum Verdict {
31 Pass,
33 Fail,
35 Partial,
37}
38
39#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct JudgeRubric {
42 pub name: String,
44 pub description: String,
46 pub scale: Vec<ScalePoint>,
48}
49
50#[derive(Debug, Clone, Serialize, Deserialize)]
52pub struct ScalePoint {
53 pub score: f64,
55 pub label: String,
57 pub description: String,
59}
60
61#[derive(Debug, Clone)]
63pub struct StructuredJudgeConfig {
64 pub prefer_function_calling: bool,
66 pub temperature: f64,
68 pub rubrics: Vec<JudgeRubric>,
70}
71
72impl Default for StructuredJudgeConfig {
73 fn default() -> Self {
74 Self { prefer_function_calling: true, temperature: 0.0, rubrics: Vec::new() }
75 }
76}
77
78pub struct StructuredJudge {
83 model: Arc<dyn Llm>,
84 config: StructuredJudgeConfig,
85}
86
87impl StructuredJudge {
88 pub fn new(model: Arc<dyn Llm>) -> Self {
90 Self { model, config: StructuredJudgeConfig::default() }
91 }
92
93 pub fn with_config(model: Arc<dyn Llm>, config: StructuredJudgeConfig) -> Self {
95 Self { model, config }
96 }
97
98 pub async fn judge(
103 &self,
104 expected: &str,
105 actual: &str,
106 criterion: &str,
107 ) -> Result<StructuredVerdict> {
108 let system_prompt = format!(
109 r#"You are an evaluation judge. Evaluate the actual response against the expected response for the given criterion.
110
111Criterion: {}
112
113You MUST respond with a JSON object containing exactly these fields:
114- "score": a number between 0.0 and 1.0
115- "reasoning": a string explaining your evaluation
116- "verdict": one of "pass", "fail", or "partial"
117
118Example response:
119{{"score": 0.85, "reasoning": "The response captures the key points but misses some details.", "verdict": "partial"}}"#,
120 criterion
121 );
122
123 let user_prompt =
124 format!("Expected response:\n\"{}\"\n\nActual response:\n\"{}\"", expected, actual);
125
126 self.execute_judgment(&system_prompt, &user_prompt).await
127 }
128
129 pub async fn judge_with_rubric(
134 &self,
135 response: &str,
136 context: &str,
137 rubric: &JudgeRubric,
138 ) -> Result<StructuredVerdict> {
139 let mut scale_description = String::new();
140 for point in &rubric.scale {
141 scale_description.push_str(&format!(
142 "- {:.1} ({}): {}\n",
143 point.score, point.label, point.description
144 ));
145 }
146
147 let system_prompt = format!(
148 r#"You are an evaluation judge. Evaluate the response using the following rubric.
149
150Rubric: {}
151Description: {}
152
153Scoring Scale:
154{}
155You MUST respond with a JSON object containing exactly these fields:
156- "score": a number between 0.0 and 1.0 matching one of the scale points
157- "reasoning": a string explaining your evaluation
158- "verdict": one of "pass", "fail", or "partial"
159
160Example response:
161{{"score": 0.75, "reasoning": "The response demonstrates good understanding but lacks depth.", "verdict": "partial"}}"#,
162 rubric.name, rubric.description, scale_description
163 );
164
165 let user_prompt =
166 format!("Context:\n\"{}\"\n\nResponse to evaluate:\n\"{}\"", context, response);
167
168 self.execute_judgment(&system_prompt, &user_prompt).await
169 }
170
171 async fn execute_judgment(
173 &self,
174 system_prompt: &str,
175 user_prompt: &str,
176 ) -> Result<StructuredVerdict> {
177 if self.config.prefer_function_calling {
179 match self.call_with_schema(system_prompt, user_prompt).await {
180 Ok(verdict) => return Ok(verdict),
181 Err(_) => {
182 }
184 }
185 }
186
187 self.call_with_json_fallback(system_prompt, user_prompt).await
189 }
190
191 async fn call_with_schema(
193 &self,
194 system_prompt: &str,
195 user_prompt: &str,
196 ) -> Result<StructuredVerdict> {
197 let schema = serde_json::json!({
198 "type": "object",
199 "properties": {
200 "score": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
201 "reasoning": { "type": "string" },
202 "verdict": { "type": "string", "enum": ["pass", "fail", "partial"] }
203 },
204 "required": ["score", "reasoning", "verdict"]
205 });
206
207 let full_prompt = format!("{system_prompt}\n\n{user_prompt}");
208
209 let config = GenerateContentConfig {
210 temperature: Some(self.config.temperature as f32),
211 response_schema: Some(schema),
212 ..Default::default()
213 };
214
215 let request =
216 LlmRequest::new(self.model.name(), vec![Content::new("user").with_text(&full_prompt)])
217 .with_config(config);
218
219 let response_text = self.collect_response(request).await?;
220 self.parse_verdict_from_text(&response_text)
221 }
222
223 async fn call_with_json_fallback(
225 &self,
226 system_prompt: &str,
227 user_prompt: &str,
228 ) -> Result<StructuredVerdict> {
229 let full_prompt = format!("{system_prompt}\n\n{user_prompt}");
230
231 let config = GenerateContentConfig {
232 temperature: Some(self.config.temperature as f32),
233 ..Default::default()
234 };
235
236 let request =
237 LlmRequest::new(self.model.name(), vec![Content::new("user").with_text(&full_prompt)])
238 .with_config(config);
239
240 let response_text = self.collect_response(request).await?;
241 self.parse_verdict_from_text(&response_text)
242 }
243
244 async fn collect_response(&self, request: LlmRequest) -> Result<String> {
246 let mut stream = self
247 .model
248 .generate_content(request, false)
249 .await
250 .map_err(|e| EvalError::JudgeError(format!("LLM judge call failed: {e}")))?;
251
252 let mut response_text = String::new();
253 while let Some(result) = stream.next().await {
254 let response =
255 result.map_err(|e| EvalError::JudgeError(format!("LLM response error: {e}")))?;
256 if let Some(content) = &response.content {
257 for part in &content.parts {
258 if let Some(text) = part.text() {
259 response_text.push_str(text);
260 }
261 }
262 }
263 }
264
265 if response_text.is_empty() {
266 return Err(EvalError::JudgeError("Empty response from judge".to_string()));
267 }
268
269 Ok(response_text)
270 }
271
272 fn parse_verdict_from_text(&self, text: &str) -> Result<StructuredVerdict> {
277 match extract_json_from_text(text) {
278 Some(json) => match serde_json::from_value::<StructuredVerdict>(json) {
279 Ok(mut verdict) => {
280 verdict.score = verdict.score.clamp(0.0, 1.0);
282 Ok(verdict)
283 }
284 Err(e) => Ok(StructuredVerdict {
285 score: 0.0,
286 reasoning: format!("Parse error: failed to deserialize verdict: {e}"),
287 verdict: Verdict::Fail,
288 }),
289 },
290 None => Ok(StructuredVerdict {
291 score: 0.0,
292 reasoning: format!(
293 "Parse error: could not extract JSON from response: {}",
294 truncate_for_error(text)
295 ),
296 verdict: Verdict::Fail,
297 }),
298 }
299 }
300}
301
302pub fn extract_json_from_text(text: &str) -> Option<serde_json::Value> {
309 let trimmed = text.trim();
310
311 if trimmed.starts_with('{')
313 && let Ok(value) = serde_json::from_str::<serde_json::Value>(trimmed)
314 && value.is_object()
315 {
316 return Some(value);
317 }
318
319 if let Some(json_str) = extract_from_code_fence(trimmed)
321 && let Ok(value) = serde_json::from_str::<serde_json::Value>(json_str)
322 && value.is_object()
323 {
324 return Some(value);
325 }
326
327 if let Some(start) = trimmed.find('{') {
329 let substring = &trimmed[start..];
331 if let Some(value) = try_parse_json_object(substring) {
332 return Some(value);
333 }
334 }
335
336 None
337}
338
/// Returns the trimmed contents of the first Markdown code fence in `text`.
///
/// The rest of the opening-fence line (for example a `json` language tag) is
/// skipped; the content runs from the following line up to the next ```` ``` ````.
/// Returns `None` when there is no opening fence, no newline after it, no
/// closing fence, or the fenced content is blank.
fn extract_from_code_fence(text: &str) -> Option<&str> {
    let (_, after_opening) = text.split_once("```")?;
    // Drop whatever shares the line with the opening fence.
    let (_, body) = after_opening.split_once('\n')?;
    let (fenced, _) = body.split_once("```")?;

    let inner = fenced.trim();
    (!inner.is_empty()).then_some(inner)
}
355
356fn try_parse_json_object(text: &str) -> Option<serde_json::Value> {
360 if !text.starts_with('{') {
361 return None;
362 }
363
364 let mut depth = 0i32;
365 let mut in_string = false;
366 let mut escape_next = false;
367
368 for (i, ch) in text.char_indices() {
369 if escape_next {
370 escape_next = false;
371 continue;
372 }
373
374 if ch == '\\' && in_string {
375 escape_next = true;
376 continue;
377 }
378
379 if ch == '"' {
380 in_string = !in_string;
381 continue;
382 }
383
384 if in_string {
385 continue;
386 }
387
388 match ch {
389 '{' => depth += 1,
390 '}' => {
391 depth -= 1;
392 if depth == 0 {
393 let candidate = &text[..=i];
394 if let Ok(value) = serde_json::from_str::<serde_json::Value>(candidate)
395 && value.is_object()
396 {
397 return Some(value);
398 }
399 return None;
402 }
403 }
404 _ => {}
405 }
406 }
407
408 None
409}
410
/// Shortens `text` for inclusion in an error message.
///
/// Text of at most 200 bytes is returned unchanged. Longer text is cut to at
/// most 200 bytes and suffixed with `...`. The cut is moved back to the
/// nearest UTF-8 character boundary so that multi-byte characters straddling
/// the limit cannot cause a slicing panic.
fn truncate_for_error(text: &str) -> String {
    const MAX_BYTES: usize = 200;

    if text.len() <= MAX_BYTES {
        return text.to_string();
    }

    // Index 0 is always a boundary, so this loop terminates.
    let mut end = MAX_BYTES;
    while !text.is_char_boundary(end) {
        end -= 1;
    }

    format!("{}...", &text[..end])
}
415
#[cfg(test)]
mod tests {
    use super::*;

    /// Judge over a mock model; these tests only exercise the parsing path.
    fn mock_judge() -> StructuredJudge {
        StructuredJudge::new(Arc::new(adk_model::MockLlm::new("test")))
    }

    // --- extract_json_from_text: whole-text strategy ---

    #[test]
    fn test_extract_raw_json() {
        let raw = r#"{"score": 0.8, "reasoning": "Good answer", "verdict": "pass"}"#;
        let json = extract_json_from_text(raw).unwrap();
        assert_eq!(json["score"], 0.8);
        assert_eq!(json["reasoning"], "Good answer");
        assert_eq!(json["verdict"], "pass");
    }

    #[test]
    fn test_extract_json_with_whitespace() {
        let padded = r#"
        {"score": 0.5, "reasoning": "Average", "verdict": "partial"}
        "#;
        let json = extract_json_from_text(padded).unwrap();
        assert_eq!(json["score"], 0.5);
        assert_eq!(json["verdict"], "partial");
    }

    // --- extract_json_from_text: code-fence strategy ---

    #[test]
    fn test_extract_json_from_markdown_fence() {
        let fenced = r#"Here is my evaluation:

```json
{"score": 0.9, "reasoning": "Excellent match", "verdict": "pass"}
```

That's my assessment."#;
        let json = extract_json_from_text(fenced).unwrap();
        assert_eq!(json["score"], 0.9);
        assert_eq!(json["verdict"], "pass");
    }

    #[test]
    fn test_extract_json_from_fence_without_language() {
        let fenced = r#"```
{"score": 0.3, "reasoning": "Poor", "verdict": "fail"}
```"#;
        let json = extract_json_from_text(fenced).unwrap();
        assert_eq!(json["score"], 0.3);
        assert_eq!(json["verdict"], "fail");
    }

    // --- extract_json_from_text: embedded-object strategy ---

    #[test]
    fn test_extract_json_embedded_in_prose() {
        let prose = r#"After careful consideration, I believe the score should be:
{"score": 0.7, "reasoning": "Mostly correct but missing key detail", "verdict": "partial"}
That is my final answer."#;
        let json = extract_json_from_text(prose).unwrap();
        assert_eq!(json["score"], 0.7);
        assert_eq!(json["verdict"], "partial");
    }

    // --- extract_json_from_text: rejection cases ---

    #[test]
    fn test_extract_json_returns_none_for_garbage() {
        let garbage = "This is just a bunch of random text with no JSON at all.";
        assert!(extract_json_from_text(garbage).is_none());
    }

    #[test]
    fn test_extract_json_returns_none_for_invalid_json() {
        let invalid = r#"{"score": bad_value, "reasoning": "test"}"#;
        assert!(extract_json_from_text(invalid).is_none());
    }

    // --- extract_json_from_text: string-literal handling ---

    #[test]
    fn test_extract_json_handles_nested_braces() {
        let raw =
            r#"{"score": 0.6, "reasoning": "The {nested} braces are fine", "verdict": "partial"}"#;
        let json = extract_json_from_text(raw).unwrap();
        assert_eq!(json["score"], 0.6);
        assert!(json["reasoning"].as_str().unwrap().contains("{nested}"));
    }

    #[test]
    fn test_extract_json_handles_escaped_quotes() {
        let raw =
            r#"{"score": 0.5, "reasoning": "He said \"hello\" to me", "verdict": "partial"}"#;
        let json = extract_json_from_text(raw).unwrap();
        assert_eq!(json["score"], 0.5);
    }

    // --- StructuredJudge::parse_verdict_from_text ---

    #[test]
    fn test_parse_verdict_fallback_on_missing_fields() {
        // "verdict" is absent, so deserialization fails and a failing verdict is synthesized.
        let outcome =
            mock_judge().parse_verdict_from_text(r#"{"score": 0.5, "reasoning": "ok"}"#);
        let verdict = outcome.unwrap();
        assert_eq!(verdict.score, 0.0);
        assert!(verdict.reasoning.contains("Parse error"));
    }

    #[test]
    fn test_parse_verdict_fallback_on_no_json() {
        let outcome = mock_judge().parse_verdict_from_text("I think the answer is good.");
        let verdict = outcome.unwrap();
        assert_eq!(verdict.score, 0.0);
        assert!(verdict.reasoning.contains("Parse error"));
    }

    #[test]
    fn test_parse_verdict_clamps_score() {
        let judge = mock_judge();

        let too_high = r#"{"score": 1.5, "reasoning": "Great", "verdict": "pass"}"#;
        let verdict = judge.parse_verdict_from_text(too_high).unwrap();
        assert_eq!(verdict.score, 1.0);

        let too_low = r#"{"score": -0.3, "reasoning": "Bad", "verdict": "fail"}"#;
        let verdict = judge.parse_verdict_from_text(too_low).unwrap();
        assert_eq!(verdict.score, 0.0);
    }

    // --- StructuredJudgeConfig ---

    #[test]
    fn test_structured_judge_config_defaults() {
        let defaults = StructuredJudgeConfig::default();
        assert!(defaults.prefer_function_calling);
        assert_eq!(defaults.temperature, 0.0);
        assert!(defaults.rubrics.is_empty());
    }
}