adk_eval/
structured_judge.rs

1//! Structured LLM judge producing typed verdicts.
2//!
3//! The [`StructuredJudge`] evaluates responses using an LLM and produces
4//! machine-parseable [`StructuredVerdict`] results with scores, reasoning,
5//! and categorical verdicts (pass/fail/partial).
6//!
7//! It attempts function-calling (via `response_schema`) first, then falls
8//! back to prompting for JSON output with a lenient extractor.
9
10use crate::error::{EvalError, Result};
11use adk_core::{Content, GenerateContentConfig, Llm, LlmRequest};
12use futures::StreamExt;
13use serde::{Deserialize, Serialize};
14use std::sync::Arc;
15
/// Verdict from the structured judge.
///
/// This is the shape the judge model is prompted to emit as JSON
/// (`score`, `reasoning`, `verdict`) and the type the response is
/// deserialized into.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StructuredVerdict {
    /// Score from 0.0 to 1.0.
    ///
    /// Verdicts parsed by [`StructuredJudge`] have this value clamped into
    /// that range; values constructed by hand are not checked.
    pub score: f64,
    /// Human-readable reasoning for the verdict.
    ///
    /// When the judge's output cannot be parsed, this carries a message
    /// starting with `Parse error:` instead of the model's reasoning.
    pub reasoning: String,
    /// Categorical verdict.
    pub verdict: Verdict,
}
26
/// Categorical outcome of a structured judgment.
///
/// Serialized and deserialized in `snake_case`, i.e. as the strings
/// `"pass"`, `"fail"` and `"partial"`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Verdict {
    /// The response fully satisfies the criterion.
    Pass,
    /// The response does not satisfy the criterion.
    ///
    /// Also used for the fallback verdict returned when the judge's output
    /// cannot be parsed.
    Fail,
    /// The response partially satisfies the criterion.
    Partial,
}
38
/// Custom rubric for structured judging.
///
/// The name, description and every scale point are rendered into the
/// judge's prompt by [`StructuredJudge::judge_with_rubric`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JudgeRubric {
    /// Name of the rubric.
    pub name: String,
    /// Description of what the rubric evaluates.
    pub description: String,
    /// Scoring scale with defined points.
    ///
    /// Points are listed in the prompt in the order given here.
    pub scale: Vec<ScalePoint>,
}
49
/// A single point on a rubric scoring scale.
///
/// Rendered into the judge prompt as `- <score> (<label>): <description>`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScalePoint {
    /// Numeric score for this level.
    ///
    /// Shown in the prompt with one decimal place. The prompt asks the model
    /// for a score between 0.0 and 1.0, and parsed scores are clamped to that
    /// range, so points outside it cannot be returned as-is.
    pub score: f64,
    /// Short label (e.g., "Excellent", "Poor").
    pub label: String,
    /// Detailed description of what this level means.
    pub description: String,
}
60
/// Configuration for the structured judge.
///
/// The [`Default`] value enables the schema attempt, uses temperature `0.0`
/// and has no rubrics.
#[derive(Debug, Clone)]
pub struct StructuredJudgeConfig {
    /// Whether to attempt function calling (response_schema) first.
    ///
    /// When `false`, the judge goes straight to the plain JSON prompt.
    pub prefer_function_calling: bool,
    /// Temperature for the judge LLM.
    ///
    /// Narrowed to `f32` when the request is built.
    pub temperature: f64,
    /// Custom rubrics (optional).
    ///
    /// NOTE(review): no method visible in this file reads this field;
    /// `judge_with_rubric` takes its rubric as an argument. Confirm whether
    /// other code consumes it.
    pub rubrics: Vec<JudgeRubric>,
}
71
72impl Default for StructuredJudgeConfig {
73    fn default() -> Self {
74        Self { prefer_function_calling: true, temperature: 0.0, rubrics: Vec::new() }
75    }
76}
77
/// Structured LLM judge that produces typed verdicts.
///
/// Tries function calling first (via response schema), then falls back
/// to prompting for JSON output with a lenient parser.
///
/// Both attempts send the same combined prompt as a single `user` message;
/// they differ only in whether a `response_schema` is attached.
pub struct StructuredJudge {
    /// Model used for every judge call.
    model: Arc<dyn Llm>,
    /// Settings controlling the schema attempt and sampling temperature.
    config: StructuredJudgeConfig,
}
86
87impl StructuredJudge {
88    /// Create a new structured judge with default configuration.
89    pub fn new(model: Arc<dyn Llm>) -> Self {
90        Self { model, config: StructuredJudgeConfig::default() }
91    }
92
93    /// Create a structured judge with custom configuration.
94    pub fn with_config(model: Arc<dyn Llm>, config: StructuredJudgeConfig) -> Self {
95        Self { model, config }
96    }
97
98    /// Judge a response against expected output with a specific criterion.
99    ///
100    /// Tries function calling first, falls back to JSON extraction.
101    /// On unparseable response, returns score 0.0 with parse error in reasoning.
102    pub async fn judge(
103        &self,
104        expected: &str,
105        actual: &str,
106        criterion: &str,
107    ) -> Result<StructuredVerdict> {
108        let system_prompt = format!(
109            r#"You are an evaluation judge. Evaluate the actual response against the expected response for the given criterion.
110
111Criterion: {}
112
113You MUST respond with a JSON object containing exactly these fields:
114- "score": a number between 0.0 and 1.0
115- "reasoning": a string explaining your evaluation
116- "verdict": one of "pass", "fail", or "partial"
117
118Example response:
119{{"score": 0.85, "reasoning": "The response captures the key points but misses some details.", "verdict": "partial"}}"#,
120            criterion
121        );
122
123        let user_prompt =
124            format!("Expected response:\n\"{}\"\n\nActual response:\n\"{}\"", expected, actual);
125
126        self.execute_judgment(&system_prompt, &user_prompt).await
127    }
128
129    /// Judge with a custom rubric.
130    ///
131    /// Evaluates the response against the rubric's scale points and produces
132    /// a structured verdict.
133    pub async fn judge_with_rubric(
134        &self,
135        response: &str,
136        context: &str,
137        rubric: &JudgeRubric,
138    ) -> Result<StructuredVerdict> {
139        let mut scale_description = String::new();
140        for point in &rubric.scale {
141            scale_description.push_str(&format!(
142                "- {:.1} ({}): {}\n",
143                point.score, point.label, point.description
144            ));
145        }
146
147        let system_prompt = format!(
148            r#"You are an evaluation judge. Evaluate the response using the following rubric.
149
150Rubric: {}
151Description: {}
152
153Scoring Scale:
154{}
155You MUST respond with a JSON object containing exactly these fields:
156- "score": a number between 0.0 and 1.0 matching one of the scale points
157- "reasoning": a string explaining your evaluation
158- "verdict": one of "pass", "fail", or "partial"
159
160Example response:
161{{"score": 0.75, "reasoning": "The response demonstrates good understanding but lacks depth.", "verdict": "partial"}}"#,
162            rubric.name, rubric.description, scale_description
163        );
164
165        let user_prompt =
166            format!("Context:\n\"{}\"\n\nResponse to evaluate:\n\"{}\"", context, response);
167
168        self.execute_judgment(&system_prompt, &user_prompt).await
169    }
170
171    /// Execute a judgment using function-calling first, then JSON fallback.
172    async fn execute_judgment(
173        &self,
174        system_prompt: &str,
175        user_prompt: &str,
176    ) -> Result<StructuredVerdict> {
177        // Attempt 1: Try with response_schema (function-calling style)
178        if self.config.prefer_function_calling {
179            match self.call_with_schema(system_prompt, user_prompt).await {
180                Ok(verdict) => return Ok(verdict),
181                Err(_) => {
182                    // Fall through to JSON fallback
183                }
184            }
185        }
186
187        // Attempt 2: JSON fallback — prompt for JSON and parse leniently
188        self.call_with_json_fallback(system_prompt, user_prompt).await
189    }
190
191    /// Attempt judgment using response_schema for structured output.
192    async fn call_with_schema(
193        &self,
194        system_prompt: &str,
195        user_prompt: &str,
196    ) -> Result<StructuredVerdict> {
197        let schema = serde_json::json!({
198            "type": "object",
199            "properties": {
200                "score": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
201                "reasoning": { "type": "string" },
202                "verdict": { "type": "string", "enum": ["pass", "fail", "partial"] }
203            },
204            "required": ["score", "reasoning", "verdict"]
205        });
206
207        let full_prompt = format!("{system_prompt}\n\n{user_prompt}");
208
209        let config = GenerateContentConfig {
210            temperature: Some(self.config.temperature as f32),
211            response_schema: Some(schema),
212            ..Default::default()
213        };
214
215        let request =
216            LlmRequest::new(self.model.name(), vec![Content::new("user").with_text(&full_prompt)])
217                .with_config(config);
218
219        let response_text = self.collect_response(request).await?;
220        self.parse_verdict_from_text(&response_text)
221    }
222
223    /// Attempt judgment by prompting for JSON and parsing leniently.
224    async fn call_with_json_fallback(
225        &self,
226        system_prompt: &str,
227        user_prompt: &str,
228    ) -> Result<StructuredVerdict> {
229        let full_prompt = format!("{system_prompt}\n\n{user_prompt}");
230
231        let config = GenerateContentConfig {
232            temperature: Some(self.config.temperature as f32),
233            ..Default::default()
234        };
235
236        let request =
237            LlmRequest::new(self.model.name(), vec![Content::new("user").with_text(&full_prompt)])
238                .with_config(config);
239
240        let response_text = self.collect_response(request).await?;
241        self.parse_verdict_from_text(&response_text)
242    }
243
244    /// Collect all text from an LLM response stream.
245    async fn collect_response(&self, request: LlmRequest) -> Result<String> {
246        let mut stream = self
247            .model
248            .generate_content(request, false)
249            .await
250            .map_err(|e| EvalError::JudgeError(format!("LLM judge call failed: {e}")))?;
251
252        let mut response_text = String::new();
253        while let Some(result) = stream.next().await {
254            let response =
255                result.map_err(|e| EvalError::JudgeError(format!("LLM response error: {e}")))?;
256            if let Some(content) = &response.content {
257                for part in &content.parts {
258                    if let Some(text) = part.text() {
259                        response_text.push_str(text);
260                    }
261                }
262            }
263        }
264
265        if response_text.is_empty() {
266            return Err(EvalError::JudgeError("Empty response from judge".to_string()));
267        }
268
269        Ok(response_text)
270    }
271
272    /// Parse a StructuredVerdict from LLM response text.
273    ///
274    /// On failure, returns a fallback verdict with score 0.0 and the parse
275    /// error in the reasoning field.
276    fn parse_verdict_from_text(&self, text: &str) -> Result<StructuredVerdict> {
277        match extract_json_from_text(text) {
278            Some(json) => match serde_json::from_value::<StructuredVerdict>(json) {
279                Ok(mut verdict) => {
280                    // Clamp score to [0.0, 1.0]
281                    verdict.score = verdict.score.clamp(0.0, 1.0);
282                    Ok(verdict)
283                }
284                Err(e) => Ok(StructuredVerdict {
285                    score: 0.0,
286                    reasoning: format!("Parse error: failed to deserialize verdict: {e}"),
287                    verdict: Verdict::Fail,
288                }),
289            },
290            None => Ok(StructuredVerdict {
291                score: 0.0,
292                reasoning: format!(
293                    "Parse error: could not extract JSON from response: {}",
294                    truncate_for_error(text)
295                ),
296                verdict: Verdict::Fail,
297            }),
298        }
299    }
300}
301
302/// Lenient JSON extractor that finds JSON objects in arbitrary text.
303///
304/// Handles common LLM output patterns:
305/// - Raw JSON object
306/// - JSON wrapped in markdown code fences (```json ... ```)
307/// - JSON embedded in prose text
308pub fn extract_json_from_text(text: &str) -> Option<serde_json::Value> {
309    let trimmed = text.trim();
310
311    // Pattern 1: Raw JSON object — starts with `{`
312    if trimmed.starts_with('{')
313        && let Ok(value) = serde_json::from_str::<serde_json::Value>(trimmed)
314        && value.is_object()
315    {
316        return Some(value);
317    }
318
319    // Pattern 2: Markdown code fences (```json ... ``` or ``` ... ```)
320    if let Some(json_str) = extract_from_code_fence(trimmed)
321        && let Ok(value) = serde_json::from_str::<serde_json::Value>(json_str)
322        && value.is_object()
323    {
324        return Some(value);
325    }
326
327    // Pattern 3: Embedded JSON in prose — find the first `{` and try to parse
328    if let Some(start) = trimmed.find('{') {
329        // Try progressively from the outermost `{` to find a valid JSON object
330        let substring = &trimmed[start..];
331        if let Some(value) = try_parse_json_object(substring) {
332            return Some(value);
333        }
334    }
335
336    None
337}
338
/// Return the trimmed body of the first markdown code fence in `text`.
///
/// Everything between the opening fence and the next newline is treated as
/// a language tag (e.g. `json`) and dropped. Returns `None` when there is no
/// opening fence, no newline after it, no closing fence, or the body is
/// empty after trimming.
fn extract_from_code_fence(text: &str) -> Option<&str> {
    const FENCE: &str = "```";

    let (_, after_open) = text.split_once(FENCE)?;
    // The rest of the opening line is the optional language tag.
    let (_language_tag, body) = after_open.split_once('\n')?;
    let (fenced, _) = body.split_once(FENCE)?;

    Some(fenced.trim()).filter(|inner| !inner.is_empty())
}
355
356/// Try to parse a valid JSON object starting from the beginning of the string.
357///
358/// Uses brace counting to find the matching closing brace.
359fn try_parse_json_object(text: &str) -> Option<serde_json::Value> {
360    if !text.starts_with('{') {
361        return None;
362    }
363
364    let mut depth = 0i32;
365    let mut in_string = false;
366    let mut escape_next = false;
367
368    for (i, ch) in text.char_indices() {
369        if escape_next {
370            escape_next = false;
371            continue;
372        }
373
374        if ch == '\\' && in_string {
375            escape_next = true;
376            continue;
377        }
378
379        if ch == '"' {
380            in_string = !in_string;
381            continue;
382        }
383
384        if in_string {
385            continue;
386        }
387
388        match ch {
389            '{' => depth += 1,
390            '}' => {
391                depth -= 1;
392                if depth == 0 {
393                    let candidate = &text[..=i];
394                    if let Ok(value) = serde_json::from_str::<serde_json::Value>(candidate)
395                        && value.is_object()
396                    {
397                        return Some(value);
398                    }
399                    // If parse failed at this brace, keep going — there might
400                    // be a deeper valid match, but that's unlikely. Return None.
401                    return None;
402                }
403            }
404            _ => {}
405        }
406    }
407
408    None
409}
410
/// Truncate text for inclusion in error messages.
///
/// Text of at most 200 bytes is returned unchanged. Longer text is cut to
/// at most 200 bytes and suffixed with `...`. The cut is moved back to the
/// nearest UTF-8 character boundary, so multi-byte characters (common in
/// model output) never cause a slicing panic.
fn truncate_for_error(text: &str) -> String {
    const MAX_BYTES: usize = 200;

    if text.len() <= MAX_BYTES {
        return text.to_string();
    }

    // Index 0 is always a boundary, so this loop terminates.
    let mut cut = MAX_BYTES;
    while !text.is_char_boundary(cut) {
        cut -= 1;
    }

    format!("{}...", &text[..cut])
}
415
// Unit tests for the JSON extractor, verdict parsing fallbacks and config
// defaults. None of these tests call the model.
#[cfg(test)]
mod tests {
    use super::*;

    /// A bare JSON object is returned with all fields intact.
    #[test]
    fn test_extract_raw_json() {
        let input = r#"{"score": 0.8, "reasoning": "Good answer", "verdict": "pass"}"#;
        let result = extract_json_from_text(input).unwrap();
        assert_eq!(result["score"], 0.8);
        assert_eq!(result["reasoning"], "Good answer");
        assert_eq!(result["verdict"], "pass");
    }

    /// Leading and trailing whitespace around the object is ignored.
    #[test]
    fn test_extract_json_with_whitespace() {
        let input = r#"
        {"score": 0.5, "reasoning": "Average", "verdict": "partial"}
        "#;
        let result = extract_json_from_text(input).unwrap();
        assert_eq!(result["score"], 0.5);
        assert_eq!(result["verdict"], "partial");
    }

    /// JSON inside a ```json fence is found even with prose around it.
    #[test]
    fn test_extract_json_from_markdown_fence() {
        let input = r#"Here is my evaluation:

```json
{"score": 0.9, "reasoning": "Excellent match", "verdict": "pass"}
```

That's my assessment."#;
        let result = extract_json_from_text(input).unwrap();
        assert_eq!(result["score"], 0.9);
        assert_eq!(result["verdict"], "pass");
    }

    /// A fence without a language tag works the same way.
    #[test]
    fn test_extract_json_from_fence_without_language() {
        let input = r#"```
{"score": 0.3, "reasoning": "Poor", "verdict": "fail"}
```"#;
        let result = extract_json_from_text(input).unwrap();
        assert_eq!(result["score"], 0.3);
        assert_eq!(result["verdict"], "fail");
    }

    /// An unfenced object surrounded by prose is located by scanning for `{`.
    #[test]
    fn test_extract_json_embedded_in_prose() {
        let input = r#"After careful consideration, I believe the score should be:
{"score": 0.7, "reasoning": "Mostly correct but missing key detail", "verdict": "partial"}
That is my final answer."#;
        let result = extract_json_from_text(input).unwrap();
        assert_eq!(result["score"], 0.7);
        assert_eq!(result["verdict"], "partial");
    }

    /// Text with no braces at all yields `None`.
    #[test]
    fn test_extract_json_returns_none_for_garbage() {
        let input = "This is just a bunch of random text with no JSON at all.";
        assert!(extract_json_from_text(input).is_none());
    }

    /// Brace-balanced text that is not valid JSON yields `None`.
    #[test]
    fn test_extract_json_returns_none_for_invalid_json() {
        let input = r#"{"score": bad_value, "reasoning": "test"}"#;
        assert!(extract_json_from_text(input).is_none());
    }

    /// Braces inside string values are preserved in the parsed result.
    #[test]
    fn test_extract_json_handles_nested_braces() {
        let input =
            r#"{"score": 0.6, "reasoning": "The {nested} braces are fine", "verdict": "partial"}"#;
        let result = extract_json_from_text(input).unwrap();
        assert_eq!(result["score"], 0.6);
        assert!(result["reasoning"].as_str().unwrap().contains("{nested}"));
    }

    /// Escaped quotes inside string values do not break parsing.
    #[test]
    fn test_extract_json_handles_escaped_quotes() {
        let input =
            r#"{"score": 0.5, "reasoning": "He said \"hello\" to me", "verdict": "partial"}"#;
        let result = extract_json_from_text(input).unwrap();
        assert_eq!(result["score"], 0.5);
    }

    /// A JSON object that lacks a required field produces the score-0.0
    /// fallback verdict rather than an error.
    #[test]
    fn test_parse_verdict_fallback_on_missing_fields() {
        let judge = StructuredJudge::new(Arc::new(adk_model::MockLlm::new("test")));
        // JSON missing "verdict" field
        let result = judge.parse_verdict_from_text(r#"{"score": 0.5, "reasoning": "ok"}"#);
        let verdict = result.unwrap();
        assert_eq!(verdict.score, 0.0);
        assert!(verdict.reasoning.contains("Parse error"));
    }

    /// Plain prose with no JSON also produces the fallback verdict.
    #[test]
    fn test_parse_verdict_fallback_on_no_json() {
        let judge = StructuredJudge::new(Arc::new(adk_model::MockLlm::new("test")));
        let result = judge.parse_verdict_from_text("I think the answer is good.");
        let verdict = result.unwrap();
        assert_eq!(verdict.score, 0.0);
        assert!(verdict.reasoning.contains("Parse error"));
    }

    /// Scores outside [0.0, 1.0] are clamped to the nearest bound.
    #[test]
    fn test_parse_verdict_clamps_score() {
        let judge = StructuredJudge::new(Arc::new(adk_model::MockLlm::new("test")));
        let text = r#"{"score": 1.5, "reasoning": "Great", "verdict": "pass"}"#;
        let verdict = judge.parse_verdict_from_text(text).unwrap();
        assert_eq!(verdict.score, 1.0);

        let text = r#"{"score": -0.3, "reasoning": "Bad", "verdict": "fail"}"#;
        let verdict = judge.parse_verdict_from_text(text).unwrap();
        assert_eq!(verdict.score, 0.0);
    }

    /// The default config enables the schema attempt, uses temperature 0.0
    /// and has no rubrics.
    #[test]
    fn test_structured_judge_config_defaults() {
        let config = StructuredJudgeConfig::default();
        assert!(config.prefer_function_calling);
        assert_eq!(config.temperature, 0.0);
        assert!(config.rubrics.is_empty());
    }
}
adk_eval/structured_judge.rs

adk_eval/
structured_judge.rs