Skip to main content

adk_eval/
report.rs

1//! Evaluation result reporting
2//!
3//! Structures for representing and formatting evaluation results.
4
5use serde::{Deserialize, Serialize};
6use serde_json::Value;
7use std::collections::HashMap;
8use std::time::Duration;
9
10use crate::cost_tracker::CostMetrics;
11use crate::structured_judge::StructuredVerdict;
12use crate::trace_analyzer::TraceAnalysis;
13
14/// Complete evaluation report for a test file or eval set
15#[derive(Debug, Clone, Serialize, Deserialize)]
16pub struct EvaluationReport {
17    /// Unique identifier for this evaluation run
18    pub run_id: String,
19    /// When the evaluation started
20    pub started_at: chrono::DateTime<chrono::Utc>,
21    /// When the evaluation completed
22    pub completed_at: chrono::DateTime<chrono::Utc>,
23    /// Total duration
24    pub duration: Duration,
25    /// Results for each test case
26    pub results: Vec<EvaluationResult>,
27    /// Summary statistics
28    pub summary: EvaluationSummary,
29}
30
31impl EvaluationReport {
32    /// Create a new report
33    pub fn new(
34        run_id: &str,
35        results: Vec<EvaluationResult>,
36        started_at: chrono::DateTime<chrono::Utc>,
37    ) -> Self {
38        let completed_at = chrono::Utc::now();
39        let duration = (completed_at - started_at).to_std().unwrap_or_default();
40        let summary = EvaluationSummary::from_results(&results);
41
42        Self { run_id: run_id.to_string(), started_at, completed_at, duration, results, summary }
43    }
44
45    /// Check if all tests passed
46    pub fn all_passed(&self) -> bool {
47        self.summary.failed == 0
48    }
49
50    /// Get failed results only
51    pub fn failures(&self) -> Vec<&EvaluationResult> {
52        self.results.iter().filter(|r| !r.passed).collect()
53    }
54
55    /// Format as a human-readable string
56    pub fn format_summary(&self) -> String {
57        let mut output = String::new();
58        output.push_str(&format!("Evaluation Report: {}\n", self.run_id));
59        output.push_str(&format!("Duration: {:?}\n", self.duration));
60        output.push_str("\nSummary:\n");
61        output.push_str(&format!("  Total: {}\n", self.summary.total));
62        output.push_str(&format!("  Passed: {}\n", self.summary.passed));
63        output.push_str(&format!("  Failed: {}\n", self.summary.failed));
64        output.push_str(&format!("  Pass Rate: {:.1}%\n", self.summary.pass_rate * 100.0));
65
66        if !self.summary.avg_scores.is_empty() {
67            output.push_str("\nAverage Scores:\n");
68            for (criterion, score) in &self.summary.avg_scores {
69                output.push_str(&format!("  {}: {:.3}\n", criterion, score));
70            }
71        }
72
73        if self.summary.failed > 0 {
74            output.push_str("\nFailed Tests:\n");
75            for result in self.failures() {
76                output.push_str(&format!(
77                    "  - {} ({})\n",
78                    result.eval_id,
79                    result
80                        .failures
81                        .iter()
82                        .map(|f| f.criterion.as_str())
83                        .collect::<Vec<_>>()
84                        .join(", ")
85                ));
86            }
87        }
88
89        output
90    }
91
92    /// Export to JSON
93    pub fn to_json(&self) -> Result<String, serde_json::Error> {
94        serde_json::to_string_pretty(self)
95    }
96}
97
98/// Summary statistics for an evaluation run
99#[derive(Debug, Clone, Default, Serialize, Deserialize)]
100pub struct EvaluationSummary {
101    /// Total number of test cases
102    pub total: usize,
103    /// Number of passed test cases
104    pub passed: usize,
105    /// Number of failed test cases
106    pub failed: usize,
107    /// Pass rate (0.0 - 1.0)
108    pub pass_rate: f64,
109    /// Average scores by criterion
110    pub avg_scores: HashMap<String, f64>,
111}
112
113impl EvaluationSummary {
114    /// Calculate summary from results
115    pub fn from_results(results: &[EvaluationResult]) -> Self {
116        let total = results.len();
117        let passed = results.iter().filter(|r| r.passed).count();
118        let failed = total - passed;
119        let pass_rate = if total > 0 { passed as f64 / total as f64 } else { 0.0 };
120
121        // Calculate average scores
122        let mut score_sums: HashMap<String, (f64, usize)> = HashMap::new();
123        for result in results {
124            for (criterion, score) in &result.scores {
125                let entry = score_sums.entry(criterion.clone()).or_insert((0.0, 0));
126                entry.0 += score;
127                entry.1 += 1;
128            }
129        }
130
131        let avg_scores =
132            score_sums.into_iter().map(|(k, (sum, count))| (k, sum / count as f64)).collect();
133
134        Self { total, passed, failed, pass_rate, avg_scores }
135    }
136}
137
138/// Result for a single test case
139#[derive(Debug, Clone, Serialize, Deserialize)]
140pub struct EvaluationResult {
141    /// Test case identifier
142    pub eval_id: String,
143    /// Whether the test passed all criteria
144    pub passed: bool,
145    /// Scores for each criterion
146    pub scores: HashMap<String, f64>,
147    /// Failures (criteria that didn't meet threshold)
148    pub failures: Vec<Failure>,
149    /// Execution duration
150    pub duration: Duration,
151    /// Detailed turn results
152    #[serde(default)]
153    pub turn_results: Vec<TurnResult>,
154    /// Cost and latency metrics (populated when CostTracker is active)
155    #[serde(default, skip_serializing_if = "Option::is_none")]
156    pub cost_metrics: Option<CostMetrics>,
157    /// Trace analysis results (populated when TraceAnalyzer is active)
158    #[serde(default, skip_serializing_if = "Option::is_none")]
159    pub trace_analysis: Option<TraceAnalysis>,
160    /// Structured verdicts from the judge
161    #[serde(default)]
162    pub verdicts: Vec<StructuredVerdict>,
163}
164
165impl EvaluationResult {
166    /// Create a passed result
167    pub fn passed(eval_id: &str, scores: HashMap<String, f64>, duration: Duration) -> Self {
168        Self {
169            eval_id: eval_id.to_string(),
170            passed: true,
171            scores,
172            failures: vec![],
173            duration,
174            turn_results: vec![],
175            cost_metrics: None,
176            trace_analysis: None,
177            verdicts: vec![],
178        }
179    }
180
181    /// Create a failed result
182    pub fn failed(
183        eval_id: &str,
184        scores: HashMap<String, f64>,
185        failures: Vec<Failure>,
186        duration: Duration,
187    ) -> Self {
188        Self {
189            eval_id: eval_id.to_string(),
190            passed: false,
191            scores,
192            failures,
193            duration,
194            turn_results: vec![],
195            cost_metrics: None,
196            trace_analysis: None,
197            verdicts: vec![],
198        }
199    }
200
201    /// Add turn results
202    pub fn with_turn_results(mut self, turn_results: Vec<TurnResult>) -> Self {
203        self.turn_results = turn_results;
204        self
205    }
206}
207
208/// A single failure in evaluation
209#[derive(Debug, Clone, Serialize, Deserialize)]
210pub struct Failure {
211    /// Criterion that failed
212    pub criterion: String,
213    /// Expected value
214    pub expected: Value,
215    /// Actual value
216    pub actual: Value,
217    /// Score achieved
218    pub score: f64,
219    /// Threshold required
220    pub threshold: f64,
221    /// Additional details
222    #[serde(default)]
223    pub details: Option<String>,
224}
225
226impl Failure {
227    /// Create a new failure
228    pub fn new(
229        criterion: &str,
230        expected: Value,
231        actual: Value,
232        score: f64,
233        threshold: f64,
234    ) -> Self {
235        Self { criterion: criterion.to_string(), expected, actual, score, threshold, details: None }
236    }
237
238    /// Add details
239    pub fn with_details(mut self, details: &str) -> Self {
240        self.details = Some(details.to_string());
241        self
242    }
243
244    /// Format as human-readable string
245    pub fn format(&self) -> String {
246        let mut s = format!(
247            "{}: score {:.3} < threshold {:.3}",
248            self.criterion, self.score, self.threshold
249        );
250        if let Some(details) = &self.details {
251            s.push_str(&format!("\n  Details: {}", details));
252        }
253        s
254    }
255}
256
257/// Result for a single conversation turn
258#[derive(Debug, Clone, Serialize, Deserialize)]
259pub struct TurnResult {
260    /// Turn/invocation identifier
261    pub invocation_id: String,
262    /// Actual response from the agent
263    pub actual_response: Option<String>,
264    /// Expected response
265    pub expected_response: Option<String>,
266    /// Actual tool calls made
267    pub actual_tool_calls: Vec<crate::schema::ToolUse>,
268    /// Expected tool calls
269    pub expected_tool_calls: Vec<crate::schema::ToolUse>,
270    /// Scores for this turn
271    pub scores: HashMap<String, f64>,
272}
273
274/// Result for a single test case (alias for backward compatibility)
275pub type TestCaseResult = EvaluationResult;
276
#[cfg(test)]
mod tests {
    use super::*;

    /// Two passing and one failing case: checks counts, pass rate and the per-criterion mean.
    #[test]
    fn test_evaluation_summary() {
        let results = vec![
            EvaluationResult::passed(
                "test_1",
                HashMap::from([("tool_trajectory".to_string(), 1.0)]),
                Duration::from_millis(100),
            ),
            EvaluationResult::passed(
                "test_2",
                HashMap::from([("tool_trajectory".to_string(), 0.8)]),
                Duration::from_millis(150),
            ),
            EvaluationResult::failed(
                "test_3",
                HashMap::from([("tool_trajectory".to_string(), 0.5)]),
                vec![Failure::new("tool_trajectory", Value::Null, Value::Null, 0.5, 0.8)],
                Duration::from_millis(200),
            ),
        ];

        let summary = EvaluationSummary::from_results(&results);
        assert_eq!(summary.total, 3);
        assert_eq!(summary.passed, 2);
        assert_eq!(summary.failed, 1);
        assert!((summary.pass_rate - 2.0 / 3.0).abs() < 1e-9);

        // The average must cover all three results, not only the passing ones.
        assert_eq!(summary.avg_scores.len(), 1);
        let avg = summary.avg_scores["tool_trajectory"];
        assert!((avg - (1.0 + 0.8 + 0.5) / 3.0).abs() < 1e-9);
    }

    /// An empty result set must not divide by zero: pass rate is 0.0 and there are no averages.
    #[test]
    fn test_evaluation_summary_empty() {
        let summary = EvaluationSummary::from_results(&[]);
        assert_eq!(summary.total, 0);
        assert_eq!(summary.passed, 0);
        assert_eq!(summary.failed, 0);
        assert_eq!(summary.pass_rate, 0.0);
        assert!(summary.avg_scores.is_empty());
    }

    /// With details attached, the formatted text has a headline plus an indented details line.
    #[test]
    fn test_failure_format() {
        let failure = Failure::new(
            "response_similarity",
            Value::String("expected".to_string()),
            Value::String("actual".to_string()),
            0.6,
            0.8,
        )
        .with_details("Responses differ significantly");

        let formatted = failure.format();
        assert!(formatted.contains("response_similarity"));
        assert!(formatted.contains("0.600"));
        assert!(formatted.contains("0.800"));
        assert_eq!(
            formatted,
            "response_similarity: score 0.600 < threshold 0.800\n  Details: Responses differ significantly"
        );
    }

    /// Without details, only the single headline line is produced.
    #[test]
    fn test_failure_format_without_details() {
        let failure = Failure::new("tool_trajectory", Value::Null, Value::Null, 0.5, 0.8);

        assert_eq!(failure.format(), "tool_trajectory: score 0.500 < threshold 0.800");
    }
}