1use serde::{Deserialize, Serialize};
6use serde_json::Value;
7use std::collections::HashMap;
8use std::time::Duration;
9
10use crate::cost_tracker::CostMetrics;
11use crate::structured_judge::StructuredVerdict;
12use crate::trace_analyzer::TraceAnalysis;
13
14#[derive(Debug, Clone, Serialize, Deserialize)]
16pub struct EvaluationReport {
17 pub run_id: String,
19 pub started_at: chrono::DateTime<chrono::Utc>,
21 pub completed_at: chrono::DateTime<chrono::Utc>,
23 pub duration: Duration,
25 pub results: Vec<EvaluationResult>,
27 pub summary: EvaluationSummary,
29}
30
31impl EvaluationReport {
32 pub fn new(
34 run_id: &str,
35 results: Vec<EvaluationResult>,
36 started_at: chrono::DateTime<chrono::Utc>,
37 ) -> Self {
38 let completed_at = chrono::Utc::now();
39 let duration = (completed_at - started_at).to_std().unwrap_or_default();
40 let summary = EvaluationSummary::from_results(&results);
41
42 Self { run_id: run_id.to_string(), started_at, completed_at, duration, results, summary }
43 }
44
45 pub fn all_passed(&self) -> bool {
47 self.summary.failed == 0
48 }
49
50 pub fn failures(&self) -> Vec<&EvaluationResult> {
52 self.results.iter().filter(|r| !r.passed).collect()
53 }
54
55 pub fn format_summary(&self) -> String {
57 let mut output = String::new();
58 output.push_str(&format!("Evaluation Report: {}\n", self.run_id));
59 output.push_str(&format!("Duration: {:?}\n", self.duration));
60 output.push_str("\nSummary:\n");
61 output.push_str(&format!(" Total: {}\n", self.summary.total));
62 output.push_str(&format!(" Passed: {}\n", self.summary.passed));
63 output.push_str(&format!(" Failed: {}\n", self.summary.failed));
64 output.push_str(&format!(" Pass Rate: {:.1}%\n", self.summary.pass_rate * 100.0));
65
66 if !self.summary.avg_scores.is_empty() {
67 output.push_str("\nAverage Scores:\n");
68 for (criterion, score) in &self.summary.avg_scores {
69 output.push_str(&format!(" {}: {:.3}\n", criterion, score));
70 }
71 }
72
73 if self.summary.failed > 0 {
74 output.push_str("\nFailed Tests:\n");
75 for result in self.failures() {
76 output.push_str(&format!(
77 " - {} ({})\n",
78 result.eval_id,
79 result
80 .failures
81 .iter()
82 .map(|f| f.criterion.as_str())
83 .collect::<Vec<_>>()
84 .join(", ")
85 ));
86 }
87 }
88
89 output
90 }
91
92 pub fn to_json(&self) -> Result<String, serde_json::Error> {
94 serde_json::to_string_pretty(self)
95 }
96}
97
98#[derive(Debug, Clone, Default, Serialize, Deserialize)]
100pub struct EvaluationSummary {
101 pub total: usize,
103 pub passed: usize,
105 pub failed: usize,
107 pub pass_rate: f64,
109 pub avg_scores: HashMap<String, f64>,
111}
112
113impl EvaluationSummary {
114 pub fn from_results(results: &[EvaluationResult]) -> Self {
116 let total = results.len();
117 let passed = results.iter().filter(|r| r.passed).count();
118 let failed = total - passed;
119 let pass_rate = if total > 0 { passed as f64 / total as f64 } else { 0.0 };
120
121 let mut score_sums: HashMap<String, (f64, usize)> = HashMap::new();
123 for result in results {
124 for (criterion, score) in &result.scores {
125 let entry = score_sums.entry(criterion.clone()).or_insert((0.0, 0));
126 entry.0 += score;
127 entry.1 += 1;
128 }
129 }
130
131 let avg_scores =
132 score_sums.into_iter().map(|(k, (sum, count))| (k, sum / count as f64)).collect();
133
134 Self { total, passed, failed, pass_rate, avg_scores }
135 }
136}
137
138#[derive(Debug, Clone, Serialize, Deserialize)]
140pub struct EvaluationResult {
141 pub eval_id: String,
143 pub passed: bool,
145 pub scores: HashMap<String, f64>,
147 pub failures: Vec<Failure>,
149 pub duration: Duration,
151 #[serde(default)]
153 pub turn_results: Vec<TurnResult>,
154 #[serde(default, skip_serializing_if = "Option::is_none")]
156 pub cost_metrics: Option<CostMetrics>,
157 #[serde(default, skip_serializing_if = "Option::is_none")]
159 pub trace_analysis: Option<TraceAnalysis>,
160 #[serde(default)]
162 pub verdicts: Vec<StructuredVerdict>,
163}
164
165impl EvaluationResult {
166 pub fn passed(eval_id: &str, scores: HashMap<String, f64>, duration: Duration) -> Self {
168 Self {
169 eval_id: eval_id.to_string(),
170 passed: true,
171 scores,
172 failures: vec![],
173 duration,
174 turn_results: vec![],
175 cost_metrics: None,
176 trace_analysis: None,
177 verdicts: vec![],
178 }
179 }
180
181 pub fn failed(
183 eval_id: &str,
184 scores: HashMap<String, f64>,
185 failures: Vec<Failure>,
186 duration: Duration,
187 ) -> Self {
188 Self {
189 eval_id: eval_id.to_string(),
190 passed: false,
191 scores,
192 failures,
193 duration,
194 turn_results: vec![],
195 cost_metrics: None,
196 trace_analysis: None,
197 verdicts: vec![],
198 }
199 }
200
201 pub fn with_turn_results(mut self, turn_results: Vec<TurnResult>) -> Self {
203 self.turn_results = turn_results;
204 self
205 }
206}
207
208#[derive(Debug, Clone, Serialize, Deserialize)]
210pub struct Failure {
211 pub criterion: String,
213 pub expected: Value,
215 pub actual: Value,
217 pub score: f64,
219 pub threshold: f64,
221 #[serde(default)]
223 pub details: Option<String>,
224}
225
226impl Failure {
227 pub fn new(
229 criterion: &str,
230 expected: Value,
231 actual: Value,
232 score: f64,
233 threshold: f64,
234 ) -> Self {
235 Self { criterion: criterion.to_string(), expected, actual, score, threshold, details: None }
236 }
237
238 pub fn with_details(mut self, details: &str) -> Self {
240 self.details = Some(details.to_string());
241 self
242 }
243
244 pub fn format(&self) -> String {
246 let mut s = format!(
247 "{}: score {:.3} < threshold {:.3}",
248 self.criterion, self.score, self.threshold
249 );
250 if let Some(details) = &self.details {
251 s.push_str(&format!("\n Details: {}", details));
252 }
253 s
254 }
255}
256
257#[derive(Debug, Clone, Serialize, Deserialize)]
259pub struct TurnResult {
260 pub invocation_id: String,
262 pub actual_response: Option<String>,
264 pub expected_response: Option<String>,
266 pub actual_tool_calls: Vec<crate::schema::ToolUse>,
268 pub expected_tool_calls: Vec<crate::schema::ToolUse>,
270 pub scores: HashMap<String, f64>,
272}
273
274pub type TestCaseResult = EvaluationResult;
276
#[cfg(test)]
mod tests {
    use super::*;

    /// Two passing results and one failing result must yield matching counts,
    /// pass rate and per-criterion average.
    #[test]
    fn test_evaluation_summary() {
        let results = vec![
            EvaluationResult::passed(
                "test_1",
                HashMap::from([("tool_trajectory".to_string(), 1.0)]),
                Duration::from_millis(100),
            ),
            EvaluationResult::passed(
                "test_2",
                HashMap::from([("tool_trajectory".to_string(), 0.8)]),
                Duration::from_millis(150),
            ),
            EvaluationResult::failed(
                "test_3",
                HashMap::from([("tool_trajectory".to_string(), 0.5)]),
                vec![Failure::new("tool_trajectory", Value::Null, Value::Null, 0.5, 0.8)],
                Duration::from_millis(200),
            ),
        ];

        let summary = EvaluationSummary::from_results(&results);
        assert_eq!(summary.total, 3);
        assert_eq!(summary.passed, 2);
        assert_eq!(summary.failed, 1);
        assert!((summary.pass_rate - 0.666).abs() < 0.01);

        // Average of 1.0, 0.8 and 0.5 for the only criterion present.
        assert_eq!(summary.avg_scores.len(), 1);
        let average = summary.avg_scores.get("tool_trajectory").copied();
        assert!(matches!(average, Some(v) if (v - 2.3 / 3.0).abs() < 1e-9));
    }

    /// An empty result set must not divide by zero and must report no scores.
    #[test]
    fn test_evaluation_summary_empty() {
        let summary = EvaluationSummary::from_results(&[]);
        assert_eq!(summary.total, 0);
        assert_eq!(summary.passed, 0);
        assert_eq!(summary.failed, 0);
        assert_eq!(summary.pass_rate, 0.0);
        assert!(summary.avg_scores.is_empty());
    }

    /// The formatted failure must mention the criterion, both numbers with
    /// three decimals, and the attached details.
    #[test]
    fn test_failure_format() {
        let failure = Failure::new(
            "response_similarity",
            Value::String("expected".to_string()),
            Value::String("actual".to_string()),
            0.6,
            0.8,
        )
        .with_details("Responses differ significantly");

        let formatted = failure.format();
        assert!(formatted.contains("response_similarity"));
        assert!(formatted.contains("0.600"));
        assert!(formatted.contains("0.800"));
        assert!(formatted.contains("Details: Responses differ significantly"));
    }
}