use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::collections::HashMap;
use std::time::Duration;
/// Report for a single evaluation run: the individual results plus an
/// aggregate summary and timing information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationReport {
/// Identifier of the run this report belongs to.
pub run_id: String,
/// UTC start time, as supplied by the caller of `EvaluationReport::new`.
pub started_at: chrono::DateTime<chrono::Utc>,
/// UTC completion time; `EvaluationReport::new` stamps this with the current time.
pub completed_at: chrono::DateTime<chrono::Utc>,
/// `completed_at - started_at`; `EvaluationReport::new` stores a zero
/// duration when that difference cannot be converted to a std `Duration`.
pub duration: Duration,
/// Every result contained in this report.
pub results: Vec<EvaluationResult>,
/// Aggregate statistics; `EvaluationReport::new` derives them from `results`.
pub summary: EvaluationSummary,
}
impl EvaluationReport {
    /// Builds a report for `run_id` from the finished `results`.
    ///
    /// `completed_at` is stamped with the current UTC time and `summary` is
    /// computed from `results`. `duration` is `completed_at - started_at`; if
    /// that span cannot be converted to a `std::time::Duration` (e.g. a
    /// `started_at` that lies in the future), it falls back to zero.
    pub fn new(
        run_id: &str,
        results: Vec<EvaluationResult>,
        started_at: chrono::DateTime<chrono::Utc>,
    ) -> Self {
        let completed_at = chrono::Utc::now();
        // Best effort: a failed conversion yields a zero duration rather than a panic.
        let duration = (completed_at - started_at).to_std().unwrap_or_default();
        let summary = EvaluationSummary::from_results(&results);
        Self { run_id: run_id.to_string(), started_at, completed_at, duration, results, summary }
    }

    /// Returns `true` when the summary records no failed results.
    ///
    /// A report with no results at all therefore also counts as passed.
    pub fn all_passed(&self) -> bool {
        self.summary.failed == 0
    }

    /// Returns references to every result whose `passed` flag is `false`,
    /// in the order they appear in `results`.
    pub fn failures(&self) -> Vec<&EvaluationResult> {
        self.results.iter().filter(|r| !r.passed).collect()
    }

    /// Renders a human-readable, multi-line summary of the report.
    ///
    /// The text contains the run id, the duration (debug-formatted), the
    /// total/passed/failed counts and pass rate, then — when present — the
    /// average score per criterion and the list of failed results with the
    /// criteria each one failed.
    ///
    /// Average scores are listed in ascending criterion-name order so the
    /// output is identical from run to run.
    pub fn format_summary(&self) -> String {
        let summary = &self.summary;
        let mut output = String::new();
        output.push_str(&format!("Evaluation Report: {}\n", self.run_id));
        output.push_str(&format!("Duration: {:?}\n", self.duration));
        output.push_str("\nSummary:\n");
        output.push_str(&format!(" Total: {}\n", summary.total));
        output.push_str(&format!(" Passed: {}\n", summary.passed));
        output.push_str(&format!(" Failed: {}\n", summary.failed));
        output.push_str(&format!(" Pass Rate: {:.1}%\n", summary.pass_rate * 100.0));

        if !summary.avg_scores.is_empty() {
            // `HashMap` iteration order is arbitrary and differs between
            // processes; sort by criterion name to keep the report stable.
            // Keys are unique, so an unstable sort cannot reorder equal items.
            let mut averages: Vec<_> = summary.avg_scores.iter().collect();
            averages.sort_unstable_by(|a, b| a.0.cmp(b.0));

            output.push_str("\nAverage Scores:\n");
            for (criterion, score) in averages {
                output.push_str(&format!(" {}: {:.3}\n", criterion, score));
            }
        }

        if summary.failed > 0 {
            output.push_str("\nFailed Tests:\n");
            for result in self.failures() {
                let criteria: Vec<&str> =
                    result.failures.iter().map(|f| f.criterion.as_str()).collect();
                output.push_str(&format!(" - {} ({})\n", result.eval_id, criteria.join(", ")));
            }
        }

        output
    }

    /// Serializes the whole report as pretty-printed JSON.
    ///
    /// # Errors
    ///
    /// Returns the `serde_json::Error` produced by the serializer, if any.
    pub fn to_json(&self) -> Result<String, serde_json::Error> {
        serde_json::to_string_pretty(self)
    }
}
/// Aggregate statistics over a set of evaluation results.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct EvaluationSummary {
/// Number of results that were summarized.
pub total: usize,
/// Number of results whose `passed` flag was `true`.
pub passed: usize,
/// Number of results that did not pass (`total - passed`).
pub failed: usize,
/// `passed / total` as a fraction; `0.0` when there are no results.
pub pass_rate: f64,
/// Mean score per criterion, averaged over the results that reported
/// a score for that criterion.
pub avg_scores: HashMap<String, f64>,
}
impl EvaluationSummary {
    /// Computes counts, pass rate, and per-criterion mean scores for `results`.
    ///
    /// An empty slice produces a pass rate of `0.0` and no average scores.
    /// Each criterion is averaged only over the results that contain it.
    pub fn from_results(results: &[EvaluationResult]) -> Self {
        let total = results.len();
        let failed = results.iter().filter(|r| !r.passed).count();
        let passed = total - failed;
        let pass_rate = match total {
            0 => 0.0,
            n => passed as f64 / n as f64,
        };

        // Running (sum, sample count) per criterion, accumulated in result order.
        let mut totals: HashMap<&str, (f64, usize)> = HashMap::new();
        for (name, value) in results.iter().flat_map(|r| r.scores.iter()) {
            let (sum, samples) = totals.entry(name.as_str()).or_insert((0.0, 0));
            *sum += *value;
            *samples += 1;
        }

        let avg_scores = totals
            .into_iter()
            .map(|(name, (sum, samples))| (name.to_owned(), sum / samples as f64))
            .collect();

        Self { total, passed, failed, pass_rate, avg_scores }
    }
}
/// Result of a single evaluation case.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationResult {
/// Identifier of the evaluation case.
pub eval_id: String,
/// Whether the case passed.
pub passed: bool,
/// Score per criterion name.
pub scores: HashMap<String, f64>,
/// Recorded failures; empty for results built with `EvaluationResult::passed`.
pub failures: Vec<Failure>,
/// Duration recorded for this case.
/// NOTE(review): presumably the time spent running the case — confirm with the producer.
pub duration: Duration,
/// Per-turn details; defaults to an empty list when missing from deserialized input.
#[serde(default)]
pub turn_results: Vec<TurnResult>,
}
impl EvaluationResult {
    /// Creates a passing result for `eval_id` with no failures and no turn results.
    pub fn passed(eval_id: &str, scores: HashMap<String, f64>, duration: Duration) -> Self {
        // Same shape as a failed result with nothing recorded, but flagged as passed.
        Self { passed: true, ..Self::failed(eval_id, scores, Vec::new(), duration) }
    }

    /// Creates a failing result for `eval_id` carrying the given `failures`
    /// and no turn results.
    pub fn failed(
        eval_id: &str,
        scores: HashMap<String, f64>,
        failures: Vec<Failure>,
        duration: Duration,
    ) -> Self {
        let eval_id = eval_id.to_owned();
        Self { eval_id, passed: false, scores, failures, duration, turn_results: Vec::new() }
    }

    /// Consumes the result and returns it with `turn_results` replaced.
    pub fn with_turn_results(self, turn_results: Vec<TurnResult>) -> Self {
        Self { turn_results, ..self }
    }
}
/// Description of one criterion failure within an evaluation result.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Failure {
/// Name of the criterion this failure refers to.
pub criterion: String,
/// Expected value, as arbitrary JSON.
pub expected: Value,
/// Actual value, as arbitrary JSON.
pub actual: Value,
/// Score recorded for the criterion.
pub score: f64,
/// Threshold the score is reported against; `Failure::format` renders
/// the pair as "score < threshold".
pub threshold: f64,
/// Optional free-form explanation; defaults to `None` when missing from
/// deserialized input.
#[serde(default)]
pub details: Option<String>,
}
impl Failure {
    /// Creates a failure record for `criterion` with no details attached.
    pub fn new(
        criterion: &str,
        expected: Value,
        actual: Value,
        score: f64,
        threshold: f64,
    ) -> Self {
        let criterion = criterion.to_owned();
        Self { criterion, expected, actual, score, threshold, details: None }
    }

    /// Consumes the failure and returns it with `details` set to the given text.
    pub fn with_details(self, details: &str) -> Self {
        Self { details: Some(details.to_owned()), ..self }
    }

    /// Renders the failure as `"<criterion>: score <s> < threshold <t>"` with
    /// both numbers shown to three decimals, followed by a `Details:` line
    /// when details are present.
    pub fn format(&self) -> String {
        let headline = format!(
            "{}: score {:.3} < threshold {:.3}",
            self.criterion, self.score, self.threshold
        );
        match self.details.as_deref() {
            Some(details) => format!("{}\n Details: {}", headline, details),
            None => headline,
        }
    }
}
/// Detailed comparison data for a single entry in `EvaluationResult::turn_results`.
/// NOTE(review): presumably one conversational turn / invocation — confirm with the evaluator.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TurnResult {
/// Identifier of the invocation this entry describes.
pub invocation_id: String,
/// Response that was actually produced, if any.
pub actual_response: Option<String>,
/// Response that was expected, if any.
pub expected_response: Option<String>,
/// Tool calls that were actually made.
pub actual_tool_calls: Vec<crate::schema::ToolUse>,
/// Tool calls that were expected.
pub expected_tool_calls: Vec<crate::schema::ToolUse>,
/// Score per criterion name for this entry.
pub scores: HashMap<String, f64>,
}
/// Alias for [`EvaluationResult`]; both names refer to the same type.
pub type TestCaseResult = EvaluationResult;
#[cfg(test)]
mod tests {
    use super::*;

    /// Two passing results and one failing result must yield matching counts,
    /// pass rate, and per-criterion average.
    #[test]
    fn test_evaluation_summary() {
        let results = vec![
            EvaluationResult::passed(
                "test_1",
                HashMap::from([("tool_trajectory".to_string(), 1.0)]),
                Duration::from_millis(100),
            ),
            EvaluationResult::passed(
                "test_2",
                HashMap::from([("tool_trajectory".to_string(), 0.8)]),
                Duration::from_millis(150),
            ),
            EvaluationResult::failed(
                "test_3",
                HashMap::from([("tool_trajectory".to_string(), 0.5)]),
                vec![Failure::new("tool_trajectory", Value::Null, Value::Null, 0.5, 0.8)],
                Duration::from_millis(200),
            ),
        ];
        let summary = EvaluationSummary::from_results(&results);
        assert_eq!(summary.total, 3);
        assert_eq!(summary.passed, 2);
        assert_eq!(summary.failed, 1);
        assert!((summary.pass_rate - 0.666).abs() < 0.01);

        // The only criterion present is averaged over all three results:
        // (1.0 + 0.8 + 0.5) / 3.
        assert_eq!(summary.avg_scores.len(), 1);
        let average = summary
            .avg_scores
            .get("tool_trajectory")
            .copied()
            .expect("tool_trajectory was scored by every result");
        assert!((average - 2.3 / 3.0).abs() < 1e-9);
    }

    /// The formatted failure must contain the criterion, both numbers at three
    /// decimals, and the details text when details are attached.
    #[test]
    fn test_failure_format() {
        let failure = Failure::new(
            "response_similarity",
            Value::String("expected".to_string()),
            Value::String("actual".to_string()),
            0.6,
            0.8,
        )
        .with_details("Responses differ significantly");
        let formatted = failure.format();
        assert!(formatted.contains("response_similarity"));
        assert!(formatted.contains("0.600"));
        assert!(formatted.contains("0.800"));
        assert!(formatted.contains("Details: Responses differ significantly"));

        // Without details, no details line is rendered.
        let bare = Failure::new("response_similarity", Value::Null, Value::Null, 0.6, 0.8);
        assert!(!bare.format().contains("Details"));
    }
}