use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use uuid::Uuid;
use super::digest;
use super::error::Result;
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(tag = "type", rename_all = "snake_case", content = "value")]
pub enum ScorerType {
ExactMatch,
SemanticSimilarity,
ToolCallSequence,
Custom(String),
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct ScorerConfig {
pub name: String,
pub scorer_type: ScorerType,
pub params: serde_json::Value,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct EvalThresholds {
pub min_pass_rate: f32,
pub max_regression: f32,
pub fail_fast: bool,
}
impl Default for EvalThresholds {
fn default() -> Self {
Self {
min_pass_rate: 0.95,
max_regression: 0.05,
fail_fast: false,
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct EvalTestCase {
pub case_id: Uuid,
pub inputs: serde_json::Value,
pub expected: Option<serde_json::Value>,
pub tags: Vec<String>,
}
impl EvalTestCase {
pub fn new(inputs: serde_json::Value, expected: Option<serde_json::Value>) -> Self {
Self {
case_id: Uuid::new_v4(),
inputs,
expected,
tags: Vec::new(),
}
}
pub fn with_tag(mut self, tag: String) -> Self {
self.tags.push(tag);
self
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSuiteFields {
pub name: String,
pub version: String,
pub test_cases: Vec<EvalTestCase>,
pub scorers: Vec<ScorerConfig>,
pub thresholds: EvalThresholds,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct EvalSuite {
pub suite_id: Uuid,
pub suite_digest: String,
pub name: String,
pub version: String,
pub test_cases: Vec<EvalTestCase>,
pub scorers: Vec<ScorerConfig>,
pub thresholds: EvalThresholds,
pub created_at: DateTime<Utc>,
}
impl EvalSuite {
pub fn new(name: String, version: String) -> Self {
Self {
suite_id: Uuid::new_v4(),
suite_digest: String::new(), name,
version,
test_cases: Vec::new(),
scorers: Vec::new(),
thresholds: EvalThresholds::default(),
created_at: Utc::now(),
}
}
pub fn add_test_case(mut self, test_case: EvalTestCase) -> Self {
self.test_cases.push(test_case);
self
}
pub fn add_scorer(mut self, scorer: ScorerConfig) -> Self {
self.scorers.push(scorer);
self
}
pub fn with_thresholds(mut self, thresholds: EvalThresholds) -> Self {
self.thresholds = thresholds;
self
}
pub fn compute_digest(fields: &EvalSuiteFields) -> Result<String> {
let json = serde_json::to_value(fields)?;
digest::compute_digest(&json)
}
pub fn finalize(mut self) -> Result<Self> {
let fields = EvalSuiteFields {
name: self.name.clone(),
version: self.version.clone(),
test_cases: self.test_cases.clone(),
scorers: self.scorers.clone(),
thresholds: self.thresholds.clone(),
};
self.suite_digest = Self::compute_digest(&fields)?;
Ok(self)
}
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct EvalCaseResult {
pub case_id: Uuid,
pub score: f32,
pub passed: bool,
pub actual: serde_json::Value,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct EvalRunReport {
pub suite_digest: String,
pub seed: u64,
pub total_cases: usize,
pub passed_cases: usize,
pub pass_rate: f32,
pub overall_pass: bool,
pub case_results: Vec<EvalCaseResult>,
}
#[derive(Debug, Clone, Copy)]
pub struct DeterministicEvalRunner {
pub seed: u64,
}
impl DeterministicEvalRunner {
pub fn new(seed: u64) -> Self {
Self { seed }
}
pub fn run_with_outputs(
&self,
suite: &EvalSuite,
actual_outputs: &HashMap<Uuid, serde_json::Value>,
) -> Result<EvalRunReport> {
if suite.suite_digest.is_empty() {
return Err(super::error::AivcsError::DigestMismatch {
expected: "<non-empty>".to_string(),
actual: "<empty>".to_string(),
});
}
let mut case_results = Vec::with_capacity(suite.test_cases.len());
for case in &suite.test_cases {
let actual = actual_outputs
.get(&case.case_id)
.cloned()
.unwrap_or(serde_json::Value::Null);
let score = self.score_case(suite, case, &actual);
let passed = if case.expected.is_some() {
score >= 1.0
} else {
score > 0.0
};
case_results.push(EvalCaseResult {
case_id: case.case_id,
score,
passed,
actual,
});
if suite.thresholds.fail_fast && !passed {
break;
}
}
let passed_cases = case_results.iter().filter(|c| c.passed).count();
let total_cases = case_results.len();
let pass_rate = if total_cases == 0 {
1.0
} else {
passed_cases as f32 / total_cases as f32
};
let overall_pass = pass_rate >= suite.thresholds.min_pass_rate;
Ok(EvalRunReport {
suite_digest: suite.suite_digest.clone(),
seed: self.seed,
total_cases,
passed_cases,
pass_rate,
overall_pass,
case_results,
})
}
fn score_case(
&self,
suite: &EvalSuite,
case: &EvalTestCase,
actual: &serde_json::Value,
) -> f32 {
if suite.scorers.is_empty() {
return match &case.expected {
Some(expected) => {
if expected == actual {
1.0
} else {
0.0
}
}
None => 1.0,
};
}
let mut scores = Vec::with_capacity(suite.scorers.len());
for scorer in &suite.scorers {
match scorer.scorer_type {
ScorerType::ExactMatch => {
let s = match &case.expected {
Some(expected) => {
if expected == actual {
1.0
} else {
0.0
}
}
None => 1.0,
};
scores.push(s);
}
ScorerType::SemanticSimilarity
| ScorerType::ToolCallSequence
| ScorerType::Custom(_) => {}
}
}
if scores.is_empty() {
return match &case.expected {
Some(expected) => {
if expected == actual {
1.0
} else {
0.0
}
}
None => 1.0,
};
}
scores.iter().sum::<f32>() / scores.len() as f32
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_eval_suite_serde_roundtrip() {
let suite = EvalSuite::new("test_suite".to_string(), "1.0.0".to_string())
.add_test_case(EvalTestCase::new(
serde_json::json!({"input": "test"}),
Some(serde_json::json!({"output": "expected"})),
))
.add_scorer(ScorerConfig {
name: "exact_match".to_string(),
scorer_type: ScorerType::ExactMatch,
params: serde_json::json!({}),
});
let json = serde_json::to_string(&suite).expect("serialize");
let deserialized: EvalSuite = serde_json::from_str(&json).expect("deserialize");
assert_eq!(suite, deserialized);
}
#[test]
fn test_eval_thresholds_defaults() {
let thresholds = EvalThresholds::default();
assert_eq!(thresholds.min_pass_rate, 0.95);
assert_eq!(thresholds.max_regression, 0.05);
assert!(!thresholds.fail_fast);
}
#[test]
fn test_scorer_type_exact_match() {
let scorer_type = ScorerType::ExactMatch;
let json = serde_json::to_string(&scorer_type).expect("serialize");
let deserialized: ScorerType = serde_json::from_str(&json).expect("deserialize");
assert_eq!(scorer_type, deserialized);
}
#[test]
fn test_scorer_type_semantic_similarity() {
let scorer_type = ScorerType::SemanticSimilarity;
let json = serde_json::to_string(&scorer_type).expect("serialize");
let deserialized: ScorerType = serde_json::from_str(&json).expect("deserialize");
assert_eq!(scorer_type, deserialized);
}
#[test]
fn test_scorer_type_custom_roundtrip() {
let scorer_type = ScorerType::Custom("my_custom_scorer".to_string());
let json = serde_json::to_string(&scorer_type).expect("serialize");
let deserialized: ScorerType = serde_json::from_str(&json).expect("deserialize");
assert_eq!(scorer_type, deserialized);
}
#[test]
fn test_scorer_type_tool_call_sequence() {
let scorer_type = ScorerType::ToolCallSequence;
let json = serde_json::to_string(&scorer_type).expect("serialize");
let deserialized: ScorerType = serde_json::from_str(&json).expect("deserialize");
assert_eq!(scorer_type, deserialized);
}
#[test]
fn test_eval_test_case_new() {
let test_case = EvalTestCase::new(
serde_json::json!({"input": "test"}),
Some(serde_json::json!({"output": "expected"})),
);
assert_eq!(test_case.inputs, serde_json::json!({"input": "test"}));
assert_eq!(
test_case.expected,
Some(serde_json::json!({"output": "expected"}))
);
assert!(test_case.tags.is_empty());
}
#[test]
fn test_eval_test_case_with_tag() {
let test_case = EvalTestCase::new(
serde_json::json!({"input": "test"}),
Some(serde_json::json!({"output": "expected"})),
)
.with_tag("critical".to_string());
assert_eq!(test_case.tags, vec!["critical"]);
}
#[test]
fn test_scorer_config_serde_roundtrip() {
let config = ScorerConfig {
name: "test_scorer".to_string(),
scorer_type: ScorerType::SemanticSimilarity,
params: serde_json::json!({"threshold": 0.8}),
};
let json = serde_json::to_string(&config).expect("serialize");
let deserialized: ScorerConfig = serde_json::from_str(&json).expect("deserialize");
assert_eq!(config, deserialized);
}
#[test]
fn test_eval_suite_fluent_api() {
let suite = EvalSuite::new("test".to_string(), "1.0.0".to_string())
.add_test_case(EvalTestCase::new(
serde_json::json!({"input": "test"}),
Some(serde_json::json!({"output": "expected"})),
))
.add_scorer(ScorerConfig {
name: "scorer1".to_string(),
scorer_type: ScorerType::ExactMatch,
params: serde_json::json!({}),
})
.with_thresholds(EvalThresholds {
min_pass_rate: 0.90,
max_regression: 0.10,
fail_fast: true,
});
assert_eq!(suite.test_cases.len(), 1);
assert_eq!(suite.scorers.len(), 1);
assert_eq!(suite.thresholds.min_pass_rate, 0.90);
assert!(suite.thresholds.fail_fast);
}
#[test]
fn test_eval_suite_finalize_sets_digest() {
let suite = EvalSuite::new("test_suite".to_string(), "1.0.0".to_string())
.add_test_case(EvalTestCase::new(
serde_json::json!({"input": "test"}),
Some(serde_json::json!({"output": "expected"})),
))
.add_scorer(ScorerConfig {
name: "scorer1".to_string(),
scorer_type: ScorerType::ExactMatch,
params: serde_json::json!({}),
});
assert_eq!(suite.suite_digest, "");
let finalized = suite.finalize().expect("finalize suite");
assert!(!finalized.suite_digest.is_empty());
assert_eq!(finalized.suite_digest.len(), 64);
assert!(finalized
.suite_digest
.chars()
.all(|c: char| c.is_ascii_hexdigit()));
}
#[test]
fn test_eval_suite_digest_stable() {
let suite = EvalSuite::new("test_suite".to_string(), "1.0.0".to_string())
.add_test_case(EvalTestCase::new(
serde_json::json!({"input": "test"}),
Some(serde_json::json!({"output": "expected"})),
))
.add_scorer(ScorerConfig {
name: "scorer1".to_string(),
scorer_type: ScorerType::ExactMatch,
params: serde_json::json!({}),
});
let finalized1 = suite.clone().finalize().expect("finalize suite 1");
let finalized2 = suite.finalize().expect("finalize suite 2");
assert_eq!(
finalized1.suite_digest, finalized2.suite_digest,
"finalizing same suite object twice should produce same digest"
);
}
#[test]
fn test_eval_suite_digest_changes_on_mutation() {
let suite1 = EvalSuite::new("test_suite".to_string(), "1.0.0".to_string())
.add_test_case(EvalTestCase::new(
serde_json::json!({"input": "test"}),
Some(serde_json::json!({"output": "expected"})),
))
.add_scorer(ScorerConfig {
name: "scorer1".to_string(),
scorer_type: ScorerType::ExactMatch,
params: serde_json::json!({}),
});
let finalized1 = suite1.finalize().expect("finalize suite 1");
let suite2 = EvalSuite::new("test_suite".to_string(), "1.0.0".to_string())
.add_test_case(EvalTestCase::new(
serde_json::json!({"input": "different_test"}),
Some(serde_json::json!({"output": "expected"})),
))
.add_scorer(ScorerConfig {
name: "scorer1".to_string(),
scorer_type: ScorerType::ExactMatch,
params: serde_json::json!({}),
});
let finalized2 = suite2.finalize().expect("finalize suite 2");
assert_ne!(
finalized1.suite_digest, finalized2.suite_digest,
"different test cases should produce different digest"
);
}
#[test]
fn test_eval_suite_digest_version_change() {
let suite1 = EvalSuite::new("test_suite".to_string(), "1.0.0".to_string());
let finalized1 = suite1.finalize().expect("finalize suite 1");
let suite2 = EvalSuite::new("test_suite".to_string(), "1.0.1".to_string());
let finalized2 = suite2.finalize().expect("finalize suite 2");
assert_ne!(
finalized1.suite_digest, finalized2.suite_digest,
"different version should produce different digest"
);
}
#[test]
fn test_deterministic_eval_runner_stable_score() {
let mut case1 = EvalTestCase::new(
serde_json::json!({"q":"2+2"}),
Some(serde_json::json!({"answer":"4"})),
);
case1.case_id = Uuid::parse_str("11111111-1111-1111-1111-111111111111").unwrap();
let mut case2 = EvalTestCase::new(
serde_json::json!({"q":"3*3"}),
Some(serde_json::json!({"answer":"9"})),
);
case2.case_id = Uuid::parse_str("22222222-2222-2222-2222-222222222222").unwrap();
let suite = EvalSuite::new("golden-suite".to_string(), "1.0.0".to_string())
.add_test_case(case1.clone())
.add_test_case(case2.clone())
.add_scorer(ScorerConfig {
name: "exact".to_string(),
scorer_type: ScorerType::ExactMatch,
params: serde_json::json!({}),
})
.with_thresholds(EvalThresholds {
min_pass_rate: 0.5,
max_regression: 0.0,
fail_fast: false,
})
.finalize()
.unwrap();
let mut outputs = HashMap::new();
outputs.insert(case1.case_id, serde_json::json!({"answer":"4"}));
outputs.insert(case2.case_id, serde_json::json!({"answer":"8"}));
let runner = DeterministicEvalRunner::new(42);
let report1 = runner.run_with_outputs(&suite, &outputs).unwrap();
let report2 = runner.run_with_outputs(&suite, &outputs).unwrap();
assert_eq!(report1, report2);
assert_eq!(report1.total_cases, 2);
assert_eq!(report1.passed_cases, 1);
assert_eq!(report1.pass_rate, 0.5);
assert!(report1.overall_pass);
}
#[test]
fn test_deterministic_eval_runner_golden_output() {
let mut case = EvalTestCase::new(
serde_json::json!({"q":"2+2"}),
Some(serde_json::json!({"answer":"4"})),
);
case.case_id = Uuid::parse_str("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa").unwrap();
let suite = EvalSuite::new("golden".to_string(), "1.0.0".to_string())
.add_test_case(case.clone())
.add_scorer(ScorerConfig {
name: "exact".to_string(),
scorer_type: ScorerType::ExactMatch,
params: serde_json::json!({}),
})
.finalize()
.unwrap();
let mut outputs = HashMap::new();
outputs.insert(case.case_id, serde_json::json!({"answer":"4"}));
let report = DeterministicEvalRunner::new(7)
.run_with_outputs(&suite, &outputs)
.unwrap();
let actual = serde_json::to_value(&report).unwrap();
let expected = serde_json::json!({
"suite_digest": suite.suite_digest,
"seed": 7,
"total_cases": 1,
"passed_cases": 1,
"pass_rate": 1.0,
"overall_pass": true,
"case_results": [
{
"case_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
"score": 1.0,
"passed": true,
"actual": {
"answer": "4"
}
}
]
});
assert_eq!(actual, expected);
}
}