1use chrono::{DateTime, Utc};
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use uuid::Uuid;
7
8use super::digest;
9use super::error::Result;
10
11#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
13#[serde(tag = "type", rename_all = "snake_case", content = "value")]
14pub enum ScorerType {
15 ExactMatch,
17
18 SemanticSimilarity,
20
21 ToolCallSequence,
23
24 Custom(String),
26}
27
28#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
30pub struct ScorerConfig {
31 pub name: String,
33
34 pub scorer_type: ScorerType,
36
37 pub params: serde_json::Value,
39}
40
41#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
43pub struct EvalThresholds {
44 pub min_pass_rate: f32,
46
47 pub max_regression: f32,
49
50 pub fail_fast: bool,
52}
53
54impl Default for EvalThresholds {
55 fn default() -> Self {
56 Self {
57 min_pass_rate: 0.95,
58 max_regression: 0.05,
59 fail_fast: false,
60 }
61 }
62}
63
64#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
66pub struct EvalTestCase {
67 pub case_id: Uuid,
69
70 pub inputs: serde_json::Value,
72
73 pub expected: Option<serde_json::Value>,
75
76 pub tags: Vec<String>,
78}
79
80impl EvalTestCase {
81 pub fn new(inputs: serde_json::Value, expected: Option<serde_json::Value>) -> Self {
83 Self {
84 case_id: Uuid::new_v4(),
85 inputs,
86 expected,
87 tags: Vec::new(),
88 }
89 }
90
91 pub fn with_tag(mut self, tag: String) -> Self {
93 self.tags.push(tag);
94 self
95 }
96}
97
98#[derive(Debug, Clone, Serialize, Deserialize)]
103pub struct EvalSuiteFields {
104 pub name: String,
106
107 pub version: String,
109
110 pub test_cases: Vec<EvalTestCase>,
112
113 pub scorers: Vec<ScorerConfig>,
115
116 pub thresholds: EvalThresholds,
118}
119
120#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
122pub struct EvalSuite {
123 pub suite_id: Uuid,
125
126 pub suite_digest: String,
128
129 pub name: String,
131
132 pub version: String,
134
135 pub test_cases: Vec<EvalTestCase>,
137
138 pub scorers: Vec<ScorerConfig>,
140
141 pub thresholds: EvalThresholds,
143
144 pub created_at: DateTime<Utc>,
146}
147
148impl EvalSuite {
149 pub fn new(name: String, version: String) -> Self {
151 Self {
152 suite_id: Uuid::new_v4(),
153 suite_digest: String::new(), name,
155 version,
156 test_cases: Vec::new(),
157 scorers: Vec::new(),
158 thresholds: EvalThresholds::default(),
159 created_at: Utc::now(),
160 }
161 }
162
163 pub fn add_test_case(mut self, test_case: EvalTestCase) -> Self {
165 self.test_cases.push(test_case);
166 self
167 }
168
169 pub fn add_scorer(mut self, scorer: ScorerConfig) -> Self {
171 self.scorers.push(scorer);
172 self
173 }
174
175 pub fn with_thresholds(mut self, thresholds: EvalThresholds) -> Self {
177 self.thresholds = thresholds;
178 self
179 }
180
181 pub fn compute_digest(fields: &EvalSuiteFields) -> Result<String> {
183 let json = serde_json::to_value(fields)?;
184 digest::compute_digest(&json)
185 }
186
187 pub fn finalize(mut self) -> Result<Self> {
189 let fields = EvalSuiteFields {
190 name: self.name.clone(),
191 version: self.version.clone(),
192 test_cases: self.test_cases.clone(),
193 scorers: self.scorers.clone(),
194 thresholds: self.thresholds.clone(),
195 };
196 self.suite_digest = Self::compute_digest(&fields)?;
197 Ok(self)
198 }
199}
200
201#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
203pub struct EvalCaseResult {
204 pub case_id: Uuid,
205 pub score: f32,
206 pub passed: bool,
207 pub actual: serde_json::Value,
208}
209
210#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
212pub struct EvalRunReport {
213 pub suite_digest: String,
214 pub seed: u64,
215 pub total_cases: usize,
216 pub passed_cases: usize,
217 pub pass_rate: f32,
218 pub overall_pass: bool,
219 pub case_results: Vec<EvalCaseResult>,
220}
221
/// Runner that scores already-computed outputs against an [`EvalSuite`].
#[derive(Clone, Copy, Debug)]
pub struct DeterministicEvalRunner {
    /// Seed recorded in every report. The scoring code in this file does not
    /// otherwise consume it.
    pub seed: u64,
}
227
228impl DeterministicEvalRunner {
229 pub fn new(seed: u64) -> Self {
230 Self { seed }
231 }
232
233 pub fn run_with_outputs(
239 &self,
240 suite: &EvalSuite,
241 actual_outputs: &HashMap<Uuid, serde_json::Value>,
242 ) -> Result<EvalRunReport> {
243 if suite.suite_digest.is_empty() {
244 return Err(super::error::AivcsError::DigestMismatch {
245 expected: "<non-empty>".to_string(),
246 actual: "<empty>".to_string(),
247 });
248 }
249
250 let mut case_results = Vec::with_capacity(suite.test_cases.len());
251
252 for case in &suite.test_cases {
253 let actual = actual_outputs
254 .get(&case.case_id)
255 .cloned()
256 .unwrap_or(serde_json::Value::Null);
257
258 let score = self.score_case(suite, case, &actual);
259 let passed = if case.expected.is_some() {
260 score >= 1.0
261 } else {
262 score > 0.0
263 };
264
265 case_results.push(EvalCaseResult {
266 case_id: case.case_id,
267 score,
268 passed,
269 actual,
270 });
271
272 if suite.thresholds.fail_fast && !passed {
273 break;
274 }
275 }
276
277 let passed_cases = case_results.iter().filter(|c| c.passed).count();
278 let total_cases = case_results.len();
279 let pass_rate = if total_cases == 0 {
280 1.0
281 } else {
282 passed_cases as f32 / total_cases as f32
283 };
284 let overall_pass = pass_rate >= suite.thresholds.min_pass_rate;
285
286 Ok(EvalRunReport {
287 suite_digest: suite.suite_digest.clone(),
288 seed: self.seed,
289 total_cases,
290 passed_cases,
291 pass_rate,
292 overall_pass,
293 case_results,
294 })
295 }
296
297 fn score_case(
298 &self,
299 suite: &EvalSuite,
300 case: &EvalTestCase,
301 actual: &serde_json::Value,
302 ) -> f32 {
303 if suite.scorers.is_empty() {
304 return match &case.expected {
305 Some(expected) => {
306 if expected == actual {
307 1.0
308 } else {
309 0.0
310 }
311 }
312 None => 1.0,
313 };
314 }
315
316 let mut scores = Vec::with_capacity(suite.scorers.len());
317 for scorer in &suite.scorers {
318 match scorer.scorer_type {
319 ScorerType::ExactMatch => {
320 let s = match &case.expected {
321 Some(expected) => {
322 if expected == actual {
323 1.0
324 } else {
325 0.0
326 }
327 }
328 None => 1.0,
329 };
330 scores.push(s);
331 }
332 ScorerType::SemanticSimilarity
335 | ScorerType::ToolCallSequence
336 | ScorerType::Custom(_) => {}
337 }
338 }
339
340 if scores.is_empty() {
341 return match &case.expected {
343 Some(expected) => {
344 if expected == actual {
345 1.0
346 } else {
347 0.0
348 }
349 }
350 None => 1.0,
351 };
352 }
353
354 scores.iter().sum::<f32>() / scores.len() as f32
355 }
356}
357
#[cfg(test)]
mod tests {
    use super::*;

    /// Test case with `{"input": <input>}` as inputs and
    /// `{"output": "expected"}` as the expected value.
    fn case_with_input(input: &str) -> EvalTestCase {
        EvalTestCase::new(
            serde_json::json!({ "input": input }),
            Some(serde_json::json!({"output": "expected"})),
        )
    }

    /// Exact-match scorer config with empty params and the given name.
    fn exact_scorer(name: &str) -> ScorerConfig {
        ScorerConfig {
            name: name.to_string(),
            scorer_type: ScorerType::ExactMatch,
            params: serde_json::json!({}),
        }
    }

    /// Unfinalized `test_suite` 1.0.0 with one case and one `scorer1` scorer.
    fn digest_fixture(input: &str) -> EvalSuite {
        EvalSuite::new("test_suite".to_string(), "1.0.0".to_string())
            .add_test_case(case_with_input(input))
            .add_scorer(exact_scorer("scorer1"))
    }

    /// Question/answer case with a fixed, caller-chosen case id.
    fn answer_case(id: &str, question: &str, answer: &str) -> EvalTestCase {
        let mut case = EvalTestCase::new(
            serde_json::json!({ "q": question }),
            Some(serde_json::json!({ "answer": answer })),
        );
        case.case_id = Uuid::parse_str(id).unwrap();
        case
    }

    /// Serializes `scorer_type` to JSON and back, asserting nothing is lost.
    fn assert_scorer_type_roundtrip(scorer_type: ScorerType) {
        let json = serde_json::to_string(&scorer_type).expect("serialize");
        let deserialized: ScorerType = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(scorer_type, deserialized);
    }

    #[test]
    fn test_eval_suite_serde_roundtrip() {
        let suite = EvalSuite::new("test_suite".to_string(), "1.0.0".to_string())
            .add_test_case(case_with_input("test"))
            .add_scorer(exact_scorer("exact_match"));

        let json = serde_json::to_string(&suite).expect("serialize");
        let deserialized: EvalSuite = serde_json::from_str(&json).expect("deserialize");

        assert_eq!(suite, deserialized);
    }

    #[test]
    fn test_eval_thresholds_defaults() {
        let EvalThresholds {
            min_pass_rate,
            max_regression,
            fail_fast,
        } = EvalThresholds::default();

        assert_eq!(min_pass_rate, 0.95);
        assert_eq!(max_regression, 0.05);
        assert!(!fail_fast);
    }

    #[test]
    fn test_scorer_type_exact_match() {
        assert_scorer_type_roundtrip(ScorerType::ExactMatch);
    }

    #[test]
    fn test_scorer_type_semantic_similarity() {
        assert_scorer_type_roundtrip(ScorerType::SemanticSimilarity);
    }

    #[test]
    fn test_scorer_type_custom_roundtrip() {
        assert_scorer_type_roundtrip(ScorerType::Custom("my_custom_scorer".to_string()));
    }

    #[test]
    fn test_scorer_type_tool_call_sequence() {
        assert_scorer_type_roundtrip(ScorerType::ToolCallSequence);
    }

    #[test]
    fn test_eval_test_case_new() {
        let test_case = case_with_input("test");

        assert_eq!(test_case.inputs, serde_json::json!({"input": "test"}));
        assert_eq!(
            test_case.expected,
            Some(serde_json::json!({"output": "expected"}))
        );
        assert!(test_case.tags.is_empty());
    }

    #[test]
    fn test_eval_test_case_with_tag() {
        let test_case = case_with_input("test").with_tag("critical".to_string());

        assert_eq!(test_case.tags, vec!["critical"]);
    }

    #[test]
    fn test_scorer_config_serde_roundtrip() {
        let config = ScorerConfig {
            name: "test_scorer".to_string(),
            scorer_type: ScorerType::SemanticSimilarity,
            params: serde_json::json!({"threshold": 0.8}),
        };

        let json = serde_json::to_string(&config).expect("serialize");
        let deserialized: ScorerConfig = serde_json::from_str(&json).expect("deserialize");

        assert_eq!(config, deserialized);
    }

    #[test]
    fn test_eval_suite_fluent_api() {
        let thresholds = EvalThresholds {
            min_pass_rate: 0.90,
            max_regression: 0.10,
            fail_fast: true,
        };
        let suite = EvalSuite::new("test".to_string(), "1.0.0".to_string())
            .add_test_case(case_with_input("test"))
            .add_scorer(exact_scorer("scorer1"))
            .with_thresholds(thresholds);

        assert_eq!(suite.test_cases.len(), 1);
        assert_eq!(suite.scorers.len(), 1);
        assert_eq!(suite.thresholds.min_pass_rate, 0.90);
        assert!(suite.thresholds.fail_fast);
    }

    #[test]
    fn test_eval_suite_finalize_sets_digest() {
        let suite = digest_fixture("test");

        // The digest stays empty until `finalize` runs.
        assert_eq!(suite.suite_digest, "");

        let finalized = suite.finalize().expect("finalize suite");

        assert!(!finalized.suite_digest.is_empty());
        // The digest is expected to be 64 hexadecimal characters.
        assert_eq!(finalized.suite_digest.len(), 64);
        assert!(finalized
            .suite_digest
            .chars()
            .all(|c: char| c.is_ascii_hexdigit()));
    }

    #[test]
    fn test_eval_suite_digest_stable() {
        let suite = digest_fixture("test");

        let finalized1 = suite.clone().finalize().expect("finalize suite 1");
        let finalized2 = suite.finalize().expect("finalize suite 2");

        assert_eq!(
            finalized1.suite_digest, finalized2.suite_digest,
            "finalizing same suite object twice should produce same digest"
        );
    }

    #[test]
    fn test_eval_suite_digest_changes_on_mutation() {
        let finalized1 = digest_fixture("test")
            .finalize()
            .expect("finalize suite 1");

        // Same suite except for the input payload of its single test case.
        let finalized2 = digest_fixture("different_test")
            .finalize()
            .expect("finalize suite 2");

        assert_ne!(
            finalized1.suite_digest, finalized2.suite_digest,
            "different test cases should produce different digest"
        );
    }

    #[test]
    fn test_eval_suite_digest_version_change() {
        let finalized1 = EvalSuite::new("test_suite".to_string(), "1.0.0".to_string())
            .finalize()
            .expect("finalize suite 1");
        let finalized2 = EvalSuite::new("test_suite".to_string(), "1.0.1".to_string())
            .finalize()
            .expect("finalize suite 2");

        assert_ne!(
            finalized1.suite_digest, finalized2.suite_digest,
            "different version should produce different digest"
        );
    }

    #[test]
    fn test_deterministic_eval_runner_stable_score() {
        let case1 = answer_case("11111111-1111-1111-1111-111111111111", "2+2", "4");
        let case2 = answer_case("22222222-2222-2222-2222-222222222222", "3*3", "9");
        let (id1, id2) = (case1.case_id, case2.case_id);

        let suite = EvalSuite::new("golden-suite".to_string(), "1.0.0".to_string())
            .add_test_case(case1)
            .add_test_case(case2)
            .add_scorer(exact_scorer("exact"))
            .with_thresholds(EvalThresholds {
                min_pass_rate: 0.5,
                max_regression: 0.0,
                fail_fast: false,
            })
            .finalize()
            .unwrap();

        // First output matches its expected answer, the second does not.
        let outputs = HashMap::from([
            (id1, serde_json::json!({"answer":"4"})),
            (id2, serde_json::json!({"answer":"8"})),
        ]);

        let runner = DeterministicEvalRunner::new(42);
        let report1 = runner.run_with_outputs(&suite, &outputs).unwrap();
        let report2 = runner.run_with_outputs(&suite, &outputs).unwrap();

        assert_eq!(report1, report2);
        assert_eq!(report1.total_cases, 2);
        assert_eq!(report1.passed_cases, 1);
        assert_eq!(report1.pass_rate, 0.5);
        assert!(report1.overall_pass);
    }

    #[test]
    fn test_deterministic_eval_runner_golden_output() {
        let case = answer_case("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", "2+2", "4");
        let case_id = case.case_id;

        let suite = EvalSuite::new("golden".to_string(), "1.0.0".to_string())
            .add_test_case(case)
            .add_scorer(exact_scorer("exact"))
            .finalize()
            .unwrap();

        let outputs = HashMap::from([(case_id, serde_json::json!({"answer":"4"}))]);

        let report = DeterministicEvalRunner::new(7)
            .run_with_outputs(&suite, &outputs)
            .unwrap();
        let actual = serde_json::to_value(&report).unwrap();
        let expected = serde_json::json!({
            "suite_digest": suite.suite_digest,
            "seed": 7,
            "total_cases": 1,
            "passed_cases": 1,
            "pass_rate": 1.0,
            "overall_pass": true,
            "case_results": [
                {
                    "case_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
                    "score": 1.0,
                    "passed": true,
                    "actual": {
                        "answer": "4"
                    }
                }
            ]
        });
        assert_eq!(actual, expected);
    }
}