Skip to main content

aivcs_core/domain/
eval.rs

1//! Evaluation suite definitions and scoring configuration.
2
3use chrono::{DateTime, Utc};
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6use uuid::Uuid;
7
8use super::digest;
9use super::error::Result;
10
/// Enumeration of available scorer types.
///
/// Serialized with serde's adjacently tagged representation: the variant name
/// (in `snake_case`) is stored under the `"type"` key and, for variants that
/// carry data, the payload is stored under the `"value"` key.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(tag = "type", rename_all = "snake_case", content = "value")]
pub enum ScorerType {
    /// Exact match comparison between the expected and the actual output.
    ExactMatch,

    /// Semantic similarity (embeddings-based).
    SemanticSimilarity,

    /// Tool call sequence matching.
    ToolCallSequence,

    /// Custom scorer extension; the string payload names the custom scorer.
    Custom(String),
}
27
/// Configuration for a single scorer in an evaluation.
///
/// `Eq` is not derived; only `PartialEq` is available for comparisons.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct ScorerConfig {
    /// Name of this scorer (for reporting).
    pub name: String,

    /// Type of scorer.
    pub scorer_type: ScorerType,

    /// Scorer-specific parameters, kept as free-form JSON.
    pub params: serde_json::Value,
}
40
/// Thresholds for evaluation pass/fail criteria.
///
/// The `Default` implementation in this file yields a 0.95 minimum pass rate,
/// a 0.05 maximum regression, and `fail_fast` disabled.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct EvalThresholds {
    /// Minimum pass rate (0.0–1.0) for suite to pass.
    pub min_pass_rate: f32,

    /// Maximum allowed regression rate (0.0–1.0) vs. baseline.
    ///
    /// NOTE(review): `DeterministicEvalRunner` in this file does not read this
    /// field; presumably it is consumed by a baseline comparison elsewhere.
    pub max_regression: f32,

    /// If true, stop on first failure.
    pub fail_fast: bool,
}
53
54impl Default for EvalThresholds {
55    fn default() -> Self {
56        Self {
57            min_pass_rate: 0.95,
58            max_regression: 0.05,
59            fail_fast: false,
60        }
61    }
62}
63
/// A single test case within an evaluation suite.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct EvalTestCase {
    /// Unique identifier for this test case.
    ///
    /// Also the key used to look up the actual output for this case when a
    /// suite is run with `DeterministicEvalRunner::run_with_outputs`.
    pub case_id: Uuid,

    /// Input provided to the agent.
    pub inputs: serde_json::Value,

    /// Expected output (optional for scoring-based evals).
    pub expected: Option<serde_json::Value>,

    /// Tags for categorizing/filtering test cases.
    pub tags: Vec<String>,
}
79
80impl EvalTestCase {
81    /// Create a new test case.
82    pub fn new(inputs: serde_json::Value, expected: Option<serde_json::Value>) -> Self {
83        Self {
84            case_id: Uuid::new_v4(),
85            inputs,
86            expected,
87            tags: Vec::new(),
88        }
89    }
90
91    /// Add a tag to this test case.
92    pub fn with_tag(mut self, tag: String) -> Self {
93        self.tags.push(tag);
94        self
95    }
96}
97
/// Fields that define an evaluation suite's semantic identity.
///
/// This struct contains only the fields that contribute to the suite's digest,
/// excluding suite_id, suite_digest, and created_at. It is the value that
/// `EvalSuite::compute_digest` serializes before hashing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSuiteFields {
    /// Name of the evaluation.
    pub name: String,

    /// Version string for the suite.
    pub version: String,

    /// Test cases in this suite.
    ///
    /// Each case is serialized in full, so its `case_id` is part of the
    /// digest input as well.
    pub test_cases: Vec<EvalTestCase>,

    /// Scorers to use for evaluation.
    pub scorers: Vec<ScorerConfig>,

    /// Pass/fail thresholds.
    pub thresholds: EvalThresholds,
}
119
/// A complete evaluation suite for testing an agent.
///
/// A suite is built with `EvalSuite::new` plus the chained `add_*`/`with_*`
/// methods and then sealed with `EvalSuite::finalize`, which fills in
/// `suite_digest`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct EvalSuite {
    /// Unique identifier for this suite.
    pub suite_id: Uuid,

    /// SHA256 hex digest of canonical suite definition.
    ///
    /// Empty until `finalize` has been called.
    pub suite_digest: String,

    /// Name of the evaluation.
    pub name: String,

    /// Version string for the suite.
    pub version: String,

    /// Test cases in this suite.
    pub test_cases: Vec<EvalTestCase>,

    /// Scorers to use for evaluation.
    pub scorers: Vec<ScorerConfig>,

    /// Pass/fail thresholds.
    pub thresholds: EvalThresholds,

    /// When the suite was created.
    pub created_at: DateTime<Utc>,
}
147
148impl EvalSuite {
149    /// Create a new evaluation suite.
150    pub fn new(name: String, version: String) -> Self {
151        Self {
152            suite_id: Uuid::new_v4(),
153            suite_digest: String::new(), // Will be computed on finalization
154            name,
155            version,
156            test_cases: Vec::new(),
157            scorers: Vec::new(),
158            thresholds: EvalThresholds::default(),
159            created_at: Utc::now(),
160        }
161    }
162
163    /// Add a test case to the suite.
164    pub fn add_test_case(mut self, test_case: EvalTestCase) -> Self {
165        self.test_cases.push(test_case);
166        self
167    }
168
169    /// Add a scorer configuration.
170    pub fn add_scorer(mut self, scorer: ScorerConfig) -> Self {
171        self.scorers.push(scorer);
172        self
173    }
174
175    /// Set evaluation thresholds.
176    pub fn with_thresholds(mut self, thresholds: EvalThresholds) -> Self {
177        self.thresholds = thresholds;
178        self
179    }
180
181    /// Compute stable SHA256 digest from canonical JSON (RFC 8785-compliant).
182    pub fn compute_digest(fields: &EvalSuiteFields) -> Result<String> {
183        let json = serde_json::to_value(fields)?;
184        digest::compute_digest(&json)
185    }
186
187    /// Finalize the suite: compute and set suite_digest from current fields.
188    pub fn finalize(mut self) -> Result<Self> {
189        let fields = EvalSuiteFields {
190            name: self.name.clone(),
191            version: self.version.clone(),
192            test_cases: self.test_cases.clone(),
193            scorers: self.scorers.clone(),
194            thresholds: self.thresholds.clone(),
195        };
196        self.suite_digest = Self::compute_digest(&fields)?;
197        Ok(self)
198    }
199}
200
/// Per-test-case deterministic evaluation result.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct EvalCaseResult {
    /// Identifier of the test case this result belongs to.
    pub case_id: Uuid,
    /// Score assigned to the case by the runner.
    pub score: f32,
    /// Whether the case counted as passed.
    pub passed: bool,
    /// The output that was scored; JSON `null` when the runner was given no
    /// output for this case.
    pub actual: serde_json::Value,
}
209
/// Deterministic evaluation run output for an EvalSuite.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct EvalRunReport {
    /// Digest of the suite that was run (copied from the suite).
    pub suite_digest: String,
    /// Seed of the runner that produced this report.
    pub seed: u64,
    /// Number of entries in `case_results`; this can be smaller than the
    /// suite's case count when the run stopped early due to `fail_fast`.
    pub total_cases: usize,
    /// Number of entries in `case_results` with `passed == true`.
    pub passed_cases: usize,
    /// `passed_cases / total_cases`, or 1.0 when no case was evaluated.
    pub pass_rate: f32,
    /// Whether `pass_rate` reached the suite's `min_pass_rate` threshold.
    pub overall_pass: bool,
    /// One result per evaluated case, in suite order.
    pub case_results: Vec<EvalCaseResult>,
}
221
/// Deterministic execution harness for EvalSuite runs.
#[derive(Debug, Clone, Copy)]
pub struct DeterministicEvalRunner {
    /// Seed recorded in every report produced by this runner. The scoring
    /// code in this file does not read it.
    pub seed: u64,
}
227
228impl DeterministicEvalRunner {
229    pub fn new(seed: u64) -> Self {
230        Self { seed }
231    }
232
233    /// Execute suite scoring deterministically using provided case outputs.
234    ///
235    /// `actual_outputs` maps test case IDs to the concrete run output for that case.
236    ///
237    /// Returns `Err` if `suite.suite_digest` is empty (call `finalize()` first).
238    pub fn run_with_outputs(
239        &self,
240        suite: &EvalSuite,
241        actual_outputs: &HashMap<Uuid, serde_json::Value>,
242    ) -> Result<EvalRunReport> {
243        if suite.suite_digest.is_empty() {
244            return Err(super::error::AivcsError::DigestMismatch {
245                expected: "<non-empty>".to_string(),
246                actual: "<empty>".to_string(),
247            });
248        }
249
250        let mut case_results = Vec::with_capacity(suite.test_cases.len());
251
252        for case in &suite.test_cases {
253            let actual = actual_outputs
254                .get(&case.case_id)
255                .cloned()
256                .unwrap_or(serde_json::Value::Null);
257
258            let score = self.score_case(suite, case, &actual);
259            let passed = if case.expected.is_some() {
260                score >= 1.0
261            } else {
262                score > 0.0
263            };
264
265            case_results.push(EvalCaseResult {
266                case_id: case.case_id,
267                score,
268                passed,
269                actual,
270            });
271
272            if suite.thresholds.fail_fast && !passed {
273                break;
274            }
275        }
276
277        let passed_cases = case_results.iter().filter(|c| c.passed).count();
278        let total_cases = case_results.len();
279        let pass_rate = if total_cases == 0 {
280            1.0
281        } else {
282            passed_cases as f32 / total_cases as f32
283        };
284        let overall_pass = pass_rate >= suite.thresholds.min_pass_rate;
285
286        Ok(EvalRunReport {
287            suite_digest: suite.suite_digest.clone(),
288            seed: self.seed,
289            total_cases,
290            passed_cases,
291            pass_rate,
292            overall_pass,
293            case_results,
294        })
295    }
296
297    fn score_case(
298        &self,
299        suite: &EvalSuite,
300        case: &EvalTestCase,
301        actual: &serde_json::Value,
302    ) -> f32 {
303        if suite.scorers.is_empty() {
304            return match &case.expected {
305                Some(expected) => {
306                    if expected == actual {
307                        1.0
308                    } else {
309                        0.0
310                    }
311                }
312                None => 1.0,
313            };
314        }
315
316        let mut scores = Vec::with_capacity(suite.scorers.len());
317        for scorer in &suite.scorers {
318            match scorer.scorer_type {
319                ScorerType::ExactMatch => {
320                    let s = match &case.expected {
321                        Some(expected) => {
322                            if expected == actual {
323                                1.0
324                            } else {
325                                0.0
326                            }
327                        }
328                        None => 1.0,
329                    };
330                    scores.push(s);
331                }
332                // Unimplemented scorers are skipped to avoid silently dragging
333                // down scores. Once real implementations land, add arms here.
334                ScorerType::SemanticSimilarity
335                | ScorerType::ToolCallSequence
336                | ScorerType::Custom(_) => {}
337            }
338        }
339
340        if scores.is_empty() {
341            // No usable scorers contributed — fall back to exact-match semantics.
342            return match &case.expected {
343                Some(expected) => {
344                    if expected == actual {
345                        1.0
346                    } else {
347                        0.0
348                    }
349                }
350                None => 1.0,
351            };
352        }
353
354        scores.iter().sum::<f32>() / scores.len() as f32
355    }
356}
357
// Unit tests for the evaluation domain types: serde round-trips, builder
// behavior, digest computation, and the deterministic runner.
#[cfg(test)]
mod tests {
    use super::*;

    /// A suite with one case and one scorer survives a JSON round-trip unchanged.
    #[test]
    fn test_eval_suite_serde_roundtrip() {
        let suite = EvalSuite::new("test_suite".to_string(), "1.0.0".to_string())
            .add_test_case(EvalTestCase::new(
                serde_json::json!({"input": "test"}),
                Some(serde_json::json!({"output": "expected"})),
            ))
            .add_scorer(ScorerConfig {
                name: "exact_match".to_string(),
                scorer_type: ScorerType::ExactMatch,
                params: serde_json::json!({}),
            });

        let json = serde_json::to_string(&suite).expect("serialize");
        let deserialized: EvalSuite = serde_json::from_str(&json).expect("deserialize");

        assert_eq!(suite, deserialized);
    }

    /// The default thresholds are 0.95 / 0.05 with fail-fast disabled.
    #[test]
    fn test_eval_thresholds_defaults() {
        let thresholds = EvalThresholds::default();
        assert_eq!(thresholds.min_pass_rate, 0.95);
        assert_eq!(thresholds.max_regression, 0.05);
        assert!(!thresholds.fail_fast);
    }

    /// `ScorerType::ExactMatch` round-trips through JSON.
    #[test]
    fn test_scorer_type_exact_match() {
        let scorer_type = ScorerType::ExactMatch;
        let json = serde_json::to_string(&scorer_type).expect("serialize");
        let deserialized: ScorerType = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(scorer_type, deserialized);
    }

    /// `ScorerType::SemanticSimilarity` round-trips through JSON.
    #[test]
    fn test_scorer_type_semantic_similarity() {
        let scorer_type = ScorerType::SemanticSimilarity;
        let json = serde_json::to_string(&scorer_type).expect("serialize");
        let deserialized: ScorerType = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(scorer_type, deserialized);
    }

    /// `ScorerType::Custom` keeps its string payload across a JSON round-trip.
    #[test]
    fn test_scorer_type_custom_roundtrip() {
        let scorer_type = ScorerType::Custom("my_custom_scorer".to_string());
        let json = serde_json::to_string(&scorer_type).expect("serialize");
        let deserialized: ScorerType = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(scorer_type, deserialized);
    }

    /// `ScorerType::ToolCallSequence` round-trips through JSON.
    #[test]
    fn test_scorer_type_tool_call_sequence() {
        let scorer_type = ScorerType::ToolCallSequence;
        let json = serde_json::to_string(&scorer_type).expect("serialize");
        let deserialized: ScorerType = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(scorer_type, deserialized);
    }

    /// `EvalTestCase::new` stores inputs and expected output and starts untagged.
    #[test]
    fn test_eval_test_case_new() {
        let test_case = EvalTestCase::new(
            serde_json::json!({"input": "test"}),
            Some(serde_json::json!({"output": "expected"})),
        );

        assert_eq!(test_case.inputs, serde_json::json!({"input": "test"}));
        assert_eq!(
            test_case.expected,
            Some(serde_json::json!({"output": "expected"}))
        );
        assert!(test_case.tags.is_empty());
    }

    /// `with_tag` appends the given tag.
    #[test]
    fn test_eval_test_case_with_tag() {
        let test_case = EvalTestCase::new(
            serde_json::json!({"input": "test"}),
            Some(serde_json::json!({"output": "expected"})),
        )
        .with_tag("critical".to_string());

        assert_eq!(test_case.tags, vec!["critical"]);
    }

    /// A scorer config with non-empty params round-trips through JSON.
    #[test]
    fn test_scorer_config_serde_roundtrip() {
        let config = ScorerConfig {
            name: "test_scorer".to_string(),
            scorer_type: ScorerType::SemanticSimilarity,
            params: serde_json::json!({"threshold": 0.8}),
        };

        let json = serde_json::to_string(&config).expect("serialize");
        let deserialized: ScorerConfig = serde_json::from_str(&json).expect("deserialize");

        assert_eq!(config, deserialized);
    }

    /// The chained builder methods accumulate cases, scorers and thresholds.
    #[test]
    fn test_eval_suite_fluent_api() {
        let suite = EvalSuite::new("test".to_string(), "1.0.0".to_string())
            .add_test_case(EvalTestCase::new(
                serde_json::json!({"input": "test"}),
                Some(serde_json::json!({"output": "expected"})),
            ))
            .add_scorer(ScorerConfig {
                name: "scorer1".to_string(),
                scorer_type: ScorerType::ExactMatch,
                params: serde_json::json!({}),
            })
            .with_thresholds(EvalThresholds {
                min_pass_rate: 0.90,
                max_regression: 0.10,
                fail_fast: true,
            });

        assert_eq!(suite.test_cases.len(), 1);
        assert_eq!(suite.scorers.len(), 1);
        assert_eq!(suite.thresholds.min_pass_rate, 0.90);
        assert!(suite.thresholds.fail_fast);
    }

    /// `finalize` turns the empty digest into a 64-character hex string.
    #[test]
    fn test_eval_suite_finalize_sets_digest() {
        let suite = EvalSuite::new("test_suite".to_string(), "1.0.0".to_string())
            .add_test_case(EvalTestCase::new(
                serde_json::json!({"input": "test"}),
                Some(serde_json::json!({"output": "expected"})),
            ))
            .add_scorer(ScorerConfig {
                name: "scorer1".to_string(),
                scorer_type: ScorerType::ExactMatch,
                params: serde_json::json!({}),
            });

        // Before finalize, suite_digest should be empty
        assert_eq!(suite.suite_digest, "");

        let finalized = suite.finalize().expect("finalize suite");

        // After finalize, suite_digest should be set and non-empty
        assert!(!finalized.suite_digest.is_empty());
        // Verify it's a valid 64-char hex string (SHA256)
        assert_eq!(finalized.suite_digest.len(), 64);
        assert!(finalized
            .suite_digest
            .chars()
            .all(|c: char| c.is_ascii_hexdigit()));
    }

    /// Finalizing a suite and a clone of it yields the same digest.
    ///
    /// A clone is used (rather than a second, independently built suite)
    /// because each `EvalTestCase::new` call generates a new random `case_id`.
    #[test]
    fn test_eval_suite_digest_stable() {
        // Test that finalize called twice on the same suite object produces same digest
        let suite = EvalSuite::new("test_suite".to_string(), "1.0.0".to_string())
            .add_test_case(EvalTestCase::new(
                serde_json::json!({"input": "test"}),
                Some(serde_json::json!({"output": "expected"})),
            ))
            .add_scorer(ScorerConfig {
                name: "scorer1".to_string(),
                scorer_type: ScorerType::ExactMatch,
                params: serde_json::json!({}),
            });

        let finalized1 = suite.clone().finalize().expect("finalize suite 1");
        let finalized2 = suite.finalize().expect("finalize suite 2");

        assert_eq!(
            finalized1.suite_digest, finalized2.suite_digest,
            "finalizing same suite object twice should produce same digest"
        );
    }

    /// Two suites with different test-case inputs get different digests.
    #[test]
    fn test_eval_suite_digest_changes_on_mutation() {
        let suite1 = EvalSuite::new("test_suite".to_string(), "1.0.0".to_string())
            .add_test_case(EvalTestCase::new(
                serde_json::json!({"input": "test"}),
                Some(serde_json::json!({"output": "expected"})),
            ))
            .add_scorer(ScorerConfig {
                name: "scorer1".to_string(),
                scorer_type: ScorerType::ExactMatch,
                params: serde_json::json!({}),
            });

        let finalized1 = suite1.finalize().expect("finalize suite 1");

        // Create suite with different test case
        let suite2 = EvalSuite::new("test_suite".to_string(), "1.0.0".to_string())
            .add_test_case(EvalTestCase::new(
                serde_json::json!({"input": "different_test"}),
                Some(serde_json::json!({"output": "expected"})),
            ))
            .add_scorer(ScorerConfig {
                name: "scorer1".to_string(),
                scorer_type: ScorerType::ExactMatch,
                params: serde_json::json!({}),
            });

        let finalized2 = suite2.finalize().expect("finalize suite 2");

        assert_ne!(
            finalized1.suite_digest, finalized2.suite_digest,
            "different test cases should produce different digest"
        );
    }

    /// Two otherwise-empty suites that differ only in version get different digests.
    #[test]
    fn test_eval_suite_digest_version_change() {
        let suite1 = EvalSuite::new("test_suite".to_string(), "1.0.0".to_string());
        let finalized1 = suite1.finalize().expect("finalize suite 1");

        let suite2 = EvalSuite::new("test_suite".to_string(), "1.0.1".to_string());
        let finalized2 = suite2.finalize().expect("finalize suite 2");

        assert_ne!(
            finalized1.suite_digest, finalized2.suite_digest,
            "different version should produce different digest"
        );
    }

    /// Running the same suite with the same outputs twice gives equal reports;
    /// one matching and one mismatching case yields a 0.5 pass rate.
    #[test]
    fn test_deterministic_eval_runner_stable_score() {
        let mut case1 = EvalTestCase::new(
            serde_json::json!({"q":"2+2"}),
            Some(serde_json::json!({"answer":"4"})),
        );
        // Fixed IDs keep the case identifiers the same on every test run.
        case1.case_id = Uuid::parse_str("11111111-1111-1111-1111-111111111111").unwrap();

        let mut case2 = EvalTestCase::new(
            serde_json::json!({"q":"3*3"}),
            Some(serde_json::json!({"answer":"9"})),
        );
        case2.case_id = Uuid::parse_str("22222222-2222-2222-2222-222222222222").unwrap();

        let suite = EvalSuite::new("golden-suite".to_string(), "1.0.0".to_string())
            .add_test_case(case1.clone())
            .add_test_case(case2.clone())
            .add_scorer(ScorerConfig {
                name: "exact".to_string(),
                scorer_type: ScorerType::ExactMatch,
                params: serde_json::json!({}),
            })
            .with_thresholds(EvalThresholds {
                min_pass_rate: 0.5,
                max_regression: 0.0,
                fail_fast: false,
            })
            .finalize()
            .unwrap();

        let mut outputs = HashMap::new();
        outputs.insert(case1.case_id, serde_json::json!({"answer":"4"}));
        // Deliberately wrong answer so that exactly one case fails.
        outputs.insert(case2.case_id, serde_json::json!({"answer":"8"}));

        let runner = DeterministicEvalRunner::new(42);
        let report1 = runner.run_with_outputs(&suite, &outputs).unwrap();
        let report2 = runner.run_with_outputs(&suite, &outputs).unwrap();

        assert_eq!(report1, report2);
        assert_eq!(report1.total_cases, 2);
        assert_eq!(report1.passed_cases, 1);
        assert_eq!(report1.pass_rate, 0.5);
        assert!(report1.overall_pass);
    }

    /// The serialized report for a single passing case matches a golden JSON value.
    #[test]
    fn test_deterministic_eval_runner_golden_output() {
        let mut case = EvalTestCase::new(
            serde_json::json!({"q":"2+2"}),
            Some(serde_json::json!({"answer":"4"})),
        );
        case.case_id = Uuid::parse_str("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa").unwrap();

        let suite = EvalSuite::new("golden".to_string(), "1.0.0".to_string())
            .add_test_case(case.clone())
            .add_scorer(ScorerConfig {
                name: "exact".to_string(),
                scorer_type: ScorerType::ExactMatch,
                params: serde_json::json!({}),
            })
            .finalize()
            .unwrap();

        let mut outputs = HashMap::new();
        outputs.insert(case.case_id, serde_json::json!({"answer":"4"}));

        let report = DeterministicEvalRunner::new(7)
            .run_with_outputs(&suite, &outputs)
            .unwrap();
        let actual = serde_json::to_value(&report).unwrap();
        let expected = serde_json::json!({
            "suite_digest": suite.suite_digest,
            "seed": 7,
            "total_cases": 1,
            "passed_cases": 1,
            "pass_rate": 1.0,
            "overall_pass": true,
            "case_results": [
                {
                    "case_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
                    "score": 1.0,
                    "passed": true,
                    "actual": {
                        "answer": "4"
                    }
                }
            ]
        });
        assert_eq!(actual, expected);
    }
}