Skip to main content

agentforge_core/
score.rs

1use serde::{Deserialize, Serialize};
2
3/// Score result for a single dimension.
4#[derive(Debug, Clone, Serialize, Deserialize)]
5pub struct DimensionScore {
6    pub value: f64,
7    /// Confidence that this score is correct (0.0–1.0).
8    /// Scores below 0.5 confidence are flagged for human review.
9    pub confidence: f64,
10    pub method: ScoringMethod,
11    pub rationale: Option<String>,
12}
13
14impl Default for DimensionScore {
15    fn default() -> Self {
16        Self {
17            value: 0.0,
18            confidence: 1.0,
19            method: ScoringMethod::Deterministic,
20            rationale: None,
21        }
22    }
23}
24
25#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
26#[serde(rename_all = "snake_case")]
27pub enum ScoringMethod {
28    /// Deterministic check (schema validation, keyword match, etc.)
29    Deterministic,
30    /// LLM judge evaluation
31    LlmJudge,
32    /// Combination of both
33    Hybrid,
34    /// Heuristic approximation (no LLM, no hard rules)
35    Heuristic,
36}
37
38/// Scorecard: the aggregated scoring result for a full eval run,
39/// ready for display in the dashboard / CLI.
40#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct Scorecard {
42    pub run_id: uuid::Uuid,
43    pub agent_id: uuid::Uuid,
44    pub agent_name: String,
45    pub agent_version: String,
46    pub aggregate_score: f64,
47    pub pass_rate: f64,
48    pub total_scenarios: u32,
49    pub passed: u32,
50    pub failed: u32,
51    pub errors: u32,
52    pub review_needed: u32,
53    pub dimension_scores: crate::DimensionScores,
54    pub failure_clusters: Vec<crate::FailureClusterSummary>,
55    pub duration_seconds: u64,
56    pub total_input_tokens: u64,
57    pub total_output_tokens: u64,
58}
59
60/// A scorecard delta between two agent versions.
61#[derive(Debug, Clone, Serialize, Deserialize)]
62pub struct ScorecardDiff {
63    pub version_a: String,
64    pub version_b: String,
65    pub aggregate_score_delta: f64,
66    pub pass_rate_delta: f64,
67    pub dimension_deltas: DimensionDeltas,
68    pub regression_count: u32,
69    pub improvement_count: u32,
70    pub neutral_count: u32,
71}
72
73#[derive(Debug, Clone, Serialize, Deserialize)]
74pub struct DimensionDeltas {
75    pub task_completion: f64,
76    pub tool_selection: f64,
77    pub argument_correctness: f64,
78    pub schema_compliance: f64,
79    pub instruction_adherence: f64,
80    pub path_efficiency: f64,
81}
82
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance used when comparing floating-point fields.
    const EPS: f64 = 1e-9;

    /// Returns `true` when `actual` lies strictly within [`EPS`] of `expected`.
    fn close(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < EPS
    }

    // ── ScoringMethod ────────────────────────────────────────────────────────

    /// `LlmJudge` uses its snake_case wire name and survives a round-trip.
    #[test]
    fn scoring_method_serde() {
        let encoded = serde_json::to_string(&ScoringMethod::LlmJudge).unwrap();
        assert_eq!(encoded, r#""llm_judge""#);

        let decoded: ScoringMethod = serde_json::from_str(&encoded).unwrap();
        assert_eq!(decoded, ScoringMethod::LlmJudge);
    }

    /// Every variant serializes to the expected string and parses back.
    #[test]
    fn scoring_method_serde_all_variants() {
        let cases = [
            (ScoringMethod::Deterministic, r#""deterministic""#),
            (ScoringMethod::LlmJudge, r#""llm_judge""#),
            (ScoringMethod::Hybrid, r#""hybrid""#),
            (ScoringMethod::Heuristic, r#""heuristic""#),
        ];
        for (variant, wire) in cases.iter() {
            let encoded = serde_json::to_string(variant).unwrap();
            assert_eq!(
                &encoded, wire,
                "ScoringMethod::{:?} serialized incorrectly",
                variant
            );
            let decoded: ScoringMethod = serde_json::from_str(&encoded).unwrap();
            assert_eq!(&decoded, variant);
        }
    }

    /// No two variants share a serialized representation.
    #[test]
    fn scoring_method_all_variants_are_distinct() {
        let variants = [
            ScoringMethod::Deterministic,
            ScoringMethod::LlmJudge,
            ScoringMethod::Hybrid,
            ScoringMethod::Heuristic,
        ];
        // Uniqueness is checked on the serialized strings.
        let mut seen = std::collections::HashSet::new();
        for variant in variants.iter() {
            seen.insert(serde_json::to_string(variant).unwrap());
        }
        assert_eq!(seen.len(), 4);
    }

    /// Equality distinguishes identical from differing variants.
    #[test]
    fn scoring_method_eq_deterministic() {
        assert_eq!(ScoringMethod::Deterministic, ScoringMethod::Deterministic);
        assert_ne!(ScoringMethod::Deterministic, ScoringMethod::LlmJudge);
    }

    // ── DimensionScore ───────────────────────────────────────────────────────

    /// `Default` yields 0.0 / 1.0 / Deterministic / no rationale.
    #[test]
    fn dimension_score_default_value_is_zero() {
        let DimensionScore {
            value,
            confidence,
            method,
            rationale,
        } = DimensionScore::default();
        assert_eq!(value, 0.0);
        assert_eq!(confidence, 1.0);
        assert_eq!(method, ScoringMethod::Deterministic);
        assert!(rationale.is_none());
    }

    /// A rationale supplied at construction is readable afterwards.
    #[test]
    fn dimension_score_stores_rationale() {
        let score = DimensionScore {
            rationale: Some("Agent called the wrong tool".to_string()),
            method: ScoringMethod::LlmJudge,
            confidence: 0.9,
            value: 0.75,
        };
        assert!(close(score.value, 0.75));
        assert_eq!(
            score.rationale.as_deref(),
            Some("Agent called the wrong tool")
        );
    }

    /// Value and method survive a JSON round-trip.
    #[test]
    fn dimension_score_serde_roundtrip() {
        let original = DimensionScore {
            rationale: Some("Partial pass".to_string()),
            method: ScoringMethod::Hybrid,
            confidence: 0.95,
            value: 0.85,
        };
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: DimensionScore = serde_json::from_str(&encoded).unwrap();
        assert!(close(decoded.value, 0.85));
        assert_eq!(decoded.method, ScoringMethod::Hybrid);
    }

    /// A missing rationale is emitted as JSON `null`.
    #[test]
    fn dimension_score_null_rationale_serde() {
        let score = DimensionScore {
            rationale: None,
            method: ScoringMethod::Deterministic,
            confidence: 1.0,
            value: 1.0,
        };
        let tree = serde_json::to_value(&score).unwrap();
        assert!(tree["rationale"].is_null());
    }

    /// Both ends of the documented confidence interval are representable.
    #[test]
    fn dimension_score_confidence_range_is_valid() {
        // Confidence must be [0, 1]; test boundary values
        let unit_interval = 0.0..=1.0;
        let lowest = DimensionScore {
            rationale: None,
            method: ScoringMethod::Heuristic,
            confidence: 0.0,
            value: 0.5,
        };
        let highest = DimensionScore {
            rationale: None,
            method: ScoringMethod::Heuristic,
            confidence: 1.0,
            value: 0.5,
        };
        assert!(unit_interval.contains(&lowest.confidence));
        assert!(unit_interval.contains(&highest.confidence));
    }

    // ── ScorecardDiff / DimensionDeltas ──────────────────────────────────────

    /// Fields set on a `ScorecardDiff` are read back unchanged.
    #[test]
    fn scorecard_diff_stores_delta_fields() {
        let deltas = DimensionDeltas {
            task_completion: 0.02,
            tool_selection: 0.03,
            argument_correctness: -0.01,
            schema_compliance: 0.00,
            instruction_adherence: 0.01,
            path_efficiency: 0.00,
        };
        let diff = ScorecardDiff {
            version_a: "v1.0.0".to_string(),
            version_b: "v1.1.0".to_string(),
            aggregate_score_delta: 0.05,
            pass_rate_delta: 0.10,
            dimension_deltas: deltas,
            regression_count: 2,
            improvement_count: 5,
            neutral_count: 3,
        };
        assert_eq!(diff.version_a, "v1.0.0");
        assert!(close(diff.aggregate_score_delta, 0.05));
        assert_eq!(diff.improvement_count, 5);
    }

    /// A `ScorecardDiff` survives a JSON round-trip.
    #[test]
    fn scorecard_diff_serde_roundtrip() {
        let deltas = DimensionDeltas {
            task_completion: 0.01,
            tool_selection: 0.02,
            argument_correctness: 0.00,
            schema_compliance: 0.00,
            instruction_adherence: 0.00,
            path_efficiency: 0.00,
        };
        let original = ScorecardDiff {
            version_a: "a".to_string(),
            version_b: "b".to_string(),
            aggregate_score_delta: 0.03,
            pass_rate_delta: 0.05,
            dimension_deltas: deltas,
            regression_count: 0,
            improvement_count: 3,
            neutral_count: 7,
        };
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: ScorecardDiff = serde_json::from_str(&encoded).unwrap();
        assert_eq!(decoded.version_b, "b");
        assert!(close(decoded.aggregate_score_delta, 0.03));
    }

    /// Positive and negative deltas survive a JSON round-trip.
    #[test]
    fn dimension_deltas_serde_roundtrip() {
        let original = DimensionDeltas {
            task_completion: 0.1,
            tool_selection: -0.1,
            argument_correctness: 0.0,
            schema_compliance: 0.05,
            instruction_adherence: -0.02,
            path_efficiency: 0.01,
        };
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: DimensionDeltas = serde_json::from_str(&encoded).unwrap();
        assert!(close(decoded.task_completion, 0.1));
        assert!(close(decoded.tool_selection, -0.1));
    }

    // ── Scorecard ────────────────────────────────────────────────────────────

    /// A fully populated `Scorecard` exposes the values it was built with.
    #[test]
    fn scorecard_stores_all_fields() {
        let run_id = uuid::Uuid::new_v4();
        let agent_id = uuid::Uuid::new_v4();
        let card = Scorecard {
            run_id,
            agent_id,
            agent_name: "test".to_string(),
            agent_version: "1.0.0".to_string(),
            aggregate_score: 0.75,
            pass_rate: 0.60,
            total_scenarios: 10,
            passed: 6,
            failed: 3,
            errors: 1,
            review_needed: 0,
            dimension_scores: crate::DimensionScores::default(),
            failure_clusters: vec![],
            duration_seconds: 120,
            total_input_tokens: 5000,
            total_output_tokens: 2000,
        };
        assert_eq!(card.run_id, run_id);
        assert_eq!(card.total_scenarios, 10);
        // In this fixture the three outcome counters add up to the total.
        assert_eq!(card.passed + card.failed + card.errors, 10);
    }
}