1use serde::{Deserialize, Serialize};
2
3#[derive(Debug, Clone, Serialize, Deserialize)]
5pub struct DimensionScore {
6 pub value: f64,
7 pub confidence: f64,
10 pub method: ScoringMethod,
11 pub rationale: Option<String>,
12}
13
14impl Default for DimensionScore {
15 fn default() -> Self {
16 Self {
17 value: 0.0,
18 confidence: 1.0,
19 method: ScoringMethod::Deterministic,
20 rationale: None,
21 }
22 }
23}
24
25#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
26#[serde(rename_all = "snake_case")]
27pub enum ScoringMethod {
28 Deterministic,
30 LlmJudge,
32 Hybrid,
34 Heuristic,
36}
37
38#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct Scorecard {
42 pub run_id: uuid::Uuid,
43 pub agent_id: uuid::Uuid,
44 pub agent_name: String,
45 pub agent_version: String,
46 pub aggregate_score: f64,
47 pub pass_rate: f64,
48 pub total_scenarios: u32,
49 pub passed: u32,
50 pub failed: u32,
51 pub errors: u32,
52 pub review_needed: u32,
53 pub dimension_scores: crate::DimensionScores,
54 pub failure_clusters: Vec<crate::FailureClusterSummary>,
55 pub duration_seconds: u64,
56 pub total_input_tokens: u64,
57 pub total_output_tokens: u64,
58}
59
60#[derive(Debug, Clone, Serialize, Deserialize)]
62pub struct ScorecardDiff {
63 pub version_a: String,
64 pub version_b: String,
65 pub aggregate_score_delta: f64,
66 pub pass_rate_delta: f64,
67 pub dimension_deltas: DimensionDeltas,
68 pub regression_count: u32,
69 pub improvement_count: u32,
70 pub neutral_count: u32,
71}
72
73#[derive(Debug, Clone, Serialize, Deserialize)]
74pub struct DimensionDeltas {
75 pub task_completion: f64,
76 pub tool_selection: f64,
77 pub argument_correctness: f64,
78 pub schema_compliance: f64,
79 pub instruction_adherence: f64,
80 pub path_efficiency: f64,
81}
82
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance used when comparing floats, including after a JSON round-trip.
    const EPS: f64 = 1e-9;

    /// Asserts that `actual` is within [`EPS`] of `expected`.
    #[track_caller]
    fn assert_close(actual: f64, expected: f64) {
        assert!(
            (actual - expected).abs() < EPS,
            "expected {}, got {}",
            expected,
            actual
        );
    }

    #[test]
    fn scoring_method_serde() {
        let json = serde_json::to_string(&ScoringMethod::LlmJudge).unwrap();
        assert_eq!(json, r#""llm_judge""#);
        let back: ScoringMethod = serde_json::from_str(&json).unwrap();
        assert_eq!(back, ScoringMethod::LlmJudge);
    }

    #[test]
    fn scoring_method_serde_all_variants() {
        // Pins the snake_case wire name of every variant and checks that each
        // one deserializes back to itself.
        let pairs = [
            (ScoringMethod::Deterministic, r#""deterministic""#),
            (ScoringMethod::LlmJudge, r#""llm_judge""#),
            (ScoringMethod::Hybrid, r#""hybrid""#),
            (ScoringMethod::Heuristic, r#""heuristic""#),
        ];
        for (method, expected) in &pairs {
            let json = serde_json::to_string(method).unwrap();
            assert_eq!(
                &json, expected,
                "ScoringMethod::{:?} serialized incorrectly",
                method
            );
            let back: ScoringMethod = serde_json::from_str(&json).unwrap();
            assert_eq!(&back, method);
        }
    }

    #[test]
    fn scoring_method_all_variants_are_distinct() {
        let all = [
            ScoringMethod::Deterministic,
            ScoringMethod::LlmJudge,
            ScoringMethod::Hybrid,
            ScoringMethod::Heuristic,
        ];
        let strs: std::collections::HashSet<_> = all
            .iter()
            .map(|m| serde_json::to_string(m).unwrap())
            .collect();
        // Compare against the array length so the check stays correct when a
        // variant is added to the list above.
        assert_eq!(strs.len(), all.len());
    }

    #[test]
    fn dimension_score_default_value_is_zero() {
        let ds = DimensionScore::default();
        assert_eq!(ds.value, 0.0);
        assert_eq!(ds.confidence, 1.0);
        assert_eq!(ds.method, ScoringMethod::Deterministic);
        assert!(ds.rationale.is_none());
    }

    #[test]
    fn dimension_score_stores_rationale() {
        let ds = DimensionScore {
            value: 0.75,
            confidence: 0.9,
            method: ScoringMethod::LlmJudge,
            rationale: Some("Agent called the wrong tool".to_string()),
        };
        assert_close(ds.value, 0.75);
        assert_eq!(ds.rationale.as_deref(), Some("Agent called the wrong tool"));
    }

    #[test]
    fn dimension_score_serde_roundtrip() {
        let ds = DimensionScore {
            value: 0.85,
            confidence: 0.95,
            method: ScoringMethod::Hybrid,
            rationale: Some("Partial pass".to_string()),
        };
        let json = serde_json::to_string(&ds).unwrap();
        let back: DimensionScore = serde_json::from_str(&json).unwrap();
        // Every field must survive the round-trip, not just a sample of them.
        assert_close(back.value, 0.85);
        assert_close(back.confidence, 0.95);
        assert_eq!(back.method, ScoringMethod::Hybrid);
        assert_eq!(back.rationale.as_deref(), Some("Partial pass"));
    }

    #[test]
    fn dimension_score_null_rationale_serde() {
        let ds = DimensionScore {
            value: 1.0,
            confidence: 1.0,
            method: ScoringMethod::Deterministic,
            rationale: None,
        };
        let json = serde_json::to_value(&ds).unwrap();
        // `json["rationale"]` would also be `Null` if the key were absent, so
        // use `get` to require that the key is present with an explicit null.
        assert_eq!(json.get("rationale"), Some(&serde_json::Value::Null));
        let back: DimensionScore = serde_json::from_value(json).unwrap();
        assert!(back.rationale.is_none());
    }

    #[test]
    fn scorecard_diff_stores_delta_fields() {
        let diff = ScorecardDiff {
            version_a: "v1.0.0".to_string(),
            version_b: "v1.1.0".to_string(),
            aggregate_score_delta: 0.05,
            pass_rate_delta: 0.10,
            dimension_deltas: DimensionDeltas {
                task_completion: 0.02,
                tool_selection: 0.03,
                argument_correctness: -0.01,
                schema_compliance: 0.00,
                instruction_adherence: 0.01,
                path_efficiency: 0.00,
            },
            regression_count: 2,
            improvement_count: 5,
            neutral_count: 3,
        };
        assert_eq!(diff.version_a, "v1.0.0");
        assert_close(diff.aggregate_score_delta, 0.05);
        assert_eq!(diff.improvement_count, 5);
    }

    #[test]
    fn scorecard_diff_serde_roundtrip() {
        let diff = ScorecardDiff {
            version_a: "a".to_string(),
            version_b: "b".to_string(),
            aggregate_score_delta: 0.03,
            pass_rate_delta: 0.05,
            dimension_deltas: DimensionDeltas {
                task_completion: 0.01,
                tool_selection: 0.02,
                argument_correctness: 0.00,
                schema_compliance: 0.00,
                instruction_adherence: 0.00,
                path_efficiency: 0.00,
            },
            regression_count: 0,
            improvement_count: 3,
            neutral_count: 7,
        };
        let json = serde_json::to_string(&diff).unwrap();
        let back: ScorecardDiff = serde_json::from_str(&json).unwrap();
        assert_eq!(back.version_a, "a");
        assert_eq!(back.version_b, "b");
        assert_close(back.aggregate_score_delta, 0.03);
        assert_close(back.pass_rate_delta, 0.05);
        // The nested struct and the counters must round-trip as well.
        assert_close(back.dimension_deltas.task_completion, 0.01);
        assert_close(back.dimension_deltas.tool_selection, 0.02);
        assert_eq!(back.regression_count, 0);
        assert_eq!(back.improvement_count, 3);
        assert_eq!(back.neutral_count, 7);
    }

    #[test]
    fn dimension_deltas_serde_roundtrip() {
        let d = DimensionDeltas {
            task_completion: 0.1,
            tool_selection: -0.1,
            argument_correctness: 0.0,
            schema_compliance: 0.05,
            instruction_adherence: -0.02,
            path_efficiency: 0.01,
        };
        let json = serde_json::to_string(&d).unwrap();
        let back: DimensionDeltas = serde_json::from_str(&json).unwrap();
        // Check all six dimensions so a dropped or misnamed field is caught.
        assert_close(back.task_completion, 0.1);
        assert_close(back.tool_selection, -0.1);
        assert_close(back.argument_correctness, 0.0);
        assert_close(back.schema_compliance, 0.05);
        assert_close(back.instruction_adherence, -0.02);
        assert_close(back.path_efficiency, 0.01);
    }

    #[test]
    fn scorecard_stores_all_fields() {
        let run_id = uuid::Uuid::new_v4();
        let agent_id = uuid::Uuid::new_v4();
        let sc = Scorecard {
            run_id,
            agent_id,
            agent_name: "test".to_string(),
            agent_version: "1.0.0".to_string(),
            aggregate_score: 0.75,
            pass_rate: 0.60,
            total_scenarios: 10,
            passed: 6,
            failed: 3,
            errors: 1,
            review_needed: 0,
            dimension_scores: crate::DimensionScores::default(),
            failure_clusters: vec![],
            duration_seconds: 120,
            total_input_tokens: 5000,
            total_output_tokens: 2000,
        };
        assert_eq!(sc.run_id, run_id);
        assert_eq!(sc.agent_id, agent_id);
        assert_eq!(sc.total_scenarios, 10);
        assert_eq!(sc.passed + sc.failed + sc.errors, 10);
    }

    #[test]
    fn scoring_method_eq_deterministic() {
        assert_eq!(ScoringMethod::Deterministic, ScoringMethod::Deterministic);
        assert_ne!(ScoringMethod::Deterministic, ScoringMethod::LlmJudge);
    }

    #[test]
    fn dimension_score_confidence_range_is_valid() {
        // Exercises both ends of the 0.0..=1.0 interval for `confidence`.
        let low = DimensionScore {
            value: 0.5,
            confidence: 0.0,
            method: ScoringMethod::Heuristic,
            rationale: None,
        };
        let high = DimensionScore {
            value: 0.5,
            confidence: 1.0,
            method: ScoringMethod::Heuristic,
            rationale: None,
        };
        assert!((0.0..=1.0).contains(&low.confidence));
        assert!((0.0..=1.0).contains(&high.confidence));
    }
}