1use serde::{Deserialize, Serialize};
8
9use crate::error::{NousError, NousResult};
10use crate::taxonomy::{EvalLayer, EvalTiming};
11
/// A single normalized evaluation score produced by one evaluator.
///
/// `value` is always within `[0.0, 1.0]` — enforced by [`EvalScore::new`] —
/// and `label` is derived from `value` via [`ScoreLabel::from_value`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalScore {
    /// Name of the evaluator that produced this score.
    pub evaluator: String,
    /// Normalized score in `[0.0, 1.0]`.
    pub value: f64,
    /// Categorical label derived from `value`.
    pub label: ScoreLabel,
    /// Taxonomy layer this score applies to.
    pub layer: EvalLayer,
    /// When the evaluation ran (e.g. `Inline` vs `Async`).
    pub timing: EvalTiming,
    /// Optional human-readable rationale for the score.
    pub explanation: Option<String>,
    /// Session the evaluated interaction belongs to.
    pub session_id: String,
    /// Optional run identifier within the session.
    pub run_id: Option<String>,
}
34
35impl EvalScore {
36 pub fn new(
38 evaluator: impl Into<String>,
39 value: f64,
40 layer: EvalLayer,
41 timing: EvalTiming,
42 session_id: impl Into<String>,
43 ) -> NousResult<Self> {
44 if !(0.0..=1.0).contains(&value) {
45 return Err(NousError::ScoreOutOfRange { value });
46 }
47 Ok(Self {
48 evaluator: evaluator.into(),
49 value,
50 label: ScoreLabel::from_value(value),
51 layer,
52 timing,
53 explanation: None,
54 session_id: session_id.into(),
55 run_id: None,
56 })
57 }
58
59 pub fn with_explanation(mut self, explanation: impl Into<String>) -> Self {
61 self.explanation = Some(explanation.into());
62 self
63 }
64
65 pub fn with_run_id(mut self, run_id: impl Into<String>) -> Self {
67 self.run_id = Some(run_id.into());
68 self
69 }
70}
71
/// Coarse categorical label derived from a numeric score.
///
/// Serialized in snake_case (`"good"`, `"warning"`, `"critical"`),
/// matching [`ScoreLabel::as_str`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ScoreLabel {
    /// Score `>= 0.8`.
    Good,
    /// Score in `[0.5, 0.8)`.
    Warning,
    /// Score `< 0.5`.
    Critical,
}
83
84impl ScoreLabel {
85 pub fn from_value(value: f64) -> Self {
87 if value >= 0.8 {
88 Self::Good
89 } else if value >= 0.5 {
90 Self::Warning
91 } else {
92 Self::Critical
93 }
94 }
95
96 pub fn as_str(&self) -> &'static str {
98 match self {
99 Self::Good => "good",
100 Self::Warning => "warning",
101 Self::Critical => "critical",
102 }
103 }
104}
105
/// The outcome of running a single evaluator: its individual scores
/// plus timing metadata for the evaluation itself.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalResult {
    /// Name of the evaluator that produced these scores.
    pub evaluator: String,
    /// Individual scores emitted by the evaluator (may be empty).
    pub scores: Vec<EvalScore>,
    /// Timestamp of the evaluation in milliseconds
    /// (presumably since the Unix epoch — TODO confirm with producer).
    pub timestamp_ms: u64,
    /// How long the evaluation took, in milliseconds.
    pub duration_ms: u64,
}
121
122impl EvalResult {
123 pub fn aggregate_score(&self) -> f64 {
125 if self.scores.is_empty() {
126 return 0.0;
127 }
128 let sum: f64 = self.scores.iter().map(|s| s.value).sum();
129 sum / self.scores.len() as f64
130 }
131
132 pub fn worst_label(&self) -> ScoreLabel {
134 self.scores
135 .iter()
136 .map(|s| s.label)
137 .min_by_key(|l| match l {
138 ScoreLabel::Critical => 0,
139 ScoreLabel::Warning => 1,
140 ScoreLabel::Good => 2,
141 })
142 .unwrap_or(ScoreLabel::Good)
143 }
144}
145
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn eval_score_new_valid() {
        let s = EvalScore::new(
            "token_efficiency",
            0.85,
            EvalLayer::Execution,
            EvalTiming::Inline,
            "sess-1",
        )
        .expect("0.85 is within [0, 1]");
        assert_eq!(s.evaluator, "token_efficiency");
        assert!((s.value - 0.85).abs() < f64::EPSILON);
        assert_eq!(s.label, ScoreLabel::Good);
    }

    #[test]
    fn eval_score_rejects_out_of_range() {
        // Both above-range and below-range values must be refused.
        for bad in [1.5, -0.1] {
            let r = EvalScore::new("test", bad, EvalLayer::Cost, EvalTiming::Inline, "s");
            assert!(r.is_err());
        }
    }

    #[test]
    fn eval_score_boundary_values() {
        // The range is inclusive at both ends.
        for edge in [0.0, 1.0] {
            let r = EvalScore::new("test", edge, EvalLayer::Cost, EvalTiming::Inline, "s");
            assert!(r.is_ok());
        }
    }

    #[test]
    fn score_label_from_value() {
        let cases = [
            (0.95, ScoreLabel::Good),
            (0.80, ScoreLabel::Good),
            (0.79, ScoreLabel::Warning),
            (0.50, ScoreLabel::Warning),
            (0.49, ScoreLabel::Critical),
            (0.0, ScoreLabel::Critical),
        ];
        for (value, expected) in cases {
            assert_eq!(ScoreLabel::from_value(value), expected);
        }
    }

    #[test]
    fn eval_score_with_explanation() {
        let s = EvalScore::new("test", 0.7, EvalLayer::Action, EvalTiming::Inline, "s")
            .expect("0.7 is within [0, 1]")
            .with_explanation("tool error rate elevated");
        assert_eq!(s.explanation.as_deref(), Some("tool error rate elevated"));
    }

    #[test]
    fn eval_score_with_run_id() {
        let s = EvalScore::new("test", 0.7, EvalLayer::Action, EvalTiming::Inline, "s")
            .expect("0.7 is within [0, 1]")
            .with_run_id("run-1");
        assert_eq!(s.run_id.as_deref(), Some("run-1"));
    }

    #[test]
    fn eval_score_serde_roundtrip() {
        let original = EvalScore::new("test", 0.75, EvalLayer::Reasoning, EvalTiming::Async, "s")
            .expect("0.75 is within [0, 1]")
            .with_explanation("decent reasoning");
        let json = serde_json::to_string(&original).unwrap();
        let restored: EvalScore = serde_json::from_str(&json).unwrap();
        assert_eq!(restored.evaluator, "test");
        assert!((restored.value - 0.75).abs() < f64::EPSILON);
        assert_eq!(restored.label, ScoreLabel::Warning);
    }

    #[test]
    fn eval_result_aggregate_score() {
        let scores = vec![
            EvalScore::new("a", 0.8, EvalLayer::Action, EvalTiming::Inline, "s").unwrap(),
            EvalScore::new("b", 0.6, EvalLayer::Action, EvalTiming::Inline, "s").unwrap(),
        ];
        let result = EvalResult {
            evaluator: "test".into(),
            scores,
            timestamp_ms: 1000,
            duration_ms: 5,
        };
        // Mean of 0.8 and 0.6.
        assert!((result.aggregate_score() - 0.7).abs() < f64::EPSILON);
    }

    #[test]
    fn eval_result_aggregate_empty() {
        let empty = EvalResult {
            evaluator: "test".into(),
            scores: vec![],
            timestamp_ms: 0,
            duration_ms: 0,
        };
        // No scores aggregates to exactly zero rather than NaN.
        assert!(empty.aggregate_score().abs() < f64::EPSILON);
    }

    #[test]
    fn eval_result_worst_label() {
        let scores = vec![
            EvalScore::new("a", 0.9, EvalLayer::Action, EvalTiming::Inline, "s").unwrap(),
            EvalScore::new("b", 0.3, EvalLayer::Action, EvalTiming::Inline, "s").unwrap(),
        ];
        let result = EvalResult {
            evaluator: "test".into(),
            scores,
            timestamp_ms: 1000,
            duration_ms: 5,
        };
        // 0.3 maps to Critical, which dominates the 0.9 Good score.
        assert_eq!(result.worst_label(), ScoreLabel::Critical);
    }
}