// nous_core/score.rs
//! Evaluation score types.
//!
//! [`EvalScore`] is the atomic unit of evaluation output.
//! [`EvalResult`] groups multiple scores from a single evaluator invocation.
//! Both are OTel-aligned for emission as `gen_ai.evaluation.result` span events.

use serde::{Deserialize, Serialize};

use crate::error::{NousError, NousResult};
use crate::taxonomy::{EvalLayer, EvalTiming};

/// A single evaluation score.
///
/// Designed to map directly to an OpenTelemetry `gen_ai.evaluation.result`
/// span event. Construct via [`EvalScore::new`], which validates the value
/// range and derives the label.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalScore {
    /// Name of the evaluator that produced this score (e.g. `token_efficiency`).
    pub evaluator: String,
    /// Normalized score value in `[0.0, 1.0]`. Higher is better.
    pub value: f64,
    /// Categorical label ("good", "warning", "critical") derived from `value`.
    pub label: ScoreLabel,
    /// Which layer of agent behavior this evaluates.
    pub layer: EvalLayer,
    /// Whether this was computed inline or async.
    pub timing: EvalTiming,
    /// Optional human-readable explanation.
    pub explanation: Option<String>,
    /// Session ID this score belongs to.
    pub session_id: String,
    /// Run ID within the session (if applicable).
    pub run_id: Option<String>,
}

35impl EvalScore {
36    /// Create a new score, validating the value is in `[0.0, 1.0]`.
37    pub fn new(
38        evaluator: impl Into<String>,
39        value: f64,
40        layer: EvalLayer,
41        timing: EvalTiming,
42        session_id: impl Into<String>,
43    ) -> NousResult<Self> {
44        if !(0.0..=1.0).contains(&value) {
45            return Err(NousError::ScoreOutOfRange { value });
46        }
47        Ok(Self {
48            evaluator: evaluator.into(),
49            value,
50            label: ScoreLabel::from_value(value),
51            layer,
52            timing,
53            explanation: None,
54            session_id: session_id.into(),
55            run_id: None,
56        })
57    }
58
59    /// Set the explanation.
60    pub fn with_explanation(mut self, explanation: impl Into<String>) -> Self {
61        self.explanation = Some(explanation.into());
62        self
63    }
64
65    /// Set the run ID.
66    pub fn with_run_id(mut self, run_id: impl Into<String>) -> Self {
67        self.run_id = Some(run_id.into());
68        self
69    }
70}
71
72/// Categorical score label derived from the numeric value.
73#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
74#[serde(rename_all = "snake_case")]
75pub enum ScoreLabel {
76    /// Score >= 0.8 — excellent quality.
77    Good,
78    /// Score >= 0.5 — acceptable but could improve.
79    Warning,
80    /// Score < 0.5 — needs attention.
81    Critical,
82}
83
84impl ScoreLabel {
85    /// Derive label from a normalized score value.
86    pub fn from_value(value: f64) -> Self {
87        if value >= 0.8 {
88            Self::Good
89        } else if value >= 0.5 {
90            Self::Warning
91        } else {
92            Self::Critical
93        }
94    }
95
96    /// String representation for OpenTelemetry attributes.
97    pub fn as_str(&self) -> &'static str {
98        match self {
99            Self::Good => "good",
100            Self::Warning => "warning",
101            Self::Critical => "critical",
102        }
103    }
104}
105
/// A collection of scores from a single evaluator invocation.
///
/// Async evaluators (like LLM-as-judge) may produce multiple scores
/// in a single call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalResult {
    /// The evaluator that produced these scores.
    pub evaluator: String,
    /// Individual scores (may be empty; see `aggregate_score`/`worst_label`).
    pub scores: Vec<EvalScore>,
    /// Timestamp of evaluation (ms since epoch).
    pub timestamp_ms: u64,
    /// Duration of the evaluation (ms).
    pub duration_ms: u64,
}

122impl EvalResult {
123    /// Aggregate quality score: mean of all score values.
124    pub fn aggregate_score(&self) -> f64 {
125        if self.scores.is_empty() {
126            return 0.0;
127        }
128        let sum: f64 = self.scores.iter().map(|s| s.value).sum();
129        sum / self.scores.len() as f64
130    }
131
132    /// Worst score label across all scores.
133    pub fn worst_label(&self) -> ScoreLabel {
134        self.scores
135            .iter()
136            .map(|s| s.label)
137            .min_by_key(|l| match l {
138                ScoreLabel::Critical => 0,
139                ScoreLabel::Warning => 1,
140                ScoreLabel::Good => 2,
141            })
142            .unwrap_or(ScoreLabel::Good)
143    }
144}
145
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand: build an inline-timed score with session ID "s".
    fn inline_score(name: &str, value: f64, layer: EvalLayer) -> NousResult<EvalScore> {
        EvalScore::new(name, value, layer, EvalTiming::Inline, "s")
    }

    /// Shorthand: build an `EvalResult` from a list of score values.
    fn result_with(values: &[f64]) -> EvalResult {
        EvalResult {
            evaluator: "test".into(),
            scores: values
                .iter()
                .map(|&v| inline_score("x", v, EvalLayer::Action).unwrap())
                .collect(),
            timestamp_ms: 1000,
            duration_ms: 5,
        }
    }

    #[test]
    fn eval_score_new_valid() {
        let score = EvalScore::new(
            "token_efficiency",
            0.85,
            EvalLayer::Execution,
            EvalTiming::Inline,
            "sess-1",
        )
        .unwrap();
        assert_eq!(score.evaluator, "token_efficiency");
        assert!((score.value - 0.85).abs() < f64::EPSILON);
        assert_eq!(score.label, ScoreLabel::Good);
    }

    #[test]
    fn eval_score_rejects_out_of_range() {
        // Values above and below the valid range are both rejected.
        for bad in [1.5, -0.1] {
            assert!(inline_score("test", bad, EvalLayer::Cost).is_err());
        }
    }

    #[test]
    fn eval_score_boundary_values() {
        // The range is inclusive at both ends.
        assert!(inline_score("test", 0.0, EvalLayer::Cost).is_ok());
        assert!(inline_score("test", 1.0, EvalLayer::Cost).is_ok());
    }

    #[test]
    fn score_label_from_value() {
        // Table-driven check of the label thresholds.
        let cases = [
            (0.95, ScoreLabel::Good),
            (0.80, ScoreLabel::Good),
            (0.79, ScoreLabel::Warning),
            (0.50, ScoreLabel::Warning),
            (0.49, ScoreLabel::Critical),
            (0.0, ScoreLabel::Critical),
        ];
        for (value, expected) in cases {
            assert_eq!(ScoreLabel::from_value(value), expected);
        }
    }

    #[test]
    fn eval_score_with_explanation() {
        let score = inline_score("test", 0.7, EvalLayer::Action)
            .unwrap()
            .with_explanation("tool error rate elevated");
        assert_eq!(
            score.explanation.as_deref(),
            Some("tool error rate elevated")
        );
    }

    #[test]
    fn eval_score_with_run_id() {
        let score = inline_score("test", 0.7, EvalLayer::Action)
            .unwrap()
            .with_run_id("run-1");
        assert_eq!(score.run_id.as_deref(), Some("run-1"));
    }

    #[test]
    fn eval_score_serde_roundtrip() {
        let original = EvalScore::new("test", 0.75, EvalLayer::Reasoning, EvalTiming::Async, "s")
            .unwrap()
            .with_explanation("decent reasoning");
        let json = serde_json::to_string(&original).unwrap();
        let back: EvalScore = serde_json::from_str(&json).unwrap();
        assert_eq!(back.evaluator, "test");
        assert!((back.value - 0.75).abs() < f64::EPSILON);
        assert_eq!(back.label, ScoreLabel::Warning);
    }

    #[test]
    fn eval_result_aggregate_score() {
        let result = result_with(&[0.8, 0.6]);
        assert!((result.aggregate_score() - 0.7).abs() < f64::EPSILON);
    }

    #[test]
    fn eval_result_aggregate_empty() {
        // An empty result aggregates to exactly 0.0 (no NaN from 0/0).
        assert!(result_with(&[]).aggregate_score().abs() < f64::EPSILON);
    }

    #[test]
    fn eval_result_worst_label() {
        // One good, one critical -> worst is critical.
        let result = result_with(&[0.9, 0.3]);
        assert_eq!(result.worst_label(), ScoreLabel::Critical);
    }
}