use serde::{Deserialize, Serialize};
use crate::error::{NousError, NousResult};
use crate::taxonomy::{EvalLayer, EvalTiming};
/// A single normalized evaluation score emitted by one evaluator for one session.
///
/// `value` is always within `[0.0, 1.0]` when constructed via [`EvalScore::new`],
/// and `label` is derived from `value` at construction time.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalScore {
/// Name of the evaluator that produced this score.
pub evaluator: String,
/// Normalized score in the inclusive range `[0.0, 1.0]`.
pub value: f64,
/// Qualitative bucket derived from `value` (see `ScoreLabel::from_value`).
pub label: ScoreLabel,
/// Which evaluation layer this score belongs to (project taxonomy type).
pub layer: EvalLayer,
/// When the evaluation ran relative to execution (project taxonomy type).
pub timing: EvalTiming,
/// Optional human-readable rationale; set via `with_explanation`.
pub explanation: Option<String>,
/// Identifier of the session this score was computed for.
pub session_id: String,
/// Optional identifier of a specific run; set via `with_run_id`.
pub run_id: Option<String>,
}
impl EvalScore {
    /// Builds a validated score for `evaluator`.
    ///
    /// The qualitative [`ScoreLabel`] is derived from `value`; the optional
    /// `explanation` and `run_id` fields start empty and can be filled in with
    /// the builder-style methods below.
    ///
    /// # Errors
    ///
    /// Returns [`NousError::ScoreOutOfRange`] when `value` falls outside the
    /// inclusive range `0.0..=1.0` (NaN also fails the range check and is
    /// rejected).
    pub fn new(
        evaluator: impl Into<String>,
        value: f64,
        layer: EvalLayer,
        timing: EvalTiming,
        session_id: impl Into<String>,
    ) -> NousResult<Self> {
        // Derive the label only after the range check succeeds.
        let label = if (0.0..=1.0).contains(&value) {
            ScoreLabel::from_value(value)
        } else {
            return Err(NousError::ScoreOutOfRange { value });
        };
        Ok(Self {
            evaluator: evaluator.into(),
            value,
            label,
            layer,
            timing,
            explanation: None,
            session_id: session_id.into(),
            run_id: None,
        })
    }

    /// Attaches a human-readable rationale, consuming and returning `self`.
    pub fn with_explanation(self, explanation: impl Into<String>) -> Self {
        Self {
            explanation: Some(explanation.into()),
            ..self
        }
    }

    /// Associates the score with a specific run, consuming and returning `self`.
    pub fn with_run_id(self, run_id: impl Into<String>) -> Self {
        Self {
            run_id: Some(run_id.into()),
            ..self
        }
    }
}
/// Coarse quality bucket derived from a numeric score.
///
/// Serialized in snake_case (`"good"` / `"warning"` / `"critical"`),
/// matching the strings returned by [`ScoreLabel::as_str`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ScoreLabel {
    /// `value >= 0.8`
    Good,
    /// `0.5 <= value < 0.8`
    Warning,
    /// `value < 0.5` (NaN also lands here, since it fails both comparisons)
    Critical,
}

impl ScoreLabel {
    /// Maps a raw score onto its qualitative bucket using fixed thresholds.
    pub fn from_value(value: f64) -> Self {
        match value {
            v if v >= 0.8 => Self::Good,
            v if v >= 0.5 => Self::Warning,
            _ => Self::Critical,
        }
    }

    /// Lowercase string form, identical to the serde representation.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Good => "good",
            Self::Warning => "warning",
            Self::Critical => "critical",
        }
    }
}
/// The outcome of one evaluator run: a batch of scores plus timing metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalResult {
/// Name of the evaluator that produced this batch of scores.
pub evaluator: String,
/// Individual scores collected during the run (may be empty).
pub scores: Vec<EvalScore>,
/// Wall-clock start of the run, in milliseconds (epoch assumed — TODO confirm with producer).
pub timestamp_ms: u64,
/// How long the evaluation took, in milliseconds.
pub duration_ms: u64,
}
impl EvalResult {
    /// Arithmetic mean of all score values; `0.0` when there are no scores.
    pub fn aggregate_score(&self) -> f64 {
        match self.scores.len() {
            0 => 0.0,
            count => {
                let total: f64 = self.scores.iter().map(|s| s.value).sum();
                total / count as f64
            }
        }
    }

    /// The most severe label present (`Critical` worst, then `Warning`, then
    /// `Good`); returns `Good` when there are no scores at all.
    pub fn worst_label(&self) -> ScoreLabel {
        // Lower rank = more severe.
        fn rank(label: ScoreLabel) -> u8 {
            match label {
                ScoreLabel::Critical => 0,
                ScoreLabel::Warning => 1,
                ScoreLabel::Good => 2,
            }
        }
        self.scores.iter().fold(ScoreLabel::Good, |worst, score| {
            if rank(score.label) < rank(worst) {
                score.label
            } else {
                worst
            }
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn eval_score_new_valid() {
        let score = EvalScore::new(
            "token_efficiency",
            0.85,
            EvalLayer::Execution,
            EvalTiming::Inline,
            "sess-1",
        )
        .unwrap();
        assert_eq!("token_efficiency", score.evaluator);
        assert!((score.value - 0.85).abs() < f64::EPSILON);
        assert_eq!(ScoreLabel::Good, score.label);
    }

    #[test]
    fn eval_score_rejects_out_of_range() {
        // Both above and below the valid [0.0, 1.0] range must fail.
        for bad in [1.5, -0.1] {
            let outcome = EvalScore::new("test", bad, EvalLayer::Cost, EvalTiming::Inline, "s");
            assert!(outcome.is_err());
        }
    }

    #[test]
    fn eval_score_boundary_values() {
        // The endpoints of the inclusive range are accepted.
        for edge in [0.0, 1.0] {
            let outcome = EvalScore::new("test", edge, EvalLayer::Cost, EvalTiming::Inline, "s");
            assert!(outcome.is_ok());
        }
    }

    #[test]
    fn score_label_from_value() {
        let cases = [
            (0.95, ScoreLabel::Good),
            (0.80, ScoreLabel::Good),
            (0.79, ScoreLabel::Warning),
            (0.50, ScoreLabel::Warning),
            (0.49, ScoreLabel::Critical),
            (0.0, ScoreLabel::Critical),
        ];
        for (value, expected) in cases {
            assert_eq!(ScoreLabel::from_value(value), expected);
        }
    }

    #[test]
    fn eval_score_with_explanation() {
        let base =
            EvalScore::new("test", 0.7, EvalLayer::Action, EvalTiming::Inline, "s").unwrap();
        let annotated = base.with_explanation("tool error rate elevated");
        assert_eq!(
            annotated.explanation.as_deref(),
            Some("tool error rate elevated")
        );
    }

    #[test]
    fn eval_score_with_run_id() {
        let base =
            EvalScore::new("test", 0.7, EvalLayer::Action, EvalTiming::Inline, "s").unwrap();
        let tagged = base.with_run_id("run-1");
        assert_eq!(tagged.run_id.as_deref(), Some("run-1"));
    }

    #[test]
    fn eval_score_serde_roundtrip() {
        let original =
            EvalScore::new("test", 0.75, EvalLayer::Reasoning, EvalTiming::Async, "s")
                .unwrap()
                .with_explanation("decent reasoning");
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: EvalScore = serde_json::from_str(&encoded).unwrap();
        assert_eq!(decoded.evaluator, "test");
        assert!((decoded.value - 0.75).abs() < f64::EPSILON);
        assert_eq!(decoded.label, ScoreLabel::Warning);
    }

    #[test]
    fn eval_result_aggregate_score() {
        let scores = vec![
            EvalScore::new("a", 0.8, EvalLayer::Action, EvalTiming::Inline, "s").unwrap(),
            EvalScore::new("b", 0.6, EvalLayer::Action, EvalTiming::Inline, "s").unwrap(),
        ];
        let result = EvalResult {
            evaluator: "test".into(),
            scores,
            timestamp_ms: 1000,
            duration_ms: 5,
        };
        // Mean of 0.8 and 0.6 is 0.7.
        assert!((result.aggregate_score() - 0.7).abs() < f64::EPSILON);
    }

    #[test]
    fn eval_result_aggregate_empty() {
        let result = EvalResult {
            evaluator: "test".into(),
            scores: Vec::new(),
            timestamp_ms: 0,
            duration_ms: 0,
        };
        // No scores aggregates to exactly 0.0.
        assert!(result.aggregate_score().abs() < f64::EPSILON);
    }

    #[test]
    fn eval_result_worst_label() {
        let scores = vec![
            EvalScore::new("a", 0.9, EvalLayer::Action, EvalTiming::Inline, "s").unwrap(),
            EvalScore::new("b", 0.3, EvalLayer::Action, EvalTiming::Inline, "s").unwrap(),
        ];
        let result = EvalResult {
            evaluator: "test".into(),
            scores,
            timestamp_ms: 1000,
            duration_ms: 5,
        };
        assert_eq!(ScoreLabel::Critical, result.worst_label());
    }
}