Skip to main content

nous_core/
evaluator.rs

1//! Evaluator trait — the core abstraction for all Nous evaluators.
2//!
3//! Evaluators are pure functions: given an `EvalContext`, they produce
4//! zero or more `EvalScore`s. Inline evaluators must complete in < 2ms.
5
6use crate::error::NousResult;
7use crate::score::EvalScore;
8use crate::taxonomy::{EvalLayer, EvalTiming};
9
/// Snapshot of session and run state handed to an evaluator for scoring.
///
/// The context is self-contained so that evaluators do not have to depend
/// on Arcan types directly. Only `session_id` is mandatory; every other
/// scalar is an `Option`, and `metadata` is a free-form string map.
#[derive(Clone, Debug)]
pub struct EvalContext {
    // ---- identity -------------------------------------------------------
    /// Identifier of the session being evaluated.
    pub session_id: String,
    /// Identifier of the run inside that session.
    pub run_id: Option<String>,
    /// Iteration the run is currently on.
    pub iteration: Option<u32>,

    // ---- token accounting -----------------------------------------------
    /// Input tokens consumed by the current model call.
    pub input_tokens: Option<u64>,
    /// Output tokens produced by the current model call.
    pub output_tokens: Option<u64>,
    /// Token budget that is still available.
    pub tokens_remaining: Option<u64>,
    /// Tokens spent so far over the whole session.
    pub total_tokens_used: Option<u64>,

    // ---- tool usage -----------------------------------------------------
    /// How many tool calls this run has made.
    pub tool_call_count: Option<u32>,
    /// How many of this run's tool calls failed.
    pub tool_error_count: Option<u32>,
    /// Name of the tool, for evaluators that are tool-specific.
    pub tool_name: Option<String>,
    /// Whether the tool call ended in an error.
    pub tool_errored: Option<bool>,

    // ---- limits and extras ----------------------------------------------
    /// Configured upper bound on iterations.
    pub max_iterations: Option<u32>,
    /// Arbitrary key-value metadata.
    pub metadata: std::collections::HashMap<String, String>,
}
43
44impl EvalContext {
45    /// Create a minimal context with just a session ID.
46    pub fn new(session_id: impl Into<String>) -> Self {
47        Self {
48            session_id: session_id.into(),
49            run_id: None,
50            iteration: None,
51            input_tokens: None,
52            output_tokens: None,
53            tokens_remaining: None,
54            total_tokens_used: None,
55            tool_call_count: None,
56            tool_error_count: None,
57            tool_name: None,
58            tool_errored: None,
59            max_iterations: None,
60            metadata: std::collections::HashMap::new(),
61        }
62    }
63}
64
65/// The core evaluator trait.
66///
67/// All Nous evaluators implement this trait. Inline evaluators
68/// must be fast (< 2ms, no I/O). Async evaluators may take longer.
69pub trait NousEvaluator: Send + Sync {
70    /// Unique name for this evaluator (e.g. `token_efficiency`).
71    fn name(&self) -> &str;
72
73    /// Which behavior layer this evaluator measures.
74    fn layer(&self) -> EvalLayer;
75
76    /// Whether this runs inline or async.
77    fn timing(&self) -> EvalTiming;
78
79    /// Evaluate the given context and produce scores.
80    ///
81    /// Returns an empty vec if there's insufficient data to score.
82    fn evaluate(&self, ctx: &EvalContext) -> NousResult<Vec<EvalScore>>;
83}
84
/// Points in the agent lifecycle at which evaluators can be attached.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum EvalHook {
    /// Fires before a model call is made.
    BeforeModelCall,
    /// Fires once a model call has completed.
    AfterModelCall,
    /// Fires before a tool call is executed.
    PreToolCall,
    /// Fires once a tool call has completed.
    PostToolCall,
    /// Fires after a full run has finished.
    OnRunFinished,
}
99
100impl EvalHook {
101    /// String representation for logging and events.
102    pub fn as_str(&self) -> &'static str {
103        match self {
104            Self::BeforeModelCall => "before_model_call",
105            Self::AfterModelCall => "after_model_call",
106            Self::PreToolCall => "pre_tool_call",
107            Self::PostToolCall => "post_tool_call",
108            Self::OnRunFinished => "on_run_finished",
109        }
110    }
111}
112
#[cfg(test)]
mod tests {
    use super::*;
    use crate::taxonomy::{EvalLayer, EvalTiming};

    /// Test double that emits exactly one fixed score (0.9) for any context.
    struct MockEvaluator;

    impl NousEvaluator for MockEvaluator {
        fn name(&self) -> &str {
            "mock"
        }

        fn layer(&self) -> EvalLayer {
            EvalLayer::Execution
        }

        fn timing(&self) -> EvalTiming {
            EvalTiming::Inline
        }

        fn evaluate(&self, ctx: &EvalContext) -> NousResult<Vec<EvalScore>> {
            let score = EvalScore::new(
                self.name(),
                0.9,
                self.layer(),
                self.timing(),
                &ctx.session_id,
            )?;
            Ok(vec![score])
        }
    }

    /// The trait can be implemented and its `evaluate` output is surfaced as-is.
    #[test]
    fn mock_evaluator_produces_score() {
        let evaluator = MockEvaluator;
        let ctx = EvalContext::new("sess-1");
        let scores = evaluator.evaluate(&ctx).unwrap();
        assert_eq!(scores.len(), 1);
        assert_eq!(scores[0].evaluator, "mock");
        assert!((scores[0].value - 0.9).abs() < f64::EPSILON);
    }

    /// `EvalContext::new` must set the session ID and leave everything else unset.
    #[test]
    fn eval_context_new_minimal() {
        let ctx = EvalContext::new("test");
        assert_eq!(ctx.session_id, "test");

        // Check every optional field rather than a sample, so that a stray
        // default slipping into `new` is caught regardless of which field it hits.
        assert!(ctx.run_id.is_none());
        assert!(ctx.iteration.is_none());
        assert!(ctx.input_tokens.is_none());
        assert!(ctx.output_tokens.is_none());
        assert!(ctx.tokens_remaining.is_none());
        assert!(ctx.total_tokens_used.is_none());
        assert!(ctx.tool_call_count.is_none());
        assert!(ctx.tool_error_count.is_none());
        assert!(ctx.tool_name.is_none());
        assert!(ctx.tool_errored.is_none());
        assert!(ctx.max_iterations.is_none());
        assert!(ctx.metadata.is_empty());
    }

    /// The constructor takes `impl Into<String>`, so owned strings must work too.
    #[test]
    fn eval_context_new_accepts_owned_string() {
        let ctx = EvalContext::new(String::from("owned"));
        assert_eq!(ctx.session_id, "owned");
    }

    /// Each hook maps to its expected snake_case label.
    #[test]
    fn eval_hook_as_str() {
        assert_eq!(EvalHook::BeforeModelCall.as_str(), "before_model_call");
        assert_eq!(EvalHook::AfterModelCall.as_str(), "after_model_call");
        assert_eq!(EvalHook::PreToolCall.as_str(), "pre_tool_call");
        assert_eq!(EvalHook::PostToolCall.as_str(), "post_tool_call");
        assert_eq!(EvalHook::OnRunFinished.as_str(), "on_run_finished");
    }

    /// No two hooks may share a label, otherwise logs and events become ambiguous.
    #[test]
    fn eval_hook_labels_are_unique() {
        let hooks = [
            EvalHook::BeforeModelCall,
            EvalHook::AfterModelCall,
            EvalHook::PreToolCall,
            EvalHook::PostToolCall,
            EvalHook::OnRunFinished,
        ];
        let labels: std::collections::HashSet<&'static str> =
            hooks.iter().map(|hook| hook.as_str()).collect();
        assert_eq!(labels.len(), hooks.len());
    }
}