Skip to main content

noether_engine/
stage_test.rs

//! Behavioral verification of a stage against its own examples.
//!
//! Every stage ships with `examples: [{input, output}, ...]` used for
//! semantic search, documentation, and — in this module — as a minimal
//! behavioral test suite. [`verify_stage`] runs each example through an
//! executor and compares the actual output against the declared output
//! by comparing their canonical JSON (JCS) byte strings.
//!
//! ## What gets tested
//!
//! The [`StageSkipReason`] enum identifies stages whose example outputs
//! are **illustrative, not reproducible**: network calls, LLM inference,
//! explicitly non-deterministic effects, and process effects. For
//! those, a comparison would fail spuriously; callers receive a
//! `Skipped { reason }` outcome instead of a failing one so CI gates can
//! be set accordingly. Stages with no examples, or with no implementation
//! in the executor, are reported as skipped as well.
//!
//! For everything else (`Pure`, plain `Fallible`) the comparison is
//! exact — if the implementation returns the wrong shape or the wrong
//! value, the test fails.
21
22use noether_core::effects::Effect;
23use noether_core::stage::Stage;
24use serde_json::Value;
25
26use crate::executor::{ExecutionError, StageExecutor};
27
28/// Per-example verification outcome.
29#[derive(Debug, Clone, PartialEq)]
30pub enum ExampleOutcome {
31    /// Implementation produced the declared output (canonical hash match).
32    Ok,
33    /// Implementation produced a different output.
34    Mismatch { expected: Value, actual: Value },
35    /// Executor returned an error.
36    Errored { message: String },
37}
38
/// Reason a stage was excluded from behavioral verification.
///
/// A skipped stage gets no verdict: either its examples cannot be expected
/// to reproduce exactly, or there is nothing to run.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StageSkipReason {
    /// The stage declares `Effect::Network`.
    Network,
    /// The stage declares `Effect::Llm`.
    Llm,
    /// The stage declares `Effect::NonDeterministic`.
    NonDeterministic,
    /// The stage declares `Effect::Process` — it touches external process
    /// state.
    Process,
    /// The stage declares no examples, so there is nothing to compare.
    NoExamples,
    /// The executor has no implementation registered for this stage ID.
    NoImplementation,
}

impl std::fmt::Display for StageSkipReason {
    /// Writes the fixed, human-readable explanation for this skip reason.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Each variant maps to a static message, so pick the text first and
        // emit it with a single write.
        let text = match self {
            Self::Network => "network effect — example outputs are illustrative",
            Self::Llm => "LLM effect — output is non-reproducible",
            Self::NonDeterministic => "non-deterministic effect",
            Self::Process => "process effect — side-effectful",
            Self::NoExamples => "no examples declared",
            Self::NoImplementation => "no implementation available in this executor",
        };
        f.write_str(text)
    }
}
68
69/// Verification report for a single stage.
70#[derive(Debug, Clone)]
71pub struct StageReport {
72    pub stage_id: String,
73    pub description: String,
74    pub outcome: ReportOutcome,
75}
76
77#[derive(Debug, Clone)]
78pub enum ReportOutcome {
79    /// Stage was skipped — no verdict.
80    Skipped { reason: StageSkipReason },
81    /// Stage was tested. Individual example results live in `examples`.
82    Tested { examples: Vec<ExampleOutcome> },
83}
84
85impl StageReport {
86    /// True when every example matched (or the stage was skipped).
87    pub fn passed(&self) -> bool {
88        match &self.outcome {
89            ReportOutcome::Skipped { .. } => true,
90            ReportOutcome::Tested { examples } => {
91                examples.iter().all(|e| matches!(e, ExampleOutcome::Ok))
92            }
93        }
94    }
95
96    /// True when any example failed to match the declared output.
97    pub fn failed(&self) -> bool {
98        matches!(&self.outcome, ReportOutcome::Tested { examples }
99            if examples.iter().any(|e| !matches!(e, ExampleOutcome::Ok)))
100    }
101}
102
103/// Decide whether a stage's behavioral verification is meaningful.
104fn skip_reason(stage: &Stage) -> Option<StageSkipReason> {
105    if stage.examples.is_empty() {
106        return Some(StageSkipReason::NoExamples);
107    }
108    for effect in stage.signature.effects.iter() {
109        match effect {
110            Effect::Network => return Some(StageSkipReason::Network),
111            Effect::Llm { .. } => return Some(StageSkipReason::Llm),
112            Effect::NonDeterministic => return Some(StageSkipReason::NonDeterministic),
113            Effect::Process => return Some(StageSkipReason::Process),
114            _ => {}
115        }
116    }
117    None
118}
119
120/// Canonical comparison — two JSON values are equal iff their
121/// JCS-canonical byte strings match. Tolerates field-order differences
122/// and numeric canonicalisation (`1.0` vs `1`).
123fn canonical_eq(a: &Value, b: &Value) -> bool {
124    match (serde_jcs::to_vec(a), serde_jcs::to_vec(b)) {
125        (Ok(x), Ok(y)) => x == y,
126        _ => a == b, // fall back to structural equality if JCS fails
127    }
128}
129
130/// Run every example through the executor and return a report.
131///
132/// Skipped stages short-circuit before touching the executor. For tested
133/// stages, each example produces an [`ExampleOutcome`].
134pub fn verify_stage<E: StageExecutor>(stage: &Stage, executor: &E) -> StageReport {
135    if let Some(reason) = skip_reason(stage) {
136        return StageReport {
137            stage_id: stage.id.0.clone(),
138            description: stage.description.clone(),
139            outcome: ReportOutcome::Skipped { reason },
140        };
141    }
142
143    let mut examples = Vec::with_capacity(stage.examples.len());
144    for example in &stage.examples {
145        let outcome = match executor.execute(&stage.id, &example.input) {
146            Ok(actual) => {
147                if canonical_eq(&actual, &example.output) {
148                    ExampleOutcome::Ok
149                } else {
150                    ExampleOutcome::Mismatch {
151                        expected: example.output.clone(),
152                        actual,
153                    }
154                }
155            }
156            Err(ExecutionError::StageNotFound(_)) => {
157                return StageReport {
158                    stage_id: stage.id.0.clone(),
159                    description: stage.description.clone(),
160                    outcome: ReportOutcome::Skipped {
161                        reason: StageSkipReason::NoImplementation,
162                    },
163                };
164            }
165            Err(e) => ExampleOutcome::Errored {
166                message: format!("{e}"),
167            },
168        };
169        examples.push(outcome);
170    }
171
172    StageReport {
173        stage_id: stage.id.0.clone(),
174        description: stage.description.clone(),
175        outcome: ReportOutcome::Tested { examples },
176    }
177}
178
#[cfg(test)]
mod tests {
    //! Unit tests for stage verification, driven by two mock executors:
    //! one that echoes its input and one that returns a fixed value.

    use super::*;
    use crate::executor::ExecutionError;
    use noether_core::capability::Capability;
    use noether_core::effects::EffectSet;
    use noether_core::stage::{CostEstimate, Example, Stage, StageId, StageSignature};
    use noether_core::types::NType;
    use serde_json::json;
    use std::collections::BTreeSet;

    /// Mock executor that returns a fixed value, ignoring input.
    struct ConstExec {
        out: Value,
    }

    impl StageExecutor for ConstExec {
        fn execute(&self, _id: &StageId, _input: &Value) -> Result<Value, ExecutionError> {
            Ok(self.out.clone())
        }
    }

    /// Mock executor that mirrors input back as output.
    struct EchoExec;

    impl StageExecutor for EchoExec {
        fn execute(&self, _id: &StageId, input: &Value) -> Result<Value, ExecutionError> {
            Ok(input.clone())
        }
    }

    /// Build a minimal stage with the given effects and examples; every
    /// other field is filled with an inert placeholder.
    fn make_stage(effects: EffectSet, examples: Vec<Example>) -> Stage {
        Stage {
            id: StageId("test-stage".into()),
            canonical_id: None,
            signature: StageSignature {
                input: NType::Any,
                output: NType::Any,
                effects,
                implementation_hash: "hash".into(),
            },
            capabilities: BTreeSet::new(),
            cost: CostEstimate {
                time_ms_p50: None,
                tokens_est: None,
                memory_mb: None,
            },
            description: "test".into(),
            examples,
            lifecycle: noether_core::stage::StageLifecycle::Active,
            ed25519_signature: None,
            signer_public_key: None,
            implementation_code: None,
            implementation_language: None,
            ui_style: None,
            tags: vec![],
            aliases: vec![],
            name: None,
        }
    }

    /// A one-element example list, for tests that need just one case.
    fn one_example(input: Value, output: Value) -> Vec<Example> {
        vec![Example { input, output }]
    }

    #[test]
    fn pure_stage_passes_when_executor_matches() {
        let stage = make_stage(
            EffectSet::pure(),
            one_example(json!({"x": 1}), json!({"x": 1})),
        );
        let report = verify_stage(&stage, &EchoExec);
        assert!(report.passed());
        assert!(!report.failed());
    }

    #[test]
    fn pure_stage_fails_when_executor_diverges() {
        let stage = make_stage(
            EffectSet::pure(),
            one_example(json!({"x": 1}), json!({"x": 2})),
        );
        let report = verify_stage(
            &stage,
            &ConstExec {
                out: json!({"x": 1}),
            },
        );
        assert!(report.failed());
        assert!(!report.passed());
    }

    #[test]
    fn mismatch_carries_expected_and_actual() {
        let stage = make_stage(
            EffectSet::pure(),
            one_example(json!({"x": 1}), json!({"x": 2})),
        );
        let report = verify_stage(
            &stage,
            &ConstExec {
                out: json!({"x": 1}),
            },
        );
        match report.outcome {
            ReportOutcome::Tested { examples } => assert_eq!(
                examples,
                vec![ExampleOutcome::Mismatch {
                    expected: json!({"x": 2}),
                    actual: json!({"x": 1}),
                }]
            ),
            other => panic!("expected a tested outcome, got {other:?}"),
        }
    }

    #[test]
    fn network_stage_is_skipped() {
        let stage = make_stage(
            EffectSet::new(vec![Effect::Network]),
            one_example(json!(null), json!(null)),
        );
        let report = verify_stage(&stage, &EchoExec);
        assert!(matches!(
            report.outcome,
            ReportOutcome::Skipped {
                reason: StageSkipReason::Network
            }
        ));
    }

    #[test]
    fn llm_stage_is_skipped() {
        let stage = make_stage(
            EffectSet::new(vec![Effect::Llm {
                model: "any".into(),
            }]),
            one_example(json!(null), json!(null)),
        );
        let report = verify_stage(&stage, &EchoExec);
        assert!(matches!(
            report.outcome,
            ReportOutcome::Skipped {
                reason: StageSkipReason::Llm
            }
        ));
    }

    #[test]
    fn non_deterministic_stage_is_skipped() {
        let stage = make_stage(
            EffectSet::new(vec![Effect::NonDeterministic]),
            one_example(json!(null), json!(null)),
        );
        let report = verify_stage(&stage, &EchoExec);
        assert!(matches!(
            report.outcome,
            ReportOutcome::Skipped {
                reason: StageSkipReason::NonDeterministic
            }
        ));
    }

    #[test]
    fn process_stage_is_skipped() {
        let stage = make_stage(
            EffectSet::new(vec![Effect::Process]),
            one_example(json!(null), json!(null)),
        );
        let report = verify_stage(&stage, &EchoExec);
        assert!(matches!(
            report.outcome,
            ReportOutcome::Skipped {
                reason: StageSkipReason::Process
            }
        ));
    }

    #[test]
    fn skipped_report_passes_and_does_not_fail() {
        // Even though the executor would diverge, a skipped stage is never
        // executed and therefore never fails.
        let stage = make_stage(
            EffectSet::new(vec![Effect::Network]),
            one_example(json!(1), json!(2)),
        );
        let report = verify_stage(&stage, &EchoExec);
        assert!(report.passed());
        assert!(!report.failed());
    }

    #[test]
    fn canonical_eq_ignores_field_order_and_numeric_form() {
        assert!(canonical_eq(
            &json!({"a": 1, "b": 2}),
            &json!({"b": 2, "a": 1})
        ));
        assert!(canonical_eq(&json!(1.0), &json!(1)));
        assert!(!canonical_eq(&json!({"a": 1}), &json!({"a": 2})));
    }

    #[test]
    fn no_examples_is_skipped() {
        let stage = make_stage(EffectSet::pure(), vec![]);
        let report = verify_stage(&stage, &EchoExec);
        assert!(matches!(
            report.outcome,
            ReportOutcome::Skipped {
                reason: StageSkipReason::NoExamples
            }
        ));
    }

    #[test]
    fn no_examples_takes_precedence_over_effect_skips() {
        // The empty-examples check runs before effects are inspected.
        let stage = make_stage(EffectSet::new(vec![Effect::Network]), vec![]);
        let report = verify_stage(&stage, &EchoExec);
        assert!(matches!(
            report.outcome,
            ReportOutcome::Skipped {
                reason: StageSkipReason::NoExamples
            }
        ));
    }

    // Silence the unused warning on the Capability import in certain test
    // configurations.
    #[allow(dead_code)]
    fn _capability_use() -> Capability {
        Capability::Network
    }
}