Skip to main content

noether_engine/
stage_test.rs

1//! Behavioral verification of a stage against its own examples.
2//!
//! Every stage ships with `examples: [{input, output}, ...]` used for
//! semantic search, documentation, and — in this module — as a minimal
//! behavioral test suite. [`verify_stage`] runs each example through an
//! executor and compares the actual output against the declared output
//! by comparing their JCS-canonical JSON encodings.
8//!
9//! ## What gets tested
10//!
//! The [`StageSkipReason`] enum identifies stages whose example outputs
//! are **illustrative, not reproducible**: network calls, LLM inference,
//! explicitly non-deterministic effects, and process effects. For
//! those, a canonical comparison would fail spuriously; callers receive a
//! `Skipped { reason }` outcome instead of a failing `Tested` one so CI
//! gates can be set accordingly. Stages that declare no examples, or that
//! the executor has no implementation for, are reported as skipped too.
17//!
18//! For everything else (`Pure`, plain `Fallible`) the comparison is
19//! exact — if the implementation returns the wrong shape or the wrong
20//! value, the test fails.
21
22use noether_core::effects::Effect;
23use noether_core::stage::Stage;
24use serde_json::Value;
25
26use crate::executor::{ExecutionError, StageExecutor};
27
28/// Per-example verification outcome.
29#[derive(Debug, Clone, PartialEq)]
30pub enum ExampleOutcome {
31    /// Implementation produced the declared output (canonical hash match).
32    Ok,
33    /// Implementation produced a different output.
34    Mismatch { expected: Value, actual: Value },
35    /// Executor returned an error.
36    Errored { message: String },
37}
38
/// Reason a stage was left out of behavioral verification.
///
/// A skipped stage has no verdict: its examples were never compared
/// against executor output.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StageSkipReason {
    /// The stage declares `Effect::Network`.
    Network,
    /// The stage declares `Effect::Llm`.
    Llm,
    /// The stage declares `Effect::NonDeterministic`.
    NonDeterministic,
    /// The stage declares `Effect::Process` — touches external process state.
    Process,
    /// The stage declares no examples, so there is nothing to compare.
    NoExamples,
    /// The executor reported that it has no implementation for the stage ID.
    NoImplementation,
}

impl std::fmt::Display for StageSkipReason {
    /// Writes a short, human-readable explanation of the skip reason.
    ///
    /// The text is written verbatim; width/padding flags on the formatter
    /// are not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Pick the static message first, then emit it with a single write.
        let text = match self {
            Self::Network => "network effect — example outputs are illustrative",
            Self::Llm => "LLM effect — output is non-reproducible",
            Self::NonDeterministic => "non-deterministic effect",
            Self::Process => "process effect — side-effectful",
            Self::NoExamples => "no examples declared",
            Self::NoImplementation => "no implementation available in this executor",
        };
        f.write_str(text)
    }
}
68
69/// Verification report for a single stage.
70#[derive(Debug, Clone)]
71pub struct StageReport {
72    pub stage_id: String,
73    pub description: String,
74    pub outcome: ReportOutcome,
75}
76
77#[derive(Debug, Clone)]
78pub enum ReportOutcome {
79    /// Stage was skipped — no verdict.
80    Skipped { reason: StageSkipReason },
81    /// Stage was tested. Individual example results live in `examples`.
82    Tested { examples: Vec<ExampleOutcome> },
83}
84
85impl StageReport {
86    /// True when every example matched (or the stage was skipped).
87    pub fn passed(&self) -> bool {
88        match &self.outcome {
89            ReportOutcome::Skipped { .. } => true,
90            ReportOutcome::Tested { examples } => {
91                examples.iter().all(|e| matches!(e, ExampleOutcome::Ok))
92            }
93        }
94    }
95
96    /// True when any example failed to match the declared output.
97    pub fn failed(&self) -> bool {
98        matches!(&self.outcome, ReportOutcome::Tested { examples }
99            if examples.iter().any(|e| !matches!(e, ExampleOutcome::Ok)))
100    }
101}
102
103/// Decide whether a stage's behavioral verification is meaningful.
104fn skip_reason(stage: &Stage) -> Option<StageSkipReason> {
105    if stage.examples.is_empty() {
106        return Some(StageSkipReason::NoExamples);
107    }
108    for effect in stage.signature.effects.iter() {
109        match effect {
110            Effect::Network => return Some(StageSkipReason::Network),
111            Effect::Llm { .. } => return Some(StageSkipReason::Llm),
112            Effect::NonDeterministic => return Some(StageSkipReason::NonDeterministic),
113            Effect::Process => return Some(StageSkipReason::Process),
114            _ => {}
115        }
116    }
117    None
118}
119
120/// Canonical comparison — two JSON values are equal iff their
121/// JCS-canonical byte strings match. Tolerates field-order differences
122/// and numeric canonicalisation (`1.0` vs `1`).
123fn canonical_eq(a: &Value, b: &Value) -> bool {
124    match (serde_jcs::to_vec(a), serde_jcs::to_vec(b)) {
125        (Ok(x), Ok(y)) => x == y,
126        _ => a == b, // fall back to structural equality if JCS fails
127    }
128}
129
130/// Run every example through the executor and return a report.
131///
132/// Skipped stages short-circuit before touching the executor. For tested
133/// stages, each example produces an [`ExampleOutcome`].
134pub fn verify_stage<E: StageExecutor>(stage: &Stage, executor: &E) -> StageReport {
135    if let Some(reason) = skip_reason(stage) {
136        return StageReport {
137            stage_id: stage.id.0.clone(),
138            description: stage.description.clone(),
139            outcome: ReportOutcome::Skipped { reason },
140        };
141    }
142
143    let mut examples = Vec::with_capacity(stage.examples.len());
144    for example in &stage.examples {
145        let outcome = match executor.execute(&stage.id, &example.input) {
146            Ok(actual) => {
147                if canonical_eq(&actual, &example.output) {
148                    ExampleOutcome::Ok
149                } else {
150                    ExampleOutcome::Mismatch {
151                        expected: example.output.clone(),
152                        actual,
153                    }
154                }
155            }
156            Err(ExecutionError::StageNotFound(_)) => {
157                return StageReport {
158                    stage_id: stage.id.0.clone(),
159                    description: stage.description.clone(),
160                    outcome: ReportOutcome::Skipped {
161                        reason: StageSkipReason::NoImplementation,
162                    },
163                };
164            }
165            Err(e) => ExampleOutcome::Errored {
166                message: format!("{e}"),
167            },
168        };
169        examples.push(outcome);
170    }
171
172    StageReport {
173        stage_id: stage.id.0.clone(),
174        description: stage.description.clone(),
175        outcome: ReportOutcome::Tested { examples },
176    }
177}
178
#[cfg(test)]
mod tests {
    //! Unit tests for stage verification, driven by small mock executors.

    use super::*;
    use crate::executor::ExecutionError;
    use noether_core::effects::EffectSet;
    use noether_core::stage::{CostEstimate, Example, Stage, StageId, StageSignature};
    use noether_core::types::NType;
    use serde_json::json;
    use std::collections::BTreeSet;

    /// Mock executor that returns a fixed value, ignoring input.
    struct ConstExec {
        out: Value,
    }

    impl StageExecutor for ConstExec {
        fn execute(&self, _id: &StageId, _input: &Value) -> Result<Value, ExecutionError> {
            Ok(self.out.clone())
        }
    }

    /// Mock executor that mirrors input back as output.
    struct EchoExec;

    impl StageExecutor for EchoExec {
        fn execute(&self, _id: &StageId, input: &Value) -> Result<Value, ExecutionError> {
            Ok(input.clone())
        }
    }

    /// Builds a minimal stage with the given effects and examples; every
    /// other field is a neutral placeholder.
    fn make_stage(effects: EffectSet, examples: Vec<Example>) -> Stage {
        Stage {
            id: StageId("test-stage".into()),
            signature_id: None,
            signature: StageSignature {
                input: NType::Any,
                output: NType::Any,
                effects,
                implementation_hash: "hash".into(),
            },
            capabilities: BTreeSet::new(),
            cost: CostEstimate {
                time_ms_p50: None,
                tokens_est: None,
                memory_mb: None,
            },
            description: "test".into(),
            examples,
            lifecycle: noether_core::stage::StageLifecycle::Active,
            ed25519_signature: None,
            signer_public_key: None,
            implementation_code: None,
            implementation_language: None,
            ui_style: None,
            tags: vec![],
            aliases: vec![],
            name: None,
            properties: Vec::new(),
        }
    }

    /// Example whose input and output are both JSON `null`; used where the
    /// stage is expected to be skipped before the executor is consulted.
    fn null_example() -> Example {
        Example {
            input: json!(null),
            output: json!(null),
        }
    }

    /// Asserts that `report` is a skip with exactly `expected` as the reason,
    /// and that a skipped report counts as passed and not failed.
    fn assert_skipped(report: &StageReport, expected: StageSkipReason) {
        match &report.outcome {
            ReportOutcome::Skipped { reason } => assert_eq!(*reason, expected),
            other => panic!("expected Skipped({expected:?}), got {other:?}"),
        }
        assert!(report.passed());
        assert!(!report.failed());
    }

    #[test]
    fn pure_stage_passes_when_executor_matches() {
        let stage = make_stage(
            EffectSet::pure(),
            vec![Example {
                input: json!({"x": 1}),
                output: json!({"x": 1}),
            }],
        );
        let report = verify_stage(&stage, &EchoExec);
        assert!(report.passed());
        assert!(!report.failed());
    }

    #[test]
    fn pure_stage_fails_when_executor_diverges() {
        let stage = make_stage(
            EffectSet::pure(),
            vec![Example {
                input: json!({"x": 1}),
                output: json!({"x": 2}),
            }],
        );
        let report = verify_stage(
            &stage,
            &ConstExec {
                out: json!({"x": 1}),
            },
        );
        assert!(report.failed());
        assert!(!report.passed());

        // The mismatch must carry both the declared and the actual value.
        match report.outcome {
            ReportOutcome::Tested { examples } => assert_eq!(
                examples,
                vec![ExampleOutcome::Mismatch {
                    expected: json!({"x": 2}),
                    actual: json!({"x": 1}),
                }]
            ),
            other => panic!("expected Tested, got {other:?}"),
        }
    }

    #[test]
    fn outcomes_follow_example_order() {
        let stage = make_stage(
            EffectSet::pure(),
            vec![
                Example {
                    input: json!({"x": 1}),
                    output: json!({"x": 1}),
                },
                Example {
                    input: json!({"x": 2}),
                    output: json!({"x": 3}),
                },
            ],
        );
        let report = verify_stage(&stage, &EchoExec);
        // One bad example is enough to fail the whole stage.
        assert!(report.failed());

        match report.outcome {
            ReportOutcome::Tested { examples } => assert_eq!(
                examples,
                vec![
                    ExampleOutcome::Ok,
                    ExampleOutcome::Mismatch {
                        expected: json!({"x": 3}),
                        actual: json!({"x": 2}),
                    },
                ]
            ),
            other => panic!("expected Tested, got {other:?}"),
        }
    }

    #[test]
    fn network_stage_is_skipped() {
        let stage = make_stage(EffectSet::new(vec![Effect::Network]), vec![null_example()]);
        let report = verify_stage(&stage, &EchoExec);
        assert_skipped(&report, StageSkipReason::Network);
    }

    #[test]
    fn llm_stage_is_skipped() {
        let stage = make_stage(
            EffectSet::new(vec![Effect::Llm {
                model: "any".into(),
            }]),
            vec![null_example()],
        );
        let report = verify_stage(&stage, &EchoExec);
        assert_skipped(&report, StageSkipReason::Llm);
    }

    #[test]
    fn non_deterministic_stage_is_skipped() {
        let stage = make_stage(
            EffectSet::new(vec![Effect::NonDeterministic]),
            vec![null_example()],
        );
        let report = verify_stage(&stage, &EchoExec);
        assert_skipped(&report, StageSkipReason::NonDeterministic);
    }

    #[test]
    fn process_stage_is_skipped() {
        let stage = make_stage(EffectSet::new(vec![Effect::Process]), vec![null_example()]);
        let report = verify_stage(&stage, &EchoExec);
        assert_skipped(&report, StageSkipReason::Process);
    }

    #[test]
    fn canonical_eq_ignores_field_order_and_numeric_form() {
        assert!(canonical_eq(
            &json!({"a": 1, "b": 2}),
            &json!({"b": 2, "a": 1})
        ));
        assert!(canonical_eq(&json!(1.0), &json!(1)));
        assert!(!canonical_eq(&json!({"a": 1}), &json!({"a": 2})));
    }

    #[test]
    fn no_examples_is_skipped() {
        let stage = make_stage(EffectSet::pure(), vec![]);
        let report = verify_stage(&stage, &EchoExec);
        assert_skipped(&report, StageSkipReason::NoExamples);
    }

    #[test]
    fn no_examples_takes_precedence_over_effects() {
        // The empty-examples check runs before effects are inspected.
        let stage = make_stage(EffectSet::new(vec![Effect::Network]), vec![]);
        let report = verify_stage(&stage, &EchoExec);
        assert_skipped(&report, StageSkipReason::NoExamples);
    }
}