Skip to main content

agentic_eval/
determinism.rs

1//! Determinism: does a program produce byte-identical output across runs?
2//!
3//! Agents parse, cache, and diff program output. Non-deterministic output (locale
4//! dates, hash-ordered maps, terminal-width-dependent tables, embedded timestamps)
5//! breaks all three: the agent can't reuse a cache, a diff is all noise, and a
6//! parser keyed on column positions desyncs. This module runs an output-producer
7//! repeatedly and reports whether every run was identical.
8
9use std::collections::BTreeSet;
10
/// The result of a determinism assessment.
///
/// All fields are plain data, so the type derives `PartialEq`/`Eq` in addition
/// to `Debug`/`Clone`; this lets callers compare two reports directly (for
/// example with `assert_eq!`).
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DeterminismReport {
    /// How many times the producer was run.
    pub runs: usize,
    /// Number of *distinct* outputs observed (1 = fully deterministic).
    pub distinct: usize,
    /// True iff every run produced byte-identical output.
    pub deterministic: bool,
    /// The first output (representative sample).
    pub first: String,
}
24
25/// Run `produce` `runs` times and report whether all outputs are byte-identical.
26/// `runs` is clamped to ≥ 2 (one run can't show non-determinism).
27pub fn assess_determinism(runs: usize, mut produce: impl FnMut() -> String) -> DeterminismReport {
28    let runs = runs.max(2);
29    let mut seen: BTreeSet<String> = BTreeSet::new();
30    let mut first = String::new();
31    for i in 0..runs {
32        let out = produce();
33        if i == 0 {
34            first = out.clone();
35        }
36        seen.insert(out);
37    }
38    DeterminismReport {
39        runs,
40        distinct: seen.len(),
41        deterministic: seen.len() == 1,
42        first,
43    }
44}
45
46impl std::fmt::Display for DeterminismReport {
47    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
48        write!(
49            f,
50            "{} ({} distinct / {} runs)",
51            if self.deterministic {
52                "deterministic"
53            } else {
54                "NON-deterministic"
55            },
56            self.distinct,
57            self.runs
58        )
59    }
60}
61
/// Cross-representation determinism check: returns `true` only when the two
/// renderings `a` and `b` of the *same* value consist of exactly the same bytes
/// (e.g. to confirm a canonical encoder emits identical output regardless of
/// input key order).
pub fn stable_across(a: &str, b: &str) -> bool {
    let (left, right) = (a.as_bytes(), b.as_bytes());
    left == right
}
68
// Unit tests for the determinism helpers: the original four cases plus coverage
// for the `first` sample, the exact producer call count under clamping,
// de-duplication of repeated divergent outputs, and the `Display` summary.
#[cfg(test)]
mod tests {
    use super::*;
    use std::cell::Cell;

    #[test]
    fn constant_output_is_deterministic() {
        let r = assess_determinism(5, || "name\tsize\nfoo\t10".to_string());
        assert!(r.deterministic);
        assert_eq!(r.distinct, 1);
        assert_eq!(r.runs, 5);
    }

    #[test]
    fn varying_output_is_flagged_nondeterministic() {
        let n = Cell::new(0);
        // Simulates an embedded timestamp / counter that changes each run.
        let r = assess_determinism(4, || {
            let v = n.get();
            n.set(v + 1);
            format!("rows=3 generated_at={v}")
        });
        assert!(!r.deterministic);
        assert_eq!(r.distinct, 4);
    }

    #[test]
    fn single_run_is_clamped_to_two() {
        let r = assess_determinism(1, || "x".to_string());
        assert_eq!(r.runs, 2);
        assert!(r.deterministic);
    }

    #[test]
    fn zero_runs_is_clamped_and_producer_called_twice() {
        let calls = Cell::new(0usize);
        let r = assess_determinism(0, || {
            calls.set(calls.get() + 1);
            "x".to_string()
        });
        assert_eq!(r.runs, 2);
        // The producer must be invoked exactly as many times as reported.
        assert_eq!(calls.get(), 2);
    }

    #[test]
    fn first_holds_the_initial_output() {
        let n = Cell::new(0);
        let r = assess_determinism(3, || {
            let v = n.get();
            n.set(v + 1);
            format!("run-{v}")
        });
        assert_eq!(r.first, "run-0");
    }

    #[test]
    fn repeated_divergent_outputs_are_counted_once() {
        let n = Cell::new(0u32);
        // Output alternates a, b, a, b: four runs but only two distinct values.
        let r = assess_determinism(4, || {
            let v = n.get();
            n.set(v + 1);
            let s = if v % 2 == 0 { "a" } else { "b" };
            s.to_string()
        });
        assert!(!r.deterministic);
        assert_eq!(r.distinct, 2);
        assert_eq!(r.first, "a");
    }

    #[test]
    fn display_summarizes_verdict_and_counts() {
        let stable = assess_determinism(3, || "x".to_string());
        assert_eq!(stable.to_string(), "deterministic (1 distinct / 3 runs)");

        let n = Cell::new(0);
        let flaky = assess_determinism(2, || {
            let v = n.get();
            n.set(v + 1);
            format!("{v}")
        });
        assert_eq!(flaky.to_string(), "NON-deterministic (2 distinct / 2 runs)");
    }

    #[test]
    fn stable_across_compares_bytes() {
        assert!(stable_across(r#"{"a":1,"b":2}"#, r#"{"a":1,"b":2}"#));
        // Different key order → not byte-stable (a canonical encoder should avoid).
        assert!(!stable_across(r#"{"a":1,"b":2}"#, r#"{"b":2,"a":1}"#));
    }
}