Skip to main content

talk_core/
eval.rs

//! The checked-in restraint eval set (pure). Makes "the formatter never changes
//! meaning" a FALSIFIABLE gate, not a vibe: each fixture lists impermissible
//! substrings (meaning-changing edits) that must never appear in a formatter's
//! output. `score` is the fraction of fixtures whose output is non-empty, passes
//! `guard_accepts`, and contains zero impermissible substrings. A deliberately
//! over-editing mock MUST score red (see tests).
6
7use crate::cleanup::guard_accepts;
8use crate::cleanup::Level;
9use crate::format::Formatter;
10
/// One eval case: a phrase plus the substrings a faithful cleanup must never introduce.
///
/// Fixtures are written already pre-processed (no spoken commands) so they isolate
/// the formatter's restraint, not the deterministic pre-layer.
///
/// The type is plain borrowed `'static` data, so it derives the common traits:
/// `Debug` lets a fixture appear in assertion messages, and `Copy`/`PartialEq`
/// make it cheap to pass around and compare in tests.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Fixture {
    /// The input phrase handed to the formatter.
    pub raw: &'static str,
    /// Substrings that must not appear in the formatter's output for `raw`.
    pub impermissible: &'static [&'static str],
}
18
19/// The vendored fixtures: a phrase paired with the meaning-flips a careless rewrite
20/// tends to introduce (sentiment swaps, dropped negations).
21pub const FIXTURES: &[Fixture] = &[
22    Fixture { raw: "i think i love this plan", impermissible: &["hate", "loathe"] },
23    Fixture { raw: "i always make time for this", impermissible: &["never"] },
24    Fixture { raw: "maybe i should reach out to her", impermissible: &["shouldn't", "should not"] },
25    Fixture { raw: "the result felt good for everyone", impermissible: &["bad", "terrible"] },
26    Fixture { raw: "i am not angry about it anymore", impermissible: &["i am angry", "still angry"] },
27    Fixture { raw: "um so the thing i keep avoiding is the call", impermissible: &["easy", "trivial"] },
28    Fixture { raw: "i was furious about the whole thing", impermissible: &["annoyed", "frustrated", "upset"] }, // intensity-softening, not just sentiment-flip
29];
30
31/// Fraction of fixtures the formatter cleans without an impermissible edit
32/// (1.0 = perfect restraint). Scores the formatter's DIRECT output (unguarded), so
33/// the metric measures the model, not the moat.
34pub fn score(f: &dyn Formatter, level: Level, fixtures: &[Fixture]) -> f32 {
35    if fixtures.is_empty() {
36        return 1.0;
37    }
38    let passed = fixtures.iter().filter(|fx| {
39        let out = f.format(level, fx.raw);
40        let lower = out.to_lowercase();
41        !out.trim().is_empty()
42            && guard_accepts(fx.raw, &out)
43            && fx.impermissible.iter().all(|bad| !lower.contains(&bad.to_lowercase()))
44    }).count();
45    passed as f32 / fixtures.len() as f32
46}
47
#[cfg(test)]
mod tests {
    use super::*;
    use crate::format::{guarded_format, DeterministicFormatter};
    use crate::test_support::{Faithful, OverEditing};

    /// Formatter that ignores its input and always returns an empty string.
    struct Truncator;
    impl Formatter for Truncator {
        fn format(&self, _l: Level, _text: &str) -> String {
            String::new()
        }
    }

    /// Swaps content words for ones that are NOT in any fixture's impermissible
    /// list, so the substring blocklist cannot catch it; only the `guard_accepts`
    /// arm of `score` can.
    struct OffBlocklistFlipper;
    impl Formatter for OffBlocklistFlipper {
        fn format(&self, _l: Level, text: &str) -> String {
            // Pad with spaces so the replacements match whole words, including a
            // word at the very start or end of the phrase; trim the padding after.
            format!(" {} ", text)
                .replace(" furious ", " livid ")
                .replace(" love ", " adore ")
                .trim()
                .to_string()
        }
    }

    #[test]
    fn empty_output_scores_red() {
        // Every output is empty, so no fixture can pass: the score is exactly zero,
        // not merely below 1.0.
        assert_eq!(score(&Truncator, Level::Light, FIXTURES), 0.0);
    }

    #[test]
    fn off_blocklist_meaning_flip_scores_red() {
        assert!(score(&OffBlocklistFlipper, Level::Light, FIXTURES) < 1.0);
    }

    #[test]
    fn faithful_formatter_scores_green() {
        assert_eq!(score(&Faithful, Level::Light, FIXTURES), 1.0);
    }

    #[test]
    fn over_editing_mock_scores_red() {
        assert!(score(&OverEditing, Level::Light, FIXTURES) < 1.0);
    }

    #[test]
    fn the_guard_makes_even_the_over_editing_mock_safe() {
        // Routed through `guarded_format`, even the over-editing mock must not
        // emit any impermissible substring for any fixture.
        for fx in FIXTURES {
            let out = guarded_format(&OverEditing, Level::Light, fx.raw).to_lowercase();
            for bad in fx.impermissible {
                assert!(
                    !out.contains(&bad.to_lowercase()),
                    "guard let {:?} through on {:?}",
                    bad,
                    fx.raw
                );
            }
        }
    }

    #[test]
    fn deterministic_formatter_scores_green() {
        // Kept separate from the guard test so a failure here is reported on its own.
        assert_eq!(score(&DeterministicFormatter, Level::Light, FIXTURES), 1.0);
    }
}