Skip to main content

sqlite_graphrag/
preservation.rs

1//! Preservation checks for LLM-enriched memory bodies (G29 Passo 4).
2//!
3//! When a language model rewrites a memory body, the operator must be
4//! protected against silent hallucination: the LLM may invent facts, drop
5//! key terms, or drift semantically far from the source. This module
6//! provides a lightweight, deterministic similarity metric that runs
7//! locally without any model call, so the gate can be enforced before the
8//! enriched body touches persistent storage.
9//!
10//! The default metric is a normalised trigram-Jaccard similarity computed
11//! on the union of `set_a` and `set_b`. The score is in `[0.0, 1.0]`,
12//! where `1.0` means the two inputs share every trigram and `0.0` means
13//! they share none. The threshold default of `0.7` follows the gap G29
14//! specification, with `--preserve-threshold <F>` letting operators tune
15//! it per workload.
16//!
17//! # Examples
18//!
19//! ```
20//! use sqlite_graphrag::preservation::{jaccard_similarity, PreservationVerdict};
21//!
22//! let score = jaccard_similarity("the quick brown fox", "the quick brown fox!");
23//! assert!(score > 0.8);
24//!
25//! let verdict =
26//!     PreservationVerdict::evaluate("the quick brown fox", "the quick brown fox!", 0.7);
27//! assert!(matches!(verdict, PreservationVerdict::Preserved { .. }));
28//!
29//! let verdict = PreservationVerdict::evaluate("orig body", "rewritten body", 0.7);
30//! assert!(matches!(verdict, PreservationVerdict::Rejected { .. }));
31//! ```
32
33use serde::{Deserialize, Serialize};
34use std::collections::HashSet;
35
36/// Computes the trigram-Jaccard similarity between two strings.
37///
38/// The score is `|A ∩ B| / |A ∪ B|` where `A` and `B` are the sets of
39/// character-trigrams extracted from each input. The trigrams are taken
40/// over Unicode scalar values via `char_indices`, so the function is
41/// safe to call on multi-byte UTF-8 inputs without byte-boundary errors.
42///
43/// # Edge cases
44///
45/// - Both inputs empty: returns `1.0` (the empty trigram set is trivially
46///   contained in itself).
47/// - One input empty, the other non-empty: returns `0.0` (no overlap).
48/// - Identical inputs: returns `1.0`.
49///
50/// The function is pure: no I/O, no allocation beyond the two trigram
51/// sets, deterministic for a given pair of inputs. It is safe to call
52/// in hot paths.
53pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
54    let set_a = trigrams(a);
55    let set_b = trigrams(b);
56    if set_a.is_empty() && set_b.is_empty() {
57        return 1.0;
58    }
59    let intersection = set_a.intersection(&set_b).count() as f64;
60    let union = set_a.union(&set_b).count() as f64;
61    if union == 0.0 {
62        0.0
63    } else {
64        intersection / union
65    }
66}
67
68/// Extracts the set of character-trigrams from a string.
69///
70/// Padding handles short strings: inputs with fewer than three characters
71/// are represented by the unique chars they do contain (with the
72/// `[c, '\0', '\0']` padding), which guarantees that two identical
73/// short strings still produce the same trigram set and score `1.0`.
74fn trigrams(input: &str) -> HashSet<[char; 3]> {
75    let chars: Vec<char> = input.chars().collect();
76    if chars.is_empty() {
77        return HashSet::new();
78    }
79    let mut out: HashSet<[char; 3]> = HashSet::with_capacity(chars.len().saturating_add(2));
80    let mut window: [char; 3] = ['\0', '\0', '\0'];
81    for (i, ch) in chars.iter().enumerate() {
82        window[0] = if i >= 1 { chars[i - 1] } else { '\0' };
83        window[1] = *ch;
84        window[2] = if i + 1 < chars.len() {
85            chars[i + 1]
86        } else {
87            '\0'
88        };
89        out.insert(window);
90    }
91    out
92}
93
94/// Outcome of a preservation evaluation against a configurable threshold.
95///
96/// `PreservationVerdict` is the wire type the enrich pipeline emits in its
97/// NDJSON stream: every body-enrich attempt ends in one of the four
98/// variants so callers can route the result without re-running the
99/// similarity computation.
100#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
101#[serde(tag = "verdict", rename_all = "snake_case")]
102pub enum PreservationVerdict {
103    /// The rewritten body is at least `threshold`-similar to the original.
104    Preserved { score: f64, threshold: f64 },
105    /// The rewritten body diverges too much from the original and was
106    /// rejected by the gate.
107    Rejected { score: f64, threshold: f64 },
108    /// The original and rewritten bodies are byte-equal (no rewrite was
109    /// needed); preserved by definition.
110    Unchanged { byte_len: usize },
111}
112
113impl PreservationVerdict {
114    /// Evaluates the gate against `threshold` and returns the matching
115    /// variant. The threshold is clamped to `[0.0, 1.0]` defensively; an
116    /// out-of-range value does not panic the caller.
117    pub fn evaluate(original: &str, rewritten: &str, threshold: f64) -> Self {
118        let threshold = threshold.clamp(0.0, 1.0);
119        if original == rewritten {
120            return Self::Unchanged {
121                byte_len: original.len(),
122            };
123        }
124        let score = jaccard_similarity(original, rewritten);
125        if score >= threshold {
126            Self::Preserved { score, threshold }
127        } else {
128            Self::Rejected { score, threshold }
129        }
130    }
131
132    /// Returns `true` when the gate accepted the rewrite.
133    pub fn is_accepted(&self) -> bool {
134        matches!(self, Self::Preserved { .. } | Self::Unchanged { .. })
135    }
136}
137
138#[cfg(test)]
139mod tests {
140    use super::*;
141
142    #[test]
143    fn identical_strings_score_one() {
144        let s = "the quick brown fox jumps over the lazy dog";
145        assert!((jaccard_similarity(s, s) - 1.0).abs() < f64::EPSILON);
146    }
147
148    #[test]
149    fn completely_different_strings_score_zero_or_near_zero() {
150        let a = "aaaaaaaaaa";
151        let b = "zzzzzzzzzz";
152        assert!(jaccard_similarity(a, b) < 0.05);
153    }
154
155    #[test]
156    fn partial_overlap_scores_between_zero_and_one() {
157        let a = "the quick brown fox jumps";
158        let b = "the slow brown cat sleeps";
159        let score = jaccard_similarity(a, b);
160        assert!(score > 0.0 && score < 1.0, "got {score}");
161    }
162
163    #[test]
164    fn both_empty_score_one() {
165        assert!((jaccard_similarity("", "") - 1.0).abs() < f64::EPSILON);
166    }
167
168    #[test]
169    fn one_empty_scores_zero() {
170        assert!(jaccard_similarity("hello", "").abs() < f64::EPSILON);
171        assert!(jaccard_similarity("", "hello").abs() < f64::EPSILON);
172    }
173
174    #[test]
175    fn unicode_strings_do_not_panic() {
176        // Multi-byte UTF-8: 1 char each, very short.
177        let a = "ç日本語";
178        let b = "ç中文";
179        let _ = jaccard_similarity(a, b);
180    }
181
182    #[test]
183    fn verdict_preserved_when_above_threshold() {
184        let v = PreservationVerdict::evaluate("hello world", "hello world!", 0.5);
185        assert!(v.is_accepted());
186        assert!(matches!(v, PreservationVerdict::Preserved { .. }));
187    }
188
189    #[test]
190    fn verdict_unchanged_for_identical() {
191        let v = PreservationVerdict::evaluate("same", "same", 0.9);
192        assert!(v.is_accepted());
193        assert!(matches!(v, PreservationVerdict::Unchanged { byte_len: 4 }));
194    }
195
196    #[test]
197    fn threshold_clamped_out_of_range() {
198        // Threshold above 1.0 is clamped to 1.0: identical bodies match
199        // by the `Unchanged` short-circuit, accepted.
200        let v = PreservationVerdict::evaluate("abc", "abc", 99.0);
201        assert!(v.is_accepted());
202        // Threshold below 0.0 is clamped to 0.0: every non-empty rewrite
203        // meets a 0.0 floor and is accepted. This is the documented
204        // behaviour of `clamp(0.0, 1.0)` and is the only sane reading
205        // once a negative threshold is no longer in scope.
206        let v = PreservationVerdict::evaluate("abc", "xyz", -5.0);
207        assert!(v.is_accepted());
208        // Threshold of exactly 0.0 accepts only identical bodies; even
209        // a single-character drift fails the gate.
210        let v = PreservationVerdict::evaluate("abc", "abcd", 0.0);
211        assert!(
212            v.is_accepted(),
213            "single-char append is mostly the same body"
214        );
215    }
216
217    #[test]
218    fn g29_repro_evaluates_rejected_when_diverges() {
219        // G29 reproducer: LLM rewrites a body and drifts far from source.
220        let original = "JWT token rotation strategy with 15-min expiry and refresh flow";
221        let drifted = "The weather in Tokyo is sunny today with mild temperatures expected";
222        let v = PreservationVerdict::evaluate(original, drifted, 0.7);
223        assert!(!v.is_accepted(), "should reject hallucinated rewrite");
224    }
225}