sqlite_graphrag/preservation.rs
1//! Preservation checks for LLM-enriched memory bodies (G29 Passo 4).
2//!
3//! When a language model rewrites a memory body, the operator must be
4//! protected against silent hallucination: the LLM may invent facts, drop
5//! key terms, or drift semantically far from the source. This module
6//! provides a lightweight, deterministic similarity metric that runs
7//! locally without any model call, so the gate can be enforced before the
8//! enriched body touches persistent storage.
9//!
10//! The default metric is a normalised trigram-Jaccard similarity computed
11//! on the union of `set_a` and `set_b`. The score is in `[0.0, 1.0]`,
12//! where `1.0` means the two inputs share every trigram and `0.0` means
13//! they share none. The threshold default of `0.7` follows the gap G29
14//! specification, with `--preserve-threshold <F>` letting operators tune
15//! it per workload.
16//!
17//! # Examples
18//!
19//! ```
20//! use sqlite_graphrag::preservation::{jaccard_similarity, PreservationVerdict};
21//!
22//! let score = jaccard_similarity("the quick brown fox", "the quick red fox");
23//! assert!(score > 0.5);
24//!
25//! let verdict = PreservationVerdict::evaluate("orig body", "rewritten body", 0.7);
26//! assert!(matches!(verdict, PreservationVerdict::Preserved { .. }));
27//! ```
28
29use serde::{Deserialize, Serialize};
30use std::collections::HashSet;
31
32/// Computes the trigram-Jaccard similarity between two strings.
33///
34/// The score is `|A ∩ B| / |A ∪ B|` where `A` and `B` are the sets of
35/// character-trigrams extracted from each input. The trigrams are taken
36/// over Unicode scalar values via `char_indices`, so the function is
37/// safe to call on multi-byte UTF-8 inputs without byte-boundary errors.
38///
39/// # Edge cases
40///
41/// - Both inputs empty: returns `1.0` (the empty trigram set is trivially
42/// contained in itself).
43/// - One input empty, the other non-empty: returns `0.0` (no overlap).
44/// - Identical inputs: returns `1.0`.
45///
46/// The function is pure: no I/O, no allocation beyond the two trigram
47/// sets, deterministic for a given pair of inputs. It is safe to call
48/// in hot paths.
49pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
50 let set_a = trigrams(a);
51 let set_b = trigrams(b);
52 if set_a.is_empty() && set_b.is_empty() {
53 return 1.0;
54 }
55 let intersection = set_a.intersection(&set_b).count() as f64;
56 let union = set_a.union(&set_b).count() as f64;
57 if union == 0.0 {
58 0.0
59 } else {
60 intersection / union
61 }
62}
63
64/// Extracts the set of character-trigrams from a string.
65///
66/// Padding handles short strings: inputs with fewer than three characters
67/// are represented by the unique chars they do contain (with the
68/// `[c, '\0', '\0']` padding), which guarantees that two identical
69/// short strings still produce the same trigram set and score `1.0`.
70fn trigrams(input: &str) -> HashSet<[char; 3]> {
71 let chars: Vec<char> = input.chars().collect();
72 if chars.is_empty() {
73 return HashSet::new();
74 }
75 let mut out: HashSet<[char; 3]> = HashSet::with_capacity(chars.len().saturating_add(2));
76 let mut window: [char; 3] = ['\0', '\0', '\0'];
77 for (i, ch) in chars.iter().enumerate() {
78 window[0] = if i >= 1 { chars[i - 1] } else { '\0' };
79 window[1] = *ch;
80 window[2] = if i + 1 < chars.len() {
81 chars[i + 1]
82 } else {
83 '\0'
84 };
85 out.insert(window);
86 }
87 out
88}
89
90/// Outcome of a preservation evaluation against a configurable threshold.
91///
92/// `PreservationVerdict` is the wire type the enrich pipeline emits in its
93/// NDJSON stream: every body-enrich attempt ends in one of the four
94/// variants so callers can route the result without re-running the
95/// similarity computation.
96#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
97#[serde(tag = "verdict", rename_all = "snake_case")]
98pub enum PreservationVerdict {
99 /// The rewritten body is at least `threshold`-similar to the original.
100 Preserved { score: f64, threshold: f64 },
101 /// The rewritten body diverges too much from the original and was
102 /// rejected by the gate.
103 Rejected { score: f64, threshold: f64 },
104 /// The original and rewritten bodies are byte-equal (no rewrite was
105 /// needed); preserved by definition.
106 Unchanged { byte_len: usize },
107}
108
109impl PreservationVerdict {
110 /// Evaluates the gate against `threshold` and returns the matching
111 /// variant. The threshold is clamped to `[0.0, 1.0]` defensively; an
112 /// out-of-range value does not panic the caller.
113 pub fn evaluate(original: &str, rewritten: &str, threshold: f64) -> Self {
114 let threshold = threshold.clamp(0.0, 1.0);
115 if original == rewritten {
116 return Self::Unchanged {
117 byte_len: original.len(),
118 };
119 }
120 let score = jaccard_similarity(original, rewritten);
121 if score >= threshold {
122 Self::Preserved { score, threshold }
123 } else {
124 Self::Rejected { score, threshold }
125 }
126 }
127
128 /// Returns `true` when the gate accepted the rewrite.
129 pub fn is_accepted(&self) -> bool {
130 matches!(self, Self::Preserved { .. } | Self::Unchanged { .. })
131 }
132}
133
134#[cfg(test)]
135mod tests {
136 use super::*;
137
138 #[test]
139 fn identical_strings_score_one() {
140 let s = "the quick brown fox jumps over the lazy dog";
141 assert!((jaccard_similarity(s, s) - 1.0).abs() < f64::EPSILON);
142 }
143
144 #[test]
145 fn completely_different_strings_score_zero_or_near_zero() {
146 let a = "aaaaaaaaaa";
147 let b = "zzzzzzzzzz";
148 assert!(jaccard_similarity(a, b) < 0.05);
149 }
150
151 #[test]
152 fn partial_overlap_scores_between_zero_and_one() {
153 let a = "the quick brown fox jumps";
154 let b = "the slow brown cat sleeps";
155 let score = jaccard_similarity(a, b);
156 assert!(score > 0.0 && score < 1.0, "got {score}");
157 }
158
159 #[test]
160 fn both_empty_score_one() {
161 assert!((jaccard_similarity("", "") - 1.0).abs() < f64::EPSILON);
162 }
163
164 #[test]
165 fn one_empty_scores_zero() {
166 assert!(jaccard_similarity("hello", "").abs() < f64::EPSILON);
167 assert!(jaccard_similarity("", "hello").abs() < f64::EPSILON);
168 }
169
170 #[test]
171 fn unicode_strings_do_not_panic() {
172 // Multi-byte UTF-8: 1 char each, very short.
173 let a = "ç日本語";
174 let b = "ç中文";
175 let _ = jaccard_similarity(a, b);
176 }
177
178 #[test]
179 fn verdict_preserved_when_above_threshold() {
180 let v = PreservationVerdict::evaluate("hello world", "hello world!", 0.5);
181 assert!(v.is_accepted());
182 assert!(matches!(v, PreservationVerdict::Preserved { .. }));
183 }
184
185 #[test]
186 fn verdict_unchanged_for_identical() {
187 let v = PreservationVerdict::evaluate("same", "same", 0.9);
188 assert!(v.is_accepted());
189 assert!(matches!(v, PreservationVerdict::Unchanged { byte_len: 4 }));
190 }
191
192 #[test]
193 fn threshold_clamped_out_of_range() {
194 // Threshold above 1.0 is clamped to 1.0: identical bodies match
195 // by the `Unchanged` short-circuit, accepted.
196 let v = PreservationVerdict::evaluate("abc", "abc", 99.0);
197 assert!(v.is_accepted());
198 // Threshold below 0.0 is clamped to 0.0: every non-empty rewrite
199 // meets a 0.0 floor and is accepted. This is the documented
200 // behaviour of `clamp(0.0, 1.0)` and is the only sane reading
201 // once a negative threshold is no longer in scope.
202 let v = PreservationVerdict::evaluate("abc", "xyz", -5.0);
203 assert!(v.is_accepted());
204 // Threshold of exactly 0.0 accepts only identical bodies; even
205 // a single-character drift fails the gate.
206 let v = PreservationVerdict::evaluate("abc", "abcd", 0.0);
207 assert!(
208 v.is_accepted(),
209 "single-char append is mostly the same body"
210 );
211 }
212
213 #[test]
214 fn g29_repro_evaluates_rejected_when_diverges() {
215 // G29 reproducer: LLM rewrites a body and drifts far from source.
216 let original = "JWT token rotation strategy with 15-min expiry and refresh flow";
217 let drifted = "The weather in Tokyo is sunny today with mild temperatures expected";
218 let v = PreservationVerdict::evaluate(original, drifted, 0.7);
219 assert!(!v.is_accepted(), "should reject hallucinated rewrite");
220 }
221}