// gaze/token_shape.rs

1use std::sync::OnceLock;
2
3use regex::Regex;
4
5use crate::detector::BUILTIN_CLASS_NAMES;
6
/// Error describing a recognizer regex that matches one of the sample token shapes.
///
/// Returned by [`reject_if_shadows_token_shape`] for the first sample from
/// [`sample_token_shapes`] that the recognizer's compiled pattern matches.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct TokenShapeShadow {
    /// Identifier of the recognizer whose pattern was rejected.
    pub recognizer_id: String,
    /// Source text of the regex that matched a sample token shape.
    pub offending_pattern: String,
    /// The sample token shape that the pattern matched.
    pub shadowed_shape: String,
}

impl std::fmt::Display for TokenShapeShadow {
    /// Renders all three fields so the message is actionable on its own.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "recognizer `{}` pattern `{}` shadows token shape `{}`",
            self.recognizer_id, self.offending_pattern, self.shadowed_shape
        )
    }
}

// Lets callers propagate the rejection with `?` into `Box<dyn Error>` and
// similar error containers.
impl std::error::Error for TokenShapeShadow {}
14
15pub fn pattern() -> &'static Regex {
16    static PATTERN: OnceLock<Regex> = OnceLock::new();
17    PATTERN.get_or_init(|| Regex::new(&build_pattern()).expect("token shape regex must compile"))
18}
19
20pub fn contains_token(s: &str) -> bool {
21    pattern().is_match(s)
22}
23
24pub fn find_token(s: &str) -> Option<&str> {
25    pattern().find(s).map(|m| m.as_str())
26}
27
28pub fn find_tokens(s: &str) -> impl Iterator<Item = &str> {
29    pattern().find_iter(s).map(|m| m.as_str())
30}
31
/// Returns the fixed list of example token strings.
///
/// `reject_if_shadows_token_shape` probes recognizer patterns against these
/// samples, in this order.
pub fn sample_token_shapes() -> &'static [&'static str] {
    const SAMPLES: &[&str] = &[
        // Angle-bracketed forms carrying an eight-hex-digit prefix.
        "<deadbeef:Email_1>",
        "<deadbeef:Name_1>",
        "<deadbeef:Location_1>",
        "<deadbeef:Organization_1>",
        "<deadbeef:Custom:class_alpha_1>",
        // Email-shaped form carrying the hex segment.
        "email1.deadbeef@gaze-fake.invalid",
        // Bare lowercase forms carrying the hex prefix.
        "deadbeef:email_1",
        "deadbeef:custom:class_alpha_1",
        // Angle-bracketed forms without a hex prefix.
        "<Email_1>",
        "<email_1>",
        "<Custom:class_alpha_1>",
        "<custom:class_alpha_1>",
        // Bare forms without a hex prefix.
        "Email_1",
        "email_1",
        "custom:class_alpha_1",
        // Email-shaped forms without a hex segment.
        "email1@example.test",
        "email1@gaze-fake.invalid",
    ];

    SAMPLES
}
53
54pub fn reject_if_shadows_token_shape(
55    compiled: &Regex,
56    recognizer_id: &str,
57) -> Result<(), TokenShapeShadow> {
58    for sample in sample_token_shapes() {
59        if compiled.is_match(sample) {
60            return Err(TokenShapeShadow {
61                recognizer_id: recognizer_id.to_string(),
62                offending_pattern: compiled.as_str().to_string(),
63                shadowed_shape: (*sample).to_string(),
64            });
65        }
66    }
67
68    Ok(())
69}
70
/// Reports whether `s` begins with one of three hex-prefixed layouts.
///
/// The check is byte-based and accepts, at the very start of `s`:
///
/// * `<` + eight lowercase hex digits + `:`
/// * `email` + one or more ASCII digits + `.` + eight lowercase hex digits + `@`
/// * eight lowercase hex digits + `:`
///
/// Uppercase hex digits are not accepted. Nothing after the prefix is inspected.
pub fn starts_with_session_prefix(s: &str) -> bool {
    // True when `window` opens with eight lowercase hex bytes immediately
    // followed by `delimiter`.
    fn hex8_then(window: &[u8], delimiter: u8) -> bool {
        match window.get(..9) {
            Some([hex @ .., last]) => {
                *last == delimiter
                    && hex
                        .iter()
                        .all(|b| matches!(*b, b'0'..=b'9' | b'a'..=b'f'))
            }
            _ => false,
        }
    }

    let raw = s.as_bytes();

    // Layout 1: `<hhhhhhhh:`
    if let Some((&b'<', rest)) = raw.split_first() {
        if hex8_then(rest, b':') {
            return true;
        }
    }

    // Layout 2: `email<digits>.hhhhhhhh@`
    if let Some(tail) = raw.strip_prefix(b"email") {
        let digit_count = tail.iter().take_while(|b| b.is_ascii_digit()).count();
        if digit_count > 0 {
            if let Some((&b'.', rest)) = tail[digit_count..].split_first() {
                if hex8_then(rest, b'@') {
                    return true;
                }
            }
        }
    }

    // Layout 3: `hhhhhhhh:`
    hex8_then(raw, b':')
}
102
103pub fn is_trap(match_text: &str) -> bool {
104    !starts_with_session_prefix(match_text)
105}
106
107fn build_pattern() -> String {
108    let builtin_alt = BUILTIN_CLASS_NAMES.join("|");
109    let builtin_lower_alt = BUILTIN_CLASS_NAMES
110        .iter()
111        .map(|name| name.to_ascii_lowercase())
112        .collect::<Vec<_>>()
113        .join("|");
114
115    format!(
116        r"<[0-9a-f]{{8}}:(?:{builtin_alt})_\d+>|<[0-9a-f]{{8}}:Custom:[a-z0-9_]*_\d+>|\bemail\d+\.[0-9a-f]{{8}}@gaze-fake\.invalid\b|\b[0-9a-f]{{8}}:(?:{builtin_lower_alt})_\d+\b|\b[0-9a-f]{{8}}:custom:[a-z0-9_]*_\d+\b|<(?:{builtin_alt})_\d+>|<Custom:[a-z0-9_]*_\d+>|\b(?:{builtin_lower_alt})_\d+\b|\bcustom:[a-z0-9_]*_\d+\b|\bemail\d+@example\.test\b|\bemail\d+@gaze-fake\.invalid\b|<[A-Z][a-zA-Z0-9]+_\d+>|<[a-z][a-zA-Z0-9_]*_\d+>|\b[A-Z][a-zA-Z0-9]+_\d+\b|\b[a-z][a-zA-Z0-9_]*_\d+\b",
117        builtin_alt = builtin_alt,
118        builtin_lower_alt = builtin_lower_alt,
119    )
120}
121
#[cfg(test)]
mod tests {
    use super::*;
    use crate::detector::PiiClass;
    use crate::session::{Scope, Session};

    /// Fixed raw input used for each class when generating placeholders.
    fn raw_for(class: &PiiClass) -> &'static str {
        match class {
            PiiClass::Email => "alice@example.com",
            PiiClass::Name => "Alice Smith",
            PiiClass::Location => "Dublin",
            PiiClass::Organization => "Acme Inc",
            PiiClass::Custom(_) => "42",
        }
    }

    /// Tokenizes the fixture value for `class` in a fresh ephemeral session.
    fn tokenized_for(class: PiiClass) -> String {
        let session = Session::new(Scope::Ephemeral).expect("session");
        session
            .tokenize(&class, raw_for(&class))
            .expect("tokenized placeholder")
    }

    /// Produces the format-preserving fake for `class` in a fresh ephemeral session.
    fn format_preserving_for(class: PiiClass) -> String {
        let session = Session::new(Scope::Ephemeral).expect("session");
        session
            .format_preserving_fake(&class, raw_for(&class))
            .expect("format-preserving placeholder")
    }

    /// `pattern()` must hand out the same compiled regex on every call.
    #[test]
    fn pattern_is_stable_across_calls() {
        assert!(std::ptr::eq(pattern(), pattern()));
    }

    /// Every placeholder the session emits must be recognized by the regex.
    #[test]
    fn every_emitted_token_matches_shape_regex() {
        for class in PiiClass::builtin_variants()
            .iter()
            .cloned()
            .chain(std::iter::once(PiiClass::custom("class_alpha")))
        {
            assert!(contains_token(&tokenized_for(class.clone())));
            assert!(contains_token(&format_preserving_for(class)));
        }
    }

    /// Every entry in `sample_token_shapes()` must itself match the regex.
    #[test]
    fn every_sample_matches_pattern_regex() {
        for sample in sample_token_shapes() {
            assert!(
                contains_token(sample),
                "sample should match token regex: {sample}"
            );
        }
    }

    /// `BUILTIN_CLASS_NAMES` must list exactly the built-in variants' names, in order.
    #[test]
    fn builtin_class_names_match_impl() {
        // `zip` stops at the shorter side, so pin the lengths first; otherwise a
        // variant or name missing from either list would go unnoticed.
        assert_eq!(
            PiiClass::builtin_variants().iter().count(),
            BUILTIN_CLASS_NAMES.iter().count(),
            "builtin variants and BUILTIN_CLASS_NAMES must have the same length"
        );

        for (class, expected) in PiiClass::builtin_variants()
            .iter()
            .zip(BUILTIN_CLASS_NAMES.iter())
        {
            assert_eq!(class.class_name(), *expected);
        }
    }

    /// Each built-in class name is matched in its prefixed, lowercase-prefixed,
    /// and unprefixed bracketed forms.
    #[test]
    fn builtin_class_regex_superset() {
        for class in PiiClass::builtin_variants() {
            assert!(contains_token(&format!(
                "<a7f3b8e2:{}_1>",
                class.class_name()
            )));
            assert!(contains_token(&format!(
                "a7f3b8e2:{}_1",
                class.class_name().to_ascii_lowercase()
            )));
            assert!(contains_token(&format!("<{}_1>", class.class_name())));
        }
    }

    /// A custom class named like a built-in must still yield a distinct token.
    #[test]
    fn custom_and_builtin_do_not_collide() {
        let builtin = tokenized_for(PiiClass::Email);
        let custom = tokenized_for(PiiClass::custom("email"));

        assert!(builtin.ends_with(":Email_1>"));
        assert!(custom.ends_with(":Custom:email_1>"));
        assert_ne!(builtin, custom);
        assert!(contains_token(&builtin));
        assert!(contains_token(&custom));
    }

    /// A custom name that yields an empty name segment in the token still
    /// matches the regex.
    #[test]
    fn empty_normalized_name_matches_current_shape() {
        let token = tokenized_for(PiiClass::custom("!!!"));
        assert!(token.ends_with(":Custom:_1>"));
        assert!(contains_token(&token));
    }

    /// A one-character custom name produces a token the regex accepts.
    #[test]
    fn single_char_custom_name_matches_current_shape() {
        let token = tokenized_for(PiiClass::custom("x"));
        assert!(token.ends_with(":Custom:x_1>"));
        assert!(contains_token(&token));
    }

    /// The bracketed custom form is matched whole, not as a shorter inner span.
    #[test]
    fn custom_token_matches_as_single_span() {
        let haystack = format!("before <Custom:{}> after", ["class_alpha", "1"].join("_"));
        let matched = pattern().find(&haystack).expect("custom token match");
        assert_eq!(
            matched.as_str(),
            format!("<Custom:{}>", ["class_alpha", "1"].join("_"))
        );
    }

    /// Unprefixed shapes are detected when embedded in ordinary prose.
    #[test]
    fn contains_bare_shapes_in_prose() {
        assert!(contains_token(&format!(
            "See <{}>.",
            ["Email", "1"].join("_")
        )));
        assert!(contains_token(&format!(
            "See <Custom:{}>.",
            ["class_alpha", "1"].join("_")
        )));
        assert!(contains_token("Reply to name_1."));
        assert!(contains_token("Email email1@example.test later."));
    }

    /// Every known unprefixed ("legacy") shape must be caught by the regex.
    #[test]
    fn legacy_shape_parity_traps_all_known_v03_forms() {
        for shape in [
            format!("<{}>", ["Email", "1"].join("_")),
            format!("<Custom:{}>", ["class_alpha", "1"].join("_")),
            format!("<{}>", ["Foo", "5"].join("_")),
            format!("<{}>", ["foo", "1"].join("_")),
            ["Email", "7"].join("_"),
            ["location", "7"].join("_"),
            ["name", "1"].join("_"),
            ["organization", "1"].join("_"),
            ["email", "1"].join("_"),
            format!("custom:{}", ["class_alpha", "1"].join("_")),
            "email3@example.test".to_string(),
            "email3@gaze-fake.invalid".to_string(),
        ] {
            assert!(contains_token(&shape), "shape should be trapped: {shape}");
        }
    }

    /// Hex-prefixed forms are recognized as session-prefixed and are not traps.
    #[test]
    fn session_prefix_scan_classifies_prefixed_forms() {
        assert!(starts_with_session_prefix("<a7f3b8e2:Email_1>"));
        assert!(starts_with_session_prefix(
            "email1.a7f3b8e2@gaze-fake.invalid"
        ));
        assert!(starts_with_session_prefix("a7f3b8e2:name_1"));
        assert!(!is_trap("<a7f3b8e2:Email_1>"));
    }

    /// Forms without a lowercase-hex prefix are classified as traps.
    #[test]
    fn session_prefix_scan_rejects_trap_forms() {
        assert!(is_trap(&format!("<{}>", ["Email", "1"].join("_"))));
        assert!(is_trap("email1@example.test"));
        assert!(is_trap("email1@gaze-fake.invalid"));
        assert!(is_trap("<A7F3B8E2:Email_1>"));
    }

    /// Near-miss strings must not be reported as tokens.
    #[test]
    fn rejects_non_tokens() {
        assert!(!contains_token("See <Email_1bar>."));
        assert!(!contains_token("literal email@example.com address"));
        assert!(!contains_token("<Custom:-_1>"));
    }

    /// Bracketed tokens are found regardless of surrounding punctuation.
    #[test]
    fn wrapped_tokens_match_across_text_contexts() {
        assert!(contains_token("See <Email_1>."));
        assert!(contains_token("Plain <Email_1> token"));
        assert!(contains_token("<<Email_1>>"));
    }

    /// Round trip: tokens embedded in prose are located by the regex and
    /// restored to their original values by the issuing session.
    #[test]
    fn restore_wrapped_token_in_prose() {
        let session = Session::new(Scope::Ephemeral).expect("session");
        let first = session
            .tokenize(&PiiClass::Email, "alice@example.com")
            .expect("first token");
        let second = session
            .tokenize(&PiiClass::Email, "bob@example.com")
            .expect("second token");

        let rendered = format!("See {first}. Reply {second}");
        let restored = pattern().replace_all(&rendered, |captures: &regex::Captures<'_>| {
            session.restore_strict(&captures[0]).expect("known token")
        });

        assert_eq!(restored, "See alice@example.com. Reply bob@example.com");
    }
}
323}