1use std::sync::OnceLock;
2
3use regex::Regex;
4
5use crate::detector::BUILTIN_CLASS_NAMES;
6
/// Describes a recognizer pattern that matches one of the sample token shapes.
///
/// Produced by `reject_if_shadows_token_shape` when the supplied regex matches
/// an entry of `sample_token_shapes`.
#[non_exhaustive]
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct TokenShapeShadow {
    /// Identifier of the recognizer whose pattern was rejected.
    pub recognizer_id: String,
    /// Source text of the regex that matched a sample token shape.
    pub offending_pattern: String,
    /// The sample token shape that the regex matched.
    pub shadowed_shape: String,
}
14
15pub fn pattern() -> &'static Regex {
16 static PATTERN: OnceLock<Regex> = OnceLock::new();
17 PATTERN.get_or_init(|| Regex::new(&build_pattern()).expect("token shape regex must compile"))
18}
19
20pub fn contains_token(s: &str) -> bool {
21 pattern().is_match(s)
22}
23
24pub fn find_token(s: &str) -> Option<&str> {
25 pattern().find(s).map(|m| m.as_str())
26}
27
28pub fn find_tokens(s: &str) -> impl Iterator<Item = &str> {
29 pattern().find_iter(s).map(|m| m.as_str())
30}
31
/// Returns a fixed list of example strings, one or more per token shape.
///
/// `reject_if_shadows_token_shape` probes candidate regexes against these
/// samples.
pub fn sample_token_shapes() -> &'static [&'static str] {
    const SAMPLES: &[&str] = &[
        // Forms carrying the `deadbeef` hex prefix.
        "<deadbeef:Email_1>",
        "<deadbeef:Name_1>",
        "<deadbeef:Location_1>",
        "<deadbeef:Organization_1>",
        "<deadbeef:Custom:class_alpha_1>",
        "email1.deadbeef@gaze-fake.invalid",
        "deadbeef:email_1",
        "deadbeef:custom:class_alpha_1",
        // Forms without a hex prefix.
        "<Email_1>",
        "<email_1>",
        "<Custom:class_alpha_1>",
        "<custom:class_alpha_1>",
        "Email_1",
        "email_1",
        "custom:class_alpha_1",
        "email1@example.test",
        "email1@gaze-fake.invalid",
    ];

    SAMPLES
}
53
54pub fn reject_if_shadows_token_shape(
55 compiled: &Regex,
56 recognizer_id: &str,
57) -> Result<(), TokenShapeShadow> {
58 for sample in sample_token_shapes() {
59 if compiled.is_match(sample) {
60 return Err(TokenShapeShadow {
61 recognizer_id: recognizer_id.to_string(),
62 offending_pattern: compiled.as_str().to_string(),
63 shadowed_shape: (*sample).to_string(),
64 });
65 }
66 }
67
68 Ok(())
69}
70
/// Reports whether `s` begins with one of the three prefixed layouts.
///
/// The accepted openings, where `h` is a lowercase hex digit (`0-9`, `a-f`):
///
/// * `<hhhhhhhh:`: angle-bracket form,
/// * `email<digits>.hhhhhhhh@`: e-mail form with at least one digit,
/// * `hhhhhhhh:`: bare form.
///
/// Only the opening bytes are inspected; whatever follows is not validated.
pub fn starts_with_session_prefix(s: &str) -> bool {
    // Number of hex digits in the prefix.
    const HEX_LEN: usize = 8;

    // True when `input` opens with `HEX_LEN` lowercase hex digits that are
    // immediately followed by `terminator`.
    fn hex_then(input: &[u8], terminator: u8) -> bool {
        input.len() > HEX_LEN
            && input[HEX_LEN] == terminator
            && input[..HEX_LEN]
                .iter()
                .all(|&b| matches!(b, b'0'..=b'9' | b'a'..=b'f'))
    }

    let raw = s.as_bytes();

    // Angle-bracket form: `<` + hex + `:`.
    if let Some(inner) = raw.strip_prefix(b"<") {
        if hex_then(inner, b':') {
            return true;
        }
    }

    // E-mail form: `email` + one or more digits + `.` + hex + `@`.
    if let Some(after_tag) = raw.strip_prefix(b"email") {
        let digit_count = after_tag
            .iter()
            .take_while(|b| b.is_ascii_digit())
            .count();
        if digit_count > 0 {
            if let Some(after_dot) = after_tag[digit_count..].strip_prefix(b".") {
                if hex_then(after_dot, b'@') {
                    return true;
                }
            }
        }
    }

    // Bare form: hex + `:`.
    hex_then(raw, b':')
}
102
103pub fn is_trap(match_text: &str) -> bool {
104 !starts_with_session_prefix(match_text)
105}
106
107fn build_pattern() -> String {
108 let builtin_alt = BUILTIN_CLASS_NAMES.join("|");
109 let builtin_lower_alt = BUILTIN_CLASS_NAMES
110 .iter()
111 .map(|name| name.to_ascii_lowercase())
112 .collect::<Vec<_>>()
113 .join("|");
114
115 format!(
116 r"<[0-9a-f]{{8}}:(?:{builtin_alt})_\d+>|<[0-9a-f]{{8}}:Custom:[a-z0-9_]*_\d+>|\bemail\d+\.[0-9a-f]{{8}}@gaze-fake\.invalid\b|\b[0-9a-f]{{8}}:(?:{builtin_lower_alt})_\d+\b|\b[0-9a-f]{{8}}:custom:[a-z0-9_]*_\d+\b|<(?:{builtin_alt})_\d+>|<Custom:[a-z0-9_]*_\d+>|\b(?:{builtin_lower_alt})_\d+\b|\bcustom:[a-z0-9_]*_\d+\b|\bemail\d+@example\.test\b|\bemail\d+@gaze-fake\.invalid\b|<[A-Z][a-zA-Z0-9]+_\d+>|<[a-z][a-zA-Z0-9_]*_\d+>|\b[A-Z][a-zA-Z0-9]+_\d+\b|\b[a-z][a-zA-Z0-9_]*_\d+\b",
117 builtin_alt = builtin_alt,
118 builtin_lower_alt = builtin_lower_alt,
119 )
120}
121
// Unit tests for the token-shape regex and the session-prefix scanner.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::detector::PiiClass;
    use crate::session::{Scope, Session};

    /// Fixed raw value fed to the session for each class.
    fn raw_for(class: &PiiClass) -> &'static str {
        match class {
            PiiClass::Email => "alice@example.com",
            PiiClass::Name => "Alice Smith",
            PiiClass::Location => "Dublin",
            PiiClass::Organization => "Acme Inc",
            PiiClass::Custom(_) => "42",
        }
    }

    /// Tokenizes the class's raw value in a fresh ephemeral session.
    fn tokenized_for(class: PiiClass) -> String {
        let session = Session::new(Scope::Ephemeral).expect("session");
        session
            .tokenize(&class, raw_for(&class))
            .expect("tokenized placeholder")
    }

    /// Produces the format-preserving fake for the class's raw value in a
    /// fresh ephemeral session.
    fn format_preserving_for(class: PiiClass) -> String {
        let session = Session::new(Scope::Ephemeral).expect("session");
        session
            .format_preserving_fake(&class, raw_for(&class))
            .expect("format-preserving placeholder")
    }

    #[test]
    fn pattern_is_stable_across_calls() {
        // Both calls must return the very same `Regex` instance.
        assert!(std::ptr::eq(pattern(), pattern()));
    }

    #[test]
    fn every_emitted_token_matches_shape_regex() {
        for class in PiiClass::builtin_variants()
            .iter()
            .cloned()
            .chain(std::iter::once(PiiClass::custom("class_alpha")))
        {
            assert!(contains_token(&tokenized_for(class.clone())));
            assert!(contains_token(&format_preserving_for(class)));
        }
    }

    #[test]
    fn every_sample_matches_pattern_regex() {
        for sample in sample_token_shapes() {
            assert!(
                contains_token(sample),
                "sample should match token regex: {sample}"
            );
        }
    }

    #[test]
    fn builtin_class_names_match_impl() {
        for (class, expected) in PiiClass::builtin_variants()
            .iter()
            .zip(BUILTIN_CLASS_NAMES.iter())
        {
            assert_eq!(class.class_name(), *expected);
        }
    }

    #[test]
    fn builtin_class_regex_superset() {
        for class in PiiClass::builtin_variants() {
            assert!(contains_token(&format!(
                "<a7f3b8e2:{}_1>",
                class.class_name()
            )));
            assert!(contains_token(&format!(
                "a7f3b8e2:{}_1",
                class.class_name().to_ascii_lowercase()
            )));
            assert!(contains_token(&format!("<{}_1>", class.class_name())));
        }
    }

    #[test]
    fn custom_and_builtin_do_not_collide() {
        let builtin = tokenized_for(PiiClass::Email);
        let custom = tokenized_for(PiiClass::custom("email"));

        assert!(builtin.ends_with(":Email_1>"));
        assert!(custom.ends_with(":Custom:email_1>"));
        assert_ne!(builtin, custom);
        assert!(contains_token(&builtin));
        assert!(contains_token(&custom));
    }

    #[test]
    fn empty_normalized_name_matches_current_shape() {
        let token = tokenized_for(PiiClass::custom("!!!"));
        assert!(token.ends_with(":Custom:_1>"));
        assert!(contains_token(&token));
    }

    #[test]
    fn single_char_custom_name_matches_current_shape() {
        let token = tokenized_for(PiiClass::custom("x"));
        assert!(token.ends_with(":Custom:x_1>"));
        assert!(contains_token(&token));
    }

    #[test]
    fn custom_token_matches_as_single_span() {
        // Token text is assembled with `join` rather than written as a literal.
        let haystack = format!("before <Custom:{}> after", ["class_alpha", "1"].join("_"));
        let matched = pattern().find(&haystack).expect("custom token match");
        assert_eq!(
            matched.as_str(),
            format!("<Custom:{}>", ["class_alpha", "1"].join("_"))
        );
    }

    #[test]
    fn contains_bare_shapes_in_prose() {
        assert!(contains_token(&format!(
            "See <{}>.",
            ["Email", "1"].join("_")
        )));
        assert!(contains_token(&format!(
            "See <Custom:{}>.",
            ["class_alpha", "1"].join("_")
        )));
        assert!(contains_token("Reply to name_1."));
        assert!(contains_token("Email email1@example.test later."));
    }

    #[test]
    fn legacy_shape_parity_traps_all_known_v03_forms() {
        for shape in [
            format!("<{}>", ["Email", "1"].join("_")),
            format!("<Custom:{}>", ["class_alpha", "1"].join("_")),
            format!("<{}>", ["Foo", "5"].join("_")),
            format!("<{}>", ["foo", "1"].join("_")),
            ["Email", "7"].join("_"),
            ["location", "7"].join("_"),
            ["name", "1"].join("_"),
            ["organization", "1"].join("_"),
            ["email", "1"].join("_"),
            format!("custom:{}", ["class_alpha", "1"].join("_")),
            "email3@example.test".to_string(),
            "email3@gaze-fake.invalid".to_string(),
        ] {
            assert!(contains_token(&shape), "shape should be trapped: {shape}");
        }
    }

    #[test]
    fn session_prefix_scan_classifies_prefixed_forms() {
        assert!(starts_with_session_prefix("<a7f3b8e2:Email_1>"));
        assert!(starts_with_session_prefix(
            "email1.a7f3b8e2@gaze-fake.invalid"
        ));
        assert!(starts_with_session_prefix("a7f3b8e2:name_1"));
        assert!(!is_trap("<a7f3b8e2:Email_1>"));
    }

    #[test]
    fn session_prefix_scan_rejects_trap_forms() {
        assert!(is_trap(&format!("<{}>", ["Email", "1"].join("_"))));
        assert!(is_trap("email1@example.test"));
        assert!(is_trap("email1@gaze-fake.invalid"));
        // Uppercase hex is not accepted as a session prefix.
        assert!(is_trap("<A7F3B8E2:Email_1>"));
    }

    #[test]
    fn rejects_non_tokens() {
        assert!(!contains_token("See <Email_1bar>."));
        assert!(!contains_token("literal email@example.com address"));
        assert!(!contains_token("<Custom:-_1>"));
    }

    #[test]
    fn wrapped_tokens_match_across_text_contexts() {
        assert!(contains_token("See <Email_1>."));
        assert!(contains_token("Plain <Email_1> token"));
        assert!(contains_token("<<Email_1>>"));
    }

    #[test]
    fn restore_wrapped_token_in_prose() {
        let session = Session::new(Scope::Ephemeral).expect("session");
        let first = session
            .tokenize(&PiiClass::Email, "alice@example.com")
            .expect("first token");
        let second = session
            .tokenize(&PiiClass::Email, "bob@example.com")
            .expect("second token");

        let rendered = format!("See {first}. Reply {second}");
        // Replace every matched token with the value the session restores for it.
        let restored = pattern().replace_all(&rendered, |captures: &regex::Captures<'_>| {
            session.restore_strict(&captures[0]).expect("known token")
        });

        assert_eq!(restored, "See alice@example.com. Reply bob@example.com");
    }
}