1use anyhow::{Context, Result};
14use hashbrown::HashMap;
15use once_cell::sync::Lazy;
16use regex::Regex;
17use serde::{Deserialize, Serialize};
18use std::sync::{Arc, Mutex};
19use tracing::debug;
20
21#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
23pub enum PiiType {
24 Email,
25 PhoneNumber,
26 SocialSecurityNumber,
27 CreditCard,
28 IpAddress,
29 ApiKey,
30 AuthToken,
31 Url,
32 Custom,
33}
34
35impl PiiType {
36 pub fn as_str(&self) -> &'static str {
37 match self {
38 Self::Email => "email",
39 Self::PhoneNumber => "phone_number",
40 Self::SocialSecurityNumber => "ssn",
41 Self::CreditCard => "credit_card",
42 Self::IpAddress => "ip_address",
43 Self::ApiKey => "api_key",
44 Self::AuthToken => "auth_token",
45 Self::Url => "url",
46 Self::Custom => "custom",
47 }
48 }
49}
50
51static DEFAULT_PII_PATTERNS: Lazy<Result<Vec<(PiiType, Regex)>, String>> = Lazy::new(|| {
53 let patterns = vec![
54 (
55 PiiType::Email,
56 r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
57 ),
58 (
59 PiiType::PhoneNumber,
60 r"(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}",
61 ),
62 (PiiType::SocialSecurityNumber, r"[0-9]{3}-[0-9]{2}-[0-9]{4}"),
63 (
64 PiiType::CreditCard,
65 r"[0-9]{4}[\s-]?[0-9]{4}[\s-]?[0-9]{4}[\s-]?[0-9]{4}",
66 ),
67 (
68 PiiType::IpAddress,
69 r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",
70 ),
71 (
72 PiiType::ApiKey,
73 r#"(?:api[_-]?key|apikey|API[_-]?KEY)\s*[:=]\s*['"]?[a-zA-Z0-9_-]{32,}['"]?"#,
74 ),
75 (
76 PiiType::AuthToken,
77 r"(?:bearer|token|authorization)\s+[a-zA-Z0-9._-]+",
78 ),
79 ];
80
81 let mut compiled = Vec::with_capacity(patterns.len());
82 for (pii_type, pattern) in patterns {
83 match Regex::new(pattern) {
84 Ok(regex) => compiled.push((pii_type, regex)),
85 Err(e) => {
86 return Err(format!(
87 "Failed to compile PII regex for {:?}: {}",
88 pii_type, e
89 ));
90 }
91 }
92 }
93 Ok(compiled)
94});
95
96#[derive(Debug, Clone, Serialize, Deserialize)]
98pub struct DetectedPii {
99 pub value: String,
100 pub pii_type: PiiType,
101 pub start: usize,
102 pub end: usize,
103 pub context: String,
104}
105
106#[derive(Debug, Clone, Serialize, Deserialize)]
108pub struct PiiToken {
109 pub token: String,
110 pub original_value: String,
111 pub pii_type: PiiType,
112 pub created_at: String,
113}
114
115#[derive(Clone)]
117pub struct PiiTokenizer {
118 patterns: HashMap<PiiType, Regex>,
119 inner: Arc<Mutex<PiiTokenizerInner>>,
120}
121
122struct PiiTokenizerInner {
124 token_store: HashMap<String, PiiToken>,
125}
126
127impl PiiTokenizer {
128 pub fn new() -> Result<Self> {
130 let patterns = DEFAULT_PII_PATTERNS
133 .as_ref()
134 .map_err(|e| anyhow::anyhow!("PII pattern initialization failed: {}", e))?
135 .iter()
136 .map(|(pii_type, regex)| (*pii_type, regex.clone()))
137 .collect();
138
139 Ok(Self {
140 patterns,
141 inner: Arc::new(Mutex::new(PiiTokenizerInner {
142 token_store: HashMap::new(),
143 })),
144 })
145 }
146
147 pub fn detect_pii(&self, text: &str) -> Result<Vec<DetectedPii>> {
149 let mut detected = Vec::with_capacity(8);
150
151 for (pii_type, pattern) in &self.patterns {
152 for mat in pattern.find_iter(text) {
153 let value = text[mat.start()..mat.end()].to_string();
155 let context_start = mat.start().saturating_sub(20);
156 let context_end = (mat.end() + 20).min(text.len());
157 let context = text[context_start..context_end]
158 .replace('\n', "\\n")
159 .replace('\r', "\\r");
160
161 debug!(
162 pii_type = pii_type.as_str(),
163 context = %context,
164 "Detected PII in text"
165 );
166
167 detected.push(DetectedPii {
168 value,
169 pii_type: *pii_type,
170 start: mat.start(),
171 end: mat.end(),
172 context,
173 });
174 }
175 }
176
177 Ok(detected)
178 }
179
180 pub fn tokenize_string(&self, text: &str) -> Result<(String, HashMap<String, PiiToken>)> {
182 let mut detected = self.detect_pii(text)?;
183
184 if detected.is_empty() {
185 return Ok((text.to_string(), HashMap::new()));
186 }
187
188 detected.sort_by_key(|d| d.start);
190
191 let mut result = text.to_string();
192 let mut new_tokens = HashMap::new();
193
194 for detection in detected.iter().rev() {
196 let token = self.generate_token(&detection.value, detection.pii_type)?;
197 let token_str = &token.token;
198 result.replace_range(detection.start..detection.end, token_str);
199 new_tokens.insert(token_str.clone(), token);
200 }
201
202 {
204 let mut inner = self
205 .inner
206 .lock()
207 .map_err(|e| anyhow::anyhow!("Failed to acquire token store lock: {}", e))?;
208 inner.token_store.extend(new_tokens.clone());
210 }
211
212 debug!(pii_count = detected.len(), "Tokenized PII in string");
213
214 Ok((result, new_tokens))
215 }
216
217 pub fn detokenize_string(&self, text: &str) -> Result<String> {
219 let inner = self
220 .inner
221 .lock()
222 .map_err(|e| anyhow::anyhow!("Failed to acquire token store lock: {}", e))?;
223 let mut result = text.to_string();
224
225 for (token, pii_token) in inner.token_store.iter() {
226 result = result.replace(token, &pii_token.original_value);
227 }
228
229 Ok(result)
230 }
231
232 pub fn clear_tokens(&self) {
234 let mut inner = match self.inner.lock() {
235 Ok(guard) => guard,
236 Err(poisoned) => poisoned.into_inner(),
237 };
238 inner.token_store.clear();
239 debug!("Cleared all PII tokens");
240 }
241
242 pub fn audit_trail(&self) -> Result<Vec<(String, PiiType, String)>> {
244 let inner = self
245 .inner
246 .lock()
247 .map_err(|e| anyhow::anyhow!("Failed to acquire token store lock: {}", e))?;
248 Ok(inner
249 .token_store
250 .values()
251 .map(|t| (t.token.clone(), t.pii_type, t.created_at.clone()))
252 .collect())
253 }
254
255 fn generate_token(&self, value: &str, pii_type: PiiType) -> Result<PiiToken> {
257 use std::collections::hash_map::DefaultHasher;
258 use std::hash::{Hash, Hasher};
259
260 let mut hasher = DefaultHasher::new();
261 value.hash(&mut hasher);
262 let hash = hasher.finish();
263
264 let token = format!("__PII_{}_{:x}__", pii_type.as_str(), hash);
265
266 Ok(PiiToken {
267 token,
268 original_value: value.to_string(),
269 pii_type,
270 created_at: chrono::Utc::now().to_rfc3339(),
271 })
272 }
273
274 pub fn register_pattern(&mut self, pii_type: PiiType, pattern: &str) -> Result<()> {
276 let regex = Regex::new(pattern).context("invalid regex pattern for PII detection")?;
277 self.patterns.insert(pii_type, regex);
278 debug!(
279 pii_type = pii_type.as_str(),
280 pattern = pattern,
281 "Registered custom PII pattern"
282 );
283 Ok(())
284 }
285}
286
287impl Default for PiiTokenizer {
288 fn default() -> Self {
289 Self::new().unwrap_or_else(|_| Self {
290 patterns: Default::default(),
291 inner: Arc::new(Mutex::new(PiiTokenizerInner {
292 token_store: HashMap::new(),
293 })),
294 })
295 }
296}
297
#[cfg(test)]
mod tests {
    use super::*;
    use anyhow::Result;

    /// Runs detection on `text` with a freshly constructed tokenizer.
    fn detect(text: &str) -> Result<Vec<DetectedPii>> {
        PiiTokenizer::new()?.detect_pii(text)
    }

    /// An e-mail address embedded in a sentence is reported as `Email`.
    #[test]
    fn test_detect_email() -> Result<()> {
        let hits = detect("Contact me at john@example.com for more info")?;

        assert!(!hits.is_empty());
        assert!(hits.iter().any(|hit| hit.pii_type == PiiType::Email));
        Ok(())
    }

    /// A dash-separated phone number is reported as `PhoneNumber`.
    #[test]
    fn test_detect_phone() -> Result<()> {
        let hits = detect("Call me at 555-123-4567")?;

        assert!(!hits.is_empty());
        assert!(hits.iter().any(|hit| hit.pii_type == PiiType::PhoneNumber));
        Ok(())
    }

    /// Tokenizing removes the raw e-mail address and yields issued tokens.
    #[test]
    fn test_tokenize_string() -> Result<()> {
        let input = "Email: john@example.com, Phone: 555-123-4567";
        let (redacted, issued) = PiiTokenizer::new()?.tokenize_string(input)?;

        assert!(redacted.contains("__PII_"));
        assert!(!redacted.contains("john@example.com"));
        assert!(!issued.is_empty());
        Ok(())
    }

    /// Ordinary prose produces no detections.
    #[test]
    fn test_no_pii_detected() -> Result<()> {
        let hits = detect("This is regular text with no sensitive information")?;

        assert!(hits.is_empty());
        Ok(())
    }
}