1use regex::Regex;
2use std::collections::HashMap;
3use thiserror::Error;
4
5#[derive(Debug, Clone)]
6pub struct Sanitizer {
7 patterns: HashMap<PiiType, Regex>,
8 enabled: bool,
9}
10
/// Category of sensitive data a pattern detects.
///
/// The type is hashable so it can serve as a map key for registered patterns
/// and per-category counters.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum PiiType {
    /// Social security number.
    Ssn,
    /// Payment card number.
    CreditCard,
    /// E-mail address.
    Email,
    /// Telephone number.
    Phone,
    /// API key or access token.
    ApiKey,
    /// IPv4 address.
    IpAddress,
    /// Caller-defined category, identified by its name.
    Custom(String),
}
21
22impl Sanitizer {
23 pub fn new() -> Self {
24 let mut patterns = HashMap::new();
25
26 patterns.insert(
27 PiiType::Ssn,
28 Regex::new(r"\b\d{3}-\d{2}-\d{4}\b|\b\d{3}\s\d{2}\s\d{4}\b|\d{9}").unwrap(),
29 );
30 patterns.insert(
31 PiiType::CreditCard,
32 Regex::new(r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b").unwrap(),
33 );
34 patterns.insert(
35 PiiType::Email,
36 Regex::new(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b").unwrap(),
37 );
38 patterns.insert(
39 PiiType::Phone,
40 Regex::new(r"(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}").unwrap(),
41 );
42 patterns.insert(
43 PiiType::ApiKey,
44 Regex::new(r"\b(sk-|bai_|api_|key_|AIza|AKIA|ya29\.|xox[bpoa]-)[A-Za-z0-9_-]{15,}\b")
45 .unwrap(),
46 );
47 patterns.insert(
48 PiiType::IpAddress,
49 Regex::new(r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b").unwrap(),
50 );
51
52 Self {
53 patterns,
54 enabled: true,
55 }
56 }
57
58 pub fn disabled() -> Self {
59 Self {
60 patterns: HashMap::new(),
61 enabled: false,
62 }
63 }
64
65 pub fn add_pattern(&mut self, name: &str, pattern: &str) -> Result<(), SanitizationError> {
67 let regex =
68 Regex::new(pattern).map_err(|e| SanitizationError::InvalidPattern(e.to_string()))?;
69 self.patterns
70 .insert(PiiType::Custom(name.to_string()), regex);
71 Ok(())
72 }
73
74 pub fn remove_pattern(&mut self, pii_type: &PiiType) -> bool {
76 self.patterns.remove(pii_type).is_some()
77 }
78
79 pub fn set_enabled(&mut self, enabled: bool) {
81 self.enabled = enabled;
82 }
83
84 pub fn sanitize(&self, text: &str) -> SanitizationResult {
86 if !self.enabled {
87 return SanitizationResult {
88 sanitized: text.to_string(),
89 redactions: Vec::new(),
90 };
91 }
92
93 let mut result = text.to_string();
94 let mut redactions = Vec::new();
95
96 let mut all_matches = Vec::new();
98
99 for (pii_type, regex) in &self.patterns {
100 for mat in regex.find_iter(text) {
101 all_matches.push((mat.start(), mat.end(), pii_type.clone()));
102 }
103 }
104
105 all_matches.sort_by_key(|(start, end, _)| (*start, std::cmp::Reverse(end - start)));
107
108 let mut non_overlapping_matches = Vec::new();
110 let mut last_end = 0;
111
112 for (start, end, pii_type) in all_matches {
113 if start >= last_end {
114 non_overlapping_matches.push((start, end, pii_type));
115 last_end = end;
116 } else if start < last_end {
117 if let Some(last_match) = non_overlapping_matches.last() {
119 if (end - start) > (last_match.1 - last_match.0) {
120 non_overlapping_matches.pop();
122 non_overlapping_matches.push((start, end, pii_type));
123 last_end = end;
124 }
125 }
126 }
127 }
128
129 for (start, end, pii_type) in non_overlapping_matches.into_iter().rev() {
131 let redaction_marker = self.get_redaction_marker(&pii_type);
132 let original_length = end - start;
133
134 result.replace_range(start..end, &redaction_marker);
135
136 redactions.push(Redaction {
137 pii_type: pii_type.clone(),
138 original_length,
139 start_position: start,
140 end_position: start + redaction_marker.len(), });
142 }
143
144 redactions.sort_by_key(|r| r.start_position);
146
147 SanitizationResult {
148 sanitized: result,
149 redactions,
150 }
151 }
152
153 pub fn sanitize_json(&self, value: &serde_json::Value) -> SanitizationJsonResult {
155 if !self.enabled {
156 return SanitizationJsonResult {
157 sanitized: value.clone(),
158 redactions: Vec::new(),
159 };
160 }
161
162 let mut redactions = Vec::new();
163 let sanitized = self.sanitize_json_recursive(value, &mut redactions, String::new());
164
165 SanitizationJsonResult {
166 sanitized,
167 redactions,
168 }
169 }
170
171 fn sanitize_json_recursive(
172 &self,
173 value: &serde_json::Value,
174 redactions: &mut Vec<JsonRedaction>,
175 path: String,
176 ) -> serde_json::Value {
177 match value {
178 serde_json::Value::String(s) => {
179 let result = self.sanitize(s);
180 if !result.redactions.is_empty() {
181 for redaction in result.redactions {
182 redactions.push(JsonRedaction {
183 path: path.clone(),
184 pii_type: redaction.pii_type,
185 original_length: redaction.original_length,
186 });
187 }
188 }
189 serde_json::Value::String(result.sanitized)
190 }
191 serde_json::Value::Object(obj) => {
192 let mut new_obj = serde_json::Map::new();
193 for (key, val) in obj {
194 let new_path = if path.is_empty() {
195 key.clone()
196 } else {
197 format!("{}.{}", path, key)
198 };
199 new_obj.insert(
200 key.clone(),
201 self.sanitize_json_recursive(val, redactions, new_path),
202 );
203 }
204 serde_json::Value::Object(new_obj)
205 }
206 serde_json::Value::Array(arr) => {
207 let mut new_arr = Vec::new();
208 for (i, val) in arr.iter().enumerate() {
209 let new_path = format!("{}[{}]", path, i);
210 new_arr.push(self.sanitize_json_recursive(val, redactions, new_path));
211 }
212 serde_json::Value::Array(new_arr)
213 }
214 _ => value.clone(), }
216 }
217
218 pub fn contains_pii(&self, text: &str) -> Vec<PiiMatch> {
220 if !self.enabled {
221 return Vec::new();
222 }
223
224 let mut matches = Vec::new();
225
226 for (pii_type, regex) in &self.patterns {
227 for mat in regex.find_iter(text) {
228 matches.push(PiiMatch {
229 pii_type: pii_type.clone(),
230 start: mat.start(),
231 end: mat.end(),
232 });
233 }
234 }
235
236 matches.sort_by_key(|m| m.start);
237 matches
238 }
239
240 pub fn analyze(&self, text: &str) -> PiiAnalysis {
242 let matches = self.contains_pii(text);
243 let mut type_counts = HashMap::new();
244
245 for pii_match in &matches {
246 *type_counts.entry(pii_match.pii_type.clone()).or_insert(0) += 1;
247 }
248
249 let total_matches = matches.len();
250 let unique_types = type_counts.len();
251 let has_pii = !matches.is_empty();
252
253 PiiAnalysis {
254 has_pii,
255 total_matches,
256 unique_types,
257 type_counts,
258 matches,
259 }
260 }
261
262 fn get_redaction_marker(&self, pii_type: &PiiType) -> String {
263 match pii_type {
264 PiiType::Ssn => "[REDACTED_SSN]".to_string(),
265 PiiType::CreditCard => "[REDACTED_CREDIT_CARD]".to_string(),
266 PiiType::Email => "[REDACTED_EMAIL]".to_string(),
267 PiiType::Phone => "[REDACTED_PHONE]".to_string(),
268 PiiType::ApiKey => "[REDACTED_API_KEY]".to_string(),
269 PiiType::IpAddress => "[REDACTED_IP]".to_string(),
270 PiiType::Custom(name) => format!("[REDACTED_{}]", name.to_uppercase()),
271 }
272 }
273}
274
275impl Default for Sanitizer {
276 fn default() -> Self {
277 Self::new()
278 }
279}
280
281#[derive(Debug, Clone)]
282pub struct SanitizationResult {
283 pub sanitized: String,
284 pub redactions: Vec<Redaction>,
285}
286
287#[derive(Debug, Clone)]
288pub struct SanitizationJsonResult {
289 pub sanitized: serde_json::Value,
290 pub redactions: Vec<JsonRedaction>,
291}
292
293#[derive(Debug, Clone)]
294pub struct Redaction {
295 pub pii_type: PiiType,
296 pub original_length: usize,
297 pub start_position: usize,
298 pub end_position: usize,
299}
300
301#[derive(Debug, Clone)]
302pub struct JsonRedaction {
303 pub path: String,
304 pub pii_type: PiiType,
305 pub original_length: usize,
306}
307
308#[derive(Debug, Clone)]
309pub struct PiiMatch {
310 pub pii_type: PiiType,
311 pub start: usize,
312 pub end: usize,
313}
314
315#[derive(Debug, Clone)]
316pub struct PiiAnalysis {
317 pub has_pii: bool,
318 pub total_matches: usize,
319 pub unique_types: usize,
320 pub type_counts: HashMap<PiiType, usize>,
321 pub matches: Vec<PiiMatch>,
322}
323
324#[derive(Error, Debug, Clone, PartialEq)]
325pub enum SanitizationError {
326 #[error("Invalid pattern: {0}")]
327 InvalidPattern(String),
328}
329
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Sanitizes `input` with a freshly built default sanitizer and returns
    /// only the resulting text.
    fn scrub(input: &str) -> String {
        Sanitizer::new().sanitize(input).sanitized
    }

    /// Asserts that each `(input, expected)` pair sanitizes as expected.
    fn assert_scrubbed(cases: &[(&str, &str)]) {
        for (input, expected) in cases {
            assert_eq!(scrub(input), *expected);
        }
    }

    #[test]
    fn test_sanitizer_creation() {
        let fresh = Sanitizer::new();
        assert!(fresh.enabled);
        assert!(!fresh.patterns.is_empty());
    }

    #[test]
    fn test_disabled_sanitizer() {
        let off = Sanitizer::disabled();
        assert!(!off.enabled);

        let outcome = off.sanitize("test@email.com");
        assert_eq!(outcome.sanitized, "test@email.com");
        assert!(outcome.redactions.is_empty());
    }

    #[test]
    fn test_email_sanitization() {
        let outcome =
            Sanitizer::new().sanitize("Contact me at john.doe@example.com for details.");

        assert_eq!(
            outcome.sanitized,
            "Contact me at [REDACTED_EMAIL] for details."
        );
        assert_eq!(outcome.redactions.len(), 1);
        assert!(matches!(outcome.redactions[0].pii_type, PiiType::Email));
    }

    #[test]
    fn test_ssn_sanitization() {
        assert_scrubbed(&[
            ("My SSN is 123-45-6789.", "My SSN is [REDACTED_SSN]."),
            ("SSN: 123 45 6789", "SSN: [REDACTED_SSN]"),
            ("SSN123456789", "SSN[REDACTED_SSN]"),
        ]);
    }

    #[test]
    fn test_credit_card_sanitization() {
        assert_scrubbed(&[
            (
                "Card number: 4532-1234-5678-9012",
                "Card number: [REDACTED_CREDIT_CARD]",
            ),
            ("Card: 4532123456789012", "Card: [REDACTED_CREDIT_CARD]"),
        ]);
    }

    #[test]
    fn test_phone_sanitization() {
        assert_scrubbed(&[
            ("Call me at (555) 123-4567", "Call me at [REDACTED_PHONE]"),
            ("Phone: +1-555-123-4567", "Phone: [REDACTED_PHONE]"),
        ]);
    }

    #[test]
    fn test_api_key_sanitization() {
        assert_scrubbed(&[
            (
                "OpenAI key: sk-1234567890abcdef1234567890abcdef",
                "OpenAI key: [REDACTED_API_KEY]",
            ),
            ("API key: api_1234567890abcdef", "API key: [REDACTED_API_KEY]"),
        ]);
    }

    #[test]
    fn test_ip_address_sanitization() {
        assert_eq!(scrub("Server IP: 192.168.1.100"), "Server IP: [REDACTED_IP]");
    }

    #[test]
    fn test_multiple_pii_sanitization() {
        let input = "Contact john@example.com at 555-123-4567 or visit 192.168.1.100";
        let outcome = Sanitizer::new().sanitize(input);

        assert_eq!(
            outcome.sanitized,
            "Contact [REDACTED_EMAIL] at [REDACTED_PHONE] or visit [REDACTED_IP]"
        );
        assert_eq!(outcome.redactions.len(), 3);
    }

    #[test]
    fn test_overlapping_patterns() {
        // The custom pattern matches a prefix of the SSN, so both patterns
        // hit the same region and only one redaction may be reported.
        let mut with_custom = Sanitizer::new();
        with_custom.add_pattern("test", r"\d{3}-\d{2}").unwrap();

        let outcome = with_custom.sanitize("SSN: 123-45-6789");

        assert_eq!(outcome.redactions.len(), 1);
    }

    #[test]
    fn test_json_sanitization() {
        let document = json!({
            "user": {
                "email": "john@example.com",
                "phone": "555-123-4567"
            },
            "config": {
                "api_key": "sk-1234567890abcdef1234567890abcdef",
                "timeout": 30
            }
        });

        let outcome = Sanitizer::new().sanitize_json(&document);
        let clean = &outcome.sanitized;

        assert_eq!(clean["user"]["email"], "[REDACTED_EMAIL]");
        assert_eq!(clean["user"]["phone"], "[REDACTED_PHONE]");
        assert_eq!(clean["config"]["api_key"], "[REDACTED_API_KEY]");
        // Non-string values pass through untouched.
        assert_eq!(clean["config"]["timeout"], 30);
        assert_eq!(outcome.redactions.len(), 3);
    }

    #[test]
    fn test_contains_pii() {
        let hits = Sanitizer::new().contains_pii("Email: john@example.com, Phone: 555-123-4567");
        let has_type = |wanted: PiiType| hits.iter().any(|hit| hit.pii_type == wanted);

        assert_eq!(hits.len(), 2);
        assert!(has_type(PiiType::Email));
        assert!(has_type(PiiType::Phone));
    }

    #[test]
    fn test_pii_analysis() {
        let report = Sanitizer::new()
            .analyze("Contact john@example.com or jane@test.org at 555-123-4567");

        assert!(report.has_pii);
        assert_eq!(report.total_matches, 3);
        assert_eq!(report.unique_types, 2);
        assert_eq!(*report.type_counts.get(&PiiType::Email).unwrap(), 2);
        assert_eq!(*report.type_counts.get(&PiiType::Phone).unwrap(), 1);
    }

    #[test]
    fn test_custom_pattern() {
        let mut with_custom = Sanitizer::new();
        with_custom
            .add_pattern("employee_id", r"\bEMP-\d{6}\b")
            .unwrap();

        let outcome = with_custom.sanitize("Employee ID: EMP-123456");
        assert_eq!(outcome.sanitized, "Employee ID: [REDACTED_EMPLOYEE_ID]");
    }

    #[test]
    fn test_invalid_pattern() {
        let mut target = Sanitizer::new();

        let attempt = target.add_pattern("invalid", r"[");
        assert!(attempt.is_err());
        assert!(matches!(
            attempt.unwrap_err(),
            SanitizationError::InvalidPattern(_)
        ));
    }

    #[test]
    fn test_pattern_removal() {
        let mut trimmed = Sanitizer::new();

        // The first removal succeeds; the second finds nothing to remove.
        assert!(trimmed.remove_pattern(&PiiType::Email));
        assert!(!trimmed.remove_pattern(&PiiType::Email));

        let outcome = trimmed.sanitize("Email: test@example.com");
        assert_eq!(outcome.sanitized, "Email: test@example.com");
    }

    #[test]
    fn test_enable_disable() {
        let input = "Email: test@example.com";
        let mut toggled = Sanitizer::new();

        toggled.set_enabled(false);
        assert_eq!(toggled.sanitize(input).sanitized, "Email: test@example.com");

        toggled.set_enabled(true);
        assert_eq!(toggled.sanitize(input).sanitized, "Email: [REDACTED_EMAIL]");
    }

    #[test]
    fn test_no_false_positives() {
        let scanner = Sanitizer::new();

        let samples = [
            "Version 1.2.3.4 released",
            "Price: $12.34",
            "Date: 12-34-5678",
            "Call ext 123",
        ];

        // Only checks that sanitizing these inputs yields some output.
        for sample in samples.iter() {
            let outcome = scanner.sanitize(sample);
            assert!(!outcome.sanitized.is_empty());
        }
    }

    #[test]
    fn test_performance_large_text() {
        let scanner = Sanitizer::new();

        let mut haystack = "Lorem ipsum dolor sit amet. ".repeat(1000);
        haystack.push_str("Contact: test@example.com");

        let timer = std::time::Instant::now();
        let outcome = scanner.sanitize(&haystack);
        let elapsed = timer.elapsed();

        assert!(elapsed.as_millis() < 100);
        assert!(outcome.sanitized.contains("[REDACTED_EMAIL]"));
    }
}