Skip to main content

briefcase_core/
sanitization.rs

1use regex::Regex;
2use std::collections::HashMap;
3use thiserror::Error;
4
5#[derive(Debug, Clone)]
6pub struct Sanitizer {
7    patterns: HashMap<PiiType, Regex>,
8    enabled: bool,
9}
10
/// Category of sensitive data a pattern detects.
///
/// Usable as a `HashMap` key (it derives `Eq` and `Hash`).
#[derive(Clone, Debug, Hash, PartialEq, Eq)]
pub enum PiiType {
    /// Social security number.
    Ssn,
    /// Payment card number.
    CreditCard,
    /// E-mail address.
    Email,
    /// Telephone number.
    Phone,
    /// API key or access token.
    ApiKey,
    /// IP address.
    IpAddress,
    /// Caller-defined category, identified by its name.
    Custom(String),
}
21
22impl Sanitizer {
23    pub fn new() -> Self {
24        let mut patterns = HashMap::new();
25
26        patterns.insert(
27            PiiType::Ssn,
28            Regex::new(r"\b\d{3}-\d{2}-\d{4}\b|\b\d{3}\s\d{2}\s\d{4}\b|\d{9}").unwrap(),
29        );
30        patterns.insert(
31            PiiType::CreditCard,
32            Regex::new(r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b").unwrap(),
33        );
34        patterns.insert(
35            PiiType::Email,
36            Regex::new(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b").unwrap(),
37        );
38        patterns.insert(
39            PiiType::Phone,
40            Regex::new(r"(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}").unwrap(),
41        );
42        patterns.insert(
43            PiiType::ApiKey,
44            Regex::new(r"\b(sk-|bai_|api_|key_|AIza|AKIA|ya29\.|xox[bpoa]-)[A-Za-z0-9_-]{15,}\b")
45                .unwrap(),
46        );
47        patterns.insert(
48            PiiType::IpAddress,
49            Regex::new(r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b").unwrap(),
50        );
51
52        Self {
53            patterns,
54            enabled: true,
55        }
56    }
57
58    pub fn disabled() -> Self {
59        Self {
60            patterns: HashMap::new(),
61            enabled: false,
62        }
63    }
64
65    /// Add a custom PII pattern
66    pub fn add_pattern(&mut self, name: &str, pattern: &str) -> Result<(), SanitizationError> {
67        let regex =
68            Regex::new(pattern).map_err(|e| SanitizationError::InvalidPattern(e.to_string()))?;
69        self.patterns
70            .insert(PiiType::Custom(name.to_string()), regex);
71        Ok(())
72    }
73
74    /// Remove a pattern
75    pub fn remove_pattern(&mut self, pii_type: &PiiType) -> bool {
76        self.patterns.remove(pii_type).is_some()
77    }
78
79    /// Enable or disable sanitization
80    pub fn set_enabled(&mut self, enabled: bool) {
81        self.enabled = enabled;
82    }
83
84    /// Sanitize a string, replacing PII with redaction markers
85    pub fn sanitize(&self, text: &str) -> SanitizationResult {
86        if !self.enabled {
87            return SanitizationResult {
88                sanitized: text.to_string(),
89                redactions: Vec::new(),
90            };
91        }
92
93        let mut result = text.to_string();
94        let mut redactions = Vec::new();
95
96        // Collect all matches first to avoid overlapping replacements
97        let mut all_matches = Vec::new();
98
99        for (pii_type, regex) in &self.patterns {
100            for mat in regex.find_iter(text) {
101                all_matches.push((mat.start(), mat.end(), pii_type.clone()));
102            }
103        }
104
105        // Sort by start position, then by length (longest first for overlaps)
106        all_matches.sort_by_key(|(start, end, _)| (*start, std::cmp::Reverse(end - start)));
107
108        // Remove overlapping matches (keep the longest one)
109        let mut non_overlapping_matches = Vec::new();
110        let mut last_end = 0;
111
112        for (start, end, pii_type) in all_matches {
113            if start >= last_end {
114                non_overlapping_matches.push((start, end, pii_type));
115                last_end = end;
116            } else if start < last_end {
117                // Check if this match is longer than the previous one that overlaps
118                if let Some(last_match) = non_overlapping_matches.last() {
119                    if (end - start) > (last_match.1 - last_match.0) {
120                        // Replace the shorter match with this longer one
121                        non_overlapping_matches.pop();
122                        non_overlapping_matches.push((start, end, pii_type));
123                        last_end = end;
124                    }
125                }
126            }
127        }
128
129        // Apply redactions from right to left to maintain indices
130        for (start, end, pii_type) in non_overlapping_matches.into_iter().rev() {
131            let redaction_marker = self.get_redaction_marker(&pii_type);
132            let original_length = end - start;
133
134            result.replace_range(start..end, &redaction_marker);
135
136            redactions.push(Redaction {
137                pii_type: pii_type.clone(),
138                original_length,
139                start_position: start,
140                end_position: start + redaction_marker.len(), // New end position after redaction
141            });
142        }
143
144        // Sort redactions by original position
145        redactions.sort_by_key(|r| r.start_position);
146
147        SanitizationResult {
148            sanitized: result,
149            redactions,
150        }
151    }
152
153    /// Sanitize a JSON value recursively
154    pub fn sanitize_json(&self, value: &serde_json::Value) -> SanitizationJsonResult {
155        if !self.enabled {
156            return SanitizationJsonResult {
157                sanitized: value.clone(),
158                redactions: Vec::new(),
159            };
160        }
161
162        let mut redactions = Vec::new();
163        let sanitized = self.sanitize_json_recursive(value, &mut redactions, String::new());
164
165        SanitizationJsonResult {
166            sanitized,
167            redactions,
168        }
169    }
170
171    fn sanitize_json_recursive(
172        &self,
173        value: &serde_json::Value,
174        redactions: &mut Vec<JsonRedaction>,
175        path: String,
176    ) -> serde_json::Value {
177        match value {
178            serde_json::Value::String(s) => {
179                let result = self.sanitize(s);
180                if !result.redactions.is_empty() {
181                    for redaction in result.redactions {
182                        redactions.push(JsonRedaction {
183                            path: path.clone(),
184                            pii_type: redaction.pii_type,
185                            original_length: redaction.original_length,
186                        });
187                    }
188                }
189                serde_json::Value::String(result.sanitized)
190            }
191            serde_json::Value::Object(obj) => {
192                let mut new_obj = serde_json::Map::new();
193                for (key, val) in obj {
194                    let new_path = if path.is_empty() {
195                        key.clone()
196                    } else {
197                        format!("{}.{}", path, key)
198                    };
199                    new_obj.insert(
200                        key.clone(),
201                        self.sanitize_json_recursive(val, redactions, new_path),
202                    );
203                }
204                serde_json::Value::Object(new_obj)
205            }
206            serde_json::Value::Array(arr) => {
207                let mut new_arr = Vec::new();
208                for (i, val) in arr.iter().enumerate() {
209                    let new_path = format!("{}[{}]", path, i);
210                    new_arr.push(self.sanitize_json_recursive(val, redactions, new_path));
211                }
212                serde_json::Value::Array(new_arr)
213            }
214            _ => value.clone(), // Numbers, booleans, null remain unchanged
215        }
216    }
217
218    /// Check if text contains PII (without modifying)
219    pub fn contains_pii(&self, text: &str) -> Vec<PiiMatch> {
220        if !self.enabled {
221            return Vec::new();
222        }
223
224        let mut matches = Vec::new();
225
226        for (pii_type, regex) in &self.patterns {
227            for mat in regex.find_iter(text) {
228                matches.push(PiiMatch {
229                    pii_type: pii_type.clone(),
230                    start: mat.start(),
231                    end: mat.end(),
232                });
233            }
234        }
235
236        matches.sort_by_key(|m| m.start);
237        matches
238    }
239
240    /// Analyze text and return detailed PII statistics
241    pub fn analyze(&self, text: &str) -> PiiAnalysis {
242        let matches = self.contains_pii(text);
243        let mut type_counts = HashMap::new();
244
245        for pii_match in &matches {
246            *type_counts.entry(pii_match.pii_type.clone()).or_insert(0) += 1;
247        }
248
249        let total_matches = matches.len();
250        let unique_types = type_counts.len();
251        let has_pii = !matches.is_empty();
252
253        PiiAnalysis {
254            has_pii,
255            total_matches,
256            unique_types,
257            type_counts,
258            matches,
259        }
260    }
261
262    fn get_redaction_marker(&self, pii_type: &PiiType) -> String {
263        match pii_type {
264            PiiType::Ssn => "[REDACTED_SSN]".to_string(),
265            PiiType::CreditCard => "[REDACTED_CREDIT_CARD]".to_string(),
266            PiiType::Email => "[REDACTED_EMAIL]".to_string(),
267            PiiType::Phone => "[REDACTED_PHONE]".to_string(),
268            PiiType::ApiKey => "[REDACTED_API_KEY]".to_string(),
269            PiiType::IpAddress => "[REDACTED_IP]".to_string(),
270            PiiType::Custom(name) => format!("[REDACTED_{}]", name.to_uppercase()),
271        }
272    }
273}
274
275impl Default for Sanitizer {
276    fn default() -> Self {
277        Self::new()
278    }
279}
280
281#[derive(Debug, Clone)]
282pub struct SanitizationResult {
283    pub sanitized: String,
284    pub redactions: Vec<Redaction>,
285}
286
287#[derive(Debug, Clone)]
288pub struct SanitizationJsonResult {
289    pub sanitized: serde_json::Value,
290    pub redactions: Vec<JsonRedaction>,
291}
292
293#[derive(Debug, Clone)]
294pub struct Redaction {
295    pub pii_type: PiiType,
296    pub original_length: usize,
297    pub start_position: usize,
298    pub end_position: usize,
299}
300
301#[derive(Debug, Clone)]
302pub struct JsonRedaction {
303    pub path: String,
304    pub pii_type: PiiType,
305    pub original_length: usize,
306}
307
308#[derive(Debug, Clone)]
309pub struct PiiMatch {
310    pub pii_type: PiiType,
311    pub start: usize,
312    pub end: usize,
313}
314
315#[derive(Debug, Clone)]
316pub struct PiiAnalysis {
317    pub has_pii: bool,
318    pub total_matches: usize,
319    pub unique_types: usize,
320    pub type_counts: HashMap<PiiType, usize>,
321    pub matches: Vec<PiiMatch>,
322}
323
324#[derive(Error, Debug, Clone, PartialEq)]
325pub enum SanitizationError {
326    #[error("Invalid pattern: {0}")]
327    InvalidPattern(String),
328}
329
#[cfg(test)]
mod tests {
    //! Unit tests for the sanitizer: one test per built-in pattern, plus
    //! overlap handling, JSON traversal, detection/analysis and configuration.

    use super::*;
    use serde_json::json;

    #[test]
    fn test_sanitizer_creation() {
        let sanitizer = Sanitizer::new();
        assert!(sanitizer.enabled);
        assert!(!sanitizer.patterns.is_empty());
    }

    #[test]
    fn test_disabled_sanitizer() {
        let sanitizer = Sanitizer::disabled();
        assert!(!sanitizer.enabled);

        let result = sanitizer.sanitize("test@email.com");
        assert_eq!(result.sanitized, "test@email.com");
        assert!(result.redactions.is_empty());
    }

    #[test]
    fn test_email_sanitization() {
        let sanitizer = Sanitizer::new();
        let result = sanitizer.sanitize("Contact me at john.doe@example.com for details.");

        assert_eq!(
            result.sanitized,
            "Contact me at [REDACTED_EMAIL] for details."
        );
        assert_eq!(result.redactions.len(), 1);

        let redaction = &result.redactions[0];
        assert!(matches!(redaction.pii_type, PiiType::Email));
        // The reported range must locate the marker in the sanitized output.
        assert_eq!(
            &result.sanitized[redaction.start_position..redaction.end_position],
            "[REDACTED_EMAIL]"
        );
    }

    #[test]
    fn test_ssn_sanitization() {
        let sanitizer = Sanitizer::new();

        // Hyphenated SSN
        let result = sanitizer.sanitize("My SSN is 123-45-6789.");
        assert_eq!(result.sanitized, "My SSN is [REDACTED_SSN].");

        // Spaced SSN
        let result = sanitizer.sanitize("SSN: 123 45 6789");
        assert_eq!(result.sanitized, "SSN: [REDACTED_SSN]");

        // No delimiter SSN
        let result = sanitizer.sanitize("SSN123456789");
        assert_eq!(result.sanitized, "SSN[REDACTED_SSN]");
    }

    #[test]
    fn test_credit_card_sanitization() {
        let sanitizer = Sanitizer::new();

        let result = sanitizer.sanitize("Card number: 4532-1234-5678-9012");
        assert_eq!(result.sanitized, "Card number: [REDACTED_CREDIT_CARD]");

        let result = sanitizer.sanitize("Card: 4532123456789012");
        assert_eq!(result.sanitized, "Card: [REDACTED_CREDIT_CARD]");
    }

    #[test]
    fn test_phone_sanitization() {
        let sanitizer = Sanitizer::new();

        let result = sanitizer.sanitize("Call me at (555) 123-4567");
        assert_eq!(result.sanitized, "Call me at [REDACTED_PHONE]");

        let result = sanitizer.sanitize("Phone: +1-555-123-4567");
        assert_eq!(result.sanitized, "Phone: [REDACTED_PHONE]");
    }

    #[test]
    fn test_api_key_sanitization() {
        let sanitizer = Sanitizer::new();

        let result = sanitizer.sanitize("OpenAI key: sk-1234567890abcdef1234567890abcdef");
        assert_eq!(result.sanitized, "OpenAI key: [REDACTED_API_KEY]");

        let result = sanitizer.sanitize("API key: api_1234567890abcdef");
        assert_eq!(result.sanitized, "API key: [REDACTED_API_KEY]");
    }

    #[test]
    fn test_ip_address_sanitization() {
        let sanitizer = Sanitizer::new();

        let result = sanitizer.sanitize("Server IP: 192.168.1.100");
        assert_eq!(result.sanitized, "Server IP: [REDACTED_IP]");
    }

    #[test]
    fn test_multiple_pii_sanitization() {
        let sanitizer = Sanitizer::new();

        let text = "Contact john@example.com at 555-123-4567 or visit 192.168.1.100";
        let result = sanitizer.sanitize(text);

        assert_eq!(
            result.sanitized,
            "Contact [REDACTED_EMAIL] at [REDACTED_PHONE] or visit [REDACTED_IP]"
        );
        assert_eq!(result.redactions.len(), 3);
    }

    #[test]
    fn test_overlapping_patterns() {
        let mut sanitizer = Sanitizer::new();

        // This custom pattern matches a prefix of the SSN match.
        sanitizer.add_pattern("test", r"\d{3}-\d{2}").unwrap();

        let result = sanitizer.sanitize("SSN: 123-45-6789");

        // Only one redaction is made, labelled by the longest match (the SSN).
        assert_eq!(result.redactions.len(), 1);
        assert!(matches!(result.redactions[0].pii_type, PiiType::Ssn));
        assert_eq!(result.sanitized, "SSN: [REDACTED_SSN]");
    }

    #[test]
    fn test_json_sanitization() {
        let sanitizer = Sanitizer::new();

        let data = json!({
            "user": {
                "email": "john@example.com",
                "phone": "555-123-4567"
            },
            "config": {
                "api_key": "sk-1234567890abcdef1234567890abcdef",
                "timeout": 30
            }
        });

        let result = sanitizer.sanitize_json(&data);

        // Check that emails, phones, and API keys are redacted
        assert_eq!(result.sanitized["user"]["email"], "[REDACTED_EMAIL]");
        assert_eq!(result.sanitized["user"]["phone"], "[REDACTED_PHONE]");
        assert_eq!(result.sanitized["config"]["api_key"], "[REDACTED_API_KEY]");
        assert_eq!(result.sanitized["config"]["timeout"], 30); // Number unchanged

        assert_eq!(result.redactions.len(), 3);
    }

    #[test]
    fn test_contains_pii() {
        let sanitizer = Sanitizer::new();

        let text = "Email: john@example.com, Phone: 555-123-4567";
        let matches = sanitizer.contains_pii(text);

        assert_eq!(matches.len(), 2);
        assert!(matches.iter().any(|m| matches!(m.pii_type, PiiType::Email)));
        assert!(matches.iter().any(|m| matches!(m.pii_type, PiiType::Phone)));
    }

    #[test]
    fn test_pii_analysis() {
        let sanitizer = Sanitizer::new();

        let text = "Contact john@example.com or jane@test.org at 555-123-4567";
        let analysis = sanitizer.analyze(text);

        assert!(analysis.has_pii);
        assert_eq!(analysis.total_matches, 3);
        assert_eq!(analysis.unique_types, 2); // Email and Phone
        assert_eq!(*analysis.type_counts.get(&PiiType::Email).unwrap(), 2);
        assert_eq!(*analysis.type_counts.get(&PiiType::Phone).unwrap(), 1);
    }

    #[test]
    fn test_custom_pattern() {
        let mut sanitizer = Sanitizer::new();

        sanitizer
            .add_pattern("employee_id", r"\bEMP-\d{6}\b")
            .unwrap();

        let result = sanitizer.sanitize("Employee ID: EMP-123456");
        assert_eq!(result.sanitized, "Employee ID: [REDACTED_EMPLOYEE_ID]");
    }

    #[test]
    fn test_invalid_pattern() {
        let mut sanitizer = Sanitizer::new();

        let result = sanitizer.add_pattern("invalid", r"[");
        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            SanitizationError::InvalidPattern(_)
        ));
    }

    #[test]
    fn test_pattern_removal() {
        let mut sanitizer = Sanitizer::new();

        assert!(sanitizer.remove_pattern(&PiiType::Email));
        assert!(!sanitizer.remove_pattern(&PiiType::Email)); // Already removed

        let result = sanitizer.sanitize("Email: test@example.com");
        assert_eq!(result.sanitized, "Email: test@example.com"); // Should not be redacted
    }

    #[test]
    fn test_enable_disable() {
        let mut sanitizer = Sanitizer::new();

        sanitizer.set_enabled(false);
        let result = sanitizer.sanitize("Email: test@example.com");
        assert_eq!(result.sanitized, "Email: test@example.com");

        sanitizer.set_enabled(true);
        let result = sanitizer.sanitize("Email: test@example.com");
        assert_eq!(result.sanitized, "Email: [REDACTED_EMAIL]");
    }

    #[test]
    fn test_no_false_positives() {
        let sanitizer = Sanitizer::new();

        // None of the built-in patterns match these, so they must pass through
        // untouched.
        let untouched = [
            "Price: $12.34",    // Not a credit card
            "Date: 12-34-5678", // Invalid SSN format
            "Call ext 123",     // Too short for phone
        ];

        for text in untouched {
            let result = sanitizer.sanitize(text);
            assert_eq!(result.sanitized, text);
            assert!(result.redactions.is_empty());
        }

        // Known limitation: a four-part version number is indistinguishable
        // from an IPv4 address for the built-in pattern, so only check that
        // sanitizing it completes and yields output.
        let result = sanitizer.sanitize("Version 1.2.3.4 released");
        assert!(!result.sanitized.is_empty());
    }

    #[test]
    fn test_performance_large_text() {
        let sanitizer = Sanitizer::new();

        // Generate a large text with some PII
        let large_text = "Lorem ipsum dolor sit amet. ".repeat(1000) + "Contact: test@example.com";

        let start = std::time::Instant::now();
        let result = sanitizer.sanitize(&large_text);
        let duration = start.elapsed();

        // Smoke check against pathological slowness only. A tight wall-clock
        // limit would make this test fail spuriously on unoptimised builds or
        // busy machines, so the budget is deliberately generous.
        assert!(duration.as_secs() < 5);
        assert!(result.sanitized.contains("[REDACTED_EMAIL]"));
    }
}