Skip to main content

edgeparse_core/utils/
sanitizer.rs

1//! Content sanitization — PII masking, Unicode normalization.
2
3use regex::Regex;
4use std::sync::LazyLock;
5
6/// Sanitization rule: a regex pattern and its replacement.
7#[derive(Debug, Clone)]
8pub struct SanitizationRule {
9    /// Human-readable name of the rule
10    pub name: &'static str,
11    /// Compiled regex pattern
12    pattern: &'static LazyLock<Regex>,
13    /// Replacement placeholder
14    pub replacement: &'static str,
15}
16
17static EMAIL_RE: LazyLock<Regex> =
18    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}").unwrap());
19
20static PHONE_RE: LazyLock<Regex> = LazyLock::new(|| {
21    Regex::new(r"(\+?\d{1,3}[-.\s]?)?\(?\d{2,4}\)?[-.\s]?\d{3,4}[-.\s]?\d{3,4}").unwrap()
22});
23
24static IP_RE: LazyLock<Regex> =
25    LazyLock::new(|| Regex::new(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b").unwrap());
26
27static CREDIT_CARD_RE: LazyLock<Regex> =
28    LazyLock::new(|| Regex::new(r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b").unwrap());
29
30static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"https?://[^\s<>"']+"#).unwrap());
31
32/// Default sanitization rules for PII masking.
33pub fn default_rules() -> Vec<SanitizationRule> {
34    vec![
35        SanitizationRule {
36            name: "email",
37            pattern: &EMAIL_RE,
38            replacement: "[EMAIL]",
39        },
40        SanitizationRule {
41            name: "credit_card",
42            pattern: &CREDIT_CARD_RE,
43            replacement: "[CREDIT_CARD]",
44        },
45        SanitizationRule {
46            name: "ip_address",
47            pattern: &IP_RE,
48            replacement: "[IP]",
49        },
50        SanitizationRule {
51            name: "url",
52            pattern: &URL_RE,
53            replacement: "[URL]",
54        },
55        SanitizationRule {
56            name: "phone",
57            pattern: &PHONE_RE,
58            replacement: "[PHONE]",
59        },
60    ]
61}
62
63/// Apply all sanitization rules to a text string.
64pub fn sanitize_text(text: &str, rules: &[SanitizationRule]) -> String {
65    let mut result = text.to_string();
66    for rule in rules {
67        result = rule
68            .pattern
69            .replace_all(&result, rule.replacement)
70            .to_string();
71    }
72    result
73}
74
75/// Normalize Unicode text (NFC normalization).
76pub fn normalize_unicode(text: &str) -> String {
77    use unicode_normalization::UnicodeNormalization;
78    text.nfc().collect()
79}
80
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `input` through the default rule set.
    fn sanitize_with_defaults(input: &str) -> String {
        sanitize_text(input, &default_rules())
    }

    #[test]
    fn test_sanitize_email() {
        let result = sanitize_with_defaults("Contact john@example.com for info");
        assert!(result.contains("[EMAIL]"));
        assert!(!result.contains("john@example.com"));
    }

    #[test]
    fn test_sanitize_url() {
        let result = sanitize_with_defaults("Visit https://example.com/page for details");
        assert!(result.contains("[URL]"));
        assert!(!result.contains("https://example.com"));
    }

    #[test]
    fn test_sanitize_credit_card() {
        let result = sanitize_with_defaults("Card: 4111-1111-1111-1111");
        assert!(result.contains("[CREDIT_CARD]"));
        // The placeholder alone is not enough: the digits must be gone too.
        assert!(!result.contains("4111"));
    }

    #[test]
    fn test_sanitize_ip_address() {
        let result = sanitize_with_defaults("Server at 192.168.1.1 is down");
        assert!(result.contains("[IP]"));
        assert!(!result.contains("192.168.1.1"));
    }

    #[test]
    fn test_sanitize_phone() {
        let result = sanitize_with_defaults("Call 555-123-4567 now");
        assert!(result.contains("[PHONE]"));
        assert!(!result.contains("555-123-4567"));
    }

    #[test]
    fn test_sanitize_with_no_rules_is_identity() {
        let input = "Contact john@example.com for info";
        assert_eq!(sanitize_text(input, &[]), input);
    }

    #[test]
    fn test_normalize_unicode() {
        // Combining accent vs precomposed
        let decomposed = "e\u{0301}"; // e + combining acute
        let normalized = normalize_unicode(decomposed);
        // Written as an escape so the expected form (precomposed U+00E9)
        // cannot be silently altered by an editor or tool.
        assert_eq!(normalized, "\u{00E9}");
    }
}