edgeparse_core/utils/
sanitizer.rs1use regex::Regex;
4use std::sync::LazyLock;
5
6#[derive(Debug, Clone)]
8pub struct SanitizationRule {
9 pub name: &'static str,
11 pattern: &'static LazyLock<Regex>,
13 pub replacement: &'static str,
15}
16
17static EMAIL_RE: LazyLock<Regex> =
18 LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}").unwrap());
19
20static PHONE_RE: LazyLock<Regex> = LazyLock::new(|| {
21 Regex::new(r"(\+?\d{1,3}[-.\s]?)?\(?\d{2,4}\)?[-.\s]?\d{3,4}[-.\s]?\d{3,4}").unwrap()
22});
23
24static IP_RE: LazyLock<Regex> =
25 LazyLock::new(|| Regex::new(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b").unwrap());
26
27static CREDIT_CARD_RE: LazyLock<Regex> =
28 LazyLock::new(|| Regex::new(r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b").unwrap());
29
30static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"https?://[^\s<>"']+"#).unwrap());
31
32pub fn default_rules() -> Vec<SanitizationRule> {
34 vec![
35 SanitizationRule {
36 name: "email",
37 pattern: &EMAIL_RE,
38 replacement: "[EMAIL]",
39 },
40 SanitizationRule {
41 name: "credit_card",
42 pattern: &CREDIT_CARD_RE,
43 replacement: "[CREDIT_CARD]",
44 },
45 SanitizationRule {
46 name: "ip_address",
47 pattern: &IP_RE,
48 replacement: "[IP]",
49 },
50 SanitizationRule {
51 name: "url",
52 pattern: &URL_RE,
53 replacement: "[URL]",
54 },
55 SanitizationRule {
56 name: "phone",
57 pattern: &PHONE_RE,
58 replacement: "[PHONE]",
59 },
60 ]
61}
62
63pub fn sanitize_text(text: &str, rules: &[SanitizationRule]) -> String {
65 let mut result = text.to_string();
66 for rule in rules {
67 result = rule
68 .pattern
69 .replace_all(&result, rule.replacement)
70 .to_string();
71 }
72 result
73}
74
75pub fn normalize_unicode(text: &str) -> String {
77 use unicode_normalization::UnicodeNormalization;
78 text.nfc().collect()
79}
80
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `input` through the built-in rule set.
    fn sanitize_with_defaults(input: &str) -> String {
        sanitize_text(input, &default_rules())
    }

    #[test]
    fn test_sanitize_email() {
        let result = sanitize_with_defaults("Contact john@example.com for info");
        assert!(result.contains("[EMAIL]"));
        assert!(!result.contains("john@example.com"));
    }

    #[test]
    fn test_sanitize_url() {
        let result = sanitize_with_defaults("Visit https://example.com/page for details");
        assert!(result.contains("[URL]"));
        assert!(!result.contains("https://example.com"));
    }

    #[test]
    fn test_sanitize_credit_card() {
        let result = sanitize_with_defaults("Card: 4111-1111-1111-1111");
        assert!(result.contains("[CREDIT_CARD]"));
        // The digits themselves must be gone, not merely tagged.
        assert!(!result.contains("4111"));
        // The card rule runs before the phone rule, so no phone tag may appear.
        assert!(!result.contains("[PHONE]"));
    }

    #[test]
    fn test_sanitize_ip_address() {
        let result = sanitize_with_defaults("Server 192.168.1.1 is down");
        assert!(result.contains("[IP]"));
        assert!(!result.contains("192.168.1.1"));
    }

    #[test]
    fn test_sanitize_phone() {
        let result = sanitize_with_defaults("Call +1 555-123-4567 today");
        assert!(result.contains("[PHONE]"));
        assert!(!result.contains("555-123-4567"));
    }

    #[test]
    fn test_sanitize_without_rules_is_identity() {
        let input = "Contact john@example.com for info";
        assert_eq!(sanitize_text(input, &[]), input);
    }

    #[test]
    fn test_sanitize_leaves_clean_text_untouched() {
        let input = "Order 42 shipped on time.";
        assert_eq!(sanitize_with_defaults(input), input);
    }

    #[test]
    fn test_normalize_unicode() {
        // "e" followed by U+0301 (combining acute accent) composes to U+00E9.
        let decomposed = "e\u{0301}";
        let normalized = normalize_unicode(decomposed);
        assert_eq!(normalized, "\u{00E9}");
    }
}