1use regex::Regex;
26use std::collections::HashMap;
27
/// Stateless entry point for scrubbing sensitive data out of free-form text.
///
/// The type carries no fields; all functionality is exposed through
/// associated functions such as `Redactor::scrub`.
#[derive(Debug, Clone, Copy, Default)]
pub struct Redactor;
30
31impl Redactor {
32 pub fn scrub(text: &str) -> String {
34 let mut scrubbed = Self::scrub_patterns(text);
36
37 scrubbed = Self::scrub_high_entropy(&scrubbed);
39
40 scrubbed
41 }
42
43 fn scrub_patterns(text: &str) -> String {
45 let mut result = text.to_string();
46
47 let email_re = Regex::new(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}").unwrap();
49 result = email_re
50 .replace_all(&result, "[REDACTED_EMAIL]")
51 .to_string();
52
53 let ip_re = Regex::new(r"\b(?:\d{1,3}\.){3}\d{1,3}\b").unwrap();
55 result = ip_re.replace_all(&result, "[REDACTED_IP]").to_string();
56
57 let token_re = Regex::new(r"(?i)(sk-|ghp_|xoxb-|AIza)[a-zA-Z0-9_\-]+").unwrap();
59 result = token_re
60 .replace_all(&result, "[REDACTED_TOKEN]")
61 .to_string();
62
63 result
64 }
65
66 fn scrub_high_entropy(text: &str) -> String {
68 let mut result = text.to_string();
70 let delimiters = [
71 ' ', '\t', '\n', '\r', '(', ')', '{', '}', '[', ']', ',', ';', ':', '=', '+', '-', '*',
72 '/',
73 ];
74
75 let mut tokenizable = text.to_string();
77 for d in delimiters {
78 tokenizable = tokenizable.replace(d, " ");
79 }
80
81 let tokens: Vec<&str> = tokenizable.split_whitespace().collect();
82
83 for token in tokens {
84 if token.len() > 20 {
86 let entropy = Self::calculate_shannon_entropy(token);
87 if entropy > 5.2 {
90 result = result.replace(token, "[REDACTED_HIGH_ENTROPY]");
91 }
92 }
93 }
94
95 result
96 }
97
98 fn calculate_shannon_entropy(s: &str) -> f64 {
101 if s.is_empty() {
102 return 0.0;
103 }
104
105 let mut frequencies = HashMap::new();
106 for c in s.chars() {
107 *frequencies.entry(c).or_insert(0) += 1;
108 }
109
110 let len = s.chars().count() as f64;
111 let mut entropy = 0.0;
112
113 for &count in frequencies.values() {
114 let p = count as f64 / len;
115 entropy -= p * p.log2();
116 }
117
118 entropy
119 }
120}
121
#[cfg(test)]
mod tests {
    use super::*;

    /// An e-mail address is replaced by its marker and no longer appears verbatim.
    #[test]
    fn test_email_redaction() {
        let input = "Contact me at admin@example.com for keys.";
        let output = Redactor::scrub(input);
        assert!(output.contains("[REDACTED_EMAIL]"));
        assert!(!output.contains("admin@example.com"));
    }

    /// A dotted-quad address is replaced by the IP marker.
    #[test]
    fn test_ip_redaction() {
        let input = "Server at 192.168.1.10 is down";
        let output = Redactor::scrub(input);
        assert!(output.contains("[REDACTED_IP]"));
        assert!(!output.contains("192.168.1.10"));
    }

    /// Tokens with a well-known vendor prefix are caught by the pattern pass.
    #[test]
    fn test_token_prefix_redaction() {
        let input = "Token: sk-proj-1234567890abcdefghij";
        let output = Redactor::scrub(input);
        assert!(
            output.contains("[REDACTED_TOKEN]"),
            "Expected token redaction for sk- prefix. Got: {}",
            output
        );

        let input2 = "ghp_aBcDeFgHiJkLmNoPqRsTuVwXyZ12345";
        let output2 = Redactor::scrub(input2);
        assert!(
            output2.contains("[REDACTED_TOKEN]"),
            "Expected token redaction for ghp_ prefix. Got: {}",
            output2
        );
    }

    /// A long token with no known prefix is caught by the entropy pass.
    #[test]
    fn test_high_entropy_redaction() {
        // 62 distinct characters => entropy of log2(62) ~= 5.95 bits per character.
        let secret = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
        let input = format!("blob {}", secret);
        let output = Redactor::scrub(&input);
        assert_eq!(output, "blob [REDACTED_HIGH_ENTROPY]");
    }

    /// Ordinary prose passes through completely unchanged.
    #[test]
    fn test_low_entropy_preservation() {
        let input = "This is a perfectly normal sentence that should not be redacted.";
        let output = Redactor::scrub(input);
        // Every marker starts with "[REDACTED_", so check the shared prefix
        // rather than a literal "[REDACTED]" that is never emitted.
        assert!(!output.contains("[REDACTED_"));
        assert_eq!(input, output);
    }
}