use regex::Regex;
use std::collections::HashMap;
use std::sync::LazyLock;
/// Scrubs secrets and personally identifiable information from free text.
///
/// Stateless: all functionality is exposed through associated functions,
/// with [`Redactor::scrub`] as the single public entry point.
pub struct Redactor;
impl Redactor {
pub fn scrub(text: &str) -> String {
let mut scrubbed = Self::scrub_patterns(text);
scrubbed = Self::scrub_high_entropy(&scrubbed);
scrubbed
}
fn scrub_patterns(text: &str) -> String {
let mut result = text.to_string();
let email_re = Regex::new(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}").unwrap();
result = email_re
.replace_all(&result, "[REDACTED_EMAIL]")
.to_string();
let ip_re = Regex::new(r"\b(?:\d{1,3}\.){3}\d{1,3}\b").unwrap();
result = ip_re.replace_all(&result, "[REDACTED_IP]").to_string();
let token_re = Regex::new(r"(?i)(sk-|ghp_|xoxb-|AIza)[a-zA-Z0-9_\-]+").unwrap();
result = token_re
.replace_all(&result, "[REDACTED_TOKEN]")
.to_string();
result
}
fn scrub_high_entropy(text: &str) -> String {
let mut result = text.to_string();
let delimiters = [
' ', '\t', '\n', '\r', '(', ')', '{', '}', '[', ']', ',', ';', ':', '=', '+', '-', '*',
'/',
];
let mut tokenizable = text.to_string();
for d in delimiters {
tokenizable = tokenizable.replace(d, " ");
}
let tokens: Vec<&str> = tokenizable.split_whitespace().collect();
for token in tokens {
if token.len() > 20 {
let entropy = Self::calculate_shannon_entropy(token);
if entropy > 5.2 {
result = result.replace(token, "[REDACTED_HIGH_ENTROPY]");
}
}
}
result
}
fn calculate_shannon_entropy(s: &str) -> f64 {
if s.is_empty() {
return 0.0;
}
let mut frequencies = HashMap::new();
for c in s.chars() {
*frequencies.entry(c).or_insert(0) += 1;
}
let len = s.chars().count() as f64;
let mut entropy = 0.0;
for &count in frequencies.values() {
let p = count as f64 / len;
entropy -= p * p.log2();
}
entropy
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_email_redaction() {
        let input = "Contact me at admin@example.com for keys.";
        let output = Redactor::scrub(input);
        assert!(output.contains("[REDACTED_EMAIL]"));
        assert!(!output.contains("admin@example.com"));
    }

    #[test]
    fn test_ip_redaction() {
        let output = Redactor::scrub("server at 192.168.0.1 is down");
        assert!(output.contains("[REDACTED_IP]"));
        assert!(!output.contains("192.168.0.1"));
    }

    // Renamed from `test_high_entropy_redaction`: these inputs are caught by
    // the prefix regex, not the entropy heuristic.
    #[test]
    fn test_known_token_prefix_redaction() {
        let input = "Token: sk-proj-1234567890abcdefghij";
        let output = Redactor::scrub(input);
        assert!(
            output.contains("[REDACTED_TOKEN]"),
            "Expected token redaction for sk- prefix. Got: {}",
            output
        );
        let input2 = "ghp_aBcDeFgHiJkLmNoPqRsTuVwXyZ12345";
        let output2 = Redactor::scrub(input2);
        assert!(
            output2.contains("[REDACTED_TOKEN]"),
            "Expected token redaction for ghp_ prefix. Got: {}",
            output2
        );
    }

    // Actually exercises the entropy path: 40 distinct characters give an
    // entropy of log2(40) ~ 5.32 bits/char, above the redaction threshold,
    // and the string matches none of the fixed patterns.
    #[test]
    fn test_high_entropy_redaction() {
        let secret = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN";
        let input = format!("key {} end", secret);
        let output = Redactor::scrub(&input);
        assert!(
            output.contains("[REDACTED_HIGH_ENTROPY]"),
            "Expected entropy-based redaction. Got: {}",
            output
        );
        assert!(!output.contains(secret));
    }

    #[test]
    fn test_low_entropy_preservation() {
        let input = "This is a perfectly normal sentence that should not be redacted.";
        // Equality implies no marker of any kind was inserted; the old
        // `!output.contains("[REDACTED]")` check was vacuous because no
        // marker contains that exact substring.
        assert_eq!(Redactor::scrub(input), input);
    }
}