use regex::Regex;
use std::collections::HashMap;
use std::sync::OnceLock;
/// Minimum byte length a token must have before the high-entropy heuristic
/// considers it; shorter strings give unreliable entropy estimates.
const ENTROPY_TOKEN_MIN_LEN: usize = 24;
/// Outcome of a leak scan.
#[derive(Debug, Clone)]
pub enum LeakResult {
    /// No sensitive patterns were found.
    Clean,
    /// At least one sensitive pattern was found.
    Detected {
        /// Human-readable names of every pattern class that matched.
        patterns: Vec<String>,
        /// Copy of the scanned input with each match replaced by a
        /// `[REDACTED_*]` placeholder.
        redacted: String,
    },
}
/// Scans text for leaked credentials (API keys, AWS credentials, private
/// keys, JWTs, database URLs, high-entropy tokens) and produces a redacted
/// copy of the input.
#[derive(Debug, Clone)]
pub struct LeakDetector {
    // Detection aggressiveness in [0.0, 1.0]. Values above 0.5 enable the
    // noisier generic-secret patterns; it also feeds the entropy threshold
    // used by the high-entropy token heuristic.
    sensitivity: f64,
}
impl Default for LeakDetector {
    /// Equivalent to [`LeakDetector::new`] (default sensitivity 0.7).
    fn default() -> Self {
        Self::new()
    }
}
impl LeakDetector {
    /// Creates a detector with the default sensitivity of 0.7.
    pub fn new() -> Self {
        Self { sensitivity: 0.7 }
    }

    /// Creates a detector with an explicit sensitivity, clamped to `[0.0, 1.0]`.
    ///
    /// Higher sensitivity is more aggressive: it enables the noisier
    /// generic-secret patterns (above 0.5) and lowers the entropy threshold
    /// for the high-entropy token heuristic.
    pub fn with_sensitivity(sensitivity: f64) -> Self {
        Self {
            sensitivity: sensitivity.clamp(0.0, 1.0),
        }
    }

    /// Scans `content` for leaked credentials.
    ///
    /// Returns [`LeakResult::Clean`] when nothing matched, otherwise
    /// [`LeakResult::Detected`] with the names of every matched pattern class
    /// and a copy of `content` in which each match is replaced by a
    /// `[REDACTED_*]` marker.
    pub fn scan(&self, content: &str) -> LeakResult {
        let mut patterns = Vec::new();
        let mut redacted = content.to_string();
        self.check_api_keys(content, &mut patterns, &mut redacted);
        self.check_aws_credentials(content, &mut patterns, &mut redacted);
        self.check_generic_secrets(content, &mut patterns, &mut redacted);
        self.check_private_keys(content, &mut patterns, &mut redacted);
        self.check_jwt_tokens(content, &mut patterns, &mut redacted);
        self.check_database_urls(content, &mut patterns, &mut redacted);
        self.check_high_entropy_tokens(content, &mut patterns, &mut redacted);
        if patterns.is_empty() {
            LeakResult::Clean
        } else {
            LeakResult::Detected { patterns, redacted }
        }
    }

    /// Shared driver for the regex-list checks: for each `(regex, name)` pair
    /// that matches `content`, records `name` in `patterns` and replaces every
    /// match in `redacted` with `replacement`.
    fn apply_regexes(
        content: &str,
        regexes: &[(Regex, &'static str)],
        replacement: &str,
        patterns: &mut Vec<String>,
        redacted: &mut String,
    ) {
        for (regex, name) in regexes {
            if regex.is_match(content) {
                patterns.push((*name).to_string());
                *redacted = regex.replace_all(redacted, replacement).to_string();
            }
        }
    }

    /// Detects well-known vendor API key formats (Stripe, OpenAI, Anthropic,
    /// Google, GitHub) plus a generic `api_key=...` form.
    fn check_api_keys(&self, content: &str, patterns: &mut Vec<String>, redacted: &mut String) {
        // Compiled once per process: regex compilation is expensive.
        static API_KEY_PATTERNS: OnceLock<Vec<(Regex, &'static str)>> = OnceLock::new();
        let regexes = API_KEY_PATTERNS.get_or_init(|| {
            vec![
                (
                    Regex::new(r"sk_(live|test)_[a-zA-Z0-9]{24,}").unwrap(),
                    "Stripe secret key",
                ),
                (
                    Regex::new(r"pk_(live|test)_[a-zA-Z0-9]{24,}").unwrap(),
                    "Stripe publishable key",
                ),
                (
                    Regex::new(r"sk-[a-zA-Z0-9]{20,}T3BlbkFJ[a-zA-Z0-9]{20,}").unwrap(),
                    "OpenAI API key",
                ),
                (
                    Regex::new(r"sk-[a-zA-Z0-9]{48,}").unwrap(),
                    "OpenAI-style API key",
                ),
                (
                    Regex::new(r"sk-ant-[a-zA-Z0-9-_]{32,}").unwrap(),
                    "Anthropic API key",
                ),
                (
                    Regex::new(r"AIza[a-zA-Z0-9_-]{35}").unwrap(),
                    "Google API key",
                ),
                (
                    Regex::new(r"gh[pousr]_[a-zA-Z0-9]{36,}").unwrap(),
                    "GitHub token",
                ),
                (
                    Regex::new(r"github_pat_[a-zA-Z0-9_]{22,}").unwrap(),
                    "GitHub PAT",
                ),
                (
                    Regex::new(r#"api[_-]?key[=:]\s*['"]*[a-zA-Z0-9_-]{20,}"#).unwrap(),
                    "Generic API key",
                ),
            ]
        });
        Self::apply_regexes(content, regexes, "[REDACTED_API_KEY]", patterns, redacted);
    }

    /// Detects AWS access key IDs and `aws_secret_access_key=...` assignments.
    fn check_aws_credentials(
        &self,
        content: &str,
        patterns: &mut Vec<String>,
        redacted: &mut String,
    ) {
        static AWS_PATTERNS: OnceLock<Vec<(Regex, &'static str)>> = OnceLock::new();
        let regexes = AWS_PATTERNS.get_or_init(|| {
            vec![
                (
                    Regex::new(r"AKIA[A-Z0-9]{16}").unwrap(),
                    "AWS Access Key ID",
                ),
                (
                    Regex::new(
                        r#"aws[_-]?secret[_-]?access[_-]?key[=:]\s*['"]*[a-zA-Z0-9/+=]{40}"#,
                    )
                    .unwrap(),
                    "AWS Secret Access Key",
                ),
            ]
        });
        Self::apply_regexes(content, regexes, "[REDACTED_AWS_CREDENTIAL]", patterns, redacted);
    }

    /// Detects generic `password=`/`secret=`/`token=` assignments.
    ///
    /// These patterns are noisy, so they only run when sensitivity is above
    /// the 0.5 midpoint.
    fn check_generic_secrets(
        &self,
        content: &str,
        patterns: &mut Vec<String>,
        redacted: &mut String,
    ) {
        // Hoisted out of the per-regex loop: the gate does not depend on the
        // regex, so there is no point evaluating it once per pattern.
        if self.sensitivity <= 0.5 {
            return;
        }
        static SECRET_PATTERNS: OnceLock<Vec<(Regex, &'static str)>> = OnceLock::new();
        let regexes = SECRET_PATTERNS.get_or_init(|| {
            vec![
                (
                    Regex::new(r#"(?i)password[=:]\s*['"]*[^\s'"]{8,}"#).unwrap(),
                    "Password in config",
                ),
                (
                    Regex::new(r#"(?i)secret[=:]\s*['"]*[a-zA-Z0-9_-]{16,}"#).unwrap(),
                    "Secret value",
                ),
                (
                    Regex::new(r#"(?i)token[=:]\s*['"]*[a-zA-Z0-9_.-]{20,}"#).unwrap(),
                    "Token value",
                ),
            ]
        });
        Self::apply_regexes(content, regexes, "[REDACTED_SECRET]", patterns, redacted);
    }

    /// Detects PEM-style private key blocks and redacts the entire block,
    /// including the BEGIN/END armor lines.
    fn check_private_keys(&self, content: &str, patterns: &mut Vec<String>, redacted: &mut String) {
        let key_patterns = [
            (
                "-----BEGIN RSA PRIVATE KEY-----",
                "-----END RSA PRIVATE KEY-----",
                "RSA private key",
            ),
            (
                "-----BEGIN EC PRIVATE KEY-----",
                "-----END EC PRIVATE KEY-----",
                "EC private key",
            ),
            (
                "-----BEGIN PRIVATE KEY-----",
                "-----END PRIVATE KEY-----",
                "Private key",
            ),
            (
                "-----BEGIN OPENSSH PRIVATE KEY-----",
                "-----END OPENSSH PRIVATE KEY-----",
                "OpenSSH private key",
            ),
        ];
        for (begin, end, name) in key_patterns {
            if let Some(start_idx) = content.find(begin) {
                // BUGFIX: search for the END marker only *after* the BEGIN
                // marker. The previous code used `content.find(end)` from the
                // start of the string, so an END marker occurring before the
                // BEGIN marker produced an inverted slice range and panicked.
                if let Some(rel_end) = content[start_idx..].find(end) {
                    patterns.push(name.to_string());
                    let key_block = &content[start_idx..start_idx + rel_end + end.len()];
                    *redacted = redacted.replace(key_block, "[REDACTED_PRIVATE_KEY]");
                }
            }
        }
    }

    /// Detects JWTs by their characteristic three-part `eyJ...` structure
    /// (base64url-encoded `{"` header and payload).
    fn check_jwt_tokens(&self, content: &str, patterns: &mut Vec<String>, redacted: &mut String) {
        static JWT_PATTERN: OnceLock<Regex> = OnceLock::new();
        let regex = JWT_PATTERN.get_or_init(|| {
            Regex::new(r"eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*").unwrap()
        });
        if regex.is_match(content) {
            patterns.push("JWT token".to_string());
            *redacted = regex.replace_all(redacted, "[REDACTED_JWT]").to_string();
        }
    }

    /// Detects database connection URLs that embed a `user:password@` pair.
    fn check_database_urls(
        &self,
        content: &str,
        patterns: &mut Vec<String>,
        redacted: &mut String,
    ) {
        static DB_PATTERNS: OnceLock<Vec<(Regex, &'static str)>> = OnceLock::new();
        let regexes = DB_PATTERNS.get_or_init(|| {
            vec![
                (
                    Regex::new(r"postgres(ql)?://[^:]+:[^@]+@[^\s]+").unwrap(),
                    "PostgreSQL connection URL",
                ),
                (
                    Regex::new(r"mysql://[^:]+:[^@]+@[^\s]+").unwrap(),
                    "MySQL connection URL",
                ),
                (
                    Regex::new(r"mongodb(\+srv)?://[^:]+:[^@]+@[^\s]+").unwrap(),
                    "MongoDB connection URL",
                ),
                (
                    Regex::new(r"redis://[^:]+:[^@]+@[^\s]+").unwrap(),
                    "Redis connection URL",
                ),
            ]
        });
        Self::apply_regexes(content, regexes, "[REDACTED_DATABASE_URL]", patterns, redacted);
    }

    /// Flags long tokens whose Shannon entropy suggests random key material.
    ///
    /// HTTP(S) URLs and local media markers (`[IMAGE:...]`, `[VIDEO:...]`,
    /// etc.) are stripped before tokenization so long path segments are not
    /// mistaken for secrets.
    fn check_high_entropy_tokens(
        &self,
        content: &str,
        patterns: &mut Vec<String>,
        redacted: &mut String,
    ) {
        // BUGFIX: higher sensitivity must LOWER the threshold (more
        // aggressive detection). The previous formula
        // `3.5 + sensitivity * 1.25` was inverted, contradicting both the
        // `sensitivity > 0.5` gate on generic secrets and the
        // `low_sensitivity_raises_entropy_threshold` test's stated intent.
        let entropy_threshold = 3.5 + (1.0 - self.sensitivity) * 1.25;
        static URL_PATTERN: OnceLock<Regex> = OnceLock::new();
        let url_re = URL_PATTERN.get_or_init(|| Regex::new(r"https?://\S+").unwrap());
        static MEDIA_MARKER_PATTERN: OnceLock<Regex> = OnceLock::new();
        let media_re = MEDIA_MARKER_PATTERN.get_or_init(|| {
            Regex::new(r"\[(IMAGE|VIDEO|VOICE|AUDIO|DOCUMENT|FILE):[^\]]*\]").unwrap()
        });
        let content_stripped = url_re.replace_all(content, "");
        let content_without_urls = media_re.replace_all(&content_stripped, "");
        let mut found_any = false;
        for token in extract_candidate_tokens(&content_without_urls) {
            if token.len() < ENTROPY_TOKEN_MIN_LEN {
                continue;
            }
            if shannon_entropy(token) >= entropy_threshold && has_mixed_alpha_digit(token) {
                found_any = true;
                *redacted = redacted.replace(token, "[REDACTED_HIGH_ENTROPY_TOKEN]");
            }
        }
        // Record the pattern class once, even when several tokens matched
        // (the old code pushed a duplicate label per token).
        if found_any {
            patterns.push("High-entropy token".to_string());
        }
    }
}
/// Splits `content` into candidate credential tokens.
///
/// A token is a maximal run of ASCII alphanumerics plus the characters
/// `_ - + /` (so base64-ish and identifier-ish strings survive as a unit);
/// every other character acts as a separator. Empty pieces are dropped.
fn extract_candidate_tokens(content: &str) -> Vec<&str> {
    // Characters allowed inside a single token; everything else separates.
    fn is_token_char(c: char) -> bool {
        c.is_ascii_alphanumeric() || matches!(c, '_' | '-' | '+' | '/')
    }
    content
        .split(|c: char| !is_token_char(c))
        .filter(|piece| !piece.is_empty())
        .collect()
}
/// Shannon entropy of `s` in bits per byte; 0.0 for the empty string.
///
/// Operates on raw bytes, so multi-byte UTF-8 characters contribute each of
/// their bytes to the frequency table.
fn shannon_entropy(s: &str) -> f64 {
    // Guard with `is_empty()` instead of the old `len == 0.0` float
    // equality comparison (a clippy::float_cmp smell).
    if s.is_empty() {
        return 0.0;
    }
    let mut freq: HashMap<u8, usize> = HashMap::new();
    for &b in s.as_bytes() {
        *freq.entry(b).or_default() += 1;
    }
    let len = s.len() as f64;
    // H = -sum(p * log2(p)) over observed byte frequencies.
    freq.values().fold(0.0, |acc, &count| {
        let p = count as f64 / len;
        acc - p * p.log2()
    })
}
/// Returns `true` when `s` contains at least one ASCII letter AND at least
/// one ASCII digit (a cheap proxy for "looks like key material").
fn has_mixed_alpha_digit(s: &str) -> bool {
    let mut saw_alpha = false;
    let mut saw_digit = false;
    // Single pass with early exit once both classes have been seen.
    for b in s.bytes() {
        saw_alpha |= b.is_ascii_alphabetic();
        saw_digit |= b.is_ascii_digit();
        if saw_alpha && saw_digit {
            return true;
        }
    }
    false
}
#[cfg(test)]
mod tests {
    //! Unit tests: one per detector category, plus edge cases for the
    //! URL/media-marker stripping and the entropy helper functions.
    use super::*;

    // Benign prose must not trigger any detector.
    #[test]
    fn clean_content_passes() {
        let detector = LeakDetector::new();
        let result = detector.scan("This is just some normal text");
        assert!(matches!(result, LeakResult::Clean));
    }

    // Stripe test-mode secret key is detected and redacted.
    #[test]
    fn detects_stripe_keys() {
        let detector = LeakDetector::new();
        let content = "My Stripe key is sk_test_1234567890abcdefghijklmnop";
        let result = detector.scan(content);
        match result {
            LeakResult::Detected { patterns, redacted } => {
                assert!(patterns.iter().any(|p| p.contains("Stripe")));
                assert!(redacted.contains("[REDACTED"));
            }
            LeakResult::Clean => panic!("Should detect Stripe key"),
        }
    }

    // AWS access key ID (AKIA... format) is detected.
    #[test]
    fn detects_aws_credentials() {
        let detector = LeakDetector::new();
        let content = "AWS key: AKIAIOSFODNN7EXAMPLE";
        let result = detector.scan(content);
        match result {
            LeakResult::Detected { patterns, .. } => {
                assert!(patterns.iter().any(|p| p.contains("AWS")));
            }
            LeakResult::Clean => panic!("Should detect AWS key"),
        }
    }

    // A full PEM block (BEGIN/END armor) is detected and the whole block
    // replaced. NOTE: the raw string's interior lines intentionally have no
    // leading whitespace — it is part of the fixture content.
    #[test]
    fn detects_private_keys() {
        let detector = LeakDetector::new();
        let content = r#"
-----BEGIN RSA PRIVATE KEY-----
MIIEowIBAAKCAQEA0ZPr5JeyVDonXsKhfq...
-----END RSA PRIVATE KEY-----
"#;
        let result = detector.scan(content);
        match result {
            LeakResult::Detected { patterns, redacted } => {
                assert!(patterns.iter().any(|p| p.contains("private key")));
                assert!(redacted.contains("[REDACTED_PRIVATE_KEY]"));
            }
            LeakResult::Clean => panic!("Should detect private key"),
        }
    }

    // Three-part eyJ...eyJ... JWT structure is detected.
    #[test]
    fn detects_jwt_tokens() {
        let detector = LeakDetector::new();
        let content = "Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U";
        let result = detector.scan(content);
        match result {
            LeakResult::Detected { patterns, redacted } => {
                assert!(patterns.iter().any(|p| p.contains("JWT")));
                assert!(redacted.contains("[REDACTED_JWT]"));
            }
            LeakResult::Clean => panic!("Should detect JWT"),
        }
    }

    // Connection URL with embedded user:password is detected.
    #[test]
    fn detects_database_urls() {
        let detector = LeakDetector::new();
        let content = "DATABASE_URL=postgres://user:secretpassword@localhost:5432/mydb";
        let result = detector.scan(content);
        match result {
            LeakResult::Detected { patterns, .. } => {
                assert!(patterns.iter().any(|p| p.contains("PostgreSQL")));
            }
            LeakResult::Clean => panic!("Should detect database URL"),
        }
    }

    // Generic secret=... patterns only fire above the 0.5 sensitivity gate.
    #[test]
    fn low_sensitivity_skips_generic() {
        let detector = LeakDetector::with_sensitivity(0.3);
        let content = "secret=mygenericvalue123456";
        let result = detector.scan(content);
        assert!(matches!(result, LeakResult::Clean));
    }

    // URLs are stripped before entropy tokenization, so long path segments
    // do not count as high-entropy tokens.
    #[test]
    fn url_path_segments_not_flagged() {
        let detector = LeakDetector::new();
        let content =
            "See https://example.org/documents/2024-report-a1b2c3d4e5f6g7h8i9j0.pdf for details";
        let result = detector.scan(content);
        assert!(
            matches!(result, LeakResult::Clean),
            "URL path segments should not trigger high-entropy detection"
        );
    }

    #[test]
    fn url_with_long_path_not_redacted() {
        let detector = LeakDetector::new();
        let content = "Reference: https://gov.example.com/publications/research/2024-annual-fiscal-policy-review-9a8b7c6d5e4f3g2h1i0j.html";
        let result = detector.scan(content);
        assert!(
            matches!(result, LeakResult::Clean),
            "Long URL paths should not be redacted"
        );
    }

    // [IMAGE:...] style media markers are stripped before the entropy check.
    #[test]
    fn media_markers_not_redacted_as_high_entropy() {
        let detector = LeakDetector::new();
        let content = "Here is the image: [IMAGE:/Users/matt/.zeroclaw/workspace/skills/image-gen/images/20260324_135911.png]";
        let result = detector.scan(content);
        assert!(
            matches!(result, LeakResult::Clean),
            "Local media markers should not be redacted"
        );
    }

    // A random-looking 32-char mixed token outside any URL must be flagged.
    #[test]
    fn detects_high_entropy_token_outside_url() {
        let detector = LeakDetector::new();
        let content = "Found credential: aB3xK9mW2pQ7vL4nR8sT1yU6hD0jF5cG";
        let result = detector.scan(content);
        match result {
            LeakResult::Detected { patterns, redacted } => {
                assert!(patterns.iter().any(|p| p.contains("High-entropy")));
                assert!(redacted.contains("[REDACTED_HIGH_ENTROPY_TOKEN]"));
            }
            LeakResult::Clean => panic!("Should detect high-entropy token"),
        }
    }

    // A repetitive (low-entropy) token stays clean at low sensitivity.
    #[test]
    fn low_sensitivity_raises_entropy_threshold() {
        let detector = LeakDetector::with_sensitivity(0.3);
        let content = "token found: ab12ab12ab12ab12ab12ab12ab12ab12";
        let result = detector.scan(content);
        assert!(
            matches!(result, LeakResult::Clean),
            "Low-entropy repetitive tokens should not be flagged"
        );
    }

    // Tokenizer keeps '-' inside tokens but splits on '.', ':', '=', spaces.
    #[test]
    fn extract_candidate_tokens_splits_correctly() {
        let tokens = extract_candidate_tokens("foo.bar:baz qux-quux key=val");
        assert!(tokens.contains(&"foo"));
        assert!(tokens.contains(&"bar"));
        assert!(tokens.contains(&"baz"));
        assert!(tokens.contains(&"qux-quux"));
        assert!(tokens.contains(&"key"));
        assert!(tokens.contains(&"val"));
    }

    #[test]
    fn media_marker_image_path_not_redacted() {
        let detector = LeakDetector::new();
        let content = "Here is your image: [IMAGE:/Users/matt/.zeroclaw/workspace/skills/image-gen/images/20260324_135911.png]";
        let result = detector.scan(content);
        assert!(
            matches!(result, LeakResult::Clean),
            "Media marker image paths should not trigger high-entropy detection"
        );
    }

    #[test]
    fn media_marker_video_not_redacted() {
        let detector = LeakDetector::new();
        let content = "Attached: [VIDEO:/path/to/long/video/file/name123456.mp4]";
        let result = detector.scan(content);
        assert!(
            matches!(result, LeakResult::Clean),
            "Media marker video paths should not trigger high-entropy detection"
        );
    }

    // Stripping URLs/markers must not suppress genuine leaks elsewhere.
    #[test]
    fn actual_high_entropy_still_detected() {
        let detector = LeakDetector::new();
        let content = "Leaked credential: aB3xK9mW2pQ7vL4nR8sT1yU6hD0jF5cG";
        let result = detector.scan(content);
        match result {
            LeakResult::Detected { patterns, redacted } => {
                assert!(patterns.iter().any(|p| p.contains("High-entropy")));
                assert!(redacted.contains("[REDACTED_HIGH_ENTROPY_TOKEN]"));
            }
            LeakResult::Clean => {
                panic!("Should still detect high-entropy tokens outside media markers")
            }
        }
    }

    // Entropy helper edge cases: empty input, uniform input, two symbols.
    #[test]
    fn shannon_entropy_empty_string() {
        assert_eq!(shannon_entropy(""), 0.0);
    }

    #[test]
    fn shannon_entropy_single_char() {
        assert_eq!(shannon_entropy("aaaa"), 0.0);
    }

    #[test]
    fn shannon_entropy_two_equal_chars() {
        let e = shannon_entropy("abab");
        assert!((e - 1.0).abs() < 0.001);
    }
}