use regex::RegexBuilder;
use std::sync::OnceLock;
/// Lazily initialised table of `(regex pattern, replacement)` pairs used for
/// code clean-up; populated on first use by `get_cleaning_patterns`.
static CLEANING_PATTERNS: OnceLock<Vec<(&'static str, &'static str)>> = OnceLock::new();
/// Lazily initialised table of `(regex pattern, replacement)` pairs used for
/// secret redaction; populated on first use by `get_secret_patterns`.
static SECRET_PATTERNS: OnceLock<Vec<(&'static str, &'static str)>> = OnceLock::new();
/// Returns the ordered `(pattern, replacement)` rules used to strip noise from
/// source text.
///
/// The table is built on first call and cached in `CLEANING_PATTERNS`. The
/// rules are meant to be compiled with multi-line mode and "dot matches
/// newline" enabled and applied one after another, in the order listed.
/// Replacement strings use the regex crate's `$N` capture syntax.
fn get_cleaning_patterns() -> &'static Vec<(&'static str, &'static str)> {
    CLEANING_PATTERNS.get_or_init(|| {
        vec![
            // Line comments. The regex crate has no look-behind, so the
            // character before `//` is captured and written back; excluding
            // `:` keeps `scheme://...` URLs inside string literals intact.
            (r"(^|[^:])//.*?$", "$1"),
            // Block comments, possibly spanning several lines.
            (r"/\*[\s\S]*?\*/", ""),
            // `console.log/error/warn/info(...)` calls whose argument list
            // contains no `)`.
            (r"console\.(log|error|warn|info)\([^)]*\);?", ""),
            // Whitespace-only lines.
            (r"^\s*[\r\n]", ""),
            // Trailing spaces.
            (r" +$", ""),
            // Single-line `import ...` statements.
            (r"^\s*import\s+.*?;?\s*$", ""),
            // Collapse whatever blank-line runs the removals above left behind.
            (r"^\s*\n+", "\n"),
        ]
    })
}
/// Returns the ordered `(pattern, replacement)` rules used to redact secrets.
///
/// The table is built on first call and cached in `SECRET_PATTERNS`. Patterns
/// are written to be compiled case-insensitively with multi-line mode on, so
/// `^` anchors at the start of every line. Replacement strings use the regex
/// crate's `$N` capture syntax; every replacement contains a `[REDACTED`
/// marker.
fn get_secret_patterns() -> &'static Vec<(&'static str, &'static str)> {
    SECRET_PATTERNS.get_or_init(|| {
        vec![
            // Quoted assignment to an identifier such as `api_key`, `auth-token`, `jwtSecret`.
            (
                r#"((?:api|stripe|access|auth|client|secret|private|jwt)[_-]?(?:key|secret|token))\s*=\s*["']([^"']+)["']"#,
                "$1=[REDACTED]",
            ),
            // `.env`-style `NAME=value` lines for well-known variable names.
            (
                r#"^(API[_-]?KEY|API[_-]?SECRET|ACCESS[_-]?TOKEN|AUTH[_-]?TOKEN|CLIENT[_-]?SECRET|DB[_-]?PASSWORD|DATABASE[_-]?PASSWORD|AWS_ACCESS_KEY_ID|AWS_SECRET_ACCESS_KEY|GOOGLE_API_KEY|AZURE_CLIENT_SECRET|DATABASE_URL|MONGO_URI|MYSQL_URL|JWT[_-]?SECRET|SECRET[_-]?KEY|PRIVATE[_-]?KEY)\s*=\s*(?:"[^"]*"|'[^']*'|[^\s#\n]*)"#,
                "$1=[REDACTED]",
            ),
            // Bearer tokens. `/` is deliberately left unescaped: it needs no
            // escape, and a pattern that fails to compile would be skipped
            // by the caller instead of redacting.
            (r"bearer\s+[a-zA-Z0-9\-._~+/=]+", "bearer [REDACTED]"),
            // Three dot-separated base64url segments starting with `eyJ` (JWT shape).
            (
                r"eyJ[A-Za-z0-9_\-=]+\.[A-Za-z0-9_\-=]+\.[A-Za-z0-9_\-=.]+",
                "[REDACTED_JWT]",
            ),
            // Stand-alone 40- and 64-character hex strings.
            (r"\b[a-f0-9]{40}\b", "[REDACTED_HASH]"),
            (r"\b[a-f0-9]{64}\b", "[REDACTED_HASH]"),
            // Quoted base64-looking blobs of 40 or more characters.
            (
                r#"["']([A-Za-z0-9+/]{40,}={0,2})["']"#,
                "[REDACTED_BASE64]",
            ),
        ]
    })
}
/// Counts the whitespace-separated words in `text`.
///
/// Runs of whitespace act as a single separator, and leading or trailing
/// whitespace contributes nothing, so an empty or all-whitespace input
/// yields `0`.
pub fn count_tokens(text: &str) -> usize {
    text.split(char::is_whitespace)
        .filter(|piece| !piece.is_empty())
        .count()
}
pub fn clean_code(code: &str) -> String {
let mut result = code.to_string();
for (pattern_str, replacement) in get_cleaning_patterns() {
if let Ok(re) = RegexBuilder::new(pattern_str)
.multi_line(true)
.dot_matches_new_line(true)
.build()
{
result = re.replace_all(&result, *replacement).to_string();
}
}
result.trim().to_string()
}
pub fn redact_secrets(code: &str) -> String {
let mut result = code.to_string();
for (pattern_str, replacement) in get_secret_patterns() {
if let Ok(re) = RegexBuilder::new(pattern_str)
.multi_line(true)
.case_insensitive(true)
.build()
{
result = re.replace_all(&result, *replacement).to_string();
}
}
result
}
/// Drops every line of `code` that contains a `[REDACTED` marker.
///
/// The surviving lines are re-joined with `\n`; no trailing newline is
/// emitted.
fn remove_redacted_lines(code: &str) -> String {
    let mut kept = String::with_capacity(code.len());
    // A flag is used rather than `kept.is_empty()` because the first kept
    // line may itself be empty.
    let mut needs_separator = false;
    for line in code.lines() {
        if line.contains("[REDACTED") {
            continue;
        }
        if needs_separator {
            kept.push('\n');
        }
        kept.push_str(line);
        needs_separator = true;
    }
    kept
}
/// Runs the full pipeline on `code`: redact secrets, discard every line that
/// ended up carrying a redaction marker, clean what is left, and trim the
/// result.
pub fn clean_and_redact(code: &str) -> String {
    let scrubbed = remove_redacted_lines(&redact_secrets(code));
    clean_code(&scrubbed).trim().to_owned()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokens are whitespace-separated words.
    #[test]
    fn test_count_tokens() {
        assert_eq!(count_tokens("hello world"), 2);
        assert_eq!(count_tokens("one two three four"), 4);
    }

    /// A trailing line comment is stripped; the code around it survives.
    #[test]
    fn test_clean_comments() {
        let code = "let x = 1; // this is a comment\nlet y = 2;";
        let cleaned = clean_code(code);
        assert!(!cleaned.contains("comment"), "Result: {}", cleaned);
        assert!(cleaned.contains("let x"), "Result: {}", cleaned);
        assert!(cleaned.contains("let y"), "Result: {}", cleaned);
    }

    /// A quoted API key is replaced by the marker and the raw value is gone.
    #[test]
    fn test_redact_api_key() {
        let code = r#"const API_KEY="sk-1234567890abcdef""#;
        let redacted = redact_secrets(code);
        assert!(redacted.contains("[REDACTED]"), "Result: {}", redacted);
        assert!(
            !redacted.contains("sk-1234567890abcdef"),
            "Secret value should be removed. Result: {}",
            redacted
        );
    }

    /// The full pipeline removes comments, console calls and the secret line.
    #[test]
    fn test_clean_and_redact() {
        let code =
            "\n// API endpoint\nconst API_KEY = \"secret-key-123\";\nconsole.log(\"test\");\n";
        let result = clean_and_redact(code);
        assert!(
            !result.contains("//"),
            "Comments should be removed. Result: {}",
            result
        );
        assert!(
            !result.contains("secret-key-123"),
            "Secret value should be removed. Result: {}",
            result
        );
        assert!(
            !result.contains("console.log"),
            "Console calls should be removed. Result: {}",
            result
        );
    }
}