use std::sync::LazyLock;
use regex::Regex;
/// A single finding reported by [`scan`].
///
/// The matched text itself is never stored; only a masked preview of it is kept.
#[derive(Debug, Clone)]
pub struct PiiMatch {
/// Label of the detection rule that matched (for example `"Email"` or `"US SSN"`).
pub category: &'static str,
/// Masked preview of the matched text as produced by `mask_value`: the first four
/// characters followed by `****`, or just `****` when the match has four or fewer
/// characters.
pub masked: String,
}
/// One compiled detection rule: a category label paired with the regex that finds it.
struct Pattern {
/// Label copied into every `PiiMatch` produced by this rule.
category: &'static str,
/// Compiled expression that is run over the scanned input.
regex: Regex,
}
/// Built-in detection rules, compiled once on first access.
///
/// Each definition is a `(category label, regex source)` pair. The label is what callers
/// see in `PiiMatch::category`; the regex is run over the whole input by `scan`.
///
/// A definition whose regex fails to compile is skipped so that the remaining rules keep
/// working, but debug and test builds assert on it so a broken built-in rule cannot go
/// unnoticed.
static PATTERNS: LazyLock<Vec<Pattern>> = LazyLock::new(|| {
    let defs: &[(&'static str, &str)] = &[
        ("API Key (OpenAI)", r"sk-[a-zA-Z0-9]{20,}"),
        ("API Key (AWS)", r"AKIA[0-9A-Z]{16}"),
        ("API Key (Anthropic)", r"sk-ant-[a-zA-Z0-9\-]{20,}"),
        ("Bearer Token", r"Bearer\s+[a-zA-Z0-9\-._~+/]{20,}=*"),
        // One optional uppercase label word before PRIVATE KEY, so that the RSA, EC, DSA,
        // OPENSSH and ENCRYPTED PEM headers are all recognised, as well as the bare form.
        (
            "Private Key",
            r"-----BEGIN\s+(?:[A-Z]+\s+)?PRIVATE\s+KEY-----",
        ),
        ("Certificate", r"-----BEGIN\s+CERTIFICATE-----"),
        (
            "Password",
            r"(?i)(?:password|passwd|pwd|secret)\s*[=:]\s*\S+",
        ),
        (
            "DB Connection String",
            r"(?i)(?:mongodb|postgres|mysql|redis|sqlite)://\S+",
        ),
        ("Email", r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}"),
        ("US SSN", r"\b\d{3}-\d{2}-\d{4}\b"),
        (
            "Korean ID (Resident Registration Number)",
            r"\b\d{6}-[1-4]\d{6}\b",
        ),
        (
            "Credit Card",
            r"\b(?:4\d{3}|5[1-5]\d{2}|3[47]\d{2}|6(?:011|5\d{2}))[- ]?\d{4}[- ]?\d{4}[- ]?\d{3,4}\b",
        ),
        (
            "Phone Number",
            r"\b\+?\d{1,3}[-.\s]?\(?\d{2,4}\)?[-.\s]?\d{3,4}[-.\s]?\d{4}\b",
        ),
        // Private ranges 10/8, 172.16/12 and 192.168/16. Every alternative consumes the
        // leading octets up to and including the second dot, so the shared tail always
        // completes a four-octet address (the 10. branch must therefore eat one more octet
        // than a bare `10\.` would, otherwise `10.1.2` would match on its own).
        (
            "Internal IP",
            r"\b(?:10\.\d{1,3}\.|172\.(?:1[6-9]|2\d|3[01])\.|192\.168\.)\d{1,3}\.\d{1,3}\b",
        ),
    ];
    defs.iter()
        .filter_map(|&(category, source)| match Regex::new(source) {
            Ok(regex) => Some(Pattern { category, regex }),
            Err(err) => {
                // Release builds keep the best-effort behaviour (skip the rule); debug and
                // test builds fail loudly because a bad built-in regex is a programming error.
                debug_assert!(
                    false,
                    "built-in pattern {category:?} failed to compile: {err}"
                );
                None
            }
        })
        .collect()
});
/// Runs every built-in pattern over `input` and returns the findings.
///
/// Each regex match becomes a [`PiiMatch`] carrying the pattern's category and a masked
/// preview of the matched text. The result is ordered by category, then by masked preview,
/// and findings that agree on both fields are reported only once.
pub fn scan(input: &str) -> Vec<PiiMatch> {
    let mut hits: Vec<PiiMatch> = PATTERNS
        .iter()
        .flat_map(|rule| {
            rule.regex.find_iter(input).map(move |found| PiiMatch {
                category: rule.category,
                masked: mask_value(found.as_str()),
            })
        })
        .collect();

    // Sorting first puts identical findings next to each other, which is what lets the
    // adjacent-only `dedup_by` below remove every duplicate.
    hits.sort_by(|lhs, rhs| {
        lhs.category
            .cmp(rhs.category)
            .then_with(|| lhs.masked.cmp(&rhs.masked))
    });
    hits.dedup_by(|later, earlier| {
        later.category == earlier.category && later.masked == earlier.masked
    });
    hits
}
/// Produces a masked preview of `raw`.
///
/// Values longer than four characters keep their first four characters followed by
/// `****`; anything of four characters or fewer is replaced entirely by `****`.
/// Length is counted in `char`s, not bytes, so multi-byte text is never split mid-character.
fn mask_value(raw: &str) -> String {
    // The byte offset of the fifth character exists only when `raw` has more than four
    // characters, and it is exactly where the visible prefix ends.
    match raw.char_indices().nth(4) {
        Some((prefix_end, _)) => format!("{}****", &raw[..prefix_end]),
        None => "****".to_string(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns `true` when scanning `input` yields at least one finding whose category
    /// contains `needle`.
    fn flags(input: &str, needle: &str) -> bool {
        scan(input)
            .iter()
            .any(|finding| finding.category.contains(needle))
    }

    #[test]
    fn detect_openai_key() {
        assert!(flags("Use this key: sk-abcdefghijklmnopqrstuvwxyz", "OpenAI"));
    }

    #[test]
    fn detect_aws_key() {
        assert!(flags("AKIAIOSFODNN7EXAMPLE", "AWS"));
    }

    #[test]
    fn detect_korean_id() {
        assert!(flags("주민번호 901231-1234567", "Korean"));
    }

    #[test]
    fn detect_us_ssn() {
        assert!(flags("SSN: 123-45-6789", "SSN"));
    }

    #[test]
    fn detect_email() {
        assert!(flags("Contact: user@example.com", "Email"));
    }

    #[test]
    fn detect_password() {
        assert!(flags("password=mysecret123", "Password"));
    }

    #[test]
    fn detect_private_key() {
        assert!(flags("-----BEGIN RSA PRIVATE KEY-----", "Private Key"));
    }

    #[test]
    fn detect_db_connection() {
        assert!(flags("postgres://user:pass@localhost:5432/db", "DB"));
    }

    #[test]
    fn clean_input_returns_empty() {
        // Ordinary prose without secrets must not trigger any rule.
        assert!(scan("Please fix the bug in src/main.rs").is_empty());
    }

    #[test]
    fn masking_works() {
        assert_eq!(mask_value("sk-abcdefg"), "sk-a****");
        assert_eq!(mask_value("abc"), "****");
    }
}