use crate::config::PiiConfig;
use crate::types::{Redaction, RedactionType};
#[cfg(feature = "pii")]
use regex::Regex;
/// Finds personally identifiable information in text and rewrites it with
/// placeholders.
///
/// Pattern matching is only compiled in when the `pii` cargo feature is on;
/// without it `detect` returns no redactions.
pub struct PiiDetector {
/// Settings consulted by `detect` (the enable flags) and by
/// `format_redaction` (the placeholder template).
config: PiiConfig,
/// Pre-compiled detection regexes, built once in `new`.
#[cfg(feature = "pii")]
patterns: PiiPatterns,
}
/// The compiled regular expressions used by `PiiDetector::detect`, one per
/// kind of value it looks for.
#[cfg(feature = "pii")]
struct PiiPatterns {
/// Nine digits grouped 3-2-4 with optional `-`/whitespace separators.
ssn: Regex,
/// Four groups of four digits (optionally separated), or a run of 15-16 digits.
credit_card: Regex,
/// `local@domain.tld` style addresses.
email: Regex,
/// Ten-digit phone numbers with an optional leading `1` / `+1`.
phone: Regex,
/// Dotted-quad IPv4 addresses with each octet limited to 0-255.
ip_v4: Regex,
/// Fully expanded IPv6 addresses (eight colon-separated hex groups).
ip_v6: Regex,
/// `sk-` prefixed tokens and `api_key=...` style assignments.
api_key: Regex,
}
#[cfg(feature = "pii")]
impl PiiPatterns {
    /// Compiles every detection pattern.
    ///
    /// # Panics
    ///
    /// Panics if any pattern fails to compile. All patterns are literals
    /// defined here, so a panic means one of those literals was edited into
    /// an invalid regex.
    fn new() -> Self {
        // One place for the compile-or-panic policy shared by all patterns.
        fn compile(pattern: &str) -> Regex {
            Regex::new(pattern).unwrap()
        }

        // 3-2-4 digit groups, separators optional (so bare 9-digit runs match too).
        let ssn = compile(r"\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b");

        // Either 4x4 digit groups with optional separators, or 15-16 bare digits.
        let credit_card = compile(r"\b(?:\d{4}[-\s]?){3}\d{4}\b|\b\d{15,16}\b");

        let email = compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b");

        // Optional country code 1, then 3-3-4 digits with optional punctuation.
        let phone =
            compile(r"\b(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b");

        // Each octet is constrained to 0-255.
        let ip_v4 = compile(
            r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b",
        );

        // Only the full eight-group form; `::`-compressed addresses are not matched.
        let ip_v6 = compile(r"\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b");

        // `sk-` tokens of 20+ alphanumerics, or `api_key` / `api-key` / `apikey`
        // followed by `=`, `:` or whitespace and a 20+ character value.
        let api_key = compile(
            r#"\b(?:sk-[a-zA-Z0-9]{20,}|api[_-]?key[=:\s]+['"]?[a-zA-Z0-9_-]{20,}['"]?)\b"#,
        );

        Self {
            ssn,
            credit_card,
            email,
            phone,
            ip_v4,
            ip_v6,
            api_key,
        }
    }
}
impl PiiDetector {
    /// Creates a detector driven by `config`.
    ///
    /// With the `pii` feature enabled this also compiles every detection
    /// pattern, so `detect` never has to build a regex.
    pub fn new(config: PiiConfig) -> Self {
        Self {
            config,
            #[cfg(feature = "pii")]
            patterns: PiiPatterns::new(),
        }
    }

    /// Scans `text` for every kind of PII enabled in the configuration.
    ///
    /// Returns an empty list when detection is disabled. Otherwise the result
    /// is sorted by start offset and contains no overlapping spans: when two
    /// matches overlap, the one that sorts first is kept (ties are broken by
    /// scan order: SSN, credit card, email, phone, IPv4, IPv6, API key).
    ///
    /// Credit-card candidates are only reported when their ASCII digits pass
    /// `luhn_check`.
    #[cfg(feature = "pii")]
    pub fn detect(&self, text: &str) -> Vec<Redaction> {
        let settings = &self.config;
        if !settings.enabled {
            return Vec::new();
        }

        let patterns = &self.patterns;
        let mut found = Vec::new();

        // The scan order below is significant: it decides which match wins
        // when two patterns start at the same offset.
        if settings.detect_ssn {
            self.collect_matches(&mut found, text, &patterns.ssn, || RedactionType::Ssn, |_| true);
        }
        if settings.detect_credit_card {
            self.collect_matches(
                &mut found,
                text,
                &patterns.credit_card,
                || RedactionType::CreditCard,
                |candidate| {
                    let digits: String =
                        candidate.chars().filter(|c| c.is_ascii_digit()).collect();
                    luhn_check(&digits)
                },
            );
        }
        if settings.detect_email {
            self.collect_matches(
                &mut found,
                text,
                &patterns.email,
                || RedactionType::Email,
                |_| true,
            );
        }
        if settings.detect_phone {
            self.collect_matches(
                &mut found,
                text,
                &patterns.phone,
                || RedactionType::Phone,
                |_| true,
            );
        }
        if settings.detect_ip {
            self.collect_matches(
                &mut found,
                text,
                &patterns.ip_v4,
                || RedactionType::IpAddress,
                |_| true,
            );
            self.collect_matches(
                &mut found,
                text,
                &patterns.ip_v6,
                || RedactionType::IpAddress,
                |_| true,
            );
        }
        if settings.detect_api_keys {
            self.collect_matches(
                &mut found,
                text,
                &patterns.api_key,
                || RedactionType::ApiKey,
                |_| true,
            );
        }

        // Stable sort, so matches with equal starts keep their scan order.
        found.sort_by_key(|r| r.start);
        remove_overlaps(&mut found);
        found
    }

    /// Feature-less fallback: no patterns are compiled in, so nothing is detected.
    #[cfg(not(feature = "pii"))]
    pub fn detect(&self, _text: &str) -> Vec<Redaction> {
        Vec::new()
    }

    /// Returns `text` with every span in `redactions` replaced by that
    /// redaction's `replacement` string.
    ///
    /// For the sorted, non-overlapping spans produced by `detect` the output
    /// is exactly the gaps between spans interleaved with the replacements.
    /// Input that does not meet those guarantees is handled defensively
    /// rather than leaking text or panicking:
    ///
    /// * spans are processed in order of `start`, whatever order they arrive in;
    /// * a span that begins inside text already hidden by an earlier span
    ///   emits no placeholder of its own, but still extends the hidden region;
    /// * offsets past the end of `text` are clamped, and a span that starts
    ///   beyond the end is ignored;
    /// * offsets that fall inside a multi-byte character are widened to
    ///   cover the whole character.
    pub fn redact(&self, text: &str, redactions: &[Redaction]) -> String {
        if redactions.is_empty() {
            return text.to_string();
        }

        let mut ordered: Vec<&Redaction> = redactions.iter().collect();
        ordered.sort_by_key(|r| r.start);

        let mut result = String::with_capacity(text.len());
        let mut last_end = 0;
        for redaction in ordered {
            if redaction.start > text.len() {
                // Sorted by start, so every remaining span is out of range too.
                break;
            }
            let start = Self::floor_boundary(text, redaction.start);
            // `max` guards against inverted spans, `min` against overlong ones.
            let clamped_end = redaction.end.max(redaction.start).min(text.len());
            let end = Self::ceil_boundary(text, clamped_end);

            if start < last_end {
                // Overlaps an earlier span: keep everything it covers hidden,
                // never move `last_end` backwards (that would re-emit PII).
                last_end = last_end.max(end);
                continue;
            }

            result.push_str(&text[last_end..start]);
            result.push_str(&redaction.replacement);
            last_end = end;
        }
        result.push_str(&text[last_end..]);
        result
    }

    /// Builds the placeholder for `redaction_type` by substituting its
    /// `Display` text for every `{TYPE}` in the configured format string.
    fn format_redaction(&self, redaction_type: RedactionType) -> String {
        self.config
            .redaction_format
            .replace("{TYPE}", &redaction_type.to_string())
    }

    /// Appends one `Redaction` to `out` for every match of `pattern` in
    /// `text` whose matched text satisfies `accept`.
    ///
    /// `make_type` is a factory rather than a value so this helper does not
    /// require `RedactionType` to be `Clone`.
    #[cfg(feature = "pii")]
    fn collect_matches(
        &self,
        out: &mut Vec<Redaction>,
        text: &str,
        pattern: &Regex,
        make_type: impl Fn() -> RedactionType,
        accept: impl Fn(&str) -> bool,
    ) {
        // The placeholder depends only on the type, so format it once per pattern.
        let replacement = self.format_redaction(make_type());
        out.extend(
            pattern
                .find_iter(text)
                .filter(|m| accept(m.as_str()))
                .map(|m| Redaction {
                    redaction_type: make_type(),
                    original_hash: hash_value(m.as_str()),
                    replacement: replacement.clone(),
                    start: m.start(),
                    end: m.end(),
                }),
        );
    }

    /// Largest char boundary of `text` that is `<= index` (`index <= text.len()`).
    fn floor_boundary(text: &str, mut index: usize) -> usize {
        while !text.is_char_boundary(index) {
            index -= 1;
        }
        index
    }

    /// Smallest char boundary of `text` that is `>= index` (`index <= text.len()`).
    fn ceil_boundary(text: &str, mut index: usize) -> usize {
        while !text.is_char_boundary(index) {
            index += 1;
        }
        index
    }
}
/// Returns a lowercase hexadecimal fingerprint of `value`.
///
/// The fingerprint is the 64-bit output of the standard library's
/// `DefaultHasher`, printed without zero padding (so at most 16 characters).
///
/// NOTE(review): `DefaultHasher` is not a cryptographic hash, and the standard
/// library does not promise the same output across Rust releases — confirm
/// these fingerprints are never persisted or treated as irreversible.
fn hash_value(value: &str) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut state = DefaultHasher::new();
    Hash::hash(value, &mut state);
    let digest: u64 = state.finish();
    format!("{:x}", digest)
}
#[cfg(feature = "pii")]
/// Reports whether the decimal digits in `number` satisfy the Luhn checksum.
///
/// Characters that are not ASCII digits are ignored. Inputs with fewer than
/// 13 digits are rejected outright.
fn luhn_check(number: &str) -> bool {
    let digits: Vec<u32> = number.chars().filter_map(|ch| ch.to_digit(10)).collect();
    if digits.len() < 13 {
        return false;
    }

    // Walking right-to-left, every second digit (odd position) is doubled,
    // and a two-digit result is folded back to one digit by subtracting 9.
    let checksum: u32 = digits
        .iter()
        .rev()
        .enumerate()
        .map(|(position, &value)| {
            if position % 2 == 0 {
                value
            } else {
                let doubled = value * 2;
                if doubled > 9 {
                    doubled - 9
                } else {
                    doubled
                }
            }
        })
        .sum();

    checksum % 10 == 0
}
/// Drops every redaction that overlaps the most recently kept one.
///
/// `redactions` is expected to be sorted by `start`. Walking in order, a
/// redaction is removed when it starts before the previously kept redaction
/// ends; spans that merely touch (`end == start`) are both kept.
///
/// Runs in a single pass via `Vec::dedup_by` instead of repeated
/// `Vec::remove` calls, each of which shifts the tail of the vector.
fn remove_overlaps(redactions: &mut Vec<Redaction>) {
    // `dedup_by` passes (current, previously kept) and removes `current`
    // when the closure returns true.
    redactions.dedup_by(|current, kept| kept.end > current.start);
}
// These tests exercise regex-backed detection and `luhn_check`, both of which
// only exist with the `pii` feature; gating on the feature keeps
// `cargo test --no-default-features` style builds compiling.
#[cfg(all(test, feature = "pii"))]
mod tests {
    use super::*;

    /// Both the dashed and the bare nine-digit forms are reported as SSNs.
    #[test]
    fn test_ssn_detection() {
        let config = PiiConfig::default();
        let detector = PiiDetector::new(config);
        let text = "My SSN is 123-45-6789 and yours is 987654321";
        let redactions = detector.detect(text);
        assert_eq!(redactions.len(), 2);
        assert_eq!(redactions[0].redaction_type, RedactionType::Ssn);
    }

    /// An email address embedded in a sentence is detected.
    #[test]
    fn test_email_detection() {
        let config = PiiConfig::default();
        let detector = PiiDetector::new(config);
        let text = "Contact me at john.doe@example.com for more info";
        let redactions = detector.detect(text);
        assert!(redactions
            .iter()
            .any(|r| r.redaction_type == RedactionType::Email));
    }

    /// Detection followed by redaction removes the value and inserts the placeholder.
    #[test]
    fn test_redaction() {
        let config = PiiConfig::default();
        let detector = PiiDetector::new(config);
        let text = "My email is test@test.com";
        let redactions = detector.detect(text);
        let redacted = detector.redact(text, &redactions);
        assert!(!redacted.contains("test@test.com"));
        assert!(redacted.contains("[REDACTED:Email]"));
    }

    /// The Luhn check accepts a valid card number and rejects an invalid one.
    #[test]
    fn test_credit_card_luhn() {
        assert!(luhn_check("4532015112830366"));
        assert!(!luhn_check("1234567890123456"));
    }
}