use regex::Regex;
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::sync::OnceLock;
/// Outcome of a single [`PiiScrubber::scrub`] call.
///
/// Besides the serde traits, the type is `Clone` and comparable with `==` so
/// callers can duplicate results and compare them directly in assertions.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ScrubResult {
    /// The input text after every matched span was replaced by the
    /// scrubber's replacement string.
    pub scrubbed: String,
    /// Every PII category whose pattern matched at least once.
    pub detected_types: HashSet<PiiType>,
    /// Number of matches that were replaced, summed over all patterns.
    pub replacements: usize,
}
/// Categories of personally identifiable information reported by this module.
///
/// In this file, detection patterns are registered only for `Email`, `Phone`,
/// `CreditCard`, `Ssn` and `IPAddress` (see `PiiPatterns::get`). `Address`,
/// `Name` and `DateOfBirth` have no pattern registered here, so the
/// regex-based detection shown in this file never reports them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum PiiType {
/// Email address (`local@domain.tld` shape).
Email,
/// Phone number (ten digits in 3-3-4 groups, optionally prefixed by `+1`/`1`).
Phone,
/// Payment card number (sixteen digits in four groups of four).
CreditCard,
/// Social security number (nine digits in 3-2-4 groups).
Ssn,
/// Dotted-quad address (four groups of one to three digits).
IPAddress,
/// Postal address. No detection pattern is registered in this file.
Address,
/// Personal name. No detection pattern is registered in this file.
Name,
/// Date of birth. No detection pattern is registered in this file.
DateOfBirth,
}
/// Table of compiled detection regexes, built once and shared through
/// `PiiPatterns::get`.
struct PiiPatterns {
/// One `(category, compiled regex)` entry per detectable category.
/// Consumers iterate the entries in the order they are stored here.
patterns: Vec<(PiiType, Regex)>,
}
impl PiiPatterns {
fn get() -> &'static Self {
static PATTERNS: OnceLock<PiiPatterns> = OnceLock::new();
PATTERNS.get_or_init(|| PiiPatterns {
patterns: vec![
(
PiiType::Email,
Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap(),
),
(
PiiType::Phone,
Regex::new(r"\+?1?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}").unwrap(),
),
(
PiiType::CreditCard,
Regex::new(r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b").unwrap(),
),
(
PiiType::Ssn,
Regex::new(r"\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b").unwrap(),
),
(
PiiType::IPAddress,
Regex::new(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b").unwrap(),
),
],
})
}
}
/// Regex-based scrubber that replaces detected PII with a fixed string.
///
/// `Debug` and `Clone` are derived so the scrubber can be logged and shared
/// by value like any other small configuration object.
#[derive(Debug, Clone)]
pub struct PiiScrubber {
    /// Text inserted in place of every match.
    replacement: String,
}
impl PiiScrubber {
    /// Creates a scrubber that replaces every match with `[REDACTED]`.
    pub fn new() -> Self {
        Self {
            replacement: "[REDACTED]".to_string(),
        }
    }

    /// Replaces every detected PII span in `text` with the configured
    /// replacement string.
    ///
    /// Patterns are applied one after another, each to the output of the
    /// previous one. Consequences worth knowing:
    ///
    /// * `replacements` counts the matches found in the progressively
    ///   scrubbed text, not in the original input.
    /// * A replacement string that itself matches a later pattern will be
    ///   matched (and counted) again by that pattern.
    ///
    /// The replacement is always inserted literally, even if it contains `$`.
    pub fn scrub(&self, text: &str) -> ScrubResult {
        // `Regex::replace_all` expands `$name` / `${name}` inside a string
        // replacement. Doubling every `$` turns it into the documented escape
        // for a literal dollar sign, so a custom replacement such as
        // "$MASKED" is emitted verbatim instead of being treated as a
        // (missing) capture-group reference.
        let literal = self.replacement.replace('$', "$$");

        let mut scrubbed = text.to_string();
        let mut detected_types = HashSet::new();
        let mut replacements = 0;

        for (pii_type, pattern) in &PiiPatterns::get().patterns {
            let hits = pattern.find_iter(&scrubbed).count();
            if hits == 0 {
                continue;
            }
            detected_types.insert(*pii_type);
            replacements += hits;
            scrubbed = pattern
                .replace_all(&scrubbed, literal.as_str())
                .into_owned();
        }

        ScrubResult {
            scrubbed,
            detected_types,
            replacements,
        }
    }

    /// Returns the categories whose pattern matches somewhere in `text`,
    /// without modifying the text.
    ///
    /// Every pattern is tested against the original input, so the result can
    /// differ from `scrub(text).detected_types`, which tests later patterns
    /// against already-scrubbed text.
    pub fn detect(&self, text: &str) -> HashSet<PiiType> {
        PiiPatterns::get()
            .patterns
            .iter()
            .filter(|(_, pattern)| pattern.is_match(text))
            .map(|(pii_type, _)| *pii_type)
            .collect()
    }

    /// Builder-style setter: consumes the scrubber and returns it with
    /// `replacement` as the text inserted for every match.
    pub fn with_replacement(mut self, replacement: String) -> Self {
        self.replacement = replacement;
        self
    }

    /// Scrubs each text independently and returns the results in input order.
    pub fn scrub_batch(&self, texts: &[&str]) -> Vec<ScrubResult> {
        texts.iter().map(|text| self.scrub(text)).collect()
    }

    /// Returns `true` as soon as any pattern matches `text`.
    pub fn contains_pii(&self, text: &str) -> bool {
        PiiPatterns::get()
            .patterns
            .iter()
            .any(|(_, pattern)| pattern.is_match(text))
    }

    /// Sums, over all patterns, the number of matches each one finds in the
    /// original `text`.
    ///
    /// Patterns are counted independently, so a span matched by two patterns
    /// is counted twice; the total can therefore differ from
    /// `scrub(text).replacements`.
    pub fn count_pii(&self, text: &str) -> usize {
        PiiPatterns::get()
            .patterns
            .iter()
            .map(|(_, pattern)| pattern.find_iter(text).count())
            .sum()
    }
}
impl Default for PiiScrubber {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An email inside a sentence is detected and replaced in place.
    #[test]
    fn test_email_detection() {
        let scrubber = PiiScrubber::new();
        let result = scrubber.scrub("Contact me at john@example.com");
        assert!(result.detected_types.contains(&PiiType::Email));
        assert_eq!(result.scrubbed, "Contact me at [REDACTED]");
    }

    /// A dash-separated ten-digit number is reported as a phone number.
    #[test]
    fn test_phone_detection() {
        let scrubber = PiiScrubber::new();
        let result = scrubber.scrub("Call me at 123-456-7890");
        assert!(result.detected_types.contains(&PiiType::Phone));
    }

    /// Text without PII yields no detections and no replacements.
    #[test]
    fn test_no_pii() {
        let scrubber = PiiScrubber::new();
        let result = scrubber.scrub("Hello, world!");
        assert!(result.detected_types.is_empty());
        assert_eq!(result.replacements, 0);
    }

    /// PII embedded in Chinese text: the surrounding non-ASCII characters
    /// must not prevent matching and must survive scrubbing untouched.
    #[test]
    fn test_chinese_pii() {
        let scrubber = PiiScrubber::new();

        // An 11-digit mobile number starting with `1` fits the phone pattern
        // (leading `1` + ten digits), so the whole number is redacted.
        let result = scrubber.scrub("联系电话:13812345678");
        assert!(result.detected_types.contains(&PiiType::Phone));
        assert_eq!(result.scrubbed, "联系电话:[REDACTED]");

        let result = scrubber.scrub("邮箱:zhangsan@example.com");
        assert!(result.detected_types.contains(&PiiType::Email));
        assert_eq!(result.scrubbed, "邮箱:[REDACTED]");

        // There is no dedicated pattern for national ID numbers, so this only
        // checks that scrubbing such input completes without panicking.
        let _ = scrubber.scrub("身份证:110101199001011234");

        // A street address with a short house number matches no pattern and
        // must be returned unchanged.
        let address = "地址:北京市朝阳区某某路123号";
        let result = scrubber.scrub(address);
        assert!(result.detected_types.is_empty());
        assert_eq!(result.replacements, 0);
        assert_eq!(result.scrubbed, address);
    }

    /// Several categories in one string are all detected and all removed.
    #[test]
    fn test_multiple_pii_types() {
        let scrubber = PiiScrubber::new();
        let text = "联系我:john@example.com 或拨打 123-456-7890。信用卡:4532-1234-5678-9012,IP:192.168.1.1";
        let result = scrubber.scrub(text);
        assert!(result.detected_types.contains(&PiiType::Email));
        assert!(result.detected_types.contains(&PiiType::Phone));
        assert!(result.detected_types.contains(&PiiType::CreditCard));
        assert!(result.detected_types.contains(&PiiType::IPAddress));
        assert!(!result.scrubbed.contains("john@example.com"));
        assert!(!result.scrubbed.contains("123-456-7890"));
        assert!(!result.scrubbed.contains("4532-1234-5678-9012"));
        assert!(!result.scrubbed.contains("192.168.1.1"));
        assert!(result.replacements >= 4);
    }

    /// Scrubbing a thousand-line input stays within a one-second wall-clock
    /// budget and still detects every category present.
    #[test]
    fn test_performance_large_text() {
        use std::time::Instant;
        let scrubber = PiiScrubber::new();
        let mut large_text = String::with_capacity(1_000_000);
        for i in 0..1000 {
            large_text.push_str(&format!(
                "用户{}: email{}@test.com, phone: 123-456-{:04}, IP: 192.168.1.{}\n",
                i,
                i,
                i % 10000,
                i % 256
            ));
        }
        let start = Instant::now();
        let result = scrubber.scrub(&large_text);
        let duration = start.elapsed();
        assert!(
            duration.as_millis() < 1000,
            "Large text scrubbing took too long: {:?}",
            duration
        );
        assert!(result.detected_types.contains(&PiiType::Email));
        assert!(result.detected_types.contains(&PiiType::Phone));
        assert!(result.detected_types.contains(&PiiType::IPAddress));
    }

    /// The empty string is handled without detections or replacements.
    #[test]
    fn test_edge_case_empty_string() {
        let scrubber = PiiScrubber::new();
        let result = scrubber.scrub("");
        assert!(result.detected_types.is_empty());
        assert_eq!(result.replacements, 0);
    }

    /// Input consisting solely of PII collapses to the replacement string.
    #[test]
    fn test_edge_case_only_pii() {
        let scrubber = PiiScrubber::new();
        let result = scrubber.scrub("john@example.com");
        assert!(result.detected_types.contains(&PiiType::Email));
        assert_eq!(result.scrubbed, "[REDACTED]");
    }

    /// A custom replacement string is used instead of the default.
    #[test]
    fn test_custom_replacement() {
        let scrubber = PiiScrubber::new().with_replacement("***MASKED***".to_string());
        let result = scrubber.scrub("Contact: john@example.com");
        assert!(result.scrubbed.contains("***MASKED***"));
        assert!(!result.scrubbed.contains("[REDACTED]"));
    }

    /// `detect` reports categories without rewriting the text.
    #[test]
    fn test_detect_without_replace() {
        let scrubber = PiiScrubber::new();
        let text = "Email: test@example.com";
        let detected = scrubber.detect(text);
        assert!(detected.contains(&PiiType::Email));
        assert!(!detected.contains(&PiiType::Phone));
    }

    /// Inputs that resemble more than one category are reported as at least
    /// one of them.
    #[test]
    fn test_overlapping_pii() {
        let scrubber = PiiScrubber::new();
        let result = scrubber.scrub("Server IP: 192.168.1.123 and phone: 192-168-1234");
        assert!(
            result.detected_types.contains(&PiiType::IPAddress)
                || result.detected_types.contains(&PiiType::Phone)
        );
    }

    /// `contains_pii` is true for PII-bearing text and false otherwise.
    #[test]
    fn test_contains_pii() {
        let scrubber = PiiScrubber::new();
        assert!(scrubber.contains_pii("Email: test@example.com"));
        assert!(scrubber.contains_pii("Phone: 123-456-7890"));
        assert!(!scrubber.contains_pii("Hello World"));
    }

    /// `count_pii` counts every match across all patterns.
    #[test]
    fn test_count_pii() {
        let scrubber = PiiScrubber::new();
        let count = scrubber.count_pii("Email: a@test.com and b@test.com, IP: 192.168.1.1");
        assert!(count >= 3);
    }

    /// Batch scrubbing returns one result per input, in input order.
    #[test]
    fn test_batch_scrub() {
        let scrubber = PiiScrubber::new();
        let texts = vec![
            "Email: test1@example.com",
            "Phone: 123-456-7890",
            "No PII here",
        ];
        let results = scrubber.scrub_batch(&texts);
        assert_eq!(results.len(), 3);
        assert!(results[0].detected_types.contains(&PiiType::Email));
        assert!(results[1].detected_types.contains(&PiiType::Phone));
        assert!(results[2].detected_types.is_empty());
    }

    /// Repeated calls reuse the shared pattern table and keep producing the
    /// same output.
    #[test]
    fn test_patterns_singleton() {
        let scrubber = PiiScrubber::new();
        for _ in 0..100 {
            let result = scrubber.scrub("test@example.com");
            assert_eq!(result.scrubbed, "[REDACTED]");
        }
    }
}