use regex::Regex;
use tracing::{debug, trace};
use crate::config::{DlpAction, DlpPattern};
/// A DLP pattern whose regular expression has already been compiled.
///
/// Instances are built by `DlpScanner::with_response_scanning` from the
/// configured `DlpPattern` entries.
#[derive(Debug, Clone)]
pub struct CompiledPattern {
/// Pattern name, copied from the configuration entry. It is reported in scan
/// results and embedded in the `[REDACTED:<name>]` replacement marker.
pub name: String,
/// Compiled form of the configured regex source.
pub regex: Regex,
/// Action associated with the pattern. The scanner compares it against
/// `DlpAction::Block` and `DlpAction::Redact` to decide how a match is handled.
pub action: DlpAction,
}
/// Scans text against a fixed list of compiled DLP patterns.
///
/// Patterns are compiled once at construction time and evaluated in the order
/// in which they were supplied.
#[derive(Debug, Clone)]
pub struct DlpScanner {
// Compiled patterns, in configuration order.
patterns: Vec<CompiledPattern>,
// Flag stored at construction and returned by `scan_responses()`; nothing in
// this type consults it. NOTE(review): presumably callers use it to decide
// whether to run `redact_all` on responses — confirm against the call sites.
scan_responses: bool,
}
/// Outcome of [`DlpScanner::scan_and_redact`].
///
/// Derives `PartialEq`/`Eq` so that whole results can be compared directly
/// (for example with `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ScanResult {
    /// Names of the `Block` patterns that matched the original, unredacted input.
    pub blocked: Vec<String>,
    /// The input with every match of a `Redact` pattern replaced by
    /// `[REDACTED:<pattern name>]`. Equal to the input when nothing was redacted.
    pub redacted_text: String,
    /// `true` when at least one `Redact` pattern matched.
    pub was_redacted: bool,
}
impl DlpScanner {
pub fn new(patterns: &[DlpPattern]) -> Result<Self, regex::Error> {
Self::with_response_scanning(patterns, true)
}
pub fn with_response_scanning(
patterns: &[DlpPattern],
scan_responses: bool,
) -> Result<Self, regex::Error> {
let compiled = patterns
.iter()
.map(|p| {
trace!(name = %p.name, action = ?p.action, "Compiling DLP pattern");
Ok(CompiledPattern {
name: p.name.clone(),
regex: Regex::new(&p.regex)?,
action: p.action,
})
})
.collect::<Result<Vec<_>, regex::Error>>()?;
debug!(
pattern_count = compiled.len(),
scan_responses, "DLP scanner initialized"
);
Ok(Self {
patterns: compiled,
scan_responses,
})
}
pub fn scan(&self, text: &str) -> Vec<String> {
trace!(text_len = text.len(), "Scanning text for block patterns");
let matches: Vec<String> = self
.patterns
.iter()
.filter(|p| p.action == DlpAction::Block && p.regex.is_match(text))
.map(|p| {
debug!(pattern = %p.name, "Block pattern matched");
p.name.clone()
})
.collect();
trace!(match_count = matches.len(), "Block scan complete");
matches
}
pub fn scan_and_redact(&self, text: &str) -> ScanResult {
trace!(
text_len = text.len(),
"Scanning text for block+redact patterns"
);
let blocked: Vec<String> = self
.patterns
.iter()
.filter(|p| p.action == DlpAction::Block && p.regex.is_match(text))
.map(|p| {
debug!(pattern = %p.name, "Block pattern matched in request");
p.name.clone()
})
.collect();
let mut redacted = text.to_string();
let mut was_redacted = false;
for p in &self.patterns {
if p.action == DlpAction::Redact && p.regex.is_match(&redacted) {
debug!(pattern = %p.name, "Redact pattern matched, masking PII");
let replacement = format!("[REDACTED:{}]", p.name);
redacted = p
.regex
.replace_all(&redacted, replacement.as_str())
.to_string();
was_redacted = true;
}
}
trace!(
blocked_count = blocked.len(),
was_redacted, "Scan-and-redact complete"
);
ScanResult {
blocked,
redacted_text: redacted,
was_redacted,
}
}
pub fn redact_all(&self, text: &str) -> (String, Vec<String>) {
trace!(text_len = text.len(), "Redacting all patterns from text");
let mut redacted = text.to_string();
let mut redacted_names = Vec::new();
for p in &self.patterns {
if p.regex.is_match(&redacted) {
debug!(pattern = %p.name, "Pattern matched in response, redacting");
let replacement = format!("[REDACTED:{}]", p.name);
redacted = p
.regex
.replace_all(&redacted, replacement.as_str())
.to_string();
redacted_names.push(p.name.clone());
}
}
trace!(redacted_count = redacted_names.len(), "Redact-all complete");
(redacted, redacted_names)
}
pub fn scan_responses(&self) -> bool {
self.scan_responses
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::DlpPattern;

    // Regex sources shared by the fixtures below.
    const CREDIT_CARD_RE: &str = r"\b(?:\d[ -]*?){13,19}\b";
    const SSN_RE: &str = r"\b\d{3}-\d{2}-\d{4}\b";
    const EMAIL_RE: &str = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b";
    const PHONE_RE: &str = r"\b(?:\+?1[-.\s]?)?(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b";

    /// Builds a single configuration entry.
    fn pattern(name: &str, regex: &str, action: DlpAction) -> DlpPattern {
        DlpPattern {
            name: name.to_string(),
            regex: regex.to_string(),
            action,
        }
    }

    /// Fixture in which every pattern blocks.
    fn default_patterns() -> Vec<DlpPattern> {
        vec![
            pattern("credit_card", CREDIT_CARD_RE, DlpAction::Block),
            pattern("ssn", SSN_RE, DlpAction::Block),
            pattern("email", EMAIL_RE, DlpAction::Block),
        ]
    }

    /// Fixture with one blocking pattern and two redacting patterns.
    fn mixed_patterns() -> Vec<DlpPattern> {
        vec![
            pattern("ssn", SSN_RE, DlpAction::Block),
            pattern("email", EMAIL_RE, DlpAction::Redact),
            pattern("phone_number", PHONE_RE, DlpAction::Redact),
        ]
    }

    /// Returns `true` when `wanted` appears in `names`.
    fn names_include(names: &[String], wanted: &str) -> bool {
        names.iter().any(|n| n.as_str() == wanted)
    }

    #[test]
    fn test_detect_credit_card() {
        let dlp = DlpScanner::new(&default_patterns()).unwrap();
        let hits = dlp.scan("My card is 4111 1111 1111 1111 please charge it");
        assert!(names_include(&hits, "credit_card"));
    }

    #[test]
    fn test_detect_ssn() {
        let dlp = DlpScanner::new(&default_patterns()).unwrap();
        let hits = dlp.scan("My SSN is 123-45-6789");
        assert!(names_include(&hits, "ssn"));
    }

    #[test]
    fn test_detect_email() {
        let dlp = DlpScanner::new(&default_patterns()).unwrap();
        let hits = dlp.scan("Contact me at user@example.com");
        assert!(names_include(&hits, "email"));
    }

    #[test]
    fn test_no_sensitive_data() {
        let dlp = DlpScanner::new(&default_patterns()).unwrap();
        let hits = dlp.scan("Tell me about the weather today");
        assert!(hits.is_empty());
    }

    #[test]
    fn test_multiple_detections() {
        let dlp = DlpScanner::new(&default_patterns()).unwrap();
        let hits = dlp.scan("Card: 4111111111111111, SSN: 123-45-6789, email: a@b.com");
        for expected in &["credit_card", "ssn", "email"] {
            assert!(names_include(&hits, expected));
        }
    }

    #[test]
    fn test_empty_patterns() {
        let dlp = DlpScanner::new(&[]).unwrap();
        let hits = dlp.scan("4111111111111111");
        assert!(hits.is_empty());
    }

    #[test]
    fn test_redact_email() {
        let dlp = DlpScanner::new(&mixed_patterns()).unwrap();
        let outcome = dlp.scan_and_redact("Contact me at user@example.com");
        assert!(outcome.blocked.is_empty());
        assert!(outcome.was_redacted);
        assert!(outcome.redacted_text.contains("[REDACTED:email]"));
        assert!(!outcome.redacted_text.contains("user@example.com"));
    }

    #[test]
    fn test_redact_phone() {
        let dlp = DlpScanner::new(&mixed_patterns()).unwrap();
        let outcome = dlp.scan_and_redact("Call me at 555-123-4567");
        assert!(outcome.blocked.is_empty());
        assert!(outcome.was_redacted);
        assert!(outcome.redacted_text.contains("[REDACTED:phone_number]"));
        assert!(!outcome.redacted_text.contains("555-123-4567"));
    }

    #[test]
    fn test_block_ssn_and_redact_email() {
        let dlp = DlpScanner::new(&mixed_patterns()).unwrap();
        let outcome = dlp.scan_and_redact("SSN: 123-45-6789, email: user@example.com");
        assert!(names_include(&outcome.blocked, "ssn"));
        assert!(outcome.was_redacted);
        assert!(outcome.redacted_text.contains("[REDACTED:email]"));
    }

    #[test]
    fn test_scan_only_returns_block_patterns() {
        let dlp = DlpScanner::new(&mixed_patterns()).unwrap();
        let hits = dlp.scan("Contact me at user@example.com");
        assert!(!names_include(&hits, "email"));
    }

    #[test]
    fn test_scan_returns_block_patterns() {
        let dlp = DlpScanner::new(&mixed_patterns()).unwrap();
        let hits = dlp.scan("My SSN is 123-45-6789");
        assert!(names_include(&hits, "ssn"));
    }

    #[test]
    fn test_redact_all_replaces_everything() {
        let dlp = DlpScanner::new(&mixed_patterns()).unwrap();
        let (masked, names) =
            dlp.redact_all("SSN: 123-45-6789, email: user@example.com, phone: 555-123-4567");

        for expected in &["ssn", "email", "phone_number"] {
            assert!(names_include(&names, expected));
        }
        for marker in &[
            "[REDACTED:ssn]",
            "[REDACTED:email]",
            "[REDACTED:phone_number]",
        ] {
            assert!(masked.contains(*marker));
        }
        for secret in &["123-45-6789", "user@example.com", "555-123-4567"] {
            assert!(!masked.contains(*secret));
        }
    }

    #[test]
    fn test_redact_all_clean_text() {
        let dlp = DlpScanner::new(&mixed_patterns()).unwrap();
        let (masked, names) = dlp.redact_all("Hello, how are you?");
        assert!(names.is_empty());
        assert_eq!(masked, "Hello, how are you?");
    }

    #[test]
    fn test_no_redaction_when_clean() {
        let dlp = DlpScanner::new(&mixed_patterns()).unwrap();
        let outcome = dlp.scan_and_redact("Hello world");
        assert!(outcome.blocked.is_empty());
        assert!(!outcome.was_redacted);
        assert_eq!(outcome.redacted_text, "Hello world");
    }

    #[test]
    fn test_scan_responses_flag() {
        for &flag in &[true, false] {
            let dlp = DlpScanner::with_response_scanning(&[], flag).unwrap();
            assert_eq!(dlp.scan_responses(), flag);
        }
    }

    #[test]
    fn test_redact_multiple_occurrences() {
        let only_email = vec![pattern("email", EMAIL_RE, DlpAction::Redact)];
        let dlp = DlpScanner::new(&only_email).unwrap();
        let outcome = dlp.scan_and_redact("a@b.com and c@d.com");
        assert!(outcome.was_redacted);
        assert_eq!(
            outcome.redacted_text,
            "[REDACTED:email] and [REDACTED:email]"
        );
    }

    #[test]
    fn test_detect_phone_various_formats() {
        let only_phone = vec![pattern("phone_number", PHONE_RE, DlpAction::Redact)];
        let dlp = DlpScanner::new(&only_phone).unwrap();

        // Each number is embedded as "Call <number>"; the full original form
        // must no longer be present after redaction.
        for number in &["555-123-4567", "(555) 123-4567", "+1-555-123-4567"] {
            let input = format!("Call {}", number);
            let (masked, names) = dlp.redact_all(&input);
            assert!(names_include(&names, "phone_number"));
            assert!(!masked.contains(*number));
        }
    }
}