use regex::Regex;
/// Outcome of running a [`PrivacyClassifier`] over a piece of text.
///
/// Derives `PartialEq` so that whole results can be compared directly
/// (for example with `assert_eq!`) instead of field by field. `Eq` is not
/// derived because `confidence` is a float.
#[derive(Debug, Clone, PartialEq)]
pub struct SensitivityResult {
    /// `true` when the classifier considers the content sensitive.
    pub is_sensitive: bool,
    /// Score attached to the verdict. The classifiers in this file emit
    /// `1.0` on a match and `0.0` otherwise.
    pub confidence: f32,
}
/// Decides whether a piece of text contains sensitive material.
///
/// The `Send + Sync` supertraits require every implementor to be usable
/// from multiple threads.
pub trait PrivacyClassifier: Send + Sync {
    /// Inspects `content` and returns the resulting [`SensitivityResult`].
    fn classify(&self, content: &str) -> SensitivityResult;
}
/// Classifier backed by a fixed, built-in set of regular expressions.
///
/// Derives `Debug` and `Clone` so the type can be logged, embedded in other
/// `Debug` types, and duplicated; `Regex` implements both traits.
#[derive(Debug, Clone)]
pub struct PatternPrivacyClassifier {
    /// Compiled detectors; content is sensitive if any one of them matches.
    patterns: Vec<Regex>,
}
impl PatternPrivacyClassifier {
    /// Builds a classifier from the built-in pattern table.
    ///
    /// The table covers three shapes of text:
    /// - `ddd-dd-dddd` digit groups (the SSN layout exercised by the tests),
    /// - sixteen digits in four groups of four, optionally separated by a
    ///   single whitespace character or hyphen,
    /// - the credential keywords `password`, `passwd`, `api key`,
    ///   `auth token` and `secret key` (the last three with an optional `_`
    ///   or `-` joiner), matched case-insensitively.
    ///
    /// # Errors
    ///
    /// Returns the first [`regex::Error`] raised while compiling the table.
    pub fn new() -> Result<Self, regex::Error> {
        // Order matters only for which compile error would surface first.
        const BUILTIN_PATTERNS: [&str; 3] = [
            r"\b\d{3}-\d{2}-\d{4}\b",
            r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b",
            r"(?i)\b(password|passwd|api[_-]?key|auth[_-]?token|secret[_-]?key)\b",
        ];

        let mut patterns = Vec::with_capacity(BUILTIN_PATTERNS.len());
        for source in &BUILTIN_PATTERNS {
            patterns.push(Regex::new(source)?);
        }
        Ok(Self { patterns })
    }
}
impl PrivacyClassifier for PatternPrivacyClassifier {
    /// Flags `content` as sensitive when any built-in pattern matches it.
    ///
    /// The verdict is binary: confidence is `1.0` on a match and `0.0`
    /// when nothing matches.
    fn classify(&self, content: &str) -> SensitivityResult {
        for detector in &self.patterns {
            if detector.is_match(content) {
                // The first hit decides; remaining patterns are not consulted.
                return SensitivityResult {
                    is_sensitive: true,
                    confidence: 1.0,
                };
            }
        }
        SensitivityResult {
            is_sensitive: false,
            confidence: 0.0,
        }
    }
}
/// Classifier whose regular expressions are supplied by the caller.
///
/// Derives `Debug` and `Clone` so the type can be logged, embedded in other
/// `Debug` types, and duplicated; `Regex` implements both traits.
#[derive(Debug, Clone)]
pub struct ConfigurablePrivacyClassifier {
    /// Compiled caller-provided detectors; content is sensitive if any one
    /// of them matches.
    patterns: Vec<Regex>,
}
impl ConfigurablePrivacyClassifier {
    /// Compiles each entry of `pattern_strs` into a detector.
    ///
    /// An empty list is accepted and yields a classifier that never flags
    /// anything.
    ///
    /// # Errors
    ///
    /// Returns the [`regex::Error`] of the first entry that fails to
    /// compile; later entries are not compiled.
    pub fn new(pattern_strs: Vec<String>) -> Result<Self, regex::Error> {
        pattern_strs
            .iter()
            .map(String::as_str)
            .map(Regex::new)
            .collect::<Result<Vec<_>, _>>()
            .map(|patterns| Self { patterns })
    }
}
impl PrivacyClassifier for ConfigurablePrivacyClassifier {
    /// Flags `content` as sensitive when any configured pattern matches it.
    ///
    /// The verdict is binary: confidence is `1.0` on a match and `0.0`
    /// when nothing matches (including when no patterns are configured).
    fn classify(&self, content: &str) -> SensitivityResult {
        let first_hit = self.patterns.iter().find(|re| re.is_match(content));
        match first_hit {
            Some(_) => SensitivityResult {
                is_sensitive: true,
                confidence: 1.0,
            },
            None => SensitivityResult {
                is_sensitive: false,
                confidence: 0.0,
            },
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Fresh classifier using the built-in pattern table.
    fn builtin() -> PatternPrivacyClassifier {
        PatternPrivacyClassifier::new().unwrap()
    }

    /// Whether the built-in classifier flags `text` as sensitive.
    fn flagged(text: &str) -> bool {
        builtin().classify(text).is_sensitive
    }

    // --- built-in patterns: positive cases -------------------------------

    #[test]
    fn detects_ssn() {
        let verdict = builtin().classify("My SSN is 123-45-6789");
        assert!(verdict.is_sensitive);
        assert_eq!(verdict.confidence, 1.0);
    }

    #[test]
    fn detects_credit_card() {
        let verdict = builtin().classify("Card: 4111 1111 1111 1111");
        assert!(verdict.is_sensitive);
        assert_eq!(verdict.confidence, 1.0);
    }

    #[test]
    fn detects_password() {
        assert!(flagged("my password is hunter2"));
    }

    #[test]
    fn detects_api_key() {
        assert!(flagged("set the api_key to sk-1234"));
    }

    // --- built-in patterns: everyday content must pass -------------------

    #[test]
    fn allows_normal_household_content() {
        let verdict = builtin().classify("We need to buy groceries for dinner Saturday");
        assert!(!verdict.is_sensitive);
        assert_eq!(verdict.confidence, 0.0);
    }

    #[test]
    fn allows_doctor_mention() {
        assert!(!flagged("the doctor's office called about Saturday"));
    }

    #[test]
    fn allows_email_address() {
        assert!(!flagged("email joe@plumber.com about the leak"));
    }

    #[test]
    fn allows_phone_number() {
        assert!(!flagged("call the restaurant at 555-123-4567"));
    }

    #[test]
    fn allows_medical_terms_in_context() {
        assert!(!flagged("Started new medication for anxiety"));
    }

    // --- caller-configured patterns --------------------------------------

    #[test]
    fn configurable_with_custom_patterns() {
        let ssn_only = vec![String::from(r"\b\d{3}-\d{2}-\d{4}\b")];
        let custom = ConfigurablePrivacyClassifier::new(ssn_only).unwrap();
        assert!(custom.classify("SSN: 123-45-6789").is_sensitive);
        assert!(!custom.classify("saw the doctor today").is_sensitive);
    }

    #[test]
    fn configurable_rejects_bad_regex() {
        let broken = vec![String::from("[invalid")];
        assert!(ConfigurablePrivacyClassifier::new(broken).is_err());
    }

    #[test]
    fn configurable_empty_patterns_allows_everything() {
        let permissive = ConfigurablePrivacyClassifier::new(Vec::new()).unwrap();
        assert!(!permissive.classify("My SSN is 123-45-6789").is_sensitive);
    }

    // --- built-in patterns: format variations and edge cases -------------

    #[test]
    fn detects_credit_card_no_separators() {
        assert!(flagged("card 4111111111111111 on file"));
    }

    #[test]
    fn detects_credit_card_with_dashes() {
        assert!(flagged("Card: 4111-1111-1111-1111"));
    }

    #[test]
    fn detects_ssn_bare() {
        assert!(flagged("123-45-6789"));
    }

    #[test]
    fn detects_auth_token_keyword() {
        assert!(flagged("set auth_token to abc123"));
    }

    #[test]
    fn detects_secret_key_keyword() {
        assert!(flagged("the secret_key is sk-prod-xyz"));
    }

    #[test]
    fn detects_pii_in_longer_document() {
        // One sensitive line buried in otherwise harmless multi-line text.
        let document = concat!(
            "Meeting notes from Thursday.\n",
            "Discussed budget and timeline.\n",
            "SSN is 999-88-7777 for the insurance form.\n",
            "Action items: follow up with vendor.",
        );
        assert!(flagged(document));
    }

    #[test]
    fn empty_string_is_not_sensitive() {
        assert!(!flagged(""));
    }

    #[test]
    fn partial_ssn_not_sensitive() {
        assert!(!flagged("code 123-45 in the system"));
    }
}