use std::borrow::Cow;
use std::sync::LazyLock;
use regex::{Regex, RegexBuilder};
use serde::{Deserialize, Serialize};
/// Built-in e-mail detector, compiled on first use.
///
/// A match is: a local part of at least two characters drawn from
/// `[a-zA-Z0-9._%+-]`, an `@`, one or more letters-only labels each followed by
/// a dot, and a final label of two to six letters. The pattern is unanchored,
/// so it finds addresses embedded anywhere in a longer string. Dot-less hosts
/// (`user@localhost`) and labels containing digits (`v2.config`) do not match.
static EMAIL_RE: LazyLock<Regex> = LazyLock::new(|| {
    let compiled = Regex::new(r"[a-zA-Z0-9._%+\-]{2,}@(?:[a-zA-Z]+\.)+[a-zA-Z]{2,6}");
    // The pattern is a compile-time literal, so a failure here is a programming error.
    compiled.expect("valid EMAIL_RE")
});
/// Built-in US-style phone number detector, compiled on first use.
///
/// Accepts an optional `+1` / `1` country prefix, an area code with optional
/// parentheses, and 3-3-4 digit groups separated by at most one `-`, `.` or
/// whitespace character. Both ends are guarded by `\b`.
///
/// Because the pattern opens with `\b`, a `+` that directly follows whitespace
/// (or the start of the text) is not part of the match and stays in the
/// output, e.g. `+1-800-555-1234` becomes `+[PII:phone]`.
static PHONE_RE: LazyLock<Regex> = LazyLock::new(|| {
    let compiled = Regex::new(r"\b(\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b");
    // The pattern is a compile-time literal, so a failure here is a programming error.
    compiled.expect("valid PHONE_RE")
});
/// Built-in US Social Security number detector, compiled on first use.
///
/// Matches exactly `ddd-dd-dddd` (dash-separated 3-2-4 digit groups) between
/// word boundaries, so a `dd-dd-dddd` date is not picked up.
static SSN_RE: LazyLock<Regex> = LazyLock::new(|| {
    let compiled = Regex::new(r"\b\d{3}-\d{2}-\d{4}\b");
    // The pattern is a compile-time literal, so a failure here is a programming error.
    compiled.expect("valid SSN_RE")
});
/// Built-in 16-digit card number detector, compiled on first use.
///
/// Matches four groups of four digits between word boundaries; each of the
/// first three groups may be followed by a single `-` or whitespace character,
/// so `4111111111111111`, `4111 1111 1111 1111` and `4111-1111-1111-1111` all
/// match.
static CREDIT_CARD_RE: LazyLock<Regex> = LazyLock::new(|| {
    let compiled = Regex::new(r"\b(?:\d{4}[-\s]?){3}\d{4}\b");
    // The pattern is a compile-time literal, so a failure here is a programming error.
    compiled.expect("valid CREDIT_CARD_RE")
});
/// Serde default hook for boolean fields that should be switched on when the
/// key is missing from the input.
fn default_true() -> bool {
    const ON: bool = true;
    ON
}
/// A user-supplied redaction rule, carried in `PiiFilterConfig::custom_patterns`.
#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)]
pub struct CustomPiiPattern {
/// Label for the rule; `PiiFilter::new` reports it in the warning logged when
/// `pattern` fails to compile.
pub name: String,
/// Regular-expression source, compiled by `PiiFilter::new`.
pub pattern: String,
/// Text substituted for every match; `"[PII:custom]"` when the key is absent
/// during deserialization.
// NOTE(review): this string is handed to `Regex::replace_all` as a plain `&str`,
// which the regex crate appears to treat as a template (`$1`, `$name` expand to
// capture groups) — confirm whether that is intended for a redaction label.
#[serde(default = "default_custom_replacement")]
pub replacement: String,
}
/// Serde default hook for `CustomPiiPattern::replacement`: the generic
/// placeholder used when a custom rule does not name its own.
fn default_custom_replacement() -> String {
    String::from("[PII:custom]")
}
/// Configuration consumed by `PiiFilter::new`.
///
/// Every field has a serde default, so an empty input deserializes to the same
/// values as `PiiFilterConfig::default()`: filtering off, all built-in
/// detectors selected, no custom patterns.
#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)]
// Each bool is an independent on/off switch for one detector, so the lint's
// suggested enum/state-machine refactor does not apply here.
#[allow(clippy::struct_excessive_bools)] pub struct PiiFilterConfig {
/// Master switch; when `false` the filter leaves text untouched. Defaults to `false`.
#[serde(default)]
pub enabled: bool,
/// Include the built-in e-mail detector. Defaults to `true`.
#[serde(default = "default_true")]
pub filter_email: bool,
/// Include the built-in phone number detector. Defaults to `true`.
#[serde(default = "default_true")]
pub filter_phone: bool,
/// Include the built-in SSN detector. Defaults to `true`.
#[serde(default = "default_true")]
pub filter_ssn: bool,
/// Include the built-in credit card detector. Defaults to `true`.
#[serde(default = "default_true")]
pub filter_credit_card: bool,
/// Additional user-defined rules, applied after the built-in ones in the
/// order given. Defaults to empty.
#[serde(default)]
pub custom_patterns: Vec<CustomPiiPattern>,
}
impl Default for PiiFilterConfig {
    /// Returns the opt-in baseline: the master switch is off, every built-in
    /// detector is selected, and there are no custom patterns.
    ///
    /// These are the same values the per-field serde defaults produce for a
    /// missing key.
    fn default() -> Self {
        let detector_selected = true;
        Self {
            enabled: false,
            filter_email: detector_selected,
            filter_phone: detector_selected,
            filter_ssn: detector_selected,
            filter_credit_card: detector_selected,
            custom_patterns: Vec::default(),
        }
    }
}
/// A built-in detector paired with the fixed placeholder that replaces its matches.
struct PiiPattern {
/// Compiled detector (a clone of one of the module-level `*_RE` statics).
regex: Regex,
/// Placeholder written in place of each match, e.g. `"[PII:email]"`.
replacement: &'static str,
}
/// A `CustomPiiPattern` whose regular expression compiled successfully.
struct CustomPiiPatternCompiled {
/// Compiled form of the user-supplied pattern.
regex: Regex,
/// User-supplied replacement text, passed to `Regex::replace_all` for each match.
replacement: String,
}
/// Redacts personally identifiable information from text using a set of
/// built-in and user-defined regular expressions.
pub struct PiiFilter {
/// Master switch copied from `PiiFilterConfig::enabled`.
enabled: bool,
/// Selected built-in detectors, in application order.
builtin: Vec<PiiPattern>,
/// Custom rules that compiled successfully, in configuration order.
custom: Vec<CustomPiiPatternCompiled>,
}
impl PiiFilter {
#[must_use]
pub fn new(config: PiiFilterConfig) -> Self {
let mut builtin = Vec::new();
if config.filter_email {
builtin.push(PiiPattern {
regex: EMAIL_RE.clone(),
replacement: "[PII:email]",
});
}
if config.filter_phone {
builtin.push(PiiPattern {
regex: PHONE_RE.clone(),
replacement: "[PII:phone]",
});
}
if config.filter_ssn {
builtin.push(PiiPattern {
regex: SSN_RE.clone(),
replacement: "[PII:ssn]",
});
}
if config.filter_credit_card {
builtin.push(PiiPattern {
regex: CREDIT_CARD_RE.clone(),
replacement: "[PII:credit_card]",
});
}
let mut custom = Vec::new();
for p in config.custom_patterns {
match RegexBuilder::new(&p.pattern)
.size_limit(1_000_000)
.dfa_size_limit(1_000_000)
.build()
{
Ok(regex) => custom.push(CustomPiiPatternCompiled {
regex,
replacement: p.replacement,
}),
Err(e) => {
tracing::warn!(name = %p.name, error = %e, "PII filter: skipping invalid custom pattern");
}
}
}
Self {
enabled: config.enabled,
builtin,
custom,
}
}
#[must_use]
pub fn scrub<'a>(&self, text: &'a str) -> Cow<'a, str> {
if !self.enabled || (self.builtin.is_empty() && self.custom.is_empty()) {
return Cow::Borrowed(text);
}
let mut result: Option<String> = None;
for p in &self.builtin {
let current: &str = result.as_deref().unwrap_or(text);
let replaced = p.regex.replace_all(current, p.replacement);
if let Cow::Owned(s) = replaced {
result = Some(s);
}
}
for p in &self.custom {
let current: &str = result.as_deref().unwrap_or(text);
let replaced = p.regex.replace_all(current, p.replacement.as_str());
if let Cow::Owned(s) = replaced {
result = Some(s);
}
}
match result {
Some(s) => Cow::Owned(s),
None => Cow::Borrowed(text),
}
}
#[must_use]
pub fn has_pii(&self, text: &str) -> bool {
if !self.enabled {
return false;
}
self.builtin.iter().any(|p| p.regex.is_match(text))
|| self.custom.iter().any(|p| p.regex.is_match(text))
}
#[must_use]
pub fn is_enabled(&self) -> bool {
self.enabled && (!self.builtin.is_empty() || !self.custom.is_empty())
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Filter with the master switch on and every built-in detector selected.
    fn all_detectors() -> PiiFilter {
        let config = PiiFilterConfig {
            enabled: true,
            ..PiiFilterConfig::default()
        };
        PiiFilter::new(config)
    }

    /// Filter built from the untouched default config (master switch off).
    fn default_filter() -> PiiFilter {
        PiiFilter::new(PiiFilterConfig::default())
    }

    /// Enabled filter with an explicit choice of built-ins and no custom rules.
    fn builtin_only(email: bool, phone: bool, ssn: bool, credit_card: bool) -> PiiFilter {
        PiiFilter::new(PiiFilterConfig {
            enabled: true,
            filter_email: email,
            filter_phone: phone,
            filter_ssn: ssn,
            filter_credit_card: credit_card,
            custom_patterns: vec![],
        })
    }

    /// Enabled filter with every built-in off and a single custom rule.
    fn custom_only(name: &str, pattern: &str, replacement: &str) -> PiiFilter {
        PiiFilter::new(PiiFilterConfig {
            enabled: true,
            filter_email: false,
            filter_phone: false,
            filter_ssn: false,
            filter_credit_card: false,
            custom_patterns: vec![CustomPiiPattern {
                name: name.to_owned(),
                pattern: pattern.to_owned(),
                replacement: replacement.to_owned(),
            }],
        })
    }

    #[test]
    fn disabled_returns_borrowed() {
        let text = "email: user@example.com";
        let filter = default_filter();
        let result = filter.scrub(text);
        assert!(matches!(result, Cow::Borrowed(_)));
        assert_eq!(result, text);
    }

    #[test]
    fn disabled_has_pii_false() {
        assert!(!default_filter().has_pii("user@example.com"));
    }

    #[test]
    fn scrubs_email() {
        let filter = all_detectors();
        let result = filter.scrub("contact us at user@example.com please");
        assert_eq!(result, "contact us at [PII:email] please");
    }

    #[test]
    fn scrubs_tagged_email() {
        let filter = all_detectors();
        let result = filter.scrub("user+tag@sub.domain.org is the address");
        assert_eq!(result, "[PII:email] is the address");
    }

    #[test]
    fn does_not_match_at_localhost() {
        let text = "user@localhost should not match";
        let filter = all_detectors();
        let result = filter.scrub(text);
        assert_eq!(result, text, "user@localhost must not be matched");
    }

    #[test]
    fn does_not_match_versioned_domain() {
        let text = "template@v2.config should not match";
        let filter = all_detectors();
        let result = filter.scrub(text);
        assert_eq!(
            result, text,
            "v2.config must not be detected as email domain"
        );
    }

    #[test]
    fn does_not_match_db_at_localhost() {
        let text = "connect to db@localhost:5432";
        let filter = all_detectors();
        let result = filter.scrub(text);
        assert!(
            !result.contains("[PII:email]"),
            "localhost must not be detected as email: {result}"
        );
    }

    #[test]
    fn does_not_match_short_local() {
        let text = "a@b.co";
        let filter = all_detectors();
        let result = filter.scrub(text);
        assert_eq!(result, text, "single-char local part must not match");
    }

    #[test]
    fn scrubs_us_phone() {
        let filter = all_detectors();
        let result = filter.scrub("call 555-867-5309 for info");
        assert_eq!(result, "call [PII:phone] for info");
    }

    #[test]
    fn scrubs_us_phone_with_country_code() {
        // The leading `+` sits outside the match because the pattern starts at `\b`.
        let filter = all_detectors();
        let result = filter.scrub("call +1-800-555-1234 now");
        assert_eq!(result, "call +[PII:phone] now");
    }

    #[test]
    fn scrubs_ssn() {
        let filter = all_detectors();
        let result = filter.scrub("SSN: 123-45-6789 on file");
        assert_eq!(result, "SSN: [PII:ssn] on file");
    }

    #[test]
    fn scrubs_credit_card() {
        let filter = all_detectors();
        let result = filter.scrub("card: 4111 1111 1111 1111 expired");
        assert_eq!(result, "card: [PII:credit_card] expired");
    }

    #[test]
    fn scrubs_credit_card_dashes() {
        let filter = all_detectors();
        let result = filter.scrub("card 4111-1111-1111-1111");
        assert_eq!(result, "card [PII:credit_card]");
    }

    #[test]
    fn no_pii_returns_borrowed() {
        let text = "no sensitive data here";
        let filter = all_detectors();
        let result = filter.scrub(text);
        assert!(matches!(result, Cow::Borrowed(_)));
    }

    #[test]
    fn has_pii_detects_email() {
        let filter = all_detectors();
        assert!(filter.has_pii("reach user@example.com"));
        assert!(!filter.has_pii("no pii here"));
    }

    #[test]
    fn custom_pattern_scrubs() {
        let filter = custom_only("employee_id", r"EMP-\d{6}", "[PII:employee_id]");
        let result = filter.scrub("ID: EMP-123456 assigned");
        assert_eq!(result, "ID: [PII:employee_id] assigned");
    }

    #[test]
    fn invalid_custom_pattern_skipped() {
        // The broken custom rule is dropped; the built-in detectors still work.
        let broken = CustomPiiPattern {
            name: "bad".to_owned(),
            pattern: r"[invalid(".to_owned(),
            replacement: "[PII:bad]".to_owned(),
        };
        let filter = PiiFilter::new(PiiFilterConfig {
            enabled: true,
            custom_patterns: vec![broken],
            ..PiiFilterConfig::default()
        });
        let result = filter.scrub("user@example.com");
        assert_eq!(result, "[PII:email]");
    }

    #[test]
    fn empty_input_returns_borrowed() {
        let filter = all_detectors();
        let result = filter.scrub("");
        assert!(matches!(result, Cow::Borrowed(_)));
        assert_eq!(result, "");
    }

    #[test]
    fn scrubs_multiple_pii_types() {
        let input = "Email: user@example.com, SSN: 123-45-6789";
        let filter = all_detectors();
        let result = filter.scrub(input);
        assert!(
            result.contains("[PII:email]"),
            "email must be scrubbed: {result}"
        );
        assert!(
            result.contains("[PII:ssn]"),
            "SSN must be scrubbed: {result}"
        );
        assert!(
            !result.contains("user@example.com"),
            "raw email must not remain"
        );
        assert!(!result.contains("123-45-6789"), "raw SSN must not remain");
    }

    #[test]
    fn unicode_no_pii_returns_borrowed() {
        let text = "Привет мир, no PII here — €100";
        let filter = all_detectors();
        let result = filter.scrub(text);
        assert!(
            matches!(result, Cow::Borrowed(_)),
            "unicode text without PII must be Borrowed"
        );
    }

    #[test]
    fn is_enabled_true_when_enabled_with_patterns() {
        assert!(all_detectors().is_enabled());
    }

    #[test]
    fn is_enabled_false_when_disabled() {
        assert!(!default_filter().is_enabled());
    }

    #[test]
    fn is_enabled_false_when_all_builtin_off_and_no_custom() {
        let filter = builtin_only(false, false, false, false);
        assert!(!filter.is_enabled());
    }

    #[test]
    fn selective_email_only() {
        let filter = builtin_only(true, false, false, false);
        let result = filter.scrub("user@example.com and 555-867-5309");
        assert!(result.contains("[PII:email]"), "email scrubbed");
        assert!(
            result.contains("555-867-5309"),
            "phone must NOT be scrubbed when disabled"
        );
    }

    #[test]
    fn has_pii_detects_custom_pattern() {
        let filter = custom_only("token", r"TOKEN-\d+", "[PII:token]");
        assert!(filter.has_pii("auth TOKEN-42 used"));
        assert!(!filter.has_pii("no token here"));
    }

    #[test]
    fn scrubs_credit_card_bare() {
        let filter = all_detectors();
        let result = filter.scrub("card 4111111111111111 end");
        assert!(
            result.contains("[PII:credit_card]"),
            "bare 16-digit CC must be scrubbed: {result}"
        );
    }

    #[test]
    fn does_not_scrub_date_as_ssn() {
        let text = "date 12-01-2024 passed";
        let filter = builtin_only(false, false, true, false);
        let result = filter.scrub(text);
        assert_eq!(result, text, "date DD-MM-YYYY must not be detected as SSN");
    }
}