use std::sync::OnceLock;
use regex::Regex;
use super::types::{Finding, FindingCategory, FindingKind, PiiCategory, Severity};
/// Callback that turns a matched substring into its redaction placeholder.
type RedactFn = fn(&str) -> String;

/// Callback that performs a secondary validity check on a matched substring.
type VerifyFn = fn(&str) -> bool;

/// One row of the PII detection table: what to look for, how severe a hit is,
/// and how to redact (and optionally double-check) the matched text.
struct PiiPattern {
    /// PII category reported for matches of this pattern.
    category: PiiCategory,
    /// Compiled expression used to locate candidate substrings.
    regex: Regex,
    /// Severity attached to every finding produced by this pattern.
    severity: Severity,
    /// Builds the redaction proposal from the full matched text.
    redactor: RedactFn,
    /// Optional post-match check; when present, matches it rejects are dropped.
    verifier: Option<VerifyFn>,
}
/// Lazily-initialised storage for the pattern table; filled on first call to `patterns()`.
static PATTERNS: OnceLock<Vec<PiiPattern>> = OnceLock::new();
fn patterns() -> &'static [PiiPattern] {
PATTERNS.get_or_init(|| {
vec![
PiiPattern {
category: PiiCategory::Ssn,
regex: Regex::new(
r"(?x)
\b
(?:
(?:00[1-9]|0[1-9]\d|[1-578]\d{2}|6[0-57-9]\d|66[0-57-9])
-
(?:0[1-9]|[1-9]\d)
-
\d{4}
)
\b",
)
.expect("ssn regex"),
severity: Severity::High,
redactor: |_| "XXX-XX-XXXX".to_string(),
verifier: None,
},
PiiPattern {
category: PiiCategory::Ein,
regex: Regex::new(r"\b\d{2}-\d{7}\b").expect("ein regex"),
severity: Severity::High,
redactor: |_| "[EIN-XX-XXXXXXX]".to_string(),
verifier: None,
},
PiiPattern {
category: PiiCategory::CreditCard,
regex: Regex::new(
r"(?x)
\b
(?:
4\d{3}(?:[\ -]?\d{4}){3} | # Visa 16
4\d{12} | # Visa 13 (legacy)
5[1-5]\d{2}(?:[\ -]?\d{4}){3} | # MC
2(?:2[2-9]\d|[3-6]\d{2}|7[01]\d|720)(?:[\ -]?\d{4}){3} | # MC 2-series
3[47]\d{2}[\ -]?\d{6}[\ -]?\d{5} | # Amex
6(?:011|5\d{2})(?:[\ -]?\d{4}){3} # Discover
)
\b",
)
.expect("cc regex"),
severity: Severity::Critical,
redactor: |s| {
let digits: String = s.chars().filter(|c| c.is_ascii_digit()).collect();
if digits.len() >= 4 {
format!("[CC-****-{}]", &digits[digits.len() - 4..])
} else {
"[CC-****]".to_string()
}
},
verifier: Some(luhn_check),
},
PiiPattern {
category: PiiCategory::Iban,
regex: Regex::new(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b").expect("iban regex"),
severity: Severity::High,
redactor: |s| {
if s.len() >= 4 {
format!("{}**[IBAN]**", &s[..4])
} else {
"[IBAN]".to_string()
}
},
verifier: Some(iban_mod97_check),
},
PiiPattern {
category: PiiCategory::Email,
regex: Regex::new(
r"(?i)\b[a-z0-9._%+-]+@[a-z0-9-]+(?:\.[a-z0-9-]+)*\.[a-z]{2,24}\b",
)
.expect("email regex"),
severity: Severity::Medium,
redactor: |_| "[EMAIL]".to_string(),
verifier: None,
},
PiiPattern {
category: PiiCategory::PhoneNumber,
regex: Regex::new(
r"(?x)
(?:
# E.164 international: + then 7-15 digits with optional separators
\+\d(?:[\ \-\.]?\d){6,14}
|
# US NANP with parens: (NNN) NNN-NNNN or (NNN)NNN-NNNN
\(\d{3}\)\ ?\d{3}[\ \-\.]\d{4}
|
# US NANP separated: NNN-NNN-NNNN or NNN.NNN.NNNN or NNN NNN NNNN
\b\d{3}[\ \-\.]\d{3}[\ \-\.]\d{4}\b
)",
)
.expect("phone regex"),
severity: Severity::Medium,
redactor: |_| "[PHONE]".to_string(),
verifier: None,
},
PiiPattern {
category: PiiCategory::IpAddress,
regex: Regex::new(
r"(?x)
\b
(?:
# IPv4
(?:25[0-5]|2[0-4]\d|[01]?\d?\d)
(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d?\d)){3}
|
# IPv6 full or compressed (loose form)
(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}
|
(?:[0-9a-fA-F]{1,4}:){1,7}:
|
(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}
)
\b",
)
.expect("ip regex"),
severity: Severity::Low,
redactor: |_| "[IP]".to_string(),
verifier: None,
},
PiiPattern {
category: PiiCategory::MacAddress,
regex: Regex::new(r"\b(?:[0-9A-Fa-f]{2}[:\-]){5}[0-9A-Fa-f]{2}\b")
.expect("mac regex"),
severity: Severity::Low,
redactor: |_| "[MAC]".to_string(),
verifier: None,
},
PiiPattern {
category: PiiCategory::Passport,
regex: Regex::new(
r"(?ix)
(?:passport(?:\ (?:no\.?|number|\#))?\s*[:\-]?\s*)
(\d{9})
\b",
)
.expect("passport regex"),
severity: Severity::High,
redactor: |_| "[PASSPORT]".to_string(),
verifier: None,
},
PiiPattern {
category: PiiCategory::DriversLicense,
regex: Regex::new(
r"(?ix)
(?:driver'?s?\ licen[cs]e|driver\ licen[cs]e|DL\#?|licen[cs]e\ \#?)
\s*[:\-]?\s*
(?:[A-Z]\d{7}|[A-Z]\d{12}|[A-Z]\d{11}|\d{8}|\d{9})
\b",
)
.expect("dl regex"),
severity: Severity::High,
redactor: |_| "[DL]".to_string(),
verifier: None,
},
PiiPattern {
category: PiiCategory::BankAccountNumber,
regex: Regex::new(
r"(?ix)
(?:account(?:\ (?:no\.?|number|\#))?|acct(?:\.|\ no\.?)?)
\s*[:\-]?\s*
(\d{8,17})
\b",
)
.expect("bank acct regex"),
severity: Severity::High,
redactor: |_| "[BANK-ACCT]".to_string(),
verifier: None,
},
PiiPattern {
category: PiiCategory::RoutingNumber,
regex: Regex::new(
r"(?ix)
(?:routing(?:\ (?:no\.?|number|\#))?|aba(?:\ \#?)?|rtn)
\s*[:\-]?\s*
(\d{9})
\b",
)
.expect("rtn regex"),
severity: Severity::High,
redactor: |_| "[RTN]".to_string(),
verifier: Some(aba_routing_check),
},
PiiPattern {
category: PiiCategory::DateOfBirth,
regex: Regex::new(
r"(?ix)
(?:dob|d\.o\.b\.?|date\ of\ birth|born(?:\ on)?|birthday|birth\ date)
\s*[:\-]?\s*
(?:
\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])
|
(?:0[1-9]|1[0-2])/(?:0[1-9]|[12]\d|3[01])/\d{4}
)
\b",
)
.expect("dob regex"),
severity: Severity::High,
redactor: |_| "[DOB]".to_string(),
verifier: None,
},
PiiPattern {
category: PiiCategory::NameAddressTuple,
regex: Regex::new(
r"(?x)
(?:Mr|Mrs|Ms|Dr|Prof|Mister|Madam)\.?
\s+
[A-Z][A-Za-z'\-]+
(?:\s+[A-Z][A-Za-z'\-]+){0,3}
.{0,80}?
\b\d+\s+[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+){0,3}
\s+
(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Lane|Ln|Way|Court|Ct)\.?
\b",
)
.expect("name+addr regex"),
severity: Severity::Critical,
redactor: |_| "[NAME+ADDR]".to_string(),
verifier: None,
},
]
})
}
/// Checks a candidate card number against the Luhn checksum.
///
/// Every character that is not a decimal digit is skipped, so separators such
/// as spaces and dashes are tolerated. Inputs with fewer than 12 digits are
/// rejected outright.
///
/// Returns `true` when the weighted digit sum is a multiple of 10.
fn luhn_check(s: &str) -> bool {
    let numerals: Vec<u32> = s.chars().filter_map(|ch| ch.to_digit(10)).collect();
    if numerals.len() < 12 {
        return false;
    }

    // Walking from the rightmost digit, every second digit is doubled and
    // folded back into a single digit by subtracting 9 when it overflows.
    let total: u32 = numerals
        .iter()
        .rev()
        .enumerate()
        .map(|(offset, &digit)| {
            if offset % 2 == 1 {
                let doubled = digit * 2;
                if doubled > 9 {
                    doubled - 9
                } else {
                    doubled
                }
            } else {
                digit
            }
        })
        .sum();

    total.is_multiple_of(10)
}
/// Validates an IBAN candidate with the mod-97 check.
///
/// ASCII whitespace is stripped first. The remaining text must be 15 to 34
/// bytes long and consist only of ASCII letters and digits; anything else
/// returns `false`. The first four bytes are moved to the end, letters are
/// expanded to the two-digit values 10..=35 (case-insensitively), and the
/// resulting number must leave remainder 1 when divided by 97.
fn iban_mod97_check(s: &str) -> bool {
    let compact: Vec<u8> = s
        .bytes()
        .filter(|byte| !byte.is_ascii_whitespace())
        .collect();
    if compact.len() < 15 || compact.len() > 34 {
        return false;
    }

    let (prefix, rest) = compact.split_at(4);

    // The remainder is carried digit by digit so the value never exceeds u32.
    // A letter contributes two decimal digits, hence the shift by 100.
    let residue = rest
        .iter()
        .chain(prefix.iter())
        .try_fold(0u32, |acc, &byte| match byte {
            b'0'..=b'9' => Some((acc * 10 + u32::from(byte - b'0')) % 97),
            b'a'..=b'z' | b'A'..=b'Z' => {
                let value = u32::from(byte.to_ascii_uppercase() - b'A') + 10;
                Some((acc * 100 + value) % 97)
            }
            _ => None,
        });

    residue == Some(1)
}
/// Validates a routing-number candidate with the 3-7-1 weighted checksum.
///
/// Non-digit characters are ignored. Exactly nine digits must remain, and the
/// weighted sum of those digits must be a multiple of 10.
fn aba_routing_check(s: &str) -> bool {
    // Per-position multipliers, repeating 3, 7, 1 across the nine digits.
    const WEIGHTS: [u32; 9] = [3, 7, 1, 3, 7, 1, 3, 7, 1];

    let numerals: Vec<u32> = s.chars().filter_map(|ch| ch.to_digit(10)).collect();
    if numerals.len() != WEIGHTS.len() {
        return false;
    }

    let checksum: u32 = numerals
        .iter()
        .zip(WEIGHTS.iter())
        .map(|(digit, weight)| digit * weight)
        .sum();

    checksum.is_multiple_of(10)
}
/// Scans `text` for PII and returns one finding per distinct matched span.
///
/// Every pattern from `patterns()` is run over the whole input. A match is
/// dropped when the pattern has a verifier and the verifier rejects the
/// matched text. Each surviving match becomes a [`Finding`] whose
/// `span_start`/`span_end` are the offsets reported by the regex match and
/// whose `redaction_proposal` comes from the pattern's redactor.
///
/// Findings are ordered by `span_start`, then by `category_priority`. When
/// several patterns report exactly the same span, only the first one in that
/// order is kept.
pub fn scan_for_pii(text: &str) -> Vec<Finding> {
    let mut out: Vec<Finding> = Vec::new();
    for pat in patterns() {
        for m in pat.regex.find_iter(text) {
            let matched = m.as_str();
            if pat.verifier.is_some_and(|verify| !verify(matched)) {
                continue;
            }
            out.push(Finding {
                kind: FindingKind::Pii,
                category: FindingCategory::Pii(pat.category),
                span_start: m.start(),
                span_end: m.end(),
                severity: pat.severity,
                redaction_proposal: Some((pat.redactor)(matched)),
            });
        }
    }

    // Stable sort; the priority lookup is a linear search, so compute each
    // key once instead of on every comparison.
    out.sort_by_cached_key(|finding| {
        (finding.span_start, category_priority(&finding.category))
    });

    // The sort key does not include `span_end`, so two findings with the same
    // span can be separated by one with the same start but a different end.
    // Compare against every already-kept finding sharing this start (they all
    // sit at the tail of `unique`) rather than only the immediate neighbour.
    let mut unique: Vec<Finding> = Vec::with_capacity(out.len());
    for finding in out {
        let already_reported = unique
            .iter()
            .rev()
            .take_while(|kept| kept.span_start == finding.span_start)
            .any(|kept| kept.span_end == finding.span_end);
        if !already_reported {
            unique.push(finding);
        }
    }
    unique
}
/// Ranks a finding category for tie-breaking; lower values sort first.
///
/// A PII category is ranked by its index within `PiiCategory::ALL`. A PII
/// category not present in that list, and every injection category, ranks
/// last (`u8::MAX`).
fn category_priority(c: &FindingCategory) -> u8 {
    match c {
        FindingCategory::Injection(_) => u8::MAX,
        FindingCategory::Pii(wanted) => {
            for (rank, candidate) in PiiCategory::ALL.iter().enumerate() {
                if candidate == wanted {
                    return rank as u8;
                }
            }
            u8::MAX
        }
    }
}
#[cfg(test)]
mod verifier_tests {
    use super::*;

    /// Card numbers with a valid Luhn checksum, with and without separators.
    #[test]
    fn luhn_known_good() {
        let samples = [
            "4111 1111 1111 1111",
            "5555-5555-5555-4444",
            "378282246310005",
            "6011111111111117",
        ];
        for sample in samples.iter() {
            assert!(luhn_check(sample), "expected Luhn pass for {sample:?}");
        }
    }

    /// Digit strings whose Luhn checksum is wrong.
    #[test]
    fn luhn_known_bad() {
        let samples = ["4111 1111 1111 1112", "0000 0000 0000 0001"];
        for sample in samples.iter() {
            assert!(!luhn_check(sample), "expected Luhn failure for {sample:?}");
        }
    }

    /// IBANs that satisfy the mod-97 check.
    #[test]
    fn iban_known_good() {
        let samples = [
            "GB82WEST12345698765432",
            "DE89370400440532013000",
            "FR1420041010050500013M02606",
        ];
        for sample in samples.iter() {
            assert!(iban_mod97_check(sample), "expected IBAN pass for {sample:?}");
        }
    }

    /// IBAN-shaped strings that fail the mod-97 check.
    #[test]
    fn iban_known_bad() {
        let samples = ["GB82WEST12345698765431", "XX99NOTANIBANATALL00"];
        for sample in samples.iter() {
            assert!(
                !iban_mod97_check(sample),
                "expected IBAN failure for {sample:?}"
            );
        }
    }

    /// Routing numbers that satisfy the ABA checksum.
    #[test]
    fn aba_known_good() {
        let samples = ["011000015", "121000358"];
        for sample in samples.iter() {
            assert!(aba_routing_check(sample), "expected ABA pass for {sample:?}");
        }
    }

    /// Nine-digit strings that fail the ABA checksum.
    #[test]
    fn aba_known_bad() {
        let samples = ["123456789", "011000016", "121000359"];
        for sample in samples.iter() {
            assert!(
                !aba_routing_check(sample),
                "expected ABA failure for {sample:?}"
            );
        }
    }
}