use std::collections::BTreeMap;
use std::path::Path;
use aho_corasick::{AhoCorasick, MatchKind};
use datasynth_core::distributions::text_taxonomy::PiiPlaceholderKind;
use regex_lite::Regex;
use crate::FingerprintError;
/// Denylist of PII strings, made of exact literals and regex patterns, each
/// mapped to a placeholder kind whose token replaces the matched text.
///
/// The `Default` value holds no literals and no patterns.
#[derive(Default)]
pub struct PiiDenylist {
/// Automaton over all exact literals; `None` when no literal was loaded.
exact_ac: Option<AhoCorasick>,
/// Replacement token for each literal, in the same order as the literals
/// handed to the automaton builder.
exact_replacements: Vec<&'static str>,
/// Compiled regex patterns paired with the placeholder kind they map to,
/// kept in the order they were loaded.
patterns: Vec<(Regex, PiiPlaceholderKind)>,
/// Number of distinct exact literals that were loaded.
n_literals: usize,
}
/// Compact debug view: reports only how many literals and patterns are loaded,
/// never the denylist entries themselves.
impl std::fmt::Debug for PiiDenylist {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let pattern_total = self.patterns.len();
        let mut summary = f.debug_struct("PiiDenylist");
        summary.field("n_literals", &self.n_literals);
        summary.field("n_patterns", &pattern_total);
        // The automaton and replacement table are intentionally left out,
        // hence the non-exhaustive marker in the output.
        summary.finish_non_exhaustive()
    }
}
/// Maps a placeholder-kind name to its `PiiPlaceholderKind` variant.
///
/// Surrounding whitespace is ignored and the comparison is case-sensitive.
/// Returns `None` for any name other than `patient`, `person`, `company` or
/// `street`.
fn parse_kind(s: &str) -> Option<PiiPlaceholderKind> {
    let name = s.trim();
    let kind = if name == "patient" {
        PiiPlaceholderKind::Patient
    } else if name == "person" {
        PiiPlaceholderKind::Person
    } else if name == "company" {
        PiiPlaceholderKind::Company
    } else if name == "street" {
        PiiPlaceholderKind::Street
    } else {
        return None;
    };
    Some(kind)
}
impl PiiDenylist {
    /// Loads a denylist from a tab-separated file.
    ///
    /// Blank lines and lines starting with `#` are skipped. Every other line
    /// must have the form `<entry>\t<kind>`, where `<kind>` is a name accepted
    /// by `parse_kind`. An entry wrapped in slashes (`/…/`) is compiled as a
    /// regex; any other entry is matched as an exact literal. When the same
    /// literal appears more than once, the kind from the last occurrence wins.
    ///
    /// # Errors
    ///
    /// Returns `FingerprintError::PiiDenylist` when:
    /// - the file cannot be read,
    /// - a line has no tab separator,
    /// - the kind is not recognised,
    /// - a regex fails to compile,
    /// - a regex matches the empty string (it would insert the placeholder at
    ///   every position of every scrubbed text),
    /// - the literal automaton cannot be built.
    pub fn load(path: &Path) -> Result<Self, FingerprintError> {
        let raw = std::fs::read_to_string(path)
            .map_err(|e| FingerprintError::PiiDenylist(format!("read {}: {e}", path.display())))?;

        // Prefixes a per-line diagnostic with `path:line` (1-based).
        let line_error = |lineno: usize, msg: String| {
            FingerprintError::PiiDenylist(format!("{}:{}: {msg}", path.display(), lineno + 1))
        };

        // A map rather than a list so repeated literals collapse to one entry.
        let mut exact_map: BTreeMap<String, PiiPlaceholderKind> = BTreeMap::new();
        let mut patterns = Vec::new();

        for (lineno, line) in raw.lines().enumerate() {
            let line = line.trim();
            if line.is_empty() || line.starts_with('#') {
                continue;
            }

            // The entry text is deliberately not echoed here: it is PII.
            let (lhs, rhs) = line.split_once('\t').ok_or_else(|| {
                line_error(
                    lineno,
                    "missing tab separator between entry and placeholder kind".to_string(),
                )
            })?;
            let (lhs, rhs) = (lhs.trim(), rhs.trim());

            let kind = parse_kind(rhs)
                .ok_or_else(|| line_error(lineno, format!("unknown placeholder kind {rhs:?}")))?;

            if let Some(inner) = lhs.strip_prefix('/').and_then(|s| s.strip_suffix('/')) {
                let re = Regex::new(inner)
                    .map_err(|e| line_error(lineno, format!("bad regex {inner:?}: {e}")))?;
                // Covers the empty pattern (`//`) as well as things like `x*`:
                // such a regex matches between all characters, so applying it
                // would scatter placeholders through otherwise clean text.
                if re.is_match("") {
                    return Err(line_error(
                        lineno,
                        format!("regex {inner:?} matches the empty string"),
                    ));
                }
                patterns.push((re, kind));
            } else if !lhs.is_empty() {
                exact_map.insert(lhs.to_string(), kind);
            }
        }

        let n_literals = exact_map.len();
        let (exact_ac, exact_replacements) = if exact_map.is_empty() {
            (None, Vec::new())
        } else {
            // Unzipping keeps literal `i` and replacement `i` index-aligned,
            // which is what `AhoCorasick::replace_all` relies on.
            let (literals, kinds): (Vec<String>, Vec<PiiPlaceholderKind>) =
                exact_map.into_iter().unzip();
            let ac = AhoCorasick::builder()
                // Prefer the longest literal when one is a prefix of another.
                .match_kind(MatchKind::LeftmostLongest)
                .build(&literals)
                .map_err(|e| FingerprintError::PiiDenylist(format!("aho-corasick build: {e}")))?;
            let replacements: Vec<&'static str> = kinds.iter().map(|k| k.token()).collect();
            (Some(ac), replacements)
        };

        Ok(Self {
            exact_ac,
            exact_replacements,
            patterns,
            n_literals,
        })
    }

    /// Returns a copy of `s` with every denylisted occurrence replaced by the
    /// token of its placeholder kind.
    ///
    /// Exact literals are replaced first, in one pass. The regex patterns then
    /// run one after another on the result, so a later pattern sees the
    /// placeholders inserted by earlier steps.
    pub fn apply(&self, s: &str) -> String {
        let mut out: String = match &self.exact_ac {
            Some(ac) => ac.replace_all(s, &self.exact_replacements),
            None => s.to_string(),
        };
        for (re, kind) in &self.patterns {
            // NOTE(review): the token is passed as a regex replacement string,
            // so a `$` inside it would be read as a capture reference.
            // Presumably tokens never contain `$` — confirm.
            //
            // `replace_all` returns a borrow when nothing matched; only swap
            // the buffer when the text changed, rather than copying it once
            // per non-matching pattern.
            let replaced = match re.replace_all(&out, kind.token()) {
                std::borrow::Cow::Owned(changed) => Some(changed),
                std::borrow::Cow::Borrowed(_) => None,
            };
            if let Some(changed) = replaced {
                out = changed;
            }
        }
        out
    }

    /// Returns `true` when the denylist holds neither literals nor patterns.
    pub fn is_empty(&self) -> bool {
        self.n_literals == 0 && self.patterns.is_empty()
    }

    /// Number of distinct exact literals in the denylist.
    pub fn literal_count(&self) -> usize {
        self.n_literals
    }

    /// Number of regex patterns in the denylist.
    pub fn pattern_count(&self) -> usize {
        self.patterns.len()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Writes `contents` to a new temp file and returns the handle, so the
    /// caller can keep the file around while loading from its path.
    fn write_tmp(contents: &str) -> NamedTempFile {
        let mut file = NamedTempFile::new().expect("tempfile");
        file.write_all(contents.as_bytes()).expect("write tempfile");
        file.flush().expect("flush tempfile");
        file
    }

    /// Literals and a regex from one file are all applied; clean text is untouched.
    #[test]
    fn load_and_apply_exact_and_regex() {
        let source = write_tmp(
            "# comment\n\
             Clarunis\tcompany\n\
             Inselspital\tcompany\n\
             /\\bKantonalbank\\b/\tcompany\n",
        );
        let denylist = PiiDenylist::load(source.path()).expect("load");

        let cases = [
            ("Kontokorrent Clarunis", "Kontokorrent {company}"),
            ("Darlehen Inselspital Bern", "Darlehen {company} Bern"),
            ("Basler Kantonalbank EUR", "Basler {company} EUR"),
            ("nothing to do here", "nothing to do here"),
        ];
        for &(input, expected) in &cases {
            assert_eq!(denylist.apply(input), expected);
        }
    }

    /// An unrecognised placeholder kind makes loading fail.
    #[test]
    fn malformed_line_is_an_error() {
        let source = write_tmp("Clarunis\tnot_a_kind\n");
        assert!(PiiDenylist::load(source.path()).is_err());
    }

    /// A path that does not exist makes loading fail.
    #[test]
    fn missing_file_is_an_error() {
        let missing = Path::new("/nonexistent/denylist.tsv");
        assert!(PiiDenylist::load(missing).is_err());
    }

    /// When one literal is a prefix of another, the longer one is replaced as a whole.
    #[test]
    fn longest_match_wins_when_prefixes_overlap() {
        let source = write_tmp(
            "Berner\tcompany\n\
             Berner Kantonalbank\tcompany\n",
        );
        let denylist = PiiDenylist::load(source.path()).expect("load");
        let scrubbed = denylist.apply("Konto Berner Kantonalbank Zürich");
        assert_eq!(scrubbed, "Konto {company} Zürich");
    }

    /// A denylist with only regex entries still loads and applies.
    #[test]
    fn pure_regex_denylist_works_without_literals() {
        let source = write_tmp("/\\bKantonalbank\\b/\tcompany\n");
        let denylist = PiiDenylist::load(source.path()).expect("load");
        assert_eq!(denylist.literal_count(), 0);
        assert_eq!(denylist.pattern_count(), 1);
        assert_eq!(
            denylist.apply("Basler Kantonalbank EUR"),
            "Basler {company} EUR"
        );
    }

    /// Smoke test: 10k literals over ~100 KB of input must finish within a generous bound.
    #[test]
    fn apply_scales_with_aho_corasick_not_with_denylist_size() {
        use std::time::{Duration, Instant};

        const CHUNK: &str = concat!(
            "Some neutral filler text without any pii in it. ",
            "Match LiteralEntry42 here. "
        );

        let tsv: String = (0..10_000)
            .map(|i| format!("LiteralEntry{i}\tcompany\n"))
            .collect();
        let source = write_tmp(&tsv);
        let denylist = PiiDenylist::load(source.path()).expect("load 10k-entry denylist");
        assert_eq!(denylist.literal_count(), 10_000);

        let mut input = String::new();
        while input.len() < 100_000 {
            input.push_str(CHUNK);
        }

        let started = Instant::now();
        let out = denylist.apply(&input);
        let elapsed = started.elapsed();

        assert!(
            out.contains("{company}"),
            "expected {{company}} substitutions"
        );
        assert!(
            elapsed < Duration::from_secs(5),
            "PiiDenylist::apply took {elapsed:?} on 10k literals × 100KB — likely fell back to O(N) loop"
        );
    }
}