datasynth_core/distributions/
text_taxonomy.rs

1//! SP6 — corpus text taxonomy: PII-safe placeholder grammar + conditional
2//! template pools keyed on (source, account-class).
3//!
4//! Replaces SP4.4's verbatim source-keyed `TextTemplatePrior`. Generated text
5//! is synthetic-by-construction: tokenized real templates whose PII spans are
6//! fillable placeholders. Line text is conditioned on (source, account-class);
7//! header text is source-keyed; CoA descriptions are per-account templates
8//! filled once per run.
9
10use std::collections::BTreeMap;
11
12use regex::Regex;
13use serde::{Deserialize, Serialize};
14
/// A PII-placeholder kind the generator must resolve to a concrete value.
/// Structural placeholders (`{year}`, `{quarter}`, `{month}`, `{date}`,
/// `{digits}`) are NOT in this enum — `PlaceholderGrammar::fill` handles those.
///
/// Each variant corresponds to exactly one template token; the mapping in
/// both directions lives in `token` / `from_token`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum PiiPlaceholderKind {
    /// Written in templates as `{patient}`.
    Patient,
    /// Written in templates as `{person}`.
    Person,
    /// Written in templates as `{company}`.
    Company,
    /// Written in templates as `{street}`.
    Street,
}
25
26impl PiiPlaceholderKind {
27    /// The placeholder token as it appears in a template string.
28    pub fn token(self) -> &'static str {
29        match self {
30            PiiPlaceholderKind::Patient => "{patient}",
31            PiiPlaceholderKind::Person => "{person}",
32            PiiPlaceholderKind::Company => "{company}",
33            PiiPlaceholderKind::Street => "{street}",
34        }
35    }
36
37    /// Parse a placeholder token to its kind. `None` for structural or unknown.
38    pub fn from_token(token: &str) -> Option<Self> {
39        match token {
40            "{patient}" => Some(PiiPlaceholderKind::Patient),
41            "{person}" => Some(PiiPlaceholderKind::Person),
42            "{company}" => Some(PiiPlaceholderKind::Company),
43            "{street}" => Some(PiiPlaceholderKind::Street),
44            _ => None,
45        }
46    }
47}
48
/// Resolves a PII-placeholder kind to a concrete value. Implemented by the
/// generator (wired to master data) and by `SyntheticExampleResolver` (used at
/// extraction time, where master data does not exist).
pub trait PlaceholderResolver {
    /// Resolve a PII-placeholder kind to a concrete value.
    ///
    /// `kind` is the placeholder being filled and `rng` is the caller's
    /// random source; an implementation is free to ignore `rng` (the built-in
    /// `SyntheticExampleResolver` does). The receiver is `&mut self`, so an
    /// implementation may keep state between calls. The returned string is
    /// inserted into the output in place of the placeholder token.
    fn resolve(&mut self, kind: PiiPlaceholderKind, rng: &mut dyn rand::Rng) -> String;
}
56
57/// A built-in resolver emitting obviously-synthetic tokens. Used to produce
58/// `TemplateEntry::synthetic_example` at extraction time and in tests.
59pub struct SyntheticExampleResolver;
60
61impl PlaceholderResolver for SyntheticExampleResolver {
62    fn resolve(&mut self, kind: PiiPlaceholderKind, _rng: &mut dyn rand::Rng) -> String {
63        match kind {
64            PiiPlaceholderKind::Patient => "Example Patient".to_string(),
65            PiiPlaceholderKind::Person => "Example Person".to_string(),
66            PiiPlaceholderKind::Company => "Example GmbH".to_string(),
67            PiiPlaceholderKind::Street => "Example Street 1".to_string(),
68        }
69    }
70}
71
/// One residual-PII scan hit, as returned by
/// `PlaceholderGrammar::residual_pii_scan`.
#[derive(Debug, Clone, PartialEq)]
pub struct PiiHit {
    /// Static label of the pattern that matched (e.g. `"patient_record"`).
    /// The scan emits `"patient_record"`, `"person_star"`, `"title"`,
    /// `"initial_surname"`, `"surname_initial"` and `"given_name"`.
    pub pattern: &'static str,
    /// The substring that matched.
    pub matched: String,
}
80
/// A single PII-safe text template.
///
/// `Default` yields an empty template with probability `0.0` and an empty
/// example.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TemplateEntry {
    /// Tokenized, PII-safe template string. May contain `{…}` placeholders
    /// that `PlaceholderGrammar::fill` expands.
    pub template: String,
    /// Probability mass within the owning pool (renormalised after filtering).
    pub probability: f64,
    /// The template run through `fill` once at extraction time with a
    /// fixed-seed RNG and `SyntheticExampleResolver` — a debug/audit example
    /// carrying ZERO corpus content. Replaces SP4.4's verbatim `example` field.
    pub synthetic_example: String,
}
93
/// A weighted pool of templates for one `(source, class)` or `source` key.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TemplatePool {
    /// The pool's templates; each entry carries its own `probability`.
    /// NOTE(review): no ordering of this vector is enforced by the type —
    /// confirm with the extractor whether consumers may rely on one.
    pub templates: Vec<TemplateEntry>,
    /// Total observations underpinning the pool (pre-truncation).
    pub n: usize,
}
101
/// Extraction metadata for a `TextTaxonomyPrior`.
///
/// The values are recorded by the extractor, which is not part of this
/// file; the per-field notes marked "presumably" are assumptions to confirm
/// against it.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TaxonomyMeta {
    /// Presumably the minimum number of observations a template needed to be
    /// kept in a pool — TODO confirm against the extractor.
    pub min_occurrences: usize,
    /// Presumably the cap applied when a pool is truncated (compare
    /// `TemplatePool::n`, documented as "pre-truncation") — TODO confirm.
    pub max_templates_per_pool: usize,
    /// Class-granularity tier used for `line_pools` keys (e.g. `"iso21378_l2"`).
    pub class_tier: String,
    /// Presumably the number of client inputs that contributed to the
    /// extraction — TODO confirm.
    pub n_client_inputs: usize,
}
111
/// SP6 — corpus text taxonomy prior. Replaces `TextTemplatePrior`.
///
/// All three maps are `BTreeMap`s, so iteration (and therefore serialised
/// output) is ordered by key.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TextTaxonomyPrior {
    /// Line text keyed on the flattened string `"SOURCE|CLASS"`. `CLASS` is the
    /// ISO 21378 Level-2 account class; lines whose account has no resolvable
    /// class are grouped under `"SOURCE|_unknown_"`. Build keys with
    /// `TextTaxonomyPrior::line_key`.
    pub line_pools: BTreeMap<String, TemplatePool>,
    /// Header text keyed on source only (a JE header has no single account).
    pub header_pools: BTreeMap<String, TemplatePool>,
    /// CoA description templates keyed on account number — one per account.
    /// Unlike the other two maps, the value is a single `TemplateEntry`
    /// rather than a pool.
    pub coa_pools: BTreeMap<String, TemplateEntry>,
    /// Extraction metadata.
    pub meta: TaxonomyMeta,
}
126
127impl TextTaxonomyPrior {
128    /// Sentinel class component used when a line's account has no resolvable
129    /// ISO 21378 class.
130    pub const UNKNOWN_CLASS: &'static str = "_unknown_";
131
132    /// Build the flattened `"SOURCE|CLASS"` key used by `line_pools`.
133    pub fn line_key(source: &str, account_class: &str) -> String {
134        format!("{source}|{account_class}")
135    }
136}
137
/// Stateless tokenize / fill / scan engine. No dependency on the generator or
/// fingerprint crates — locale and master-data wiring arrive via a
/// `PlaceholderResolver` at fill time.
///
/// The type is a field-less unit struct: it only namespaces the associated
/// functions `fill`, `residual_pii_scan` and `tokenize`, none of which take
/// `self`.
pub struct PlaceholderGrammar;
142
143// --- residual-PII scan + tokenize Phase-A statics ---
144//
145// The `.unwrap()` on each regex is sound: the literals are compile-time
146// constants whose well-formedness is pinned by the test suite. Grouped into
147// a private submodule so the crate-level `#![deny(clippy::unwrap_used)]`
148// is bypassed with a single module-level `#![allow]` rather than ten
149// per-static attributes.
150
151mod scan_patterns {
152    #![allow(clippy::unwrap_used)]
153
154    use regex::Regex;
155    use std::sync::LazyLock;
156
157    /// Patient record marker: `G:dd.dd.dd`. Presence implies an un-stripped name.
158    pub(super) static RE_PATIENT: LazyLock<Regex> =
159        LazyLock::new(|| Regex::new(r"G:\s*\d{2}\.\d{2}\.\d{2}").unwrap());
160    /// `*Lastname,Firstname` star record, anchored at start.
161    pub(super) static RE_PERSON_STAR: LazyLock<Regex> = LazyLock::new(|| {
162        Regex::new(r"^\*[A-ZÄÖÜ][\w\u{00C0}-\u{017F}.'\-]+\s*,\s*[A-ZÄÖÜ]").unwrap()
163    });
164    /// Honorific / title followed by a name.
165    pub(super) static RE_TITLE: LazyLock<Regex> = LazyLock::new(|| {
166        Regex::new(r"\b(Prof|Dr|Dipl|Pfr|Pfarrer|Herr|Frau|Hr|Fr|med|iur|lic)\.\s").unwrap()
167    });
168    /// `Initial. Surname` (e.g. `U. Frey`).
169    pub(super) static RE_INITIAL_SURNAME: LazyLock<Regex> =
170        LazyLock::new(|| Regex::new(r"\b[A-ZÄÖÜ]\.\s*[A-ZÄÖÜ][a-zäöüß]{2,}\b").unwrap());
171    /// `Surname Initial.` (e.g. `Frey U.`, `Mueller H.`). The trailing
172    /// `[A-Z].` must be followed by whitespace or end-of-string, so that
173    /// legal-entity abbreviations like `Europe B.V.` / `Suisse S.A.` /
174    /// `Nespresso S.A.` do NOT match (the period there is followed by
175    /// another capital letter — part of the abbreviation, not a name initial).
176    /// Pre-T16 corpus scan found ~90% of `surname_initial` raw hits were
177    /// legal-entity suffixes, not person names.
178    pub(super) static RE_SURNAME_INITIAL: LazyLock<Regex> =
179        LazyLock::new(|| Regex::new(r"\b[A-ZÄÖÜ][a-zäöüß]{2,}\s+[A-ZÄÖÜ]\.(?:\s|$)").unwrap());
180
181    // --- tokenize Phase-A statics ---
182
183    /// `dd.mm.yy` date triplet inside a patient `G:`/`E:`/`A:` record.
184    pub(super) static RE_GEA_DATE: LazyLock<Regex> =
185        LazyLock::new(|| Regex::new(r"([GEA]):\s*\d{2}\.\d{2}\.\d{2}").unwrap());
186    /// Street address: a capitalised word ending in a street-type suffix, then
187    /// a number. Case-insensitive on the suffix.
188    pub(super) static RE_STREET: LazyLock<Regex> = LazyLock::new(|| {
189        Regex::new(r"(?i)\b[A-ZÄÖÜ][\w\u{00C0}-\u{017F}.\-]*(?:str\.|strasse|gasse|weg|platz)\s*\d+[A-Za-z]?\b").unwrap()
190    });
191    /// 4-digit year 19xx / 20xx not embedded in a longer digit run.
192    pub(super) static RE_YEAR: LazyLock<Regex> =
193        LazyLock::new(|| Regex::new(r"\b(?:19|20)\d{2}\b").unwrap());
194    /// Quarter marker Q1–Q4 (case-insensitive), not followed by another digit.
195    pub(super) static RE_QUARTER: LazyLock<Regex> =
196        LazyLock::new(|| Regex::new(r"(?i)\bQ[1-4]\b").unwrap());
197    /// Run of >=4 digits.
198    pub(super) static RE_DIGITS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\d{4,}").unwrap());
199
200    /// A run of >=2 capitalised, whitespace-separated tokens (Unicode-aware).
201    /// Candidate person-name span; confirmed only if a token is a known given
202    /// name (see `given_names`). German capitalises all nouns, so case alone
203    /// can't separate a surname from a common noun — the given-name gazetteer
204    /// is the anchor, and we redact the whole run (safe over-redaction) rather
205    /// than risk leaking the adjacent surname.
206    pub(super) static RE_NAME_RUN: LazyLock<Regex> =
207        LazyLock::new(|| Regex::new(r"\p{Lu}[\p{L}.'\-]*(?:\s+\p{Lu}[\p{L}.'\-]*)+").unwrap());
208}
209
210/// Given-name gazetteer for Phase-A name tokenization. Generic given names
211/// (country-pack union + Swiss/DE/FR/IT supplement) — NOT PII, a name
212/// dictionary like city names. Used to anchor `Firstname Lastname` detection
213/// that the initial/title/patient regexes miss.
214mod given_names {
215    use std::collections::HashSet;
216    use std::sync::LazyLock;
217
218    /// Normalize a token for gazetteer lookup: lowercase + drop umlaut/accent
219    /// characters entirely. The corpus text has umlauts STRIPPED (e.g.
220    /// `Jürg`→`Jrg`, `Rückstellung`→`Rckstellung`), so a gazetteer entry with
221    /// proper umlauts would never match the corpus form — normalizing both
222    /// sides the same way bridges that.
223    pub(super) fn normalize(s: &str) -> String {
224        let mut out = String::with_capacity(s.len());
225        for c in s.chars() {
226            match c {
227                // Umlauts: the corpus drops them entirely (Jürg→Jrg, Löhne→Lhne).
228                'ä' | 'ö' | 'ü' | 'Ä' | 'Ö' | 'Ü' => {}
229                // Accented latin: map to the base letter (Régis→Regis).
230                'é' | 'è' | 'ê' | 'ë' => out.push('e'),
231                'à' | 'â' | 'á' => out.push('a'),
232                'î' | 'ï' | 'í' => out.push('i'),
233                'ô' | 'ó' => out.push('o'),
234                'û' | 'ú' => out.push('u'),
235                'ç' => out.push('c'),
236                'ñ' => out.push('n'),
237                'ß' => out.push_str("ss"),
238                _ => out.extend(c.to_lowercase()),
239            }
240        }
241        out
242    }
243
244    pub(super) static GIVEN_NAMES: LazyLock<HashSet<String>> = LazyLock::new(|| {
245        include_str!("../../resources/given_names.txt")
246            .lines()
247            .map(str::trim)
248            .filter(|l| !l.is_empty() && !l.starts_with('#'))
249            .map(normalize)
250            .filter(|n| !n.is_empty())
251            .collect()
252    });
253
254    /// True if any sub-token of `run` is a known given name. Splits on
255    /// whitespace AND intra-token separators (`-`, `/`, `.`, `,`) so
256    /// compound/prefixed forms like `Hans-Rudolf` or `ESD-Roger` are matched
257    /// part-by-part. Each part is normalized (umlaut-stripped + lowercased)
258    /// before lookup.
259    pub(super) fn run_has_given_name(run: &str) -> bool {
260        run.split(|c: char| c.is_whitespace() || matches!(c, '-' | '/' | '.' | ',' | '_'))
261            .any(|part| {
262                let cleaned = part.trim_matches(|c: char| !c.is_alphabetic());
263                !cleaned.is_empty() && GIVEN_NAMES.contains(&normalize(cleaned))
264            })
265    }
266}
267
268use scan_patterns::{
269    RE_DIGITS, RE_GEA_DATE, RE_INITIAL_SURNAME, RE_NAME_RUN, RE_PATIENT, RE_PERSON_STAR,
270    RE_QUARTER, RE_STREET, RE_SURNAME_INITIAL, RE_TITLE, RE_YEAR,
271};
272
/// Month names (English + German, full + common abbreviations), roughly
/// longest-first.
///
/// The ordering is approximate, not a strict sort by length (`Februar`
/// precedes the longer `Dezember`). `replace_months` only accepts a name
/// when it is not adjacent to another alphabetic character, so a short name
/// can never win inside a longer one regardless of order.
///
/// NOTE(review): the three-letter abbreviations are the English set; the
/// German-only forms (`Okt`, `Dez`, and an umlaut-stripped `Mrz`) are not
/// listed — confirm whether that is intentional for this corpus.
const MONTH_NAMES: &[&str] = &[
    "September",
    "Februar",
    "Dezember",
    "November",
    "February",
    "December",
    "January",
    "October",
    "Januar",
    "Oktober",
    "August",
    "März",
    "Maerz",
    "April",
    "March",
    "Juni",
    "Juli",
    "June",
    "July",
    "Mai",
    "May",
    "Jan",
    "Feb",
    "Mar",
    "Apr",
    "Jun",
    "Jul",
    "Aug",
    "Sep",
    "Oct",
    "Nov",
    "Dec",
];
308
309impl PlaceholderGrammar {
310    /// Fill a template to a concrete string. Structural placeholders
311    /// (`{year}`, `{quarter}`, `{month}`, `{date}`, `{digits}`) are filled
312    /// internally from `rng`; PII placeholders are delegated to `resolver`.
313    /// Unknown `{…}` tokens are emitted verbatim.
314    pub fn fill<R: rand::Rng>(
315        template: &str,
316        resolver: &mut dyn PlaceholderResolver,
317        rng: &mut R,
318    ) -> String {
319        use rand::RngExt;
320        if template.is_empty() {
321            return String::new();
322        }
323        let mut out = String::with_capacity(template.len() + 16);
324        let mut rest = template;
325        while let Some(open) = rest.find('{') {
326            out.push_str(&rest[..open]);
327            rest = &rest[open..];
328            let Some(close) = rest.find('}') else {
329                // unbalanced — emit the remainder verbatim
330                out.push_str(rest);
331                return out;
332            };
333            let token = &rest[..=close];
334            rest = &rest[close + 1..];
335            if let Some(kind) = PiiPlaceholderKind::from_token(token) {
336                let resolved = resolver.resolve(kind, rng);
337                out.push_str(&resolved);
338                continue;
339            }
340            match token {
341                "{year}" => {
342                    let y: u32 = rng.random_range(2018..=2024);
343                    out.push_str(&y.to_string());
344                }
345                "{quarter}" => {
346                    let q: u32 = rng.random_range(1..=4);
347                    out.push('Q');
348                    out.push_str(&q.to_string());
349                }
350                "{month}" => {
351                    const MONTHS: &[&str] = &[
352                        "January",
353                        "February",
354                        "March",
355                        "April",
356                        "May",
357                        "June",
358                        "July",
359                        "August",
360                        "September",
361                        "October",
362                        "November",
363                        "December",
364                    ];
365                    out.push_str(MONTHS[rng.random_range(0..MONTHS.len())]);
366                }
367                "{date}" => {
368                    let d: u32 = rng.random_range(1..=28);
369                    let m: u32 = rng.random_range(1..=12);
370                    let y: u32 = rng.random_range(2018..=2024);
371                    // Use ISO-style yyyy-mm-dd so the output never matches
372                    // RE_PATIENT's `G:\s*\d{2}\.\d{2}\.\d{2}` (dot-separated).
373                    out.push_str(&format!("{y}-{m:02}-{d:02}"));
374                }
375                "{digits}" => {
376                    let n = rng.random_range(4..=8);
377                    for _ in 0..n {
378                        out.push(char::from(b'0' + rng.random_range(0u8..10)));
379                    }
380                }
381                _ => out.push_str(token), // unknown — verbatim
382            }
383        }
384        out.push_str(rest);
385        out
386    }
387
388    /// Scan a string for residual PII patterns. Returns one hit per pattern
389    /// that matches; an empty result means the string is clean. Patterns that
390    /// detect PII-bearing *shapes* — used as a hard gate at extraction and in
391    /// CI. Templates whose PII spans are already placeholders (`{person}`,
392    /// `{patient}`, …) do not match these patterns.
393    pub fn residual_pii_scan(s: &str) -> Vec<PiiHit> {
394        let mut hits = Vec::new();
395        let checks: &[(&'static str, &Regex)] = &[
396            ("patient_record", &RE_PATIENT),
397            ("person_star", &RE_PERSON_STAR),
398            ("title", &RE_TITLE),
399            ("initial_surname", &RE_INITIAL_SURNAME),
400            ("surname_initial", &RE_SURNAME_INITIAL),
401        ];
402        for (label, re) in checks {
403            if let Some(m) = re.find(s) {
404                hits.push(PiiHit {
405                    pattern: label,
406                    matched: m.as_str().to_string(),
407                });
408            }
409        }
410        // `Firstname Lastname` / `Lastname Firstname` — a capitalised run
411        // anchored by a known given name. Catches the plain two-word names the
412        // initial/title/patient regexes structurally miss (the SP6 leak class).
413        for m in RE_NAME_RUN.find_iter(s) {
414            if given_names::run_has_given_name(m.as_str()) {
415                hits.push(PiiHit {
416                    pattern: "given_name",
417                    matched: m.as_str().to_string(),
418                });
419                break;
420            }
421        }
422        hits
423    }
424
425    /// Phase A — automated structural placeholder-ization. Raw corpus string to
426    /// a PII-safe template using only deterministic structural rules. Phase B
427    /// (curated denylist for fuzzy proper nouns) is applied by the extractor.
428    ///
429    /// Rules, in order:
430    /// 1. Patient `G:`-record: strip everything up to the first `G:dd.dd.dd`
431    ///    marker (the name region, regardless of any `G` in the name), prepend
432    ///    `*{patient} `, then replace each `G:/E:/A:dd.dd.dd` with the
433    ///    letter-preserving form `G:{date}` / `E:{date}` / `A:{date}`.
434    /// 2. `*Lastname,Firstname` star record: replace the matched span with
435    ///    `*{person}`.
436    /// 3. Street address: replace with `{street}`.
437    /// 4. 4-digit years -> `{year}`; `Q1`-`Q4` -> `{quarter}`; month names ->
438    ///    `{month}`; runs of >=4 digits -> `{digits}`.
439    pub fn tokenize(s: &str) -> String {
440        let t = s.trim();
441        if t.is_empty() {
442            return String::new();
443        }
444
445        // Rules 1 & 2 produce a *staged* prefix-handled string, then fall
446        // through to the common tail (street + name-run + structural) so a
447        // person name in the SUFFIX of a patient/star record is still caught
448        // — they previously `return`ed early, leaking e.g. a trailing
449        // `Firstname Lastname` after a `G:` patient marker.
450        let staged: String = if let Some(m) = RE_PATIENT.find(t) {
451            // Rule 1 — patient record. Slice off the name region up to the
452            // date marker; date-stamp the G:/E:/A: triplets.
453            let from_marker = &t[m.start()..];
454            let dated = RE_GEA_DATE.replace_all(from_marker, "$1:{date}");
455            format!("*{{patient}} {dated}").trim().to_string()
456        } else if let Some(m) = RE_PERSON_STAR.find(t) {
457            // Rule 2 — star person record. Replace the matched span with
458            // *{person}, keep any suffix, drop the dangling firstname fragment.
459            let mut out = String::with_capacity(t.len());
460            out.push_str("*{person}");
461            out.push_str(&t[m.end()..]);
462            trim_leading_name_fragment(&out)
463        } else {
464            t.to_string()
465        };
466
467        // Rule 3 — street address.
468        let staged = RE_STREET.replace_all(&staged, "{street}").into_owned();
469
470        // Rule 3.5 — `Firstname Lastname` / `Lastname Firstname` person names.
471        // A capitalised run anchored by a known given name collapses to
472        // `{person}`. German capitalises all nouns, so we cannot tell a
473        // surname from a description noun by case — redact the whole run
474        // (safe over-redaction) rather than leak the adjacent surname.
475        let staged = RE_NAME_RUN
476            .replace_all(&staged, |caps: &regex::Captures| {
477                let run = &caps[0];
478                if given_names::run_has_given_name(run) {
479                    "{person}".to_string()
480                } else {
481                    run.to_string()
482                }
483            })
484            .into_owned();
485
486        // Rule 4 — structural / temporal.
487        let staged = RE_YEAR.replace_all(&staged, "{year}").into_owned();
488        let staged = RE_QUARTER.replace_all(&staged, "{quarter}").into_owned();
489        let staged = replace_months(&staged);
490        RE_DIGITS.replace_all(&staged, "{digits}").into_owned()
491    }
492}
493
494/// Replace month names with `{month}` at word boundaries (longest-first so
495/// "September" wins over a hypothetical "Sep" prefix).
496fn replace_months(s: &str) -> String {
497    let mut result = s.to_string();
498    for name in MONTH_NAMES {
499        // Word-boundary replace, case-sensitive (month names are capitalised
500        // in this corpus). Build a fresh string to avoid re-matching `{month}`.
501        let mut out = String::with_capacity(result.len());
502        let nlen = name.len();
503        let mut i = 0;
504        while i < result.len() {
505            if result[i..].starts_with(name) {
506                let prev_alpha = i > 0
507                    && result[..i]
508                        .chars()
509                        .next_back()
510                        .map(|c| c.is_alphabetic())
511                        .unwrap_or(false);
512                let next_alpha = result[i + nlen..]
513                    .chars()
514                    .next()
515                    .map(|c| c.is_alphabetic())
516                    .unwrap_or(false);
517                if !prev_alpha && !next_alpha {
518                    out.push_str("{month}");
519                    i += nlen;
520                    continue;
521                }
522            }
523            // push one char
524            let ch_len = result[i..]
525                .chars()
526                .next()
527                .map(|c| c.len_utf8())
528                .unwrap_or(1);
529            out.push_str(&result[i..i + ch_len]);
530            i += ch_len;
531        }
532        result = out;
533    }
534    result
535}
536
/// Remove the leftover first-name fragment that directly follows `*{person}`.
///
/// `RE_PERSON_STAR` matches only `*Lastname,F`, so after substitution the
/// string reads `*{person}irstname …`. This strips ONE contiguous alphabetic
/// run right after the prefix plus AT MOST one following separator (a comma
/// or a space). Later words are description content and are kept. If
/// anything remains, it is re-attached after a single space; otherwise the
/// bare prefix is returned. Input that does not start with `*{person}` is
/// returned unchanged.
fn trim_leading_name_fragment(s: &str) -> String {
    const PREFIX: &str = "*{person}";

    let Some(tail) = s.strip_prefix(PREFIX) else {
        return s.to_owned();
    };

    // The first-name completion: leading alphabetic characters only.
    let after_name = tail.trim_start_matches(char::is_alphabetic);
    // At most one separator; a closure pattern strips a single character.
    let remainder = after_name
        .strip_prefix(|c: char| c == ',' || c == ' ')
        .unwrap_or(after_name);

    if remainder.is_empty() {
        PREFIX.to_owned()
    } else {
        format!("{PREFIX} {remainder}")
    }
}
571
572#[cfg(test)]
573mod tests {
574    use super::*;
575    use rand::SeedableRng;
576
577    #[test]
578    fn residual_scan_flags_patient_record() {
579        let hits = PlaceholderGrammar::residual_pii_scan("*Gambon,Laurin G:01.02.03 E:04.05.06");
580        assert!(
581            hits.iter().any(|h| h.pattern == "patient_record"),
582            "expected patient_record hit, got {hits:?}"
583        );
584    }
585
586    #[test]
587    fn residual_scan_flags_person_shapes() {
588        // star record
589        assert!(PlaceholderGrammar::residual_pii_scan("*Mueller,Hans")
590            .iter()
591            .any(|h| h.pattern == "person_star"));
592        // initial + surname
593        assert!(PlaceholderGrammar::residual_pii_scan("Forschung U. Frey")
594            .iter()
595            .any(|h| h.pattern == "initial_surname"));
596        // title
597        assert!(
598            PlaceholderGrammar::residual_pii_scan("Kontokorrent Prof. Dr. M. Buess")
599                .iter()
600                .any(|h| h.pattern == "title")
601        );
602        // surname + initial (e.g. "Mueller H." in a description)
603        assert!(
604            PlaceholderGrammar::residual_pii_scan("Konsultation Mueller H.")
605                .iter()
606                .any(|h| h.pattern == "surname_initial")
607        );
608    }
609
610    #[test]
611    fn residual_scan_passes_clean_templates() {
612        for clean in [
613            "Rechnung {company}",
614            "Mieten {month}.{year}",
615            "ARIBA_ASN",
616            "Darlehen {person}",
617            "*{patient} G:{date} E:{date} A:{date}",
618            "Umbuchung Anlage",
619        ] {
620            assert!(
621                PlaceholderGrammar::residual_pii_scan(clean).is_empty(),
622                "false positive on clean template: {clean:?}"
623            );
624        }
625    }
626
627    #[test]
628    fn residual_scan_excludes_legal_entity_suffixes() {
629        // Legal-entity abbreviations like B.V. / S.A. / S.r.l. have a
630        // capital letter immediately following the period. The
631        // `surname_initial` regex must NOT match these — they're
632        // corporate-entity markers, not person initials. (Pre-T16 corpus
633        // scan found ~90% of raw `surname_initial` hits were exactly
634        // this shape, e.g. `Acme S.A.`, `Globex Europe B.V.` — only the
635        // shape matters for this test, names are fictional.)
636        for legal in [
637            "Acme Europe B.V.",
638            "Globex Suisse S.A.",
639            "Initech S.A. Lugano",
640            "Switzerland S.A.",
641        ] {
642            assert!(
643                PlaceholderGrammar::residual_pii_scan(legal)
644                    .iter()
645                    .all(|h| h.pattern != "surname_initial"),
646                "must not flag legal-entity suffix in: {legal:?}"
647            );
648        }
649        // But a legitimate surname+initial at end-of-string still matches.
650        assert!(
651            PlaceholderGrammar::residual_pii_scan("Patient consult Mueller H.")
652                .iter()
653                .any(|h| h.pattern == "surname_initial"),
654            "legitimate surname-initial at end-of-string must still match"
655        );
656    }
657
658    /// SP6 leak class: plain `Firstname Lastname` / `Lastname Firstname` —
659    /// the form the initial/title/patient regexes structurally miss and that
660    /// shipped 285 real names in the first bundle cut. Fictional surnames
661    /// (Mustermann/Beispiel) + real generic given names exercise the shape.
662    #[test]
663    fn residual_scan_flags_given_name_runs() {
664        for s in [
665            "Beratung Marc Mustermann",        // desc + given + surname
666            "Erbschaft Anna Beispiel",         // desc + given + surname
667            "Mustermann Thomas Guthaben",      // surname-first + given
668            "Florian Beispiel, Verzugszinsen", // given + surname, comma
669        ] {
670            let hits = PlaceholderGrammar::residual_pii_scan(s);
671            assert!(
672                hits.iter().any(|h| h.pattern == "given_name"),
673                "must flag given-name run in: {s:?} (got {hits:?})"
674            );
675        }
676    }
677
678    #[test]
679    fn tokenize_collapses_person_name_runs() {
680        // German capitalises all nouns, so a contiguous capitalised run that
681        // contains a given name collapses whole — we cannot tell the leading
682        // description noun ("Beratung") from the surname by case, so we
683        // over-redact rather than risk leaking the surname.
684        assert_eq!(
685            PlaceholderGrammar::tokenize("Beratung Marc Mustermann"),
686            "{person}"
687        );
688        // A lowercase word / punctuation terminates the run, preserving the
689        // surrounding description.
690        assert_eq!(
691            PlaceholderGrammar::tokenize("Florian Beispiel, Verzugszinsen"),
692            "{person}, Verzugszinsen"
693        );
694        assert_eq!(
695            PlaceholderGrammar::tokenize("Kurt Beispiel/Miete Lager"),
696            "{person}/Miete Lager"
697        );
698        // No name leaks through after tokenization.
699        for s in ["Beratung Marc Mustermann", "Mustermann Thomas Guthaben"] {
700            assert!(
701                PlaceholderGrammar::residual_pii_scan(&PlaceholderGrammar::tokenize(s)).is_empty(),
702                "tokenized form of {s:?} must be PII-clean"
703            );
704        }
705    }
706
707    /// Regression: compound/prefixed given-name tokens and umlaut-stripped
708    /// corpus forms must still be recognized. The corpus drops umlauts
709    /// (`Jürg`→`Jrg`) and joins tokens with `-`/`/` (`Hans-Rudolf`,
710    /// `ESD-Roger`), which defeated a plain whitespace+lowercase lookup and
711    /// leaked names through the first regen passes.
712    #[test]
713    fn tokenize_handles_compound_and_umlaut_stripped_names() {
714        // Hyphenated compound given name (Hans + Rudolf both known).
715        assert_eq!(
716            PlaceholderGrammar::tokenize("Hans-Rudolf Beispiel"),
717            "{person}"
718        );
719        // Prefix-joined given name (ESD-Roger → Roger is known).
720        assert_eq!(
721            PlaceholderGrammar::tokenize("ESD-Roger Mustermann"),
722            "{person}"
723        );
724        // Umlaut-stripped corpus form: gazetteer has `Jürg`; corpus stores `Jrg`.
725        assert_eq!(PlaceholderGrammar::tokenize("Jrg Mustermann"), "{person}");
726        for s in [
727            "Hans-Rudolf Beispiel",
728            "ESD-Roger Mustermann",
729            "Jrg Mustermann",
730        ] {
731            assert!(
732                PlaceholderGrammar::residual_pii_scan(&PlaceholderGrammar::tokenize(s)).is_empty(),
733                "compound/umlaut name leaked: {s:?}"
734            );
735        }
736    }
737
738    /// Regression: a person name in the SUFFIX of a patient/star record must
739    /// also be collapsed. The patient/star rules used to `return` early,
740    /// leaking a trailing `Firstname Lastname` (the JE_44 `Robert Hoe` regen
741    /// failure). Now they fall through to the name-run + structural tail.
742    #[test]
743    fn tokenize_name_in_patient_or_star_suffix_is_clean() {
744        for s in [
745            "*Muster,A G:01.02.03 E:04.05.06 Thomas Beispiel",
746            "*Muster,Anna Beratung Marc Mustermann",
747        ] {
748            let tok = PlaceholderGrammar::tokenize(s);
749            assert!(
750                PlaceholderGrammar::residual_pii_scan(&tok).is_empty(),
751                "suffix name leaked: {s:?} -> {tok:?}"
752            );
753        }
754    }
755
756    #[test]
757    fn name_detection_no_false_positives() {
758        // Real bank names (kept by design), generic accounting terms, and
759        // already-placeholdered text must NOT be flagged or rewritten.
760        for clean in [
761            "Deutsche Bank",
762            "Kontokorrent {company} AG",
763            "Material Werkzeuge Werkstoffe",
764            "Goldman Sachs",
765            "Standard Chartered",
766        ] {
767            assert!(
768                PlaceholderGrammar::residual_pii_scan(clean)
769                    .iter()
770                    .all(|h| h.pattern != "given_name"),
771                "false-positive given_name on: {clean:?}"
772            );
773            assert_eq!(
774                PlaceholderGrammar::tokenize(clean),
775                clean,
776                "tokenize must not rewrite clean text: {clean:?}"
777            );
778        }
779    }
780
781    #[test]
782    fn pii_placeholder_kind_token_roundtrip() {
783        for kind in [
784            PiiPlaceholderKind::Patient,
785            PiiPlaceholderKind::Person,
786            PiiPlaceholderKind::Company,
787            PiiPlaceholderKind::Street,
788        ] {
789            assert_eq!(PiiPlaceholderKind::from_token(kind.token()), Some(kind));
790        }
791        assert_eq!(PiiPlaceholderKind::from_token("{year}"), None);
792        assert_eq!(PiiPlaceholderKind::from_token("{unknown}"), None);
793    }
794
795    #[test]
796    fn line_key_format() {
797        assert_eq!(TextTaxonomyPrior::line_key("KR", "A.B"), "KR|A.B");
798        assert_eq!(
799            TextTaxonomyPrior::line_key("RE", TextTaxonomyPrior::UNKNOWN_CLASS),
800            "RE|_unknown_"
801        );
802    }
803
804    #[test]
805    fn synthetic_example_resolver_emits_obvious_fakes() {
806        let mut r = SyntheticExampleResolver;
807        let mut rng = rand::rng();
808        for kind in [
809            PiiPlaceholderKind::Patient,
810            PiiPlaceholderKind::Person,
811            PiiPlaceholderKind::Company,
812            PiiPlaceholderKind::Street,
813        ] {
814            let v = r.resolve(kind, &mut rng);
815            assert!(v.starts_with("Example"), "expected obvious fake, got {v}");
816        }
817    }
818
819    #[test]
820    fn tokenize_patient_record_strips_name_even_with_g_in_it() {
821        // The name "Gambon" contains a G — the strip must consume it. A naive
822        // [^G]*? class cannot, and would leak the name. This is the bug the
823        // first-pass cleaning sweep found.
824        assert_eq!(
825            PlaceholderGrammar::tokenize("*Gambon,Laurin G:01.02.03 E:04.05.06 A:07.08.09"),
826            "*{patient} G:{date} E:{date} A:{date}"
827        );
828        assert_eq!(
829            PlaceholderGrammar::tokenize("*Rykart,Frank G G:11.12.13"),
830            "*{patient} G:{date}"
831        );
832    }
833
834    #[test]
835    fn tokenize_person_star_record() {
836        assert_eq!(PlaceholderGrammar::tokenize("*Mueller,Hans"), "*{person}");
837        // Pin behavior on `*Lastname,Firstname<rest>` shapes: after the
838        // firstname fragment is trimmed, a single space separates {person}
839        // from the remainder. Short digit runs (< 4) stay verbatim.
840        assert_eq!(
841            PlaceholderGrammar::tokenize("*Mueller,Hans Ref-123"),
842            "*{person} Ref-123"
843        );
844    }
845
846    #[test]
847    fn tokenize_street_address() {
848        assert_eq!(
849            PlaceholderGrammar::tokenize("LUKB Mietzinskaution Roentgenpraxis, Spitalstrasse 5"),
850            "LUKB Mietzinskaution Roentgenpraxis, {street}"
851        );
852    }
853
854    #[test]
855    fn tokenize_structural_temporal() {
856        assert_eq!(
857            PlaceholderGrammar::tokenize("Mieten 04.2021"),
858            "Mieten 04.{year}"
859        );
860        assert_eq!(
861            PlaceholderGrammar::tokenize("Sales Accrual Q1"),
862            "Sales Accrual {quarter}"
863        );
864        assert_eq!(
865            PlaceholderGrammar::tokenize("January accrual"),
866            "{month} accrual"
867        );
868        assert_eq!(PlaceholderGrammar::tokenize("INV 1234567"), "INV {digits}");
869        assert_eq!(PlaceholderGrammar::tokenize("GL 470"), "GL 470"); // short run kept
870    }
871
872    #[test]
873    fn tokenize_fixed_vocab_unchanged() {
874        assert_eq!(PlaceholderGrammar::tokenize("ARIBA_ASN"), "ARIBA_ASN");
875        assert_eq!(
876            PlaceholderGrammar::tokenize("CH Post: KUREPO Intercomp"),
877            "CH Post: KUREPO Intercomp"
878        );
879    }
880
881    #[test]
882    fn tokenize_then_scan_is_clean() {
883        // Every Phase-A-tokenized string with structural PII must scan clean.
884        for raw in [
885            "*Gambon,Laurin G:01.02.03 E:04.05.06 A:07.08.09",
886            "*Mueller,Hans",
887            "LUKB Spitalstrasse 5",
888        ] {
889            let tok = PlaceholderGrammar::tokenize(raw);
890            assert!(
891                PlaceholderGrammar::residual_pii_scan(&tok).is_empty(),
892                "tokenize left residual PII: {raw:?} -> {tok:?}"
893            );
894        }
895    }
896
897    #[test]
898    fn fill_structural_placeholders() {
899        let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(7);
900        let mut resolver = SyntheticExampleResolver;
901        let out = PlaceholderGrammar::fill(
902            "Mieten {month}.{year} ref {digits} {quarter}",
903            &mut resolver,
904            &mut rng,
905        );
906        assert!(
907            !out.contains('{'),
908            "structural placeholders left unfilled: {out}"
909        );
910        assert!(out.starts_with("Mieten "));
911    }
912
913    #[test]
914    fn fill_pii_placeholders_via_resolver() {
915        let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(7);
916        let mut resolver = SyntheticExampleResolver;
917        let out =
918            PlaceholderGrammar::fill("Rechnung {company} / {person}", &mut resolver, &mut rng);
919        assert_eq!(out, "Rechnung Example GmbH / Example Person");
920    }
921
922    #[test]
923    fn fill_unknown_placeholder_kept_literal() {
924        let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(7);
925        let mut resolver = SyntheticExampleResolver;
926        let out = PlaceholderGrammar::fill("foo {bogus} bar", &mut resolver, &mut rng);
927        assert_eq!(out, "foo {bogus} bar");
928    }
929
930    #[test]
931    fn fill_then_scan_clean() {
932        let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(7);
933        let mut resolver = SyntheticExampleResolver;
934        for tmpl in ["Darlehen {person}", "*{patient} G:{date}", "{company} AG"] {
935            let out = PlaceholderGrammar::fill(tmpl, &mut resolver, &mut rng);
936            assert!(
937                PlaceholderGrammar::residual_pii_scan(&out).is_empty(),
938                "fill produced residual-PII shape: {tmpl:?} -> {out:?}"
939            );
940        }
941    }
942
943    #[test]
944    fn prior_serde_roundtrip() {
945        let mut prior = TextTaxonomyPrior::default();
946        prior.line_pools.insert(
947            TextTaxonomyPrior::line_key("KR", "A.B"),
948            TemplatePool {
949                templates: vec![TemplateEntry {
950                    template: "Rechnung {company}".to_string(),
951                    probability: 1.0,
952                    synthetic_example: "Rechnung Example GmbH".to_string(),
953                }],
954                n: 42,
955            },
956        );
957        prior.meta.class_tier = "iso21378_l2".to_string();
958        let yaml = serde_yaml::to_string(&prior).expect("serialize");
959        let back: TextTaxonomyPrior = serde_yaml::from_str(&yaml).expect("deserialize");
960        assert_eq!(prior, back);
961    }
962}
datasynth_core/distributions/text_taxonomy.rs

datasynth_core/distributions/
text_taxonomy.rs