1use std::collections::BTreeMap;
11
12use regex::Regex;
13use serde::{Deserialize, Serialize};
14
/// The kinds of personally identifying placeholder a template can carry.
///
/// Each variant corresponds to exactly one brace-delimited token
/// (`{patient}`, `{person}`, `{company}`, `{street}`); see
/// [`PiiPlaceholderKind::token`] and [`PiiPlaceholderKind::from_token`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum PiiPlaceholderKind {
    /// Written as `{patient}` in a template.
    Patient,
    /// Written as `{person}` in a template.
    Person,
    /// Written as `{company}` in a template.
    Company,
    /// Written as `{street}` in a template.
    Street,
}
25
26impl PiiPlaceholderKind {
27 pub fn token(self) -> &'static str {
29 match self {
30 PiiPlaceholderKind::Patient => "{patient}",
31 PiiPlaceholderKind::Person => "{person}",
32 PiiPlaceholderKind::Company => "{company}",
33 PiiPlaceholderKind::Street => "{street}",
34 }
35 }
36
37 pub fn from_token(token: &str) -> Option<Self> {
39 match token {
40 "{patient}" => Some(PiiPlaceholderKind::Patient),
41 "{person}" => Some(PiiPlaceholderKind::Person),
42 "{company}" => Some(PiiPlaceholderKind::Company),
43 "{street}" => Some(PiiPlaceholderKind::Street),
44 _ => None,
45 }
46 }
47}
48
/// Supplies the replacement text for PII placeholders.
///
/// `resolve` takes `&mut self`, so an implementation is free to keep state
/// between calls.
pub trait PlaceholderResolver {
    /// Returns the text to substitute for one occurrence of `kind`.
    ///
    /// `rng` is the random-number generator made available to the resolver;
    /// an implementation may ignore it.
    fn resolve(&mut self, kind: PiiPlaceholderKind, rng: &mut dyn rand::Rng) -> String;
}
56
/// Stateless resolver whose [`PlaceholderResolver`] implementation returns a
/// fixed `"Example …"` string for every placeholder kind.
pub struct SyntheticExampleResolver;
60
61impl PlaceholderResolver for SyntheticExampleResolver {
62 fn resolve(&mut self, kind: PiiPlaceholderKind, _rng: &mut dyn rand::Rng) -> String {
63 match kind {
64 PiiPlaceholderKind::Patient => "Example Patient".to_string(),
65 PiiPlaceholderKind::Person => "Example Person".to_string(),
66 PiiPlaceholderKind::Company => "Example GmbH".to_string(),
67 PiiPlaceholderKind::Street => "Example Street 1".to_string(),
68 }
69 }
70}
71
/// One finding reported by `PlaceholderGrammar::residual_pii_scan`.
#[derive(Debug, Clone, PartialEq)]
pub struct PiiHit {
    /// Label of the check that fired, e.g. `"patient_record"`, `"title"` or
    /// `"given_name"`.
    pub pattern: &'static str,
    /// The exact substring of the scanned text that matched.
    pub matched: String,
}
80
/// A single text template together with its weight and an example rendering.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TemplateEntry {
    /// Template text; placeholders appear as brace tokens such as `{company}`.
    pub template: String,
    /// Weight of this template.
    /// NOTE(review): presumably its relative frequency within the owning
    /// pool (0.0–1.0) — confirm against the producer of this data.
    pub probability: f64,
    /// Example rendering of `template` with fake values; the serde round-trip
    /// test pairs `"Rechnung {company}"` with `"Rechnung Example GmbH"`.
    pub synthetic_example: String,
}
93
/// A set of templates that belong together, plus a count.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TemplatePool {
    /// The templates in this pool.
    pub templates: Vec<TemplateEntry>,
    /// NOTE(review): presumably the number of observations the pool was
    /// derived from (it is independent of `templates.len()` in the tests) —
    /// confirm.
    pub n: usize,
}
101
/// Bookkeeping values stored alongside a [`TextTaxonomyPrior`].
///
/// NOTE(review): the field meanings below are inferred from their names only;
/// nothing in this file reads them — confirm against the code that builds the
/// prior.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TaxonomyMeta {
    /// Presumably the minimum occurrence count a template needed to be kept.
    pub min_occurrences: usize,
    /// Presumably the cap on templates stored per pool.
    pub max_templates_per_pool: usize,
    /// Identifier of the account-class tier; the tests use `"iso21378_l2"`.
    pub class_tier: String,
    /// Presumably the number of client inputs the prior was derived from.
    pub n_client_inputs: usize,
}
111
/// Serializable collection of template pools plus metadata.
///
/// All maps are `BTreeMap`s, so iteration (and therefore serialization) is
/// ordered by key.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TextTaxonomyPrior {
    /// Pools for line texts; the tests key this map with
    /// [`TextTaxonomyPrior::line_key`] (`"<source>|<account_class>"`).
    pub line_pools: BTreeMap<String, TemplatePool>,
    /// Pools for header texts.
    /// NOTE(review): the key format is not shown in this file — confirm.
    pub header_pools: BTreeMap<String, TemplatePool>,
    /// A single template entry (not a pool) per key.
    /// NOTE(review): presumably keyed by chart-of-accounts identifier —
    /// confirm.
    pub coa_pools: BTreeMap<String, TemplateEntry>,
    /// Bookkeeping values for this prior.
    pub meta: TaxonomyMeta,
}
126
127impl TextTaxonomyPrior {
128 pub const UNKNOWN_CLASS: &'static str = "_unknown_";
131
132 pub fn line_key(source: &str, account_class: &str) -> String {
134 format!("{source}|{account_class}")
135 }
136}
137
/// Namespace type for the template grammar: its associated functions turn raw
/// text into templates (`tokenize`), expand templates (`fill`) and check text
/// for leftover PII shapes (`residual_pii_scan`). It carries no data.
pub struct PlaceholderGrammar;
142
/// Lazily compiled regular expressions shared by the scanner and tokenizer.
mod scan_patterns {
    // Every pattern below is a string literal, so a failure to compile is a
    // programming error; `unwrap` is deliberate here.
    #![allow(clippy::unwrap_used)]

    use regex::Regex;
    use std::sync::LazyLock;

    /// `G:` followed by optional whitespace and a `dd.dd.dd` date — the marker
    /// reported as `patient_record`.
    pub(super) static RE_PATIENT: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"G:\s*\d{2}\.\d{2}\.\d{2}").unwrap());
    /// Text that *starts* with `*Surname,` followed by a capital letter
    /// (e.g. `*Mueller,H`). The match ends after that first capital letter.
    pub(super) static RE_PERSON_STAR: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"^\*[A-ZÄÖÜ][\w\u{00C0}-\u{017F}.'\-]+\s*,\s*[A-ZÄÖÜ]").unwrap()
    });
    /// A title or honorific abbreviation followed by a period and whitespace
    /// (e.g. `Dr. `, `Prof. `).
    pub(super) static RE_TITLE: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"\b(Prof|Dr|Dipl|Pfr|Pfarrer|Herr|Frau|Hr|Fr|med|iur|lic)\.\s").unwrap()
    });
    /// A single capital initial with a period, then a capitalised word of at
    /// least three letters (e.g. `U. Frey`).
    pub(super) static RE_INITIAL_SURNAME: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"\b[A-ZÄÖÜ]\.\s*[A-ZÄÖÜ][a-zäöüß]{2,}\b").unwrap());
    /// A capitalised word of at least three letters, whitespace, then a single
    /// initial with a period that is followed by whitespace or end of text
    /// (e.g. `Mueller H.`). The trailing condition keeps suffixes such as
    /// `S.A.` from matching.
    pub(super) static RE_SURNAME_INITIAL: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"\b[A-ZÄÖÜ][a-zäöüß]{2,}\s+[A-ZÄÖÜ]\.(?:\s|$)").unwrap());

    /// `G:`, `E:` or `A:` followed by a `dd.dd.dd` date; the letter is capture
    /// group 1 so it can be kept when the date is replaced.
    pub(super) static RE_GEA_DATE: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"([GEA]):\s*\d{2}\.\d{2}\.\d{2}").unwrap());
    /// Case-insensitive: a word ending in `str.`, `strasse`, `gasse`, `weg` or
    /// `platz`, followed by a house number with an optional letter suffix.
    pub(super) static RE_STREET: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"(?i)\b[A-ZÄÖÜ][\w\u{00C0}-\u{017F}.\-]*(?:str\.|strasse|gasse|weg|platz)\s*\d+[A-Za-z]?\b").unwrap()
    });
    /// A stand-alone four-digit number starting with `19` or `20`.
    pub(super) static RE_YEAR: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"\b(?:19|20)\d{2}\b").unwrap());
    /// Case-insensitive stand-alone `Q1` … `Q4`.
    pub(super) static RE_QUARTER: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"(?i)\bQ[1-4]\b").unwrap());
    /// Any run of four or more digits.
    pub(super) static RE_DIGITS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\d{4,}").unwrap());

    /// Two or more whitespace-separated words that each start with an
    /// uppercase letter; letters, `.`, `'` and `-` may follow inside a word.
    pub(super) static RE_NAME_RUN: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"\p{Lu}[\p{L}.'\-]*(?:\s+\p{Lu}[\p{L}.'\-]*)+").unwrap());
}
209
210mod given_names {
215 use std::collections::HashSet;
216 use std::sync::LazyLock;
217
218 pub(super) fn normalize(s: &str) -> String {
224 let mut out = String::with_capacity(s.len());
225 for c in s.chars() {
226 match c {
227 'ä' | 'ö' | 'ü' | 'Ä' | 'Ö' | 'Ü' => {}
229 'é' | 'è' | 'ê' | 'ë' => out.push('e'),
231 'à' | 'â' | 'á' => out.push('a'),
232 'î' | 'ï' | 'í' => out.push('i'),
233 'ô' | 'ó' => out.push('o'),
234 'û' | 'ú' => out.push('u'),
235 'ç' => out.push('c'),
236 'ñ' => out.push('n'),
237 'ß' => out.push_str("ss"),
238 _ => out.extend(c.to_lowercase()),
239 }
240 }
241 out
242 }
243
244 pub(super) static GIVEN_NAMES: LazyLock<HashSet<String>> = LazyLock::new(|| {
245 include_str!("../../resources/given_names.txt")
246 .lines()
247 .map(str::trim)
248 .filter(|l| !l.is_empty() && !l.starts_with('#'))
249 .map(normalize)
250 .filter(|n| !n.is_empty())
251 .collect()
252 });
253
254 pub(super) fn run_has_given_name(run: &str) -> bool {
260 run.split(|c: char| c.is_whitespace() || matches!(c, '-' | '/' | '.' | ',' | '_'))
261 .any(|part| {
262 let cleaned = part.trim_matches(|c: char| !c.is_alphabetic());
263 !cleaned.is_empty() && GIVEN_NAMES.contains(&normalize(cleaned))
264 })
265 }
266}
267
268use scan_patterns::{
269 RE_DIGITS, RE_GEA_DATE, RE_INITIAL_SURNAME, RE_NAME_RUN, RE_PATIENT, RE_PERSON_STAR,
270 RE_QUARTER, RE_STREET, RE_SURNAME_INITIAL, RE_TITLE, RE_YEAR,
271};
272
/// Month names recognised by `replace_months`: full German and English names
/// (including the `Maerz` spelling) and the three-letter abbreviations
/// `Jan` … `Dec`. Matching is case-sensitive and whole-word.
///
/// The list is written roughly longest-first.
const MONTH_NAMES: &[&str] = &[
    "September",
    "Februar",
    "Dezember",
    "November",
    "February",
    "December",
    "January",
    "October",
    "Januar",
    "Oktober",
    "August",
    "März",
    "Maerz",
    "April",
    "March",
    "Juni",
    "Juli",
    "June",
    "July",
    "Mai",
    "May",
    "Jan",
    "Feb",
    "Mar",
    "Apr",
    "Jun",
    "Jul",
    "Aug",
    "Sep",
    "Oct",
    "Nov",
    "Dec",
];
308
309impl PlaceholderGrammar {
310 pub fn fill<R: rand::Rng>(
315 template: &str,
316 resolver: &mut dyn PlaceholderResolver,
317 rng: &mut R,
318 ) -> String {
319 use rand::RngExt;
320 if template.is_empty() {
321 return String::new();
322 }
323 let mut out = String::with_capacity(template.len() + 16);
324 let mut rest = template;
325 while let Some(open) = rest.find('{') {
326 out.push_str(&rest[..open]);
327 rest = &rest[open..];
328 let Some(close) = rest.find('}') else {
329 out.push_str(rest);
331 return out;
332 };
333 let token = &rest[..=close];
334 rest = &rest[close + 1..];
335 if let Some(kind) = PiiPlaceholderKind::from_token(token) {
336 let resolved = resolver.resolve(kind, rng);
337 out.push_str(&resolved);
338 continue;
339 }
340 match token {
341 "{year}" => {
342 let y: u32 = rng.random_range(2018..=2024);
343 out.push_str(&y.to_string());
344 }
345 "{quarter}" => {
346 let q: u32 = rng.random_range(1..=4);
347 out.push('Q');
348 out.push_str(&q.to_string());
349 }
350 "{month}" => {
351 const MONTHS: &[&str] = &[
352 "January",
353 "February",
354 "March",
355 "April",
356 "May",
357 "June",
358 "July",
359 "August",
360 "September",
361 "October",
362 "November",
363 "December",
364 ];
365 out.push_str(MONTHS[rng.random_range(0..MONTHS.len())]);
366 }
367 "{date}" => {
368 let d: u32 = rng.random_range(1..=28);
369 let m: u32 = rng.random_range(1..=12);
370 let y: u32 = rng.random_range(2018..=2024);
371 out.push_str(&format!("{y}-{m:02}-{d:02}"));
374 }
375 "{digits}" => {
376 let n = rng.random_range(4..=8);
377 for _ in 0..n {
378 out.push(char::from(b'0' + rng.random_range(0u8..10)));
379 }
380 }
381 _ => out.push_str(token), }
383 }
384 out.push_str(rest);
385 out
386 }
387
388 pub fn residual_pii_scan(s: &str) -> Vec<PiiHit> {
394 let mut hits = Vec::new();
395 let checks: &[(&'static str, &Regex)] = &[
396 ("patient_record", &RE_PATIENT),
397 ("person_star", &RE_PERSON_STAR),
398 ("title", &RE_TITLE),
399 ("initial_surname", &RE_INITIAL_SURNAME),
400 ("surname_initial", &RE_SURNAME_INITIAL),
401 ];
402 for (label, re) in checks {
403 if let Some(m) = re.find(s) {
404 hits.push(PiiHit {
405 pattern: label,
406 matched: m.as_str().to_string(),
407 });
408 }
409 }
410 for m in RE_NAME_RUN.find_iter(s) {
414 if given_names::run_has_given_name(m.as_str()) {
415 hits.push(PiiHit {
416 pattern: "given_name",
417 matched: m.as_str().to_string(),
418 });
419 break;
420 }
421 }
422 hits
423 }
424
425 pub fn tokenize(s: &str) -> String {
440 let t = s.trim();
441 if t.is_empty() {
442 return String::new();
443 }
444
445 let staged: String = if let Some(m) = RE_PATIENT.find(t) {
451 let from_marker = &t[m.start()..];
454 let dated = RE_GEA_DATE.replace_all(from_marker, "$1:{date}");
455 format!("*{{patient}} {dated}").trim().to_string()
456 } else if let Some(m) = RE_PERSON_STAR.find(t) {
457 let mut out = String::with_capacity(t.len());
460 out.push_str("*{person}");
461 out.push_str(&t[m.end()..]);
462 trim_leading_name_fragment(&out)
463 } else {
464 t.to_string()
465 };
466
467 let staged = RE_STREET.replace_all(&staged, "{street}").into_owned();
469
470 let staged = RE_NAME_RUN
476 .replace_all(&staged, |caps: ®ex::Captures| {
477 let run = &caps[0];
478 if given_names::run_has_given_name(run) {
479 "{person}".to_string()
480 } else {
481 run.to_string()
482 }
483 })
484 .into_owned();
485
486 let staged = RE_YEAR.replace_all(&staged, "{year}").into_owned();
488 let staged = RE_QUARTER.replace_all(&staged, "{quarter}").into_owned();
489 let staged = replace_months(&staged);
490 RE_DIGITS.replace_all(&staged, "{digits}").into_owned()
491 }
492}
493
494fn replace_months(s: &str) -> String {
497 let mut result = s.to_string();
498 for name in MONTH_NAMES {
499 let mut out = String::with_capacity(result.len());
502 let nlen = name.len();
503 let mut i = 0;
504 while i < result.len() {
505 if result[i..].starts_with(name) {
506 let prev_alpha = i > 0
507 && result[..i]
508 .chars()
509 .next_back()
510 .map(|c| c.is_alphabetic())
511 .unwrap_or(false);
512 let next_alpha = result[i + nlen..]
513 .chars()
514 .next()
515 .map(|c| c.is_alphabetic())
516 .unwrap_or(false);
517 if !prev_alpha && !next_alpha {
518 out.push_str("{month}");
519 i += nlen;
520 continue;
521 }
522 }
523 let ch_len = result[i..]
525 .chars()
526 .next()
527 .map(|c| c.len_utf8())
528 .unwrap_or(1);
529 out.push_str(&result[i..i + ch_len]);
530 i += ch_len;
531 }
532 result = out;
533 }
534 result
535}
536
/// Removes the left-over tail of a given name that directly follows a leading
/// `*{person}` marker.
///
/// When `s` starts with `*{person}`, the alphabetic characters immediately
/// after it are dropped, then at most one `,` or space. Whatever remains is
/// re-attached after a single space; if nothing remains, just `*{person}` is
/// returned. Input without the marker is returned unchanged.
fn trim_leading_name_fragment(s: &str) -> String {
    const PREFIX: &str = "*{person}";

    let Some(after_marker) = s.strip_prefix(PREFIX) else {
        return s.to_string();
    };

    // Rest of the partially matched name word.
    let after_name = after_marker.trim_start_matches(char::is_alphabetic);
    // One separator only; further punctuation or spaces are kept.
    let remainder = after_name
        .strip_prefix(',')
        .or_else(|| after_name.strip_prefix(' '))
        .unwrap_or(after_name);

    if remainder.is_empty() {
        PREFIX.to_string()
    } else {
        format!("{PREFIX} {remainder}")
    }
}
571
/// Unit tests for the placeholder grammar, the residual-PII scanner and the
/// serde round-trip of the prior.
#[cfg(test)]
mod tests {
    use super::*;
    use rand::SeedableRng;

    /// A `G:` date marker is reported as `patient_record`.
    #[test]
    fn residual_scan_flags_patient_record() {
        let hits = PlaceholderGrammar::residual_pii_scan("*Gambon,Laurin G:01.02.03 E:04.05.06");
        assert!(
            hits.iter().any(|h| h.pattern == "patient_record"),
            "expected patient_record hit, got {hits:?}"
        );
    }

    /// Each of the person-shaped regex checks fires on a representative input.
    #[test]
    fn residual_scan_flags_person_shapes() {
        assert!(PlaceholderGrammar::residual_pii_scan("*Mueller,Hans")
            .iter()
            .any(|h| h.pattern == "person_star"));
        assert!(PlaceholderGrammar::residual_pii_scan("Forschung U. Frey")
            .iter()
            .any(|h| h.pattern == "initial_surname"));
        assert!(
            PlaceholderGrammar::residual_pii_scan("Kontokorrent Prof. Dr. M. Buess")
                .iter()
                .any(|h| h.pattern == "title")
        );
        assert!(
            PlaceholderGrammar::residual_pii_scan("Konsultation Mueller H.")
                .iter()
                .any(|h| h.pattern == "surname_initial")
        );
    }

    /// Already tokenized templates and fixed vocabulary produce no hits.
    #[test]
    fn residual_scan_passes_clean_templates() {
        for clean in [
            "Rechnung {company}",
            "Mieten {month}.{year}",
            "ARIBA_ASN",
            "Darlehen {person}",
            "*{patient} G:{date} E:{date} A:{date}",
            "Umbuchung Anlage",
        ] {
            assert!(
                PlaceholderGrammar::residual_pii_scan(clean).is_empty(),
                "false positive on clean template: {clean:?}"
            );
        }
    }

    /// Legal-entity suffixes such as `S.A.` / `B.V.` must not be mistaken for
    /// a surname followed by an initial, while a real one still matches.
    #[test]
    fn residual_scan_excludes_legal_entity_suffixes() {
        for legal in [
            "Acme Europe B.V.",
            "Globex Suisse S.A.",
            "Initech S.A. Lugano",
            "Switzerland S.A.",
        ] {
            assert!(
                PlaceholderGrammar::residual_pii_scan(legal)
                    .iter()
                    .all(|h| h.pattern != "surname_initial"),
                "must not flag legal-entity suffix in: {legal:?}"
            );
        }
        assert!(
            PlaceholderGrammar::residual_pii_scan("Patient consult Mueller H.")
                .iter()
                .any(|h| h.pattern == "surname_initial"),
            "legitimate surname-initial at end-of-string must still match"
        );
    }

    /// A capitalised run containing a known given name is flagged wherever
    /// the given name sits in the run.
    #[test]
    fn residual_scan_flags_given_name_runs() {
        for s in [
            "Beratung Marc Mustermann",
            "Erbschaft Anna Beispiel",
            "Mustermann Thomas Guthaben",
            "Florian Beispiel, Verzugszinsen",
        ] {
            let hits = PlaceholderGrammar::residual_pii_scan(s);
            assert!(
                hits.iter().any(|h| h.pattern == "given_name"),
                "must flag given-name run in: {s:?} (got {hits:?})"
            );
        }
    }

    /// `tokenize` replaces the whole capitalised run (including neighbouring
    /// capitalised words) with a single `{person}`.
    #[test]
    fn tokenize_collapses_person_name_runs() {
        assert_eq!(
            PlaceholderGrammar::tokenize("Beratung Marc Mustermann"),
            "{person}"
        );
        assert_eq!(
            PlaceholderGrammar::tokenize("Florian Beispiel, Verzugszinsen"),
            "{person}, Verzugszinsen"
        );
        assert_eq!(
            PlaceholderGrammar::tokenize("Kurt Beispiel/Miete Lager"),
            "{person}/Miete Lager"
        );
        for s in ["Beratung Marc Mustermann", "Mustermann Thomas Guthaben"] {
            assert!(
                PlaceholderGrammar::residual_pii_scan(&PlaceholderGrammar::tokenize(s)).is_empty(),
                "tokenized form of {s:?} must be PII-clean"
            );
        }
    }

    /// Hyphenated compounds and names whose umlaut was stripped upstream
    /// (`Jrg`) are still recognised as given names.
    #[test]
    fn tokenize_handles_compound_and_umlaut_stripped_names() {
        assert_eq!(
            PlaceholderGrammar::tokenize("Hans-Rudolf Beispiel"),
            "{person}"
        );
        assert_eq!(
            PlaceholderGrammar::tokenize("ESD-Roger Mustermann"),
            "{person}"
        );
        assert_eq!(PlaceholderGrammar::tokenize("Jrg Mustermann"), "{person}");
        for s in [
            "Hans-Rudolf Beispiel",
            "ESD-Roger Mustermann",
            "Jrg Mustermann",
        ] {
            assert!(
                PlaceholderGrammar::residual_pii_scan(&PlaceholderGrammar::tokenize(s)).is_empty(),
                "compound/umlaut name leaked: {s:?}"
            );
        }
    }

    /// A second name that follows a patient or `*Surname,Given` record is
    /// also removed by `tokenize`.
    #[test]
    fn tokenize_name_in_patient_or_star_suffix_is_clean() {
        for s in [
            "*Muster,A G:01.02.03 E:04.05.06 Thomas Beispiel",
            "*Muster,Anna Beratung Marc Mustermann",
        ] {
            let tok = PlaceholderGrammar::tokenize(s);
            assert!(
                PlaceholderGrammar::residual_pii_scan(&tok).is_empty(),
                "suffix name leaked: {s:?} -> {tok:?}"
            );
        }
    }

    /// Capitalised runs without a given name are neither flagged nor
    /// rewritten.
    #[test]
    fn name_detection_no_false_positives() {
        for clean in [
            "Deutsche Bank",
            "Kontokorrent {company} AG",
            "Material Werkzeuge Werkstoffe",
            "Goldman Sachs",
            "Standard Chartered",
        ] {
            assert!(
                PlaceholderGrammar::residual_pii_scan(clean)
                    .iter()
                    .all(|h| h.pattern != "given_name"),
                "false-positive given_name on: {clean:?}"
            );
            assert_eq!(
                PlaceholderGrammar::tokenize(clean),
                clean,
                "tokenize must not rewrite clean text: {clean:?}"
            );
        }
    }

    /// `token` and `from_token` are inverses; non-PII tokens parse to `None`.
    #[test]
    fn pii_placeholder_kind_token_roundtrip() {
        for kind in [
            PiiPlaceholderKind::Patient,
            PiiPlaceholderKind::Person,
            PiiPlaceholderKind::Company,
            PiiPlaceholderKind::Street,
        ] {
            assert_eq!(PiiPlaceholderKind::from_token(kind.token()), Some(kind));
        }
        assert_eq!(PiiPlaceholderKind::from_token("{year}"), None);
        assert_eq!(PiiPlaceholderKind::from_token("{unknown}"), None);
    }

    /// `line_key` joins source and account class with `|`.
    #[test]
    fn line_key_format() {
        assert_eq!(TextTaxonomyPrior::line_key("KR", "A.B"), "KR|A.B");
        assert_eq!(
            TextTaxonomyPrior::line_key("RE", TextTaxonomyPrior::UNKNOWN_CLASS),
            "RE|_unknown_"
        );
    }

    /// Every value from the synthetic resolver starts with `Example`.
    #[test]
    fn synthetic_example_resolver_emits_obvious_fakes() {
        let mut r = SyntheticExampleResolver;
        let mut rng = rand::rng();
        for kind in [
            PiiPlaceholderKind::Patient,
            PiiPlaceholderKind::Person,
            PiiPlaceholderKind::Company,
            PiiPlaceholderKind::Street,
        ] {
            let v = r.resolve(kind, &mut rng);
            assert!(v.starts_with("Example"), "expected obvious fake, got {v}");
        }
    }

    /// The patient name is removed even when it contains a `G` or is followed
    /// by a stray `G` before the real `G:` marker.
    #[test]
    fn tokenize_patient_record_strips_name_even_with_g_in_it() {
        assert_eq!(
            PlaceholderGrammar::tokenize("*Gambon,Laurin G:01.02.03 E:04.05.06 A:07.08.09"),
            "*{patient} G:{date} E:{date} A:{date}"
        );
        assert_eq!(
            PlaceholderGrammar::tokenize("*Rykart,Frank G G:11.12.13"),
            "*{patient} G:{date}"
        );
    }

    /// `*Surname,Given` becomes `*{person}`; trailing non-name text is kept.
    #[test]
    fn tokenize_person_star_record() {
        assert_eq!(PlaceholderGrammar::tokenize("*Mueller,Hans"), "*{person}");
        assert_eq!(
            PlaceholderGrammar::tokenize("*Mueller,Hans Ref-123"),
            "*{person} Ref-123"
        );
    }

    /// A street name with house number becomes `{street}`.
    #[test]
    fn tokenize_street_address() {
        assert_eq!(
            PlaceholderGrammar::tokenize("LUKB Mietzinskaution Roentgenpraxis, Spitalstrasse 5"),
            "LUKB Mietzinskaution Roentgenpraxis, {street}"
        );
    }

    /// Years, quarters, month names and long digit runs are tokenized; short
    /// numbers such as `470` are left alone.
    #[test]
    fn tokenize_structural_temporal() {
        assert_eq!(
            PlaceholderGrammar::tokenize("Mieten 04.2021"),
            "Mieten 04.{year}"
        );
        assert_eq!(
            PlaceholderGrammar::tokenize("Sales Accrual Q1"),
            "Sales Accrual {quarter}"
        );
        assert_eq!(
            PlaceholderGrammar::tokenize("January accrual"),
            "{month} accrual"
        );
        assert_eq!(PlaceholderGrammar::tokenize("INV 1234567"), "INV {digits}");
        assert_eq!(PlaceholderGrammar::tokenize("GL 470"), "GL 470");
    }

    /// Fixed vocabulary passes through `tokenize` unchanged.
    #[test]
    fn tokenize_fixed_vocab_unchanged() {
        assert_eq!(PlaceholderGrammar::tokenize("ARIBA_ASN"), "ARIBA_ASN");
        assert_eq!(
            PlaceholderGrammar::tokenize("CH Post: KUREPO Intercomp"),
            "CH Post: KUREPO Intercomp"
        );
    }

    /// The output of `tokenize` produces no hits in `residual_pii_scan`.
    #[test]
    fn tokenize_then_scan_is_clean() {
        for raw in [
            "*Gambon,Laurin G:01.02.03 E:04.05.06 A:07.08.09",
            "*Mueller,Hans",
            "LUKB Spitalstrasse 5",
        ] {
            let tok = PlaceholderGrammar::tokenize(raw);
            assert!(
                PlaceholderGrammar::residual_pii_scan(&tok).is_empty(),
                "tokenize left residual PII: {raw:?} -> {tok:?}"
            );
        }
    }

    /// All structural placeholders are expanded by `fill` (seeded RNG).
    #[test]
    fn fill_structural_placeholders() {
        let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(7);
        let mut resolver = SyntheticExampleResolver;
        let out = PlaceholderGrammar::fill(
            "Mieten {month}.{year} ref {digits} {quarter}",
            &mut resolver,
            &mut rng,
        );
        assert!(
            !out.contains('{'),
            "structural placeholders left unfilled: {out}"
        );
        assert!(out.starts_with("Mieten "));
    }

    /// PII placeholders are expanded with the resolver's values.
    #[test]
    fn fill_pii_placeholders_via_resolver() {
        let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(7);
        let mut resolver = SyntheticExampleResolver;
        let out =
            PlaceholderGrammar::fill("Rechnung {company} / {person}", &mut resolver, &mut rng);
        assert_eq!(out, "Rechnung Example GmbH / Example Person");
    }

    /// Unknown `{…}` tokens are copied through verbatim.
    #[test]
    fn fill_unknown_placeholder_kept_literal() {
        let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(7);
        let mut resolver = SyntheticExampleResolver;
        let out = PlaceholderGrammar::fill("foo {bogus} bar", &mut resolver, &mut rng);
        assert_eq!(out, "foo {bogus} bar");
    }

    /// Text produced by `fill` with the synthetic resolver does not trip the
    /// residual scanner.
    #[test]
    fn fill_then_scan_clean() {
        let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(7);
        let mut resolver = SyntheticExampleResolver;
        for tmpl in ["Darlehen {person}", "*{patient} G:{date}", "{company} AG"] {
            let out = PlaceholderGrammar::fill(tmpl, &mut resolver, &mut rng);
            assert!(
                PlaceholderGrammar::residual_pii_scan(&out).is_empty(),
                "fill produced residual-PII shape: {tmpl:?} -> {out:?}"
            );
        }
    }

    /// A populated prior survives a YAML serialize/deserialize round-trip.
    #[test]
    fn prior_serde_roundtrip() {
        let mut prior = TextTaxonomyPrior::default();
        prior.line_pools.insert(
            TextTaxonomyPrior::line_key("KR", "A.B"),
            TemplatePool {
                templates: vec![TemplateEntry {
                    template: "Rechnung {company}".to_string(),
                    probability: 1.0,
                    synthetic_example: "Rechnung Example GmbH".to_string(),
                }],
                n: 42,
            },
        );
        prior.meta.class_tier = "iso21378_l2".to_string();
        let yaml = serde_yaml::to_string(&prior).expect("serialize");
        let back: TextTaxonomyPrior = serde_yaml::from_str(&yaml).expect("deserialize");
        assert_eq!(prior, back);
    }
}