1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
//! SP6 — corpus text taxonomy: PII-safe placeholder grammar + conditional
//! template pools keyed on (source, account-class).
//!
//! Replaces SP4.4's verbatim source-keyed `TextTemplatePrior`. Generated text
//! is synthetic-by-construction: tokenized real templates whose PII spans are
//! fillable placeholders. Line text is conditioned on (source, account-class);
//! header text is source-keyed; CoA descriptions are per-account templates
//! filled once per run.
use std::collections::BTreeMap;
use regex::Regex;
use serde::{Deserialize, Serialize};
/// A PII-placeholder kind the generator must resolve to a concrete value.
/// Structural placeholders (`{year}`, `{quarter}`, `{month}`, `{date}`,
/// `{digits}`) are NOT in this enum — `PlaceholderGrammar::fill` handles those.
///
/// Each variant corresponds to exactly one template token (see `token` /
/// `from_token`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum PiiPlaceholderKind {
/// `{patient}` — emitted by `PlaceholderGrammar::tokenize` in place of the
/// name region of a patient `G:dd.dd.dd` record.
Patient,
/// `{person}` — emitted by `PlaceholderGrammar::tokenize` for
/// `*Lastname,Firstname` star records and for capitalised runs that contain
/// a known given name.
Person,
/// `{company}` — a company name. Not produced by `PlaceholderGrammar::tokenize`
/// in this file; presumably introduced by the extractor's Phase-B curation
/// (TODO confirm).
Company,
/// `{street}` — emitted by `PlaceholderGrammar::tokenize` in place of a
/// street address.
Street,
}
impl PiiPlaceholderKind {
    /// Returns the brace-delimited token that stands for this kind inside a
    /// template string (for example `"{person}"`).
    pub fn token(self) -> &'static str {
        match self {
            Self::Patient => "{patient}",
            Self::Person => "{person}",
            Self::Company => "{company}",
            Self::Street => "{street}",
        }
    }

    /// Maps a template token back to its kind.
    ///
    /// The comparison is exact (braces included). Structural tokens such as
    /// `{year}` and unrecognised tokens yield `None`.
    pub fn from_token(token: &str) -> Option<Self> {
        // Reverse lookup through `token()` keeps the two directions in sync.
        const KINDS: [PiiPlaceholderKind; 4] = [
            PiiPlaceholderKind::Patient,
            PiiPlaceholderKind::Person,
            PiiPlaceholderKind::Company,
            PiiPlaceholderKind::Street,
        ];
        KINDS.iter().copied().find(|kind| kind.token() == token)
    }
}
/// Resolves a PII-placeholder kind to a concrete value. Implemented by the
/// generator (wired to master data) and by `SyntheticExampleResolver` (used at
/// extraction time, where master data does not exist).
///
/// `PlaceholderGrammar::fill` calls `resolve` once for every PII token it
/// encounters in a template, in left-to-right order.
pub trait PlaceholderResolver {
/// Resolve a PII-placeholder kind to a concrete value.
///
/// * `kind` — which PII placeholder is being filled.
/// * `rng` — randomness source supplied by the caller; an implementation may
///   ignore it (`SyntheticExampleResolver` does).
///
/// Returns the text that replaces the placeholder token.
fn resolve(&mut self, kind: PiiPlaceholderKind, rng: &mut dyn rand::Rng) -> String;
}
/// Built-in resolver whose output is self-evidently fake. It produces
/// `TemplateEntry::synthetic_example` at extraction time and is used in tests.
pub struct SyntheticExampleResolver;

impl PlaceholderResolver for SyntheticExampleResolver {
    /// Returns the fixed example text for `kind`. The RNG is never consulted,
    /// so the result depends on `kind` alone.
    fn resolve(&mut self, kind: PiiPlaceholderKind, _rng: &mut dyn rand::Rng) -> String {
        let canned = match kind {
            PiiPlaceholderKind::Patient => "Example Patient",
            PiiPlaceholderKind::Person => "Example Person",
            PiiPlaceholderKind::Company => "Example GmbH",
            PiiPlaceholderKind::Street => "Example Street 1",
        };
        String::from(canned)
    }
}
/// One residual-PII scan hit, as returned by
/// `PlaceholderGrammar::residual_pii_scan`.
#[derive(Debug, Clone, PartialEq)]
pub struct PiiHit {
/// Static label of the pattern that matched (e.g. `"patient_record"`).
/// The scan in this file emits `"patient_record"`, `"person_star"`,
/// `"title"`, `"initial_surname"`, `"surname_initial"` and `"given_name"`.
pub pattern: &'static str,
/// The substring that matched.
pub matched: String,
}
/// A single PII-safe text template.
///
/// The derived `Default` is an empty template with probability `0.0` and an
/// empty example.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TemplateEntry {
/// Tokenized, PII-safe template string. May contain the placeholder tokens
/// understood by `PlaceholderGrammar::fill`.
pub template: String,
/// Probability mass within the owning pool (renormalised after filtering).
pub probability: f64,
/// The template run through `fill` once at extraction time with a
/// fixed-seed RNG and `SyntheticExampleResolver` — a debug/audit example
/// carrying ZERO corpus content. Replaces SP4.4's verbatim `example` field.
pub synthetic_example: String,
}
/// A weighted pool of templates for one `(source, class)` or `source` key.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TemplatePool {
/// The pool's entries; each carries its own weight in
/// `TemplateEntry::probability`.
pub templates: Vec<TemplateEntry>,
/// Total observations underpinning the pool (pre-truncation).
pub n: usize,
}
/// Extraction metadata for a `TextTaxonomyPrior`.
///
/// NOTE(review): the extractor that populates these fields is not part of this
/// file; the per-field notes marked "presumably" are inferred from the field
/// names and should be confirmed against it.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TaxonomyMeta {
/// Presumably the minimum observation count a template needed to be kept —
/// TODO confirm against the extractor.
pub min_occurrences: usize,
/// Presumably the cap on `TemplatePool::templates` length applied at
/// extraction — TODO confirm against the extractor.
pub max_templates_per_pool: usize,
/// Class-granularity tier used for `line_pools` keys (e.g. `"iso21378_l2"`).
pub class_tier: String,
/// Presumably the number of client inputs the prior was extracted from —
/// TODO confirm against the extractor.
pub n_client_inputs: usize,
}
/// SP6 — corpus text taxonomy prior. Replaces `TextTemplatePrior`.
///
/// All three maps are `BTreeMap`s, so iteration (and therefore serialization)
/// is ordered by key.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TextTaxonomyPrior {
/// Line text keyed on the flattened string `"SOURCE|CLASS"`. `CLASS` is the
/// ISO 21378 Level-2 account class; lines whose account has no resolvable
/// class are grouped under `"SOURCE|_unknown_"`.
/// Keys are built with `TextTaxonomyPrior::line_key`; the sentinel class is
/// `TextTaxonomyPrior::UNKNOWN_CLASS`.
pub line_pools: BTreeMap<String, TemplatePool>,
/// Header text keyed on source only (a JE header has no single account).
pub header_pools: BTreeMap<String, TemplatePool>,
/// CoA description templates keyed on account number — one per account.
/// Note the value is a single `TemplateEntry`, not a pool.
pub coa_pools: BTreeMap<String, TemplateEntry>,
/// Extraction metadata.
pub meta: TaxonomyMeta,
}
impl TextTaxonomyPrior {
    /// Class component that stands in for a line whose account has no
    /// resolvable ISO 21378 class.
    pub const UNKNOWN_CLASS: &'static str = "_unknown_";

    /// Joins `source` and `account_class` with a single `|` to form the
    /// flattened key used by `line_pools`. Neither part is validated or
    /// escaped.
    pub fn line_key(source: &str, account_class: &str) -> String {
        let mut key = String::with_capacity(source.len() + account_class.len() + 1);
        key.push_str(source);
        key.push('|');
        key.push_str(account_class);
        key
    }
}
/// Stateless tokenize / fill / scan engine. No dependency on the generator or
/// fingerprint crates — locale and master-data wiring arrive via a
/// `PlaceholderResolver` at fill time.
///
/// A unit struct used purely as a namespace: `fill`, `residual_pii_scan` and
/// `tokenize` are associated functions and take no `self`.
pub struct PlaceholderGrammar;
// --- Regex statics for the residual-PII scan and Phase-A tokenization ---
//
// Every pattern here is a fixed literal, so compilation can only fail if the
// literal itself is malformed, and the test suite exercises each of them.
// The statics sit in a private submodule so that a single module-level
// `#![allow]` covers the `unwrap` under the crate-level
// `#![deny(clippy::unwrap_used)]`, instead of one attribute per static.
mod scan_patterns {
    #![allow(clippy::unwrap_used)]
    use regex::Regex;
    use std::sync::LazyLock;

    /// Compiles one of this module's literal patterns; panics on a malformed
    /// literal (a programming error, not a runtime condition).
    fn compile(pattern: &str) -> Regex {
        Regex::new(pattern).unwrap()
    }

    // --- residual-PII scan patterns ---

    /// Patient record marker `G:dd.dd.dd` (optional whitespace after the
    /// colon). Its presence implies an un-stripped name.
    pub(super) static RE_PATIENT: LazyLock<Regex> =
        LazyLock::new(|| compile(r"G:\s*\d{2}\.\d{2}\.\d{2}"));

    /// Star record `*Lastname,Firstname`, anchored at the start of the string.
    /// Only the first letter of the first name is part of the match.
    pub(super) static RE_PERSON_STAR: LazyLock<Regex> =
        LazyLock::new(|| compile(r"^\*[A-ZÄÖÜ][\w\u{00C0}-\u{017F}.'\-]+\s*,\s*[A-ZÄÖÜ]"));

    /// Honorific or title abbreviation followed by `.` and whitespace.
    pub(super) static RE_TITLE: LazyLock<Regex> = LazyLock::new(|| {
        compile(r"\b(Prof|Dr|Dipl|Pfr|Pfarrer|Herr|Frau|Hr|Fr|med|iur|lic)\.\s")
    });

    /// `Initial. Surname`, e.g. `U. Frey`.
    pub(super) static RE_INITIAL_SURNAME: LazyLock<Regex> =
        LazyLock::new(|| compile(r"\b[A-ZÄÖÜ]\.\s*[A-ZÄÖÜ][a-zäöüß]{2,}\b"));

    /// `Surname Initial.`, e.g. `Frey U.` or `Mueller H.`. The initial's
    /// period must be followed by whitespace or end-of-string, which keeps
    /// legal-entity abbreviations such as `Europe B.V.` or `Suisse S.A.` from
    /// matching: there the period is followed by another capital letter.
    /// A pre-T16 corpus scan found ~90% of raw `surname_initial` hits were
    /// such legal-entity suffixes rather than person names.
    pub(super) static RE_SURNAME_INITIAL: LazyLock<Regex> =
        LazyLock::new(|| compile(r"\b[A-ZÄÖÜ][a-zäöüß]{2,}\s+[A-ZÄÖÜ]\.(?:\s|$)"));

    // --- tokenize Phase-A patterns ---

    /// `dd.mm.yy` triplet behind a `G:` / `E:` / `A:` marker in a patient
    /// record; capture group 1 is the marker letter.
    pub(super) static RE_GEA_DATE: LazyLock<Regex> =
        LazyLock::new(|| compile(r"([GEA]):\s*\d{2}\.\d{2}\.\d{2}"));

    /// Street address: a word ending in a street-type suffix, followed by a
    /// house number with an optional letter. The `(?i)` flag covers the whole
    /// pattern, so the leading character class also matches lowercase letters.
    pub(super) static RE_STREET: LazyLock<Regex> = LazyLock::new(|| {
        compile(r"(?i)\b[A-ZÄÖÜ][\w\u{00C0}-\u{017F}.\-]*(?:str\.|strasse|gasse|weg|platz)\s*\d+[A-Za-z]?\b")
    });

    /// Four-digit year 19xx / 20xx delimited by word boundaries.
    pub(super) static RE_YEAR: LazyLock<Regex> =
        LazyLock::new(|| compile(r"\b(?:19|20)\d{2}\b"));

    /// Quarter marker Q1–Q4, case-insensitive, delimited by word boundaries.
    pub(super) static RE_QUARTER: LazyLock<Regex> =
        LazyLock::new(|| compile(r"(?i)\bQ[1-4]\b"));

    /// Run of four or more digits.
    pub(super) static RE_DIGITS: LazyLock<Regex> = LazyLock::new(|| compile(r"\d{4,}"));

    /// Two or more whitespace-separated tokens that each start with an
    /// uppercase letter (Unicode-aware). This is only a *candidate* person-name
    /// span: it is confirmed when one token is a known given name (see
    /// `given_names`). German capitalises all nouns, so case alone cannot
    /// separate a surname from a common noun. The gazetteer is the anchor, and
    /// the whole run is redacted (safe over-redaction) rather than risk
    /// leaking the adjacent surname.
    pub(super) static RE_NAME_RUN: LazyLock<Regex> =
        LazyLock::new(|| compile(r"\p{Lu}[\p{L}.'\-]*(?:\s+\p{Lu}[\p{L}.'\-]*)+"));
}
/// Given-name gazetteer for Phase-A name tokenization. Generic given names
/// (country-pack union + Swiss/DE/FR/IT supplement) — NOT PII, a name
/// dictionary like city names. Used to anchor `Firstname Lastname` detection
/// that the initial/title/patient regexes miss.
mod given_names {
    use std::collections::HashSet;
    use std::sync::LazyLock;

    /// Normalize a token for gazetteer lookup.
    ///
    /// The token is lowercased first, then:
    /// - umlauts (`ä`, `ö`, `ü`) are dropped entirely, because the corpus text
    ///   has umlauts STRIPPED (e.g. `Jürg`→`Jrg`, `Rückstellung`→`Rckstellung`);
    /// - other accented Latin letters are folded to their base letter
    ///   (`Régis`→`regis`), and `ß` becomes `ss`.
    ///
    /// Gazetteer entries and corpus tokens both pass through this function, so
    /// a properly-spelled gazetteer entry matches the corpus's stripped form.
    ///
    /// Lowercasing happens *before* the accent table is consulted so that
    /// uppercase accented letters (`É`, `À`, `Ç`, `ẞ`, …) fold exactly like
    /// their lowercase counterparts: `Émile` and `Emile` both become `emile`.
    pub(super) fn normalize(s: &str) -> String {
        let mut out = String::with_capacity(s.len());
        for c in s.chars().flat_map(char::to_lowercase) {
            match c {
                // Umlauts: the corpus drops them entirely (Jürg→Jrg, Löhne→Lhne).
                'ä' | 'ö' | 'ü' => {}
                // Accented latin: map to the base letter (Régis→Regis).
                'é' | 'è' | 'ê' | 'ë' => out.push('e'),
                'à' | 'â' | 'á' => out.push('a'),
                'î' | 'ï' | 'í' | 'ì' => out.push('i'),
                'ô' | 'ó' | 'ò' => out.push('o'),
                'û' | 'ú' | 'ù' => out.push('u'),
                'ç' => out.push('c'),
                'ñ' => out.push('n'),
                'ß' => out.push_str("ss"),
                other => out.push(other),
            }
        }
        out
    }

    /// The normalized gazetteer, loaded once on first use from the bundled
    /// `given_names.txt`. Lines are trimmed; blank lines and lines starting
    /// with `#` are skipped; entries that normalize to the empty string are
    /// discarded.
    pub(super) static GIVEN_NAMES: LazyLock<HashSet<String>> = LazyLock::new(|| {
        include_str!("../../resources/given_names.txt")
            .lines()
            .map(str::trim)
            .filter(|l| !l.is_empty() && !l.starts_with('#'))
            .map(normalize)
            .filter(|n| !n.is_empty())
            .collect()
    });

    /// True if any sub-token of `run` is a known given name. Splits on
    /// whitespace AND intra-token separators (`-`, `/`, `.`, `,`, `_`) so
    /// compound/prefixed forms like `Hans-Rudolf` or `ESD-Roger` are matched
    /// part-by-part. Each part has non-alphabetic characters trimmed from both
    /// ends and is normalized (see `normalize`) before lookup.
    pub(super) fn run_has_given_name(run: &str) -> bool {
        run.split(|c: char| c.is_whitespace() || matches!(c, '-' | '/' | '.' | ',' | '_'))
            .any(|part| {
                let cleaned = part.trim_matches(|c: char| !c.is_alphabetic());
                !cleaned.is_empty() && GIVEN_NAMES.contains(&normalize(cleaned))
            })
    }
}
use scan_patterns::{
RE_DIGITS, RE_GEA_DATE, RE_INITIAL_SURNAME, RE_NAME_RUN, RE_PATIENT, RE_PERSON_STAR,
RE_QUARTER, RE_STREET, RE_SURNAME_INITIAL, RE_TITLE, RE_YEAR,
};
/// Month names (English + German, full names plus English three-letter
/// abbreviations), ordered longest-first by character count.
///
/// `replace_months` checks for non-alphabetic characters on both sides of a
/// candidate, so a short entry such as `Sep` can never match inside
/// `September`. The ordering is therefore a readability convention, not a
/// correctness requirement.
const MONTH_NAMES: &[&str] = &[
    // 9 characters
    "September",
    // 8 characters
    "Dezember",
    "November",
    "February",
    "December",
    // 7 characters
    "Februar",
    "January",
    "October",
    "Oktober",
    // 6 characters
    "Januar",
    "August",
    // 5 characters
    "Maerz",
    "April",
    "March",
    // 4 characters
    "März",
    "Juni",
    "Juli",
    "June",
    "July",
    // 3 characters
    "Mai",
    "May",
    "Jan",
    "Feb",
    "Mar",
    "Apr",
    "Jun",
    "Jul",
    "Aug",
    "Sep",
    "Oct",
    "Nov",
    "Dec",
];
impl PlaceholderGrammar {
/// Expands `template` into concrete text.
///
/// Literal text is copied through unchanged. Each `{…}` token is handled as
/// follows:
/// - PII tokens (`{patient}`, `{person}`, `{company}`, `{street}`) are
///   delegated to `resolver`;
/// - `{year}` becomes a year in 2018..=2024, `{quarter}` becomes `Q1`..`Q4`,
///   `{month}` an English month name, `{date}` a `yyyy-mm-dd` date with the
///   day in 1..=28, and `{digits}` a run of 4..=8 random decimal digits, all
///   drawn from `rng`;
/// - any other `{…}` token is emitted verbatim.
///
/// An opening `{` with no closing `}` ends processing: the remainder of the
/// template is copied as-is.
pub fn fill<R: rand::Rng>(
    template: &str,
    resolver: &mut dyn PlaceholderResolver,
    rng: &mut R,
) -> String {
    use rand::RngExt;

    const MONTHS: &[&str] = &[
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
    ];

    if template.is_empty() {
        return String::new();
    }

    let mut filled = String::with_capacity(template.len() + 16);
    let mut remaining = template;
    loop {
        let Some(open_at) = remaining.find('{') else {
            break;
        };
        let (literal, from_brace) = remaining.split_at(open_at);
        filled.push_str(literal);

        let Some(close_at) = from_brace.find('}') else {
            // Unbalanced brace: nothing left to interpret.
            filled.push_str(from_brace);
            return filled;
        };
        // `}` is one byte, so `close_at + 1` is the end of the token.
        let (placeholder, after) = from_brace.split_at(close_at + 1);
        remaining = after;

        match PiiPlaceholderKind::from_token(placeholder) {
            Some(kind) => filled.push_str(&resolver.resolve(kind, rng)),
            None => match placeholder {
                "{year}" => {
                    let year: u32 = rng.random_range(2018..=2024);
                    filled.push_str(&year.to_string());
                }
                "{quarter}" => {
                    let quarter: u32 = rng.random_range(1..=4);
                    filled.push_str(&format!("Q{quarter}"));
                }
                "{month}" => {
                    let pick = rng.random_range(0..MONTHS.len());
                    filled.push_str(MONTHS[pick]);
                }
                "{date}" => {
                    // Draw order (day, month, year) is part of the seeded
                    // output and must not change.
                    let day: u32 = rng.random_range(1..=28);
                    let month: u32 = rng.random_range(1..=12);
                    let year: u32 = rng.random_range(2018..=2024);
                    // ISO-style yyyy-mm-dd, so the output never matches
                    // RE_PATIENT's dot-separated `G:\s*\d{2}\.\d{2}\.\d{2}`.
                    filled.push_str(&format!("{year}-{month:02}-{day:02}"));
                }
                "{digits}" => {
                    let count = rng.random_range(4..=8);
                    filled.extend(
                        (0..count).map(|_| char::from(b'0' + rng.random_range(0u8..10))),
                    );
                }
                // Unknown token: keep it exactly as written.
                unknown => filled.push_str(unknown),
            },
        }
    }
    filled.push_str(remaining);
    filled
}
/// Checks `s` for shapes that indicate un-redacted PII.
///
/// Returns at most one hit per pattern, carrying the first matching
/// substring. An empty vector means the string is clean. The patterns detect
/// PII-bearing *shapes* and serve as a hard gate at extraction and in CI.
/// Templates whose PII spans are already placeholders (`{person}`,
/// `{patient}`, …) do not match them.
///
/// Hits appear in a fixed order: `patient_record`, `person_star`, `title`,
/// `initial_surname`, `surname_initial`, then `given_name`.
pub fn residual_pii_scan(s: &str) -> Vec<PiiHit> {
    let shape_checks: &[(&'static str, &Regex)] = &[
        ("patient_record", &RE_PATIENT),
        ("person_star", &RE_PERSON_STAR),
        ("title", &RE_TITLE),
        ("initial_surname", &RE_INITIAL_SURNAME),
        ("surname_initial", &RE_SURNAME_INITIAL),
    ];
    let mut found: Vec<PiiHit> = shape_checks
        .iter()
        .filter_map(|&(label, re)| {
            re.find(s).map(|m| PiiHit {
                pattern: label,
                matched: m.as_str().to_string(),
            })
        })
        .collect();

    // `Firstname Lastname` / `Lastname Firstname`: the first capitalised run
    // that contains a known given name. This catches the plain two-word names
    // the initial/title/patient regexes structurally miss (the SP6 leak class).
    let named_run = RE_NAME_RUN
        .find_iter(s)
        .map(|m| m.as_str())
        .find(|run| given_names::run_has_given_name(run));
    if let Some(run) = named_run {
        found.push(PiiHit {
            pattern: "given_name",
            matched: run.to_string(),
        });
    }
    found
}
/// Phase A — automated structural placeholder-ization. Raw corpus string to
/// a PII-safe template using only deterministic structural rules. Phase B
/// (curated denylist for fuzzy proper nouns) is applied by the extractor.
///
/// Rules, in order:
/// 1. Patient `G:`-record: strip everything up to the first `G:dd.dd.dd`
/// marker (the name region, regardless of any `G` in the name), prepend
/// `*{patient} `, then replace each `G:/E:/A:dd.dd.dd` with the
/// letter-preserving form `G:{date}` / `E:{date}` / `A:{date}`.
/// 2. `*Lastname,Firstname` star record: replace the matched span with
/// `*{person}`.
/// 3. Street address: replace with `{street}`.
/// 4. 4-digit years -> `{year}`; `Q1`-`Q4` -> `{quarter}`; month names ->
/// `{month}`; runs of >=4 digits -> `{digits}`.
pub fn tokenize(s: &str) -> String {
let t = s.trim();
if t.is_empty() {
return String::new();
}
// Rules 1 & 2 produce a *staged* prefix-handled string, then fall
// through to the common tail (street + name-run + structural) so a
// person name in the SUFFIX of a patient/star record is still caught
// — they previously `return`ed early, leaking e.g. a trailing
// `Firstname Lastname` after a `G:` patient marker.
let staged: String = if let Some(m) = RE_PATIENT.find(t) {
// Rule 1 — patient record. Slice off the name region up to the
// date marker; date-stamp the G:/E:/A: triplets.
let from_marker = &t[m.start()..];
let dated = RE_GEA_DATE.replace_all(from_marker, "$1:{date}");
format!("*{{patient}} {dated}").trim().to_string()
} else if let Some(m) = RE_PERSON_STAR.find(t) {
// Rule 2 — star person record. Replace the matched span with
// *{person}, keep any suffix, drop the dangling firstname fragment.
let mut out = String::with_capacity(t.len());
out.push_str("*{person}");
out.push_str(&t[m.end()..]);
trim_leading_name_fragment(&out)
} else {
t.to_string()
};
// Rule 3 — street address.
let staged = RE_STREET.replace_all(&staged, "{street}").into_owned();
// Rule 3.5 — `Firstname Lastname` / `Lastname Firstname` person names.
// A capitalised run anchored by a known given name collapses to
// `{person}`. German capitalises all nouns, so we cannot tell a
// surname from a description noun by case — redact the whole run
// (safe over-redaction) rather than leak the adjacent surname.
let staged = RE_NAME_RUN
.replace_all(&staged, |caps: ®ex::Captures| {
let run = &caps[0];
if given_names::run_has_given_name(run) {
"{person}".to_string()
} else {
run.to_string()
}
})
.into_owned();
// Rule 4 — structural / temporal.
let staged = RE_YEAR.replace_all(&staged, "{year}").into_owned();
let staged = RE_QUARTER.replace_all(&staged, "{quarter}").into_owned();
let staged = replace_months(&staged);
RE_DIGITS.replace_all(&staged, "{digits}").into_owned()
}
}
/// Substitutes `{month}` for every entry of `MONTH_NAMES` that occurs in `s`
/// as a standalone word, i.e. with no alphabetic character directly before or
/// after it. Matching is case-sensitive (month names are capitalised in this
/// corpus).
///
/// One pass is made per month name. Each pass builds a fresh string, so text
/// already replaced by `{month}` is never re-examined within that pass.
fn replace_months(s: &str) -> String {
    MONTH_NAMES.iter().fold(s.to_string(), |text, month| {
        let mut rebuilt = String::with_capacity(text.len());
        let mut cursor = 0;
        // `cursor` always sits on a char boundary: it only advances by whole
        // characters or by the byte length of a matched month name.
        while let Some(ch) = text[cursor..].chars().next() {
            if let Some(after) = text[cursor..].strip_prefix(*month) {
                let alpha_before = text[..cursor]
                    .chars()
                    .next_back()
                    .is_some_and(char::is_alphabetic);
                let alpha_after = after.chars().next().is_some_and(char::is_alphabetic);
                if !(alpha_before || alpha_after) {
                    rebuilt.push_str("{month}");
                    cursor += month.len();
                    continue;
                }
            }
            // Not a standalone month name here: copy one character through.
            rebuilt.push(ch);
            cursor += ch.len_utf8();
        }
        rebuilt
    })
}
/// Removes the leftover first-name fragment that directly follows a leading
/// `*{person}`.
///
/// `RE_PERSON_STAR` matches only `*Lastname,F`, so after substitution the
/// remainder of the first name (`irstname`) is still attached. This function
/// strips exactly ONE contiguous alphabetic run after the prefix, plus AT MOST
/// one following separator (a comma or a space). Later words are description
/// content and are kept. If anything remains it is re-attached after a single
/// space; otherwise the bare prefix is returned. Input that does not start
/// with `*{person}` is returned unchanged.
fn trim_leading_name_fragment(s: &str) -> String {
    const PREFIX: &str = "*{person}";

    let Some(tail) = s.strip_prefix(PREFIX) else {
        return s.to_string();
    };

    // The first-name completion: one unbroken alphabetic run.
    let after_name = tail.trim_start_matches(char::is_alphabetic);
    // A single separator may follow it; never more than one.
    let remainder = after_name
        .strip_prefix(',')
        .or_else(|| after_name.strip_prefix(' '))
        .unwrap_or(after_name);

    if remainder.is_empty() {
        PREFIX.to_string()
    } else {
        format!("{PREFIX} {remainder}")
    }
}
// Unit tests for the placeholder grammar: residual-PII scan, Phase-A
// tokenization, template filling, and the prior's key/serde helpers.
// All person names used below are fictional or generic given names.
#[cfg(test)]
mod tests {
use super::*;
use rand::SeedableRng;
/// A raw patient record containing a `G:dd.dd.dd` marker must produce a
/// `patient_record` hit.
#[test]
fn residual_scan_flags_patient_record() {
let hits = PlaceholderGrammar::residual_pii_scan("*Gambon,Laurin G:01.02.03 E:04.05.06");
assert!(
hits.iter().any(|h| h.pattern == "patient_record"),
"expected patient_record hit, got {hits:?}"
);
}
/// Each person-shaped pattern (star record, initial + surname, title,
/// surname + initial) is reported under its own label.
#[test]
fn residual_scan_flags_person_shapes() {
// star record
assert!(PlaceholderGrammar::residual_pii_scan("*Mueller,Hans")
.iter()
.any(|h| h.pattern == "person_star"));
// initial + surname
assert!(PlaceholderGrammar::residual_pii_scan("Forschung U. Frey")
.iter()
.any(|h| h.pattern == "initial_surname"));
// title
assert!(
PlaceholderGrammar::residual_pii_scan("Kontokorrent Prof. Dr. M. Buess")
.iter()
.any(|h| h.pattern == "title")
);
// surname + initial (e.g. "Mueller H." in a description)
assert!(
PlaceholderGrammar::residual_pii_scan("Konsultation Mueller H.")
.iter()
.any(|h| h.pattern == "surname_initial")
);
}
/// Already-placeholdered templates and fixed vocabulary must yield no hits.
#[test]
fn residual_scan_passes_clean_templates() {
for clean in [
"Rechnung {company}",
"Mieten {month}.{year}",
"ARIBA_ASN",
"Darlehen {person}",
"*{patient} G:{date} E:{date} A:{date}",
"Umbuchung Anlage",
] {
assert!(
PlaceholderGrammar::residual_pii_scan(clean).is_empty(),
"false positive on clean template: {clean:?}"
);
}
}
/// `surname_initial` must ignore legal-entity suffixes while still catching
/// a genuine trailing initial.
#[test]
fn residual_scan_excludes_legal_entity_suffixes() {
// Legal-entity abbreviations like B.V. / S.A. / S.r.l. have a
// capital letter immediately following the period. The
// `surname_initial` regex must NOT match these — they're
// corporate-entity markers, not person initials. (Pre-T16 corpus
// scan found ~90% of raw `surname_initial` hits were exactly
// this shape, e.g. `Acme S.A.`, `Globex Europe B.V.` — only the
// shape matters for this test, names are fictional.)
for legal in [
"Acme Europe B.V.",
"Globex Suisse S.A.",
"Initech S.A. Lugano",
"Switzerland S.A.",
] {
assert!(
PlaceholderGrammar::residual_pii_scan(legal)
.iter()
.all(|h| h.pattern != "surname_initial"),
"must not flag legal-entity suffix in: {legal:?}"
);
}
// But a legitimate surname+initial at end-of-string still matches.
assert!(
PlaceholderGrammar::residual_pii_scan("Patient consult Mueller H.")
.iter()
.any(|h| h.pattern == "surname_initial"),
"legitimate surname-initial at end-of-string must still match"
);
}
/// SP6 leak class: plain `Firstname Lastname` / `Lastname Firstname` —
/// the form the initial/title/patient regexes structurally miss and that
/// shipped 285 real names in the first bundle cut. Fictional surnames
/// (Mustermann/Beispiel) + real generic given names exercise the shape.
#[test]
fn residual_scan_flags_given_name_runs() {
for s in [
"Beratung Marc Mustermann", // desc + given + surname
"Erbschaft Anna Beispiel", // desc + given + surname
"Mustermann Thomas Guthaben", // surname-first + given
"Florian Beispiel, Verzugszinsen", // given + surname, comma
] {
let hits = PlaceholderGrammar::residual_pii_scan(s);
assert!(
hits.iter().any(|h| h.pattern == "given_name"),
"must flag given-name run in: {s:?} (got {hits:?})"
);
}
}
/// A capitalised run containing a given name collapses to `{person}` as a
/// whole, and the tokenized result scans clean.
#[test]
fn tokenize_collapses_person_name_runs() {
// German capitalises all nouns, so a contiguous capitalised run that
// contains a given name collapses whole — we cannot tell the leading
// description noun ("Beratung") from the surname by case, so we
// over-redact rather than risk leaking the surname.
assert_eq!(
PlaceholderGrammar::tokenize("Beratung Marc Mustermann"),
"{person}"
);
// A lowercase word / punctuation terminates the run, preserving the
// surrounding description.
assert_eq!(
PlaceholderGrammar::tokenize("Florian Beispiel, Verzugszinsen"),
"{person}, Verzugszinsen"
);
assert_eq!(
PlaceholderGrammar::tokenize("Kurt Beispiel/Miete Lager"),
"{person}/Miete Lager"
);
// No name leaks through after tokenization.
for s in ["Beratung Marc Mustermann", "Mustermann Thomas Guthaben"] {
assert!(
PlaceholderGrammar::residual_pii_scan(&PlaceholderGrammar::tokenize(s)).is_empty(),
"tokenized form of {s:?} must be PII-clean"
);
}
}
/// Regression: compound/prefixed given-name tokens and umlaut-stripped
/// corpus forms must still be recognized. The corpus drops umlauts
/// (`Jürg`→`Jrg`) and joins tokens with `-`/`/` (`Hans-Rudolf`,
/// `ESD-Roger`), which defeated a plain whitespace+lowercase lookup and
/// leaked names through the first regen passes.
#[test]
fn tokenize_handles_compound_and_umlaut_stripped_names() {
// Hyphenated compound given name (Hans + Rudolf both known).
assert_eq!(
PlaceholderGrammar::tokenize("Hans-Rudolf Beispiel"),
"{person}"
);
// Prefix-joined given name (ESD-Roger → Roger is known).
assert_eq!(
PlaceholderGrammar::tokenize("ESD-Roger Mustermann"),
"{person}"
);
// Umlaut-stripped corpus form: gazetteer has `Jürg`; corpus stores `Jrg`.
assert_eq!(PlaceholderGrammar::tokenize("Jrg Mustermann"), "{person}");
for s in [
"Hans-Rudolf Beispiel",
"ESD-Roger Mustermann",
"Jrg Mustermann",
] {
assert!(
PlaceholderGrammar::residual_pii_scan(&PlaceholderGrammar::tokenize(s)).is_empty(),
"compound/umlaut name leaked: {s:?}"
);
}
}
/// Regression: a person name in the SUFFIX of a patient/star record must
/// also be collapsed. The patient/star rules used to `return` early,
/// leaking a trailing `Firstname Lastname` (the JE_44 `Robert Hoe` regen
/// failure). Now they fall through to the name-run + structural tail.
#[test]
fn tokenize_name_in_patient_or_star_suffix_is_clean() {
for s in [
"*Muster,A G:01.02.03 E:04.05.06 Thomas Beispiel",
"*Muster,Anna Beratung Marc Mustermann",
] {
let tok = PlaceholderGrammar::tokenize(s);
assert!(
PlaceholderGrammar::residual_pii_scan(&tok).is_empty(),
"suffix name leaked: {s:?} -> {tok:?}"
);
}
}
/// Capitalised runs without a given name are neither flagged as
/// `given_name` nor rewritten by `tokenize`.
#[test]
fn name_detection_no_false_positives() {
// Real bank names (kept by design), generic accounting terms, and
// already-placeholdered text must NOT be flagged or rewritten.
for clean in [
"Deutsche Bank",
"Kontokorrent {company} AG",
"Material Werkzeuge Werkstoffe",
"Goldman Sachs",
"Standard Chartered",
] {
assert!(
PlaceholderGrammar::residual_pii_scan(clean)
.iter()
.all(|h| h.pattern != "given_name"),
"false-positive given_name on: {clean:?}"
);
assert_eq!(
PlaceholderGrammar::tokenize(clean),
clean,
"tokenize must not rewrite clean text: {clean:?}"
);
}
}
/// `token` and `from_token` are inverses for every kind; structural and
/// unknown tokens map to `None`.
#[test]
fn pii_placeholder_kind_token_roundtrip() {
for kind in [
PiiPlaceholderKind::Patient,
PiiPlaceholderKind::Person,
PiiPlaceholderKind::Company,
PiiPlaceholderKind::Street,
] {
assert_eq!(PiiPlaceholderKind::from_token(kind.token()), Some(kind));
}
assert_eq!(PiiPlaceholderKind::from_token("{year}"), None);
assert_eq!(PiiPlaceholderKind::from_token("{unknown}"), None);
}
/// `line_key` joins source and class with `|`, including for the
/// unknown-class sentinel.
#[test]
fn line_key_format() {
assert_eq!(TextTaxonomyPrior::line_key("KR", "A.B"), "KR|A.B");
assert_eq!(
TextTaxonomyPrior::line_key("RE", TextTaxonomyPrior::UNKNOWN_CLASS),
"RE|_unknown_"
);
}
/// Every kind resolves to a value starting with `Example`. The RNG is
/// unseeded here, which is fine because the resolver ignores it.
#[test]
fn synthetic_example_resolver_emits_obvious_fakes() {
let mut r = SyntheticExampleResolver;
let mut rng = rand::rng();
for kind in [
PiiPlaceholderKind::Patient,
PiiPlaceholderKind::Person,
PiiPlaceholderKind::Company,
PiiPlaceholderKind::Street,
] {
let v = r.resolve(kind, &mut rng);
assert!(v.starts_with("Example"), "expected obvious fake, got {v}");
}
}
/// The patient rule must strip the whole name region up to the `G:` date
/// marker, even when the name itself contains a `G`.
#[test]
fn tokenize_patient_record_strips_name_even_with_g_in_it() {
// The name "Gambon" contains a G — the strip must consume it. A naive
// [^G]*? class cannot, and would leak the name. This is the bug the
// first-pass cleaning sweep found.
assert_eq!(
PlaceholderGrammar::tokenize("*Gambon,Laurin G:01.02.03 E:04.05.06 A:07.08.09"),
"*{patient} G:{date} E:{date} A:{date}"
);
assert_eq!(
PlaceholderGrammar::tokenize("*Rykart,Frank G G:11.12.13"),
"*{patient} G:{date}"
);
}
/// Star records collapse to `*{person}`, with any trailing description
/// kept after a single space.
#[test]
fn tokenize_person_star_record() {
assert_eq!(PlaceholderGrammar::tokenize("*Mueller,Hans"), "*{person}");
// Pin behavior on `*Lastname,Firstname<rest>` shapes: after the
// firstname fragment is trimmed, a single space separates {person}
// from the remainder. Short digit runs (< 4) stay verbatim.
assert_eq!(
PlaceholderGrammar::tokenize("*Mueller,Hans Ref-123"),
"*{person} Ref-123"
);
}
/// A street name plus house number becomes `{street}`; the preceding text
/// is preserved.
#[test]
fn tokenize_street_address() {
assert_eq!(
PlaceholderGrammar::tokenize("LUKB Mietzinskaution Roentgenpraxis, Spitalstrasse 5"),
"LUKB Mietzinskaution Roentgenpraxis, {street}"
);
}
/// Years, quarters, month names and long digit runs become their
/// structural placeholders; short digit runs are left alone.
#[test]
fn tokenize_structural_temporal() {
assert_eq!(
PlaceholderGrammar::tokenize("Mieten 04.2021"),
"Mieten 04.{year}"
);
assert_eq!(
PlaceholderGrammar::tokenize("Sales Accrual Q1"),
"Sales Accrual {quarter}"
);
assert_eq!(
PlaceholderGrammar::tokenize("January accrual"),
"{month} accrual"
);
assert_eq!(PlaceholderGrammar::tokenize("INV 1234567"), "INV {digits}");
assert_eq!(PlaceholderGrammar::tokenize("GL 470"), "GL 470"); // short run kept
}
/// Fixed-vocabulary strings with no PII or structural spans pass through
/// `tokenize` unchanged.
#[test]
fn tokenize_fixed_vocab_unchanged() {
assert_eq!(PlaceholderGrammar::tokenize("ARIBA_ASN"), "ARIBA_ASN");
assert_eq!(
PlaceholderGrammar::tokenize("CH Post: KUREPO Intercomp"),
"CH Post: KUREPO Intercomp"
);
}
/// End-to-end gate: tokenizing raw structural PII must leave nothing for the
/// residual scan to find.
#[test]
fn tokenize_then_scan_is_clean() {
// Every Phase-A-tokenized string with structural PII must scan clean.
for raw in [
"*Gambon,Laurin G:01.02.03 E:04.05.06 A:07.08.09",
"*Mueller,Hans",
"LUKB Spitalstrasse 5",
] {
let tok = PlaceholderGrammar::tokenize(raw);
assert!(
PlaceholderGrammar::residual_pii_scan(&tok).is_empty(),
"tokenize left residual PII: {raw:?} -> {tok:?}"
);
}
}
/// `fill` must replace every structural placeholder (no `{` left) and keep
/// the literal prefix.
#[test]
fn fill_structural_placeholders() {
let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(7);
let mut resolver = SyntheticExampleResolver;
let out = PlaceholderGrammar::fill(
"Mieten {month}.{year} ref {digits} {quarter}",
&mut resolver,
&mut rng,
);
assert!(
!out.contains('{'),
"structural placeholders left unfilled: {out}"
);
assert!(out.starts_with("Mieten "));
}
/// PII placeholders are filled with whatever the resolver returns.
#[test]
fn fill_pii_placeholders_via_resolver() {
let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(7);
let mut resolver = SyntheticExampleResolver;
let out =
PlaceholderGrammar::fill("Rechnung {company} / {person}", &mut resolver, &mut rng);
assert_eq!(out, "Rechnung Example GmbH / Example Person");
}
/// An unrecognised `{…}` token is emitted verbatim.
#[test]
fn fill_unknown_placeholder_kept_literal() {
let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(7);
let mut resolver = SyntheticExampleResolver;
let out = PlaceholderGrammar::fill("foo {bogus} bar", &mut resolver, &mut rng);
assert_eq!(out, "foo {bogus} bar");
}
/// Text produced by `fill` with the synthetic resolver must not itself look
/// like PII to the residual scan.
#[test]
fn fill_then_scan_clean() {
let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(7);
let mut resolver = SyntheticExampleResolver;
for tmpl in ["Darlehen {person}", "*{patient} G:{date}", "{company} AG"] {
let out = PlaceholderGrammar::fill(tmpl, &mut resolver, &mut rng);
assert!(
PlaceholderGrammar::residual_pii_scan(&out).is_empty(),
"fill produced residual-PII shape: {tmpl:?} -> {out:?}"
);
}
}
/// A populated prior survives a YAML serialize/deserialize round trip
/// unchanged.
#[test]
fn prior_serde_roundtrip() {
let mut prior = TextTaxonomyPrior::default();
prior.line_pools.insert(
TextTaxonomyPrior::line_key("KR", "A.B"),
TemplatePool {
templates: vec![TemplateEntry {
template: "Rechnung {company}".to_string(),
probability: 1.0,
synthetic_example: "Rechnung Example GmbH".to_string(),
}],
n: 42,
},
);
prior.meta.class_tier = "iso21378_l2".to_string();
let yaml = serde_yaml::to_string(&prior).expect("serialize");
let back: TextTaxonomyPrior = serde_yaml::from_str(&yaml).expect("deserialize");
assert_eq!(prior, back);
}
}