use std::collections::BTreeMap;
/// Upper bound on the number of distinct templates kept in a single template
/// pool. Pools are truncated to this many entries after being ranked by
/// frequency, and the value is also recorded in the extracted prior's metadata
/// as `max_templates_per_pool`.
pub const MAX_TEXT_TEMPLATES_PER_SOURCE: usize = 50;
use datasynth_core::distributions::behavioral_priors::CoaSemanticPrior;
use datasynth_core::distributions::text_taxonomy::{
PlaceholderGrammar, SyntheticExampleResolver, TaxonomyMeta, TemplateEntry, TemplatePool,
TextTaxonomyPrior,
};
use crate::extraction::pii_denylist::PiiDenylist;
/// One borrowed input row for text-taxonomy extraction.
///
/// All fields borrow from caller-owned data for the lifetime `'a`; the
/// extraction functions in this file copy whatever they keep.
#[derive(Debug, Clone)]
pub struct TextTaxonomyRecord<'a> {
/// Source identifier used to group texts. Records whose `source` is empty
/// are skipped entirely by the extractor.
pub source: &'a str,
/// Optional account class. When absent, line texts are grouped under
/// `TextTaxonomyPrior::UNKNOWN_CLASS`.
pub account_class: Option<&'a str>,
/// Optional header text; it is trimmed and ignored when blank. Header texts
/// are grouped by `source` only.
pub header_text: Option<&'a str>,
/// Optional line text; it is trimmed and ignored when blank. Line texts are
/// grouped by the `(source, account class)` line key.
pub line_text: Option<&'a str>,
/// Optional chart-of-accounts account identifier. It is only used when
/// `coa_description` is present as well.
pub coa_account: Option<&'a str>,
/// Optional chart-of-accounts description for `coa_account`. The first
/// record carrying a non-blank description for a given account determines
/// that account's template.
pub coa_description: Option<&'a str>,
}
/// Panicking convenience wrapper around [`extract_text_taxonomy_checked`].
///
/// Forwards `records`, `min_occurrences` and `denylist` unchanged and unwraps
/// the result.
///
/// # Panics
///
/// Panics with the message `residual PII in extracted templates` (followed by
/// the error's debug representation) when the checked variant returns an
/// error. Call [`extract_text_taxonomy_checked`] directly to handle that case
/// without panicking.
pub fn extract_text_taxonomy(
    records: &[TextTaxonomyRecord<'_>],
    min_occurrences: usize,
    denylist: Option<&PiiDenylist>,
) -> TextTaxonomyPrior {
    let outcome = extract_text_taxonomy_checked(records, min_occurrences, denylist);
    outcome.expect("residual PII in extracted templates")
}
/// Builds a [`TextTaxonomyPrior`] from `records`, reporting residual PII as an
/// error instead of panicking.
///
/// Every non-blank text is trimmed, run through `PlaceholderGrammar::tokenize`
/// and, when `denylist` is supplied, through `PiiDenylist::apply`. The
/// resulting templates are collected as follows:
///
/// * **Line texts** are grouped by `TextTaxonomyPrior::line_key(source, class)`,
///   with `TextTaxonomyPrior::UNKNOWN_CLASS` standing in for a missing account
///   class.
/// * **Header texts** are grouped by `source`.
/// * **CoA descriptions** are keyed by account; the first non-blank description
///   seen for an account is the one that is tokenized. Accounts whose
///   description tokenizes to an empty string are omitted.
///
/// Records with an empty `source` are skipped entirely. Line and header groups
/// are turned into pools by `build_taxonomy_pools`, which applies
/// `min_occurrences` and the `MAX_TEXT_TEMPLATES_PER_SOURCE` cap.
///
/// # Errors
///
/// Returns `FingerprintError::PiiDenylist` when
/// `PlaceholderGrammar::residual_pii_scan` reports hits for a retained line or
/// header template, or for a CoA template.
pub fn extract_text_taxonomy_checked(
    records: &[TextTaxonomyRecord<'_>],
    min_occurrences: usize,
    denylist: Option<&PiiDenylist>,
) -> Result<TextTaxonomyPrior, crate::FingerprintError> {
    // Placeholder tokenization first, then the optional denylist pass.
    let tokenize = |s: &str| -> String {
        let a = PlaceholderGrammar::tokenize(s);
        match denylist {
            Some(dl) => dl.apply(&a),
            None => a,
        }
    };

    let mut line_groups: BTreeMap<String, Vec<String>> = BTreeMap::new();
    let mut header_groups: BTreeMap<String, Vec<String>> = BTreeMap::new();
    let mut coa_raw: BTreeMap<String, String> = BTreeMap::new();

    for r in records {
        if r.source.is_empty() {
            continue;
        }
        if let Some(lt) = r.line_text {
            let t = lt.trim();
            if !t.is_empty() {
                let class = r.account_class.unwrap_or(TextTaxonomyPrior::UNKNOWN_CLASS);
                line_groups
                    .entry(TextTaxonomyPrior::line_key(r.source, class))
                    .or_default()
                    .push(tokenize(t));
            }
        }
        if let Some(ht) = r.header_text {
            let t = ht.trim();
            if !t.is_empty() {
                header_groups
                    .entry(r.source.to_string())
                    .or_default()
                    .push(tokenize(t));
            }
        }
        if let (Some(acct), Some(desc)) = (r.coa_account, r.coa_description) {
            let d = desc.trim();
            if !d.is_empty() {
                // First non-blank description per account wins.
                coa_raw
                    .entry(acct.to_string())
                    .or_insert_with(|| tokenize(d));
            }
        }
    }

    let line_pools = build_taxonomy_pools(line_groups, min_occurrences)?;
    let header_pools = build_taxonomy_pools(header_groups, min_occurrences)?;

    let mut coa_pools: BTreeMap<String, TemplateEntry> = BTreeMap::new();
    for (acct, template) in coa_raw {
        // Tokenization can leave nothing behind. Skip such accounts rather than
        // emit an empty template with probability 1.0, matching how
        // `build_taxonomy_pools` and `extract_text_taxonomy_from_records` treat
        // empty templates.
        if template.is_empty() {
            continue;
        }
        let hits = PlaceholderGrammar::residual_pii_scan(&template);
        if !hits.is_empty() {
            return Err(crate::FingerprintError::PiiDenylist(format!(
                "residual PII in CoA template for account {acct}: {hits:?}"
            )));
        }
        coa_pools.insert(acct, make_template_entry(template, 1.0));
    }

    Ok(TextTaxonomyPrior {
        line_pools,
        header_pools,
        coa_pools,
        meta: TaxonomyMeta {
            min_occurrences,
            max_templates_per_pool: MAX_TEXT_TEMPLATES_PER_SOURCE,
            class_tier: "iso21378_l2".to_string(),
            n_client_inputs: 1,
        },
    })
}
/// Collapses each group's tokenized texts into a frequency-ranked
/// [`TemplatePool`].
///
/// For every `(key, texts)` group:
///
/// 1. empty texts are dropped and identical texts are counted;
/// 2. texts seen fewer than `min_occurrences` times are discarded;
/// 3. the survivors are ordered by descending count (the sort is stable, so
///    ties keep the ascending text order of the counting map) and cut to
///    `MAX_TEXT_TEMPLATES_PER_SOURCE`;
/// 4. each survivor's probability is its count divided by the summed count of
///    all survivors.
///
/// Groups without survivors produce no pool. `TemplatePool::n` records the
/// group's size before any filtering.
///
/// # Errors
///
/// Returns `FingerprintError::PiiDenylist` for the first surviving template
/// for which `PlaceholderGrammar::residual_pii_scan` reports hits.
fn build_taxonomy_pools(
    groups: BTreeMap<String, Vec<String>>,
    min_occurrences: usize,
) -> Result<BTreeMap<String, TemplatePool>, crate::FingerprintError> {
    let mut pools = BTreeMap::new();

    for (pool_key, observed) in groups {
        let observed_len = observed.len();
        if observed_len == 0 {
            continue;
        }

        // Frequency table over the non-empty texts of this group.
        let mut tally: BTreeMap<String, usize> = BTreeMap::new();
        for text in observed.into_iter().filter(|text| !text.is_empty()) {
            *tally.entry(text).or_default() += 1;
        }

        // Keep only texts that are frequent enough.
        let mut ranked: Vec<(String, usize)> = Vec::new();
        for (text, seen) in tally {
            if seen >= min_occurrences {
                ranked.push((text, seen));
            }
        }
        if ranked.is_empty() {
            continue;
        }

        // Most frequent first; `sort_by` is stable, so equal counts stay in
        // the map's text order.
        ranked.sort_by(|lhs, rhs| rhs.1.cmp(&lhs.1));
        ranked.truncate(MAX_TEXT_TEMPLATES_PER_SOURCE);

        // Probabilities are normalized over the retained templates only.
        let kept_mass = ranked.iter().map(|(_, seen)| *seen).sum::<usize>() as f64;

        let templates = ranked
            .into_iter()
            .map(|(text, seen)| {
                let hits = PlaceholderGrammar::residual_pii_scan(&text);
                if !hits.is_empty() {
                    return Err(crate::FingerprintError::PiiDenylist(format!(
                        "residual PII in template for pool {pool_key}: {hits:?}"
                    )));
                }
                Ok(make_template_entry(text, seen as f64 / kept_mass))
            })
            .collect::<Result<Vec<_>, crate::FingerprintError>>()?;

        pools.insert(
            pool_key,
            TemplatePool {
                templates,
                n: observed_len,
            },
        );
    }

    Ok(pools)
}
/// Wraps `template` and `probability` in a [`TemplateEntry`], attaching a
/// synthetic example generated from the template.
///
/// The example comes from `PlaceholderGrammar::fill`, driven by a ChaCha8 RNG
/// whose seed is a simple multiplicative hash of the template's bytes, so a
/// given template always produces the same seed.
fn make_template_entry(template: String, probability: f64) -> TemplateEntry {
    use rand::SeedableRng;

    // Hash the bytes: start at 0x5036, then `seed * 31 + byte`, wrapping.
    let mut seed = 0x5036_u64;
    for byte in template.bytes() {
        seed = seed.wrapping_mul(31).wrapping_add(u64::from(byte));
    }

    let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed);
    let mut resolver = SyntheticExampleResolver;
    let synthetic_example = PlaceholderGrammar::fill(&template, &mut resolver, &mut rng);

    TemplateEntry {
        template,
        probability,
        synthetic_example,
    }
}
/// Extracts a [`TextTaxonomyPrior`] from behavioral-fidelity records.
///
/// Each record is adapted into a [`TextTaxonomyRecord`]:
///
/// * the account class is looked up in `coa_prior` by the record's GL account
///   (no class when there is no prior, no such account, or no class on it);
/// * empty header and line texts are passed on as `None`;
/// * no CoA fields are set on the adapted records.
///
/// The adapted records go through [`extract_text_taxonomy_checked`]. After
/// that, when `coa_prior` is present, every account with a non-empty
/// description gets a CoA template: the trimmed description is tokenized, run
/// through `denylist` when supplied, skipped when the result is empty, and
/// inserted into `coa_pools` with probability `1.0`.
///
/// # Errors
///
/// Propagates any error from [`extract_text_taxonomy_checked`], and returns
/// `FingerprintError::PiiDenylist` when `PlaceholderGrammar::residual_pii_scan`
/// reports hits for a CoA template.
pub fn extract_text_taxonomy_from_records(
    records: &[datasynth_eval::behavioral_fidelity::Record],
    coa_prior: Option<&CoaSemanticPrior>,
    denylist: Option<&PiiDenylist>,
    min_occurrences: usize,
) -> Result<TextTaxonomyPrior, crate::FingerprintError> {
    let mut adapted: Vec<TextTaxonomyRecord<'_>> = Vec::with_capacity(records.len());
    for rec in records {
        // Resolve the account class through the CoA prior, if there is one.
        let account_class = match coa_prior {
            Some(coa) => coa
                .accounts
                .get(rec.gl_account.as_str())
                .and_then(|semantic| semantic.account_class.as_deref()),
            None => None,
        };
        adapted.push(TextTaxonomyRecord {
            source: rec.source.as_str(),
            account_class,
            header_text: Some(rec.header_text.as_str()).filter(|text| !text.is_empty()),
            line_text: Some(rec.line_text.as_str()).filter(|text| !text.is_empty()),
            coa_account: None,
            coa_description: None,
        });
    }

    let mut prior = extract_text_taxonomy_checked(&adapted, min_occurrences, denylist)?;

    // Without a CoA prior there are no descriptions to add.
    let coa = match coa_prior {
        Some(coa) => coa,
        None => return Ok(prior),
    };

    for (acct, sem) in &coa.accounts {
        if sem.description.is_empty() {
            continue;
        }

        // Placeholder tokenization first, then the optional denylist pass.
        let tokenized = PlaceholderGrammar::tokenize(sem.description.trim());
        let template = match denylist {
            Some(dl) => dl.apply(&tokenized),
            None => tokenized,
        };
        if template.is_empty() {
            continue;
        }

        let hits = PlaceholderGrammar::residual_pii_scan(&template);
        if !hits.is_empty() {
            return Err(crate::FingerprintError::PiiDenylist(format!(
                "residual PII in CoA template for account {acct}: {hits:?}"
            )));
        }

        let entry = make_template_entry(template, 1.0);
        prior.coa_pools.insert(acct.clone(), entry);
    }

    Ok(prior)
}
#[cfg(test)]
mod tests {
    use super::*;
    use datasynth_core::distributions::text_taxonomy::TextTaxonomyPrior;

    /// Builds a record that carries only a line text: no header text and no
    /// CoA fields.
    fn line_only<'a>(
        source: &'a str,
        account_class: Option<&'a str>,
        line_text: &'a str,
    ) -> TextTaxonomyRecord<'a> {
        TextTaxonomyRecord {
            source,
            account_class,
            header_text: None,
            line_text: Some(line_text),
            coa_account: None,
            coa_description: None,
        }
    }

    /// Lines with and without an account class land in separate pools, the
    /// latter under `UNKNOWN_CLASS`.
    #[test]
    fn extract_text_taxonomy_groups_lines_by_source_class() {
        let mut records = vec![line_only("KR", Some("A.B"), "Rechnung Eingang"); 12];
        records.extend(vec![line_only("KR", None, "Diverse Buchung"); 12]);

        let prior = extract_text_taxonomy(&records, 10, None);

        let classified = TextTaxonomyPrior::line_key("KR", "A.B");
        let unclassified = TextTaxonomyPrior::line_key("KR", TextTaxonomyPrior::UNKNOWN_CLASS);
        assert!(prior.line_pools.contains_key(&classified));
        assert!(prior.line_pools.contains_key(&unclassified));

        let ab = &prior.line_pools[&classified];
        assert_eq!(ab.templates.len(), 1);
        assert_eq!(ab.templates[0].template, "Rechnung Eingang");
    }

    /// A year in the line text becomes a `{year}` placeholder, and the
    /// synthetic example differs from the original text.
    #[test]
    fn extract_text_taxonomy_synthetic_example_not_verbatim() {
        // This first extraction only has to complete without panicking; its
        // result is discarded.
        let records = vec![line_only("KR", Some("A.B"), "Darlehen Schauer"); 15];
        let _ = extract_text_taxonomy(&records, 10, None);

        let clean = vec![line_only("RE", Some("R.A"), "Mieten 04.2021"); 15];
        let prior2 = extract_text_taxonomy(&clean, 10, None);

        let pool = &prior2.line_pools[&TextTaxonomyPrior::line_key("RE", "R.A")];
        let first = &pool.templates[0];
        assert_eq!(first.template, "Mieten 04.{year}");
        assert_ne!(first.synthetic_example, "Mieten 04.2021");
    }

    /// A line containing a personal title and name must be rejected by the
    /// checked extractor.
    #[test]
    fn extract_text_taxonomy_hard_fails_on_residual_pii() {
        let records = vec![line_only("SA", Some("X.X"), "Kontokorrent Prof. Dr. M. Buess"); 15];
        let result = extract_text_taxonomy_checked(&records, 10, None);
        assert!(result.is_err(), "title shape must hard-fail the scan gate");
    }

    /// Builds a behavioral-fidelity record with fixed dates, amount and
    /// journal identifiers, an empty header text, and the given source, GL
    /// account and line text.
    fn make_test_record(
        source: &str,
        gl_account: &str,
        line_text: &str,
    ) -> datasynth_eval::behavioral_fidelity::Record {
        let jan_first = chrono::NaiveDate::from_ymd_opt(2024, 1, 1).unwrap();
        datasynth_eval::behavioral_fidelity::Record {
            source: source.to_string(),
            gl_account: gl_account.to_string(),
            cost_center: None,
            profit_center: None,
            trading_partner: None,
            je_number: "JE001".to_string(),
            je_line_number: "1".to_string(),
            effective_date: jan_first,
            entry_date: jan_first,
            created_at: None,
            functional_amount: 100.0,
            header_text: String::new(),
            line_text: line_text.to_string(),
        }
    }

    /// The account class comes from the CoA prior, and the account's
    /// description yields a CoA template.
    #[test]
    fn extract_text_taxonomy_from_records_resolves_class_via_coa() {
        use datasynth_core::distributions::behavioral_priors::{AccountSemantic, CoaSemanticPrior};

        let semantic = AccountSemantic {
            description: "Kreditoren".to_string(),
            account_class: Some("L.2".to_string()),
            ..Default::default()
        };
        let mut coa = CoaSemanticPrior::default();
        coa.accounts.insert("0000204000".to_string(), semantic);

        let mut records = Vec::with_capacity(12);
        for _ in 0..12 {
            records.push(make_test_record("KR", "0000204000", "Rechnung Eingang"));
        }

        let prior = extract_text_taxonomy_from_records(&records, Some(&coa), None, 10)
            .expect("extraction ok");

        let expected_pool = TextTaxonomyPrior::line_key("KR", "L.2");
        assert!(
            prior.line_pools.contains_key(&expected_pool),
            "expected KR|L.2 pool; got keys: {:?}",
            prior.line_pools.keys().collect::<Vec<_>>()
        );
        assert!(
            prior.coa_pools.contains_key("0000204000"),
            "expected coa_pools[0000204000]; got keys: {:?}",
            prior.coa_pools.keys().collect::<Vec<_>>()
        );
    }
}