use std::collections::BTreeMap;
use rand::RngExt;
use datasynth_core::distributions::behavioral_priors::{ReferenceFormatPrior, ReferenceTemplate};
use datasynth_eval::behavioral_fidelity::Record;
/// Upper bound on the number of templates kept for a single source.
///
/// `extract_reference_formats` truncates each source's frequency-sorted template
/// list to this length before computing probabilities.
pub const MAX_TEMPLATES_PER_SOURCE: usize = 10;
/// Derives, per record source, the most frequent reference-number formats.
///
/// Records whose `source` or `je_number` is empty are ignored. Every remaining
/// `je_number` is reduced to a template with [`tokenize_reference`] and the
/// templates are counted per source.
///
/// For each source:
/// - templates seen fewer than `min_occurrences` times are discarded;
/// - the survivors are ordered by descending count (ties stay in ascending
///   template order) and cut to [`MAX_TEMPLATES_PER_SOURCE`] entries;
/// - each retained template's probability is its count divided by the summed
///   count of the retained templates, so the probabilities of a source add up
///   to 1;
/// - `example` is the first reference, in input order, that produced the
///   template.
///
/// Sources without any surviving template do not appear in the result. An
/// empty `records` slice yields `ReferenceFormatPrior::default()`.
pub fn extract_reference_formats(
    records: &[Record],
    min_occurrences: usize,
) -> ReferenceFormatPrior {
    if records.is_empty() {
        return ReferenceFormatPrior::default();
    }

    // Group by borrowed keys/values: nothing is copied out of `records` until
    // a template is known to be part of the final result.
    let mut by_source: BTreeMap<&str, Vec<&str>> = BTreeMap::new();
    for record in records {
        if record.source.is_empty() || record.je_number.is_empty() {
            continue;
        }
        by_source
            .entry(record.source.as_str())
            .or_default()
            .push(record.je_number.as_str());
    }

    let mut result: BTreeMap<String, Vec<ReferenceTemplate>> = BTreeMap::new();
    for (source, refs) in by_source {
        // template -> (occurrences, first reference that produced it)
        let mut template_counts: BTreeMap<String, (usize, &str)> = BTreeMap::new();
        for reference in refs {
            let tmpl = tokenize_reference(reference);
            if tmpl.is_empty() {
                continue;
            }
            template_counts.entry(tmpl).or_insert((0, reference)).0 += 1;
        }

        let mut passing: Vec<(String, usize, &str)> = template_counts
            .into_iter()
            .filter(|(_, (count, _))| *count >= min_occurrences)
            .map(|(tmpl, (count, example))| (tmpl, count, example))
            .collect();
        if passing.is_empty() {
            continue;
        }

        // Stable sort: equal counts keep the map's ascending template order,
        // which makes the truncation below deterministic.
        passing.sort_by_key(|item| std::cmp::Reverse(item.1));
        passing.truncate(MAX_TEMPLATES_PER_SOURCE);

        // `passing` is non-empty and every count is at least 1, so the
        // denominator is never zero.
        let retained_total: usize = passing.iter().map(|(_, count, _)| *count).sum();
        let templates: Vec<ReferenceTemplate> = passing
            .into_iter()
            .map(|(template, count, example)| ReferenceTemplate {
                template,
                probability: count as f64 / retained_total as f64,
                example: example.to_string(),
            })
            .collect();
        result.insert(source.to_string(), templates);
    }

    ReferenceFormatPrior { by_source: result }
}
/// Turns a concrete reference string into its format template.
///
/// Each maximal run of ASCII digits is replaced by `{N digits}` and each
/// maximal run of ASCII letters by `{N alpha}`, where `N` is the length of the
/// run. Every other character (separators, non-ASCII text, ...) is copied
/// through unchanged. An empty input produces an empty template.
pub fn tokenize_reference(s: &str) -> String {
    let mut template = String::with_capacity(s.len() * 2);
    let mut remaining = s;

    while let Some(first) = remaining.chars().next() {
        let digit_run = first.is_ascii_digit();
        if !digit_run && !first.is_ascii_alphabetic() {
            // Fixed character: keep it and step over its full UTF-8 width.
            template.push(first);
            remaining = &remaining[first.len_utf8()..];
            continue;
        }

        // ASCII digits and letters are single bytes, so the byte count of the
        // run is also its character count and a valid slice boundary.
        let run_len = remaining
            .bytes()
            .take_while(|b| {
                if digit_run {
                    b.is_ascii_digit()
                } else {
                    b.is_ascii_alphabetic()
                }
            })
            .count();
        let label = if digit_run { "digits" } else { "alpha" };
        template.push_str(&format!("{{{run_len} {label}}}"));
        remaining = &remaining[run_len..];
    }

    template
}
/// Generates a random reference string that matches `template`.
///
/// Every `{N digits}` placeholder is replaced by `N` random characters from
/// `0`-`9` and every `{N alpha}` placeholder by `N` random uppercase letters
/// `A`-`Z`; one value is drawn from `rng` per generated character, in
/// left-to-right order.
///
/// All other text is copied verbatim. That includes brace groups that are not
/// recognised placeholders (emitted unchanged, braces included), an opening
/// `{` with no matching `}`, and non-ASCII characters, which are preserved as
/// whole UTF-8 sequences.
pub fn fill_template<R: rand::Rng>(template: &str, rng: &mut R) -> String {
    let mut result = String::with_capacity(template.len());
    let mut rest = template;

    while let Some(open) = rest.find('{') {
        // Literal text is copied as a string slice rather than byte by byte,
        // so multi-byte characters are never split into separate chars.
        result.push_str(&rest[..open]);

        let after_open = &rest[open + 1..];
        let close = match after_open.find('}') {
            Some(close) => close,
            None => {
                // No closing brace anywhere ahead: the remainder cannot
                // contain a placeholder, so it is all literal text.
                result.push_str(&rest[open..]);
                return result;
            }
        };
        let inner = &after_open[..close];
        rest = &after_open[close + 1..];

        match parse_placeholder(inner) {
            Some((count, PlaceholderKind::Digits)) => {
                for _ in 0..count {
                    result.push(char::from(b'0' + rng.random_range(0u8..10)));
                }
            }
            Some((count, PlaceholderKind::Alpha)) => {
                for _ in 0..count {
                    result.push(char::from(b'A' + rng.random_range(0u8..26)));
                }
            }
            None => {
                // Not a placeholder: reproduce the brace group untouched.
                result.push('{');
                result.push_str(inner);
                result.push('}');
            }
        }
    }

    result.push_str(rest);
    result
}
/// The kind of random characters a template placeholder expands to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PlaceholderKind {
    /// `{N digits}`: expands to characters from `0`-`9`.
    Digits,
    /// `{N alpha}`: expands to uppercase letters `A`-`Z`.
    Alpha,
}
/// Parses the text between the braces of a template placeholder.
///
/// Accepts `<count> digits` or `<count> alpha`, ignoring whitespace around the
/// whole text and around the count. Returns the count together with the
/// placeholder kind, or `None` when the suffix is neither keyword or the count
/// is not a valid `usize`.
fn parse_placeholder(inner: &str) -> Option<(usize, PlaceholderKind)> {
    let body = inner.trim();
    // "digits" is tried first; a body ending in it never falls back to "alpha".
    let (count_text, kind) = match body.strip_suffix("digits") {
        Some(prefix) => (prefix, PlaceholderKind::Digits),
        None => (body.strip_suffix("alpha")?, PlaceholderKind::Alpha),
    };
    count_text
        .trim()
        .parse::<usize>()
        .ok()
        .map(|count| (count, kind))
}
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;

    /// Builds a record with the given reference number and source; every other
    /// field carries a fixed filler value.
    fn record(je_number: &str, source: &str) -> Record {
        let day = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap();
        Record {
            source: source.to_string(),
            gl_account: "1000".to_string(),
            cost_center: None,
            profit_center: None,
            trading_partner: None,
            je_number: je_number.to_string(),
            je_line_number: "1".to_string(),
            effective_date: day,
            entry_date: day,
            created_at: None,
            functional_amount: 100.0,
            header_text: String::new(),
            line_text: String::new(),
        }
    }

    #[test]
    fn tokenize_reference_handles_alphanumeric() {
        let cases = [
            ("RE-2024-000123", "{2 alpha}-{4 digits}-{6 digits}"),
            ("DOC100", "{3 alpha}{3 digits}"),
            ("", ""),
        ];
        for (input, expected) in cases {
            assert_eq!(tokenize_reference(input), expected);
        }
    }

    #[test]
    fn tokenize_reference_real_corpus_format() {
        let template = tokenize_reference("2022-0090-0950645487");
        assert_eq!(template, "{4 digits}-{4 digits}-{10 digits}");
    }

    #[test]
    fn tokenize_reference_preserves_fixed_chars() {
        let cases = [
            ("GR/2024/001", "{2 alpha}/{4 digits}/{3 digits}"),
            ("INV.2024.001", "{3 alpha}.{4 digits}.{3 digits}"),
        ];
        for (input, expected) in cases {
            assert_eq!(tokenize_reference(input), expected);
        }
    }

    #[test]
    fn fill_template_round_trips_format() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let filled = fill_template("{2 alpha}-{4 digits}-{6 digits}", &mut rng);
        assert_eq!(filled.len(), 14, "got: {filled}");

        let parts: Vec<&str> = filled.split('-').collect();
        assert_eq!(parts.len(), 3, "expected 3 parts, got: {filled}");

        // Segment widths.
        assert_eq!(parts[0].len(), 2, "first part should be 2 alpha: {filled}");
        assert_eq!(parts[1].len(), 4, "second part should be 4 digits: {filled}");
        assert_eq!(parts[2].len(), 6, "third part should be 6 digits: {filled}");

        // Segment character classes.
        assert!(
            parts[0].chars().all(|c| c.is_ascii_uppercase()),
            "got: {filled}"
        );
        for numeric in &parts[1..] {
            assert!(
                numeric.chars().all(|c| c.is_ascii_digit()),
                "got: {filled}"
            );
        }
    }

    #[test]
    fn fill_template_digits_only() {
        let mut rng = ChaCha8Rng::seed_from_u64(99);
        let filled = fill_template("{4 digits}-{4 digits}-{10 digits}", &mut rng);
        assert_eq!(filled.len(), 20, "got: {filled}");

        let parts: Vec<&str> = filled.split('-').collect();
        let widths: Vec<usize> = parts.iter().map(|part| part.len()).collect();
        assert_eq!(widths, [4, 4, 10]);
        assert!(
            parts[0].chars().all(|c| c.is_ascii_digit()),
            "got: {filled}"
        );
    }

    #[test]
    fn extract_reference_formats_filters_low_frequency() {
        // 15 references share one format, 3 share another; threshold is 10.
        let frequent = (0..15).map(|_| record("RE-2024-000001", "KR"));
        let rare = (0..3).map(|_| record("ABC-99", "KR"));
        let records: Vec<Record> = frequent.chain(rare).collect();

        let prior = extract_reference_formats(&records, 10);

        let kr_templates = prior.by_source.get("KR").expect("KR should be present");
        assert_eq!(kr_templates.len(), 1, "got templates: {kr_templates:?}");
        let only = &kr_templates[0];
        assert_eq!(only.template, "{2 alpha}-{4 digits}-{6 digits}");
        assert!((only.probability - 1.0).abs() < 1e-9);
    }

    #[test]
    fn extract_reference_formats_top_n_capped() {
        // Eleven distinct formats (alpha prefix of length 1..=11), each seen
        // 20 times, so all of them pass the threshold.
        let records: Vec<Record> = (1..=11usize)
            .flat_map(|prefix_len| {
                let prefix = "A".repeat(prefix_len);
                (0..20usize).map(move |idx| record(&format!("{prefix}-{idx:06}"), "SA"))
            })
            .collect();

        let prior = extract_reference_formats(&records, 10);

        let sa_templates = prior.by_source.get("SA").expect("SA should be present");
        assert!(
            sa_templates.len() <= MAX_TEMPLATES_PER_SOURCE,
            "expected ≤{MAX_TEMPLATES_PER_SOURCE}, got {}",
            sa_templates.len()
        );
    }

    #[test]
    fn extract_reference_formats_empty_records() {
        let no_records: &[Record] = &[];
        let prior = extract_reference_formats(no_records, 10);
        assert!(prior.by_source.is_empty());
    }
}