datasynth-core 5.35.2

Core domain models, traits, and distributions for synthetic enterprise data generation
Documentation
//! Structural journal-entry fingerprints + heuristic flags.
//!
//! A cheap analytics layer over the generated ledger:
//!
//! * **L1 fingerprint** — a stable hash of a JE's structure (the sorted list of
//!   `Dr:<gl_account>` / `Cr:<gl_account>` tokens, one per line; repeated lines
//!   are kept, not de-duplicated). JEs with the same account structure share an
//!   L1 fingerprint, so recurring standard postings cluster.
//! * **L2 fingerprint** — the same, coarsened to the account *prefix* (the
//!   first two characters of the account number, a subclass proxy), so
//!   structurally-similar postings cluster more broadly.
//! * **Heuristic flags** — balance-sheet-only / income-statement-only, classified
//!   by account-number prefix (1–3 = balance sheet, 4–9 = income statement).
//!
//! Generic by design — no tool/vendor/form-specific naming.

use std::collections::BTreeMap;

use serde::{Deserialize, Serialize};

use super::JournalEntry;

/// 64-bit FNV-1a hash of the UTF-8 bytes of `s`.
///
/// Uses only fixed constants and wrapping integer arithmetic, so the same
/// input always yields the same value regardless of run or build.
fn fnv1a(s: &str) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0000_0100_0000_01b3;

    // FNV-1a order: XOR the byte in first, then multiply by the prime.
    s.bytes().fold(OFFSET_BASIS, |acc, byte| {
        (acc ^ u64::from(byte)).wrapping_mul(PRIME)
    })
}

/// Classify an account by the first character of its account number.
///
/// Returns `Some(true)` for a leading `1`, `2` or `3` (treated as balance
/// sheet), `Some(false)` for a leading `4` through `9` (treated as income
/// statement), and `None` for an empty string or any other leading character.
fn is_balance_sheet(account: &str) -> Option<bool> {
    let leading = account.chars().next()?;
    match leading {
        '1'..='3' => Some(true),  // assets / liabilities / equity
        '4'..='9' => Some(false), // revenue / expense
        _ => None,
    }
}

/// One distinct L1 fingerprint and how often it occurs.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FingerprintListEntry {
    /// L1 fingerprint as text; [`StructuralFingerprintReport::from_entries`]
    /// renders it as 16 zero-padded lowercase hex digits.
    pub l1_fingerprint: String,
    /// Human-readable structure: `Dr:`/`Cr:`-tagged account tokens in sorted
    /// order, joined by `|`, e.g. `"Cr:2000|Dr:1100"`.
    pub structure: String,
    /// Number of journal entries that share this fingerprint.
    pub je_count: usize,
}

/// Per-JE fingerprint + flags (kept for the per-JE CSV; not serialized in the
/// summary report to keep it bounded).
#[derive(Debug, Clone, PartialEq)]
pub struct JeFingerprint {
    /// Journal-entry identifier; [`StructuralFingerprintReport::from_entries`]
    /// fills it with the header's `document_id` rendered as a string.
    pub je_id: String,
    /// Hash of the sorted `Dr:`/`Cr:` tokens built from full account numbers.
    pub l1_fingerprint: u64,
    /// Hash of the sorted `Dr:`/`Cr:` tokens built from the first two
    /// characters of each account number.
    pub l2_fingerprint: u64,
    /// True when at least one line classifies as balance sheet and no line
    /// classifies as income statement.
    pub bs_only: bool,
    /// True when at least one line classifies as income statement and no line
    /// classifies as balance sheet.
    pub is_only: bool,
    /// Number of lines in the journal entry.
    pub line_count: usize,
}

/// Structural-fingerprint summary report for a generated ledger.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StructuralFingerprintReport {
    /// Total number of journal entries analysed.
    pub je_count: usize,
    /// Number of distinct L1 fingerprints (counted before the top list is capped).
    pub l1_distinct: usize,
    /// Number of distinct L2 fingerprints.
    pub l2_distinct: usize,
    /// Entries touching balance-sheet accounts and no income-statement account.
    pub bs_only_count: usize,
    /// Entries touching income-statement accounts and no balance-sheet account.
    pub is_only_count: usize,
    /// Every remaining entry: those touching both account classes, and also
    /// those with no classifiable account at all (e.g. no lines).
    pub mixed_count: usize,
    /// Most frequent L1 fingerprints (capped).
    pub top_l1_fingerprints: Vec<FingerprintListEntry>,
    /// Per-JE fingerprints are not serialized here; use [`Self::per_je_csv`].
    // Because of `serde(skip)`, a deserialized report has an empty `per_je`.
    #[serde(skip)]
    pub per_je: Vec<JeFingerprint>,
}

impl StructuralFingerprintReport {
    /// Suggested value for the `list_cap` argument of [`Self::from_entries`].
    pub const DEFAULT_LIST_CAP: usize = 500;

    /// Build the structural-fingerprint report from journal entries.
    ///
    /// Each line of an entry becomes a `Dr:<account>` or `Cr:<account>` token
    /// (L1) and a `Dr:<prefix>` / `Cr:<prefix>` token (L2, first two characters
    /// of the account). A line is tagged `Dr` only when its debit amount is
    /// strictly greater than its credit amount; every other line is tagged
    /// `Cr`. The tokens are sorted, joined with `|`, and hashed with FNV-1a.
    ///
    /// # Arguments
    ///
    /// * `entries` — the journal entries to analyse.
    /// * `list_cap` — maximum number of entries kept in
    ///   `top_l1_fingerprints`; `l1_distinct` still counts all of them.
    ///
    /// # Returns
    ///
    /// The summary report, with one [`JeFingerprint`] per input entry in
    /// `per_je` (same order as `entries`). `bs_only_count`, `is_only_count`
    /// and `mixed_count` always sum to `je_count`; entries with no
    /// classifiable account fall into `mixed_count`.
    pub fn from_entries(entries: &[JournalEntry], list_cap: usize) -> Self {
        let mut per_je = Vec::with_capacity(entries.len());
        // l1_fingerprint -> (structure_string, count)
        let mut l1_listing: BTreeMap<u64, (String, usize)> = BTreeMap::new();
        let mut l2_set = std::collections::BTreeSet::new();
        let mut bs_only_count = 0usize;
        let mut is_only_count = 0usize;
        let mut mixed_count = 0usize;

        for je in entries {
            // Build the sorted L1 / L2 structure tokens.
            let mut l1_parts: Vec<String> = Vec::with_capacity(je.lines.len());
            let mut l2_parts: Vec<String> = Vec::with_capacity(je.lines.len());
            let mut any_bs = false;
            let mut any_is = false;
            for line in &je.lines {
                let dc = if line.debit_amount > line.credit_amount {
                    "Dr"
                } else {
                    "Cr"
                };
                l1_parts.push(format!("{dc}:{}", line.gl_account));
                let prefix: String = line.gl_account.chars().take(2).collect();
                l2_parts.push(format!("{dc}:{prefix}"));
                match is_balance_sheet(&line.gl_account) {
                    Some(true) => any_bs = true,
                    Some(false) => any_is = true,
                    None => {}
                }
            }
            // Equal strings are indistinguishable, so an unstable sort yields
            // the same order as a stable one without the extra allocation.
            l1_parts.sort_unstable();
            l2_parts.sort_unstable();
            let l1_struct = l1_parts.join("|");
            let l2_struct = l2_parts.join("|");
            let l1 = fnv1a(&l1_struct);
            let l2 = fnv1a(&l2_struct);

            let bs_only = any_bs && !any_is;
            let is_only = any_is && !any_bs;
            if bs_only {
                bs_only_count += 1;
            } else if is_only {
                is_only_count += 1;
            } else {
                // Both classes present, or nothing classifiable at all.
                mixed_count += 1;
            }

            // `l1_struct` is not needed past this point, so it is moved into
            // the map on first sight instead of being cloned.
            l1_listing
                .entry(l1)
                .or_insert_with(|| (l1_struct, 0))
                .1 += 1;
            l2_set.insert(l2);

            per_je.push(JeFingerprint {
                je_id: je.header.document_id.to_string(),
                l1_fingerprint: l1,
                l2_fingerprint: l2,
                bs_only,
                is_only,
                line_count: je.lines.len(),
            });
        }

        let l1_distinct = l1_listing.len();
        let mut top: Vec<FingerprintListEntry> = l1_listing
            .into_iter()
            .map(|(fp, (structure, count))| FingerprintListEntry {
                l1_fingerprint: format!("{fp:016x}"),
                structure,
                je_count: count,
            })
            .collect();
        // Most frequent first; ties broken by structure text so the order is
        // deterministic. The string comparison only runs when counts tie.
        top.sort_by(|a, b| {
            b.je_count
                .cmp(&a.je_count)
                .then_with(|| a.structure.cmp(&b.structure))
        });
        top.truncate(list_cap);

        Self {
            je_count: entries.len(),
            l1_distinct,
            l2_distinct: l2_set.len(),
            bs_only_count,
            is_only_count,
            mixed_count,
            top_l1_fingerprints: top,
            per_je,
        }
    }

    /// Per-JE fingerprint CSV.
    ///
    /// Emits a header row followed by one row per element of `per_je`, each
    /// terminated by `\n`. Fingerprints are written as 16 zero-padded
    /// lowercase hex digits. `je_id` is quoted (with embedded quotes doubled)
    /// only when it contains a comma, double quote, or line break, so ordinary
    /// identifiers are written verbatim.
    pub fn per_je_csv(&self) -> String {
        // Quote a field only when leaving it bare would break the row.
        fn csv_field(raw: &str) -> std::borrow::Cow<'_, str> {
            if raw.contains(|c: char| matches!(c, ',' | '"' | '\n' | '\r')) {
                std::borrow::Cow::Owned(format!("\"{}\"", raw.replace('"', "\"\"")))
            } else {
                std::borrow::Cow::Borrowed(raw)
            }
        }

        let mut s =
            String::from("je_id,l1_fingerprint,l2_fingerprint,bs_only,is_only,line_count\n");
        for r in &self.per_je {
            s.push_str(&format!(
                "{},{:016x},{:016x},{},{},{}\n",
                csv_field(&r.je_id),
                r.l1_fingerprint,
                r.l2_fingerprint,
                r.bs_only,
                r.is_only,
                r.line_count
            ));
        }
        s
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::journal_entry::{JournalEntry, JournalEntryHeader, JournalEntryLine};
    use chrono::NaiveDate;
    use rust_decimal::Decimal;

    /// Builds a journal entry from `(account, debit, credit)` triples.
    ///
    /// A triple with a non-zero debit becomes a debit line for that amount;
    /// any other triple becomes a credit line for its credit amount.
    fn je(lines: Vec<(&str, i64, i64)>) -> JournalEntry {
        let header = JournalEntryHeader::new(
            "1000".to_string(),
            NaiveDate::from_ymd_opt(2026, 3, 15).unwrap(),
        );
        let mut entry = JournalEntry::new(header);
        for (idx, (account, debit, credit)) in lines.into_iter().enumerate() {
            let line_no = (idx + 1) as u32;
            let doc_id = entry.header.document_id;
            let line = if debit == 0 {
                JournalEntryLine::credit(
                    doc_id,
                    line_no,
                    account.to_string(),
                    Decimal::from(credit),
                )
            } else {
                JournalEntryLine::debit(
                    doc_id,
                    line_no,
                    account.to_string(),
                    Decimal::from(debit),
                )
            };
            entry.add_line(line);
        }
        entry
    }

    #[test]
    fn same_structure_shares_l1_fingerprint() {
        // Same accounts and sides, different amounts: the L1 fingerprint must match.
        let recurring_a = je(vec![("1100", 1000, 0), ("4000", 0, 1000)]);
        let recurring_b = je(vec![("1100", 7, 0), ("4000", 0, 7)]);
        // Different accounts: the L1 fingerprint must differ.
        let other = je(vec![("1200", 500, 0), ("2000", 0, 500)]);

        let entries = [recurring_a, recurring_b, other];
        let report = StructuralFingerprintReport::from_entries(&entries, 500);

        assert_eq!(report.je_count, 3);
        assert_eq!(report.l1_distinct, 2);

        let fingerprints: Vec<u64> = report.per_je.iter().map(|f| f.l1_fingerprint).collect();
        assert_eq!(fingerprints[0], fingerprints[1]);
        assert_ne!(fingerprints[0], fingerprints[2]);

        // The recurring structure leads the frequency-ordered list.
        assert_eq!(report.top_l1_fingerprints[0].je_count, 2);
    }

    #[test]
    fn bs_and_is_flags() {
        let balance_sheet_only = je(vec![("1100", 1000, 0), ("2000", 0, 1000)]);
        let income_statement_only = je(vec![("4000", 0, 500), ("5000", 500, 0)]);
        let mixed = je(vec![("1100", 1000, 0), ("4000", 0, 1000)]);

        let entries = [balance_sheet_only, income_statement_only, mixed];
        let report = StructuralFingerprintReport::from_entries(&entries, 500);

        assert_eq!(report.bs_only_count, 1);
        assert_eq!(report.is_only_count, 1);
        assert_eq!(report.mixed_count, 1);
    }

    #[test]
    fn deterministic() {
        // Two runs over the same input must agree on both outputs.
        let entries = [je(vec![("1100", 1000, 0), ("4000", 0, 1000)])];
        let first = StructuralFingerprintReport::from_entries(&entries, 500);
        let second = StructuralFingerprintReport::from_entries(&entries, 500);

        assert_eq!(first.per_je_csv(), second.per_je_csv());
        assert_eq!(first.top_l1_fingerprints, second.top_l1_fingerprints);
    }
}