Skip to main content

datasynth_core/models/
structural_fingerprint.rs

1//! Structural journal-entry fingerprints + heuristic flags.
2//!
3//! A cheap analytics layer over the generated ledger:
4//!
5//! * **L1 fingerprint** — a stable hash of a JE's structure (the sorted set of
6//!   `(gl_account, debit/credit)` tuples). JEs with the same account structure
7//!   share an L1 fingerprint, so recurring standard postings cluster.
8//! * **L2 fingerprint** — the same, coarsened to the account *prefix* (a
9//!   subclass proxy), so structurally-similar postings cluster more broadly.
10//! * **Heuristic flags** — balance-sheet-only / income-statement-only, classified
11//!   by account-number prefix (1–3 = balance sheet, 4–9 = income statement).
12//!
13//! Generic by design — no tool/vendor/form-specific naming.
14
15use std::collections::BTreeMap;
16
17use serde::{Deserialize, Serialize};
18
19use super::JournalEntry;
20
/// Hashes `s` with the 64-bit FNV-1a function, one byte at a time.
///
/// Only fixed constants and the UTF-8 bytes of `s` feed the result, so the
/// same input always produces the same value.
fn fnv1a(s: &str) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0000_0100_0000_01b3;

    // FNV-1a order: xor the byte in first, then multiply (wrapping on overflow).
    s.bytes().fold(OFFSET_BASIS, |acc, byte| {
        (acc ^ u64::from(byte)).wrapping_mul(PRIME)
    })
}
30
/// Classifies an account by the first character of its number (SAP-style
/// numbering).
///
/// Returns `Some(true)` for a leading `1`–`3` (assets / liabilities / equity),
/// `Some(false)` for a leading `4`–`9` (revenue / expense), and `None` for an
/// empty string or any other leading character, including `0`.
fn is_balance_sheet(account: &str) -> Option<bool> {
    // An empty account number has no class.
    let lead = account.chars().next()?;

    if ('1'..='3').contains(&lead) {
        Some(true)
    } else if ('4'..='9').contains(&lead) {
        Some(false)
    } else {
        None
    }
}
39
40/// One distinct L1 fingerprint and how often it occurs.
41#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
42pub struct FingerprintListEntry {
43    pub l1_fingerprint: String,
44    /// Human-readable structure, e.g. `"Dr:1100|Cr:2000"`.
45    pub structure: String,
46    pub je_count: usize,
47}
48
/// Per-JE fingerprint + flags (kept for the per-JE CSV; not serialized in the
/// summary report to keep it bounded).
///
/// Every field has total equality and is hashable, so `Eq` and `Hash` are
/// derived as well; records can be deduplicated or used as map/set keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct JeFingerprint {
    /// Identifier of the journal entry this record describes.
    pub je_id: String,
    /// 64-bit hash of the entry's exact account structure.
    pub l1_fingerprint: u64,
    /// 64-bit hash of the entry's structure coarsened to account prefixes.
    pub l2_fingerprint: u64,
    /// `true` when the entry touches balance-sheet accounts and no
    /// income-statement account.
    pub bs_only: bool,
    /// `true` when the entry touches income-statement accounts and no
    /// balance-sheet account.
    pub is_only: bool,
    /// Number of lines in the journal entry.
    pub line_count: usize,
}
60
61/// Structural-fingerprint summary report for a generated ledger.
62#[derive(Debug, Clone, Serialize, Deserialize)]
63pub struct StructuralFingerprintReport {
64    pub je_count: usize,
65    pub l1_distinct: usize,
66    pub l2_distinct: usize,
67    pub bs_only_count: usize,
68    pub is_only_count: usize,
69    pub mixed_count: usize,
70    /// Most frequent L1 fingerprints (capped).
71    pub top_l1_fingerprints: Vec<FingerprintListEntry>,
72    /// Per-JE fingerprints are not serialized here; use [`Self::per_je_csv`].
73    #[serde(skip)]
74    pub per_je: Vec<JeFingerprint>,
75}
76
77impl StructuralFingerprintReport {
78    pub const DEFAULT_LIST_CAP: usize = 500;
79
80    /// Build the structural-fingerprint report from journal entries.
81    pub fn from_entries(entries: &[JournalEntry], list_cap: usize) -> Self {
82        let mut per_je = Vec::with_capacity(entries.len());
83        // l1_fingerprint -> (structure_string, count)
84        let mut l1_listing: BTreeMap<u64, (String, usize)> = BTreeMap::new();
85        let mut l2_set = std::collections::BTreeSet::new();
86        let mut bs_only_count = 0usize;
87        let mut is_only_count = 0usize;
88        let mut mixed_count = 0usize;
89
90        for je in entries {
91            // Build the sorted L1 / L2 structure tuples.
92            let mut l1_parts: Vec<String> = Vec::with_capacity(je.lines.len());
93            let mut l2_parts: Vec<String> = Vec::with_capacity(je.lines.len());
94            let mut any_bs = false;
95            let mut any_is = false;
96            for line in &je.lines {
97                let dc = if line.debit_amount > line.credit_amount {
98                    "Dr"
99                } else {
100                    "Cr"
101                };
102                l1_parts.push(format!("{dc}:{}", line.gl_account));
103                let prefix: String = line.gl_account.chars().take(2).collect();
104                l2_parts.push(format!("{dc}:{prefix}"));
105                match is_balance_sheet(&line.gl_account) {
106                    Some(true) => any_bs = true,
107                    Some(false) => any_is = true,
108                    None => {}
109                }
110            }
111            l1_parts.sort();
112            l2_parts.sort();
113            let l1_struct = l1_parts.join("|");
114            let l2_struct = l2_parts.join("|");
115            let l1 = fnv1a(&l1_struct);
116            let l2 = fnv1a(&l2_struct);
117
118            let bs_only = any_bs && !any_is;
119            let is_only = any_is && !any_bs;
120            if bs_only {
121                bs_only_count += 1;
122            } else if is_only {
123                is_only_count += 1;
124            } else {
125                mixed_count += 1;
126            }
127
128            l1_listing
129                .entry(l1)
130                .or_insert_with(|| (l1_struct.clone(), 0))
131                .1 += 1;
132            l2_set.insert(l2);
133
134            per_je.push(JeFingerprint {
135                je_id: je.header.document_id.to_string(),
136                l1_fingerprint: l1,
137                l2_fingerprint: l2,
138                bs_only,
139                is_only,
140                line_count: je.lines.len(),
141            });
142        }
143
144        let l1_distinct = l1_listing.len();
145        let mut top: Vec<FingerprintListEntry> = l1_listing
146            .into_iter()
147            .map(|(fp, (structure, count))| FingerprintListEntry {
148                l1_fingerprint: format!("{fp:016x}"),
149                structure,
150                je_count: count,
151            })
152            .collect();
153        top.sort_by(|a, b| {
154            b.je_count
155                .cmp(&a.je_count)
156                .then(a.structure.cmp(&b.structure))
157        });
158        top.truncate(list_cap);
159
160        Self {
161            je_count: entries.len(),
162            l1_distinct,
163            l2_distinct: l2_set.len(),
164            bs_only_count,
165            is_only_count,
166            mixed_count,
167            top_l1_fingerprints: top,
168            per_je,
169        }
170    }
171
172    /// Per-JE fingerprint CSV.
173    pub fn per_je_csv(&self) -> String {
174        let mut s =
175            String::from("je_id,l1_fingerprint,l2_fingerprint,bs_only,is_only,line_count\n");
176        for r in &self.per_je {
177            s.push_str(&format!(
178                "{},{:016x},{:016x},{},{},{}\n",
179                r.je_id, r.l1_fingerprint, r.l2_fingerprint, r.bs_only, r.is_only, r.line_count
180            ));
181        }
182        s
183    }
184}
185
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::journal_entry::{JournalEntry, JournalEntryHeader, JournalEntryLine};
    use chrono::NaiveDate;
    use rust_decimal::Decimal;

    /// Builds a journal entry from `(account, debit, credit)` triples; a
    /// non-zero debit makes a debit line, otherwise a credit line is created.
    fn je(lines: Vec<(&str, i64, i64)>) -> JournalEntry {
        let mut e = JournalEntry::new(JournalEntryHeader::new(
            "1000".to_string(),
            NaiveDate::from_ymd_opt(2026, 3, 15).unwrap(),
        ));
        for (i, (acct, dr, cr)) in lines.into_iter().enumerate() {
            let ln = if dr != 0 {
                JournalEntryLine::debit(
                    e.header.document_id,
                    (i + 1) as u32,
                    acct.to_string(),
                    Decimal::from(dr),
                )
            } else {
                JournalEntryLine::credit(
                    e.header.document_id,
                    (i + 1) as u32,
                    acct.to_string(),
                    Decimal::from(cr),
                )
            };
            e.add_line(ln);
        }
        e
    }

    #[test]
    fn same_structure_shares_l1_fingerprint() {
        // Two JEs with the same account structure (different amounts) → same L1.
        let entries = vec![
            je(vec![("1100", 1000, 0), ("4000", 0, 1000)]),
            je(vec![("1100", 7, 0), ("4000", 0, 7)]),
            je(vec![("1200", 500, 0), ("2000", 0, 500)]), // different structure
        ];
        let r = StructuralFingerprintReport::from_entries(&entries, 500);
        assert_eq!(r.je_count, 3);
        assert_eq!(r.l1_distinct, 2);
        assert_eq!(r.per_je[0].l1_fingerprint, r.per_je[1].l1_fingerprint);
        assert_ne!(r.per_je[0].l1_fingerprint, r.per_je[2].l1_fingerprint);
        // The recurring structure is the most frequent.
        assert_eq!(r.top_l1_fingerprints[0].je_count, 2);
    }

    #[test]
    fn l2_coarsens_to_account_prefix() {
        // Different accounts that share two-character prefixes: distinct L1,
        // identical L2.
        let entries = vec![
            je(vec![("1100", 100, 0), ("4000", 0, 100)]),
            je(vec![("1150", 100, 0), ("4010", 0, 100)]),
        ];
        let r = StructuralFingerprintReport::from_entries(&entries, 500);
        assert_eq!(r.l1_distinct, 2);
        assert_eq!(r.l2_distinct, 1);
        assert_ne!(r.per_je[0].l1_fingerprint, r.per_je[1].l1_fingerprint);
        assert_eq!(r.per_je[0].l2_fingerprint, r.per_je[1].l2_fingerprint);
    }

    #[test]
    fn bs_and_is_flags() {
        let entries = vec![
            je(vec![("1100", 1000, 0), ("2000", 0, 1000)]), // both BS -> bs_only
            je(vec![("4000", 0, 500), ("5000", 500, 0)]),   // both IS -> is_only
            je(vec![("1100", 1000, 0), ("4000", 0, 1000)]), // mixed
        ];
        let r = StructuralFingerprintReport::from_entries(&entries, 500);
        assert_eq!(r.bs_only_count, 1);
        assert_eq!(r.is_only_count, 1);
        assert_eq!(r.mixed_count, 1);
    }

    #[test]
    fn unclassifiable_entries_count_as_mixed() {
        // Accounts starting with `0` belong to neither class; such an entry is
        // neither bs_only nor is_only and lands in `mixed_count`.
        let entries = vec![je(vec![("0100", 10, 0), ("0200", 0, 10)])];
        let r = StructuralFingerprintReport::from_entries(&entries, 500);
        assert_eq!(r.bs_only_count, 0);
        assert_eq!(r.is_only_count, 0);
        assert_eq!(r.mixed_count, 1);
        assert!(!r.per_je[0].bs_only);
        assert!(!r.per_je[0].is_only);
    }

    #[test]
    fn list_cap_truncates_listing_but_not_distinct_count() {
        let entries = vec![
            je(vec![("1100", 1, 0), ("4000", 0, 1)]),
            je(vec![("1200", 1, 0), ("2000", 0, 1)]),
            je(vec![("1300", 1, 0), ("5000", 0, 1)]),
        ];
        let r = StructuralFingerprintReport::from_entries(&entries, 2);
        assert_eq!(r.l1_distinct, 3);
        assert_eq!(r.top_l1_fingerprints.len(), 2);
        // All counts tie at 1, so the listing is ordered by structure.
        assert!(r.top_l1_fingerprints[0].structure <= r.top_l1_fingerprints[1].structure);
    }

    #[test]
    fn empty_input_yields_empty_report() {
        let r = StructuralFingerprintReport::from_entries(&[], 500);
        assert_eq!(r.je_count, 0);
        assert_eq!(r.l1_distinct, 0);
        assert_eq!(r.l2_distinct, 0);
        assert_eq!(r.bs_only_count + r.is_only_count + r.mixed_count, 0);
        assert!(r.top_l1_fingerprints.is_empty());
        assert_eq!(
            r.per_je_csv(),
            "je_id,l1_fingerprint,l2_fingerprint,bs_only,is_only,line_count\n"
        );
    }

    #[test]
    fn csv_has_header_and_one_row_per_je() {
        let entries = vec![
            je(vec![("1100", 1000, 0), ("2000", 0, 1000)]),
            je(vec![("4000", 0, 500), ("5000", 500, 0)]),
        ];
        let r = StructuralFingerprintReport::from_entries(&entries, 500);
        let csv = r.per_je_csv();
        let mut lines = csv.lines();
        assert_eq!(
            lines.next(),
            Some("je_id,l1_fingerprint,l2_fingerprint,bs_only,is_only,line_count")
        );
        let rows: Vec<&str> = lines.collect();
        assert_eq!(rows.len(), entries.len());
        assert!(rows[0].contains(",true,false,"));
        assert!(rows[1].contains(",false,true,"));
    }

    #[test]
    fn deterministic() {
        let entries = vec![je(vec![("1100", 1000, 0), ("4000", 0, 1000)])];
        let a = StructuralFingerprintReport::from_entries(&entries, 500);
        let b = StructuralFingerprintReport::from_entries(&entries, 500);
        assert_eq!(a.per_je_csv(), b.per_je_csv());
        assert_eq!(a.top_l1_fingerprints, b.top_l1_fingerprints);
    }
}