// datasynth-core 5.35.2
//
// Core domain models, traits, and distributions for synthetic enterprise data generation
// Documentation
//! Dimensional (star-schema) export — a surrogate-key-normalized view of the
//! general ledger for loading into common GL-analytics platforms.
//!
//! Produces a **fact table** of journal-entry lines, where every dimension is an
//! integer surrogate key, plus a **lookup table per dimension** (key → value) and
//! a **chart-of-accounts map** (account key → account + description). Keys are
//! assigned deterministically (sorted distinct value order) so the same ledger
//! always yields the same encoding.
//!
//! Generic by design — no tool/vendor/form-specific naming.

use std::collections::{BTreeMap, BTreeSet};

use super::JournalEntry;

/// A single fact row: one journal-entry line with its dimension keys.
///
/// Every `*_key` field is an integer surrogate key that refers to a row of
/// the matching dimension table in [`DimensionalExport`].
#[derive(Debug, Clone, PartialEq)]
pub struct FactRow {
    /// The owning journal entry's `document_id`, rendered as a string.
    pub je_id: String,
    /// Line number copied from the journal-entry line.
    pub line_number: u32,
    /// Signed amount (debit positive, credit negative), as a string.
    pub amount: String,
    /// Debit/credit indicator: 1 = Debit, 2 = Credit.
    pub drcr_key: u32,
    /// Key of the line's GL account in the chart-of-accounts dimension.
    pub account_key: u32,
    /// Key of the entry's source value in the source dimension.
    pub source_key: u32,
    /// Key of the entry's `created_by` value in the preparer dimension.
    pub preparer_key: u32,
    /// Key of the entry's company code in the company dimension.
    pub company_key: u32,
    /// Key of the entry's currency in the currency dimension.
    pub currency_key: u32,
    /// Key of the entry's document type in the document-type dimension.
    pub doc_type_key: u32,
    /// Key of the entry's posting date (as a string) in the date dimension.
    pub date_key: u32,
}

/// A chart-of-accounts dimension row.
///
/// Maps one account surrogate key to the GL account it stands for and that
/// account's description.
#[derive(Debug, Clone, PartialEq)]
pub struct AccountDimRow {
    /// Surrogate key; the value stored in [`FactRow::account_key`].
    pub key: u32,
    /// The GL account identifier as it appears on journal-entry lines.
    pub gl_account: String,
    /// Description of the account; empty when the line it was taken from
    /// carried no description.
    pub account_description: String,
}

/// The full dimensional export: fact table + dimension lookups + COA map.
///
/// Each simple dimension is a list of `(key, value)` pairs; the keys are the
/// surrogate keys referenced from the [`FactRow`] columns of the same name.
#[derive(Debug, Clone)]
pub struct DimensionalExport {
    /// Fact table: one row per journal-entry line.
    pub fact: Vec<FactRow>,
    /// Debit/Credit dimension (fixed: 1 = Debit, 2 = Credit).
    pub drcr: Vec<(u32, String)>,
    /// Chart-of-accounts dimension (account key → account + description).
    pub accounts: Vec<AccountDimRow>,
    /// Source dimension (SAP source code when present, else the
    /// transaction-source kind).
    pub sources: Vec<(u32, String)>,
    /// Preparer dimension (the header's `created_by` values).
    pub preparers: Vec<(u32, String)>,
    /// Company dimension (the header's company codes).
    pub companies: Vec<(u32, String)>,
    /// Currency dimension (the header's currencies).
    pub currencies: Vec<(u32, String)>,
    /// Document-type dimension (the header's document types).
    pub doc_types: Vec<(u32, String)>,
    /// Date dimension (the header's posting dates, rendered as strings).
    pub dates: Vec<(u32, String)>,
}

/// Assign a deterministic 1-based surrogate key to every distinct value.
///
/// The set iterates in sorted order, so the smallest value receives key 1,
/// the next key 2, and so on. Returns both directions of the mapping:
/// a `value → key` lookup map and the `(key, value)` rows in key order.
fn intern(values: BTreeSet<String>) -> (BTreeMap<String, u32>, Vec<(u32, String)>) {
    // Number the values in their sorted iteration order, starting at 1.
    let rows: Vec<(u32, String)> = values
        .into_iter()
        .enumerate()
        .map(|(position, value)| ((position + 1) as u32, value))
        .collect();

    // Derive the reverse lookup from the finished rows.
    let lookup: BTreeMap<String, u32> = rows
        .iter()
        .map(|(key, value)| (value.clone(), *key))
        .collect();

    (lookup, rows)
}

/// Render `s` as a single CSV field.
///
/// A value containing a comma, a double quote, a line feed or a carriage
/// return is wrapped in double quotes, with every embedded double quote
/// doubled. Any other value is returned unchanged.
fn csv_escape(s: &str) -> String {
    let needs_quoting = s.chars().any(|c| matches!(c, ',' | '"' | '\n' | '\r'));
    if !needs_quoting {
        return s.to_owned();
    }

    // Two extra bytes for the surrounding quotes; doubled quotes grow it further.
    let mut quoted = String::with_capacity(s.len() + 2);
    quoted.push('"');
    for ch in s.chars() {
        if ch == '"' {
            quoted.push('"');
        }
        quoted.push(ch);
    }
    quoted.push('"');
    quoted
}

impl DimensionalExport {
    /// The source value used for the source dimension: the SAP source code when
    /// present, else the `Debug` rendering of the transaction-source kind.
    fn source_value(je: &JournalEntry) -> String {
        je.header
            .sap_source_code
            .clone()
            .unwrap_or_else(|| format!("{:?}", je.header.source))
    }

    /// Build the dimensional export from journal entries.
    ///
    /// Works in two passes over `entries`:
    ///
    /// 1. Collect the distinct values of every dimension and assign each a
    ///    1-based key in sorted value order, so the same input always yields
    ///    the same encoding.
    /// 2. Emit one [`FactRow`] per journal-entry line, replacing each
    ///    dimension value with its key.
    ///
    /// An account's description is taken from the first line (in input order)
    /// that mentions the account; a missing description becomes an empty
    /// string.
    ///
    /// A line is classified as a debit (key 1, amount = debit amount) when its
    /// debit amount is strictly greater than its credit amount; otherwise it
    /// is a credit (key 2, amount = negated credit amount).
    pub fn from_entries(entries: &[JournalEntry]) -> Self {
        // Pass 1: collect distinct dimension values.
        let mut accounts: BTreeMap<String, String> = BTreeMap::new(); // gl_account -> description
        let mut sources = BTreeSet::new();
        let mut preparers = BTreeSet::new();
        let mut companies = BTreeSet::new();
        let mut currencies = BTreeSet::new();
        let mut doc_types = BTreeSet::new();
        let mut dates = BTreeSet::new();

        for je in entries {
            sources.insert(Self::source_value(je));
            preparers.insert(je.header.created_by.clone());
            companies.insert(je.header.company_code.clone());
            currencies.insert(je.header.currency.clone());
            doc_types.insert(je.header.document_type.clone());
            dates.insert(je.header.posting_date.to_string());
            for line in &je.lines {
                // First occurrence wins: later descriptions for the same
                // account are ignored.
                accounts
                    .entry(line.gl_account.clone())
                    .or_insert_with(|| line.account_description.clone().unwrap_or_default());
            }
        }

        // Assign keys. Accounts carry a description, so they are numbered
        // here rather than through `intern`; the ordering rule is the same
        // (sorted by account, 1-based).
        let (acct_keys, account_rows): (BTreeMap<String, u32>, Vec<AccountDimRow>) = {
            let mut map = BTreeMap::new();
            let mut rows = Vec::with_capacity(accounts.len());
            for (i, (acct, desc)) in accounts.into_iter().enumerate() {
                let key = (i + 1) as u32;
                map.insert(acct.clone(), key);
                rows.push(AccountDimRow {
                    key,
                    gl_account: acct,
                    account_description: desc,
                });
            }
            (map, rows)
        };
        let (src_keys, source_rows) = intern(sources);
        let (prep_keys, preparer_rows) = intern(preparers);
        let (comp_keys, company_rows) = intern(companies);
        let (cur_keys, currency_rows) = intern(currencies);
        let (dt_keys, doc_type_rows) = intern(doc_types);
        let (date_keys, date_rows) = intern(dates);

        // Pass 2: build the fact table. Every map lookup below hits, because
        // pass 1 inserted exactly the values looked up here.
        let mut fact = Vec::new();
        for je in entries {
            // Header-level keys are shared by all lines of the entry.
            let src_key = src_keys[&Self::source_value(je)];
            let prep_key = prep_keys[&je.header.created_by];
            let comp_key = comp_keys[&je.header.company_code];
            let cur_key = cur_keys[&je.header.currency];
            let dt_key = dt_keys[&je.header.document_type];
            let date_key = date_keys[&je.header.posting_date.to_string()];
            let je_id = je.header.document_id.to_string();
            for line in &je.lines {
                // NOTE(review): a line whose debit does not exceed its credit,
                // including one with both amounts zero, lands in the credit
                // branch and carries the negated credit amount. Confirm that
                // is the intended treatment of zero-amount lines.
                let (drcr_key, amount) = if line.debit_amount > line.credit_amount {
                    (1u32, line.debit_amount)
                } else {
                    (2u32, -line.credit_amount)
                };
                fact.push(FactRow {
                    je_id: je_id.clone(),
                    line_number: line.line_number,
                    amount: amount.to_string(),
                    drcr_key,
                    account_key: acct_keys[&line.gl_account],
                    source_key: src_key,
                    preparer_key: prep_key,
                    company_key: comp_key,
                    currency_key: cur_key,
                    doc_type_key: dt_key,
                    date_key,
                });
            }
        }

        Self {
            fact,
            drcr: vec![(1, "Debit".into()), (2, "Credit".into())],
            accounts: account_rows,
            sources: source_rows,
            preparers: preparer_rows,
            companies: company_rows,
            currencies: currency_rows,
            doc_types: doc_type_rows,
            dates: date_rows,
        }
    }

    /// CSV for the fact table.
    ///
    /// Emits a header line followed by one line per [`FactRow`], each
    /// terminated by `\n`. Only `je_id` is passed through CSV escaping; the
    /// amount and the numeric keys are written verbatim.
    pub fn fact_csv(&self) -> String {
        let mut s = String::from(
            "je_id,line_number,amount,drcr_key,account_key,source_key,preparer_key,company_key,currency_key,doc_type_key,date_key\n",
        );
        for r in &self.fact {
            s.push_str(&format!(
                "{},{},{},{},{},{},{},{},{},{},{}\n",
                csv_escape(&r.je_id),
                r.line_number,
                r.amount,
                r.drcr_key,
                r.account_key,
                r.source_key,
                r.preparer_key,
                r.company_key,
                r.currency_key,
                r.doc_type_key,
                r.date_key,
            ));
        }
        s
    }

    /// CSV for the chart-of-accounts dimension.
    ///
    /// Emits a header line followed by one line per [`AccountDimRow`]; the
    /// account and its description are CSV-escaped.
    pub fn account_dim_csv(&self) -> String {
        let mut s = String::from("account_key,gl_account,account_description\n");
        for r in &self.accounts {
            s.push_str(&format!(
                "{},{},{}\n",
                r.key,
                csv_escape(&r.gl_account),
                csv_escape(&r.account_description)
            ));
        }
        s
    }

    /// CSV for a simple `(key, value)` dimension with the given column headers.
    ///
    /// `key_col` and `val_col` are written into the header line verbatim
    /// (not escaped); each value is CSV-escaped.
    pub fn simple_dim_csv(rows: &[(u32, String)], key_col: &str, val_col: &str) -> String {
        let mut s = format!("{key_col},{val_col}\n");
        for (k, v) in rows {
            s.push_str(&format!("{},{}\n", k, csv_escape(v)));
        }
        s
    }

    /// All export files as `(relative_filename, csv_contents)` pairs.
    ///
    /// Returns nine files: the fact table, the chart-of-accounts dimension,
    /// and one file per simple dimension.
    pub fn files(&self) -> Vec<(String, String)> {
        vec![
            ("fact_je_lines.csv".into(), self.fact_csv()),
            ("dim_account.csv".into(), self.account_dim_csv()),
            (
                "dim_drcr.csv".into(),
                Self::simple_dim_csv(&self.drcr, "drcr_key", "drcr"),
            ),
            (
                "dim_source.csv".into(),
                Self::simple_dim_csv(&self.sources, "source_key", "source"),
            ),
            (
                "dim_preparer.csv".into(),
                Self::simple_dim_csv(&self.preparers, "preparer_key", "preparer"),
            ),
            (
                "dim_company.csv".into(),
                Self::simple_dim_csv(&self.companies, "company_key", "company_code"),
            ),
            (
                "dim_currency.csv".into(),
                Self::simple_dim_csv(&self.currencies, "currency_key", "currency"),
            ),
            (
                "dim_document_type.csv".into(),
                Self::simple_dim_csv(&self.doc_types, "doc_type_key", "document_type"),
            ),
            (
                "dim_date.csv".into(),
                Self::simple_dim_csv(&self.dates, "date_key", "posting_date"),
            ),
        ]
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::journal_entry::{JournalEntry, JournalEntryHeader, JournalEntryLine};
    use chrono::NaiveDate;
    use rust_decimal::Decimal;

    /// Fixture: a journal entry for `company`, posted on 2026-03-15, with one
    /// line per `(account, debit, credit)` tuple. A tuple with a non-zero
    /// debit becomes a debit line; any other tuple becomes a credit line.
    fn je(company: &str, lines: Vec<(&str, i64, i64)>) -> JournalEntry {
        let posting_date = NaiveDate::from_ymd_opt(2026, 3, 15).unwrap();
        let header = JournalEntryHeader::new(company.to_string(), posting_date);
        let mut entry = JournalEntry::new(header);

        for (idx, (account, debit, credit)) in lines.into_iter().enumerate() {
            let line_no = (idx + 1) as u32;
            let line = match debit {
                0 => JournalEntryLine::credit(
                    entry.header.document_id,
                    line_no,
                    account.to_string(),
                    Decimal::from(credit),
                ),
                _ => JournalEntryLine::debit(
                    entry.header.document_id,
                    line_no,
                    account.to_string(),
                    Decimal::from(debit),
                ),
            };
            entry.add_line(line);
        }

        entry
    }

    #[test]
    fn fact_row_count_matches_lines_and_keys_resolve() {
        let first = je("1000", vec![("4000", 1000, 0), ("1100", 0, 1000)]);
        let second = je("2000", vec![("5000", 500, 0), ("2000", 0, 500)]);
        let export = DimensionalExport::from_entries(&[first, second]);

        // Two entries of two lines each, two companies, four distinct accounts.
        assert_eq!(export.fact.len(), 4);
        assert_eq!(export.companies.len(), 2);
        assert_eq!(export.accounts.len(), 4);

        let known_account_keys: std::collections::BTreeSet<u32> =
            export.accounts.iter().map(|row| row.key).collect();
        for row in &export.fact {
            // Every fact account_key resolves to a COA row.
            assert!(known_account_keys.contains(&row.account_key));
            // drcr keys are 1/2.
            assert!(matches!(row.drcr_key, 1 | 2));
        }

        // 9 files emitted.
        assert_eq!(export.files().len(), 9);
    }

    #[test]
    fn keys_are_deterministic() {
        let entries = [je("1000", vec![("4000", 1000, 0), ("1100", 0, 1000)])];

        // Building twice from the same input must give byte-identical CSVs.
        let first_run = DimensionalExport::from_entries(&entries);
        let second_run = DimensionalExport::from_entries(&entries);

        assert_eq!(first_run.fact_csv(), second_run.fact_csv());
        assert_eq!(first_run.account_dim_csv(), second_run.account_dim_csv());
    }

    #[test]
    fn signed_amounts_and_drcr_align() {
        let entries = [je("1000", vec![("4000", 1000, 0), ("1100", 0, 1000)])];
        let export = DimensionalExport::from_entries(&entries);

        let debit_row = export
            .fact
            .iter()
            .find(|row| row.drcr_key == 1)
            .expect("a debit row");
        let credit_row = export
            .fact
            .iter()
            .find(|row| row.drcr_key == 2)
            .expect("a credit row");

        // Debits are positive, credits negative.
        assert_eq!(debit_row.amount, "1000");
        assert_eq!(credit_row.amount, "-1000");
    }
}