datasynth_core/distributions/
line_item.rs

1//! Line item count distribution sampler.
2//!
3//! Implements the empirical distribution of journal entry line items
4//! as observed in the accounting network generation research.
5//!
6//! Key findings from the paper:
7//! - 60.68% of journal entries have exactly 2 line items
8//! - 16.63% have 4 line items
9//! - 88% have an even number of line items
10//! - 82% have equal debit and credit line counts
11
12use rand::prelude::*;
13use rand_chacha::ChaCha8Rng;
14use serde::{Deserialize, Serialize};
15
16/// Configuration for line item count distribution.
17///
18/// Based on empirical findings from Table III of the accounting network paper.
19#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct LineItemDistributionConfig {
21    /// Probability of 2 line items (60.68%)
22    pub two_items: f64,
23    /// Probability of 3 line items (5.77%)
24    pub three_items: f64,
25    /// Probability of 4 line items (16.63%)
26    pub four_items: f64,
27    /// Probability of 5 line items (3.06%)
28    pub five_items: f64,
29    /// Probability of 6 line items (3.32%)
30    pub six_items: f64,
31    /// Probability of 7 line items (1.13%)
32    pub seven_items: f64,
33    /// Probability of 8 line items (1.88%)
34    pub eight_items: f64,
35    /// Probability of 9 line items (0.42%)
36    pub nine_items: f64,
37    /// Probability of 10-99 line items (6.33%)
38    pub ten_to_ninety_nine: f64,
39    /// Probability of 100-999 line items (0.76%)
40    pub hundred_to_nine_ninety_nine: f64,
41    /// Probability of 1000+ line items (0.02%)
42    pub thousand_plus: f64,
43}
44
45impl Default for LineItemDistributionConfig {
46    fn default() -> Self {
47        // Values from Table III of the paper
48        Self {
49            two_items: 0.6068,
50            three_items: 0.0577,
51            four_items: 0.1663,
52            five_items: 0.0306,
53            six_items: 0.0332,
54            seven_items: 0.0113,
55            eight_items: 0.0188,
56            nine_items: 0.0042,
57            ten_to_ninety_nine: 0.0633,
58            hundred_to_nine_ninety_nine: 0.0076,
59            thousand_plus: 0.0002,
60        }
61    }
62}
63
64impl LineItemDistributionConfig {
65    /// Validate that probabilities sum to approximately 1.0.
66    pub fn validate(&self) -> Result<(), String> {
67        let sum = self.two_items
68            + self.three_items
69            + self.four_items
70            + self.five_items
71            + self.six_items
72            + self.seven_items
73            + self.eight_items
74            + self.nine_items
75            + self.ten_to_ninety_nine
76            + self.hundred_to_nine_ninety_nine
77            + self.thousand_plus;
78
79        if (sum - 1.0).abs() > 0.01 {
80            return Err(format!(
81                "Line item distribution probabilities sum to {}, expected ~1.0",
82                sum
83            ));
84        }
85        Ok(())
86    }
87
88    /// Get cumulative distribution values.
89    fn cumulative(&self) -> [f64; 11] {
90        let mut cum = [0.0; 11];
91        cum[0] = self.two_items;
92        cum[1] = cum[0] + self.three_items;
93        cum[2] = cum[1] + self.four_items;
94        cum[3] = cum[2] + self.five_items;
95        cum[4] = cum[3] + self.six_items;
96        cum[5] = cum[4] + self.seven_items;
97        cum[6] = cum[5] + self.eight_items;
98        cum[7] = cum[6] + self.nine_items;
99        cum[8] = cum[7] + self.ten_to_ninety_nine;
100        cum[9] = cum[8] + self.hundred_to_nine_ninety_nine;
101        cum[10] = cum[9] + self.thousand_plus;
102        cum
103    }
104}
105
106/// Configuration for even/odd line count distribution.
107#[derive(Debug, Clone, Serialize, Deserialize)]
108pub struct EvenOddDistributionConfig {
109    /// Probability of even line count (88%)
110    pub even: f64,
111    /// Probability of odd line count (12%)
112    pub odd: f64,
113}
114
115impl Default for EvenOddDistributionConfig {
116    fn default() -> Self {
117        // From the paper: 88% even, 12% odd
118        Self {
119            even: 0.88,
120            odd: 0.12,
121        }
122    }
123}
124
125/// Configuration for debit/credit balance distribution.
126#[derive(Debug, Clone, Serialize, Deserialize)]
127pub struct DebitCreditDistributionConfig {
128    /// Probability of equal debit and credit counts (82%)
129    pub equal: f64,
130    /// Probability of more debit lines than credit (7%)
131    pub more_debit: f64,
132    /// Probability of more credit lines than debit (11%)
133    pub more_credit: f64,
134}
135
136impl Default for DebitCreditDistributionConfig {
137    fn default() -> Self {
138        // From the paper: 82% equal, 11% more credit, 7% more debit
139        Self {
140            equal: 0.82,
141            more_debit: 0.07,
142            more_credit: 0.11,
143        }
144    }
145}
146
147/// Sampler for journal entry line item counts.
148///
149/// Produces realistic line item counts based on empirical distributions
150/// from real-world general ledger data.
151pub struct LineItemSampler {
152    /// RNG for sampling
153    rng: ChaCha8Rng,
154    /// Line item distribution config (retained for potential resets/reconfiguration)
155    #[allow(dead_code)]
156    line_config: LineItemDistributionConfig,
157    /// Even/odd distribution config
158    even_odd_config: EvenOddDistributionConfig,
159    /// Debit/credit distribution config
160    debit_credit_config: DebitCreditDistributionConfig,
161    /// Cumulative distribution for line counts
162    cumulative: [f64; 11],
163}
164
165impl LineItemSampler {
166    /// Create a new sampler with default configuration.
167    pub fn new(seed: u64) -> Self {
168        let line_config = LineItemDistributionConfig::default();
169        let cumulative = line_config.cumulative();
170
171        Self {
172            rng: ChaCha8Rng::seed_from_u64(seed),
173            line_config,
174            even_odd_config: EvenOddDistributionConfig::default(),
175            debit_credit_config: DebitCreditDistributionConfig::default(),
176            cumulative,
177        }
178    }
179
180    /// Create a sampler with custom configuration.
181    pub fn with_config(
182        seed: u64,
183        line_config: LineItemDistributionConfig,
184        even_odd_config: EvenOddDistributionConfig,
185        debit_credit_config: DebitCreditDistributionConfig,
186    ) -> Self {
187        let cumulative = line_config.cumulative();
188
189        Self {
190            rng: ChaCha8Rng::seed_from_u64(seed),
191            line_config,
192            even_odd_config,
193            debit_credit_config,
194            cumulative,
195        }
196    }
197
198    /// Sample a line item count.
199    pub fn sample_count(&mut self) -> usize {
200        let p: f64 = self.rng.gen();
201
202        // Find the bin using cumulative distribution
203        if p < self.cumulative[0] {
204            2
205        } else if p < self.cumulative[1] {
206            3
207        } else if p < self.cumulative[2] {
208            4
209        } else if p < self.cumulative[3] {
210            5
211        } else if p < self.cumulative[4] {
212            6
213        } else if p < self.cumulative[5] {
214            7
215        } else if p < self.cumulative[6] {
216            8
217        } else if p < self.cumulative[7] {
218            9
219        } else if p < self.cumulative[8] {
220            // 10-99 range - use uniform distribution within range
221            self.rng.gen_range(10..100)
222        } else if p < self.cumulative[9] {
223            // 100-999 range
224            self.rng.gen_range(100..1000)
225        } else {
226            // 1000+ range (cap at 10000 for practicality)
227            self.rng.gen_range(1000..10000)
228        }
229    }
230
231    /// Sample whether the count should be even.
232    pub fn sample_even(&mut self) -> bool {
233        self.rng.gen::<f64>() < self.even_odd_config.even
234    }
235
236    /// Sample a line item count with even/odd constraint.
237    ///
238    /// When adjustment is needed, randomly chooses to increment or decrement
239    /// to avoid biasing toward lower counts.
240    pub fn sample_count_with_parity(&mut self) -> usize {
241        let base_count = self.sample_count();
242        let should_be_even = self.sample_even();
243
244        // Adjust to match parity requirement
245        let is_even = base_count % 2 == 0;
246        if should_be_even != is_even {
247            // Use symmetric adjustment: randomly increment or decrement
248            if base_count <= 2 {
249                // Can only increment for small counts
250                base_count + 1
251            } else if self.rng.gen::<bool>() {
252                // Randomly choose to increment
253                base_count + 1
254            } else {
255                // Randomly choose to decrement
256                base_count - 1
257            }
258        } else {
259            base_count
260        }
261    }
262
263    /// Sample the debit/credit split type.
264    pub fn sample_debit_credit_type(&mut self) -> DebitCreditSplit {
265        let p: f64 = self.rng.gen();
266
267        if p < self.debit_credit_config.equal {
268            DebitCreditSplit::Equal
269        } else if p < self.debit_credit_config.equal + self.debit_credit_config.more_debit {
270            DebitCreditSplit::MoreDebit
271        } else {
272            DebitCreditSplit::MoreCredit
273        }
274    }
275
276    /// Sample a complete line item specification.
277    pub fn sample(&mut self) -> LineItemSpec {
278        let total_count = self.sample_count_with_parity();
279        let split_type = self.sample_debit_credit_type();
280
281        let (debit_count, credit_count) = match split_type {
282            DebitCreditSplit::Equal => {
283                let half = total_count / 2;
284                (half, total_count - half)
285            }
286            DebitCreditSplit::MoreDebit => {
287                // More debit lines - 60% debit, 40% credit
288                let debit = (total_count as f64 * 0.6).round() as usize;
289                let debit = debit.max(1).min(total_count - 1);
290                (debit, total_count - debit)
291            }
292            DebitCreditSplit::MoreCredit => {
293                // More credit lines - 40% debit, 60% credit
294                let credit = (total_count as f64 * 0.6).round() as usize;
295                let credit = credit.max(1).min(total_count - 1);
296                (total_count - credit, credit)
297            }
298        };
299
300        LineItemSpec {
301            total_count,
302            debit_count,
303            credit_count,
304            split_type,
305        }
306    }
307
308    /// Reset the sampler with the same seed.
309    pub fn reset(&mut self, seed: u64) {
310        self.rng = ChaCha8Rng::seed_from_u64(seed);
311    }
312}
313
/// Type of debit/credit split.
///
/// `Hash` is derived alongside `Eq` so the variants can be used as keys in
/// hash maps and sets, e.g. when tallying sampled splits.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DebitCreditSplit {
    /// Equal number of debit and credit lines
    Equal,
    /// More debit lines than credit
    MoreDebit,
    /// More credit lines than debit
    MoreCredit,
}
324
325/// Specification for line items in a journal entry.
326#[derive(Debug, Clone)]
327pub struct LineItemSpec {
328    /// Total number of line items
329    pub total_count: usize,
330    /// Number of debit lines
331    pub debit_count: usize,
332    /// Number of credit lines
333    pub credit_count: usize,
334    /// Type of debit/credit split
335    pub split_type: DebitCreditSplit,
336}
337
338impl LineItemSpec {
339    /// Check if the spec is valid.
340    pub fn is_valid(&self) -> bool {
341        self.total_count >= 2
342            && self.debit_count >= 1
343            && self.credit_count >= 1
344            && self.debit_count + self.credit_count == self.total_count
345    }
346}
347
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    /// The paper-derived defaults must pass their own validation.
    #[test]
    fn test_default_config_valid() {
        let validation = LineItemDistributionConfig::default().validate();
        assert!(validation.is_ok());
    }

    /// Two samplers built from the same seed must stay in lockstep.
    #[test]
    fn test_sampler_determinism() {
        let mut first = LineItemSampler::new(42);
        let mut second = LineItemSampler::new(42);

        for _ in 0..100 {
            let (left, right) = (first.sample_count(), second.sample_count());
            assert_eq!(left, right);
        }
    }

    /// The two dominant buckets should show up with roughly their
    /// configured shares in a large sample.
    #[test]
    fn test_sampler_distribution() {
        let sample_size: u32 = 100_000;
        let mut sampler = LineItemSampler::new(42);

        // Tally how often each line count is drawn.
        let mut histogram: HashMap<usize, u32> = HashMap::new();
        for count in (0..sample_size).map(|_| sampler.sample_count()) {
            *histogram.entry(count).or_default() += 1;
        }

        // Fraction of all samples that produced exactly `count` lines.
        let share_of = |count: usize| {
            f64::from(histogram.get(&count).copied().unwrap_or(0)) / f64::from(sample_size)
        };

        // 2-line entries are the most common bucket.
        let two_share = share_of(2);
        assert!(
            two_share > 0.55 && two_share < 0.65,
            "Expected ~60% 2-item entries, got {}%",
            two_share * 100.0
        );

        // 4-line entries are the second most common bucket.
        let four_share = share_of(4);
        assert!(
            four_share > 0.13 && four_share < 0.20,
            "Expected ~16% 4-item entries, got {}%",
            four_share * 100.0
        );
    }

    /// Every sampled spec must satisfy its own validity check.
    #[test]
    fn test_line_item_spec_valid() {
        let mut sampler = LineItemSampler::new(42);

        for spec in (0..1000).map(|_| sampler.sample()) {
            assert!(spec.is_valid(), "Invalid spec: {:?}", spec);
        }
    }
}
datasynth_core/distributions/line_item.rs

datasynth_core/distributions/
line_item.rs