Skip to main content

datasynth_eval/statistical/
line_item.rs

1//! Line item distribution analysis.
2//!
3//! Analyzes the distribution of line item counts in journal entries
4//! against expected empirical distributions from accounting research.
5
6use crate::error::{EvalError, EvalResult};
7use serde::{Deserialize, Serialize};
8use statrs::distribution::{ChiSquared, ContinuousCDF};
9use std::collections::HashMap;
10
/// Reference probabilities for the number of lines in a journal entry
/// (empirical research, Table III).
///
/// Each element is `(bucket lower bound, expected share of entries)`.
/// Buckets `2` through `9` are exact line counts; the last three are ranges
/// identified by their lower bound. Entries are listed in ascending order of
/// lower bound and the shares sum to 1.0.
pub const EXPECTED_LINE_DISTRIBUTION: [(usize, f64); 11] = [
    // Exact line counts.
    (2, 0.6068), // two-line entries: 60.68%
    (3, 0.0577), // 5.77%
    (4, 0.1663), // 16.63%
    (5, 0.0306), // 3.06%
    (6, 0.0332), // 3.32%
    (7, 0.0113), // 1.13%
    (8, 0.0188), // 1.88%
    (9, 0.0042), // 0.42%
    // Ranged buckets, keyed by their lower bound.
    (10, 0.0633),   // 10 to 99 lines: 6.33%
    (100, 0.0076),  // 100 to 999 lines: 0.76%
    (1000, 0.0002), // 1000 lines or more: 0.02%
];
25
/// Reference share of journal entries whose total line count is even.
pub const EXPECTED_EVEN_RATIO: f64 = 0.88;
28
/// Reference share of journal entries that have as many debit lines as
/// credit lines.
pub const EXPECTED_EQUAL_SPLIT_RATIO: f64 = 0.82;
31
32/// Results of line item distribution analysis.
33#[derive(Debug, Clone, Serialize, Deserialize)]
34pub struct LineItemAnalysis {
35    /// Number of entries analyzed.
36    pub sample_size: usize,
37    /// Distribution of line counts.
38    pub line_count_distribution: HashMap<usize, usize>,
39    /// Chi-squared statistic against expected distribution.
40    pub chi_squared: f64,
41    /// Degrees of freedom.
42    pub degrees_of_freedom: u32,
43    /// P-value from chi-squared test.
44    pub p_value: f64,
45    /// Ratio of entries with even line counts.
46    pub even_ratio: f64,
47    /// Deviation from expected even ratio.
48    pub even_ratio_deviation: f64,
49    /// Ratio of entries with equal debit/credit counts.
50    pub equal_split_ratio: f64,
51    /// Deviation from expected equal split ratio.
52    pub equal_split_deviation: f64,
53    /// Average line count.
54    pub avg_line_count: f64,
55    /// Minimum line count.
56    pub min_line_count: usize,
57    /// Maximum line count.
58    pub max_line_count: usize,
59    /// Whether test passes.
60    pub passes: bool,
61}
62
/// One journal entry, reduced to the line counts the analyzer needs.
#[derive(Debug, Clone)]
pub struct LineItemEntry {
    /// Number of lines in the entry overall.
    pub line_count: usize,
    /// How many of those lines are debits.
    pub debit_count: usize,
    /// How many of those lines are credits.
    pub credit_count: usize,
}
73
/// Evaluates how closely journal-entry line counts follow the expected
/// distribution.
pub struct LineItemAnalyzer {
    /// Threshold the chi-squared p-value is compared against.
    significance_level: f64,
}
79
80impl LineItemAnalyzer {
81    /// Create a new analyzer.
82    pub fn new(significance_level: f64) -> Self {
83        Self { significance_level }
84    }
85
86    /// Analyze line item distribution from entries.
87    pub fn analyze(&self, entries: &[LineItemEntry]) -> EvalResult<LineItemAnalysis> {
88        let n = entries.len();
89        if n < 10 {
90            return Err(EvalError::InsufficientData {
91                required: 10,
92                actual: n,
93            });
94        }
95
96        // Count line count occurrences
97        let mut line_count_distribution: HashMap<usize, usize> = HashMap::new();
98        for entry in entries {
99            *line_count_distribution.entry(entry.line_count).or_insert(0) += 1;
100        }
101
102        // Group into buckets matching expected distribution
103        let buckets = self.bucket_counts(&line_count_distribution, n);
104
105        // Chi-squared test
106        let (chi_squared, p_value) = self.chi_squared_test(&buckets, n);
107
108        // Even/odd analysis
109        let even_count = entries.iter().filter(|e| e.line_count % 2 == 0).count();
110        let even_ratio = even_count as f64 / n as f64;
111        let even_ratio_deviation = (even_ratio - EXPECTED_EVEN_RATIO).abs();
112
113        // Equal split analysis
114        let equal_split_count = entries
115            .iter()
116            .filter(|e| e.debit_count == e.credit_count)
117            .count();
118        let equal_split_ratio = equal_split_count as f64 / n as f64;
119        let equal_split_deviation = (equal_split_ratio - EXPECTED_EQUAL_SPLIT_RATIO).abs();
120
121        // Basic statistics
122        let line_counts: Vec<usize> = entries.iter().map(|e| e.line_count).collect();
123        let avg_line_count = line_counts.iter().sum::<usize>() as f64 / n as f64;
124        let min_line_count = *line_counts.iter().min().unwrap_or(&0);
125        let max_line_count = *line_counts.iter().max().unwrap_or(&0);
126
127        // Pass if chi-squared test passes and deviations are acceptable
128        let passes = p_value >= self.significance_level
129            && even_ratio_deviation < 0.10
130            && equal_split_deviation < 0.10;
131
132        Ok(LineItemAnalysis {
133            sample_size: n,
134            line_count_distribution,
135            chi_squared,
136            degrees_of_freedom: (EXPECTED_LINE_DISTRIBUTION.len() - 1) as u32,
137            p_value,
138            even_ratio,
139            even_ratio_deviation,
140            equal_split_ratio,
141            equal_split_deviation,
142            avg_line_count,
143            min_line_count,
144            max_line_count,
145            passes,
146        })
147    }
148
149    /// Bucket observed counts into expected distribution categories.
150    fn bucket_counts(
151        &self,
152        distribution: &HashMap<usize, usize>,
153        _total: usize,
154    ) -> Vec<(usize, usize)> {
155        let mut buckets = vec![
156            (2, 0usize),
157            (3, 0),
158            (4, 0),
159            (5, 0),
160            (6, 0),
161            (7, 0),
162            (8, 0),
163            (9, 0),
164            (10, 0),   // 10-99
165            (100, 0),  // 100-999
166            (1000, 0), // 1000+
167        ];
168
169        for (&count, &freq) in distribution {
170            let bucket_idx = match count {
171                2 => 0,
172                3 => 1,
173                4 => 2,
174                5 => 3,
175                6 => 4,
176                7 => 5,
177                8 => 6,
178                9 => 7,
179                10..=99 => 8,
180                100..=999 => 9,
181                _ if count >= 1000 => 10,
182                _ => continue, // Skip 0, 1
183            };
184            buckets[bucket_idx].1 += freq;
185        }
186
187        buckets
188    }
189
190    /// Perform chi-squared test against expected distribution.
191    fn chi_squared_test(&self, observed: &[(usize, usize)], n: usize) -> (f64, f64) {
192        let n_f64 = n as f64;
193
194        let chi_squared: f64 = observed
195            .iter()
196            .zip(EXPECTED_LINE_DISTRIBUTION.iter())
197            .map(|((_, obs), (_, exp_prob))| {
198                let expected = exp_prob * n_f64;
199                if expected > 0.0 {
200                    let obs_f64 = *obs as f64;
201                    (obs_f64 - expected).powi(2) / expected
202                } else {
203                    0.0
204                }
205            })
206            .sum();
207
208        let df = (EXPECTED_LINE_DISTRIBUTION.len() - 1) as f64;
209        let chi_sq_dist = ChiSquared::new(df).unwrap();
210        let p_value = 1.0 - chi_sq_dist.cdf(chi_squared);
211
212        (chi_squared, p_value)
213    }
214}
215
216impl Default for LineItemAnalyzer {
217    fn default() -> Self {
218        Self::new(0.05)
219    }
220}
221
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an entry whose lines are split between debits and credits as
    /// evenly as possible (an odd line count gets one extra credit line).
    fn entry(line_count: usize) -> LineItemEntry {
        let debit_count = line_count / 2;
        LineItemEntry {
            line_count,
            debit_count,
            credit_count: line_count - debit_count,
        }
    }

    /// Expands `(line_count, number_of_entries)` pairs into individual entries.
    fn create_test_entries(distribution: &[(usize, usize)]) -> Vec<LineItemEntry> {
        distribution
            .iter()
            .flat_map(|&(line_count, count)| vec![entry(line_count); count])
            .collect()
    }

    #[test]
    fn test_line_item_analysis() {
        // Distribution closely matching the expected one (1000 entries).
        let distribution = vec![
            (2, 607),
            (3, 58),
            (4, 166),
            (5, 31),
            (6, 33),
            (7, 11),
            (8, 19),
            (9, 4),
            (10, 63),
            (100, 8),
        ];

        let entries = create_test_entries(&distribution);
        let analyzer = LineItemAnalyzer::default();
        let result = analyzer.analyze(&entries).unwrap();

        assert_eq!(result.sample_size, 1000);
        assert!(result.avg_line_count > 2.0);
        assert_eq!(result.min_line_count, 2);
        assert_eq!(result.max_line_count, 100);
        assert_eq!(result.line_count_distribution.get(&2), Some(&607));
        assert_eq!(result.degrees_of_freedom, 10);

        // A near-perfect match must yield a small statistic, a large p-value
        // and an overall pass.
        assert!(result.chi_squared < 1.0);
        assert!(result.p_value > 0.05);
        assert!(result.passes);
    }

    #[test]
    fn test_even_ratio() {
        // Eight even line counts and two odd ones.
        let entries: Vec<LineItemEntry> = [2, 4, 6, 8, 10, 2, 4, 6, 3, 5]
            .iter()
            .map(|&line_count| entry(line_count))
            .collect();

        let analyzer = LineItemAnalyzer::default();
        let result = analyzer.analyze(&entries).unwrap();

        assert_eq!(result.sample_size, 10);
        // 8 even out of 10
        assert!((result.even_ratio - 0.8).abs() < 0.01);
        // Only the even-sized entries have equal debit/credit counts.
        assert!((result.equal_split_ratio - 0.8).abs() < 0.01);
    }

    #[test]
    fn test_bucket_boundaries() {
        let distribution: HashMap<usize, usize> = [0, 1, 2, 9, 10, 99, 100, 999, 1000, 5000]
            .iter()
            .map(|&line_count| (line_count, 1))
            .collect();

        let analyzer = LineItemAnalyzer::default();
        let buckets = analyzer.bucket_counts(&distribution, 10);

        // Line counts 0 and 1 are skipped; ranged buckets include both ends.
        assert_eq!(
            buckets,
            vec![
                (2, 1),
                (3, 0),
                (4, 0),
                (5, 0),
                (6, 0),
                (7, 0),
                (8, 0),
                (9, 1),
                (10, 2),
                (100, 2),
                (1000, 2),
            ]
        );
    }

    #[test]
    fn test_insufficient_data() {
        let analyzer = LineItemAnalyzer::default();

        let result = analyzer.analyze(&[entry(2)]);
        assert!(matches!(result, Err(EvalError::InsufficientData { .. })));

        // One entry short of the minimum is still rejected.
        let result = analyzer.analyze(&vec![entry(2); 9]);
        assert!(matches!(
            result,
            Err(EvalError::InsufficientData {
                required: 10,
                actual: 9
            })
        ));
    }
}