Skip to main content

datasynth_eval/statistical/
line_item.rs

//! Line item distribution analysis.
//!
//! Analyzes the distribution of line item counts in journal entries
//! against expected empirical distributions from accounting research.
5
6use crate::error::{EvalError, EvalResult};
7use serde::{Deserialize, Serialize};
8use statrs::distribution::{ChiSquared, ContinuousCDF};
9use std::collections::HashMap;
10
/// Expected line item distribution from empirical research (Table III).
///
/// Each element is `(bucket lower bound, expected share of entries)`. Line
/// counts 2 through 9 each have their own bucket; the keys 10, 100 and 1000
/// are the lower bounds of the ranges 10-99, 100-999 and 1000+. The shares
/// add up to 1.0.
pub const EXPECTED_LINE_DISTRIBUTION: [(usize, f64); 11] = [
    (2, 0.6068),    // exactly 2 lines: 60.68%
    (3, 0.0577),    // exactly 3 lines: 5.77%
    (4, 0.1663),    // exactly 4 lines: 16.63%
    (5, 0.0306),    // exactly 5 lines: 3.06%
    (6, 0.0332),    // exactly 6 lines: 3.32%
    (7, 0.0113),    // exactly 7 lines: 1.13%
    (8, 0.0188),    // exactly 8 lines: 1.88%
    (9, 0.0042),    // exactly 9 lines: 0.42%
    (10, 0.0633),   // 10-99 lines: 6.33%
    (100, 0.0076),  // 100-999 lines: 0.76%
    (1000, 0.0002), // 1000+ lines: 0.02%
];
25
/// Expected share of entries whose total line count is even.
pub const EXPECTED_EVEN_RATIO: f64 = 0.88;

/// Expected share of entries whose debit line count equals their credit line count.
pub const EXPECTED_EQUAL_SPLIT_RATIO: f64 = 0.82;
31
32/// Results of line item distribution analysis.
33#[derive(Debug, Clone, Serialize, Deserialize)]
34pub struct LineItemAnalysis {
35    /// Number of entries analyzed.
36    pub sample_size: usize,
37    /// Distribution of line counts.
38    pub line_count_distribution: HashMap<usize, usize>,
39    /// Chi-squared statistic against expected distribution.
40    pub chi_squared: f64,
41    /// Degrees of freedom.
42    pub degrees_of_freedom: u32,
43    /// P-value from chi-squared test.
44    pub p_value: f64,
45    /// Ratio of entries with even line counts.
46    pub even_ratio: f64,
47    /// Deviation from expected even ratio.
48    pub even_ratio_deviation: f64,
49    /// Ratio of entries with equal debit/credit counts.
50    pub equal_split_ratio: f64,
51    /// Deviation from expected equal split ratio.
52    pub equal_split_deviation: f64,
53    /// Average line count.
54    pub avg_line_count: f64,
55    /// Minimum line count.
56    pub min_line_count: usize,
57    /// Maximum line count.
58    pub max_line_count: usize,
59    /// Whether test passes.
60    pub passes: bool,
61}
62
/// Input for line item analysis: the line counts of one journal entry.
///
/// The three counts are taken as supplied; this type does not check that
/// `debit_count + credit_count` equals `line_count`.
#[derive(Debug, Clone)]
pub struct LineItemEntry {
    /// Total number of lines in the entry.
    pub line_count: usize,
    /// Number of debit lines.
    pub debit_count: usize,
    /// Number of credit lines.
    pub credit_count: usize,
}
73
/// Analyzer for line item distributions.
pub struct LineItemAnalyzer {
    /// Significance level for statistical tests: the smallest chi-squared
    /// p-value that still counts as a pass.
    significance_level: f64,
}
79
80impl LineItemAnalyzer {
81    /// Create a new analyzer.
82    pub fn new(significance_level: f64) -> Self {
83        Self { significance_level }
84    }
85
86    /// Analyze line item distribution from entries.
87    pub fn analyze(&self, entries: &[LineItemEntry]) -> EvalResult<LineItemAnalysis> {
88        let n = entries.len();
89        if n < 10 {
90            return Err(EvalError::InsufficientData {
91                required: 10,
92                actual: n,
93            });
94        }
95
96        // Count line count occurrences
97        let mut line_count_distribution: HashMap<usize, usize> = HashMap::new();
98        for entry in entries {
99            *line_count_distribution.entry(entry.line_count).or_insert(0) += 1;
100        }
101
102        // Group into buckets matching expected distribution
103        let buckets = self.bucket_counts(&line_count_distribution, n);
104
105        // Chi-squared test
106        let (chi_squared, p_value) = self.chi_squared_test(&buckets, n);
107
108        // Even/odd analysis
109        let even_count = entries.iter().filter(|e| e.line_count % 2 == 0).count();
110        let even_ratio = even_count as f64 / n as f64;
111        let even_ratio_deviation = (even_ratio - EXPECTED_EVEN_RATIO).abs();
112
113        // Equal split analysis
114        let equal_split_count = entries
115            .iter()
116            .filter(|e| e.debit_count == e.credit_count)
117            .count();
118        let equal_split_ratio = equal_split_count as f64 / n as f64;
119        let equal_split_deviation = (equal_split_ratio - EXPECTED_EQUAL_SPLIT_RATIO).abs();
120
121        // Basic statistics
122        let line_counts: Vec<usize> = entries.iter().map(|e| e.line_count).collect();
123        let avg_line_count = line_counts.iter().sum::<usize>() as f64 / n as f64;
124        let min_line_count = *line_counts.iter().min().unwrap_or(&0);
125        let max_line_count = *line_counts.iter().max().unwrap_or(&0);
126
127        // Pass if chi-squared test passes and deviations are acceptable
128        let passes = p_value >= self.significance_level
129            && even_ratio_deviation < 0.10
130            && equal_split_deviation < 0.10;
131
132        Ok(LineItemAnalysis {
133            sample_size: n,
134            line_count_distribution,
135            chi_squared,
136            degrees_of_freedom: (EXPECTED_LINE_DISTRIBUTION.len() - 1) as u32,
137            p_value,
138            even_ratio,
139            even_ratio_deviation,
140            equal_split_ratio,
141            equal_split_deviation,
142            avg_line_count,
143            min_line_count,
144            max_line_count,
145            passes,
146        })
147    }
148
149    /// Bucket observed counts into expected distribution categories.
150    fn bucket_counts(
151        &self,
152        distribution: &HashMap<usize, usize>,
153        _total: usize,
154    ) -> Vec<(usize, usize)> {
155        let mut buckets = vec![
156            (2, 0usize),
157            (3, 0),
158            (4, 0),
159            (5, 0),
160            (6, 0),
161            (7, 0),
162            (8, 0),
163            (9, 0),
164            (10, 0),   // 10-99
165            (100, 0),  // 100-999
166            (1000, 0), // 1000+
167        ];
168
169        for (&count, &freq) in distribution {
170            let bucket_idx = match count {
171                2 => 0,
172                3 => 1,
173                4 => 2,
174                5 => 3,
175                6 => 4,
176                7 => 5,
177                8 => 6,
178                9 => 7,
179                10..=99 => 8,
180                100..=999 => 9,
181                _ if count >= 1000 => 10,
182                _ => continue, // Skip 0, 1
183            };
184            buckets[bucket_idx].1 += freq;
185        }
186
187        buckets
188    }
189
190    /// Perform chi-squared test against expected distribution.
191    fn chi_squared_test(&self, observed: &[(usize, usize)], n: usize) -> (f64, f64) {
192        let n_f64 = n as f64;
193
194        let chi_squared: f64 = observed
195            .iter()
196            .zip(EXPECTED_LINE_DISTRIBUTION.iter())
197            .map(|((_, obs), (_, exp_prob))| {
198                let expected = exp_prob * n_f64;
199                if expected > 0.0 {
200                    let obs_f64 = *obs as f64;
201                    (obs_f64 - expected).powi(2) / expected
202                } else {
203                    0.0
204                }
205            })
206            .sum();
207
208        let df = (EXPECTED_LINE_DISTRIBUTION.len() - 1) as f64;
209        let chi_sq_dist = ChiSquared::new(df).expect("df > 0 for chi-squared distribution");
210        let p_value = 1.0 - chi_sq_dist.cdf(chi_squared);
211
212        (chi_squared, p_value)
213    }
214}
215
216impl Default for LineItemAnalyzer {
217    fn default() -> Self {
218        Self::new(0.05)
219    }
220}
221
#[cfg(test)]
// Tests unwrap freely: a panic is the failure signal we want here.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Expand `(line_count, occurrences)` pairs into individual entries.
    ///
    /// Each entry gets `line_count / 2` debit lines and the remaining lines as
    /// credits, so even line counts yield an equal split.
    fn create_test_entries(distribution: &[(usize, usize)]) -> Vec<LineItemEntry> {
        distribution
            .iter()
            .flat_map(|&(line_count, occurrences)| {
                let debit_count = line_count / 2;
                std::iter::repeat_with(move || LineItemEntry {
                    line_count,
                    debit_count,
                    credit_count: line_count - debit_count,
                })
                .take(occurrences)
            })
            .collect()
    }

    #[test]
    fn test_line_item_analysis() {
        // Create distribution roughly matching expected
        let distribution = [
            (2, 607),
            (3, 58),
            (4, 166),
            (5, 31),
            (6, 33),
            (7, 11),
            (8, 19),
            (9, 4),
            (10, 63),
            (100, 8),
        ];

        let entries = create_test_entries(&distribution);
        let analyzer = LineItemAnalyzer::default();
        let result = analyzer.analyze(&entries).unwrap();

        assert_eq!(result.sample_size, 1000);
        assert!(result.avg_line_count > 2.0);
        assert_eq!(result.min_line_count, 2);
        assert_eq!(result.max_line_count, 100);
        assert_eq!(result.degrees_of_freedom, 10);
        // A sample this close to the expected shares should pass every check.
        assert!(result.passes);
    }

    #[test]
    fn test_even_ratio() {
        // (line_count, debit_count, credit_count)
        let rows = [
            (2, 1, 1),
            (4, 2, 2),
            (6, 3, 3),
            (8, 4, 4),
            (10, 5, 5),
            (2, 1, 1),
            (4, 2, 2),
            (6, 3, 3),
            (3, 2, 1),
            (5, 3, 2),
        ];
        let entries: Vec<LineItemEntry> = rows
            .iter()
            .map(|&(line_count, debit_count, credit_count)| LineItemEntry {
                line_count,
                debit_count,
                credit_count,
            })
            .collect();

        let analyzer = LineItemAnalyzer::default();
        let result = analyzer.analyze(&entries).unwrap();

        // 8 even out of 10
        assert!((result.even_ratio - 0.8).abs() < 0.01);
        // The same 8 entries have matching debit and credit counts.
        assert!((result.equal_split_ratio - 0.8).abs() < 0.01);
    }

    #[test]
    fn test_insufficient_data() {
        let entries = vec![LineItemEntry {
            line_count: 2,
            debit_count: 1,
            credit_count: 1,
        }];
        let analyzer = LineItemAnalyzer::default();
        let result = analyzer.analyze(&entries);
        assert!(matches!(result, Err(EvalError::InsufficientData { .. })));
    }
}