1use crate::error::{EvalError, EvalResult};
7use serde::{Deserialize, Serialize};
8use statrs::distribution::{ChiSquared, ContinuousCDF};
9use std::collections::HashMap;
10
/// Expected share of entries per line-count bucket, stored as
/// `(bucket label, expected probability)` pairs.
///
/// Labels `2` through `9` stand for exactly that line count. The last three
/// labels name the lower bound of a range, matching how the analyzer buckets
/// observed counts. The probabilities sum to 1.0.
pub const EXPECTED_LINE_DISTRIBUTION: [(usize, f64); 11] = [
    (2, 0.6068),
    (3, 0.0577),
    (4, 0.1663),
    (5, 0.0306),
    (6, 0.0332),
    (7, 0.0113),
    (8, 0.0188),
    (9, 0.0042),
    (10, 0.0633),   // line counts 10..=99
    (100, 0.0076),  // line counts 100..=999
    (1000, 0.0002), // line counts 1000 and above
];

/// Expected fraction of entries whose line count is even.
pub const EXPECTED_EVEN_RATIO: f64 = 0.88;

/// Expected fraction of entries whose debit count equals their credit count.
pub const EXPECTED_EQUAL_SPLIT_RATIO: f64 = 0.82;
31
/// Result of running `LineItemAnalyzer::analyze` over a set of entries.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LineItemAnalysis {
    /// Number of entries that were analyzed.
    pub sample_size: usize,
    /// Observed histogram: line count -> number of entries with that line count.
    pub line_count_distribution: HashMap<usize, usize>,
    /// Chi-squared statistic of the bucketed observed counts against
    /// `EXPECTED_LINE_DISTRIBUTION`.
    pub chi_squared: f64,
    /// Degrees of freedom used for the test (number of buckets minus one).
    pub degrees_of_freedom: u32,
    /// Upper-tail probability of `chi_squared`, computed as one minus the
    /// chi-squared CDF at the statistic.
    pub p_value: f64,
    /// Fraction of entries whose line count is even.
    pub even_ratio: f64,
    /// Absolute difference between `even_ratio` and `EXPECTED_EVEN_RATIO`.
    pub even_ratio_deviation: f64,
    /// Fraction of entries whose debit count equals their credit count.
    pub equal_split_ratio: f64,
    /// Absolute difference between `equal_split_ratio` and
    /// `EXPECTED_EQUAL_SPLIT_RATIO`.
    pub equal_split_deviation: f64,
    /// Arithmetic mean of the line counts.
    pub avg_line_count: f64,
    /// Smallest line count seen.
    pub min_line_count: usize,
    /// Largest line count seen.
    pub max_line_count: usize,
    /// `true` when `p_value` is at least the analyzer's significance level and
    /// both ratio deviations are below 0.10.
    pub passes: bool,
}
62
/// One observation fed to the analyzer.
///
/// Nothing in this file checks that `debit_count + credit_count` equals
/// `line_count`; the analyzer only compares the two counts for equality.
#[derive(Debug, Clone)]
pub struct LineItemEntry {
    /// Total number of lines in the entry.
    pub line_count: usize,
    /// Presumably the number of debit lines in the entry -- confirm with the producer.
    pub debit_count: usize,
    /// Presumably the number of credit lines in the entry -- confirm with the producer.
    pub credit_count: usize,
}
73
/// Checks a set of `LineItemEntry` values against the expected line-count
/// distribution and the expected even / equal-split ratios.
pub struct LineItemAnalyzer {
    /// Threshold the chi-squared p-value must reach (`p_value >= significance_level`)
    /// for an analysis to pass.
    significance_level: f64,
}
79
80impl LineItemAnalyzer {
81 pub fn new(significance_level: f64) -> Self {
83 Self { significance_level }
84 }
85
86 pub fn analyze(&self, entries: &[LineItemEntry]) -> EvalResult<LineItemAnalysis> {
88 let n = entries.len();
89 if n < 10 {
90 return Err(EvalError::InsufficientData {
91 required: 10,
92 actual: n,
93 });
94 }
95
96 let mut line_count_distribution: HashMap<usize, usize> = HashMap::new();
98 for entry in entries {
99 *line_count_distribution.entry(entry.line_count).or_insert(0) += 1;
100 }
101
102 let buckets = self.bucket_counts(&line_count_distribution, n);
104
105 let (chi_squared, p_value) = self.chi_squared_test(&buckets, n);
107
108 let even_count = entries.iter().filter(|e| e.line_count % 2 == 0).count();
110 let even_ratio = even_count as f64 / n as f64;
111 let even_ratio_deviation = (even_ratio - EXPECTED_EVEN_RATIO).abs();
112
113 let equal_split_count = entries
115 .iter()
116 .filter(|e| e.debit_count == e.credit_count)
117 .count();
118 let equal_split_ratio = equal_split_count as f64 / n as f64;
119 let equal_split_deviation = (equal_split_ratio - EXPECTED_EQUAL_SPLIT_RATIO).abs();
120
121 let line_counts: Vec<usize> = entries.iter().map(|e| e.line_count).collect();
123 let avg_line_count = line_counts.iter().sum::<usize>() as f64 / n as f64;
124 let min_line_count = *line_counts.iter().min().unwrap_or(&0);
125 let max_line_count = *line_counts.iter().max().unwrap_or(&0);
126
127 let passes = p_value >= self.significance_level
129 && even_ratio_deviation < 0.10
130 && equal_split_deviation < 0.10;
131
132 Ok(LineItemAnalysis {
133 sample_size: n,
134 line_count_distribution,
135 chi_squared,
136 degrees_of_freedom: (EXPECTED_LINE_DISTRIBUTION.len() - 1) as u32,
137 p_value,
138 even_ratio,
139 even_ratio_deviation,
140 equal_split_ratio,
141 equal_split_deviation,
142 avg_line_count,
143 min_line_count,
144 max_line_count,
145 passes,
146 })
147 }
148
149 fn bucket_counts(
151 &self,
152 distribution: &HashMap<usize, usize>,
153 _total: usize,
154 ) -> Vec<(usize, usize)> {
155 let mut buckets = vec![
156 (2, 0usize),
157 (3, 0),
158 (4, 0),
159 (5, 0),
160 (6, 0),
161 (7, 0),
162 (8, 0),
163 (9, 0),
164 (10, 0), (100, 0), (1000, 0), ];
168
169 for (&count, &freq) in distribution {
170 let bucket_idx = match count {
171 2 => 0,
172 3 => 1,
173 4 => 2,
174 5 => 3,
175 6 => 4,
176 7 => 5,
177 8 => 6,
178 9 => 7,
179 10..=99 => 8,
180 100..=999 => 9,
181 _ if count >= 1000 => 10,
182 _ => continue, };
184 buckets[bucket_idx].1 += freq;
185 }
186
187 buckets
188 }
189
190 fn chi_squared_test(&self, observed: &[(usize, usize)], n: usize) -> (f64, f64) {
192 let n_f64 = n as f64;
193
194 let chi_squared: f64 = observed
195 .iter()
196 .zip(EXPECTED_LINE_DISTRIBUTION.iter())
197 .map(|((_, obs), (_, exp_prob))| {
198 let expected = exp_prob * n_f64;
199 if expected > 0.0 {
200 let obs_f64 = *obs as f64;
201 (obs_f64 - expected).powi(2) / expected
202 } else {
203 0.0
204 }
205 })
206 .sum();
207
208 let df = (EXPECTED_LINE_DISTRIBUTION.len() - 1) as f64;
209 let chi_sq_dist = ChiSquared::new(df).unwrap();
210 let p_value = 1.0 - chi_sq_dist.cdf(chi_squared);
211
212 (chi_squared, p_value)
213 }
214}
215
216impl Default for LineItemAnalyzer {
217 fn default() -> Self {
218 Self::new(0.05)
219 }
220}
221
#[cfg(test)]
mod tests {
    use super::*;

    /// Expands `(line_count, how_many)` pairs into individual entries, splitting
    /// each entry's lines into `line_count / 2` debits and the remainder as credits.
    fn create_test_entries(distribution: &[(usize, usize)]) -> Vec<LineItemEntry> {
        distribution
            .iter()
            .flat_map(|&(line_count, count)| {
                (0..count).map(move |_| {
                    let debit_count = line_count / 2;
                    LineItemEntry {
                        line_count,
                        debit_count,
                        credit_count: line_count - debit_count,
                    }
                })
            })
            .collect()
    }

    /// A 1000-entry sample is accepted and summarized.
    #[test]
    fn test_line_item_analysis() {
        let distribution: [(usize, usize); 10] = [
            (2, 607),
            (3, 58),
            (4, 166),
            (5, 31),
            (6, 33),
            (7, 11),
            (8, 19),
            (9, 4),
            (10, 63),
            (100, 8),
        ];

        let entries = create_test_entries(&distribution);
        let result = LineItemAnalyzer::default().analyze(&entries).unwrap();

        assert_eq!(result.sample_size, 1000);
        assert!(result.avg_line_count > 2.0);
    }

    /// Eight of ten entries have an even line count, so the even ratio is 0.8.
    #[test]
    fn test_even_ratio() {
        // (line_count, debit_count, credit_count)
        let shapes: [(usize, usize, usize); 10] = [
            (2, 1, 1),
            (4, 2, 2),
            (6, 3, 3),
            (8, 4, 4),
            (10, 5, 5),
            (2, 1, 1),
            (4, 2, 2),
            (6, 3, 3),
            (3, 2, 1),
            (5, 3, 2),
        ];
        let entries: Vec<LineItemEntry> = shapes
            .iter()
            .map(|&(line_count, debit_count, credit_count)| LineItemEntry {
                line_count,
                debit_count,
                credit_count,
            })
            .collect();

        let result = LineItemAnalyzer::default().analyze(&entries).unwrap();

        assert!((result.even_ratio - 0.8).abs() < 0.01);
    }

    /// A single entry is below the minimum sample size and must be rejected.
    #[test]
    fn test_insufficient_data() {
        let entries = [LineItemEntry {
            line_count: 2,
            debit_count: 1,
            credit_count: 1,
        }];

        let outcome = LineItemAnalyzer::default().analyze(&entries);

        assert!(matches!(outcome, Err(EvalError::InsufficientData { .. })));
    }
}