1use crate::error::{EvalError, EvalResult};
7use serde::{Deserialize, Serialize};
8use statrs::distribution::{ChiSquared, ContinuousCDF};
9use std::collections::HashMap;
10
/// Reference distribution of line-item counts, as `(bucket label, probability)` pairs.
///
/// The order of the pairs matches the bucket order produced by
/// `LineItemAnalyzer::bucket_counts`, where the labels `10`, `100` and `1000`
/// collect the ranges `10..=99`, `100..=999` and `1000` or more respectively.
/// The probabilities add up to 1.0.
pub const EXPECTED_LINE_DISTRIBUTION: [(usize, f64); 11] = [
    (2, 0.6068),
    (3, 0.0577),
    (4, 0.1663),
    (5, 0.0306),
    (6, 0.0332),
    (7, 0.0113),
    (8, 0.0188),
    (9, 0.0042),
    (10, 0.0633),
    (100, 0.0076),
    (1000, 0.0002),
];
25
/// Reference share of entries whose line count is even; `LineItemAnalyzer::analyze`
/// reports the absolute distance of the observed share from this value.
pub const EXPECTED_EVEN_RATIO: f64 = 0.88;
28
/// Reference share of entries with as many debit lines as credit lines;
/// `LineItemAnalyzer::analyze` reports the absolute distance of the observed
/// share from this value.
pub const EXPECTED_EQUAL_SPLIT_RATIO: f64 = 0.82;
31
32#[derive(Debug, Clone, Serialize, Deserialize)]
34pub struct LineItemAnalysis {
35 pub sample_size: usize,
37 pub line_count_distribution: HashMap<usize, usize>,
39 pub chi_squared: f64,
41 pub degrees_of_freedom: u32,
43 pub p_value: f64,
45 pub even_ratio: f64,
47 pub even_ratio_deviation: f64,
49 pub equal_split_ratio: f64,
51 pub equal_split_deviation: f64,
53 pub avg_line_count: f64,
55 pub min_line_count: usize,
57 pub max_line_count: usize,
59 pub passes: bool,
61}
62
/// Line-item counts of a single entry fed into the analyzer.
#[derive(Clone, Debug)]
pub struct LineItemEntry {
    /// Total number of line items on the entry.
    pub line_count: usize,
    /// Number of debit lines.
    pub debit_count: usize,
    /// Number of credit lines.
    pub credit_count: usize,
}
73
/// Checks a set of entries against the reference line-item statistics.
pub struct LineItemAnalyzer {
    /// Minimum p-value the chi-squared test must reach for the analysis to pass.
    significance_level: f64,
}
79
80impl LineItemAnalyzer {
81 pub fn new(significance_level: f64) -> Self {
83 Self { significance_level }
84 }
85
86 pub fn analyze(&self, entries: &[LineItemEntry]) -> EvalResult<LineItemAnalysis> {
88 let n = entries.len();
89 if n < 10 {
90 return Err(EvalError::InsufficientData {
91 required: 10,
92 actual: n,
93 });
94 }
95
96 let mut line_count_distribution: HashMap<usize, usize> = HashMap::new();
98 for entry in entries {
99 *line_count_distribution.entry(entry.line_count).or_insert(0) += 1;
100 }
101
102 let buckets = self.bucket_counts(&line_count_distribution, n);
104
105 let (chi_squared, p_value) = self.chi_squared_test(&buckets, n);
107
108 let even_count = entries.iter().filter(|e| e.line_count % 2 == 0).count();
110 let even_ratio = even_count as f64 / n as f64;
111 let even_ratio_deviation = (even_ratio - EXPECTED_EVEN_RATIO).abs();
112
113 let equal_split_count = entries
115 .iter()
116 .filter(|e| e.debit_count == e.credit_count)
117 .count();
118 let equal_split_ratio = equal_split_count as f64 / n as f64;
119 let equal_split_deviation = (equal_split_ratio - EXPECTED_EQUAL_SPLIT_RATIO).abs();
120
121 let line_counts: Vec<usize> = entries.iter().map(|e| e.line_count).collect();
123 let avg_line_count = line_counts.iter().sum::<usize>() as f64 / n as f64;
124 let min_line_count = *line_counts.iter().min().unwrap_or(&0);
125 let max_line_count = *line_counts.iter().max().unwrap_or(&0);
126
127 let passes = p_value >= self.significance_level
129 && even_ratio_deviation < 0.10
130 && equal_split_deviation < 0.10;
131
132 Ok(LineItemAnalysis {
133 sample_size: n,
134 line_count_distribution,
135 chi_squared,
136 degrees_of_freedom: (EXPECTED_LINE_DISTRIBUTION.len() - 1) as u32,
137 p_value,
138 even_ratio,
139 even_ratio_deviation,
140 equal_split_ratio,
141 equal_split_deviation,
142 avg_line_count,
143 min_line_count,
144 max_line_count,
145 passes,
146 })
147 }
148
149 fn bucket_counts(
151 &self,
152 distribution: &HashMap<usize, usize>,
153 _total: usize,
154 ) -> Vec<(usize, usize)> {
155 let mut buckets = vec![
156 (2, 0usize),
157 (3, 0),
158 (4, 0),
159 (5, 0),
160 (6, 0),
161 (7, 0),
162 (8, 0),
163 (9, 0),
164 (10, 0), (100, 0), (1000, 0), ];
168
169 for (&count, &freq) in distribution {
170 let bucket_idx = match count {
171 2 => 0,
172 3 => 1,
173 4 => 2,
174 5 => 3,
175 6 => 4,
176 7 => 5,
177 8 => 6,
178 9 => 7,
179 10..=99 => 8,
180 100..=999 => 9,
181 _ if count >= 1000 => 10,
182 _ => continue, };
184 buckets[bucket_idx].1 += freq;
185 }
186
187 buckets
188 }
189
190 fn chi_squared_test(&self, observed: &[(usize, usize)], n: usize) -> (f64, f64) {
192 let n_f64 = n as f64;
193
194 let chi_squared: f64 = observed
195 .iter()
196 .zip(EXPECTED_LINE_DISTRIBUTION.iter())
197 .map(|((_, obs), (_, exp_prob))| {
198 let expected = exp_prob * n_f64;
199 if expected > 0.0 {
200 let obs_f64 = *obs as f64;
201 (obs_f64 - expected).powi(2) / expected
202 } else {
203 0.0
204 }
205 })
206 .sum();
207
208 let df = (EXPECTED_LINE_DISTRIBUTION.len() - 1) as f64;
209 let chi_sq_dist = ChiSquared::new(df).expect("df > 0 for chi-squared distribution");
210 let p_value = 1.0 - chi_sq_dist.cdf(chi_squared);
211
212 (chi_squared, p_value)
213 }
214}
215
216impl Default for LineItemAnalyzer {
217 fn default() -> Self {
218 Self::new(0.05)
219 }
220}
221
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Expands `(line_count, how_many)` pairs into entries, giving each entry
    /// `line_count / 2` debit lines and the remaining lines as credits.
    fn create_test_entries(distribution: &[(usize, usize)]) -> Vec<LineItemEntry> {
        distribution
            .iter()
            .flat_map(|&(line_count, count)| {
                let debit_count = line_count / 2;
                (0..count).map(move |_| LineItemEntry {
                    line_count,
                    debit_count,
                    credit_count: line_count - debit_count,
                })
            })
            .collect()
    }

    #[test]
    fn test_line_item_analysis() {
        // 1000 entries in total.
        let distribution: [(usize, usize); 10] = [
            (2, 607),
            (3, 58),
            (4, 166),
            (5, 31),
            (6, 33),
            (7, 11),
            (8, 19),
            (9, 4),
            (10, 63),
            (100, 8),
        ];

        let entries = create_test_entries(&distribution);
        let result = LineItemAnalyzer::default().analyze(&entries).unwrap();

        assert_eq!(result.sample_size, 1000);
        assert!(result.avg_line_count > 2.0);
    }

    #[test]
    fn test_even_ratio() {
        // (line_count, debit_count, credit_count)
        let rows: [(usize, usize, usize); 10] = [
            (2, 1, 1),
            (4, 2, 2),
            (6, 3, 3),
            (8, 4, 4),
            (10, 5, 5),
            (2, 1, 1),
            (4, 2, 2),
            (6, 3, 3),
            (3, 2, 1),
            (5, 3, 2),
        ];
        let entries: Vec<LineItemEntry> = rows
            .iter()
            .map(|&(line_count, debit_count, credit_count)| LineItemEntry {
                line_count,
                debit_count,
                credit_count,
            })
            .collect();

        let result = LineItemAnalyzer::default().analyze(&entries).unwrap();

        // Eight of the ten entries have an even line count.
        assert!((result.even_ratio - 0.8).abs() < 0.01);
    }

    #[test]
    fn test_insufficient_data() {
        let lone_entry = [LineItemEntry {
            line_count: 2,
            debit_count: 1,
            credit_count: 1,
        }];

        let outcome = LineItemAnalyzer::default().analyze(&lone_entry);

        assert!(matches!(outcome, Err(EvalError::InsufficientData { .. })));
    }
}