datasynth_eval/statistical/
benford.rs1use crate::error::{EvalError, EvalResult};
8use rust_decimal::Decimal;
9use serde::{Deserialize, Serialize};
10use statrs::distribution::{ChiSquared, ContinuousCDF};
11
12#[allow(clippy::approx_constant)] pub const BENFORD_PROBABILITIES: [f64; 9] = [
16 0.30103, 0.17609, 0.12494, 0.09691, 0.07918, 0.06695, 0.05799, 0.05115, 0.04576, ];
26
27#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
29pub enum BenfordConformity {
30 Close,
32 Acceptable,
34 Marginal,
36 NonConforming,
38}
39
40impl BenfordConformity {
41 pub fn from_mad(mad: f64) -> Self {
43 if mad < 0.006 {
44 Self::Close
45 } else if mad < 0.012 {
46 Self::Acceptable
47 } else if mad < 0.015 {
48 Self::Marginal
49 } else {
50 Self::NonConforming
51 }
52 }
53}
54
55#[derive(Debug, Clone, Serialize, Deserialize)]
57pub struct BenfordAnalysis {
58 pub sample_size: usize,
60 pub observed_frequencies: [f64; 9],
62 pub observed_counts: [u64; 9],
64 pub expected_frequencies: [f64; 9],
66 pub chi_squared: f64,
68 pub degrees_of_freedom: u32,
70 pub p_value: f64,
72 pub mad: f64,
74 pub conformity: BenfordConformity,
76 pub max_deviation: (u8, f64),
78 pub passes: bool,
80 pub anti_benford_score: f64,
82}
83
84pub struct BenfordAnalyzer {
86 significance_level: f64,
88}
89
90impl BenfordAnalyzer {
91 pub fn new(significance_level: f64) -> Self {
93 Self { significance_level }
94 }
95
96 fn get_first_digit(amount: Decimal) -> Option<u8> {
98 let abs_amount = amount.abs();
99 if abs_amount.is_zero() {
100 return None;
101 }
102
103 let s = abs_amount.to_string();
105 for c in s.chars() {
106 if c.is_ascii_digit() && c != '0' {
107 return Some(c.to_digit(10).unwrap() as u8);
108 }
109 }
110 None
111 }
112
113 pub fn analyze(&self, amounts: &[Decimal]) -> EvalResult<BenfordAnalysis> {
115 let first_digits: Vec<u8> = amounts
117 .iter()
118 .filter_map(|&a| Self::get_first_digit(a))
119 .collect();
120
121 let n = first_digits.len();
122 if n < 10 {
123 return Err(EvalError::InsufficientData {
124 required: 10,
125 actual: n,
126 });
127 }
128
129 let mut counts = [0u64; 9];
131 for digit in first_digits {
132 if (1..=9).contains(&digit) {
133 counts[(digit - 1) as usize] += 1;
134 }
135 }
136
137 let n_f64 = n as f64;
139 let observed_frequencies: [f64; 9] = std::array::from_fn(|i| counts[i] as f64 / n_f64);
140
141 let chi_squared: f64 = (0..9)
143 .map(|i| {
144 let observed = counts[i] as f64;
145 let expected = BENFORD_PROBABILITIES[i] * n_f64;
146 if expected > 0.0 {
147 (observed - expected).powi(2) / expected
148 } else {
149 0.0
150 }
151 })
152 .sum();
153
154 let chi_sq_dist = ChiSquared::new(8.0).map_err(|e| {
156 EvalError::StatisticalError(format!("Failed to create chi-squared distribution: {}", e))
157 })?;
158 let p_value = 1.0 - chi_sq_dist.cdf(chi_squared);
159
160 let mad: f64 = (0..9)
162 .map(|i| (observed_frequencies[i] - BENFORD_PROBABILITIES[i]).abs())
163 .sum::<f64>()
164 / 9.0;
165
166 let max_deviation = (0..9)
168 .map(|i| {
169 (
170 (i + 1) as u8,
171 (observed_frequencies[i] - BENFORD_PROBABILITIES[i]).abs(),
172 )
173 })
174 .max_by(|a, b| a.1.partial_cmp(&b.1).unwrap())
175 .unwrap();
176
177 let uniform_prob = 1.0 / 9.0;
179 let anti_benford_score: f64 = (0..9)
180 .map(|i| {
181 let benford_distance = (observed_frequencies[i] - BENFORD_PROBABILITIES[i]).abs();
182 let uniform_distance = (observed_frequencies[i] - uniform_prob).abs();
183 if benford_distance > uniform_distance {
185 benford_distance - uniform_distance
186 } else {
187 0.0
188 }
189 })
190 .sum::<f64>()
191 / 9.0;
192
193 let conformity = BenfordConformity::from_mad(mad);
194 let passes = p_value >= self.significance_level;
195
196 Ok(BenfordAnalysis {
197 sample_size: n,
198 observed_frequencies,
199 observed_counts: counts,
200 expected_frequencies: BENFORD_PROBABILITIES,
201 chi_squared,
202 degrees_of_freedom: 8,
203 p_value,
204 mad,
205 conformity,
206 max_deviation,
207 passes,
208 anti_benford_score,
209 })
210 }
211
212 pub fn analyze_second_digit(&self, amounts: &[Decimal]) -> EvalResult<SecondDigitAnalysis> {
214 let second_digits: Vec<u8> = amounts
216 .iter()
217 .filter_map(|&a| Self::get_second_digit(a))
218 .collect();
219
220 let n = second_digits.len();
221 if n < 10 {
222 return Err(EvalError::InsufficientData {
223 required: 10,
224 actual: n,
225 });
226 }
227
228 let mut counts = [0u64; 10];
230 for digit in second_digits {
231 counts[digit as usize] += 1;
232 }
233
234 let n_f64 = n as f64;
236 let observed_frequencies: [f64; 10] = std::array::from_fn(|i| counts[i] as f64 / n_f64);
237
238 let expected: [f64; 10] = [
240 0.11968, 0.11389, 0.10882, 0.10433, 0.10031, 0.09668, 0.09337, 0.09035, 0.08757,
241 0.08500,
242 ];
243
244 let chi_squared: f64 = (0..10)
246 .map(|i| {
247 let observed = counts[i] as f64;
248 let exp = expected[i] * n_f64;
249 if exp > 0.0 {
250 (observed - exp).powi(2) / exp
251 } else {
252 0.0
253 }
254 })
255 .sum();
256
257 let chi_sq_dist = ChiSquared::new(9.0).map_err(|e| {
258 EvalError::StatisticalError(format!("Failed to create chi-squared distribution: {}", e))
259 })?;
260 let p_value = 1.0 - chi_sq_dist.cdf(chi_squared);
261
262 Ok(SecondDigitAnalysis {
263 sample_size: n,
264 observed_frequencies,
265 expected_frequencies: expected,
266 chi_squared,
267 p_value,
268 passes: p_value >= self.significance_level,
269 })
270 }
271
272 fn get_second_digit(amount: Decimal) -> Option<u8> {
274 let abs_amount = amount.abs();
275 if abs_amount.is_zero() {
276 return None;
277 }
278
279 let s = abs_amount.to_string();
280 let mut found_first = false;
281 for c in s.chars() {
282 if c.is_ascii_digit() {
283 if !found_first && c != '0' {
284 found_first = true;
285 } else if found_first && c != '.' {
286 return Some(c.to_digit(10).unwrap() as u8);
287 }
288 }
289 }
290 None
291 }
292}
293
294impl Default for BenfordAnalyzer {
295 fn default() -> Self {
296 Self::new(0.05)
297 }
298}
299
300#[derive(Debug, Clone, Serialize, Deserialize)]
302pub struct SecondDigitAnalysis {
303 pub sample_size: usize,
305 pub observed_frequencies: [f64; 10],
307 pub expected_frequencies: [f64; 10],
309 pub chi_squared: f64,
311 pub p_value: f64,
313 pub passes: bool,
315}
316
317#[cfg(test)]
318mod tests {
319 use super::*;
320 use rust_decimal_macros::dec;
321
322 #[test]
323 fn test_benford_probabilities_sum_to_one() {
324 let sum: f64 = BENFORD_PROBABILITIES.iter().sum();
325 assert!((sum - 1.0).abs() < 0.001);
326 }
327
328 #[test]
329 fn test_get_first_digit() {
330 assert_eq!(BenfordAnalyzer::get_first_digit(dec!(123.45)), Some(1));
331 assert_eq!(BenfordAnalyzer::get_first_digit(dec!(0.0456)), Some(4));
332 assert_eq!(BenfordAnalyzer::get_first_digit(dec!(9999)), Some(9));
333 assert_eq!(BenfordAnalyzer::get_first_digit(dec!(-567.89)), Some(5));
334 assert_eq!(BenfordAnalyzer::get_first_digit(dec!(0)), None);
335 }
336
337 #[test]
338 fn test_benford_analysis_with_compliant_data() {
339 let amounts: Vec<Decimal> = (1..=1000)
341 .map(|i| {
342 let digit = match i % 100 {
344 0..=29 => 1,
345 30..=46 => 2,
346 47..=59 => 3,
347 60..=69 => 4,
348 70..=77 => 5,
349 78..=84 => 6,
350 85..=90 => 7,
351 91..=95 => 8,
352 _ => 9,
353 };
354 Decimal::new(digit * 100 + (i % 100) as i64, 2)
355 })
356 .collect();
357
358 let analyzer = BenfordAnalyzer::default();
359 let result = analyzer.analyze(&amounts).unwrap();
360
361 assert_eq!(result.sample_size, 1000);
362 assert_eq!(result.degrees_of_freedom, 8);
363 assert!(result.mad < 0.05);
365 }
366
367 #[test]
368 fn test_benford_conformity_levels() {
369 assert_eq!(BenfordConformity::from_mad(0.004), BenfordConformity::Close);
370 assert_eq!(
371 BenfordConformity::from_mad(0.010),
372 BenfordConformity::Acceptable
373 );
374 assert_eq!(
375 BenfordConformity::from_mad(0.014),
376 BenfordConformity::Marginal
377 );
378 assert_eq!(
379 BenfordConformity::from_mad(0.020),
380 BenfordConformity::NonConforming
381 );
382 }
383
384 #[test]
385 fn test_insufficient_data() {
386 let amounts = vec![dec!(100), dec!(200), dec!(300)];
387 let analyzer = BenfordAnalyzer::default();
388 let result = analyzer.analyze(&amounts);
389 assert!(matches!(result, Err(EvalError::InsufficientData { .. })));
390 }
391}