datasynth_eval/statistical/
benford.rs

1//! Benford's Law analysis for amount distributions.
2//!
3//! Benford's Law states that in many naturally occurring collections of numbers,
4//! the leading digit is more likely to be small. Specifically, the probability
5//! of the first digit d (1-9) is: P(d) = log₁₀(1 + 1/d)
6
7use crate::error::{EvalError, EvalResult};
8use rust_decimal::Decimal;
9use serde::{Deserialize, Serialize};
10use statrs::distribution::{ChiSquared, ContinuousCDF};
11
12/// Expected Benford's Law probabilities for digits 1-9.
13/// These are the mathematically exact probabilities: P(d) = log₁₀(1 + 1/d)
14#[allow(clippy::approx_constant)] // Values are exact Benford probabilities, not approximations
15pub const BENFORD_PROBABILITIES: [f64; 9] = [
16    0.30103, // digit 1: log₁₀(2)
17    0.17609, // digit 2: log₁₀(3/2)
18    0.12494, // digit 3: log₁₀(4/3)
19    0.09691, // digit 4: log₁₀(5/4)
20    0.07918, // digit 5: log₁₀(6/5)
21    0.06695, // digit 6: log₁₀(7/6)
22    0.05799, // digit 7: log₁₀(8/7)
23    0.05115, // digit 8: log₁₀(9/8)
24    0.04576, // digit 9: log₁₀(10/9)
25];
26
27/// Conformity level based on Mean Absolute Deviation (MAD).
28#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
29pub enum BenfordConformity {
30    /// Close conformity (MAD < 0.006).
31    Close,
32    /// Acceptable conformity (MAD < 0.012).
33    Acceptable,
34    /// Marginal conformity (MAD < 0.015).
35    Marginal,
36    /// Non-conformity (MAD >= 0.015).
37    NonConforming,
38}
39
40impl BenfordConformity {
41    /// Determine conformity level from MAD value.
42    pub fn from_mad(mad: f64) -> Self {
43        if mad < 0.006 {
44            Self::Close
45        } else if mad < 0.012 {
46            Self::Acceptable
47        } else if mad < 0.015 {
48            Self::Marginal
49        } else {
50            Self::NonConforming
51        }
52    }
53}
54
55/// Results of Benford's Law analysis.
56#[derive(Debug, Clone, Serialize, Deserialize)]
57pub struct BenfordAnalysis {
58    /// Number of samples analyzed.
59    pub sample_size: usize,
60    /// Observed first-digit frequencies (digits 1-9).
61    pub observed_frequencies: [f64; 9],
62    /// Observed first-digit counts (digits 1-9).
63    pub observed_counts: [u64; 9],
64    /// Expected Benford frequencies.
65    pub expected_frequencies: [f64; 9],
66    /// Chi-squared statistic.
67    pub chi_squared: f64,
68    /// Degrees of freedom (8).
69    pub degrees_of_freedom: u32,
70    /// P-value from chi-squared test.
71    pub p_value: f64,
72    /// Mean Absolute Deviation from expected.
73    pub mad: f64,
74    /// Conformity level based on MAD.
75    pub conformity: BenfordConformity,
76    /// Maximum deviation (digit index, deviation value).
77    pub max_deviation: (u8, f64),
78    /// Whether test passes at the given significance level.
79    pub passes: bool,
80    /// Anti-Benford score (0.0 = perfect Benford, 1.0 = anti-Benford).
81    pub anti_benford_score: f64,
82}
83
84/// Analyzer for Benford's Law compliance.
85pub struct BenfordAnalyzer {
86    /// Significance level for the chi-squared test.
87    significance_level: f64,
88}
89
90impl BenfordAnalyzer {
91    /// Create a new analyzer with the specified significance level.
92    pub fn new(significance_level: f64) -> Self {
93        Self { significance_level }
94    }
95
96    /// Extract the first digit from a decimal amount.
97    fn get_first_digit(amount: Decimal) -> Option<u8> {
98        let abs_amount = amount.abs();
99        if abs_amount.is_zero() {
100            return None;
101        }
102
103        // Convert to string and find first non-zero digit
104        let s = abs_amount.to_string();
105        for c in s.chars() {
106            if c.is_ascii_digit() && c != '0' {
107                return Some(c.to_digit(10).unwrap() as u8);
108            }
109        }
110        None
111    }
112
113    /// Analyze a collection of amounts for Benford's Law compliance.
114    pub fn analyze(&self, amounts: &[Decimal]) -> EvalResult<BenfordAnalysis> {
115        // Filter out zero amounts and extract first digits
116        let first_digits: Vec<u8> = amounts
117            .iter()
118            .filter_map(|&a| Self::get_first_digit(a))
119            .collect();
120
121        let n = first_digits.len();
122        if n < 10 {
123            return Err(EvalError::InsufficientData {
124                required: 10,
125                actual: n,
126            });
127        }
128
129        // Count occurrences of each first digit
130        let mut counts = [0u64; 9];
131        for digit in first_digits {
132            if (1..=9).contains(&digit) {
133                counts[(digit - 1) as usize] += 1;
134            }
135        }
136
137        // Calculate observed frequencies
138        let n_f64 = n as f64;
139        let observed_frequencies: [f64; 9] = std::array::from_fn(|i| counts[i] as f64 / n_f64);
140
141        // Calculate chi-squared statistic
142        let chi_squared: f64 = (0..9)
143            .map(|i| {
144                let observed = counts[i] as f64;
145                let expected = BENFORD_PROBABILITIES[i] * n_f64;
146                if expected > 0.0 {
147                    (observed - expected).powi(2) / expected
148                } else {
149                    0.0
150                }
151            })
152            .sum();
153
154        // Calculate p-value from chi-squared distribution (df = 8)
155        let chi_sq_dist = ChiSquared::new(8.0).map_err(|e| {
156            EvalError::StatisticalError(format!("Failed to create chi-squared distribution: {}", e))
157        })?;
158        let p_value = 1.0 - chi_sq_dist.cdf(chi_squared);
159
160        // Calculate Mean Absolute Deviation
161        let mad: f64 = (0..9)
162            .map(|i| (observed_frequencies[i] - BENFORD_PROBABILITIES[i]).abs())
163            .sum::<f64>()
164            / 9.0;
165
166        // Find maximum deviation
167        let max_deviation = (0..9)
168            .map(|i| {
169                (
170                    (i + 1) as u8,
171                    (observed_frequencies[i] - BENFORD_PROBABILITIES[i]).abs(),
172                )
173            })
174            .max_by(|a, b| a.1.partial_cmp(&b.1).unwrap())
175            .unwrap();
176
177        // Calculate anti-Benford score (how much it deviates toward uniform distribution)
178        let uniform_prob = 1.0 / 9.0;
179        let anti_benford_score: f64 = (0..9)
180            .map(|i| {
181                let benford_distance = (observed_frequencies[i] - BENFORD_PROBABILITIES[i]).abs();
182                let uniform_distance = (observed_frequencies[i] - uniform_prob).abs();
183                // Score increases when closer to uniform than to Benford
184                if benford_distance > uniform_distance {
185                    benford_distance - uniform_distance
186                } else {
187                    0.0
188                }
189            })
190            .sum::<f64>()
191            / 9.0;
192
193        let conformity = BenfordConformity::from_mad(mad);
194        let passes = p_value >= self.significance_level;
195
196        Ok(BenfordAnalysis {
197            sample_size: n,
198            observed_frequencies,
199            observed_counts: counts,
200            expected_frequencies: BENFORD_PROBABILITIES,
201            chi_squared,
202            degrees_of_freedom: 8,
203            p_value,
204            mad,
205            conformity,
206            max_deviation,
207            passes,
208            anti_benford_score,
209        })
210    }
211
212    /// Analyze second-digit distribution (more sensitive for fraud detection).
213    pub fn analyze_second_digit(&self, amounts: &[Decimal]) -> EvalResult<SecondDigitAnalysis> {
214        // Extract second digits
215        let second_digits: Vec<u8> = amounts
216            .iter()
217            .filter_map(|&a| Self::get_second_digit(a))
218            .collect();
219
220        let n = second_digits.len();
221        if n < 10 {
222            return Err(EvalError::InsufficientData {
223                required: 10,
224                actual: n,
225            });
226        }
227
228        // Count occurrences of each second digit (0-9)
229        let mut counts = [0u64; 10];
230        for digit in second_digits {
231            counts[digit as usize] += 1;
232        }
233
234        // Calculate observed frequencies
235        let n_f64 = n as f64;
236        let observed_frequencies: [f64; 10] = std::array::from_fn(|i| counts[i] as f64 / n_f64);
237
238        // Expected second-digit Benford probabilities
239        let expected: [f64; 10] = [
240            0.11968, 0.11389, 0.10882, 0.10433, 0.10031, 0.09668, 0.09337, 0.09035, 0.08757,
241            0.08500,
242        ];
243
244        // Chi-squared test
245        let chi_squared: f64 = (0..10)
246            .map(|i| {
247                let observed = counts[i] as f64;
248                let exp = expected[i] * n_f64;
249                if exp > 0.0 {
250                    (observed - exp).powi(2) / exp
251                } else {
252                    0.0
253                }
254            })
255            .sum();
256
257        let chi_sq_dist = ChiSquared::new(9.0).map_err(|e| {
258            EvalError::StatisticalError(format!("Failed to create chi-squared distribution: {}", e))
259        })?;
260        let p_value = 1.0 - chi_sq_dist.cdf(chi_squared);
261
262        Ok(SecondDigitAnalysis {
263            sample_size: n,
264            observed_frequencies,
265            expected_frequencies: expected,
266            chi_squared,
267            p_value,
268            passes: p_value >= self.significance_level,
269        })
270    }
271
272    /// Extract the second digit from a decimal amount.
273    fn get_second_digit(amount: Decimal) -> Option<u8> {
274        let abs_amount = amount.abs();
275        if abs_amount.is_zero() {
276            return None;
277        }
278
279        let s = abs_amount.to_string();
280        let mut found_first = false;
281        for c in s.chars() {
282            if c.is_ascii_digit() {
283                if !found_first && c != '0' {
284                    found_first = true;
285                } else if found_first && c != '.' {
286                    return Some(c.to_digit(10).unwrap() as u8);
287                }
288            }
289        }
290        None
291    }
292}
293
294impl Default for BenfordAnalyzer {
295    fn default() -> Self {
296        Self::new(0.05)
297    }
298}
299
300/// Results of second-digit Benford's Law analysis.
301#[derive(Debug, Clone, Serialize, Deserialize)]
302pub struct SecondDigitAnalysis {
303    /// Number of samples analyzed.
304    pub sample_size: usize,
305    /// Observed second-digit frequencies (digits 0-9).
306    pub observed_frequencies: [f64; 10],
307    /// Expected second-digit frequencies.
308    pub expected_frequencies: [f64; 10],
309    /// Chi-squared statistic.
310    pub chi_squared: f64,
311    /// P-value from chi-squared test.
312    pub p_value: f64,
313    /// Whether test passes.
314    pub passes: bool,
315}
316
317#[cfg(test)]
318mod tests {
319    use super::*;
320    use rust_decimal_macros::dec;
321
322    #[test]
323    fn test_benford_probabilities_sum_to_one() {
324        let sum: f64 = BENFORD_PROBABILITIES.iter().sum();
325        assert!((sum - 1.0).abs() < 0.001);
326    }
327
328    #[test]
329    fn test_get_first_digit() {
330        assert_eq!(BenfordAnalyzer::get_first_digit(dec!(123.45)), Some(1));
331        assert_eq!(BenfordAnalyzer::get_first_digit(dec!(0.0456)), Some(4));
332        assert_eq!(BenfordAnalyzer::get_first_digit(dec!(9999)), Some(9));
333        assert_eq!(BenfordAnalyzer::get_first_digit(dec!(-567.89)), Some(5));
334        assert_eq!(BenfordAnalyzer::get_first_digit(dec!(0)), None);
335    }
336
337    #[test]
338    fn test_benford_analysis_with_compliant_data() {
339        // Generate Benford-compliant data
340        let amounts: Vec<Decimal> = (1..=1000)
341            .map(|i| {
342                // Simple approximation of Benford distribution
343                let digit = match i % 100 {
344                    0..=29 => 1,
345                    30..=46 => 2,
346                    47..=59 => 3,
347                    60..=69 => 4,
348                    70..=77 => 5,
349                    78..=84 => 6,
350                    85..=90 => 7,
351                    91..=95 => 8,
352                    _ => 9,
353                };
354                Decimal::new(digit * 100 + (i % 100) as i64, 2)
355            })
356            .collect();
357
358        let analyzer = BenfordAnalyzer::default();
359        let result = analyzer.analyze(&amounts).unwrap();
360
361        assert_eq!(result.sample_size, 1000);
362        assert_eq!(result.degrees_of_freedom, 8);
363        // With approximate Benford data, should have reasonable conformity
364        assert!(result.mad < 0.05);
365    }
366
367    #[test]
368    fn test_benford_conformity_levels() {
369        assert_eq!(BenfordConformity::from_mad(0.004), BenfordConformity::Close);
370        assert_eq!(
371            BenfordConformity::from_mad(0.010),
372            BenfordConformity::Acceptable
373        );
374        assert_eq!(
375            BenfordConformity::from_mad(0.014),
376            BenfordConformity::Marginal
377        );
378        assert_eq!(
379            BenfordConformity::from_mad(0.020),
380            BenfordConformity::NonConforming
381        );
382    }
383
384    #[test]
385    fn test_insufficient_data() {
386        let amounts = vec![dec!(100), dec!(200), dec!(300)];
387        let analyzer = BenfordAnalyzer::default();
388        let result = analyzer.analyze(&amounts);
389        assert!(matches!(result, Err(EvalError::InsufficientData { .. })));
390    }
391}
datasynth_eval/statistical/benford.rs

datasynth_eval/statistical/
benford.rs