use crate::error::{EvalError, EvalResult};
use rust_decimal::Decimal;
use serde::{Deserialize, Serialize};
use statrs::distribution::{ChiSquared, ContinuousCDF};
/// Expected relative frequency of each leading digit under Benford's law.
///
/// Index `i` holds the probability for digit `i + 1`, i.e. `log10(1 + 1/d)` rounded to
/// five decimal places, so the nine entries sum to 1.0 within rounding error.
// NOTE(review): `0.30103` is a rounded `log10(2)`, which is presumably what
// `clippy::approx_constant` would flag; the rounded table value looks intentional.
#[allow(clippy::approx_constant)] pub const BENFORD_PROBABILITIES: [f64; 9] = [
0.30103, 0.17609, 0.12494, 0.09691, 0.07918, 0.06695, 0.05799, 0.05115, 0.04576, ];
/// Conformity band assigned to a sample from its mean absolute deviation (MAD).
///
/// The numeric cut-offs are applied by [`BenfordConformity::from_mad`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum BenfordConformity {
/// MAD below 0.006.
Close,
/// MAD of at least 0.006 but below 0.012.
Acceptable,
/// MAD of at least 0.012 but below 0.015.
Marginal,
/// MAD of 0.015 or more; a NaN MAD also ends up here because every `<` test fails.
NonConforming,
}
impl BenfordConformity {
    /// Maps a mean absolute deviation onto its conformity band.
    ///
    /// The bands are half-open: `[.., 0.006)` is `Close`, `[0.006, 0.012)` is
    /// `Acceptable`, `[0.012, 0.015)` is `Marginal`, and everything else — including
    /// NaN, which fails every comparison — is `NonConforming`.
    pub fn from_mad(mad: f64) -> Self {
        match mad {
            value if value < 0.006 => Self::Close,
            value if value < 0.012 => Self::Acceptable,
            value if value < 0.015 => Self::Marginal,
            _ => Self::NonConforming,
        }
    }
}
/// Result of a first-digit Benford test, as produced by `BenfordAnalyzer::analyze`.
///
/// All nine-element arrays are indexed by `digit - 1` (index 0 is digit 1).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenfordAnalysis {
/// Number of input amounts that yielded a first digit (zero amounts are skipped).
pub sample_size: usize,
/// Observed share of each leading digit: `observed_counts[i] / sample_size`.
pub observed_frequencies: [f64; 9],
/// Number of amounts whose leading digit is `i + 1`.
pub observed_counts: [u64; 9],
/// Expected share of each leading digit; a copy of `BENFORD_PROBABILITIES`.
pub expected_frequencies: [f64; 9],
/// Chi-squared statistic: sum over digits of `(observed - expected)^2 / expected`,
/// computed on counts.
pub chi_squared: f64,
/// Degrees of freedom used for the chi-squared distribution (set to 8 by `analyze`).
pub degrees_of_freedom: u32,
/// Upper-tail probability `1 - CDF(chi_squared)` under that distribution.
pub p_value: f64,
/// Mean absolute deviation between observed and expected frequencies over the nine digits.
pub mad: f64,
/// Conformity band derived from `mad` via `BenfordConformity::from_mad`.
pub conformity: BenfordConformity,
/// `(digit, absolute deviation)` for the digit with the largest absolute frequency deviation.
pub max_deviation: (u8, f64),
/// `true` when `p_value` is at least the analyzer's significance level.
pub passes: bool,
/// Mean over the nine digits of how much farther the observed frequency lies from the
/// Benford expectation than from the uniform value `1/9`; digits that are not farther
/// from Benford contribute zero.
pub anti_benford_score: f64,
}
/// Runs Benford's-law digit tests against a set of amounts.
///
/// Derives `Debug` and `Clone` like the other public types in this module so an
/// analyzer can be logged, embedded in `Debug`-deriving parents, and duplicated.
#[derive(Debug, Clone)]
pub struct BenfordAnalyzer {
    /// Threshold the chi-squared p-value must reach for a sample to pass.
    significance_level: f64,
}
impl BenfordAnalyzer {
    /// Creates an analyzer that passes a sample when its chi-squared p-value is at least
    /// `significance_level`.
    ///
    /// The level is stored as given; it is not range-checked.
    pub fn new(significance_level: f64) -> Self {
        Self { significance_level }
    }

    /// Returns the significant digit of `amount` at `position` (0 = leading digit).
    ///
    /// The sign, the decimal point and every zero before the first non-zero digit are
    /// ignored. Returns `None` when the amount has no such digit, which includes every
    /// zero amount.
    fn significant_digit(amount: Decimal, position: usize) -> Option<u8> {
        amount
            .abs()
            .to_string()
            .chars()
            // Keeps ASCII digits only; '.' and any sign are dropped here.
            .filter_map(|c| c.to_digit(10))
            .skip_while(|&digit| digit == 0)
            .nth(position)
            // `to_digit(10)` yields 0..=9, so the narrowing cast is lossless.
            .map(|digit| digit as u8)
    }

    /// Returns the first non-zero digit (1..=9) of `amount`, or `None` for zero.
    fn get_first_digit(amount: Decimal) -> Option<u8> {
        Self::significant_digit(amount, 0)
    }

    /// Computes the chi-squared statistic and its upper-tail p-value.
    ///
    /// `counts` and `expected_probabilities` are matched position by position;
    /// `sample_size` scales the probabilities to expected counts. Cells whose expected
    /// count is not positive contribute nothing to the statistic.
    ///
    /// # Errors
    ///
    /// Returns `EvalError::StatisticalError` if the chi-squared distribution cannot be
    /// constructed for `degrees_of_freedom`.
    fn chi_squared_test(
        counts: &[u64],
        expected_probabilities: &[f64],
        sample_size: f64,
        degrees_of_freedom: u32,
    ) -> EvalResult<(f64, f64)> {
        let statistic: f64 = counts
            .iter()
            .zip(expected_probabilities)
            .map(|(&count, &probability)| {
                let observed = count as f64;
                let expected = probability * sample_size;
                if expected > 0.0 {
                    (observed - expected).powi(2) / expected
                } else {
                    0.0
                }
            })
            .sum();
        let distribution = ChiSquared::new(f64::from(degrees_of_freedom)).map_err(|e| {
            EvalError::StatisticalError(format!("Failed to create chi-squared distribution: {e}"))
        })?;
        let p_value = 1.0 - distribution.cdf(statistic);
        Ok((statistic, p_value))
    }

    /// Runs the first-digit Benford test on `amounts`.
    ///
    /// Zero amounts carry no leading digit and are skipped; the sample size is the number
    /// of remaining amounts.
    ///
    /// # Errors
    ///
    /// Returns `EvalError::InsufficientData` when fewer than 10 amounts have a leading
    /// digit, and `EvalError::StatisticalError` if the chi-squared distribution cannot be
    /// constructed.
    pub fn analyze(&self, amounts: &[Decimal]) -> EvalResult<BenfordAnalysis> {
        const DEGREES_OF_FREEDOM: u32 = 8;

        // Tally digits straight from the iterator; no intermediate Vec is needed.
        let mut counts = [0u64; 9];
        let mut n = 0usize;
        for digit in amounts.iter().filter_map(|&a| Self::get_first_digit(a)) {
            // A leading digit is always 1..=9, so `digit - 1` is a valid index.
            counts[usize::from(digit) - 1] += 1;
            n += 1;
        }
        if n < 10 {
            return Err(EvalError::InsufficientData {
                required: 10,
                actual: n,
            });
        }

        let n_f64 = n as f64;
        let observed_frequencies: [f64; 9] = std::array::from_fn(|i| counts[i] as f64 / n_f64);
        let (chi_squared, p_value) = Self::chi_squared_test(
            &counts,
            &BENFORD_PROBABILITIES,
            n_f64,
            DEGREES_OF_FREEDOM,
        )?;

        // Absolute frequency deviation per digit, shared by the three metrics below.
        let deviations: [f64; 9] =
            std::array::from_fn(|i| (observed_frequencies[i] - BENFORD_PROBABILITIES[i]).abs());

        let mad = deviations.iter().sum::<f64>() / 9.0;

        let max_deviation = (1u8..=9)
            .zip(deviations.iter().copied())
            .max_by(|a, b| a.1.total_cmp(&b.1))
            .expect("nine digit slots are never empty");

        // Only digits that sit farther from Benford than from the uniform 1/9 add to the score.
        let uniform_prob = 1.0 / 9.0;
        let anti_benford_score = observed_frequencies
            .iter()
            .zip(&deviations)
            .map(|(&observed, &benford_distance)| {
                let uniform_distance = (observed - uniform_prob).abs();
                if benford_distance > uniform_distance {
                    benford_distance - uniform_distance
                } else {
                    0.0
                }
            })
            .sum::<f64>()
            / 9.0;

        Ok(BenfordAnalysis {
            sample_size: n,
            observed_frequencies,
            observed_counts: counts,
            expected_frequencies: BENFORD_PROBABILITIES,
            chi_squared,
            degrees_of_freedom: DEGREES_OF_FREEDOM,
            p_value,
            mad,
            conformity: BenfordConformity::from_mad(mad),
            max_deviation,
            passes: p_value >= self.significance_level,
            anti_benford_score,
        })
    }

    /// Runs the second-digit Benford test on `amounts`.
    ///
    /// Amounts without a second significant digit (zero, or a single significant digit
    /// with nothing after it) are skipped.
    ///
    /// # Errors
    ///
    /// Returns `EvalError::InsufficientData` when fewer than 10 amounts have a second
    /// digit, and `EvalError::StatisticalError` if the chi-squared distribution cannot be
    /// constructed.
    pub fn analyze_second_digit(&self, amounts: &[Decimal]) -> EvalResult<SecondDigitAnalysis> {
        /// Expected share of each second digit, indexed by the digit itself (0..=9).
        const EXPECTED: [f64; 10] = [
            0.11968, 0.11389, 0.10882, 0.10433, 0.10031, 0.09668, 0.09337, 0.09035, 0.08757,
            0.08500,
        ];
        const DEGREES_OF_FREEDOM: u32 = 9;

        let mut counts = [0u64; 10];
        let mut n = 0usize;
        for digit in amounts.iter().filter_map(|&a| Self::get_second_digit(a)) {
            counts[usize::from(digit)] += 1;
            n += 1;
        }
        if n < 10 {
            return Err(EvalError::InsufficientData {
                required: 10,
                actual: n,
            });
        }

        let n_f64 = n as f64;
        let observed_frequencies: [f64; 10] = std::array::from_fn(|i| counts[i] as f64 / n_f64);
        let (chi_squared, p_value) =
            Self::chi_squared_test(&counts, &EXPECTED, n_f64, DEGREES_OF_FREEDOM)?;

        Ok(SecondDigitAnalysis {
            sample_size: n,
            observed_frequencies,
            expected_frequencies: EXPECTED,
            chi_squared,
            p_value,
            passes: p_value >= self.significance_level,
        })
    }

    /// Returns the digit (0..=9) that follows the first non-zero digit of `amount`, or
    /// `None` when there is no such digit.
    fn get_second_digit(amount: Decimal) -> Option<u8> {
        Self::significant_digit(amount, 1)
    }
}
impl Default for BenfordAnalyzer {
fn default() -> Self {
Self::new(0.05)
}
}
/// Result of a second-digit Benford test, as produced by
/// `BenfordAnalyzer::analyze_second_digit`.
///
/// Both ten-element arrays are indexed by the digit itself (index 0 is digit 0).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SecondDigitAnalysis {
/// Number of input amounts that yielded a second digit.
pub sample_size: usize,
/// Observed share of each second digit within the sample.
pub observed_frequencies: [f64; 10],
/// Expected share of each second digit used by the test.
pub expected_frequencies: [f64; 10],
/// Chi-squared statistic: sum over digits of `(observed - expected)^2 / expected`,
/// computed on counts.
pub chi_squared: f64,
/// Upper-tail probability `1 - CDF(chi_squared)` for a chi-squared distribution with
/// 9 degrees of freedom.
pub p_value: f64,
/// `true` when `p_value` is at least the analyzer's significance level.
pub passes: bool,
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use rust_decimal_macros::dec;

    /// The rounded probability table must still add up to one.
    #[test]
    fn test_benford_probabilities_sum_to_one() {
        let total: f64 = BENFORD_PROBABILITIES.iter().sum();
        let error = (total - 1.0).abs();
        assert!(error < 0.001);
    }

    /// Leading digit ignores sign, leading zeros and the decimal point; zero has none.
    #[test]
    fn test_get_first_digit() {
        let cases: [(Decimal, Option<u8>); 5] = [
            (dec!(123.45), Some(1)),
            (dec!(0.0456), Some(4)),
            (dec!(9999), Some(9)),
            (dec!(-567.89), Some(5)),
            (dec!(0), None),
        ];
        for (amount, expected) in cases {
            assert_eq!(BenfordAnalyzer::get_first_digit(amount), expected);
        }
    }

    /// A sample built with roughly Benford-shaped leading digits is analysed end to end.
    #[test]
    fn test_benford_analysis_with_compliant_data() {
        let amounts: Vec<Decimal> = (1..=1000)
            .map(|i| {
                let remainder = i % 100;
                // Bucket widths approximate the Benford percentages per leading digit.
                let leading = match remainder {
                    0..=29 => 1,
                    30..=46 => 2,
                    47..=59 => 3,
                    60..=69 => 4,
                    70..=77 => 5,
                    78..=84 => 6,
                    85..=90 => 7,
                    91..=95 => 8,
                    _ => 9,
                };
                Decimal::new(leading * 100 + remainder as i64, 2)
            })
            .collect();

        let analysis = BenfordAnalyzer::default().analyze(&amounts).unwrap();

        assert_eq!(analysis.sample_size, 1000);
        assert_eq!(analysis.degrees_of_freedom, 8);
        assert!(analysis.mad < 0.05);
    }

    /// One representative MAD value per conformity band.
    #[test]
    fn test_benford_conformity_levels() {
        let expectations = [
            (0.004, BenfordConformity::Close),
            (0.010, BenfordConformity::Acceptable),
            (0.014, BenfordConformity::Marginal),
            (0.020, BenfordConformity::NonConforming),
        ];
        for (mad, band) in expectations {
            assert_eq!(BenfordConformity::from_mad(mad), band);
        }
    }

    /// Fewer than ten usable amounts must be rejected.
    #[test]
    fn test_insufficient_data() {
        let amounts = vec![dec!(100), dec!(200), dec!(300)];
        let outcome = BenfordAnalyzer::default().analyze(&amounts);
        assert!(matches!(outcome, Err(EvalError::InsufficientData { .. })));
    }
}