Skip to main content

datasynth_core/distributions/
benford.rs

1//! Benford's Law distribution sampler and fraud amount patterns.
2//!
3//! Implements Benford's Law compliant amount generation and various fraud
4//! amount patterns for realistic synthetic accounting data.
5
6use rand::prelude::*;
7use rand_chacha::ChaCha8Rng;
8use rust_decimal::Decimal;
9use serde::{Deserialize, Serialize};
10
11use super::AmountDistributionConfig;
12
/// First-digit probabilities under Benford's Law, indexed by `digit - 1`.
///
/// Each entry is `log10(1 + 1/d)` rounded to five decimal places; at that
/// precision the nine entries add up to 1.00000 and agree with `BENFORD_CDF`.
// The rounded literal 0.30103 is close to `LOG10_2`, which is presumably what
// `clippy::approx_constant` would flag; the lint is silenced so the table can
// keep its uniform five-decimal values.
#[allow(clippy::approx_constant)]
pub const BENFORD_PROBABILITIES: [f64; 9] = [
    0.30103, // d = 1: log10(2/1)  ~ 30.1%
    0.17609, // d = 2: log10(3/2)  ~ 17.6%
    0.12494, // d = 3: log10(4/3)  ~ 12.5%
    0.09691, // d = 4: log10(5/4)  ~  9.7%
    0.07918, // d = 5: log10(6/5)  ~  7.9%
    0.06695, // d = 6: log10(7/6)  ~  6.7%
    0.05799, // d = 7: log10(8/7)  ~  5.8%
    0.05115, // d = 8: log10(9/8)  ~  5.1%
    0.04576, // d = 9: log10(10/9) ~  4.6%
];
28
/// Running totals of the Benford first-digit probabilities.
///
/// Entry `i` is the probability that the first digit is at most `i + 1`, i.e.
/// `log10(i + 2)` rounded to five decimal places. The final entry is exactly
/// 1.0, so a uniform draw from `[0, 1)` always lands in some bucket.
// See the note on the probabilities table: 0.30103 approximates `LOG10_2`,
// so `clippy::approx_constant` is silenced here as well.
#[allow(clippy::approx_constant)]
pub const BENFORD_CDF: [f64; 9] = [
    0.30103, // P(d <= 1) = log10(2)
    0.47712, // P(d <= 2) = log10(3)
    0.60206, // P(d <= 3) = log10(4)
    0.69897, // P(d <= 4) = log10(5)
    0.77815, // P(d <= 5) = log10(6)
    0.84510, // P(d <= 6) = log10(7)
    0.90309, // P(d <= 7) = log10(8)
    0.95424, // P(d <= 8) = log10(9)
    1.00000, // P(d <= 9) = log10(10)
];
43
/// First-digit probabilities that deliberately violate Benford's Law.
///
/// Indexed by `digit - 1`. Leading 5s, 7s and 9s are heavily over-represented
/// and leading 1s, 2s and 3s under-represented, so amounts drawn with these
/// weights stand out in a first-digit analysis. The entries sum to 1.0.
pub const ANTI_BENFORD_PROBABILITIES: [f64; 9] = [
    0.05, // d = 1:  5% (Benford: ~30%)
    0.05, // d = 2:  5% (Benford: ~18%)
    0.05, // d = 3:  5% (Benford: ~12%)
    0.10, // d = 4: 10%
    0.25, // d = 5: 25% (Benford: ~8%)
    0.10, // d = 6: 10%
    0.20, // d = 7: 20% (Benford: ~6%)
    0.05, // d = 8:  5%
    0.15, // d = 9: 15% (Benford: ~5%)
];
57
58/// Fraud amount pattern types.
59#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
60#[serde(rename_all = "snake_case")]
61pub enum FraudAmountPattern {
62    /// Normal amount generation (Benford-compliant if enabled)
63    #[default]
64    Normal,
65    /// Statistically improbable first digits (anti-Benford)
66    /// Excess of leading 5s, 7s, 9s - detectable via statistical analysis
67    StatisticallyImprobable,
68    /// Obvious round numbers ($50,000.00, $99,999.99)
69    /// Easy to spot in visual review
70    ObviousRoundNumbers,
71    /// Amounts clustered just below approval thresholds
72    /// Classic split-transaction pattern
73    ThresholdAdjacent,
74}
75
76/// Configuration for threshold-adjacent fraud pattern.
77#[derive(Debug, Clone, Serialize, Deserialize)]
78pub struct ThresholdConfig {
79    /// Approval thresholds to cluster below
80    pub thresholds: Vec<f64>,
81    /// Minimum percentage below threshold (e.g., 0.01 = 1%)
82    pub min_below_pct: f64,
83    /// Maximum percentage below threshold (e.g., 0.15 = 15%)
84    pub max_below_pct: f64,
85}
86
87impl Default for ThresholdConfig {
88    fn default() -> Self {
89        Self {
90            thresholds: vec![1000.0, 5000.0, 10000.0, 25000.0, 50000.0, 100000.0],
91            min_below_pct: 0.01,
92            max_below_pct: 0.15,
93        }
94    }
95}
96
97/// Sampler that produces amounts following Benford's Law distribution.
98pub struct BenfordSampler {
99    rng: ChaCha8Rng,
100    config: AmountDistributionConfig,
101}
102
103impl BenfordSampler {
104    /// Create a new Benford sampler with the given seed and amount configuration.
105    pub fn new(seed: u64, config: AmountDistributionConfig) -> Self {
106        Self {
107            rng: ChaCha8Rng::seed_from_u64(seed),
108            config,
109        }
110    }
111
112    /// Sample a first digit according to Benford's Law.
113    fn sample_benford_first_digit(&mut self) -> u8 {
114        let p: f64 = self.rng.gen();
115        for (i, &cumulative) in BENFORD_CDF.iter().enumerate() {
116            if p < cumulative {
117                return (i + 1) as u8;
118            }
119        }
120        9
121    }
122
123    /// Sample a first digit from the anti-Benford distribution.
124    fn sample_anti_benford_first_digit(&mut self) -> u8 {
125        let p: f64 = self.rng.gen();
126        let mut cumulative = 0.0;
127        for (i, &prob) in ANTI_BENFORD_PROBABILITIES.iter().enumerate() {
128            cumulative += prob;
129            if p < cumulative {
130                return (i + 1) as u8;
131            }
132        }
133        9
134    }
135
136    /// Sample an amount following Benford's Law.
137    pub fn sample(&mut self) -> Decimal {
138        let first_digit = self.sample_benford_first_digit();
139        self.sample_with_first_digit(first_digit)
140    }
141
142    /// Sample an amount with a specific first digit.
143    pub fn sample_with_first_digit(&mut self, first_digit: u8) -> Decimal {
144        let first_digit = first_digit.clamp(1, 9);
145
146        // Determine the order of magnitude based on config range
147        let min_magnitude = self.config.min_amount.log10().floor() as i32;
148        let max_magnitude = self.config.max_amount.log10().floor() as i32;
149
150        // Sample a magnitude within the valid range
151        let magnitude = self.rng.gen_range(min_magnitude..=max_magnitude);
152        let base = 10_f64.powi(magnitude);
153
154        // Generate the remaining digits (0.0 to 0.999...)
155        let remaining: f64 = self.rng.gen();
156
157        // Construct: first_digit.remaining * 10^magnitude
158        let mantissa = first_digit as f64 + remaining;
159        let mut amount = mantissa * base;
160
161        // Clamp to configured range
162        amount = amount.clamp(self.config.min_amount, self.config.max_amount);
163
164        // Apply round number bias (25% chance)
165        let p: f64 = self.rng.gen();
166        if p < self.config.round_number_probability {
167            // Round to nearest whole number ending in 00
168            amount = (amount / 100.0).round() * 100.0;
169        } else if p < self.config.round_number_probability + self.config.nice_number_probability {
170            // Round to nearest 5 or 10
171            amount = (amount / 5.0).round() * 5.0;
172        }
173
174        // Round to configured decimal places
175        let decimal_multiplier = 10_f64.powi(self.config.decimal_places as i32);
176        amount = (amount * decimal_multiplier).round() / decimal_multiplier;
177
178        // Ensure minimum after rounding
179        amount = amount.max(self.config.min_amount);
180
181        Decimal::from_f64_retain(amount).unwrap_or(Decimal::ONE)
182    }
183
184    /// Reset the sampler with a new seed.
185    pub fn reset(&mut self, seed: u64) {
186        self.rng = ChaCha8Rng::seed_from_u64(seed);
187    }
188}
189
190/// Generator for fraudulent amount patterns.
191pub struct FraudAmountGenerator {
192    rng: ChaCha8Rng,
193    benford_sampler: BenfordSampler,
194    threshold_config: ThresholdConfig,
195    config: AmountDistributionConfig,
196}
197
198impl FraudAmountGenerator {
199    /// Create a new fraud amount generator.
200    pub fn new(
201        seed: u64,
202        config: AmountDistributionConfig,
203        threshold_config: ThresholdConfig,
204    ) -> Self {
205        Self {
206            rng: ChaCha8Rng::seed_from_u64(seed),
207            benford_sampler: BenfordSampler::new(seed + 1, config.clone()),
208            threshold_config,
209            config,
210        }
211    }
212
213    /// Generate an amount with the specified fraud pattern.
214    pub fn sample(&mut self, pattern: FraudAmountPattern) -> Decimal {
215        match pattern {
216            FraudAmountPattern::Normal => self.benford_sampler.sample(),
217            FraudAmountPattern::StatisticallyImprobable => self.sample_anti_benford(),
218            FraudAmountPattern::ObviousRoundNumbers => self.sample_obvious_round(),
219            FraudAmountPattern::ThresholdAdjacent => self.sample_threshold_adjacent(),
220        }
221    }
222
223    /// Generate an amount with statistically improbable first digit distribution.
224    fn sample_anti_benford(&mut self) -> Decimal {
225        let first_digit = self.benford_sampler.sample_anti_benford_first_digit();
226        self.benford_sampler.sample_with_first_digit(first_digit)
227    }
228
229    /// Generate an obvious round number amount (suspicious pattern).
230    fn sample_obvious_round(&mut self) -> Decimal {
231        let pattern_choice = self.rng.gen_range(0..5);
232
233        let amount = match pattern_choice {
234            // Even thousands ($1,000, $5,000, $10,000, etc.)
235            0 => {
236                let multiplier = self.rng.gen_range(1..100);
237                multiplier as f64 * 1000.0
238            }
239            // $X9,999.99 pattern (just under round number)
240            1 => {
241                let base = self.rng.gen_range(1..10) as f64 * 10000.0;
242                base - 0.01
243            }
244            // Exact $X0,000.00 pattern
245            2 => {
246                let multiplier = self.rng.gen_range(1..20);
247                multiplier as f64 * 10000.0
248            }
249            // Five-thousands ($5,000, $15,000, $25,000)
250            3 => {
251                let multiplier = self.rng.gen_range(1..40);
252                multiplier as f64 * 5000.0
253            }
254            // $X,999.99 pattern
255            _ => {
256                let base = self.rng.gen_range(1..100) as f64 * 1000.0;
257                base - 0.01
258            }
259        };
260
261        // Clamp to config range
262        let clamped = amount.clamp(self.config.min_amount, self.config.max_amount);
263        Decimal::from_f64_retain(clamped).unwrap_or(Decimal::ONE)
264    }
265
266    /// Generate an amount just below an approval threshold.
267    fn sample_threshold_adjacent(&mut self) -> Decimal {
268        // Select a threshold
269        let threshold = if self.threshold_config.thresholds.is_empty() {
270            10000.0
271        } else {
272            *self
273                .threshold_config
274                .thresholds
275                .choose(&mut self.rng)
276                .unwrap_or(&10000.0)
277        };
278
279        // Calculate amount as percentage below threshold
280        let pct_below = self
281            .rng
282            .gen_range(self.threshold_config.min_below_pct..self.threshold_config.max_below_pct);
283        let base_amount = threshold * (1.0 - pct_below);
284
285        // Add small noise to avoid exact patterns
286        let noise_factor = 1.0 + self.rng.gen_range(-0.005..0.005);
287        let amount = base_amount * noise_factor;
288
289        // Round to 2 decimal places
290        let rounded = (amount * 100.0).round() / 100.0;
291
292        // Ensure we're still below threshold
293        let final_amount = rounded.min(threshold - 0.01);
294        let clamped = final_amount.clamp(self.config.min_amount, self.config.max_amount);
295
296        Decimal::from_f64_retain(clamped).unwrap_or(Decimal::ONE)
297    }
298
299    /// Reset the generator with a new seed.
300    pub fn reset(&mut self, seed: u64) {
301        self.rng = ChaCha8Rng::seed_from_u64(seed);
302        self.benford_sampler.reset(seed + 1);
303    }
304}
305
306/// Extract the first digit from a decimal amount.
307pub fn get_first_digit(amount: Decimal) -> Option<u8> {
308    let s = amount.to_string();
309    s.chars()
310        .find(|c| c.is_ascii_digit() && *c != '0')
311        .and_then(|c| c.to_digit(10))
312        .map(|d| d as u8)
313}
314
#[cfg(test)]
mod tests {
    use super::*;

    /// The rounded Benford table must still be a probability distribution.
    #[test]
    fn test_benford_probabilities_sum_to_one() {
        let sum: f64 = BENFORD_PROBABILITIES.iter().sum();
        assert!(
            (sum - 1.0).abs() < 0.001,
            "Benford probabilities sum to {}, expected 1.0",
            sum
        );
    }

    /// The CDF must reach 1.0 so every uniform draw maps to a digit.
    #[test]
    fn test_benford_cdf_ends_at_one() {
        assert!(
            (BENFORD_CDF[8] - 1.0).abs() < 0.0001,
            "CDF should end at 1.0"
        );
    }

    /// The anti-Benford table must also be a probability distribution.
    #[test]
    fn test_anti_benford_probabilities_sum_to_one() {
        let sum: f64 = ANTI_BENFORD_PROBABILITIES.iter().sum();
        assert!(
            (sum - 1.0).abs() < 0.001,
            "Anti-Benford probabilities sum to {}, expected 1.0",
            sum
        );
    }

    /// Two samplers with the same seed and config produce the same stream.
    #[test]
    fn test_benford_sampler_determinism() {
        let config = AmountDistributionConfig::default();
        let mut sampler1 = BenfordSampler::new(42, config.clone());
        let mut sampler2 = BenfordSampler::new(42, config);

        for _ in 0..100 {
            assert_eq!(sampler1.sample(), sampler2.sample());
        }
    }

    /// Leading digits of sampled amounts should roughly follow Benford's Law.
    #[test]
    fn test_benford_first_digit_distribution() {
        let config = AmountDistributionConfig::default();
        let mut sampler = BenfordSampler::new(12345, config);

        let mut digit_counts = [0u32; 9];
        let iterations = 10_000;

        for _ in 0..iterations {
            let amount = sampler.sample();
            if let Some(digit) = get_first_digit(amount) {
                if (1..=9).contains(&digit) {
                    digit_counts[(digit - 1) as usize] += 1;
                }
            }
        }

        // Digit 1 should be the most common (~30%). The tolerance is wide
        // because the sampler's range clamping and round-number snapping can
        // change the leading digit after it has been drawn.
        let digit_1_pct = digit_counts[0] as f64 / iterations as f64;
        assert!(
            digit_1_pct > 0.15 && digit_1_pct < 0.50,
            "Digit 1 should be ~30%, got {:.1}%",
            digit_1_pct * 100.0
        );

        // Digit 9 should be the least common (~5%)
        let digit_9_pct = digit_counts[8] as f64 / iterations as f64;
        assert!(
            digit_9_pct > 0.02 && digit_9_pct < 0.10,
            "Digit 9 should be ~5%, got {:.1}%",
            digit_9_pct * 100.0
        );
    }

    /// Threshold-adjacent amounts stay strictly below the threshold and within
    /// the configured band beneath it.
    #[test]
    fn test_threshold_adjacent_below_threshold() {
        let config = AmountDistributionConfig::default();
        let threshold_config = ThresholdConfig {
            thresholds: vec![10000.0],
            min_below_pct: 0.01,
            max_below_pct: 0.15,
        };
        let mut gen = FraudAmountGenerator::new(42, config, threshold_config);

        for _ in 0..100 {
            let amount = gen.sample(FraudAmountPattern::ThresholdAdjacent);
            let f = amount.to_string().parse::<f64>().unwrap();
            assert!(f < 10000.0, "Amount {} should be below threshold 10000", f);
            // Lowest possible value is 10000 * 0.85 * 0.995 = 8457.50 (15%
            // below, then up to 0.5% of downward noise).
            assert!(
                f >= 8400.0,
                "Amount {} should be approximately within 15% of threshold",
                f
            );
        }
    }

    /// Obvious-round amounts are whole thousands or one cent short of one.
    #[test]
    fn test_obvious_round_numbers() {
        let config = AmountDistributionConfig::default();
        let (min_amount, max_amount) = (config.min_amount, config.max_amount);
        let threshold_config = ThresholdConfig::default();
        let mut gen = FraudAmountGenerator::new(42, config, threshold_config);

        for _ in 0..100 {
            let amount = gen.sample(FraudAmountPattern::ObviousRoundNumbers);
            let f = amount.to_string().parse::<f64>().unwrap();

            // Compare in whole cents: float remainders of values such as
            // 9999.99 are unreliable. Every pattern is either a multiple of
            // $1,000 or exactly one cent below one.
            let cents = (f * 100.0).round() as i64;
            let is_round = cents % 100_000 == 0;
            let is_just_under = (cents + 1) % 100_000 == 0;
            // The generator clamps to the configured range, so a bound itself
            // is also a legitimate result.
            let is_clamped = f == min_amount || f == max_amount;

            assert!(
                is_round || is_just_under || is_clamped,
                "Amount {} should be a suspicious round number",
                f
            );
        }
    }

    /// First-digit extraction skips leading zeros and the decimal point.
    #[test]
    fn test_get_first_digit() {
        assert_eq!(get_first_digit(Decimal::from(123)), Some(1));
        assert_eq!(get_first_digit(Decimal::from(999)), Some(9));
        assert_eq!(get_first_digit(Decimal::from(50000)), Some(5));
        assert_eq!(
            get_first_digit(Decimal::from_str_exact("0.00123").unwrap()),
            Some(1)
        );
    }
}