datasynth_core/distributions/
benford.rs

1//! Benford's Law distribution sampler and fraud amount patterns.
2//!
3//! Implements Benford's Law compliant amount generation and various fraud
4//! amount patterns for realistic synthetic accounting data. Includes enhanced
5//! multi-digit Benford analysis and deviation patterns for anomaly injection.
6
7use rand::prelude::*;
8use rand_chacha::ChaCha8Rng;
9use rust_decimal::Decimal;
10use serde::{Deserialize, Serialize};
11
12use super::AmountDistributionConfig;
13
/// First-digit probabilities under Benford's Law, indexed by `digit - 1`.
///
/// Entry `d - 1` holds `P(d) = log10(1 + 1/d)` for `d` in 1..=9, written as a
/// literal rounded to five decimal places. The first literal is close to
/// `log10(2)`, which is why the `approx_constant` lint is allowed here.
#[allow(clippy::approx_constant)]
pub const BENFORD_PROBABILITIES: [f64; 9] = [
    0.30103, // d = 1 (~30.1%), log10(2)
    0.17609, // d = 2 (~17.6%)
    0.12494, // d = 3 (~12.5%)
    0.09691, // d = 4 (~9.7%)
    0.07918, // d = 5 (~7.9%)
    0.06695, // d = 6 (~6.7%)
    0.05799, // d = 7 (~5.8%)
    0.05115, // d = 8 (~5.1%)
    0.04576, // d = 9 (~4.6%)
];
29
/// Running totals of the first-digit Benford probabilities.
///
/// Entry `d - 1` is `P(first digit <= d)`; the final entry is pinned to `1.0`.
/// The first literal is close to `log10(2)`, hence the `approx_constant`
/// allowance.
#[allow(clippy::approx_constant)]
pub const BENFORD_CDF: [f64; 9] = [
    0.30103, // digits 1..=1, log10(2)
    0.47712, // digits 1..=2
    0.60206, // digits 1..=3
    0.69897, // digits 1..=4
    0.77815, // digits 1..=5
    0.84510, // digits 1..=6
    0.90309, // digits 1..=7
    0.95424, // digits 1..=8
    1.00000, // digits 1..=9
];
44
/// Second-digit probabilities under Benford's Law, indexed by the digit 0..=9.
///
/// `P(d2)` is the sum over every leading digit `d1` in 1..=9 of
/// `log10(1 + 1/(10*d1 + d2))`; the literals below are rounded to five decimals.
#[allow(clippy::approx_constant)]
pub const BENFORD_SECOND_DIGIT_PROBABILITIES: [f64; 10] = [
    0.11968, // d2 = 0 (~12.0%)
    0.11389, // d2 = 1 (~11.4%)
    0.10882, // d2 = 2 (~10.9%)
    0.10433, // d2 = 3 (~10.4%)
    0.10031, // d2 = 4 (~10.0%)
    0.09668, // d2 = 5 (~9.7%)
    0.09337, // d2 = 6 (~9.3%)
    0.09035, // d2 = 7 (~9.0%)
    0.08757, // d2 = 8 (~8.8%)
    0.08500, // d2 = 9 (~8.5%)
];
60
/// Running totals of the second-digit Benford probabilities.
///
/// Entry `d` is `P(second digit <= d)` for `d` in 0..=9; the last entry is `1.0`.
pub const BENFORD_SECOND_DIGIT_CDF: [f64; 10] = [
    0.11968, // digits 0..=0
    0.23357, // digits 0..=1
    0.34239, // digits 0..=2
    0.44672, // digits 0..=3
    0.54703, // digits 0..=4
    0.64371, // digits 0..=5
    0.73708, // digits 0..=6
    0.82743, // digits 0..=7
    0.91500, // digits 0..=8
    1.00000, // digits 0..=9
];
65
/// Benford probability that a number's two leading digits are `d1` then `d2`.
///
/// Evaluates `log10(1 + 1/(10*d1 + d2))`, i.e. the probability of the two-digit
/// prefix 10..=99. Returns `0.0` when `d1` is outside 1..=9 or `d2` exceeds 9,
/// because no valid prefix corresponds to such a pair.
pub fn benford_first_two_probability(d1: u8, d2: u8) -> f64 {
    let leading_ok = (1..=9).contains(&d1);
    let second_ok = d2 <= 9;

    if leading_ok && second_ok {
        let prefix = f64::from(d1) * 10.0 + f64::from(d2);
        (1.0 + 1.0 / prefix).log10()
    } else {
        0.0
    }
}
75
76/// Get all first-two-digit probabilities as a 90-element array (10-99).
77pub fn benford_first_two_probabilities() -> [f64; 90] {
78    let mut probs = [0.0; 90];
79    for d1 in 1..=9 {
80        for d2 in 0..=9 {
81            let idx = (d1 - 1) * 10 + d2;
82            probs[idx as usize] = benford_first_two_probability(d1, d2);
83        }
84    }
85    probs
86}
87
/// First-digit weights that deliberately contradict Benford's Law.
///
/// Indexed by `digit - 1`. Digits 5, 7 and 9 receive far more weight than
/// Benford's Law gives them, while 1, 2 and 3 receive far less, so amounts
/// drawn with these weights stand out in a first-digit analysis.
pub const ANTI_BENFORD_PROBABILITIES: [f64; 9] = [
    0.05, // d = 1: 5%  (Benford: ~30%)
    0.05, // d = 2: 5%  (Benford: ~18%)
    0.05, // d = 3: 5%  (Benford: ~12%)
    0.10, // d = 4: 10%
    0.25, // d = 5: 25% (Benford: ~8%)
    0.10, // d = 6: 10%
    0.20, // d = 7: 20% (Benford: ~6%)
    0.05, // d = 8: 5%
    0.15, // d = 9: 15% (Benford: ~5%)
];
101
102/// Fraud amount pattern types.
103#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
104#[serde(rename_all = "snake_case")]
105pub enum FraudAmountPattern {
106    /// Normal amount generation (Benford-compliant if enabled)
107    #[default]
108    Normal,
109    /// Statistically improbable first digits (anti-Benford)
110    /// Excess of leading 5s, 7s, 9s - detectable via statistical analysis
111    StatisticallyImprobable,
112    /// Obvious round numbers ($50,000.00, $99,999.99)
113    /// Easy to spot in visual review
114    ObviousRoundNumbers,
115    /// Amounts clustered just below approval thresholds
116    /// Classic split-transaction pattern
117    ThresholdAdjacent,
118}
119
120/// Configuration for threshold-adjacent fraud pattern.
121#[derive(Debug, Clone, Serialize, Deserialize)]
122pub struct ThresholdConfig {
123    /// Approval thresholds to cluster below
124    pub thresholds: Vec<f64>,
125    /// Minimum percentage below threshold (e.g., 0.01 = 1%)
126    pub min_below_pct: f64,
127    /// Maximum percentage below threshold (e.g., 0.15 = 15%)
128    pub max_below_pct: f64,
129}
130
131impl Default for ThresholdConfig {
132    fn default() -> Self {
133        Self {
134            thresholds: vec![1000.0, 5000.0, 10000.0, 25000.0, 50000.0, 100000.0],
135            min_below_pct: 0.01,
136            max_below_pct: 0.15,
137        }
138    }
139}
140
141/// Sampler that produces amounts following Benford's Law distribution.
142pub struct BenfordSampler {
143    rng: ChaCha8Rng,
144    config: AmountDistributionConfig,
145}
146
147impl BenfordSampler {
148    /// Create a new Benford sampler with the given seed and amount configuration.
149    pub fn new(seed: u64, config: AmountDistributionConfig) -> Self {
150        Self {
151            rng: ChaCha8Rng::seed_from_u64(seed),
152            config,
153        }
154    }
155
156    /// Sample a first digit according to Benford's Law.
157    fn sample_benford_first_digit(&mut self) -> u8 {
158        let p: f64 = self.rng.random();
159        for (i, &cumulative) in BENFORD_CDF.iter().enumerate() {
160            if p < cumulative {
161                return (i + 1) as u8;
162            }
163        }
164        9
165    }
166
167    /// Sample a first digit from the anti-Benford distribution.
168    fn sample_anti_benford_first_digit(&mut self) -> u8 {
169        let p: f64 = self.rng.random();
170        let mut cumulative = 0.0;
171        for (i, &prob) in ANTI_BENFORD_PROBABILITIES.iter().enumerate() {
172            cumulative += prob;
173            if p < cumulative {
174                return (i + 1) as u8;
175            }
176        }
177        9
178    }
179
180    /// Sample an amount following Benford's Law.
181    pub fn sample(&mut self) -> Decimal {
182        let first_digit = self.sample_benford_first_digit();
183        self.sample_with_first_digit(first_digit)
184    }
185
186    /// Sample an amount with a specific first digit.
187    pub fn sample_with_first_digit(&mut self, first_digit: u8) -> Decimal {
188        let first_digit = first_digit.clamp(1, 9);
189
190        // Determine the order of magnitude based on config range
191        let min_magnitude = self.config.min_amount.log10().floor() as i32;
192        let max_magnitude = self.config.max_amount.log10().floor() as i32;
193
194        // Sample a magnitude within the valid range
195        let magnitude = self.rng.random_range(min_magnitude..=max_magnitude);
196        let base = 10_f64.powi(magnitude);
197
198        // Generate the remaining digits (0.0 to 0.999...)
199        let remaining: f64 = self.rng.random();
200
201        // Construct: first_digit.remaining * 10^magnitude
202        let mantissa = first_digit as f64 + remaining;
203        let mut amount = mantissa * base;
204
205        // Clamp to configured range
206        amount = amount.clamp(self.config.min_amount, self.config.max_amount);
207
208        // Apply round number bias (25% chance)
209        let p: f64 = self.rng.random();
210        if p < self.config.round_number_probability {
211            // Round to nearest whole number ending in 00
212            amount = (amount / 100.0).round() * 100.0;
213        } else if p < self.config.round_number_probability + self.config.nice_number_probability {
214            // Round to nearest 5 or 10
215            amount = (amount / 5.0).round() * 5.0;
216        }
217
218        // Round to configured decimal places
219        let decimal_multiplier = 10_f64.powi(self.config.decimal_places as i32);
220        amount = (amount * decimal_multiplier).round() / decimal_multiplier;
221
222        // Ensure minimum after rounding
223        amount = amount.max(self.config.min_amount);
224
225        Decimal::from_f64_retain(amount).unwrap_or(Decimal::ONE)
226    }
227
228    /// Reset the sampler with a new seed.
229    pub fn reset(&mut self, seed: u64) {
230        self.rng = ChaCha8Rng::seed_from_u64(seed);
231    }
232}
233
234/// Generator for fraudulent amount patterns.
235pub struct FraudAmountGenerator {
236    rng: ChaCha8Rng,
237    benford_sampler: BenfordSampler,
238    threshold_config: ThresholdConfig,
239    config: AmountDistributionConfig,
240}
241
242impl FraudAmountGenerator {
243    /// Create a new fraud amount generator.
244    pub fn new(
245        seed: u64,
246        config: AmountDistributionConfig,
247        threshold_config: ThresholdConfig,
248    ) -> Self {
249        Self {
250            rng: ChaCha8Rng::seed_from_u64(seed),
251            benford_sampler: BenfordSampler::new(seed + 1, config.clone()),
252            threshold_config,
253            config,
254        }
255    }
256
257    /// Generate an amount with the specified fraud pattern.
258    pub fn sample(&mut self, pattern: FraudAmountPattern) -> Decimal {
259        match pattern {
260            FraudAmountPattern::Normal => self.benford_sampler.sample(),
261            FraudAmountPattern::StatisticallyImprobable => self.sample_anti_benford(),
262            FraudAmountPattern::ObviousRoundNumbers => self.sample_obvious_round(),
263            FraudAmountPattern::ThresholdAdjacent => self.sample_threshold_adjacent(),
264        }
265    }
266
267    /// Generate an amount with statistically improbable first digit distribution.
268    fn sample_anti_benford(&mut self) -> Decimal {
269        let first_digit = self.benford_sampler.sample_anti_benford_first_digit();
270        self.benford_sampler.sample_with_first_digit(first_digit)
271    }
272
273    /// Generate an obvious round number amount (suspicious pattern).
274    fn sample_obvious_round(&mut self) -> Decimal {
275        let pattern_choice = self.rng.random_range(0..5);
276
277        let amount = match pattern_choice {
278            // Even thousands ($1,000, $5,000, $10,000, etc.)
279            0 => {
280                let multiplier = self.rng.random_range(1..100);
281                multiplier as f64 * 1000.0
282            }
283            // $X9,999.99 pattern (just under round number)
284            1 => {
285                let base = self.rng.random_range(1..10) as f64 * 10000.0;
286                base - 0.01
287            }
288            // Exact $X0,000.00 pattern
289            2 => {
290                let multiplier = self.rng.random_range(1..20);
291                multiplier as f64 * 10000.0
292            }
293            // Five-thousands ($5,000, $15,000, $25,000)
294            3 => {
295                let multiplier = self.rng.random_range(1..40);
296                multiplier as f64 * 5000.0
297            }
298            // $X,999.99 pattern
299            _ => {
300                let base = self.rng.random_range(1..100) as f64 * 1000.0;
301                base - 0.01
302            }
303        };
304
305        // Clamp to config range
306        let clamped = amount.clamp(self.config.min_amount, self.config.max_amount);
307        Decimal::from_f64_retain(clamped).unwrap_or(Decimal::ONE)
308    }
309
310    /// Generate an amount just below an approval threshold.
311    fn sample_threshold_adjacent(&mut self) -> Decimal {
312        // Select a threshold
313        let threshold = if self.threshold_config.thresholds.is_empty() {
314            10000.0
315        } else {
316            *self
317                .threshold_config
318                .thresholds
319                .choose(&mut self.rng)
320                .unwrap_or(&10000.0)
321        };
322
323        // Calculate amount as percentage below threshold
324        let pct_below = self
325            .rng
326            .random_range(self.threshold_config.min_below_pct..self.threshold_config.max_below_pct);
327        let base_amount = threshold * (1.0 - pct_below);
328
329        // Add small noise to avoid exact patterns
330        let noise_factor = 1.0 + self.rng.random_range(-0.005..0.005);
331        let amount = base_amount * noise_factor;
332
333        // Round to 2 decimal places
334        let rounded = (amount * 100.0).round() / 100.0;
335
336        // Ensure we're still below threshold
337        let final_amount = rounded.min(threshold - 0.01);
338        let clamped = final_amount.clamp(self.config.min_amount, self.config.max_amount);
339
340        Decimal::from_f64_retain(clamped).unwrap_or(Decimal::ONE)
341    }
342
343    /// Reset the generator with a new seed.
344    pub fn reset(&mut self, seed: u64) {
345        self.rng = ChaCha8Rng::seed_from_u64(seed);
346        self.benford_sampler.reset(seed + 1);
347    }
348}
349
350/// Extract the first digit from a decimal amount.
351pub fn get_first_digit(amount: Decimal) -> Option<u8> {
352    let s = amount.to_string();
353    s.chars()
354        .find(|c| c.is_ascii_digit() && *c != '0')
355        .and_then(|c| c.to_digit(10))
356        .map(|d| d as u8)
357}
358
359/// Extract the first two digits from a decimal amount.
360pub fn get_first_two_digits(amount: Decimal) -> Option<(u8, u8)> {
361    let s = amount.abs().to_string();
362    let mut first_found = false;
363    let mut first_digit = 0u8;
364
365    for c in s.chars() {
366        if c.is_ascii_digit() {
367            let d = c
368                .to_digit(10)
369                .expect("digit char confirmed by is_ascii_digit") as u8;
370            if !first_found && d != 0 {
371                first_digit = d;
372                first_found = true;
373            } else if first_found && c != '.' {
374                return Some((first_digit, d));
375            }
376        }
377    }
378    None
379}
380
381/// Configuration for enhanced Benford sampling with multi-digit compliance.
382#[derive(Debug, Clone, Serialize, Deserialize, Default)]
383pub struct EnhancedBenfordConfig {
384    /// Base amount distribution configuration
385    pub amount_config: AmountDistributionConfig,
386    /// Whether to enforce second-digit Benford compliance
387    #[serde(default)]
388    pub second_digit_compliance: bool,
389    /// Whether to enforce first-two-digit Benford compliance
390    #[serde(default)]
391    pub first_two_digit_compliance: bool,
392}
393
394/// Enhanced Benford sampler with multi-digit compliance.
395pub struct EnhancedBenfordSampler {
396    rng: ChaCha8Rng,
397    config: EnhancedBenfordConfig,
398    /// Pre-computed CDF for first two digits
399    first_two_cdf: [f64; 90],
400}
401
402impl EnhancedBenfordSampler {
403    /// Create a new enhanced Benford sampler.
404    pub fn new(seed: u64, config: EnhancedBenfordConfig) -> Self {
405        // Pre-compute CDF for first two digits
406        let probs = benford_first_two_probabilities();
407        let mut first_two_cdf = [0.0; 90];
408        let mut cumulative = 0.0;
409        for i in 0..90 {
410            cumulative += probs[i];
411            first_two_cdf[i] = cumulative;
412        }
413
414        Self {
415            rng: ChaCha8Rng::seed_from_u64(seed),
416            config,
417            first_two_cdf,
418        }
419    }
420
421    /// Sample first two digits according to Benford's Law.
422    fn sample_first_two_digits(&mut self) -> (u8, u8) {
423        let p: f64 = self.rng.random();
424        for (i, &cdf) in self.first_two_cdf.iter().enumerate() {
425            if p < cdf {
426                let d1 = (i / 10 + 1) as u8;
427                let d2 = (i % 10) as u8;
428                return (d1, d2);
429            }
430        }
431        (9, 9)
432    }
433
434    /// Sample a second digit according to Benford's Law.
435    fn sample_second_digit(&mut self) -> u8 {
436        let p: f64 = self.rng.random();
437        for (i, &cdf) in BENFORD_SECOND_DIGIT_CDF.iter().enumerate() {
438            if p < cdf {
439                return i as u8;
440            }
441        }
442        9
443    }
444
445    /// Sample a first digit according to Benford's Law.
446    fn sample_first_digit(&mut self) -> u8 {
447        let p: f64 = self.rng.random();
448        for (i, &cdf) in BENFORD_CDF.iter().enumerate() {
449            if p < cdf {
450                return (i + 1) as u8;
451            }
452        }
453        9
454    }
455
456    /// Sample an amount with enhanced Benford compliance.
457    pub fn sample(&mut self) -> Decimal {
458        let (first_digit, second_digit) = if self.config.first_two_digit_compliance {
459            self.sample_first_two_digits()
460        } else if self.config.second_digit_compliance {
461            (self.sample_first_digit(), self.sample_second_digit())
462        } else {
463            (
464                self.sample_first_digit(),
465                self.rng.random_range(0..10) as u8,
466            )
467        };
468
469        self.sample_with_digits(first_digit, second_digit)
470    }
471
472    /// Sample an amount with specific first two digits.
473    fn sample_with_digits(&mut self, first_digit: u8, second_digit: u8) -> Decimal {
474        let first_digit = first_digit.clamp(1, 9);
475        let second_digit = second_digit.clamp(0, 9);
476
477        // Determine the order of magnitude based on config range
478        let min_magnitude = self.config.amount_config.min_amount.log10().floor() as i32;
479        let max_magnitude = self.config.amount_config.max_amount.log10().floor() as i32;
480
481        // Sample a magnitude within the valid range
482        let magnitude = self.rng.random_range(min_magnitude..=max_magnitude);
483        let base = 10_f64.powi(magnitude - 1); // -1 because first two digits span 10-99
484
485        // Generate the remaining digits (0.0 to 0.99...)
486        let remaining: f64 = self.rng.random();
487
488        // Construct the amount: (first_digit * 10 + second_digit + remaining) * base
489        let mantissa = (first_digit as f64) * 10.0 + (second_digit as f64) + remaining;
490        let mut amount = mantissa * base;
491
492        // Clamp to configured range
493        amount = amount.clamp(
494            self.config.amount_config.min_amount,
495            self.config.amount_config.max_amount,
496        );
497
498        // Round to configured decimal places
499        let decimal_multiplier = 10_f64.powi(self.config.amount_config.decimal_places as i32);
500        amount = (amount * decimal_multiplier).round() / decimal_multiplier;
501
502        Decimal::from_f64_retain(amount).unwrap_or(Decimal::ONE)
503    }
504
505    /// Reset the sampler with a new seed.
506    pub fn reset(&mut self, seed: u64) {
507        self.rng = ChaCha8Rng::seed_from_u64(seed);
508    }
509}
510
511/// Types of Benford deviation patterns for anomaly injection.
512#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
513#[serde(rename_all = "snake_case")]
514#[derive(Default)]
515pub enum BenfordDeviationType {
516    /// Round number bias (excess of digits 1, 5, 0 in second position)
517    #[default]
518    RoundNumberBias,
519    /// Threshold clustering (amounts just below round thresholds)
520    ThresholdClustering,
521    /// Uniform first digit (equal probability for all first digits)
522    UniformFirstDigit,
523    /// Excess of specific digit
524    DigitBias { digit: u8 },
525    /// Trailing zeros pattern (prices ending in .00)
526    TrailingZeros,
527}
528
529/// Configuration for Benford deviation sampling (for anomaly injection).
530#[derive(Debug, Clone, Serialize, Deserialize)]
531pub struct BenfordDeviationConfig {
532    /// Type of deviation pattern
533    pub deviation_type: BenfordDeviationType,
534    /// Intensity of deviation (0.0 = Benford compliant, 1.0 = full deviation)
535    #[serde(default = "default_intensity")]
536    pub intensity: f64,
537    /// Base amount configuration
538    pub amount_config: AmountDistributionConfig,
539    /// Thresholds for threshold clustering (if applicable)
540    #[serde(default = "default_thresholds")]
541    pub thresholds: Vec<f64>,
542}
543
/// Serde fallback for a missing `intensity` field: half of the samples deviate.
fn default_intensity() -> f64 {
    0.5_f64
}
547
/// Serde fallback for a missing `thresholds` field: six approval limits from
/// 1,000 to 100,000.
fn default_thresholds() -> Vec<f64> {
    [1000.0, 5000.0, 10000.0, 25000.0, 50000.0, 100000.0].to_vec()
}
551
552impl Default for BenfordDeviationConfig {
553    fn default() -> Self {
554        Self {
555            deviation_type: BenfordDeviationType::RoundNumberBias,
556            intensity: 0.5,
557            amount_config: AmountDistributionConfig::default(),
558            thresholds: default_thresholds(),
559        }
560    }
561}
562
563/// Sampler for generating amounts that deviate from Benford's Law.
564/// Useful for injecting statistically detectable anomalies.
565pub struct BenfordDeviationSampler {
566    rng: ChaCha8Rng,
567    config: BenfordDeviationConfig,
568    benford_sampler: BenfordSampler,
569}
570
571impl BenfordDeviationSampler {
572    /// Create a new Benford deviation sampler.
573    pub fn new(seed: u64, config: BenfordDeviationConfig) -> Self {
574        Self {
575            rng: ChaCha8Rng::seed_from_u64(seed),
576            benford_sampler: BenfordSampler::new(seed + 100, config.amount_config.clone()),
577            config,
578        }
579    }
580
581    /// Sample an amount with the configured deviation pattern.
582    pub fn sample(&mut self) -> Decimal {
583        // With probability (1 - intensity), sample from normal Benford
584        let p: f64 = self.rng.random();
585        if p > self.config.intensity {
586            return self.benford_sampler.sample();
587        }
588
589        // Apply deviation pattern
590        match self.config.deviation_type {
591            BenfordDeviationType::RoundNumberBias => self.sample_round_bias(),
592            BenfordDeviationType::ThresholdClustering => self.sample_threshold_cluster(),
593            BenfordDeviationType::UniformFirstDigit => self.sample_uniform_first_digit(),
594            BenfordDeviationType::DigitBias { digit } => self.sample_digit_bias(digit),
595            BenfordDeviationType::TrailingZeros => self.sample_trailing_zeros(),
596        }
597    }
598
599    /// Sample with round number bias.
600    fn sample_round_bias(&mut self) -> Decimal {
601        // Bias towards first digits 1 and 5
602        let first_digit = if self.rng.random_bool(0.6) {
603            if self.rng.random_bool(0.7) {
604                1
605            } else {
606                5
607            }
608        } else {
609            self.rng.random_range(1..=9)
610        };
611
612        // Bias towards second digits 0 and 5
613        let _second_digit = if self.rng.random_bool(0.5) {
614            if self.rng.random_bool(0.6) {
615                0
616            } else {
617                5
618            }
619        } else {
620            self.rng.random_range(0..=9)
621        };
622
623        self.benford_sampler.sample_with_first_digit(first_digit)
624    }
625
626    /// Sample clustering just below thresholds.
627    fn sample_threshold_cluster(&mut self) -> Decimal {
628        let threshold = self
629            .config
630            .thresholds
631            .choose(&mut self.rng)
632            .copied()
633            .unwrap_or(10000.0);
634
635        // Generate amount 1-15% below threshold
636        let pct_below = self.rng.random_range(0.01..0.15);
637        let amount = threshold * (1.0 - pct_below);
638
639        // Add small noise
640        let noise = 1.0 + self.rng.random_range(-0.005..0.005);
641        let final_amount = (amount * noise * 100.0).round() / 100.0;
642
643        Decimal::from_f64_retain(final_amount.clamp(
644            self.config.amount_config.min_amount,
645            self.config.amount_config.max_amount,
646        ))
647        .unwrap_or(Decimal::ONE)
648    }
649
650    /// Sample with uniform first digit distribution.
651    fn sample_uniform_first_digit(&mut self) -> Decimal {
652        let first_digit = self.rng.random_range(1..=9);
653        self.benford_sampler.sample_with_first_digit(first_digit)
654    }
655
656    /// Sample with bias towards a specific digit.
657    fn sample_digit_bias(&mut self, target_digit: u8) -> Decimal {
658        let digit = target_digit.clamp(1, 9);
659        // 70% chance of using the biased digit
660        let first_digit = if self.rng.random_bool(0.7) {
661            digit
662        } else {
663            self.rng.random_range(1..=9)
664        };
665        self.benford_sampler.sample_with_first_digit(first_digit)
666    }
667
668    /// Sample with trailing zeros pattern (prices ending in .00).
669    fn sample_trailing_zeros(&mut self) -> Decimal {
670        let amount = self.benford_sampler.sample();
671        let amount_f64: f64 = amount.to_string().parse().unwrap_or(0.0);
672
673        // Round to whole dollars
674        let rounded = amount_f64.round();
675        Decimal::from_f64_retain(rounded.clamp(
676            self.config.amount_config.min_amount,
677            self.config.amount_config.max_amount,
678        ))
679        .unwrap_or(Decimal::ONE)
680    }
681
682    /// Reset the sampler with a new seed.
683    pub fn reset(&mut self, seed: u64) {
684        self.rng = ChaCha8Rng::seed_from_u64(seed);
685        self.benford_sampler.reset(seed + 100);
686    }
687}
688
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Convert a generated amount to `f64` for range checks.
    fn as_f64(amount: Decimal) -> f64 {
        amount.to_string().parse::<f64>().unwrap()
    }

    #[test]
    fn test_benford_probabilities_sum_to_one() {
        let sum: f64 = BENFORD_PROBABILITIES.iter().sum();
        assert!(
            (sum - 1.0).abs() < 0.001,
            "Benford probabilities sum to {}, expected 1.0",
            sum
        );
    }

    #[test]
    fn test_benford_cdf_ends_at_one() {
        assert!(
            (BENFORD_CDF[8] - 1.0).abs() < 0.0001,
            "CDF should end at 1.0"
        );
    }

    #[test]
    fn test_anti_benford_probabilities_sum_to_one() {
        let sum: f64 = ANTI_BENFORD_PROBABILITIES.iter().sum();
        assert!(
            (sum - 1.0).abs() < 0.001,
            "Anti-Benford probabilities sum to {}, expected 1.0",
            sum
        );
    }

    #[test]
    fn test_benford_sampler_determinism() {
        let config = AmountDistributionConfig::default();
        let mut sampler1 = BenfordSampler::new(42, config.clone());
        let mut sampler2 = BenfordSampler::new(42, config);

        for _ in 0..100 {
            assert_eq!(sampler1.sample(), sampler2.sample());
        }
    }

    #[test]
    fn test_benford_first_digit_distribution() {
        let config = AmountDistributionConfig::default();
        let mut sampler = BenfordSampler::new(12345, config);

        let mut digit_counts = [0u32; 9];
        let iterations = 10_000;

        for _ in 0..iterations {
            let amount = sampler.sample();
            if let Some(digit) = get_first_digit(amount) {
                if (1..=9).contains(&digit) {
                    digit_counts[(digit - 1) as usize] += 1;
                }
            }
        }

        // Digit 1 should be the most common (~30%). The bounds are loose
        // because range clamping and rounding in the sampler shift the
        // observed frequencies.
        let digit_1_pct = digit_counts[0] as f64 / iterations as f64;
        assert!(
            digit_1_pct > 0.15 && digit_1_pct < 0.50,
            "Digit 1 should be ~30%, got {:.1}%",
            digit_1_pct * 100.0
        );

        // Digit 9 should be the least common (~5%)
        let digit_9_pct = digit_counts[8] as f64 / iterations as f64;
        assert!(
            digit_9_pct > 0.02 && digit_9_pct < 0.10,
            "Digit 9 should be ~5%, got {:.1}%",
            digit_9_pct * 100.0
        );
    }

    #[test]
    fn test_threshold_adjacent_below_threshold() {
        let config = AmountDistributionConfig::default();
        let threshold_config = ThresholdConfig {
            thresholds: vec![10000.0],
            min_below_pct: 0.01,
            max_below_pct: 0.15,
        };
        let mut generator = FraudAmountGenerator::new(42, config, threshold_config);

        for _ in 0..100 {
            let f = as_f64(generator.sample(FraudAmountPattern::ThresholdAdjacent));
            assert!(f < 10000.0, "Amount {} should be below threshold 10000", f);
            // Account for noise factor (up to 0.5%) and rounding
            assert!(
                f >= 8400.0,
                "Amount {} should be approximately within 15% of threshold",
                f
            );
        }
    }

    #[test]
    fn test_obvious_round_numbers() {
        // Pin the range so that none of the generated shapes
        // (999.99 ..= 195,000) is altered by clamping.
        let config = AmountDistributionConfig {
            min_amount: 0.01,
            max_amount: 1_000_000.0,
            ..AmountDistributionConfig::default()
        };
        let threshold_config = ThresholdConfig::default();
        let mut generator = FraudAmountGenerator::new(42, config, threshold_config);

        for _ in 0..100 {
            let f = as_f64(generator.sample(FraudAmountPattern::ObviousRoundNumbers));

            // Compare in whole cents to avoid floating-point modulo noise.
            // Every shape is either a multiple of 1,000 or one cent below one.
            let cents = (f * 100.0).round() as i64;
            let is_round = cents % 100_000 == 0;
            let is_just_under = (cents + 1) % 100_000 == 0;

            assert!(
                is_round || is_just_under,
                "Amount {} should be a suspicious round number",
                f
            );
        }
    }

    #[test]
    fn test_get_first_digit() {
        assert_eq!(get_first_digit(Decimal::from(123)), Some(1));
        assert_eq!(get_first_digit(Decimal::from(999)), Some(9));
        assert_eq!(get_first_digit(Decimal::from(50000)), Some(5));
        assert_eq!(
            get_first_digit(Decimal::from_str_exact("0.00123").unwrap()),
            Some(1)
        );
    }

    #[test]
    fn test_second_digit_probabilities_sum_to_one() {
        let sum: f64 = BENFORD_SECOND_DIGIT_PROBABILITIES.iter().sum();
        assert!(
            (sum - 1.0).abs() < 0.001,
            "Second digit probabilities sum to {}, expected 1.0",
            sum
        );
    }

    #[test]
    fn test_first_two_probability() {
        // P(10) = log10(1 + 1/10) = log10(1.1) ≈ 0.0414
        let p10 = benford_first_two_probability(1, 0);
        assert!((p10 - 0.0414).abs() < 0.001);

        // P(99) = log10(1 + 1/99) ≈ 0.00436
        let p99 = benford_first_two_probability(9, 9);
        assert!((p99 - 0.00436).abs() < 0.0001);

        // Sum of all first-two probabilities should be 1.0
        let probs = benford_first_two_probabilities();
        let sum: f64 = probs.iter().sum();
        assert!((sum - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_get_first_two_digits() {
        assert_eq!(get_first_two_digits(Decimal::from(123)), Some((1, 2)));
        assert_eq!(get_first_two_digits(Decimal::from(999)), Some((9, 9)));
        assert_eq!(get_first_two_digits(Decimal::from(50000)), Some((5, 0)));
        assert_eq!(
            get_first_two_digits(Decimal::from_str_exact("0.00123").unwrap()),
            Some((1, 2))
        );
    }

    #[test]
    fn test_enhanced_benford_sampler() {
        let config = EnhancedBenfordConfig {
            amount_config: AmountDistributionConfig::default(),
            second_digit_compliance: true,
            first_two_digit_compliance: false,
        };
        let mut sampler = EnhancedBenfordSampler::new(42, config);

        let mut digit_counts = [0u32; 10];
        for _ in 0..10000 {
            let amount = sampler.sample();
            if let Some((_, d2)) = get_first_two_digits(amount) {
                digit_counts[d2 as usize] += 1;
            }
        }

        // Note: The second digit distribution depends on amount generation and
        // magnitude selection, which may skew results. Just verify the sampler runs
        // and produces valid amounts.
        let total_valid = digit_counts.iter().sum::<u32>();
        assert!(
            total_valid > 9000,
            "Most samples should have valid first two digits"
        );

        // Verify we have some distribution of second digits (not all the same)
        let max_count = *digit_counts.iter().max().unwrap();
        assert!(
            max_count < total_valid / 2,
            "Second digits should have some variety, max count: {}",
            max_count
        );
    }

    #[test]
    fn test_benford_deviation_sampler() {
        let config = BenfordDeviationConfig {
            deviation_type: BenfordDeviationType::ThresholdClustering,
            intensity: 1.0,
            amount_config: AmountDistributionConfig::default(),
            thresholds: vec![10000.0],
        };
        let mut sampler = BenfordDeviationSampler::new(42, config);

        for _ in 0..100 {
            let f = as_f64(sampler.sample());
            // Should be below threshold
            assert!(f < 10000.0, "Amount {} should be below 10000", f);
            // Should be within ~20% of threshold (1-15% below + noise)
            assert!(f > 8000.0, "Amount {} should be near threshold 10000", f);
        }
    }

    #[test]
    fn test_benford_deviation_round_bias() {
        let config = BenfordDeviationConfig {
            deviation_type: BenfordDeviationType::RoundNumberBias,
            intensity: 1.0,
            amount_config: AmountDistributionConfig::default(),
            thresholds: vec![],
        };
        let mut sampler = BenfordDeviationSampler::new(42, config);

        let mut digit_counts = [0u32; 9];
        for _ in 0..1000 {
            let amount = sampler.sample();
            if let Some(d) = get_first_digit(amount) {
                if (1..=9).contains(&d) {
                    digit_counts[(d - 1) as usize] += 1;
                }
            }
        }

        // Digits 1 and 5 should be overrepresented
        let d1_pct = digit_counts[0] as f64 / 1000.0;
        let d5_pct = digit_counts[4] as f64 / 1000.0;

        // Should be higher than Benford expects
        assert!(d1_pct > 0.35 || d5_pct > 0.10);
    }
}
datasynth_core/distributions/benford.rs

datasynth_core/distributions/
benford.rs