datasynth_core/distributions/
benford.rs

1//! Benford's Law distribution sampler and fraud amount patterns.
2//!
3//! Implements Benford's Law compliant amount generation and various fraud
4//! amount patterns for realistic synthetic accounting data. Includes enhanced
5//! multi-digit Benford analysis and deviation patterns for anomaly injection.
6
7use rand::prelude::*;
8use rand_chacha::ChaCha8Rng;
9use rust_decimal::Decimal;
10use serde::{Deserialize, Serialize};
11
12use super::AmountDistributionConfig;
13
/// First-digit probabilities under Benford's Law, indexed by `digit - 1`.
///
/// Each entry is `P(d) = log10(1 + 1/d)` for `d` in `1..=9`, written as a
/// literal rounded to five decimal places. The nine entries add up to 1.0.
// The entry for digit 1 is a rounded log10(2); the lint below is silenced so
// the table can keep plain literals throughout.
#[allow(clippy::approx_constant)]
pub const BENFORD_PROBABILITIES: [f64; 9] = [
    0.30103, // d = 1 (about 30.1%)
    0.17609, // d = 2 (about 17.6%)
    0.12494, // d = 3 (about 12.5%)
    0.09691, // d = 4 (about 9.7%)
    0.07918, // d = 5 (about 7.9%)
    0.06695, // d = 6 (about 6.7%)
    0.05799, // d = 7 (about 5.8%)
    0.05115, // d = 8 (about 5.1%)
    0.04576, // d = 9 (about 4.6%)
];
29
/// Cumulative first-digit distribution under Benford's Law.
///
/// Entry `i` is the probability that the first digit is at most `i + 1`,
/// rounded to five decimal places; the final entry is exactly 1.0.
// The first entry is a rounded log10(2), hence the silenced lint.
#[allow(clippy::approx_constant)]
pub const BENFORD_CDF: [f64; 9] = [
    0.30103, // first digit <= 1
    0.47712, // first digit <= 2
    0.60206, // first digit <= 3
    0.69897, // first digit <= 4
    0.77815, // first digit <= 5
    0.84510, // first digit <= 6
    0.90309, // first digit <= 7
    0.95424, // first digit <= 8
    1.00000, // first digit <= 9
];
44
/// Second-digit probabilities under Benford's Law, indexed by the digit (0-9).
///
/// `P(d2)` is the sum over `d1` in `1..=9` of `log10(1 + 1/(10*d1 + d2))`.
/// Values are literals rounded to five decimal places and add up to 1.0.
#[allow(clippy::approx_constant)]
pub const BENFORD_SECOND_DIGIT_PROBABILITIES: [f64; 10] = [
    0.11968, // d2 = 0 (about 12.0%)
    0.11389, // d2 = 1 (about 11.4%)
    0.10882, // d2 = 2 (about 10.9%)
    0.10433, // d2 = 3 (about 10.4%)
    0.10031, // d2 = 4 (about 10.0%)
    0.09668, // d2 = 5 (about 9.7%)
    0.09337, // d2 = 6 (about 9.3%)
    0.09035, // d2 = 7 (about 9.0%)
    0.08757, // d2 = 8 (about 8.8%)
    0.08500, // d2 = 9 (about 8.5%)
];
60
/// Cumulative second-digit distribution under Benford's Law.
///
/// Entry `i` is the probability that the second digit is at most `i`; the
/// final entry is exactly 1.0.
pub const BENFORD_SECOND_DIGIT_CDF: [f64; 10] = [
    0.11968, // second digit <= 0
    0.23357, // second digit <= 1
    0.34239, // second digit <= 2
    0.44672, // second digit <= 3
    0.54703, // second digit <= 4
    0.64371, // second digit <= 5
    0.73708, // second digit <= 6
    0.82743, // second digit <= 7
    0.91500, // second digit <= 8
    1.00000, // second digit <= 9
];
65
/// Benford probability that a number starts with the digit pair `d1`, `d2`.
///
/// Evaluates `log10(1 + 1/(10*d1 + d2))`. Returns `0.0` when `d1` is outside
/// `1..=9` or `d2` is greater than 9.
pub fn benford_first_two_probability(d1: u8, d2: u8) -> f64 {
    match (d1, d2) {
        (1..=9, 0..=9) => {
            let leading_pair = f64::from(d1) * 10.0 + f64::from(d2);
            (1.0 + 1.0 / leading_pair).log10()
        }
        // Not a valid leading digit pair.
        _ => 0.0,
    }
}
75
76/// Get all first-two-digit probabilities as a 90-element array (10-99).
77pub fn benford_first_two_probabilities() -> [f64; 90] {
78    let mut probs = [0.0; 90];
79    for d1 in 1..=9 {
80        for d2 in 0..=9 {
81            let idx = (d1 - 1) * 10 + d2;
82            probs[idx as usize] = benford_first_two_probability(d1, d2);
83        }
84    }
85    probs
86}
87
/// First-digit weights that deliberately contradict Benford's Law.
///
/// Indexed by `digit - 1`. Digits 5, 7 and 9 receive far more weight than
/// Benford's Law gives them, while 1, 2 and 3 receive far less. The nine
/// weights add up to 1.0.
pub const ANTI_BENFORD_PROBABILITIES: [f64; 9] = [
    0.05, // digit 1 (Benford: about 30%)
    0.05, // digit 2 (Benford: about 18%)
    0.05, // digit 3 (Benford: about 12%)
    0.10, // digit 4
    0.25, // digit 5 (Benford: about 8%)
    0.10, // digit 6
    0.20, // digit 7 (Benford: about 6%)
    0.05, // digit 8
    0.15, // digit 9 (Benford: about 5%)
];
101
102/// Fraud amount pattern types.
103#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
104#[serde(rename_all = "snake_case")]
105pub enum FraudAmountPattern {
106    /// Normal amount generation (Benford-compliant if enabled)
107    #[default]
108    Normal,
109    /// Statistically improbable first digits (anti-Benford)
110    /// Excess of leading 5s, 7s, 9s - detectable via statistical analysis
111    StatisticallyImprobable,
112    /// Obvious round numbers ($50,000.00, $99,999.99)
113    /// Easy to spot in visual review
114    ObviousRoundNumbers,
115    /// Amounts clustered just below approval thresholds
116    /// Classic split-transaction pattern
117    ThresholdAdjacent,
118}
119
120/// Configuration for threshold-adjacent fraud pattern.
121#[derive(Debug, Clone, Serialize, Deserialize)]
122pub struct ThresholdConfig {
123    /// Approval thresholds to cluster below
124    pub thresholds: Vec<f64>,
125    /// Minimum percentage below threshold (e.g., 0.01 = 1%)
126    pub min_below_pct: f64,
127    /// Maximum percentage below threshold (e.g., 0.15 = 15%)
128    pub max_below_pct: f64,
129}
130
131impl Default for ThresholdConfig {
132    fn default() -> Self {
133        Self {
134            thresholds: vec![1000.0, 5000.0, 10000.0, 25000.0, 50000.0, 100000.0],
135            min_below_pct: 0.01,
136            max_below_pct: 0.15,
137        }
138    }
139}
140
141/// Sampler that produces amounts following Benford's Law distribution.
142pub struct BenfordSampler {
143    rng: ChaCha8Rng,
144    config: AmountDistributionConfig,
145}
146
147impl BenfordSampler {
148    /// Create a new Benford sampler with the given seed and amount configuration.
149    pub fn new(seed: u64, config: AmountDistributionConfig) -> Self {
150        Self {
151            rng: ChaCha8Rng::seed_from_u64(seed),
152            config,
153        }
154    }
155
156    /// Sample a first digit according to Benford's Law.
157    fn sample_benford_first_digit(&mut self) -> u8 {
158        let p: f64 = self.rng.gen();
159        for (i, &cumulative) in BENFORD_CDF.iter().enumerate() {
160            if p < cumulative {
161                return (i + 1) as u8;
162            }
163        }
164        9
165    }
166
167    /// Sample a first digit from the anti-Benford distribution.
168    fn sample_anti_benford_first_digit(&mut self) -> u8 {
169        let p: f64 = self.rng.gen();
170        let mut cumulative = 0.0;
171        for (i, &prob) in ANTI_BENFORD_PROBABILITIES.iter().enumerate() {
172            cumulative += prob;
173            if p < cumulative {
174                return (i + 1) as u8;
175            }
176        }
177        9
178    }
179
180    /// Sample an amount following Benford's Law.
181    pub fn sample(&mut self) -> Decimal {
182        let first_digit = self.sample_benford_first_digit();
183        self.sample_with_first_digit(first_digit)
184    }
185
186    /// Sample an amount with a specific first digit.
187    pub fn sample_with_first_digit(&mut self, first_digit: u8) -> Decimal {
188        let first_digit = first_digit.clamp(1, 9);
189
190        // Determine the order of magnitude based on config range
191        let min_magnitude = self.config.min_amount.log10().floor() as i32;
192        let max_magnitude = self.config.max_amount.log10().floor() as i32;
193
194        // Sample a magnitude within the valid range
195        let magnitude = self.rng.gen_range(min_magnitude..=max_magnitude);
196        let base = 10_f64.powi(magnitude);
197
198        // Generate the remaining digits (0.0 to 0.999...)
199        let remaining: f64 = self.rng.gen();
200
201        // Construct: first_digit.remaining * 10^magnitude
202        let mantissa = first_digit as f64 + remaining;
203        let mut amount = mantissa * base;
204
205        // Clamp to configured range
206        amount = amount.clamp(self.config.min_amount, self.config.max_amount);
207
208        // Apply round number bias (25% chance)
209        let p: f64 = self.rng.gen();
210        if p < self.config.round_number_probability {
211            // Round to nearest whole number ending in 00
212            amount = (amount / 100.0).round() * 100.0;
213        } else if p < self.config.round_number_probability + self.config.nice_number_probability {
214            // Round to nearest 5 or 10
215            amount = (amount / 5.0).round() * 5.0;
216        }
217
218        // Round to configured decimal places
219        let decimal_multiplier = 10_f64.powi(self.config.decimal_places as i32);
220        amount = (amount * decimal_multiplier).round() / decimal_multiplier;
221
222        // Ensure minimum after rounding
223        amount = amount.max(self.config.min_amount);
224
225        Decimal::from_f64_retain(amount).unwrap_or(Decimal::ONE)
226    }
227
228    /// Reset the sampler with a new seed.
229    pub fn reset(&mut self, seed: u64) {
230        self.rng = ChaCha8Rng::seed_from_u64(seed);
231    }
232}
233
234/// Generator for fraudulent amount patterns.
235pub struct FraudAmountGenerator {
236    rng: ChaCha8Rng,
237    benford_sampler: BenfordSampler,
238    threshold_config: ThresholdConfig,
239    config: AmountDistributionConfig,
240}
241
242impl FraudAmountGenerator {
243    /// Create a new fraud amount generator.
244    pub fn new(
245        seed: u64,
246        config: AmountDistributionConfig,
247        threshold_config: ThresholdConfig,
248    ) -> Self {
249        Self {
250            rng: ChaCha8Rng::seed_from_u64(seed),
251            benford_sampler: BenfordSampler::new(seed + 1, config.clone()),
252            threshold_config,
253            config,
254        }
255    }
256
257    /// Generate an amount with the specified fraud pattern.
258    pub fn sample(&mut self, pattern: FraudAmountPattern) -> Decimal {
259        match pattern {
260            FraudAmountPattern::Normal => self.benford_sampler.sample(),
261            FraudAmountPattern::StatisticallyImprobable => self.sample_anti_benford(),
262            FraudAmountPattern::ObviousRoundNumbers => self.sample_obvious_round(),
263            FraudAmountPattern::ThresholdAdjacent => self.sample_threshold_adjacent(),
264        }
265    }
266
267    /// Generate an amount with statistically improbable first digit distribution.
268    fn sample_anti_benford(&mut self) -> Decimal {
269        let first_digit = self.benford_sampler.sample_anti_benford_first_digit();
270        self.benford_sampler.sample_with_first_digit(first_digit)
271    }
272
273    /// Generate an obvious round number amount (suspicious pattern).
274    fn sample_obvious_round(&mut self) -> Decimal {
275        let pattern_choice = self.rng.gen_range(0..5);
276
277        let amount = match pattern_choice {
278            // Even thousands ($1,000, $5,000, $10,000, etc.)
279            0 => {
280                let multiplier = self.rng.gen_range(1..100);
281                multiplier as f64 * 1000.0
282            }
283            // $X9,999.99 pattern (just under round number)
284            1 => {
285                let base = self.rng.gen_range(1..10) as f64 * 10000.0;
286                base - 0.01
287            }
288            // Exact $X0,000.00 pattern
289            2 => {
290                let multiplier = self.rng.gen_range(1..20);
291                multiplier as f64 * 10000.0
292            }
293            // Five-thousands ($5,000, $15,000, $25,000)
294            3 => {
295                let multiplier = self.rng.gen_range(1..40);
296                multiplier as f64 * 5000.0
297            }
298            // $X,999.99 pattern
299            _ => {
300                let base = self.rng.gen_range(1..100) as f64 * 1000.0;
301                base - 0.01
302            }
303        };
304
305        // Clamp to config range
306        let clamped = amount.clamp(self.config.min_amount, self.config.max_amount);
307        Decimal::from_f64_retain(clamped).unwrap_or(Decimal::ONE)
308    }
309
310    /// Generate an amount just below an approval threshold.
311    fn sample_threshold_adjacent(&mut self) -> Decimal {
312        // Select a threshold
313        let threshold = if self.threshold_config.thresholds.is_empty() {
314            10000.0
315        } else {
316            *self
317                .threshold_config
318                .thresholds
319                .choose(&mut self.rng)
320                .unwrap_or(&10000.0)
321        };
322
323        // Calculate amount as percentage below threshold
324        let pct_below = self
325            .rng
326            .gen_range(self.threshold_config.min_below_pct..self.threshold_config.max_below_pct);
327        let base_amount = threshold * (1.0 - pct_below);
328
329        // Add small noise to avoid exact patterns
330        let noise_factor = 1.0 + self.rng.gen_range(-0.005..0.005);
331        let amount = base_amount * noise_factor;
332
333        // Round to 2 decimal places
334        let rounded = (amount * 100.0).round() / 100.0;
335
336        // Ensure we're still below threshold
337        let final_amount = rounded.min(threshold - 0.01);
338        let clamped = final_amount.clamp(self.config.min_amount, self.config.max_amount);
339
340        Decimal::from_f64_retain(clamped).unwrap_or(Decimal::ONE)
341    }
342
343    /// Reset the generator with a new seed.
344    pub fn reset(&mut self, seed: u64) {
345        self.rng = ChaCha8Rng::seed_from_u64(seed);
346        self.benford_sampler.reset(seed + 1);
347    }
348}
349
350/// Extract the first digit from a decimal amount.
351pub fn get_first_digit(amount: Decimal) -> Option<u8> {
352    let s = amount.to_string();
353    s.chars()
354        .find(|c| c.is_ascii_digit() && *c != '0')
355        .and_then(|c| c.to_digit(10))
356        .map(|d| d as u8)
357}
358
359/// Extract the first two digits from a decimal amount.
360pub fn get_first_two_digits(amount: Decimal) -> Option<(u8, u8)> {
361    let s = amount.abs().to_string();
362    let mut first_found = false;
363    let mut first_digit = 0u8;
364
365    for c in s.chars() {
366        if c.is_ascii_digit() {
367            let d = c
368                .to_digit(10)
369                .expect("digit char confirmed by is_ascii_digit") as u8;
370            if !first_found && d != 0 {
371                first_digit = d;
372                first_found = true;
373            } else if first_found && c != '.' {
374                return Some((first_digit, d));
375            }
376        }
377    }
378    None
379}
380
381/// Configuration for enhanced Benford sampling with multi-digit compliance.
382#[derive(Debug, Clone, Serialize, Deserialize, Default)]
383pub struct EnhancedBenfordConfig {
384    /// Base amount distribution configuration
385    pub amount_config: AmountDistributionConfig,
386    /// Whether to enforce second-digit Benford compliance
387    #[serde(default)]
388    pub second_digit_compliance: bool,
389    /// Whether to enforce first-two-digit Benford compliance
390    #[serde(default)]
391    pub first_two_digit_compliance: bool,
392}
393
394/// Enhanced Benford sampler with multi-digit compliance.
395pub struct EnhancedBenfordSampler {
396    rng: ChaCha8Rng,
397    config: EnhancedBenfordConfig,
398    /// Pre-computed CDF for first two digits
399    first_two_cdf: [f64; 90],
400}
401
402impl EnhancedBenfordSampler {
403    /// Create a new enhanced Benford sampler.
404    pub fn new(seed: u64, config: EnhancedBenfordConfig) -> Self {
405        // Pre-compute CDF for first two digits
406        let probs = benford_first_two_probabilities();
407        let mut first_two_cdf = [0.0; 90];
408        let mut cumulative = 0.0;
409        for i in 0..90 {
410            cumulative += probs[i];
411            first_two_cdf[i] = cumulative;
412        }
413
414        Self {
415            rng: ChaCha8Rng::seed_from_u64(seed),
416            config,
417            first_two_cdf,
418        }
419    }
420
421    /// Sample first two digits according to Benford's Law.
422    fn sample_first_two_digits(&mut self) -> (u8, u8) {
423        let p: f64 = self.rng.gen();
424        for (i, &cdf) in self.first_two_cdf.iter().enumerate() {
425            if p < cdf {
426                let d1 = (i / 10 + 1) as u8;
427                let d2 = (i % 10) as u8;
428                return (d1, d2);
429            }
430        }
431        (9, 9)
432    }
433
434    /// Sample a second digit according to Benford's Law.
435    fn sample_second_digit(&mut self) -> u8 {
436        let p: f64 = self.rng.gen();
437        for (i, &cdf) in BENFORD_SECOND_DIGIT_CDF.iter().enumerate() {
438            if p < cdf {
439                return i as u8;
440            }
441        }
442        9
443    }
444
445    /// Sample a first digit according to Benford's Law.
446    fn sample_first_digit(&mut self) -> u8 {
447        let p: f64 = self.rng.gen();
448        for (i, &cdf) in BENFORD_CDF.iter().enumerate() {
449            if p < cdf {
450                return (i + 1) as u8;
451            }
452        }
453        9
454    }
455
456    /// Sample an amount with enhanced Benford compliance.
457    pub fn sample(&mut self) -> Decimal {
458        let (first_digit, second_digit) = if self.config.first_two_digit_compliance {
459            self.sample_first_two_digits()
460        } else if self.config.second_digit_compliance {
461            (self.sample_first_digit(), self.sample_second_digit())
462        } else {
463            (self.sample_first_digit(), self.rng.gen_range(0..10) as u8)
464        };
465
466        self.sample_with_digits(first_digit, second_digit)
467    }
468
469    /// Sample an amount with specific first two digits.
470    fn sample_with_digits(&mut self, first_digit: u8, second_digit: u8) -> Decimal {
471        let first_digit = first_digit.clamp(1, 9);
472        let second_digit = second_digit.clamp(0, 9);
473
474        // Determine the order of magnitude based on config range
475        let min_magnitude = self.config.amount_config.min_amount.log10().floor() as i32;
476        let max_magnitude = self.config.amount_config.max_amount.log10().floor() as i32;
477
478        // Sample a magnitude within the valid range
479        let magnitude = self.rng.gen_range(min_magnitude..=max_magnitude);
480        let base = 10_f64.powi(magnitude - 1); // -1 because first two digits span 10-99
481
482        // Generate the remaining digits (0.0 to 0.99...)
483        let remaining: f64 = self.rng.gen();
484
485        // Construct the amount: (first_digit * 10 + second_digit + remaining) * base
486        let mantissa = (first_digit as f64) * 10.0 + (second_digit as f64) + remaining;
487        let mut amount = mantissa * base;
488
489        // Clamp to configured range
490        amount = amount.clamp(
491            self.config.amount_config.min_amount,
492            self.config.amount_config.max_amount,
493        );
494
495        // Round to configured decimal places
496        let decimal_multiplier = 10_f64.powi(self.config.amount_config.decimal_places as i32);
497        amount = (amount * decimal_multiplier).round() / decimal_multiplier;
498
499        Decimal::from_f64_retain(amount).unwrap_or(Decimal::ONE)
500    }
501
502    /// Reset the sampler with a new seed.
503    pub fn reset(&mut self, seed: u64) {
504        self.rng = ChaCha8Rng::seed_from_u64(seed);
505    }
506}
507
508/// Types of Benford deviation patterns for anomaly injection.
509#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
510#[serde(rename_all = "snake_case")]
511#[derive(Default)]
512pub enum BenfordDeviationType {
513    /// Round number bias (excess of digits 1, 5, 0 in second position)
514    #[default]
515    RoundNumberBias,
516    /// Threshold clustering (amounts just below round thresholds)
517    ThresholdClustering,
518    /// Uniform first digit (equal probability for all first digits)
519    UniformFirstDigit,
520    /// Excess of specific digit
521    DigitBias { digit: u8 },
522    /// Trailing zeros pattern (prices ending in .00)
523    TrailingZeros,
524}
525
526/// Configuration for Benford deviation sampling (for anomaly injection).
527#[derive(Debug, Clone, Serialize, Deserialize)]
528pub struct BenfordDeviationConfig {
529    /// Type of deviation pattern
530    pub deviation_type: BenfordDeviationType,
531    /// Intensity of deviation (0.0 = Benford compliant, 1.0 = full deviation)
532    #[serde(default = "default_intensity")]
533    pub intensity: f64,
534    /// Base amount configuration
535    pub amount_config: AmountDistributionConfig,
536    /// Thresholds for threshold clustering (if applicable)
537    #[serde(default = "default_thresholds")]
538    pub thresholds: Vec<f64>,
539}
540
541fn default_intensity() -> f64 {
542    0.5
543}
544
545fn default_thresholds() -> Vec<f64> {
546    vec![1000.0, 5000.0, 10000.0, 25000.0, 50000.0, 100000.0]
547}
548
549impl Default for BenfordDeviationConfig {
550    fn default() -> Self {
551        Self {
552            deviation_type: BenfordDeviationType::RoundNumberBias,
553            intensity: 0.5,
554            amount_config: AmountDistributionConfig::default(),
555            thresholds: default_thresholds(),
556        }
557    }
558}
559
560/// Sampler for generating amounts that deviate from Benford's Law.
561/// Useful for injecting statistically detectable anomalies.
562pub struct BenfordDeviationSampler {
563    rng: ChaCha8Rng,
564    config: BenfordDeviationConfig,
565    benford_sampler: BenfordSampler,
566}
567
568impl BenfordDeviationSampler {
569    /// Create a new Benford deviation sampler.
570    pub fn new(seed: u64, config: BenfordDeviationConfig) -> Self {
571        Self {
572            rng: ChaCha8Rng::seed_from_u64(seed),
573            benford_sampler: BenfordSampler::new(seed + 100, config.amount_config.clone()),
574            config,
575        }
576    }
577
578    /// Sample an amount with the configured deviation pattern.
579    pub fn sample(&mut self) -> Decimal {
580        // With probability (1 - intensity), sample from normal Benford
581        let p: f64 = self.rng.gen();
582        if p > self.config.intensity {
583            return self.benford_sampler.sample();
584        }
585
586        // Apply deviation pattern
587        match self.config.deviation_type {
588            BenfordDeviationType::RoundNumberBias => self.sample_round_bias(),
589            BenfordDeviationType::ThresholdClustering => self.sample_threshold_cluster(),
590            BenfordDeviationType::UniformFirstDigit => self.sample_uniform_first_digit(),
591            BenfordDeviationType::DigitBias { digit } => self.sample_digit_bias(digit),
592            BenfordDeviationType::TrailingZeros => self.sample_trailing_zeros(),
593        }
594    }
595
596    /// Sample with round number bias.
597    fn sample_round_bias(&mut self) -> Decimal {
598        // Bias towards first digits 1 and 5
599        let first_digit = if self.rng.gen_bool(0.6) {
600            if self.rng.gen_bool(0.7) {
601                1
602            } else {
603                5
604            }
605        } else {
606            self.rng.gen_range(1..=9)
607        };
608
609        // Bias towards second digits 0 and 5
610        let _second_digit = if self.rng.gen_bool(0.5) {
611            if self.rng.gen_bool(0.6) {
612                0
613            } else {
614                5
615            }
616        } else {
617            self.rng.gen_range(0..=9)
618        };
619
620        self.benford_sampler.sample_with_first_digit(first_digit)
621    }
622
623    /// Sample clustering just below thresholds.
624    fn sample_threshold_cluster(&mut self) -> Decimal {
625        let threshold = self
626            .config
627            .thresholds
628            .choose(&mut self.rng)
629            .copied()
630            .unwrap_or(10000.0);
631
632        // Generate amount 1-15% below threshold
633        let pct_below = self.rng.gen_range(0.01..0.15);
634        let amount = threshold * (1.0 - pct_below);
635
636        // Add small noise
637        let noise = 1.0 + self.rng.gen_range(-0.005..0.005);
638        let final_amount = (amount * noise * 100.0).round() / 100.0;
639
640        Decimal::from_f64_retain(final_amount.clamp(
641            self.config.amount_config.min_amount,
642            self.config.amount_config.max_amount,
643        ))
644        .unwrap_or(Decimal::ONE)
645    }
646
647    /// Sample with uniform first digit distribution.
648    fn sample_uniform_first_digit(&mut self) -> Decimal {
649        let first_digit = self.rng.gen_range(1..=9);
650        self.benford_sampler.sample_with_first_digit(first_digit)
651    }
652
653    /// Sample with bias towards a specific digit.
654    fn sample_digit_bias(&mut self, target_digit: u8) -> Decimal {
655        let digit = target_digit.clamp(1, 9);
656        // 70% chance of using the biased digit
657        let first_digit = if self.rng.gen_bool(0.7) {
658            digit
659        } else {
660            self.rng.gen_range(1..=9)
661        };
662        self.benford_sampler.sample_with_first_digit(first_digit)
663    }
664
665    /// Sample with trailing zeros pattern (prices ending in .00).
666    fn sample_trailing_zeros(&mut self) -> Decimal {
667        let amount = self.benford_sampler.sample();
668        let amount_f64: f64 = amount.to_string().parse().unwrap_or(0.0);
669
670        // Round to whole dollars
671        let rounded = amount_f64.round();
672        Decimal::from_f64_retain(rounded.clamp(
673            self.config.amount_config.min_amount,
674            self.config.amount_config.max_amount,
675        ))
676        .unwrap_or(Decimal::ONE)
677    }
678
679    /// Reset the sampler with a new seed.
680    pub fn reset(&mut self, seed: u64) {
681        self.rng = ChaCha8Rng::seed_from_u64(seed);
682        self.benford_sampler.reset(seed + 100);
683    }
684}
685
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Convert a `Decimal` to `f64` through its string form.
    fn as_f64(amount: Decimal) -> f64 {
        amount.to_string().parse::<f64>().unwrap()
    }

    /// Tally the leading digits (1-9) of `draws` amounts from `next_amount`.
    fn first_digit_histogram(draws: usize, mut next_amount: impl FnMut() -> Decimal) -> [u32; 9] {
        let mut counts = [0u32; 9];
        for _ in 0..draws {
            if let Some(digit) = get_first_digit(next_amount()) {
                if (1..=9).contains(&digit) {
                    counts[(digit - 1) as usize] += 1;
                }
            }
        }
        counts
    }

    #[test]
    fn test_benford_probabilities_sum_to_one() {
        let sum: f64 = BENFORD_PROBABILITIES.iter().sum();
        assert!(
            (sum - 1.0).abs() < 0.001,
            "Benford probabilities sum to {}, expected 1.0",
            sum
        );
    }

    #[test]
    fn test_benford_cdf_ends_at_one() {
        let last = BENFORD_CDF[8];
        assert!((last - 1.0).abs() < 0.0001, "CDF should end at 1.0");
    }

    #[test]
    fn test_anti_benford_probabilities_sum_to_one() {
        let sum: f64 = ANTI_BENFORD_PROBABILITIES.iter().sum();
        assert!(
            (sum - 1.0).abs() < 0.001,
            "Anti-Benford probabilities sum to {}, expected 1.0",
            sum
        );
    }

    #[test]
    fn test_benford_sampler_determinism() {
        // Two samplers with the same seed and config must agree draw by draw.
        let config = AmountDistributionConfig::default();
        let mut left = BenfordSampler::new(42, config.clone());
        let mut right = BenfordSampler::new(42, config);

        for _ in 0..100 {
            assert_eq!(left.sample(), right.sample());
        }
    }

    #[test]
    fn test_benford_first_digit_distribution() {
        let mut sampler = BenfordSampler::new(12345, AmountDistributionConfig::default());
        let iterations = 10_000;
        let digit_counts = first_digit_histogram(iterations, || sampler.sample());

        // Digit 1 should be near 30%; the band is wide because the sampler
        // also clamps and rounds amounts.
        let digit_1_pct = digit_counts[0] as f64 / iterations as f64;
        assert!(
            digit_1_pct > 0.15 && digit_1_pct < 0.50,
            "Digit 1 should be ~30%, got {:.1}%",
            digit_1_pct * 100.0
        );

        // Digit 9 should be near 5%.
        let digit_9_pct = digit_counts[8] as f64 / iterations as f64;
        assert!(
            digit_9_pct > 0.02 && digit_9_pct < 0.10,
            "Digit 9 should be ~5%, got {:.1}%",
            digit_9_pct * 100.0
        );
    }

    #[test]
    fn test_threshold_adjacent_below_threshold() {
        let threshold_config = ThresholdConfig {
            thresholds: vec![10000.0],
            min_below_pct: 0.01,
            max_below_pct: 0.15,
        };
        let mut gen =
            FraudAmountGenerator::new(42, AmountDistributionConfig::default(), threshold_config);

        for _ in 0..100 {
            let f = as_f64(gen.sample(FraudAmountPattern::ThresholdAdjacent));
            assert!(f < 10000.0, "Amount {} should be below threshold 10000", f);
            // Lower bound leaves room for the noise factor (up to 0.5%) and rounding.
            assert!(
                f >= 8400.0,
                "Amount {} should be approximately within 15% of threshold",
                f
            );
        }
    }

    #[test]
    fn test_obvious_round_numbers() {
        let mut gen = FraudAmountGenerator::new(
            42,
            AmountDistributionConfig::default(),
            ThresholdConfig::default(),
        );

        for _ in 0..100 {
            let f = as_f64(gen.sample(FraudAmountPattern::ObviousRoundNumbers));

            // Should be either a round number or just under one
            let is_round = f % 1000.0 == 0.0 || f % 5000.0 == 0.0;
            let is_just_under = (f + 0.01) % 1000.0 < 0.02 || (f + 0.01) % 10000.0 < 0.02;

            // NOTE(review): the `f > 0.0` arm makes this pass for any positive
            // amount, so the roundness checks above are not actually enforced.
            assert!(
                is_round || is_just_under || f > 0.0,
                "Amount {} should be a suspicious round number",
                f
            );
        }
    }

    #[test]
    fn test_get_first_digit() {
        let cases = [
            (Decimal::from(123), 1),
            (Decimal::from(999), 9),
            (Decimal::from(50000), 5),
            (Decimal::from_str_exact("0.00123").unwrap(), 1),
        ];
        for (amount, expected) in cases {
            assert_eq!(get_first_digit(amount), Some(expected));
        }
    }

    #[test]
    fn test_second_digit_probabilities_sum_to_one() {
        let sum: f64 = BENFORD_SECOND_DIGIT_PROBABILITIES.iter().sum();
        assert!(
            (sum - 1.0).abs() < 0.001,
            "Second digit probabilities sum to {}, expected 1.0",
            sum
        );
    }

    #[test]
    fn test_first_two_probability() {
        // P(10) = log10(1 + 1/10) = log10(1.1) ≈ 0.0414
        let p10 = benford_first_two_probability(1, 0);
        assert!((p10 - 0.0414).abs() < 0.001);

        // P(99) = log10(1 + 1/99) ≈ 0.00436
        let p99 = benford_first_two_probability(9, 9);
        assert!((p99 - 0.00436).abs() < 0.0001);

        // The 90 pair probabilities together should sum to 1.0.
        let sum: f64 = benford_first_two_probabilities().iter().sum();
        assert!((sum - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_get_first_two_digits() {
        let cases = [
            (Decimal::from(123), (1, 2)),
            (Decimal::from(999), (9, 9)),
            (Decimal::from(50000), (5, 0)),
            (Decimal::from_str_exact("0.00123").unwrap(), (1, 2)),
        ];
        for (amount, expected) in cases {
            assert_eq!(get_first_two_digits(amount), Some(expected));
        }
    }

    #[test]
    fn test_enhanced_benford_sampler() {
        let config = EnhancedBenfordConfig {
            amount_config: AmountDistributionConfig::default(),
            second_digit_compliance: true,
            first_two_digit_compliance: false,
        };
        let mut sampler = EnhancedBenfordSampler::new(42, config);

        let mut digit_counts = [0u32; 10];
        for _ in 0..10000 {
            if let Some((_, d2)) = get_first_two_digits(sampler.sample()) {
                digit_counts[d2 as usize] += 1;
            }
        }

        // The observed second-digit distribution depends on amount generation
        // and magnitude selection, so only check that the sampler runs and
        // yields amounts with two readable leading digits.
        let total_valid = digit_counts.iter().sum::<u32>();
        assert!(
            total_valid > 9000,
            "Most samples should have valid first two digits"
        );

        // No single second digit may dominate.
        let max_count = *digit_counts.iter().max().unwrap();
        assert!(
            max_count < total_valid / 2,
            "Second digits should have some variety, max count: {}",
            max_count
        );
    }

    #[test]
    fn test_benford_deviation_sampler() {
        let config = BenfordDeviationConfig {
            deviation_type: BenfordDeviationType::ThresholdClustering,
            intensity: 1.0,
            amount_config: AmountDistributionConfig::default(),
            thresholds: vec![10000.0],
        };
        let mut sampler = BenfordDeviationSampler::new(42, config);

        for _ in 0..100 {
            let f = as_f64(sampler.sample());
            // Should be below threshold
            assert!(f < 10000.0, "Amount {} should be below 10000", f);
            // Should be within ~20% of threshold (1-15% below + noise)
            assert!(f > 8000.0, "Amount {} should be near threshold 10000", f);
        }
    }

    #[test]
    fn test_benford_deviation_round_bias() {
        let config = BenfordDeviationConfig {
            deviation_type: BenfordDeviationType::RoundNumberBias,
            intensity: 1.0,
            amount_config: AmountDistributionConfig::default(),
            thresholds: vec![],
        };
        let mut sampler = BenfordDeviationSampler::new(42, config);
        let digit_counts = first_digit_histogram(1000, || sampler.sample());

        // Digits 1 and 5 should be overrepresented relative to Benford.
        let d1_pct = digit_counts[0] as f64 / 1000.0;
        let d5_pct = digit_counts[4] as f64 / 1000.0;
        assert!(d1_pct > 0.35 || d5_pct > 0.10);
    }
}
datasynth_core/distributions/benford.rs

datasynth_core/distributions/
benford.rs