datasynth_core/distributions/
benford.rs

1//! Benford's Law distribution sampler and fraud amount patterns.
2//!
3//! Implements Benford's Law compliant amount generation and various fraud
4//! amount patterns for realistic synthetic accounting data. Includes enhanced
5//! multi-digit Benford analysis and deviation patterns for anomaly injection.
6
7use rand::prelude::*;
8use rand_chacha::ChaCha8Rng;
9use rust_decimal::Decimal;
10use serde::{Deserialize, Serialize};
11
12use super::AmountDistributionConfig;
13
/// First-digit probabilities under Benford's Law, indexed by `digit - 1`.
///
/// Entry `i` holds `P(d) = log10(1 + 1/d)` for `d = i + 1`, written as a
/// literal rounded to five decimal places.
// The leading entry is numerically close to log10(2), hence the
// `approx_constant` allowance on this table of literals.
#[allow(clippy::approx_constant)]
pub const BENFORD_PROBABILITIES: [f64; 9] = [
    // digits 1-3: 30.1%, 17.6%, 12.5%
    0.30103, 0.17609, 0.12494,
    // digits 4-6: 9.7%, 7.9%, 6.7%
    0.09691, 0.07918, 0.06695,
    // digits 7-9: 5.8%, 5.1%, 4.6%
    0.05799, 0.05115, 0.04576,
];
29
/// Running totals of the first-digit Benford probabilities.
///
/// Entry `i` is the probability that the leading digit is at most `i + 1`;
/// the final entry is pinned to `1.0`. Values are literals rounded to five
/// decimal places.
// The leading entry is numerically close to log10(2), hence the
// `approx_constant` allowance on this table of literals.
#[allow(clippy::approx_constant)]
pub const BENFORD_CDF: [f64; 9] = [
    // P(digit <= 1), P(digit <= 2), P(digit <= 3)
    0.30103, 0.47712, 0.60206,
    // P(digit <= 4), P(digit <= 5), P(digit <= 6)
    0.69897, 0.77815, 0.84510,
    // P(digit <= 7), P(digit <= 8), P(digit <= 9)
    0.90309, 0.95424, 1.00000,
];
44
/// Second-digit probabilities under Benford's Law, indexed by the digit 0..=9.
///
/// `P(d2)` is the sum over every first digit `d1` of
/// `log10(1 + 1/(10*d1 + d2))`; values are literals rounded to five decimals.
#[allow(clippy::approx_constant)]
pub const BENFORD_SECOND_DIGIT_PROBABILITIES: [f64; 10] = [
    // digits 0-4: 12.0%, 11.4%, 10.9%, 10.4%, 10.0%
    0.11968, 0.11389, 0.10882, 0.10433, 0.10031,
    // digits 5-9: 9.7%, 9.3%, 9.0%, 8.8%, 8.5%
    0.09668, 0.09337, 0.09035, 0.08757, 0.08500,
];
60
/// Running totals of the second-digit Benford probabilities.
///
/// Entry `i` is the probability that the second digit is at most `i`; the
/// final entry is pinned to `1.0`.
pub const BENFORD_SECOND_DIGIT_CDF: [f64; 10] = [
    0.11968, // second digit <= 0
    0.23357, // second digit <= 1
    0.34239, // second digit <= 2
    0.44672, // second digit <= 3
    0.54703, // second digit <= 4
    0.64371, // second digit <= 5
    0.73708, // second digit <= 6
    0.82743, // second digit <= 7
    0.91500, // second digit <= 8
    1.00000, // second digit <= 9
];
65
/// Benford probability that a number starts with the two digits `d1 d2`.
///
/// Evaluates `log10(1 + 1/n)` with `n = 10 * d1 + d2`. Returns `0.0` when
/// `d1` is outside 1..=9 or `d2` is greater than 9.
pub fn benford_first_two_probability(d1: u8, d2: u8) -> f64 {
    let digits_valid = (1..=9).contains(&d1) && d2 <= 9;
    if digits_valid {
        let leading_pair = f64::from(d1) * 10.0 + f64::from(d2);
        (1.0 + 1.0 / leading_pair).log10()
    } else {
        0.0
    }
}
75
76/// Get all first-two-digit probabilities as a 90-element array (10-99).
77pub fn benford_first_two_probabilities() -> [f64; 90] {
78    let mut probs = [0.0; 90];
79    for d1 in 1..=9 {
80        for d2 in 0..=9 {
81            let idx = (d1 - 1) * 10 + d2;
82            probs[idx as usize] = benford_first_two_probability(d1, d2);
83        }
84    }
85    probs
86}
87
/// First-digit weights that deliberately contradict Benford's Law.
///
/// Indexed by `digit - 1`. Digits 5, 7 and 9 are given far more weight than
/// Benford's Law assigns them, while 1, 2 and 3 are suppressed, so amounts
/// built from this table are statistically improbable.
pub const ANTI_BENFORD_PROBABILITIES: [f64; 9] = [
    // digits 1-3: 5% each (Benford: roughly 30%, 18%, 12%)
    0.05, 0.05, 0.05,
    // digits 4-6: 10%, 25%, 10% (Benford gives digit 5 roughly 8%)
    0.10, 0.25, 0.10,
    // digits 7-9: 20%, 5%, 15% (Benford gives 7 roughly 6% and 9 roughly 5%)
    0.20, 0.05, 0.15,
];
101
102/// Fraud amount pattern types.
103#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
104#[serde(rename_all = "snake_case")]
105pub enum FraudAmountPattern {
106    /// Normal amount generation (Benford-compliant if enabled)
107    #[default]
108    Normal,
109    /// Statistically improbable first digits (anti-Benford)
110    /// Excess of leading 5s, 7s, 9s - detectable via statistical analysis
111    StatisticallyImprobable,
112    /// Obvious round numbers ($50,000.00, $99,999.99)
113    /// Easy to spot in visual review
114    ObviousRoundNumbers,
115    /// Amounts clustered just below approval thresholds
116    /// Classic split-transaction pattern
117    ThresholdAdjacent,
118}
119
120/// Configuration for threshold-adjacent fraud pattern.
121#[derive(Debug, Clone, Serialize, Deserialize)]
122pub struct ThresholdConfig {
123    /// Approval thresholds to cluster below
124    pub thresholds: Vec<f64>,
125    /// Minimum percentage below threshold (e.g., 0.01 = 1%)
126    pub min_below_pct: f64,
127    /// Maximum percentage below threshold (e.g., 0.15 = 15%)
128    pub max_below_pct: f64,
129}
130
131impl Default for ThresholdConfig {
132    fn default() -> Self {
133        Self {
134            thresholds: vec![1000.0, 5000.0, 10000.0, 25000.0, 50000.0, 100000.0],
135            min_below_pct: 0.01,
136            max_below_pct: 0.15,
137        }
138    }
139}
140
141/// Sampler that produces amounts following Benford's Law distribution.
142pub struct BenfordSampler {
143    rng: ChaCha8Rng,
144    config: AmountDistributionConfig,
145}
146
147impl BenfordSampler {
148    /// Create a new Benford sampler with the given seed and amount configuration.
149    pub fn new(seed: u64, config: AmountDistributionConfig) -> Self {
150        Self {
151            rng: ChaCha8Rng::seed_from_u64(seed),
152            config,
153        }
154    }
155
156    /// Sample a first digit according to Benford's Law.
157    fn sample_benford_first_digit(&mut self) -> u8 {
158        let p: f64 = self.rng.gen();
159        for (i, &cumulative) in BENFORD_CDF.iter().enumerate() {
160            if p < cumulative {
161                return (i + 1) as u8;
162            }
163        }
164        9
165    }
166
167    /// Sample a first digit from the anti-Benford distribution.
168    fn sample_anti_benford_first_digit(&mut self) -> u8 {
169        let p: f64 = self.rng.gen();
170        let mut cumulative = 0.0;
171        for (i, &prob) in ANTI_BENFORD_PROBABILITIES.iter().enumerate() {
172            cumulative += prob;
173            if p < cumulative {
174                return (i + 1) as u8;
175            }
176        }
177        9
178    }
179
180    /// Sample an amount following Benford's Law.
181    pub fn sample(&mut self) -> Decimal {
182        let first_digit = self.sample_benford_first_digit();
183        self.sample_with_first_digit(first_digit)
184    }
185
186    /// Sample an amount with a specific first digit.
187    pub fn sample_with_first_digit(&mut self, first_digit: u8) -> Decimal {
188        let first_digit = first_digit.clamp(1, 9);
189
190        // Determine the order of magnitude based on config range
191        let min_magnitude = self.config.min_amount.log10().floor() as i32;
192        let max_magnitude = self.config.max_amount.log10().floor() as i32;
193
194        // Sample a magnitude within the valid range
195        let magnitude = self.rng.gen_range(min_magnitude..=max_magnitude);
196        let base = 10_f64.powi(magnitude);
197
198        // Generate the remaining digits (0.0 to 0.999...)
199        let remaining: f64 = self.rng.gen();
200
201        // Construct: first_digit.remaining * 10^magnitude
202        let mantissa = first_digit as f64 + remaining;
203        let mut amount = mantissa * base;
204
205        // Clamp to configured range
206        amount = amount.clamp(self.config.min_amount, self.config.max_amount);
207
208        // Apply round number bias (25% chance)
209        let p: f64 = self.rng.gen();
210        if p < self.config.round_number_probability {
211            // Round to nearest whole number ending in 00
212            amount = (amount / 100.0).round() * 100.0;
213        } else if p < self.config.round_number_probability + self.config.nice_number_probability {
214            // Round to nearest 5 or 10
215            amount = (amount / 5.0).round() * 5.0;
216        }
217
218        // Round to configured decimal places
219        let decimal_multiplier = 10_f64.powi(self.config.decimal_places as i32);
220        amount = (amount * decimal_multiplier).round() / decimal_multiplier;
221
222        // Ensure minimum after rounding
223        amount = amount.max(self.config.min_amount);
224
225        Decimal::from_f64_retain(amount).unwrap_or(Decimal::ONE)
226    }
227
228    /// Reset the sampler with a new seed.
229    pub fn reset(&mut self, seed: u64) {
230        self.rng = ChaCha8Rng::seed_from_u64(seed);
231    }
232}
233
234/// Generator for fraudulent amount patterns.
235pub struct FraudAmountGenerator {
236    rng: ChaCha8Rng,
237    benford_sampler: BenfordSampler,
238    threshold_config: ThresholdConfig,
239    config: AmountDistributionConfig,
240}
241
242impl FraudAmountGenerator {
243    /// Create a new fraud amount generator.
244    pub fn new(
245        seed: u64,
246        config: AmountDistributionConfig,
247        threshold_config: ThresholdConfig,
248    ) -> Self {
249        Self {
250            rng: ChaCha8Rng::seed_from_u64(seed),
251            benford_sampler: BenfordSampler::new(seed + 1, config.clone()),
252            threshold_config,
253            config,
254        }
255    }
256
257    /// Generate an amount with the specified fraud pattern.
258    pub fn sample(&mut self, pattern: FraudAmountPattern) -> Decimal {
259        match pattern {
260            FraudAmountPattern::Normal => self.benford_sampler.sample(),
261            FraudAmountPattern::StatisticallyImprobable => self.sample_anti_benford(),
262            FraudAmountPattern::ObviousRoundNumbers => self.sample_obvious_round(),
263            FraudAmountPattern::ThresholdAdjacent => self.sample_threshold_adjacent(),
264        }
265    }
266
267    /// Generate an amount with statistically improbable first digit distribution.
268    fn sample_anti_benford(&mut self) -> Decimal {
269        let first_digit = self.benford_sampler.sample_anti_benford_first_digit();
270        self.benford_sampler.sample_with_first_digit(first_digit)
271    }
272
273    /// Generate an obvious round number amount (suspicious pattern).
274    fn sample_obvious_round(&mut self) -> Decimal {
275        let pattern_choice = self.rng.gen_range(0..5);
276
277        let amount = match pattern_choice {
278            // Even thousands ($1,000, $5,000, $10,000, etc.)
279            0 => {
280                let multiplier = self.rng.gen_range(1..100);
281                multiplier as f64 * 1000.0
282            }
283            // $X9,999.99 pattern (just under round number)
284            1 => {
285                let base = self.rng.gen_range(1..10) as f64 * 10000.0;
286                base - 0.01
287            }
288            // Exact $X0,000.00 pattern
289            2 => {
290                let multiplier = self.rng.gen_range(1..20);
291                multiplier as f64 * 10000.0
292            }
293            // Five-thousands ($5,000, $15,000, $25,000)
294            3 => {
295                let multiplier = self.rng.gen_range(1..40);
296                multiplier as f64 * 5000.0
297            }
298            // $X,999.99 pattern
299            _ => {
300                let base = self.rng.gen_range(1..100) as f64 * 1000.0;
301                base - 0.01
302            }
303        };
304
305        // Clamp to config range
306        let clamped = amount.clamp(self.config.min_amount, self.config.max_amount);
307        Decimal::from_f64_retain(clamped).unwrap_or(Decimal::ONE)
308    }
309
310    /// Generate an amount just below an approval threshold.
311    fn sample_threshold_adjacent(&mut self) -> Decimal {
312        // Select a threshold
313        let threshold = if self.threshold_config.thresholds.is_empty() {
314            10000.0
315        } else {
316            *self
317                .threshold_config
318                .thresholds
319                .choose(&mut self.rng)
320                .unwrap_or(&10000.0)
321        };
322
323        // Calculate amount as percentage below threshold
324        let pct_below = self
325            .rng
326            .gen_range(self.threshold_config.min_below_pct..self.threshold_config.max_below_pct);
327        let base_amount = threshold * (1.0 - pct_below);
328
329        // Add small noise to avoid exact patterns
330        let noise_factor = 1.0 + self.rng.gen_range(-0.005..0.005);
331        let amount = base_amount * noise_factor;
332
333        // Round to 2 decimal places
334        let rounded = (amount * 100.0).round() / 100.0;
335
336        // Ensure we're still below threshold
337        let final_amount = rounded.min(threshold - 0.01);
338        let clamped = final_amount.clamp(self.config.min_amount, self.config.max_amount);
339
340        Decimal::from_f64_retain(clamped).unwrap_or(Decimal::ONE)
341    }
342
343    /// Reset the generator with a new seed.
344    pub fn reset(&mut self, seed: u64) {
345        self.rng = ChaCha8Rng::seed_from_u64(seed);
346        self.benford_sampler.reset(seed + 1);
347    }
348}
349
350/// Extract the first digit from a decimal amount.
351pub fn get_first_digit(amount: Decimal) -> Option<u8> {
352    let s = amount.to_string();
353    s.chars()
354        .find(|c| c.is_ascii_digit() && *c != '0')
355        .and_then(|c| c.to_digit(10))
356        .map(|d| d as u8)
357}
358
359/// Extract the first two digits from a decimal amount.
360pub fn get_first_two_digits(amount: Decimal) -> Option<(u8, u8)> {
361    let s = amount.abs().to_string();
362    let mut first_found = false;
363    let mut first_digit = 0u8;
364
365    for c in s.chars() {
366        if c.is_ascii_digit() {
367            let d = c.to_digit(10).unwrap() as u8;
368            if !first_found && d != 0 {
369                first_digit = d;
370                first_found = true;
371            } else if first_found && c != '.' {
372                return Some((first_digit, d));
373            }
374        }
375    }
376    None
377}
378
379/// Configuration for enhanced Benford sampling with multi-digit compliance.
380#[derive(Debug, Clone, Serialize, Deserialize, Default)]
381pub struct EnhancedBenfordConfig {
382    /// Base amount distribution configuration
383    pub amount_config: AmountDistributionConfig,
384    /// Whether to enforce second-digit Benford compliance
385    #[serde(default)]
386    pub second_digit_compliance: bool,
387    /// Whether to enforce first-two-digit Benford compliance
388    #[serde(default)]
389    pub first_two_digit_compliance: bool,
390}
391
392/// Enhanced Benford sampler with multi-digit compliance.
393pub struct EnhancedBenfordSampler {
394    rng: ChaCha8Rng,
395    config: EnhancedBenfordConfig,
396    /// Pre-computed CDF for first two digits
397    first_two_cdf: [f64; 90],
398}
399
400impl EnhancedBenfordSampler {
401    /// Create a new enhanced Benford sampler.
402    pub fn new(seed: u64, config: EnhancedBenfordConfig) -> Self {
403        // Pre-compute CDF for first two digits
404        let probs = benford_first_two_probabilities();
405        let mut first_two_cdf = [0.0; 90];
406        let mut cumulative = 0.0;
407        for i in 0..90 {
408            cumulative += probs[i];
409            first_two_cdf[i] = cumulative;
410        }
411
412        Self {
413            rng: ChaCha8Rng::seed_from_u64(seed),
414            config,
415            first_two_cdf,
416        }
417    }
418
419    /// Sample first two digits according to Benford's Law.
420    fn sample_first_two_digits(&mut self) -> (u8, u8) {
421        let p: f64 = self.rng.gen();
422        for (i, &cdf) in self.first_two_cdf.iter().enumerate() {
423            if p < cdf {
424                let d1 = (i / 10 + 1) as u8;
425                let d2 = (i % 10) as u8;
426                return (d1, d2);
427            }
428        }
429        (9, 9)
430    }
431
432    /// Sample a second digit according to Benford's Law.
433    fn sample_second_digit(&mut self) -> u8 {
434        let p: f64 = self.rng.gen();
435        for (i, &cdf) in BENFORD_SECOND_DIGIT_CDF.iter().enumerate() {
436            if p < cdf {
437                return i as u8;
438            }
439        }
440        9
441    }
442
443    /// Sample a first digit according to Benford's Law.
444    fn sample_first_digit(&mut self) -> u8 {
445        let p: f64 = self.rng.gen();
446        for (i, &cdf) in BENFORD_CDF.iter().enumerate() {
447            if p < cdf {
448                return (i + 1) as u8;
449            }
450        }
451        9
452    }
453
454    /// Sample an amount with enhanced Benford compliance.
455    pub fn sample(&mut self) -> Decimal {
456        let (first_digit, second_digit) = if self.config.first_two_digit_compliance {
457            self.sample_first_two_digits()
458        } else if self.config.second_digit_compliance {
459            (self.sample_first_digit(), self.sample_second_digit())
460        } else {
461            (self.sample_first_digit(), self.rng.gen_range(0..10) as u8)
462        };
463
464        self.sample_with_digits(first_digit, second_digit)
465    }
466
467    /// Sample an amount with specific first two digits.
468    fn sample_with_digits(&mut self, first_digit: u8, second_digit: u8) -> Decimal {
469        let first_digit = first_digit.clamp(1, 9);
470        let second_digit = second_digit.clamp(0, 9);
471
472        // Determine the order of magnitude based on config range
473        let min_magnitude = self.config.amount_config.min_amount.log10().floor() as i32;
474        let max_magnitude = self.config.amount_config.max_amount.log10().floor() as i32;
475
476        // Sample a magnitude within the valid range
477        let magnitude = self.rng.gen_range(min_magnitude..=max_magnitude);
478        let base = 10_f64.powi(magnitude - 1); // -1 because first two digits span 10-99
479
480        // Generate the remaining digits (0.0 to 0.99...)
481        let remaining: f64 = self.rng.gen();
482
483        // Construct the amount: (first_digit * 10 + second_digit + remaining) * base
484        let mantissa = (first_digit as f64) * 10.0 + (second_digit as f64) + remaining;
485        let mut amount = mantissa * base;
486
487        // Clamp to configured range
488        amount = amount.clamp(
489            self.config.amount_config.min_amount,
490            self.config.amount_config.max_amount,
491        );
492
493        // Round to configured decimal places
494        let decimal_multiplier = 10_f64.powi(self.config.amount_config.decimal_places as i32);
495        amount = (amount * decimal_multiplier).round() / decimal_multiplier;
496
497        Decimal::from_f64_retain(amount).unwrap_or(Decimal::ONE)
498    }
499
500    /// Reset the sampler with a new seed.
501    pub fn reset(&mut self, seed: u64) {
502        self.rng = ChaCha8Rng::seed_from_u64(seed);
503    }
504}
505
506/// Types of Benford deviation patterns for anomaly injection.
507#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
508#[serde(rename_all = "snake_case")]
509#[derive(Default)]
510pub enum BenfordDeviationType {
511    /// Round number bias (excess of digits 1, 5, 0 in second position)
512    #[default]
513    RoundNumberBias,
514    /// Threshold clustering (amounts just below round thresholds)
515    ThresholdClustering,
516    /// Uniform first digit (equal probability for all first digits)
517    UniformFirstDigit,
518    /// Excess of specific digit
519    DigitBias { digit: u8 },
520    /// Trailing zeros pattern (prices ending in .00)
521    TrailingZeros,
522}
523
524/// Configuration for Benford deviation sampling (for anomaly injection).
525#[derive(Debug, Clone, Serialize, Deserialize)]
526pub struct BenfordDeviationConfig {
527    /// Type of deviation pattern
528    pub deviation_type: BenfordDeviationType,
529    /// Intensity of deviation (0.0 = Benford compliant, 1.0 = full deviation)
530    #[serde(default = "default_intensity")]
531    pub intensity: f64,
532    /// Base amount configuration
533    pub amount_config: AmountDistributionConfig,
534    /// Thresholds for threshold clustering (if applicable)
535    #[serde(default = "default_thresholds")]
536    pub thresholds: Vec<f64>,
537}
538
/// Serde fallback for `BenfordDeviationConfig::intensity`.
fn default_intensity() -> f64 {
    const HALF_OF_SAMPLES: f64 = 0.5;
    HALF_OF_SAMPLES
}
542
/// Built-in threshold list, used as the serde fallback and by `Default` for
/// `BenfordDeviationConfig::thresholds`.
fn default_thresholds() -> Vec<f64> {
    const BUILT_IN_THRESHOLDS: [f64; 6] = [1000.0, 5000.0, 10000.0, 25000.0, 50000.0, 100000.0];
    BUILT_IN_THRESHOLDS.to_vec()
}
546
547impl Default for BenfordDeviationConfig {
548    fn default() -> Self {
549        Self {
550            deviation_type: BenfordDeviationType::RoundNumberBias,
551            intensity: 0.5,
552            amount_config: AmountDistributionConfig::default(),
553            thresholds: default_thresholds(),
554        }
555    }
556}
557
558/// Sampler for generating amounts that deviate from Benford's Law.
559/// Useful for injecting statistically detectable anomalies.
560pub struct BenfordDeviationSampler {
561    rng: ChaCha8Rng,
562    config: BenfordDeviationConfig,
563    benford_sampler: BenfordSampler,
564}
565
566impl BenfordDeviationSampler {
567    /// Create a new Benford deviation sampler.
568    pub fn new(seed: u64, config: BenfordDeviationConfig) -> Self {
569        Self {
570            rng: ChaCha8Rng::seed_from_u64(seed),
571            benford_sampler: BenfordSampler::new(seed + 100, config.amount_config.clone()),
572            config,
573        }
574    }
575
576    /// Sample an amount with the configured deviation pattern.
577    pub fn sample(&mut self) -> Decimal {
578        // With probability (1 - intensity), sample from normal Benford
579        let p: f64 = self.rng.gen();
580        if p > self.config.intensity {
581            return self.benford_sampler.sample();
582        }
583
584        // Apply deviation pattern
585        match self.config.deviation_type {
586            BenfordDeviationType::RoundNumberBias => self.sample_round_bias(),
587            BenfordDeviationType::ThresholdClustering => self.sample_threshold_cluster(),
588            BenfordDeviationType::UniformFirstDigit => self.sample_uniform_first_digit(),
589            BenfordDeviationType::DigitBias { digit } => self.sample_digit_bias(digit),
590            BenfordDeviationType::TrailingZeros => self.sample_trailing_zeros(),
591        }
592    }
593
594    /// Sample with round number bias.
595    fn sample_round_bias(&mut self) -> Decimal {
596        // Bias towards first digits 1 and 5
597        let first_digit = if self.rng.gen_bool(0.6) {
598            if self.rng.gen_bool(0.7) {
599                1
600            } else {
601                5
602            }
603        } else {
604            self.rng.gen_range(1..=9)
605        };
606
607        // Bias towards second digits 0 and 5
608        let _second_digit = if self.rng.gen_bool(0.5) {
609            if self.rng.gen_bool(0.6) {
610                0
611            } else {
612                5
613            }
614        } else {
615            self.rng.gen_range(0..=9)
616        };
617
618        self.benford_sampler.sample_with_first_digit(first_digit)
619    }
620
621    /// Sample clustering just below thresholds.
622    fn sample_threshold_cluster(&mut self) -> Decimal {
623        let threshold = self
624            .config
625            .thresholds
626            .choose(&mut self.rng)
627            .copied()
628            .unwrap_or(10000.0);
629
630        // Generate amount 1-15% below threshold
631        let pct_below = self.rng.gen_range(0.01..0.15);
632        let amount = threshold * (1.0 - pct_below);
633
634        // Add small noise
635        let noise = 1.0 + self.rng.gen_range(-0.005..0.005);
636        let final_amount = (amount * noise * 100.0).round() / 100.0;
637
638        Decimal::from_f64_retain(final_amount.clamp(
639            self.config.amount_config.min_amount,
640            self.config.amount_config.max_amount,
641        ))
642        .unwrap_or(Decimal::ONE)
643    }
644
645    /// Sample with uniform first digit distribution.
646    fn sample_uniform_first_digit(&mut self) -> Decimal {
647        let first_digit = self.rng.gen_range(1..=9);
648        self.benford_sampler.sample_with_first_digit(first_digit)
649    }
650
651    /// Sample with bias towards a specific digit.
652    fn sample_digit_bias(&mut self, target_digit: u8) -> Decimal {
653        let digit = target_digit.clamp(1, 9);
654        // 70% chance of using the biased digit
655        let first_digit = if self.rng.gen_bool(0.7) {
656            digit
657        } else {
658            self.rng.gen_range(1..=9)
659        };
660        self.benford_sampler.sample_with_first_digit(first_digit)
661    }
662
663    /// Sample with trailing zeros pattern (prices ending in .00).
664    fn sample_trailing_zeros(&mut self) -> Decimal {
665        let amount = self.benford_sampler.sample();
666        let amount_f64: f64 = amount.to_string().parse().unwrap_or(0.0);
667
668        // Round to whole dollars
669        let rounded = amount_f64.round();
670        Decimal::from_f64_retain(rounded.clamp(
671            self.config.amount_config.min_amount,
672            self.config.amount_config.max_amount,
673        ))
674        .unwrap_or(Decimal::ONE)
675    }
676
677    /// Reset the sampler with a new seed.
678    pub fn reset(&mut self, seed: u64) {
679        self.rng = ChaCha8Rng::seed_from_u64(seed);
680        self.benford_sampler.reset(seed + 100);
681    }
682}
683
#[cfg(test)]
mod tests {
    use super::*;

    /// The first-digit probability table must describe a full distribution.
    #[test]
    fn test_benford_probabilities_sum_to_one() {
        let sum: f64 = BENFORD_PROBABILITIES.iter().sum();
        assert!(
            (sum - 1.0).abs() < 0.001,
            "Benford probabilities sum to {}, expected 1.0",
            sum
        );
    }

    /// The last first-digit CDF entry must be 1.0.
    #[test]
    fn test_benford_cdf_ends_at_one() {
        assert!(
            (BENFORD_CDF[8] - 1.0).abs() < 0.0001,
            "CDF should end at 1.0"
        );
    }

    /// The anti-Benford table must describe a full distribution.
    #[test]
    fn test_anti_benford_probabilities_sum_to_one() {
        let sum: f64 = ANTI_BENFORD_PROBABILITIES.iter().sum();
        assert!(
            (sum - 1.0).abs() < 0.001,
            "Anti-Benford probabilities sum to {}, expected 1.0",
            sum
        );
    }

    /// Two samplers with the same seed and config must agree sample by sample.
    #[test]
    fn test_benford_sampler_determinism() {
        let config = AmountDistributionConfig::default();
        let mut sampler1 = BenfordSampler::new(42, config.clone());
        let mut sampler2 = BenfordSampler::new(42, config);

        for _ in 0..100 {
            assert_eq!(sampler1.sample(), sampler2.sample());
        }
    }

    /// Leading digits of sampled amounts should roughly follow Benford's Law.
    #[test]
    fn test_benford_first_digit_distribution() {
        let config = AmountDistributionConfig::default();
        let mut sampler = BenfordSampler::new(12345, config);

        let mut digit_counts = [0u32; 9];
        let iterations = 10_000;

        for _ in 0..iterations {
            let amount = sampler.sample();
            if let Some(digit) = get_first_digit(amount) {
                if (1..=9).contains(&digit) {
                    digit_counts[(digit - 1) as usize] += 1;
                }
            }
        }

        // Digit 1 should be around 30%; the band is wide because clamping and
        // rounding in the sampler shift some leading digits.
        let digit_1_pct = digit_counts[0] as f64 / iterations as f64;
        assert!(
            digit_1_pct > 0.15 && digit_1_pct < 0.50,
            "Digit 1 should be ~30%, got {:.1}%",
            digit_1_pct * 100.0
        );

        // Digit 9 should be around 5%.
        let digit_9_pct = digit_counts[8] as f64 / iterations as f64;
        assert!(
            digit_9_pct > 0.02 && digit_9_pct < 0.10,
            "Digit 9 should be ~5%, got {:.1}%",
            digit_9_pct * 100.0
        );
    }

    /// Threshold-adjacent amounts must stay below, but close to, the threshold.
    #[test]
    fn test_threshold_adjacent_below_threshold() {
        let config = AmountDistributionConfig::default();
        let threshold_config = ThresholdConfig {
            thresholds: vec![10000.0],
            min_below_pct: 0.01,
            max_below_pct: 0.15,
        };
        let mut gen = FraudAmountGenerator::new(42, config, threshold_config);

        for _ in 0..100 {
            let amount = gen.sample(FraudAmountPattern::ThresholdAdjacent);
            let f = amount.to_string().parse::<f64>().unwrap();
            assert!(f < 10000.0, "Amount {} should be below threshold 10000", f);
            // Account for noise factor (up to 0.5%) and rounding
            assert!(
                f >= 8400.0,
                "Amount {} should be approximately within 15% of threshold",
                f
            );
        }
    }

    /// Obvious-round amounts must be a multiple of 1,000, one cent below such
    /// a multiple, or a value clamped onto a configured bound.
    #[test]
    fn test_obvious_round_numbers() {
        let config = AmountDistributionConfig::default();
        let (min_amount, max_amount) = (config.min_amount, config.max_amount);
        let threshold_config = ThresholdConfig::default();
        let mut gen = FraudAmountGenerator::new(42, config, threshold_config);

        for _ in 0..100 {
            let amount = gen.sample(FraudAmountPattern::ObviousRoundNumbers);
            let f = amount.to_string().parse::<f64>().unwrap();

            // Compare in whole cents so floating-point noise cannot flip the
            // modulo checks. Every generated shape is a multiple of 1,000.00
            // or exactly one cent beneath one.
            let cents = (f * 100.0).round() as i64;
            let is_round = cents % 100_000 == 0;
            let is_just_under = (cents + 1) % 100_000 == 0;
            let was_clamped = f == min_amount || f == max_amount;

            assert!(
                is_round || is_just_under || was_clamped,
                "Amount {} should be a suspicious round number",
                f
            );
        }
    }

    /// First-digit extraction skips leading zeros and the decimal point.
    #[test]
    fn test_get_first_digit() {
        assert_eq!(get_first_digit(Decimal::from(123)), Some(1));
        assert_eq!(get_first_digit(Decimal::from(999)), Some(9));
        assert_eq!(get_first_digit(Decimal::from(50000)), Some(5));
        assert_eq!(
            get_first_digit(Decimal::from_str_exact("0.00123").unwrap()),
            Some(1)
        );
    }

    /// The second-digit probability table must describe a full distribution.
    #[test]
    fn test_second_digit_probabilities_sum_to_one() {
        let sum: f64 = BENFORD_SECOND_DIGIT_PROBABILITIES.iter().sum();
        assert!(
            (sum - 1.0).abs() < 0.001,
            "Second digit probabilities sum to {}, expected 1.0",
            sum
        );
    }

    /// Spot-check first-two-digit probabilities and their total.
    #[test]
    fn test_first_two_probability() {
        // P(10) = log10(1 + 1/10) = log10(1.1) ≈ 0.0414
        let p10 = benford_first_two_probability(1, 0);
        assert!((p10 - 0.0414).abs() < 0.001);

        // P(99) = log10(1 + 1/99) ≈ 0.00436
        let p99 = benford_first_two_probability(9, 9);
        assert!((p99 - 0.00436).abs() < 0.0001);

        // Sum of all first-two probabilities should be 1.0
        let probs = benford_first_two_probabilities();
        let sum: f64 = probs.iter().sum();
        assert!((sum - 1.0).abs() < 0.001);
    }

    /// First-two-digit extraction skips leading zeros and the decimal point.
    #[test]
    fn test_get_first_two_digits() {
        assert_eq!(get_first_two_digits(Decimal::from(123)), Some((1, 2)));
        assert_eq!(get_first_two_digits(Decimal::from(999)), Some((9, 9)));
        assert_eq!(get_first_two_digits(Decimal::from(50000)), Some((5, 0)));
        assert_eq!(
            get_first_two_digits(Decimal::from_str_exact("0.00123").unwrap()),
            Some((1, 2))
        );
    }

    /// The enhanced sampler should yield amounts with varied second digits.
    #[test]
    fn test_enhanced_benford_sampler() {
        let config = EnhancedBenfordConfig {
            amount_config: AmountDistributionConfig::default(),
            second_digit_compliance: true,
            first_two_digit_compliance: false,
        };
        let mut sampler = EnhancedBenfordSampler::new(42, config);

        let mut digit_counts = [0u32; 10];
        for _ in 0..10000 {
            let amount = sampler.sample();
            if let Some((_, d2)) = get_first_two_digits(amount) {
                digit_counts[d2 as usize] += 1;
            }
        }

        // Note: The second digit distribution depends on amount generation and
        // magnitude selection, which may skew results. Just verify the sampler runs
        // and produces valid amounts.
        let total_valid = digit_counts.iter().sum::<u32>();
        assert!(
            total_valid > 9000,
            "Most samples should have valid first two digits"
        );

        // Verify we have some distribution of second digits (not all the same)
        let max_count = *digit_counts.iter().max().unwrap();
        assert!(
            max_count < total_valid / 2,
            "Second digits should have some variety, max count: {}",
            max_count
        );
    }

    /// Threshold clustering at full intensity stays just under the threshold.
    #[test]
    fn test_benford_deviation_sampler() {
        let config = BenfordDeviationConfig {
            deviation_type: BenfordDeviationType::ThresholdClustering,
            intensity: 1.0,
            amount_config: AmountDistributionConfig::default(),
            thresholds: vec![10000.0],
        };
        let mut sampler = BenfordDeviationSampler::new(42, config);

        for _ in 0..100 {
            let amount = sampler.sample();
            let f: f64 = amount.to_string().parse().unwrap();
            // Should be below threshold
            assert!(f < 10000.0, "Amount {} should be below 10000", f);
            // Should be within ~20% of threshold (1-15% below + noise)
            assert!(f > 8000.0, "Amount {} should be near threshold 10000", f);
        }
    }

    /// Round-number bias at full intensity over-represents digits 1 or 5.
    #[test]
    fn test_benford_deviation_round_bias() {
        let config = BenfordDeviationConfig {
            deviation_type: BenfordDeviationType::RoundNumberBias,
            intensity: 1.0,
            amount_config: AmountDistributionConfig::default(),
            thresholds: vec![],
        };
        let mut sampler = BenfordDeviationSampler::new(42, config);

        let mut digit_counts = [0u32; 9];
        for _ in 0..1000 {
            let amount = sampler.sample();
            if let Some(d) = get_first_digit(amount) {
                if (1..=9).contains(&d) {
                    digit_counts[(d - 1) as usize] += 1;
                }
            }
        }

        // Digits 1 and 5 should be overrepresented
        let d1_pct = digit_counts[0] as f64 / 1000.0;
        let d5_pct = digit_counts[4] as f64 / 1000.0;

        // Should be higher than Benford expects
        assert!(d1_pct > 0.35 || d5_pct > 0.10);
    }
}
datasynth_core/distributions/benford.rs

datasynth_core/distributions/
benford.rs