datasynth_core/distributions/
benford.rs

1//! Benford's Law distribution sampler and fraud amount patterns.
2//!
3//! Implements Benford's Law compliant amount generation and various fraud
4//! amount patterns for realistic synthetic accounting data. Includes enhanced
5//! multi-digit Benford analysis and deviation patterns for anomaly injection.
6
7use rand::prelude::*;
8use rand_chacha::ChaCha8Rng;
9use rand_distr::{Distribution, LogNormal};
10use rust_decimal::Decimal;
11use serde::{Deserialize, Serialize};
12
13use super::AmountDistributionConfig;
14
/// Benford's Law probabilities for the first significant digit (1-9).
///
/// Entry `d - 1` holds `P(d) = log10(1 + 1/d)` for first digit `d`, written
/// as a literal rounded to five decimal places (so the table is an
/// approximation, not the exact logarithm).
///
/// Note: the values are spelled out explicitly, which is why
/// `clippy::approx_constant` is allowed here — the first entry approximates
/// `log10(2)`.
#[allow(clippy::approx_constant)]
pub const BENFORD_PROBABILITIES: [f64; 9] = [
    0.30103, // 1: 30.1% - log10(2)
    0.17609, // 2: 17.6%
    0.12494, // 3: 12.5%
    0.09691, // 4: 9.7%
    0.07918, // 5: 7.9%
    0.06695, // 6: 6.7%
    0.05799, // 7: 5.8%
    0.05115, // 8: 5.1%
    0.04576, // 9: 4.6%
];
30
/// Cumulative distribution function for first-digit Benford's Law.
///
/// Entry `d - 1` is the probability that the first digit is at most `d`,
/// i.e. the running sum of [`BENFORD_PROBABILITIES`], rounded to five decimal
/// places. The last entry is exactly `1.0`.
///
/// Note: the values are spelled out explicitly, which is why
/// `clippy::approx_constant` is allowed here — the first entry approximates
/// `log10(2)`.
#[allow(clippy::approx_constant)]
pub const BENFORD_CDF: [f64; 9] = [
    0.30103, // 1 - log10(2)
    0.47712, // 1-2
    0.60206, // 1-3
    0.69897, // 1-4
    0.77815, // 1-5
    0.84510, // 1-6
    0.90309, // 1-7
    0.95424, // 1-8
    1.00000, // 1-9
];
45
/// Benford's Law probabilities for the second significant digit (0-9).
///
/// Entry `d2` holds `P(d2) = sum over d1 in 1..=9 of log10(1 + 1/(10*d1 + d2))`,
/// written as a literal rounded to five decimal places. Unlike the first-digit
/// table, this one is indexed directly by the digit because `0` is a valid
/// second digit.
#[allow(clippy::approx_constant)]
pub const BENFORD_SECOND_DIGIT_PROBABILITIES: [f64; 10] = [
    0.11968, // 0: 12.0%
    0.11389, // 1: 11.4%
    0.10882, // 2: 10.9%
    0.10433, // 3: 10.4%
    0.10031, // 4: 10.0%
    0.09668, // 5: 9.7%
    0.09337, // 6: 9.3%
    0.09035, // 7: 9.0%
    0.08757, // 8: 8.8%
    0.08500, // 9: 8.5%
];
61
/// Cumulative distribution function for second-digit Benford's Law.
///
/// Entry `d2` is the probability that the second significant digit is at most
/// `d2`. The last entry is exactly `1.0`.
pub const BENFORD_SECOND_DIGIT_CDF: [f64; 10] = [
    0.11968, // 0
    0.23357, // 0-1
    0.34239, // 0-2
    0.44672, // 0-3
    0.54703, // 0-4
    0.64371, // 0-5
    0.73708, // 0-6
    0.82743, // 0-7
    0.91500, // 0-8
    1.00000, // 0-9
];
66
/// Benford's Law probability that a number starts with the two digits `d1 d2`.
///
/// Computes `log10(1 + 1/(10*d1 + d2))` for the leading pair (10-99).
/// Returns `0.0` when `d1` is not in `1..=9` or `d2` is greater than `9`.
pub fn benford_first_two_probability(d1: u8, d2: u8) -> f64 {
    match (d1, d2) {
        (1..=9, 0..=9) => {
            let leading_pair = f64::from(d1) * 10.0 + f64::from(d2);
            (1.0 + 1.0 / leading_pair).log10()
        }
        // Not a valid pair of leading digits.
        _ => 0.0,
    }
}
76
77/// Get all first-two-digit probabilities as a 90-element array (10-99).
78pub fn benford_first_two_probabilities() -> [f64; 90] {
79    let mut probs = [0.0; 90];
80    for d1 in 1..=9 {
81        for d2 in 0..=9 {
82            let idx = (d1 - 1) * 10 + d2;
83            probs[idx as usize] = benford_first_two_probability(d1, d2);
84        }
85    }
86    probs
87}
88
/// Anti-Benford first-digit distribution for generating statistically
/// improbable amounts.
///
/// Entry `d - 1` is the probability of first digit `d`; the entries sum to
/// `1.0`. Digits 5, 7 and 9 are overweighted and digits 1-3 are heavily
/// underweighted relative to [`BENFORD_PROBABILITIES`].
pub const ANTI_BENFORD_PROBABILITIES: [f64; 9] = [
    0.05, // 1: 5% (normally 30%)
    0.05, // 2: 5% (normally 18%)
    0.05, // 3: 5% (normally 12%)
    0.10, // 4: 10%
    0.25, // 5: 25% (normally 8%)
    0.10, // 6: 10%
    0.20, // 7: 20% (normally 6%)
    0.05, // 8: 5%
    0.15, // 9: 15% (normally 5%)
];
102
/// Fraud amount pattern types.
///
/// Selects which generation strategy `FraudAmountGenerator::sample` uses.
/// Variants are (de)serialized in `snake_case`; [`FraudAmountPattern::Normal`]
/// is the default.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FraudAmountPattern {
    /// Normal amount generation (first digit drawn from the Benford
    /// distribution).
    #[default]
    Normal,
    /// Statistically improbable first digits (anti-Benford).
    /// Excess of leading 5s, 7s, 9s - detectable via statistical analysis.
    StatisticallyImprobable,
    /// Obvious round numbers ($50,000.00, $99,999.99).
    /// Easy to spot in visual review.
    ObviousRoundNumbers,
    /// Amounts clustered just below approval thresholds.
    /// Classic split-transaction pattern.
    ThresholdAdjacent,
}
120
/// Configuration for the threshold-adjacent fraud pattern.
///
/// The two `*_below_pct` fields are fractions of the chosen threshold, not
/// percentages in the 0-100 sense (`0.01` means 1%). `min_below_pct` is
/// expected to be strictly smaller than `max_below_pct`, since the generator
/// draws from the half-open range between them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThresholdConfig {
    /// Approval thresholds to cluster below; one is picked at random per sample.
    pub thresholds: Vec<f64>,
    /// Minimum fraction below the threshold (e.g., 0.01 = 1%).
    pub min_below_pct: f64,
    /// Maximum fraction below the threshold (e.g., 0.15 = 15%).
    pub max_below_pct: f64,
}
131
132impl Default for ThresholdConfig {
133    fn default() -> Self {
134        Self {
135            thresholds: vec![1000.0, 5000.0, 10000.0, 25000.0, 50000.0, 100000.0],
136            min_below_pct: 0.01,
137            max_below_pct: 0.15,
138        }
139    }
140}
141
/// Sampler that produces amounts following Benford's Law distribution.
///
/// The sampler derives the order-of-magnitude of each amount from the
/// configured log-normal distribution (parameterised by
/// `lognormal_mu` / `lognormal_sigma` on
/// [`AmountDistributionConfig`]) and then overrides the first
/// significant digit to match the requested Benford / anti-Benford
/// distribution.  This produces realistic per-amount magnitudes that
/// match routine traffic, while preserving the first-digit
/// statistical signature that fraud-detection consumers care about.
///
/// (Earlier versions sampled magnitude uniformly across
/// `[log10(min_amount), log10(max_amount)]`.  That over-represented
/// the high tail and caused account-level balances to drift into
/// numerically unsafe territory under audit-group + 12-month +
/// `hundred_k` retail configs — see GitHub issue #185.)
pub struct BenfordSampler {
    /// Deterministic RNG, seeded in `new` and re-seeded by `reset`.
    rng: ChaCha8Rng,
    /// Amount bounds, rounding probabilities and decimal places applied to
    /// every sampled amount.
    config: AmountDistributionConfig,
    /// Underlying log-normal that supplies realistic magnitudes for
    /// `sample_with_first_digit`.  Identical parameters to the
    /// parent `AmountSampler::lognormal` so fraud and routine
    /// amounts share the same magnitude distribution.
    lognormal: LogNormal<f64>,
}
167
168impl BenfordSampler {
169    /// Create a new Benford sampler with the given seed and amount configuration.
170    pub fn new(seed: u64, config: AmountDistributionConfig) -> Self {
171        let lognormal = LogNormal::new(config.lognormal_mu, config.lognormal_sigma)
172            .expect("Invalid log-normal parameters in BenfordSampler");
173        Self {
174            rng: ChaCha8Rng::seed_from_u64(seed),
175            config,
176            lognormal,
177        }
178    }
179
180    /// Sample a first digit according to Benford's Law.
181    fn sample_benford_first_digit(&mut self) -> u8 {
182        let p: f64 = self.rng.random();
183        for (i, &cumulative) in BENFORD_CDF.iter().enumerate() {
184            if p < cumulative {
185                return (i + 1) as u8;
186            }
187        }
188        9
189    }
190
191    /// Sample a first digit from the anti-Benford distribution.
192    fn sample_anti_benford_first_digit(&mut self) -> u8 {
193        let p: f64 = self.rng.random();
194        let mut cumulative = 0.0;
195        for (i, &prob) in ANTI_BENFORD_PROBABILITIES.iter().enumerate() {
196            cumulative += prob;
197            if p < cumulative {
198                return (i + 1) as u8;
199            }
200        }
201        9
202    }
203
204    /// Sample an amount following Benford's Law.
205    pub fn sample(&mut self) -> Decimal {
206        let first_digit = self.sample_benford_first_digit();
207        self.sample_with_first_digit(first_digit)
208    }
209
210    /// Sample an amount with a specific first digit.
211    ///
212    /// The order-of-magnitude is drawn from the parent log-normal
213    /// distribution (clamped to `[min_amount, max_amount]`), which
214    /// matches the magnitude distribution of routine traffic.  The
215    /// first significant digit is then forced to `first_digit`,
216    /// preserving the Benford / anti-Benford statistical signature
217    /// that consumers expect.  This avoids the uniform-magnitude
218    /// over-representation of the high tail that previously inflated
219    /// account-level balances into the 10^15+ range under
220    /// audit-group + multi-period retail configs (issue #185).
221    pub fn sample_with_first_digit(&mut self, first_digit: u8) -> Decimal {
222        let first_digit = first_digit.clamp(1, 9);
223
224        // Sample a realistic raw magnitude from the parent log-normal.
225        // The clamp to `[min_amount, max_amount]` matches the bounds
226        // applied throughout the rest of `AmountSampler`.
227        let raw_amount = self
228            .lognormal
229            .sample(&mut self.rng)
230            .clamp(self.config.min_amount, self.config.max_amount);
231        let magnitude = raw_amount.log10().floor() as i32;
232        let base = 10_f64.powi(magnitude);
233
234        // Generate the remaining digits (0.0 to 0.999...)
235        let remaining: f64 = self.rng.random();
236
237        // Construct: first_digit.remaining * 10^magnitude
238        let mantissa = first_digit as f64 + remaining;
239        let mut amount = mantissa * base;
240
241        // Clamp to configured range
242        amount = amount.clamp(self.config.min_amount, self.config.max_amount);
243
244        // Apply round number bias (25% chance)
245        let p: f64 = self.rng.random();
246        if p < self.config.round_number_probability {
247            // Round to nearest whole number ending in 00
248            amount = (amount / 100.0).round() * 100.0;
249        } else if p < self.config.round_number_probability + self.config.nice_number_probability {
250            // Round to nearest 5 or 10
251            amount = (amount / 5.0).round() * 5.0;
252        }
253
254        // Round to configured decimal places
255        let decimal_multiplier = 10_f64.powi(self.config.decimal_places as i32);
256        amount = (amount * decimal_multiplier).round() / decimal_multiplier;
257
258        // Ensure minimum after rounding
259        amount = amount.max(self.config.min_amount);
260
261        Decimal::from_f64_retain(amount).unwrap_or(Decimal::ONE)
262    }
263
264    /// Reset the sampler with a new seed.
265    pub fn reset(&mut self, seed: u64) {
266        self.rng = ChaCha8Rng::seed_from_u64(seed);
267    }
268}
269
/// Generator for fraudulent amount patterns.
///
/// Dispatches on a [`FraudAmountPattern`] to produce either Benford-style
/// amounts (via the embedded [`BenfordSampler`]) or one of the dedicated
/// fraud shapes.
pub struct FraudAmountGenerator {
    /// RNG used directly by the round-number and threshold-adjacent patterns.
    rng: ChaCha8Rng,
    /// Sampler used for the normal and anti-Benford patterns; it owns its
    /// own RNG.
    benford_sampler: BenfordSampler,
    /// Thresholds and percentage range for the threshold-adjacent pattern.
    threshold_config: ThresholdConfig,
    /// Amount bounds used to clamp generated amounts.
    config: AmountDistributionConfig,
}
277
278impl FraudAmountGenerator {
279    /// Create a new fraud amount generator.
280    pub fn new(
281        seed: u64,
282        config: AmountDistributionConfig,
283        threshold_config: ThresholdConfig,
284    ) -> Self {
285        Self {
286            rng: ChaCha8Rng::seed_from_u64(seed),
287            benford_sampler: BenfordSampler::new(seed + 1, config.clone()),
288            threshold_config,
289            config,
290        }
291    }
292
293    /// Generate an amount with the specified fraud pattern.
294    pub fn sample(&mut self, pattern: FraudAmountPattern) -> Decimal {
295        match pattern {
296            FraudAmountPattern::Normal => self.benford_sampler.sample(),
297            FraudAmountPattern::StatisticallyImprobable => self.sample_anti_benford(),
298            FraudAmountPattern::ObviousRoundNumbers => self.sample_obvious_round(),
299            FraudAmountPattern::ThresholdAdjacent => self.sample_threshold_adjacent(),
300        }
301    }
302
303    /// Generate an amount with statistically improbable first digit distribution.
304    fn sample_anti_benford(&mut self) -> Decimal {
305        let first_digit = self.benford_sampler.sample_anti_benford_first_digit();
306        self.benford_sampler.sample_with_first_digit(first_digit)
307    }
308
309    /// Generate an obvious round number amount (suspicious pattern).
310    fn sample_obvious_round(&mut self) -> Decimal {
311        let pattern_choice = self.rng.random_range(0..5);
312
313        let amount = match pattern_choice {
314            // Even thousands ($1,000, $5,000, $10,000, etc.)
315            0 => {
316                let multiplier = self.rng.random_range(1..100);
317                multiplier as f64 * 1000.0
318            }
319            // $X9,999.99 pattern (just under round number)
320            1 => {
321                let base = self.rng.random_range(1..10) as f64 * 10000.0;
322                base - 0.01
323            }
324            // Exact $X0,000.00 pattern
325            2 => {
326                let multiplier = self.rng.random_range(1..20);
327                multiplier as f64 * 10000.0
328            }
329            // Five-thousands ($5,000, $15,000, $25,000)
330            3 => {
331                let multiplier = self.rng.random_range(1..40);
332                multiplier as f64 * 5000.0
333            }
334            // $X,999.99 pattern
335            _ => {
336                let base = self.rng.random_range(1..100) as f64 * 1000.0;
337                base - 0.01
338            }
339        };
340
341        // Clamp to config range
342        let clamped = amount.clamp(self.config.min_amount, self.config.max_amount);
343        Decimal::from_f64_retain(clamped).unwrap_or(Decimal::ONE)
344    }
345
346    /// Generate an amount just below an approval threshold.
347    fn sample_threshold_adjacent(&mut self) -> Decimal {
348        // Select a threshold
349        let threshold = if self.threshold_config.thresholds.is_empty() {
350            10000.0
351        } else {
352            *self
353                .threshold_config
354                .thresholds
355                .choose(&mut self.rng)
356                .unwrap_or(&10000.0)
357        };
358
359        // Calculate amount as percentage below threshold
360        let pct_below = self
361            .rng
362            .random_range(self.threshold_config.min_below_pct..self.threshold_config.max_below_pct);
363        let base_amount = threshold * (1.0 - pct_below);
364
365        // Add small noise to avoid exact patterns
366        let noise_factor = 1.0 + self.rng.random_range(-0.005..0.005);
367        let amount = base_amount * noise_factor;
368
369        // Round to 2 decimal places
370        let rounded = (amount * 100.0).round() / 100.0;
371
372        // Ensure we're still below threshold
373        let final_amount = rounded.min(threshold - 0.01);
374        let clamped = final_amount.clamp(self.config.min_amount, self.config.max_amount);
375
376        Decimal::from_f64_retain(clamped).unwrap_or(Decimal::ONE)
377    }
378
379    /// Reset the generator with a new seed.
380    pub fn reset(&mut self, seed: u64) {
381        self.rng = ChaCha8Rng::seed_from_u64(seed);
382        self.benford_sampler.reset(seed + 1);
383    }
384}
385
386/// Extract the first digit from a decimal amount.
387pub fn get_first_digit(amount: Decimal) -> Option<u8> {
388    let s = amount.to_string();
389    s.chars()
390        .find(|c| c.is_ascii_digit() && *c != '0')
391        .and_then(|c| c.to_digit(10))
392        .map(|d| d as u8)
393}
394
395/// Extract the first two digits from a decimal amount.
396pub fn get_first_two_digits(amount: Decimal) -> Option<(u8, u8)> {
397    let s = amount.abs().to_string();
398    let mut first_found = false;
399    let mut first_digit = 0u8;
400
401    for c in s.chars() {
402        if c.is_ascii_digit() {
403            let d = c
404                .to_digit(10)
405                .expect("digit char confirmed by is_ascii_digit") as u8;
406            if !first_found && d != 0 {
407                first_digit = d;
408                first_found = true;
409            } else if first_found && c != '.' {
410                return Some((first_digit, d));
411            }
412        }
413    }
414    None
415}
416
/// Configuration for enhanced Benford sampling with multi-digit compliance.
///
/// When both flags are set, `first_two_digit_compliance` takes precedence in
/// `EnhancedBenfordSampler::sample`. When neither is set, only the first
/// digit follows Benford's Law and the second digit is uniform.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EnhancedBenfordConfig {
    /// Base amount distribution configuration (bounds, log-normal
    /// parameters, decimal places).
    pub amount_config: AmountDistributionConfig,
    /// Whether to enforce second-digit Benford compliance.
    /// Defaults to `false` when absent from the serialized form.
    #[serde(default)]
    pub second_digit_compliance: bool,
    /// Whether to enforce first-two-digit Benford compliance.
    /// Defaults to `false` when absent from the serialized form.
    #[serde(default)]
    pub first_two_digit_compliance: bool,
}
429
/// Enhanced Benford sampler with multi-digit compliance.
///
/// Like [`BenfordSampler`], the magnitude of each amount is drawn
/// from the configured log-normal distribution rather than uniformly
/// across the magnitude range — see issue #185.
pub struct EnhancedBenfordSampler {
    /// Deterministic RNG, seeded in `new` and re-seeded by `reset`.
    rng: ChaCha8Rng,
    /// Compliance flags plus the base amount configuration.
    config: EnhancedBenfordConfig,
    /// Pre-computed CDF for first two digits (index 0 = "10", index 89 = "99").
    first_two_cdf: [f64; 90],
    /// Underlying log-normal that supplies realistic magnitudes.
    lognormal: LogNormal<f64>,
}
443
444impl EnhancedBenfordSampler {
445    /// Create a new enhanced Benford sampler.
446    pub fn new(seed: u64, config: EnhancedBenfordConfig) -> Self {
447        // Pre-compute CDF for first two digits
448        let probs = benford_first_two_probabilities();
449        let mut first_two_cdf = [0.0; 90];
450        let mut cumulative = 0.0;
451        for i in 0..90 {
452            cumulative += probs[i];
453            first_two_cdf[i] = cumulative;
454        }
455
456        let lognormal = LogNormal::new(
457            config.amount_config.lognormal_mu,
458            config.amount_config.lognormal_sigma,
459        )
460        .expect("Invalid log-normal parameters in EnhancedBenfordSampler");
461
462        Self {
463            rng: ChaCha8Rng::seed_from_u64(seed),
464            config,
465            first_two_cdf,
466            lognormal,
467        }
468    }
469
470    /// Sample first two digits according to Benford's Law.
471    fn sample_first_two_digits(&mut self) -> (u8, u8) {
472        let p: f64 = self.rng.random();
473        for (i, &cdf) in self.first_two_cdf.iter().enumerate() {
474            if p < cdf {
475                let d1 = (i / 10 + 1) as u8;
476                let d2 = (i % 10) as u8;
477                return (d1, d2);
478            }
479        }
480        (9, 9)
481    }
482
483    /// Sample a second digit according to Benford's Law.
484    fn sample_second_digit(&mut self) -> u8 {
485        let p: f64 = self.rng.random();
486        for (i, &cdf) in BENFORD_SECOND_DIGIT_CDF.iter().enumerate() {
487            if p < cdf {
488                return i as u8;
489            }
490        }
491        9
492    }
493
494    /// Sample a first digit according to Benford's Law.
495    fn sample_first_digit(&mut self) -> u8 {
496        let p: f64 = self.rng.random();
497        for (i, &cdf) in BENFORD_CDF.iter().enumerate() {
498            if p < cdf {
499                return (i + 1) as u8;
500            }
501        }
502        9
503    }
504
505    /// Sample an amount with enhanced Benford compliance.
506    pub fn sample(&mut self) -> Decimal {
507        let (first_digit, second_digit) = if self.config.first_two_digit_compliance {
508            self.sample_first_two_digits()
509        } else if self.config.second_digit_compliance {
510            (self.sample_first_digit(), self.sample_second_digit())
511        } else {
512            (
513                self.sample_first_digit(),
514                self.rng.random_range(0..10) as u8,
515            )
516        };
517
518        self.sample_with_digits(first_digit, second_digit)
519    }
520
521    /// Sample an amount with specific first two digits.
522    ///
523    /// Magnitude is drawn from the parent log-normal (see
524    /// [`BenfordSampler::sample_with_first_digit`] for the rationale).
525    fn sample_with_digits(&mut self, first_digit: u8, second_digit: u8) -> Decimal {
526        let first_digit = first_digit.clamp(1, 9);
527        let second_digit = second_digit.clamp(0, 9);
528
529        // Sample a realistic raw magnitude from the parent log-normal.
530        let raw_amount = self.lognormal.sample(&mut self.rng).clamp(
531            self.config.amount_config.min_amount,
532            self.config.amount_config.max_amount,
533        );
534        let magnitude = raw_amount.log10().floor() as i32;
535        let base = 10_f64.powi(magnitude - 1); // -1 because first two digits span 10-99
536
537        // Generate the remaining digits (0.0 to 0.99...)
538        let remaining: f64 = self.rng.random();
539
540        // Construct the amount: (first_digit * 10 + second_digit + remaining) * base
541        let mantissa = (first_digit as f64) * 10.0 + (second_digit as f64) + remaining;
542        let mut amount = mantissa * base;
543
544        // Clamp to configured range
545        amount = amount.clamp(
546            self.config.amount_config.min_amount,
547            self.config.amount_config.max_amount,
548        );
549
550        // Round to configured decimal places
551        let decimal_multiplier = 10_f64.powi(self.config.amount_config.decimal_places as i32);
552        amount = (amount * decimal_multiplier).round() / decimal_multiplier;
553
554        Decimal::from_f64_retain(amount).unwrap_or(Decimal::ONE)
555    }
556
557    /// Reset the sampler with a new seed.
558    pub fn reset(&mut self, seed: u64) {
559        self.rng = ChaCha8Rng::seed_from_u64(seed);
560    }
561}
562
563/// Types of Benford deviation patterns for anomaly injection.
564#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
565#[serde(rename_all = "snake_case")]
566#[derive(Default)]
567pub enum BenfordDeviationType {
568    /// Round number bias (excess of digits 1, 5, 0 in second position)
569    #[default]
570    RoundNumberBias,
571    /// Threshold clustering (amounts just below round thresholds)
572    ThresholdClustering,
573    /// Uniform first digit (equal probability for all first digits)
574    UniformFirstDigit,
575    /// Excess of specific digit
576    DigitBias { digit: u8 },
577    /// Trailing zeros pattern (prices ending in .00)
578    TrailingZeros,
579}
580
/// Configuration for Benford deviation sampling (for anomaly injection).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenfordDeviationConfig {
    /// Type of deviation pattern
    pub deviation_type: BenfordDeviationType,
    /// Intensity of deviation (0.0 = Benford compliant, 1.0 = full deviation).
    /// Each sample uses the deviation pattern unless a uniform draw in
    /// `[0, 1)` exceeds this value. Defaults to `0.5` when absent from the
    /// serialized form.
    #[serde(default = "default_intensity")]
    pub intensity: f64,
    /// Base amount configuration
    pub amount_config: AmountDistributionConfig,
    /// Thresholds for threshold clustering (if applicable). Defaults to six
    /// thresholds from 1,000 to 100,000 when absent from the serialized form.
    #[serde(default = "default_thresholds")]
    pub thresholds: Vec<f64>,
}
595
/// Serde default for `BenfordDeviationConfig::intensity`: half of the samples
/// deviate from Benford's Law.
fn default_intensity() -> f64 {
    const HALF_INTENSITY: f64 = 0.5;
    HALF_INTENSITY
}
599
/// Serde default for `BenfordDeviationConfig::thresholds`: six approval
/// thresholds from 1,000 to 100,000.
fn default_thresholds() -> Vec<f64> {
    [1000.0, 5000.0, 10000.0, 25000.0, 50000.0, 100000.0].to_vec()
}
603
604impl Default for BenfordDeviationConfig {
605    fn default() -> Self {
606        Self {
607            deviation_type: BenfordDeviationType::RoundNumberBias,
608            intensity: 0.5,
609            amount_config: AmountDistributionConfig::default(),
610            thresholds: default_thresholds(),
611        }
612    }
613}
614
/// Sampler for generating amounts that deviate from Benford's Law.
/// Useful for injecting statistically detectable anomalies.
pub struct BenfordDeviationSampler {
    /// RNG that decides, per sample, whether to deviate and how.
    rng: ChaCha8Rng,
    /// Deviation type, intensity, amount bounds and thresholds.
    config: BenfordDeviationConfig,
    /// Sampler used for Benford-compliant amounts and for building amounts
    /// with a chosen first digit; it owns a separate RNG.
    benford_sampler: BenfordSampler,
}
622
623impl BenfordDeviationSampler {
624    /// Create a new Benford deviation sampler.
625    pub fn new(seed: u64, config: BenfordDeviationConfig) -> Self {
626        Self {
627            rng: ChaCha8Rng::seed_from_u64(seed),
628            benford_sampler: BenfordSampler::new(seed + 100, config.amount_config.clone()),
629            config,
630        }
631    }
632
633    /// Sample an amount with the configured deviation pattern.
634    pub fn sample(&mut self) -> Decimal {
635        // With probability (1 - intensity), sample from normal Benford
636        let p: f64 = self.rng.random();
637        if p > self.config.intensity {
638            return self.benford_sampler.sample();
639        }
640
641        // Apply deviation pattern
642        match self.config.deviation_type {
643            BenfordDeviationType::RoundNumberBias => self.sample_round_bias(),
644            BenfordDeviationType::ThresholdClustering => self.sample_threshold_cluster(),
645            BenfordDeviationType::UniformFirstDigit => self.sample_uniform_first_digit(),
646            BenfordDeviationType::DigitBias { digit } => self.sample_digit_bias(digit),
647            BenfordDeviationType::TrailingZeros => self.sample_trailing_zeros(),
648        }
649    }
650
651    /// Sample with round number bias.
652    fn sample_round_bias(&mut self) -> Decimal {
653        // Bias towards first digits 1 and 5
654        let first_digit = if self.rng.random_bool(0.6) {
655            if self.rng.random_bool(0.7) {
656                1
657            } else {
658                5
659            }
660        } else {
661            self.rng.random_range(1..=9)
662        };
663
664        // Bias towards second digits 0 and 5
665        let _second_digit = if self.rng.random_bool(0.5) {
666            if self.rng.random_bool(0.6) {
667                0
668            } else {
669                5
670            }
671        } else {
672            self.rng.random_range(0..=9)
673        };
674
675        self.benford_sampler.sample_with_first_digit(first_digit)
676    }
677
678    /// Sample clustering just below thresholds.
679    fn sample_threshold_cluster(&mut self) -> Decimal {
680        let threshold = self
681            .config
682            .thresholds
683            .choose(&mut self.rng)
684            .copied()
685            .unwrap_or(10000.0);
686
687        // Generate amount 1-15% below threshold
688        let pct_below = self.rng.random_range(0.01..0.15);
689        let amount = threshold * (1.0 - pct_below);
690
691        // Add small noise
692        let noise = 1.0 + self.rng.random_range(-0.005..0.005);
693        let final_amount = (amount * noise * 100.0).round() / 100.0;
694
695        Decimal::from_f64_retain(final_amount.clamp(
696            self.config.amount_config.min_amount,
697            self.config.amount_config.max_amount,
698        ))
699        .unwrap_or(Decimal::ONE)
700    }
701
702    /// Sample with uniform first digit distribution.
703    fn sample_uniform_first_digit(&mut self) -> Decimal {
704        let first_digit = self.rng.random_range(1..=9);
705        self.benford_sampler.sample_with_first_digit(first_digit)
706    }
707
708    /// Sample with bias towards a specific digit.
709    fn sample_digit_bias(&mut self, target_digit: u8) -> Decimal {
710        let digit = target_digit.clamp(1, 9);
711        // 70% chance of using the biased digit
712        let first_digit = if self.rng.random_bool(0.7) {
713            digit
714        } else {
715            self.rng.random_range(1..=9)
716        };
717        self.benford_sampler.sample_with_first_digit(first_digit)
718    }
719
720    /// Sample with trailing zeros pattern (prices ending in .00).
721    fn sample_trailing_zeros(&mut self) -> Decimal {
722        let amount = self.benford_sampler.sample();
723        let amount_f64: f64 = amount.to_string().parse().unwrap_or(0.0);
724
725        // Round to whole dollars
726        let rounded = amount_f64.round();
727        Decimal::from_f64_retain(rounded.clamp(
728            self.config.amount_config.min_amount,
729            self.config.amount_config.max_amount,
730        ))
731        .unwrap_or(Decimal::ONE)
732    }
733
734    /// Reset the sampler with a new seed.
735    pub fn reset(&mut self, seed: u64) {
736        self.rng = ChaCha8Rng::seed_from_u64(seed);
737        self.benford_sampler.reset(seed + 100);
738    }
739}
740
741#[cfg(test)]
742#[allow(clippy::unwrap_used)]
743mod tests {
744    use super::*;
745
746    #[test]
747    fn test_benford_probabilities_sum_to_one() {
748        let sum: f64 = BENFORD_PROBABILITIES.iter().sum();
749        assert!(
750            (sum - 1.0).abs() < 0.001,
751            "Benford probabilities sum to {}, expected 1.0",
752            sum
753        );
754    }
755
756    #[test]
757    fn test_benford_cdf_ends_at_one() {
758        assert!(
759            (BENFORD_CDF[8] - 1.0).abs() < 0.0001,
760            "CDF should end at 1.0"
761        );
762    }
763
764    #[test]
765    fn test_anti_benford_probabilities_sum_to_one() {
766        let sum: f64 = ANTI_BENFORD_PROBABILITIES.iter().sum();
767        assert!(
768            (sum - 1.0).abs() < 0.001,
769            "Anti-Benford probabilities sum to {}, expected 1.0",
770            sum
771        );
772    }
773
774    #[test]
775    fn test_benford_sampler_determinism() {
776        let config = AmountDistributionConfig::default();
777        let mut sampler1 = BenfordSampler::new(42, config.clone());
778        let mut sampler2 = BenfordSampler::new(42, config);
779
780        for _ in 0..100 {
781            assert_eq!(sampler1.sample(), sampler2.sample());
782        }
783    }
784
785    #[test]
786    fn test_benford_first_digit_distribution() {
787        let config = AmountDistributionConfig::default();
788        let mut sampler = BenfordSampler::new(12345, config);
789
790        let mut digit_counts = [0u32; 9];
791        let iterations = 10_000;
792
793        for _ in 0..iterations {
794            let amount = sampler.sample();
795            if let Some(digit) = get_first_digit(amount) {
796                if (1..=9).contains(&digit) {
797                    digit_counts[(digit - 1) as usize] += 1;
798                }
799            }
800        }
801
802        // Verify digit 1 is most common (should be ~30%, but can vary more due to log-normal distribution)
803        let digit_1_pct = digit_counts[0] as f64 / iterations as f64;
804        assert!(
805            digit_1_pct > 0.15 && digit_1_pct < 0.50,
806            "Digit 1 should be ~30%, got {:.1}%",
807            digit_1_pct * 100.0
808        );
809
810        // Verify digit 9 is least common (should be ~5%)
811        let digit_9_pct = digit_counts[8] as f64 / iterations as f64;
812        assert!(
813            digit_9_pct > 0.02 && digit_9_pct < 0.10,
814            "Digit 9 should be ~5%, got {:.1}%",
815            digit_9_pct * 100.0
816        );
817    }
818
819    #[test]
820    fn test_threshold_adjacent_below_threshold() {
821        let config = AmountDistributionConfig::default();
822        let threshold_config = ThresholdConfig {
823            thresholds: vec![10000.0],
824            min_below_pct: 0.01,
825            max_below_pct: 0.15,
826        };
827        let mut gen = FraudAmountGenerator::new(42, config, threshold_config);
828
829        for _ in 0..100 {
830            let amount = gen.sample(FraudAmountPattern::ThresholdAdjacent);
831            let f = amount.to_string().parse::<f64>().unwrap();
832            assert!(f < 10000.0, "Amount {} should be below threshold 10000", f);
833            // Account for noise factor (up to 0.5%) and rounding
834            assert!(
835                f >= 8400.0,
836                "Amount {} should be approximately within 15% of threshold",
837                f
838            );
839        }
840    }
841
/// Draws 100 `ObviousRoundNumbers` samples (seed 42, default configs) and
/// checks each against the round / just-under-round / positive predicate.
#[test]
fn test_obvious_round_numbers() {
    let mut generator = FraudAmountGenerator::new(
        42,
        AmountDistributionConfig::default(),
        ThresholdConfig::default(),
    );

    for _ in 0..100 {
        let value: f64 = generator
            .sample(FraudAmountPattern::ObviousRoundNumbers)
            .to_string()
            .parse()
            .unwrap();

        // Exact multiple of 1000 (or of 5000).
        let on_round_boundary = value % 1000.0 == 0.0 || value % 5000.0 == 0.0;

        // One cent short of a multiple of 1000 (or of 10000), with a small
        // tolerance for floating-point error.
        let bumped = value + 0.01;
        let cent_below_boundary = bumped % 1000.0 < 0.02 || bumped % 10000.0 < 0.02;

        // NOTE(review): the `value > 0.0` alternative lets any positive amount
        // pass, so the two predicates above do not constrain the result.
        // Confirm against the generator whether the check can be tightened.
        assert!(
            on_round_boundary || cent_below_boundary || value > 0.0,
            "Amount {} should be a suspicious round number",
            value
        );
    }
}
863
/// Verifies `get_first_digit` on whole numbers and on a value below one
/// whose leading digits are zeros.
#[test]
fn test_get_first_digit() {
    // (integer input, expected leading digit)
    let whole_number_cases = [(123, 1), (999, 9), (50000, 5)];
    for (input, expected) in whole_number_cases {
        assert_eq!(get_first_digit(Decimal::from(input)), Some(expected));
    }

    // Leading zeros after the decimal point are skipped: 0.00123 -> 1.
    let fractional = Decimal::from_str_exact("0.00123").unwrap();
    assert_eq!(get_first_digit(fractional), Some(1));
}
874
/// The ten second-digit probabilities must add up to 1.0 within 0.001.
#[test]
fn test_second_digit_probabilities_sum_to_one() {
    let total = BENFORD_SECOND_DIGIT_PROBABILITIES.iter().sum::<f64>();
    let deviation = (total - 1.0).abs();

    assert!(
        deviation < 0.001,
        "Second digit probabilities sum to {}, expected 1.0",
        total
    );
}
884
/// Spot-checks the first-two-digit probability at both ends of the range and
/// verifies that the full table sums to 1.0.
#[test]
fn test_first_two_probability() {
    // Leading pair "10": log10(1 + 1/10) = log10(1.1) ≈ 0.0414
    let prob_pair_10 = benford_first_two_probability(1, 0);
    let err_pair_10 = (prob_pair_10 - 0.0414).abs();
    assert!(err_pair_10 < 0.001);

    // Leading pair "99": log10(1 + 1/99) ≈ 0.00436
    let prob_pair_99 = benford_first_two_probability(9, 9);
    let err_pair_99 = (prob_pair_99 - 0.00436).abs();
    assert!(err_pair_99 < 0.0001);

    // All first-two-digit probabilities together should total 1.0.
    let table = benford_first_two_probabilities();
    let total = table.iter().sum::<f64>();
    assert!((total - 1.0).abs() < 0.001);
}
900
/// Verifies `get_first_two_digits` on whole numbers and on a value below one
/// whose leading digits are zeros.
#[test]
fn test_get_first_two_digits() {
    // (integer input, expected (first, second) digit pair)
    let whole_number_cases = [(123, (1, 2)), (999, (9, 9)), (50000, (5, 0))];
    for (input, expected) in whole_number_cases {
        assert_eq!(get_first_two_digits(Decimal::from(input)), Some(expected));
    }

    // Leading zeros after the decimal point are skipped: 0.00123 -> (1, 2).
    let fractional = Decimal::from_str_exact("0.00123").unwrap();
    assert_eq!(get_first_two_digits(fractional), Some((1, 2)));
}
911
/// Smoke test for `EnhancedBenfordSampler` with second-digit compliance on
/// and first-two-digit compliance off (seed 42).
///
/// Tallies the second digit of 10,000 samples and checks that most samples
/// yield a digit pair and that no single second digit dominates.
#[test]
fn test_enhanced_benford_sampler() {
    let config = EnhancedBenfordConfig {
        amount_config: AmountDistributionConfig::default(),
        second_digit_compliance: true,
        first_two_digit_compliance: false,
    };
    let mut sampler = EnhancedBenfordSampler::new(42, config);

    // Index = second digit (0..=9); samples without a digit pair are skipped.
    let mut digit_counts = [0u32; 10];
    for _ in 0..10000 {
        let amount = sampler.sample();
        if let Some((_, d2)) = get_first_two_digits(amount) {
            digit_counts[d2 as usize] += 1;
        }
    }

    // Note: The second digit distribution depends on amount generation and
    // magnitude selection, which may skew results. Just verify the sampler runs
    // and produces valid amounts.
    let total_valid = digit_counts.iter().sum::<u32>();
    assert!(
        total_valid > 9000,
        "Most samples should have valid first two digits"
    );

    // Verify we have some distribution of second digits (not all the same):
    // the most frequent digit must account for less than half of the tally.
    let max_count = digit_counts
        .iter()
        .copied()
        .max()
        .expect("digit_counts is a fixed-size, non-empty array");
    assert!(
        max_count < total_valid / 2,
        "Second digits should have some variety, max count: {}",
        max_count
    );
}
947
/// With `ThresholdClustering` at full intensity and a single threshold of
/// 10000 (seed 42), all 100 samples must fall strictly between 8000 and 10000.
#[test]
fn test_benford_deviation_sampler() {
    let mut sampler = BenfordDeviationSampler::new(
        42,
        BenfordDeviationConfig {
            deviation_type: BenfordDeviationType::ThresholdClustering,
            intensity: 1.0,
            amount_config: AmountDistributionConfig::default(),
            thresholds: vec![10000.0],
        },
    );

    for _ in 0..100 {
        let value = sampler.sample().to_string().parse::<f64>().unwrap();

        // Upper bound: strictly under the threshold.
        assert!(value < 10000.0, "Amount {} should be below 10000", value);

        // Lower bound: within ~20% of the threshold (1-15% below + noise).
        assert!(
            value > 8000.0,
            "Amount {} should be near threshold 10000",
            value
        );
    }
}
967
/// With `RoundNumberBias` at full intensity (seed 42, no thresholds), the
/// share of leading digit 1 or of leading digit 5 among 1000 samples must
/// exceed the bound asserted below.
#[test]
fn test_benford_deviation_round_bias() {
    const SAMPLE_COUNT: u32 = 1000;

    let mut sampler = BenfordDeviationSampler::new(
        42,
        BenfordDeviationConfig {
            deviation_type: BenfordDeviationType::RoundNumberBias,
            intensity: 1.0,
            amount_config: AmountDistributionConfig::default(),
            thresholds: vec![],
        },
    );

    // Index = leading digit - 1; anything outside 1..=9 is ignored.
    let mut digit_counts = [0u32; 9];
    for _ in 0..SAMPLE_COUNT {
        let leading = get_first_digit(sampler.sample()).filter(|d| (1..=9).contains(d));
        if let Some(d) = leading {
            digit_counts[(d - 1) as usize] += 1;
        }
    }

    // Digits 1 and 5 should be overrepresented.
    let share_of = |count: u32| count as f64 / SAMPLE_COUNT as f64;
    let d1_pct = share_of(digit_counts[0]);
    let d5_pct = share_of(digit_counts[4]);

    // At least one of them should be higher than Benford expects.
    assert!(d1_pct > 0.35 || d5_pct > 0.10);
}
995}
datasynth_core/distributions/benford.rs

datasynth_core/distributions/
benford.rs