datasynth_core/distributions/
benford.rs1use rand::prelude::*;
7use rand_chacha::ChaCha8Rng;
8use rust_decimal::Decimal;
9use serde::{Deserialize, Serialize};
10
11use super::AmountDistributionConfig;
12
/// Benford's Law probabilities for the leading digits 1 through 9.
///
/// Entry `i` is `log10(1 + 1 / (i + 1))` rounded to five decimal places, i.e. the
/// expected share of values whose first significant digit is `i + 1`.
// 0.30103 is log10(2) on purpose; it is not a mistyped standard constant.
#[allow(clippy::approx_constant)]
pub const BENFORD_PROBABILITIES: [f64; 9] = [
    0.30103, // digit 1
    0.17609, // digit 2
    0.12494, // digit 3
    0.09691, // digit 4
    0.07918, // digit 5
    0.06695, // digit 6
    0.05799, // digit 7
    0.05115, // digit 8
    0.04576, // digit 9
];
28
/// Cumulative Benford distribution over the leading digits 1 through 9.
///
/// Entry `i` is the probability that the first digit is at most `i + 1`; the last
/// entry is exactly 1.0 so that inverse-CDF lookups always terminate.
// 0.30103 is log10(2) on purpose; it is not a mistyped standard constant.
#[allow(clippy::approx_constant)]
pub const BENFORD_CDF: [f64; 9] = [
    0.30103, // <= 1
    0.47712, // <= 2
    0.60206, // <= 3
    0.69897, // <= 4
    0.77815, // <= 5
    0.84510, // <= 6
    0.90309, // <= 7
    0.95424, // <= 8
    1.00000, // <= 9
];
43
/// First-digit probabilities that deliberately deviate from Benford's Law.
///
/// Low digits (1-3) are under-represented and digits 5, 7 and 9 are over-represented
/// relative to [`BENFORD_PROBABILITIES`]. The entries sum to 1.0.
pub const ANTI_BENFORD_PROBABILITIES: [f64; 9] = [
    0.05, // digit 1
    0.05, // digit 2
    0.05, // digit 3
    0.10, // digit 4
    0.25, // digit 5
    0.10, // digit 6
    0.20, // digit 7
    0.05, // digit 8
    0.15, // digit 9
];
57
/// Selects which amount-generation strategy `FraudAmountGenerator::sample` uses.
///
/// Serialized in `snake_case` (e.g. `threshold_adjacent`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FraudAmountPattern {
    /// Amounts drawn from the regular Benford-distributed sampler (the default).
    #[default]
    Normal,
    /// Amounts whose first digit is drawn from `ANTI_BENFORD_PROBABILITIES`.
    StatisticallyImprobable,
    /// Round amounts (multiples of 1,000 / 5,000 / 10,000) or amounts one cent below
    /// such a value.
    ObviousRoundNumbers,
    /// Amounts slightly below one of the configured thresholds.
    ThresholdAdjacent,
}
75
/// Settings for generating amounts that sit just below a threshold.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThresholdConfig {
    /// Candidate thresholds; one is picked at random for each generated amount.
    pub thresholds: Vec<f64>,
    /// Smallest distance below the threshold, as a fraction of the threshold
    /// (0.01 means 1%).
    pub min_below_pct: f64,
    /// Largest distance below the threshold, as a fraction of the threshold
    /// (0.15 means 15%).
    pub max_below_pct: f64,
}
86
87impl Default for ThresholdConfig {
88 fn default() -> Self {
89 Self {
90 thresholds: vec![1000.0, 5000.0, 10000.0, 25000.0, 50000.0, 100000.0],
91 min_below_pct: 0.01,
92 max_below_pct: 0.15,
93 }
94 }
95}
96
/// Generates monetary amounts whose leading digit follows a chosen first-digit
/// distribution, using a seeded random stream.
pub struct BenfordSampler {
    // Seeded ChaCha8 generator; all randomness for this sampler comes from here.
    rng: ChaCha8Rng,
    // Amount bounds, rounding probabilities and decimal precision.
    config: AmountDistributionConfig,
}
102
103impl BenfordSampler {
104 pub fn new(seed: u64, config: AmountDistributionConfig) -> Self {
106 Self {
107 rng: ChaCha8Rng::seed_from_u64(seed),
108 config,
109 }
110 }
111
112 fn sample_benford_first_digit(&mut self) -> u8 {
114 let p: f64 = self.rng.gen();
115 for (i, &cumulative) in BENFORD_CDF.iter().enumerate() {
116 if p < cumulative {
117 return (i + 1) as u8;
118 }
119 }
120 9
121 }
122
123 fn sample_anti_benford_first_digit(&mut self) -> u8 {
125 let p: f64 = self.rng.gen();
126 let mut cumulative = 0.0;
127 for (i, &prob) in ANTI_BENFORD_PROBABILITIES.iter().enumerate() {
128 cumulative += prob;
129 if p < cumulative {
130 return (i + 1) as u8;
131 }
132 }
133 9
134 }
135
136 pub fn sample(&mut self) -> Decimal {
138 let first_digit = self.sample_benford_first_digit();
139 self.sample_with_first_digit(first_digit)
140 }
141
142 pub fn sample_with_first_digit(&mut self, first_digit: u8) -> Decimal {
144 let first_digit = first_digit.clamp(1, 9);
145
146 let min_magnitude = self.config.min_amount.log10().floor() as i32;
148 let max_magnitude = self.config.max_amount.log10().floor() as i32;
149
150 let magnitude = self.rng.gen_range(min_magnitude..=max_magnitude);
152 let base = 10_f64.powi(magnitude);
153
154 let remaining: f64 = self.rng.gen();
156
157 let mantissa = first_digit as f64 + remaining;
159 let mut amount = mantissa * base;
160
161 amount = amount.clamp(self.config.min_amount, self.config.max_amount);
163
164 let p: f64 = self.rng.gen();
166 if p < self.config.round_number_probability {
167 amount = (amount / 100.0).round() * 100.0;
169 } else if p < self.config.round_number_probability + self.config.nice_number_probability {
170 amount = (amount / 5.0).round() * 5.0;
172 }
173
174 let decimal_multiplier = 10_f64.powi(self.config.decimal_places as i32);
176 amount = (amount * decimal_multiplier).round() / decimal_multiplier;
177
178 amount = amount.max(self.config.min_amount);
180
181 Decimal::from_f64_retain(amount).unwrap_or(Decimal::ONE)
182 }
183
184 pub fn reset(&mut self, seed: u64) {
186 self.rng = ChaCha8Rng::seed_from_u64(seed);
187 }
188}
189
/// Generates amounts following one of the `FraudAmountPattern` strategies.
pub struct FraudAmountGenerator {
    // Random stream for the round-number and threshold-adjacent patterns.
    rng: ChaCha8Rng,
    // Sampler used for the normal and anti-Benford patterns; seeded separately.
    benford_sampler: BenfordSampler,
    // Thresholds and distance range for the threshold-adjacent pattern.
    threshold_config: ThresholdConfig,
    // Amount bounds applied to the round-number and threshold-adjacent patterns.
    config: AmountDistributionConfig,
}
197
198impl FraudAmountGenerator {
199 pub fn new(
201 seed: u64,
202 config: AmountDistributionConfig,
203 threshold_config: ThresholdConfig,
204 ) -> Self {
205 Self {
206 rng: ChaCha8Rng::seed_from_u64(seed),
207 benford_sampler: BenfordSampler::new(seed + 1, config.clone()),
208 threshold_config,
209 config,
210 }
211 }
212
213 pub fn sample(&mut self, pattern: FraudAmountPattern) -> Decimal {
215 match pattern {
216 FraudAmountPattern::Normal => self.benford_sampler.sample(),
217 FraudAmountPattern::StatisticallyImprobable => self.sample_anti_benford(),
218 FraudAmountPattern::ObviousRoundNumbers => self.sample_obvious_round(),
219 FraudAmountPattern::ThresholdAdjacent => self.sample_threshold_adjacent(),
220 }
221 }
222
223 fn sample_anti_benford(&mut self) -> Decimal {
225 let first_digit = self.benford_sampler.sample_anti_benford_first_digit();
226 self.benford_sampler.sample_with_first_digit(first_digit)
227 }
228
229 fn sample_obvious_round(&mut self) -> Decimal {
231 let pattern_choice = self.rng.gen_range(0..5);
232
233 let amount = match pattern_choice {
234 0 => {
236 let multiplier = self.rng.gen_range(1..100);
237 multiplier as f64 * 1000.0
238 }
239 1 => {
241 let base = self.rng.gen_range(1..10) as f64 * 10000.0;
242 base - 0.01
243 }
244 2 => {
246 let multiplier = self.rng.gen_range(1..20);
247 multiplier as f64 * 10000.0
248 }
249 3 => {
251 let multiplier = self.rng.gen_range(1..40);
252 multiplier as f64 * 5000.0
253 }
254 _ => {
256 let base = self.rng.gen_range(1..100) as f64 * 1000.0;
257 base - 0.01
258 }
259 };
260
261 let clamped = amount.clamp(self.config.min_amount, self.config.max_amount);
263 Decimal::from_f64_retain(clamped).unwrap_or(Decimal::ONE)
264 }
265
266 fn sample_threshold_adjacent(&mut self) -> Decimal {
268 let threshold = if self.threshold_config.thresholds.is_empty() {
270 10000.0
271 } else {
272 *self
273 .threshold_config
274 .thresholds
275 .choose(&mut self.rng)
276 .unwrap_or(&10000.0)
277 };
278
279 let pct_below = self
281 .rng
282 .gen_range(self.threshold_config.min_below_pct..self.threshold_config.max_below_pct);
283 let base_amount = threshold * (1.0 - pct_below);
284
285 let noise_factor = 1.0 + self.rng.gen_range(-0.005..0.005);
287 let amount = base_amount * noise_factor;
288
289 let rounded = (amount * 100.0).round() / 100.0;
291
292 let final_amount = rounded.min(threshold - 0.01);
294 let clamped = final_amount.clamp(self.config.min_amount, self.config.max_amount);
295
296 Decimal::from_f64_retain(clamped).unwrap_or(Decimal::ONE)
297 }
298
299 pub fn reset(&mut self, seed: u64) {
301 self.rng = ChaCha8Rng::seed_from_u64(seed);
302 self.benford_sampler.reset(seed + 1);
303 }
304}
305
306pub fn get_first_digit(amount: Decimal) -> Option<u8> {
308 let s = amount.to_string();
309 s.chars()
310 .find(|c| c.is_ascii_digit() && *c != '0')
311 .and_then(|c| c.to_digit(10))
312 .map(|d| d as u8)
313}
314
#[cfg(test)]
mod tests {
    use super::*;

    /// The rounded Benford probabilities must still form a distribution.
    #[test]
    fn test_benford_probabilities_sum_to_one() {
        let sum: f64 = BENFORD_PROBABILITIES.iter().sum();
        assert!(
            (sum - 1.0).abs() < 0.001,
            "Benford probabilities sum to {}, expected 1.0",
            sum
        );
    }

    /// The inverse-CDF lookup relies on the last entry being 1.0.
    #[test]
    fn test_benford_cdf_ends_at_one() {
        assert!(
            (BENFORD_CDF[8] - 1.0).abs() < 0.0001,
            "CDF should end at 1.0"
        );
    }

    /// The anti-Benford table must also form a distribution.
    #[test]
    fn test_anti_benford_probabilities_sum_to_one() {
        let sum: f64 = ANTI_BENFORD_PROBABILITIES.iter().sum();
        assert!(
            (sum - 1.0).abs() < 0.001,
            "Anti-Benford probabilities sum to {}, expected 1.0",
            sum
        );
    }

    /// Two samplers with the same seed and config must produce the same sequence.
    #[test]
    fn test_benford_sampler_determinism() {
        let config = AmountDistributionConfig::default();
        let mut sampler1 = BenfordSampler::new(42, config.clone());
        let mut sampler2 = BenfordSampler::new(42, config);

        for _ in 0..100 {
            assert_eq!(sampler1.sample(), sampler2.sample());
        }
    }

    /// Leading digits of sampled amounts should roughly follow Benford's Law.
    /// The bands are wide because bounding and rounding can change the first digit.
    #[test]
    fn test_benford_first_digit_distribution() {
        let config = AmountDistributionConfig::default();
        let mut sampler = BenfordSampler::new(12345, config);

        let mut digit_counts = [0u32; 9];
        let iterations = 10_000;

        for _ in 0..iterations {
            let amount = sampler.sample();
            if let Some(digit) = get_first_digit(amount) {
                if (1..=9).contains(&digit) {
                    digit_counts[(digit - 1) as usize] += 1;
                }
            }
        }

        let digit_1_pct = digit_counts[0] as f64 / iterations as f64;
        assert!(
            digit_1_pct > 0.15 && digit_1_pct < 0.50,
            "Digit 1 should be ~30%, got {:.1}%",
            digit_1_pct * 100.0
        );

        let digit_9_pct = digit_counts[8] as f64 / iterations as f64;
        assert!(
            digit_9_pct > 0.02 && digit_9_pct < 0.10,
            "Digit 9 should be ~5%, got {:.1}%",
            digit_9_pct * 100.0
        );
    }

    /// Threshold-adjacent amounts must stay strictly below the threshold and within
    /// the configured distance (15% plus the 0.5% noise allowance).
    #[test]
    fn test_threshold_adjacent_below_threshold() {
        let config = AmountDistributionConfig::default();
        let threshold_config = ThresholdConfig {
            thresholds: vec![10000.0],
            min_below_pct: 0.01,
            max_below_pct: 0.15,
        };
        let mut gen = FraudAmountGenerator::new(42, config, threshold_config);

        for _ in 0..100 {
            let amount = gen.sample(FraudAmountPattern::ThresholdAdjacent);
            let f = amount.to_string().parse::<f64>().unwrap();
            assert!(f < 10000.0, "Amount {} should be below threshold 10000", f);
            assert!(
                f >= 8400.0,
                "Amount {} should be approximately within 15% of threshold",
                f
            );
        }
    }

    /// Every "obvious round" amount must be a multiple of 1,000 or exactly one cent
    /// below one.
    #[test]
    fn test_obvious_round_numbers() {
        // Pin bounds wide enough that no generated value (999.99 ..= 195,000) is
        // clamped, so the check below does not depend on the default configuration.
        let config = AmountDistributionConfig {
            min_amount: 1.0,
            max_amount: 1_000_000.0,
            ..AmountDistributionConfig::default()
        };
        let threshold_config = ThresholdConfig::default();
        let mut gen = FraudAmountGenerator::new(42, config, threshold_config);

        for _ in 0..100 {
            let amount = gen.sample(FraudAmountPattern::ObviousRoundNumbers);
            let f = amount.to_string().parse::<f64>().unwrap();

            // Compare in integer cents: `%` on floats such as 9999.99 is unreliable.
            let cents = (f * 100.0).round() as i64;
            let is_round = cents % 100_000 == 0;
            let is_just_under = (cents + 1) % 100_000 == 0;

            assert!(
                is_round || is_just_under,
                "Amount {} should be a suspicious round number",
                f
            );
        }
    }

    /// Leading zeros and the decimal point are skipped when finding the first digit.
    #[test]
    fn test_get_first_digit() {
        assert_eq!(get_first_digit(Decimal::from(123)), Some(1));
        assert_eq!(get_first_digit(Decimal::from(999)), Some(9));
        assert_eq!(get_first_digit(Decimal::from(50000)), Some(5));
        assert_eq!(
            get_first_digit(Decimal::from_str_exact("0.00123").unwrap()),
            Some(1)
        );
    }
}