1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
use super::{DataGenerator, GenerateConfig};
use crate::error::Result;
use rand::prelude::*;
/// Generator for Pareto-distributed `f64` samples.
///
/// The distribution is described by its minimum value (`scale`) and a
/// concentration ratio from which the shape parameter alpha is derived.
#[derive(Debug, Clone)]
pub struct ParetoGenerator {
    /// Minimum value of the distribution (`xm`).
    pub scale: f64,
    /// Concentration ratio (0.8 for the 80/20 rule). Values outside the open
    /// interval `(0, 1)`, including NaN, make the shape fall back to 1.0.
    pub concentration: f64,
}

impl ParetoGenerator {
    /// Creates a generator from a minimum value and a concentration ratio.
    ///
    /// No validation happens here; an unusable `concentration` is handled
    /// when the shape parameter is derived.
    pub fn new(scale: f64, concentration: f64) -> Self {
        Self {
            scale,
            concentration,
        }
    }

    /// Derives the Pareto shape parameter (alpha) from the concentration ratio.
    ///
    /// With `ratio = 1 / (1 - concentration)` the result is
    /// `ln(ratio) / ln(ratio / concentration)`. For `concentration = 0.8`
    /// that is `ln(5) / ln(6.25) ≈ 0.878`.
    ///
    /// Returns the fallback `1.0` when `concentration` is NaN, `<= 0.0` or
    /// `>= 1.0`.
    ///
    /// NOTE(review): the textbook shape for an exact 80/20 split is
    /// `ln(5) / ln(4) ≈ 1.161`, i.e. `ln(ratio) / ln(ratio * concentration)`.
    /// The formula below always yields a value below 1 for a valid
    /// concentration, which is a heavier tail than that. It is kept as-is
    /// because existing tests and generated data depend on it; confirm whether
    /// this definition is intentional.
    fn calculate_alpha(&self) -> f64 {
        let concentration = self.concentration;

        // NaN fails every ordered comparison, so it must be tested explicitly;
        // otherwise it would slip past the range check and turn every
        // generated sample into NaN.
        if concentration.is_nan() || concentration <= 0.0 || concentration >= 1.0 {
            return 1.0; // Default safe value
        }

        let ratio = 1.0 / (1.0 - concentration); // 5.0 for 80/20
        let adjusted_ratio = ratio / concentration; // 6.25 for 80/20
        ratio.ln() / adjusted_ratio.ln()
    }
}
impl DataGenerator for ParetoGenerator {
    type Output = Vec<f64>;

    /// Produces `config.samples` Pareto-distributed values.
    ///
    /// Each value comes from inverse-transform sampling:
    /// `x = scale * (1 - u)^(-1 / alpha)`, with `u` drawn from the RNG that
    /// `config.create_rng()` returns. When `config.fraud_rate` is positive the
    /// finished sample is passed through `inject_pareto_fraud`, which shrinks
    /// some of the larger values.
    fn generate(&self, config: &GenerateConfig) -> Result<Self::Output> {
        let mut rng = config.create_rng();
        let alpha = self.calculate_alpha();

        // One uniform draw per sample, in order, so a given RNG state always
        // maps to the same output sequence.
        let mut numbers: Vec<f64> = (0..config.samples)
            .map(|_| {
                // `u` is expected to be uniform in [0, 1), which keeps the
                // base `1 - u` inside (0, 1].
                let u: f64 = rng.gen();
                self.scale * (1.0 - u).powf(-1.0 / alpha)
            })
            .collect();

        // Optional fraud injection makes the distribution less concentrated.
        if config.fraud_rate > 0.0 {
            inject_pareto_fraud(&mut numbers, config.fraud_rate, &mut rng);
        }

        Ok(numbers)
    }
}
/// Flattens a Pareto sample in place to simulate fraudulent data.
///
/// Performs `numbers.len() * fraud_rate` (truncated) rounds. Each round picks
/// a random index; if the value there exceeds 10% of the current maximum of
/// the slice, it is multiplied by a random factor drawn from `0.3..0.7`.
/// The same index may be picked more than once.
///
/// * `numbers` - sample to modify in place.
/// * `fraud_rate` - fraction of the sample size used as the number of rounds.
/// * `rng` - randomness source for both the index and the shrink factor.
fn inject_pareto_fraud(numbers: &mut [f64], fraud_rate: f64, rng: &mut impl Rng) {
    /// Largest value in `values`, never less than 0.0 because the fold is
    /// seeded with 0.0.
    fn largest(values: &[f64]) -> f64 {
        values.iter().copied().fold(0.0, f64::max)
    }

    // Float-to-integer `as` casts saturate, so a negative or NaN product
    // becomes 0 rounds.
    let fraud_count = (numbers.len() as f64 * fraud_rate) as usize;
    if fraud_count == 0 {
        return;
    }

    // Cache the maximum instead of rescanning the slice every round (which
    // made the loop quadratic). Only values above a non-negative threshold are
    // ever modified, and they are only shrunk, so the maximum can change
    // solely when the element holding it is reduced.
    let mut current_max = largest(numbers);

    for _ in 0..fraud_count {
        let index = rng.gen_range(0..numbers.len());
        let original = numbers[index];

        // Reduce extreme values to make the distribution less Pareto-like.
        if original > current_max * 0.1 {
            numbers[index] = original * rng.gen_range(0.3..0.7);

            // The reduced element was (one of) the maximum: refresh the cache.
            if original >= current_max {
                current_max = largest(numbers);
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A seeded run yields the requested number of samples, none below the
    /// scale, with the largest fifth holding more than half of the total.
    #[test]
    fn test_pareto_generator() {
        let config = GenerateConfig::new(1000).with_seed(42);
        let mut sorted = ParetoGenerator::new(1.0, 0.8).generate(&config).unwrap();
        assert_eq!(sorted.len(), 1000);

        // Every sample must be at least the scale (1.0).
        for &value in &sorted {
            assert!(value >= 1.0);
        }

        // Order from largest to smallest so the leading fifth is the top 20%.
        sorted.sort_by(|a, b| b.partial_cmp(a).unwrap());
        let top_20_percent = sorted.len() / 5;
        let top_20_sum: f64 = sorted[..top_20_percent].iter().sum();
        let total_sum: f64 = sorted.iter().sum();

        // Top 20% should have more than 50% of total value for Pareto distribution
        assert!(top_20_sum / total_sum > 0.5);
    }

    /// Pins the shape parameter that `calculate_alpha` produces for a
    /// concentration of 0.8.
    #[test]
    fn test_alpha_calculation() {
        let alpha = ParetoGenerator::new(1.0, 0.8).calculate_alpha();

        // ln(5) / ln(6.25) is roughly 0.878 for concentration 0.8.
        let deviation = (alpha - 0.878).abs();
        assert!(deviation < 0.1, "Expected alpha ~0.878, got {alpha}");
    }
}