use ddsketchy::DDSketch;
use rand::prelude::*;
use rand_distr::{Distribution, Exp1, Normal, Pareto};
pub struct TestDataGenerator {
rng: StdRng,
}
impl TestDataGenerator {
pub fn new(seed: u64) -> Self {
Self {
rng: StdRng::seed_from_u64(seed),
}
}
pub fn generate_exponential(&mut self, n: usize, lambda: f64) -> Vec<f64> {
(0..n)
.map(|_| {
let exp_val: f64 = Exp1.sample(&mut self.rng);
exp_val / lambda
})
.collect()
}
pub fn generate_normal(&mut self, n: usize, mean: f64, std_dev: f64) -> Vec<f64> {
let normal = Normal::new(mean, std_dev).unwrap();
(0..n)
.map(|_| normal.sample(&mut self.rng))
.filter(|&x| x > 0.0) .take(n)
.collect()
}
pub fn generate_pareto(&mut self, n: usize, scale: f64, shape: f64) -> Vec<f64> {
let pareto = Pareto::new(scale, shape).unwrap();
(0..n).map(|_| pareto.sample(&mut self.rng)).collect()
}
}
fn calculate_true_quantiles(sorted_data: &[f64], quantiles: &[f64]) -> Vec<f64> {
quantiles
.iter()
.map(|&q| {
if sorted_data.is_empty() {
0.0
} else {
let index = (q * (sorted_data.len() - 1) as f64) as usize;
sorted_data[index.min(sorted_data.len() - 1)]
}
})
.collect()
}
fn get_optimal_bin_count_for_alpha(alpha: f64, distribution_name: &str) -> usize {
match alpha {
a if a <= 0.001 => {
match distribution_name {
s if s.contains("Pareto") => 25000,
s if s.contains("Normal") => 8000,
_ => 10000, }
}
a if a <= 0.005 => match distribution_name {
s if s.contains("Pareto") => 8000,
_ => 4000,
},
a if a <= 0.01 => 4096,
_ => 2048,
}
}
fn assert_error_bounds(
data: Vec<f64>,
alpha: f64,
quantiles: &[f64],
distribution_name: &str,
) -> Result<(), String> {
let bin_count = get_optimal_bin_count_for_alpha(alpha, distribution_name);
let mut sketch = DDSketch::with_max_bins(alpha, bin_count)
.map_err(|e| format!("Failed to create DDSketch: {}", e))?;
for &value in &data {
sketch.add(value);
}
let mut sorted_data = data.clone();
sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
let true_quantiles = calculate_true_quantiles(&sorted_data, quantiles);
let mut failures = 0;
let mut failure_details = Vec::new();
for (i, &q) in quantiles.iter().enumerate() {
let estimated = sketch
.quantile(q)
.map_err(|e| format!("Failed to compute quantile {}: {}", q, e))?;
let true_val = true_quantiles[i];
let relative_error = if true_val != 0.0 {
(estimated - true_val).abs() / true_val
} else {
0.0
};
if relative_error > alpha {
failures += 1;
failure_details.push(format!(
" quantile {}: expected ≤ {:.4}, got {:.4} (true: {:.6}, estimated: {:.6})",
q, alpha, relative_error, true_val, estimated
));
}
}
assert!(
failures == 0,
"{} distribution failed error bounds validation with α={:.4}:\n Failures: {}/{}\n Bin count: {}\n Failure details:\n{}",
distribution_name,
alpha,
failures,
quantiles.len(),
bin_count,
failure_details.join("\n")
);
Ok(())
}
#[cfg(test)]
mod practical_bounds_tests {
use super::*;
#[test]
fn test_exponential_practical_bounds() {
let alpha = 0.01; let mut generator = TestDataGenerator::new(1337);
let data = generator.generate_exponential(10000, 1.0);
let quantiles = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99];
assert_error_bounds(data, alpha, &quantiles, "Exponential")
.expect("Exponential distribution should meet α=0.01 bounds with no failures");
}
#[test]
fn test_normal_practical_bounds() {
let alpha = 0.01; let mut generator = TestDataGenerator::new(1337);
let data = generator.generate_normal(10000, 10.0, 2.0);
let quantiles = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99];
assert_error_bounds(data, alpha, &quantiles, "Normal")
.expect("Normal distribution should meet α=0.01 bounds with no failures");
}
#[test]
fn test_pareto_practical_bounds() {
let alpha = 0.05; let mut generator = TestDataGenerator::new(1337);
let data = generator.generate_pareto(10000, 1.0, 1.0);
let quantiles = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99];
assert_error_bounds(data, alpha, &quantiles, "Pareto")
.expect("Pareto distribution should meet α=0.05 bounds with no failures");
}
#[test]
fn test_mixed_alpha_validation() {
let alphas = [0.005, 0.01, 0.02, 0.05];
let mut generator = TestDataGenerator::new(1337);
for alpha in alphas {
let data = generator.generate_exponential(5000, 1.0);
let quantiles = [0.25, 0.5, 0.75, 0.9];
assert_error_bounds(
data,
alpha,
&quantiles,
&format!("Exponential(α={:.3})", alpha),
)
.unwrap_or_else(|_| panic!("Exponential should meet α={:.3} bounds", alpha));
}
}
}
#[cfg(test)]
mod high_precision_bounds_tests {
use super::*;
#[test]
fn test_exponential_high_precision_controlled() {
let alpha = 0.005; let mut generator = TestDataGenerator::new(1337);
let data = generator.generate_exponential(10000, 1.0);
let quantiles = [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95];
assert_error_bounds(data, alpha, &quantiles, "Exponential (High Precision)")
.expect("Exponential should meet α=0.005 bounds with ≤1 failure allowed");
}
#[test]
fn test_normal_high_precision() {
let alpha = 0.001; let mut generator = TestDataGenerator::new(1337);
let data = generator.generate_normal(10000, 10.0, 2.0);
let quantiles = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99];
assert_error_bounds(data, alpha, &quantiles, "Normal (Very High Precision)")
.expect("Normal distribution should meet α=0.001 bounds");
}
#[test]
fn test_pareto_controlled_precision() {
let alpha = 0.01; let mut generator = TestDataGenerator::new(1337);
let data = generator.generate_pareto(10000, 1.0, 1.0);
let quantiles = [0.25, 0.5, 0.75, 0.9];
assert_error_bounds(data, alpha, &quantiles, "Pareto (Controlled)")
.expect("Pareto distribution should meet α=0.01 bounds with ≤2 failures allowed");
}
}
#[cfg(test)]
mod ddsketchy_advantage_validation {
use super::*;
#[test]
fn test_low_quantiles_advantage() {
let alpha = 0.01; let mut generator = TestDataGenerator::new(1337);
let data = generator.generate_exponential(10000, 1.0);
let quantiles = [0.01, 0.05, 0.1, 0.2];
assert_error_bounds(data, alpha, &quantiles, "Low Quantiles Advantage")
.expect("DDSketchy should excel at low quantiles with ≤1 failure");
}
#[test]
fn test_extreme_quantiles_robustness() {
let alpha = 0.02; let mut generator = TestDataGenerator::new(1337);
let data = generator.generate_exponential(20000, 1.0); let quantiles = [0.01, 0.05, 0.95, 0.99];
assert_error_bounds(data, alpha, &quantiles, "Extreme Quantiles")
.expect("DDSketchy should handle extreme quantiles well");
}
#[test]
fn test_small_values_precision() {
let alpha = 0.01;
let mut generator = TestDataGenerator::new(1337);
let mut data = Vec::new();
for _ in 0..10000 {
let val = if generator.rng.random_bool(0.8) {
generator.rng.random_range(0.001..0.1) } else {
generator.rng.random_range(1.0..10.0) };
data.push(val);
}
let quantiles = [0.05, 0.1, 0.25, 0.5];
assert_error_bounds(data, alpha, &quantiles, "Small Values Precision")
.expect("DDSketchy should maintain precision with small values");
}
#[test]
fn test_large_dataset_scalability() {
let alpha = 0.01;
let mut generator = TestDataGenerator::new(1337);
let data = generator.generate_exponential(100000, 1.0); let quantiles = [0.1, 0.25, 0.5, 0.75, 0.9];
assert_error_bounds(data, alpha, &quantiles, "Large Dataset Scalability")
.expect("DDSketchy should maintain accuracy at scale");
}
}