use std::env;
use std::fs::File;
use std::io::Write;
use samkhya_core::sketches::{CorrelatedHistogram2D, EquiDepthHistogram};
#[derive(Clone)]
struct Rng {
s: [u64; 4],
}
impl Rng {
fn seeded(seed: u64) -> Self {
let mut z = seed.wrapping_add(0x9E3779B97F4A7C15);
let mut next = || {
z = z.wrapping_add(0x9E3779B97F4A7C15);
let mut x = z;
x = (x ^ (x >> 30)).wrapping_mul(0xBF58476D1CE4E5B9);
x = (x ^ (x >> 27)).wrapping_mul(0x94D049BB133111EB);
x ^ (x >> 31)
};
Self {
s: [next(), next(), next(), next()],
}
}
fn next_u64(&mut self) -> u64 {
let result = self.s[1].wrapping_mul(5).rotate_left(7).wrapping_mul(9);
let t = self.s[1] << 17;
self.s[2] ^= self.s[0];
self.s[3] ^= self.s[1];
self.s[1] ^= self.s[2];
self.s[0] ^= self.s[3];
self.s[2] ^= t;
self.s[3] = self.s[3].rotate_left(45);
result
}
fn uniform(&mut self) -> f64 {
(self.next_u64() >> 11) as f64 / ((1u64 << 53) as f64)
}
fn normal(&mut self) -> f64 {
let mut u1 = self.uniform();
if u1 < 1e-300 {
u1 = 1e-300;
}
let u2 = self.uniform();
(-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos()
}
}
#[derive(Clone, Copy)]
enum Dist {
Uniform,
Gaussian,
Zipfian,
Lognormal,
Bimodal,
}
impl Dist {
fn name(self) -> &'static str {
match self {
Dist::Uniform => "uniform",
Dist::Gaussian => "gaussian",
Dist::Zipfian => "zipfian",
Dist::Lognormal => "lognormal",
Dist::Bimodal => "bimodal",
}
}
fn sample(self, rng: &mut Rng) -> f64 {
match self {
Dist::Uniform => rng.uniform(),
Dist::Gaussian => rng.normal(),
Dist::Zipfian => {
let u = rng.uniform().max(1e-12);
let s = 1.07f64;
let x = (1.0 - u).powf(-1.0 / (s - 1.0));
x.min(1.0e6)
}
Dist::Lognormal => (rng.normal() * 0.7).exp(),
Dist::Bimodal => {
if rng.uniform() < 0.5 {
rng.normal() * 0.3 - 2.0
} else {
rng.normal() * 0.3 + 2.0
}
}
}
}
}
fn generate(d: Dist, n: usize, rng: &mut Rng) -> Vec<f64> {
(0..n).map(|_| d.sample(rng)).collect()
}
fn percentile(sorted: &[f64], p: f64) -> f64 {
if sorted.is_empty() {
return 0.0;
}
let idx = ((sorted.len() as f64 - 1.0) * p).round() as usize;
sorted[idx.min(sorted.len() - 1)]
}
fn bootstrap_p95_ci(samples: &[f64], rng: &mut Rng, b: usize) -> (f64, f64) {
if samples.is_empty() {
return (0.0, 0.0);
}
let n = samples.len();
let mut p95s: Vec<f64> = Vec::with_capacity(b);
for _ in 0..b {
let mut resample: Vec<f64> = (0..n)
.map(|_| samples[(rng.next_u64() as usize) % n])
.collect();
resample.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
p95s.push(percentile(&resample, 0.95));
}
p95s.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
(percentile(&p95s, 0.025), percentile(&p95s, 0.975))
}
fn true_range_count(values: &[f64], lo: f64, hi: f64) -> u64 {
values.iter().filter(|&&v| v >= lo && v <= hi).count() as u64
}
fn random_range(rng: &mut Rng, vmin: f64, vmax: f64) -> (f64, f64) {
let a = vmin + rng.uniform() * (vmax - vmin);
let b = vmin + rng.uniform() * (vmax - vmin);
if a <= b { (a, b) } else { (b, a) }
}
fn rel_err(est: u64, truth: u64) -> f64 {
let t = truth.max(1) as f64;
(est as i64 - truth as i64).abs() as f64 / t
}
struct CellSummary {
p50: f64,
p95: f64,
p99: f64,
max: f64,
p95_ci_lo: f64,
p95_ci_hi: f64,
p95_per_trial: Vec<f64>,
}
fn run_equidepth_cell(
dist: Dist,
n: usize,
buckets: usize,
trials: usize,
queries: usize,
) -> CellSummary {
let mut p95_per_trial: Vec<f64> = Vec::with_capacity(trials);
let mut all_errs: Vec<f64> = Vec::with_capacity(trials * queries);
for trial in 0..trials {
let seed =
(dist as u64) * 1_000_003 + (n as u64) * 97 + (buckets as u64) * 31 + trial as u64;
let mut rng = Rng::seeded(seed);
let pop = generate(dist, n, &mut rng);
let (vmin, vmax) = pop
.iter()
.fold((f64::INFINITY, f64::NEG_INFINITY), |(lo, hi), &v| {
(lo.min(v), hi.max(v))
});
let hist = EquiDepthHistogram::from_values(&pop, buckets).expect("buckets > 0");
let mut errs: Vec<f64> = Vec::with_capacity(queries);
for _ in 0..queries {
let (lo, hi) = random_range(&mut rng, vmin, vmax);
let est = hist.estimate_range(lo, hi);
let truth = true_range_count(&pop, lo, hi);
errs.push(rel_err(est, truth));
}
errs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
p95_per_trial.push(percentile(&errs, 0.95));
all_errs.extend_from_slice(&errs);
}
all_errs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let mut ci_rng = Rng::seeded(0xDEADBEEF_u64.wrapping_add(buckets as u64));
let (lo, hi) = bootstrap_p95_ci(&p95_per_trial, &mut ci_rng, 1000);
CellSummary {
p50: percentile(&all_errs, 0.50),
p95: percentile(&all_errs, 0.95),
p99: percentile(&all_errs, 0.99),
max: *all_errs.last().unwrap_or(&0.0),
p95_ci_lo: lo,
p95_ci_hi: hi,
p95_per_trial,
}
}
fn correlated_pairs(rho: f64, n: usize, rng: &mut Rng) -> Vec<(f64, f64)> {
let k = (1.0 - rho * rho).max(0.0).sqrt();
(0..n)
.map(|_| {
let z1 = rng.normal();
let z2 = rng.normal();
(z1, rho * z1 + k * z2)
})
.collect()
}
struct TwoDSummary {
p50: f64,
p95: f64,
p99: f64,
max: f64,
p95_ci_lo: f64,
p95_ci_hi: f64,
indep_p95: f64,
p95_per_trial: Vec<f64>,
}
fn run_correlated_cell(
rho: f64,
n: usize,
bins: usize,
trials: usize,
queries: usize,
) -> TwoDSummary {
let mut p95_per_trial: Vec<f64> = Vec::with_capacity(trials);
let mut all_errs: Vec<f64> = Vec::with_capacity(trials * queries);
let mut indep_errs: Vec<f64> = Vec::with_capacity(trials * queries);
for trial in 0..trials {
let seed = ((rho * 1000.0) as u64) * 1_000_003
+ (n as u64) * 97
+ (bins as u64) * 31
+ trial as u64;
let mut rng = Rng::seeded(seed);
let pairs = correlated_pairs(rho, n, &mut rng);
let xs: Vec<f64> = pairs.iter().map(|p| p.0).collect();
let ys: Vec<f64> = pairs.iter().map(|p| p.1).collect();
let (xmin, xmax) = xs
.iter()
.fold((f64::INFINITY, f64::NEG_INFINITY), |(lo, hi), &v| {
(lo.min(v), hi.max(v))
});
let (ymin, ymax) = ys
.iter()
.fold((f64::INFINITY, f64::NEG_INFINITY), |(lo, hi), &v| {
(lo.min(v), hi.max(v))
});
let h2d = CorrelatedHistogram2D::from_pairs(&pairs, bins, bins).expect("bins > 0");
let hx = EquiDepthHistogram::from_values(&xs, bins).expect("bins > 0");
let hy = EquiDepthHistogram::from_values(&ys, bins).expect("bins > 0");
let total = pairs.len() as f64;
let mut errs: Vec<f64> = Vec::with_capacity(queries);
for _ in 0..queries {
let (ax, bx) = random_range(&mut rng, xmin, xmax);
let (ay, by) = random_range(&mut rng, ymin, ymax);
let truth = pairs
.iter()
.filter(|&&(x, y)| x >= ax && x <= bx && y >= ay && y <= by)
.count() as u64;
let est2d = h2d.estimate_range(ax, bx, ay, by);
errs.push(rel_err(est2d, truth));
let nx = hx.estimate_range(ax, bx) as f64;
let ny = hy.estimate_range(ay, by) as f64;
let est_indep = if total > 0.0 {
(nx * ny / total).round() as u64
} else {
0
};
indep_errs.push(rel_err(est_indep, truth));
}
errs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
p95_per_trial.push(percentile(&errs, 0.95));
all_errs.extend_from_slice(&errs);
}
all_errs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
indep_errs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let mut ci_rng = Rng::seeded(0xC0FFEE_u64.wrapping_add(bins as u64));
let (lo, hi) = bootstrap_p95_ci(&p95_per_trial, &mut ci_rng, 1000);
TwoDSummary {
p50: percentile(&all_errs, 0.50),
p95: percentile(&all_errs, 0.95),
p99: percentile(&all_errs, 0.99),
max: *all_errs.last().unwrap_or(&0.0),
p95_ci_lo: lo,
p95_ci_hi: hi,
indep_p95: percentile(&indep_errs, 0.95),
p95_per_trial,
}
}
fn main() {
let trials = 30;
let queries = 1000;
let dists = [
Dist::Uniform,
Dist::Gaussian,
Dist::Zipfian,
Dist::Lognormal,
Dist::Bimodal,
];
let ns = [10_000usize, 100_000, 1_000_000];
let buckets_set = [32usize, 64, 128, 256];
let raw_path = env::var("SAMKHYA_RAW_OUT").ok();
let mut raw_1d: Vec<String> = Vec::new();
let mut raw_2d: Vec<String> = Vec::new();
println!("# EquiDepthHistogram 1D");
println!("dist,n,buckets,p50,p95,p99,max,p95_ci_lo,p95_ci_hi");
for &dist in &dists {
for &n in &ns {
for &b in &buckets_set {
let s = run_equidepth_cell(dist, n, b, trials, queries);
println!(
"{},{},{},{:.4},{:.4},{:.4},{:.4},{:.4},{:.4}",
dist.name(),
n,
b,
s.p50,
s.p95,
s.p99,
s.max,
s.p95_ci_lo,
s.p95_ci_hi,
);
if raw_path.is_some() {
let p95_vec = s
.p95_per_trial
.iter()
.map(|v| format!("{v:.10}"))
.collect::<Vec<_>>()
.join(",");
raw_1d.push(format!(
"{{\"dist\":\"{}\",\"n\":{n},\"buckets\":{b},\"trials\":{trials},\"queries_per_trial\":{queries},\"p95_per_trial\":[{p95_vec}]}}",
dist.name()
));
}
}
}
}
println!();
println!("# CorrelatedHistogram2D");
println!("rho,n,bins,p50,p95,p99,max,p95_ci_lo,p95_ci_hi,indep_p95");
let rhos = [0.0f64, 0.3, 0.7, 0.95];
let ns2 = [10_000usize, 100_000];
let bins_set = [16usize, 32, 64];
for &rho in &rhos {
for &n in &ns2 {
for &b in &bins_set {
let s = run_correlated_cell(rho, n, b, trials, queries);
println!(
"{:.2},{},{},{:.4},{:.4},{:.4},{:.4},{:.4},{:.4},{:.4}",
rho, n, b, s.p50, s.p95, s.p99, s.max, s.p95_ci_lo, s.p95_ci_hi, s.indep_p95,
);
if raw_path.is_some() {
let p95_vec = s
.p95_per_trial
.iter()
.map(|v| format!("{v:.10}"))
.collect::<Vec<_>>()
.join(",");
raw_2d.push(format!(
"{{\"rho\":{rho},\"n\":{n},\"bins\":{b},\"trials\":{trials},\"queries_per_trial\":{queries},\"p95_per_trial\":[{p95_vec}]}}"
));
}
}
}
}
if let Some(path) = raw_path {
let body = format!(
"{{\"benchmark\":\"histogram_accuracy\",\"seed_scheme\":\"f(dist,n,buckets,trial) splitmix64\",\"equidepth_1d\":[{}],\"correlated_2d\":[{}]}}",
raw_1d.join(","),
raw_2d.join(",")
);
let mut f = File::create(&path).expect("create raw output file");
f.write_all(body.as_bytes()).expect("write raw output");
eprintln!("# raw per-trial vectors written to {path}");
}
}