use std::env;
use std::fs::File;
use std::io::Write;
use samkhya_core::sketches::{
BloomFilter, CorrelatedHistogram2D, CountMinSketch, EquiDepthHistogram, HllSketch,
};
/// Shape description of one synthetic table used by the memory profile.
///
/// The fields only drive how many sketches `main` builds for the table and how
/// large the raw data is assumed to be; no actual table is materialised.
#[derive(Clone, Copy)]
struct Fixture {
    /// Label emitted in the CSV row and in the optional raw JSON output.
    name: &'static str,
    /// Number of columns; each one gets an HLL sketch and a Bloom filter.
    n_cols: usize,
    /// Assumed raw width of a row in bytes (multiplied by the row count to
    /// obtain the raw table size).
    bytes_per_row: usize,
    /// Number of columns that additionally get a Count-Min sketch.
    high_card_cols: usize,
    /// Number of columns that additionally get an equi-depth histogram.
    numeric_cols: usize,
    /// Number of column pairs that get a 2-D correlated histogram
    /// (presumably foreign-key pairs, going by the name).
    fk_pairs: usize,
    /// Distinct values per column at scale 1; `main` multiplies this by the
    /// square root of the scale factor.
    distinct_per_col: u64,
}
/// The synthetic tables profiled by `main`, in the order their rows are printed.
const FIXTURES: &[Fixture] = &[
    // Ten columns, one of them tracked with a Count-Min sketch.
    Fixture {
        name: "logs",
        n_cols: 10,
        bytes_per_row: 96,
        high_card_cols: 1,
        numeric_cols: 3,
        fk_pairs: 0,
        distinct_per_col: 10_000,
    },
    // The only fixture with column pairs (two 2-D histograms).
    Fixture {
        name: "orders",
        n_cols: 8,
        bytes_per_row: 80,
        high_card_cols: 0,
        numeric_cols: 5,
        fk_pairs: 2,
        distinct_per_col: 5_000,
    },
    // Smallest fixture: five columns and few distinct values.
    Fixture {
        name: "users",
        n_cols: 5,
        bytes_per_row: 64,
        high_card_cols: 0,
        numeric_cols: 1,
        fk_pairs: 0,
        distinct_per_col: 200,
    },
    // Fifteen columns, one of them tracked with a Count-Min sketch.
    Fixture {
        name: "events",
        n_cols: 15,
        bytes_per_row: 128,
        high_card_cols: 1,
        numeric_cols: 5,
        fk_pairs: 0,
        distinct_per_col: 2_000,
    },
    // Fifty columns, forty-five of which get an equi-depth histogram.
    Fixture {
        name: "wide",
        n_cols: 50,
        bytes_per_row: 408,
        high_card_cols: 0,
        numeric_cols: 45,
        fk_pairs: 0,
        distinct_per_col: 1_000,
    },
];
/// Scale factors applied to `BASE_ROWS`; every fixture is profiled at each one.
const SCALES: &[u64] = &[1, 10, 100];

/// Row count of a fixture at scale factor 1.
const BASE_ROWS: u64 = 10_000;

/// Number of independently seeded builds per (fixture, scale) cell.
const REPLICATES: usize = 12;

/// Precision argument handed to `HllSketch::new`.
const HLL_PRECISION: u8 = 12;

/// Second argument handed to `BloomFilter::new` (presumably the target
/// false-positive rate, going by the name).
const BLOOM_FP: f64 = 0.01;

/// Bucket count requested from `EquiDepthHistogram::from_values`.
const HIST_BUCKETS: usize = 64;

/// Bin count requested from `CorrelatedHistogram2D::from_pairs` on both axes.
const CORR_BINS: usize = 16;
/// Produces `n` deterministic 8-byte values derived from `seed`.
///
/// A 64-bit counter starts at `seed` plus one increment and is advanced by the
/// same increment before every output. Each counter value is scrambled with
/// two xor-shift/multiply rounds and a final xor-shift (the SplitMix64 output
/// function) and emitted as little-endian bytes. Every scrambling step is
/// invertible, so different counter values always yield different outputs.
fn synthetic_distinct_values(n: u64, seed: u64) -> Vec<[u8; 8]> {
    const INCREMENT: u64 = 0x9E37_79B9_7F4A_7C15;

    let mut values = Vec::with_capacity(n as usize);
    let mut counter = seed.wrapping_add(INCREMENT);
    for _ in 0..n {
        counter = counter.wrapping_add(INCREMENT);
        let round1 = (counter ^ (counter >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
        let round2 = (round1 ^ (round1 >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
        values.push((round2 ^ (round2 >> 31)).to_le_bytes());
    }
    values
}
/// Builds one HLL sketch and returns its bincode-serialized size in bytes.
///
/// At most 20,000 synthetic values (seeded by `seed`) are added, regardless of
/// how large `distinct` is. Panics if the sketch cannot be constructed with
/// `HLL_PRECISION` or if serialization fails.
fn build_hll(distinct: u64, seed: u64) -> usize {
    let inserted = distinct.min(20_000);
    let mut sketch = HllSketch::new(HLL_PRECISION).unwrap();
    synthetic_distinct_values(inserted, seed)
        .iter()
        .for_each(|value| {
            sketch.add(value);
        });
    let encoded = bincode::serialize(&sketch).unwrap();
    encoded.len()
}
/// Builds one Bloom filter and returns its bincode-serialized size in bytes.
///
/// The filter is constructed with `distinct` (at least 1) as its first
/// argument and `BLOOM_FP` as its second, but only up to 20,000 synthetic
/// values are actually inserted. Panics if serialization fails.
fn build_bloom(distinct: u64, seed: u64) -> usize {
    let capacity = distinct.max(1) as usize;
    let inserted = distinct.min(20_000);
    let mut filter = BloomFilter::new(capacity, BLOOM_FP);
    synthetic_distinct_values(inserted, seed)
        .iter()
        .for_each(|value| {
            filter.insert(value);
        });
    let encoded = bincode::serialize(&filter).unwrap();
    encoded.len()
}
/// Builds one equi-depth histogram and returns its bincode-serialized size.
///
/// The input is `min(n, 20_000)` values in `[0, 1000]`, obtained by repeatedly
/// adding a fixed 64-bit increment to `seed` (wrapping) and rescaling the
/// counter. Panics if the histogram cannot be built from those values or if
/// serialization fails.
fn build_equidepth(n: u64, seed: u64) -> usize {
    let count = n.min(20_000) as usize;
    let mut counter = seed;
    let mut values: Vec<f64> = Vec::with_capacity(count);
    for _ in 0..count {
        counter = counter.wrapping_add(0x9E37_79B9_7F4A_7C15);
        values.push((counter as f64) / (u64::MAX as f64) * 1000.0);
    }
    let histogram = EquiDepthHistogram::from_values(&values, HIST_BUCKETS).unwrap();
    let encoded = bincode::serialize(&histogram).unwrap();
    encoded.len()
}
/// Builds one 2-D correlated histogram and returns its bincode-serialized size.
///
/// The input is `min(n, 20_000)` pairs with both coordinates in `[0, 1000]`.
/// Each pair consumes two consecutive steps of a wrapping additive counter
/// that starts at `seed`, so the second coordinate is always one increment
/// ahead of the first. Panics if the histogram cannot be built or if
/// serialization fails.
fn build_corr2d(n: u64, seed: u64) -> usize {
    let count = n.min(20_000) as usize;
    let mut counter = seed;
    let mut pairs: Vec<(f64, f64)> = Vec::with_capacity(count);
    for _ in 0..count {
        counter = counter.wrapping_add(0x9E37_79B9_7F4A_7C15);
        let first = (counter as f64) / (u64::MAX as f64) * 1000.0;
        counter = counter.wrapping_add(0x9E37_79B9_7F4A_7C15);
        let second = (counter as f64) / (u64::MAX as f64) * 1000.0;
        pairs.push((first, second));
    }
    let histogram = CorrelatedHistogram2D::from_pairs(&pairs, CORR_BINS, CORR_BINS).unwrap();
    let encoded = bincode::serialize(&histogram).unwrap();
    encoded.len()
}
/// Builds one Count-Min sketch and returns its bincode-serialized size in bytes.
///
/// The sketch uses the library's default configuration; up to 20,000 synthetic
/// values (seeded by `seed`) are each added with a count of 1. Panics if
/// serialization fails.
fn build_cms(distinct: u64, seed: u64) -> usize {
    let inserted = distinct.min(20_000);
    let mut sketch = CountMinSketch::with_defaults();
    synthetic_distinct_values(inserted, seed)
        .iter()
        .for_each(|value| {
            sketch.add(value, 1);
        });
    let encoded = bincode::serialize(&sketch).unwrap();
    encoded.len()
}
/// Returns the sample mean of `samples` together with a 95% percentile
/// bootstrap interval for it, as `(mean, lo, hi)`.
///
/// `iters` resamples of `samples.len()` draws (with replacement) are taken,
/// the mean of each resample is recorded, and `lo` / `hi` are the entries at
/// positions `floor(0.025 * iters)` and `floor(0.975 * iters)` of the sorted
/// resample means. The result is fully determined by `seed`.
///
/// Degenerate inputs do not panic: if `samples` is empty or `iters` is zero
/// there is nothing to resample and `(mean, mean, mean)` is returned (all NaN
/// for empty input). NaN samples are tolerated by sorting with a total order.
fn bootstrap_mean_ci(samples: &[f64], iters: usize, seed: u64) -> (f64, f64, f64) {
    const INCREMENT: u64 = 0x9E37_79B9_7F4A_7C15;

    let n = samples.len();
    let mean: f64 = samples.iter().sum::<f64>() / (n as f64);
    if n == 0 || iters == 0 {
        return (mean, mean, mean);
    }

    let mut state = seed;
    let mut means = Vec::with_capacity(iters);
    for _ in 0..iters {
        let mut acc = 0.0;
        for _ in 0..n {
            state = state.wrapping_add(INCREMENT);
            // Scramble the counter before reducing it modulo `n`. The raw
            // counter's low bits advance in a fixed short cycle (its lowest
            // two bits step by exactly 1 per draw), which would stratify the
            // draws by index and make the interval artificially narrow.
            let mut z = state;
            z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
            z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
            z ^= z >> 31;
            // Reduce in u64 so the index does not depend on the width of usize.
            let idx = (z % n as u64) as usize;
            acc += samples[idx];
        }
        means.push(acc / (n as f64));
    }

    // `total_cmp` gives NaN a defined position instead of panicking.
    means.sort_unstable_by(f64::total_cmp);
    // The clamp only guards against float rounding for enormous `iters`.
    let percentile = |q: f64| means[((iters as f64 * q) as usize).min(iters - 1)];
    (mean, percentile(0.025), percentile(0.975))
}
/// Runs the memory profile and prints one CSV row per (fixture, scale) cell.
///
/// For every fixture and scale factor, the full set of sketches is built
/// `REPLICATES` times with different seeds. The per-sketch-kind byte columns
/// come from the first replicate only; the total is reported as the mean over
/// all replicates with a bootstrap interval, plus its percentage of the raw
/// table size. When the `SAMKHYA_RAW_OUT` environment variable is set, the
/// per-replicate totals are additionally written to that path as JSON.
fn main() {
    let raw_path = env::var("SAMKHYA_RAW_OUT").ok();
    let mut raw_cells: Vec<String> = Vec::new();

    println!(
        "fixture,scale,rows,bytes_per_row,raw_bytes,hll_total,bloom_total,equidepth_total,\
         cms_total,corr2d_total,samkhya_total_mean,samkhya_total_lo,samkhya_total_hi,pct_of_raw"
    );

    for fx in FIXTURES {
        for &sf in SCALES {
            let rows = BASE_ROWS * sf;
            let raw_bytes = rows * (fx.bytes_per_row as u64);
            let distinct = ((fx.distinct_per_col as f64) * (sf as f64).sqrt()) as u64;
            // Histogram inputs are capped; the builders apply the same cap again.
            let capped_rows = rows.min(20_000);

            // Sizes in the order: hll, bloom, equidepth, cms, corr2d.
            let mut first_rep_sizes = [0usize; 5];
            let mut totals: Vec<f64> = Vec::with_capacity(REPLICATES);

            for rep in 0..REPLICATES {
                let seed = 0xA5A5_5A5A_DEAD_BEEFu64.wrapping_add(rep as u64);

                // Each sketch kind gets its own seed offset so that no two
                // sketches in a replicate share a seed.
                let sizes: [usize; 5] = [
                    (0..fx.n_cols)
                        .map(|c| build_hll(distinct, seed.wrapping_add(c as u64)))
                        .sum::<usize>(),
                    (0..fx.n_cols)
                        .map(|c| build_bloom(distinct, seed.wrapping_add(0x1000 + c as u64)))
                        .sum::<usize>(),
                    (0..fx.numeric_cols)
                        .map(|c| build_equidepth(capped_rows, seed.wrapping_add(0x2000 + c as u64)))
                        .sum::<usize>(),
                    (0..fx.high_card_cols)
                        .map(|c| build_cms(distinct, seed.wrapping_add(0x3000 + c as u64)))
                        .sum::<usize>(),
                    (0..fx.fk_pairs)
                        .map(|c| build_corr2d(capped_rows, seed.wrapping_add(0x4000 + c as u64)))
                        .sum::<usize>(),
                ];

                let total: usize = sizes.iter().sum();
                totals.push(total as f64);
                if rep == 0 {
                    first_rep_sizes = sizes;
                }
            }

            let [hll_total, bloom_total, equidepth_total, cms_total, corr2d_total] =
                first_rep_sizes;
            let (mean, lo, hi) = bootstrap_mean_ci(&totals, 2000, 0xCAFE);
            let pct = mean / (raw_bytes as f64) * 100.0;
            println!(
                "{},{},{},{},{},{},{},{},{},{},{:.1},{:.1},{:.1},{:.6}",
                fx.name,
                sf,
                rows,
                fx.bytes_per_row,
                raw_bytes,
                hll_total,
                bloom_total,
                equidepth_total,
                cms_total,
                corr2d_total,
                mean,
                lo,
                hi,
                pct
            );

            // Only assemble the JSON cell when it will actually be written.
            if raw_path.is_some() {
                let formatted: Vec<String> = totals.iter().map(|v| format!("{v:.1}")).collect();
                let totals_vec = formatted.join(",");
                raw_cells.push(format!(
                    "{{\"fixture\":\"{}\",\"scale\":{},\"rows\":{},\"replicates\":{},\"samkhya_total_bytes\":[{totals_vec}]}}",
                    fx.name, sf, rows, REPLICATES
                ));
            }
        }
    }

    if let Some(path) = raw_path {
        let cells = raw_cells.join(",");
        let body = format!(
            "{{\"benchmark\":\"memory_profile\",\"seed_scheme\":\"0xA5A5_5A5A_DEAD_BEEF + replicate\",\"cells\":[{}]}}",
            cells
        );
        let mut f = File::create(&path).expect("create raw output file");
        f.write_all(body.as_bytes()).expect("write raw output");
        eprintln!("# raw per-trial vectors written to {path}");
    }
}