use std::hash::{DefaultHasher, Hash, Hasher};
use std::time::Instant;
use exaloglog::ExaLogLog;
use xxhash_rust::xxh3::xxh3_64;
fn splitmix64(mut x: u64) -> u64 {
x = x.wrapping_add(0x9E37_79B9_7F4A_7C15);
x = (x ^ (x >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
x = (x ^ (x >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
x ^ (x >> 31)
}
fn main() {
let p = 12;
let n = 1_000_000usize;
let inputs: Vec<Vec<u8>> = (0..n)
.map(|i| format!("user_{i:09}@example.com").into_bytes())
.collect();
println!(
"Hashing-cost comparison at p = {p}, n = {n}.\n\
Each row: total time, throughput, distinct-count estimate.\n"
);
println!(
"{:<32} {:>10} {:>15} {:>12}",
"path", "time", "M ins/s", "estimate"
);
println!("{}", "-".repeat(72));
{
let start = Instant::now();
let mut s = ExaLogLog::new_dense(p);
for input in &inputs {
s.add(input);
}
let elapsed = start.elapsed();
let est = s.estimate();
let mps = n as f64 / elapsed.as_secs_f64() / 1e6;
println!(
"{:<32} {:>9.3}s {:>14.1}M {:>12.0}",
"add(&T) [SipHash13 default]",
elapsed.as_secs_f64(),
mps,
est
);
let mut h = DefaultHasher::new();
inputs[0].hash(&mut h);
let _ = h.finish();
}
{
let start = Instant::now();
let mut s = ExaLogLog::new_dense(p);
for input in &inputs {
s.add_hash(xxh3_64(input));
}
let elapsed = start.elapsed();
let est = s.estimate();
let mps = n as f64 / elapsed.as_secs_f64() / 1e6;
println!(
"{:<32} {:>9.3}s {:>14.1}M {:>12.0}",
"add_hash(xxh3_64(bytes))",
elapsed.as_secs_f64(),
mps,
est
);
}
{
let start = Instant::now();
let mut s = ExaLogLog::new_dense(p);
for i in 0..n as u64 {
s.add_hash(splitmix64(i));
}
let elapsed = start.elapsed();
let est = s.estimate();
let mps = n as f64 / elapsed.as_secs_f64() / 1e6;
println!(
"{:<32} {:>9.3}s {:>14.1}M {:>12.0}",
"add_hash(splitmix64) baseline",
elapsed.as_secs_f64(),
mps,
est
);
}
println!();
println!("Notes:");
println!("- SipHash13 is cryptographically motivated and slower per byte.");
println!("- xxh3 is non-cryptographic and SIMD-accelerated for small inputs.");
println!("- All three produce correct estimates within the sketch's RMSE.");
println!("- The gap between rows 1 and 3 is the cost of hashing.");
}