exaloglog 0.12.0

ExaLogLog: space-efficient approximate distinct counting (Ertl 2024). 43% smaller than HyperLogLog with the same estimation error.
Documentation
//! Demonstrates the cost of hashing in real workloads, and how to bypass
//! the standard library's `DefaultHasher` (SipHash13) by hashing inputs
//! yourself with a faster function and feeding the resulting `u64` to
//! [`exaloglog::ExaLogLog::add_hash`].
//!
//! Compare three paths over a million inserts of `&[u8]` values:
//!
//! 1. `add(&T)` using DefaultHasher (SipHash13) — the convenient default.
//! 2. `add_hash(xxh3_64(bytes))` using xxhash-rust — typically 5-10×
//!    faster on small inputs.
//! 3. `add_hash(splitmix64(seed))` for a synthetic baseline that
//!    measures sketch work alone.
//!
//! Run with:
//!
//! ```sh
//! cargo run --release --example fast_hashing
//! ```

use std::hash::{DefaultHasher, Hash, Hasher};
use std::time::Instant;

use exaloglog::ExaLogLog;
use xxhash_rust::xxh3::xxh3_64;

fn splitmix64(mut x: u64) -> u64 {
    x = x.wrapping_add(0x9E37_79B9_7F4A_7C15);
    x = (x ^ (x >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
    x = (x ^ (x >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
    x ^ (x >> 31)
}

fn main() {
    let p = 12;
    let n = 1_000_000usize;

    // Pre-compute byte inputs so we measure hashing+sketch work, not
    // string formatting.
    let inputs: Vec<Vec<u8>> = (0..n)
        .map(|i| format!("user_{i:09}@example.com").into_bytes())
        .collect();

    println!(
        "Hashing-cost comparison at p = {p}, n = {n}.\n\
         Each row: total time, throughput, distinct-count estimate.\n"
    );
    println!(
        "{:<32} {:>10} {:>15} {:>12}",
        "path", "time", "M ins/s", "estimate"
    );
    println!("{}", "-".repeat(72));

    // 1. add(&T) — DefaultHasher (SipHash13)
    {
        let start = Instant::now();
        let mut s = ExaLogLog::new_dense(p);
        for input in &inputs {
            s.add(input);
        }
        let elapsed = start.elapsed();
        let est = s.estimate();
        let mps = n as f64 / elapsed.as_secs_f64() / 1e6;
        println!(
            "{:<32} {:>9.3}s {:>14.1}M {:>12.0}",
            "add(&T)  [SipHash13 default]",
            elapsed.as_secs_f64(),
            mps,
            est
        );
        // Sanity check that DefaultHasher produces same hash via Hash trait.
        let mut h = DefaultHasher::new();
        inputs[0].hash(&mut h);
        let _ = h.finish();
    }

    // 2. add_hash(xxh3_64(bytes))
    {
        let start = Instant::now();
        let mut s = ExaLogLog::new_dense(p);
        for input in &inputs {
            s.add_hash(xxh3_64(input));
        }
        let elapsed = start.elapsed();
        let est = s.estimate();
        let mps = n as f64 / elapsed.as_secs_f64() / 1e6;
        println!(
            "{:<32} {:>9.3}s {:>14.1}M {:>12.0}",
            "add_hash(xxh3_64(bytes))",
            elapsed.as_secs_f64(),
            mps,
            est
        );
    }

    // 3. Synthetic splitmix64 baseline (no input hashing).
    {
        let start = Instant::now();
        let mut s = ExaLogLog::new_dense(p);
        for i in 0..n as u64 {
            s.add_hash(splitmix64(i));
        }
        let elapsed = start.elapsed();
        let est = s.estimate();
        let mps = n as f64 / elapsed.as_secs_f64() / 1e6;
        println!(
            "{:<32} {:>9.3}s {:>14.1}M {:>12.0}",
            "add_hash(splitmix64) baseline",
            elapsed.as_secs_f64(),
            mps,
            est
        );
    }

    println!();
    println!("Notes:");
    println!("- SipHash13 is cryptographically motivated and slower per byte.");
    println!("- xxh3 is non-cryptographic and SIMD-accelerated for small inputs.");
    println!("- All three produce correct estimates within the sketch's RMSE.");
    println!("- The gap between rows 1 and 3 is the cost of hashing.");
}