use std::hash::{Hash, Hasher};
struct Fnv1aHasher {
state: u64,
}
impl Fnv1aHasher {
const OFFSET_BASIS: u64 = 0xcbf29ce484222325;
const PRIME: u64 = 0x100000001b3;
fn new_with_seed(seed: u64) -> Self {
Self {
state: Self::OFFSET_BASIS ^ seed,
}
}
}
impl Default for Fnv1aHasher {
fn default() -> Self {
Self {
state: Self::OFFSET_BASIS,
}
}
}
impl Hasher for Fnv1aHasher {
fn finish(&self) -> u64 {
self.state
}
fn write(&mut self, bytes: &[u8]) {
for &b in bytes {
self.state ^= u64::from(b);
self.state = self.state.wrapping_mul(Self::PRIME);
}
}
}
fn mix64(mut x: u64) -> u64 {
x ^= x >> 33;
x = x.wrapping_mul(0xff51afd7ed558ccd);
x ^= x >> 33;
x = x.wrapping_mul(0xc4ceb9fe1a85ec53);
x ^= x >> 33;
x
}
fn fnv1a_hash<T: Hash>(value: &T, seed: u64) -> u64 {
let mut h = Fnv1aHasher::new_with_seed(seed);
value.hash(&mut h);
mix64(h.finish())
}
#[derive(Debug, Clone)]
pub struct HyperLogLog {
registers: Vec<u8>,
p: u8,
m: usize,
}
impl HyperLogLog {
pub fn new(p: u8) -> Self {
let p = p.clamp(4, 18);
let m = 1usize << p;
Self {
registers: vec![0u8; m],
p,
m,
}
}
pub fn precision(&self) -> u8 {
self.p
}
pub fn num_registers(&self) -> usize {
self.m
}
pub fn add_raw(&mut self, hash: u64) {
let index = (hash >> (64 - self.p)) as usize;
let w = hash << self.p;
let rho = (w.leading_zeros() + 1) as u8;
if rho > self.registers[index] {
self.registers[index] = rho;
}
}
pub fn add<T: Hash>(&mut self, value: &T) {
let h = fnv1a_hash(value, 0);
self.add_raw(h);
}
pub fn estimate(&self) -> f64 {
let m = self.m as f64;
let alpha = alpha_m(self.m);
let sum: f64 = self
.registers
.iter()
.map(|&r| 2.0_f64.powi(-(r as i32)))
.sum();
let raw_estimate = alpha * m * m / sum;
let zeros = self.registers.iter().filter(|&&r| r == 0).count() as f64;
if raw_estimate <= 2.5 * m && zeros > 0.0 {
return m * (m / zeros).ln();
}
let threshold = (1u64 << 32) as f64 / 30.0;
if raw_estimate > threshold {
return -(2.0_f64.powi(32)) * (1.0 - raw_estimate / 2.0_f64.powi(32)).ln();
}
raw_estimate
}
pub fn merge(&mut self, other: &HyperLogLog) -> Result<(), String> {
if self.p != other.p {
return Err(format!(
"Cannot merge HyperLogLog with p={} into one with p={}",
other.p, self.p
));
}
for (r, &o) in self.registers.iter_mut().zip(other.registers.iter()) {
if o > *r {
*r = o;
}
}
Ok(())
}
pub fn clear(&mut self) {
self.registers.iter_mut().for_each(|r| *r = 0);
}
}
pub fn alpha_m(m: usize) -> f64 {
match m {
16 => 0.673,
32 => 0.697,
64 => 0.709,
_ => {
0.7213 / (1.0 + 1.079 / m as f64)
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_new_clamps_p() {
let hll = HyperLogLog::new(2); assert_eq!(hll.precision(), 4);
let hll2 = HyperLogLog::new(20); assert_eq!(hll2.precision(), 18);
}
#[test]
fn test_single_element_estimate() {
let mut hll = HyperLogLog::new(12);
hll.add(&42u64);
let est = hll.estimate();
assert!(
est > 0.5 && est < 2.0,
"Expected ≈1, got {est}"
);
}
#[test]
fn test_cardinality_10k_within_5_percent() {
let mut hll = HyperLogLog::new(12);
for i in 0u64..10_000 {
hll.add(&i);
}
let est = hll.estimate();
let rel_err = (est - 10_000.0).abs() / 10_000.0;
assert!(
rel_err < 0.05,
"Relative error {rel_err:.3} exceeds 5% (estimate={est:.0})"
);
}
#[test]
fn test_merge_union_of_disjoint_sets() {
let mut hll_a = HyperLogLog::new(14);
let mut hll_b = HyperLogLog::new(14);
for i in 0u64..5_000 {
hll_a.add(&i);
}
for i in 5_000u64..10_000 {
hll_b.add(&i);
}
hll_a.merge(&hll_b).expect("merge should succeed");
let est = hll_a.estimate();
let rel_err = (est - 10_000.0).abs() / 10_000.0;
assert!(
rel_err < 0.05,
"Merged estimate relative error {rel_err:.3} exceeds 5% (estimate={est:.0})"
);
}
#[test]
fn test_merge_incompatible_precision_returns_error() {
let mut hll_a = HyperLogLog::new(12);
let hll_b = HyperLogLog::new(8);
assert!(hll_a.merge(&hll_b).is_err());
}
#[test]
fn test_p4_vs_p12_relative_error() {
let n = 1_000u64;
let mut hll_p4 = HyperLogLog::new(4);
let mut hll_p12 = HyperLogLog::new(12);
for i in 0..n {
hll_p4.add(&i);
hll_p12.add(&i);
}
let err4 = (hll_p4.estimate() - n as f64).abs() / n as f64;
let err12 = (hll_p12.estimate() - n as f64).abs() / n as f64;
assert!(
err12 <= err4 || err12 < 0.1,
"Expected p=12 more accurate: err4={err4:.3}, err12={err12:.3}"
);
}
}