Skip to main content

subms_hyperloglog/
lib.rs

1//! HyperLogLog cardinality estimator.
2//!
3//! Precision `p` in [4, 18]. The register array is `m = 2^p` bytes. Hashes
4//! split into a `p`-bit register index (top bits) and a leading-zero count on
5//! the remaining bits. Each register stores the max observed count + 1.
6//! Estimate is the harmonic mean of `2^r` over the registers, scaled by an
7//! `alpha_m` correction. Linear-counting kicks in at low cardinality where
8//! the raw estimator is biased.
9//!
10//! ```
11//! use subms_hyperloglog::HyperLogLog;
12//! let mut hll = HyperLogLog::new(14);
13//! for i in 0..10_000 { hll.add(&format!("key{i}")); }
14//! let est = hll.estimate();
15//! assert!(est > 9_000.0 && est < 11_000.0, "10k distinct within 10%, got {est}");
16//! ```
17
18const FNV_OFFSET: u64 = 0xcbf29ce484222325;
19const FNV_PRIME: u64 = 0x100000001b3;
20
21pub struct HyperLogLog {
22    p: u32,
23    m: u32,
24    registers: Vec<u8>,
25    alpha: f64,
26}
27
28impl HyperLogLog {
29    /// New empty HLL at the given precision. `precision` is clamped to
30    /// `[4, 18]`; 14 gives ~16k registers / ~16 KB / ~1% std error.
31    pub fn new(precision: u32) -> Self {
32        let p = precision.clamp(4, 18);
33        let m = 1u32 << p;
34        let alpha = alpha_m(m);
35        Self {
36            p,
37            m,
38            registers: vec![0u8; m as usize],
39            alpha,
40        }
41    }
42
43    pub fn precision(&self) -> u32 {
44        self.p
45    }
46    pub fn register_count(&self) -> u32 {
47        self.m
48    }
49
50    /// Record a key.
51    pub fn add(&mut self, key: &str) {
52        let h = fnv1a64(key.as_bytes());
53        let idx = (h >> (64 - self.p)) as usize;
54        // Use the remaining 64-p bits for the leading-zero count. Place a
55        // sentinel 1 at the bottom so leading_zeros never exceeds (64-p).
56        let w = (h << self.p) | (1u64 << (self.p - 1));
57        let r = (w.leading_zeros() + 1) as u8;
58        if r > self.registers[idx] {
59            self.registers[idx] = r;
60        }
61    }
62
63    /// Estimate distinct count.
64    pub fn estimate(&self) -> f64 {
65        let m = self.m as f64;
66        // Sum 2^-r_i, harmonic-style.
67        let sum: f64 = self.registers.iter().map(|&r| 2f64.powi(-(r as i32))).sum();
68        let raw = self.alpha * m * m / sum;
69
70        // Linear counting at low cardinality. Threshold per Flajolet et al.
71        let zeros = self.registers.iter().filter(|&&r| r == 0).count();
72        if zeros > 0 && raw <= 2.5 * m {
73            -m * (zeros as f64 / m).ln()
74        } else {
75            raw
76        }
77    }
78
79    /// Merge another HLL of the same precision. Element-wise max over registers.
80    pub fn merge(&mut self, other: &Self) -> Result<(), &'static str> {
81        if self.p != other.p {
82            return Err("precision mismatch");
83        }
84        for (a, b) in self.registers.iter_mut().zip(other.registers.iter()) {
85            if *b > *a {
86                *a = *b;
87            }
88        }
89        Ok(())
90    }
91}
92
93fn alpha_m(m: u32) -> f64 {
94    match m {
95        16 => 0.673,
96        32 => 0.697,
97        64 => 0.709,
98        _ => 0.7213 / (1.0 + 1.079 / m as f64),
99    }
100}
101
102fn fnv1a64(bytes: &[u8]) -> u64 {
103    let mut h = FNV_OFFSET;
104    for &b in bytes {
105        h ^= b as u64;
106        h = h.wrapping_mul(FNV_PRIME);
107    }
108    // FNV-1a's bit distribution is poor for short sequential keys; pipe
109    // through a SplitMix64 finalizer so HLL's bucket index and leading-zero
110    // extraction see a well-mixed value.
111    h ^= h >> 30;
112    h = h.wrapping_mul(0xbf58476d1ce4e5b9);
113    h ^= h >> 27;
114    h = h.wrapping_mul(0x94d049bb133111eb);
115    h ^= h >> 31;
116    h
117}
118
119#[cfg(feature = "harness")]
120pub mod recipe;