Skip to main content

subms_hyperloglog/
lib.rs

1//! HyperLogLog cardinality estimator.
2//!
3//! Precision `p` in [4, 18]. The register array is `m = 2^p` bytes. Hashes
4//! split into a `p`-bit register index (top bits) and a leading-zero count on
5//! the remaining bits. Each register stores the max observed count + 1.
6//! Estimate is the harmonic mean of `2^r` over the registers, scaled by an
7//! `alpha_m` correction. Linear-counting kicks in at low cardinality where
8//! the raw estimator is biased.
9//!
10//! ```
11//! use subms_hyperloglog::HyperLogLog;
12//! let mut hll = HyperLogLog::new(14);
13//! for i in 0..10_000 { hll.add(&format!("key{i}")); }
14//! let est = hll.estimate();
15//! assert!(est > 9_000.0 && est < 11_000.0, "10k distinct within 10%, got {est}");
16//! ```
17
18pub(crate) const FNV_OFFSET: u64 = 0xcbf29ce484222325;
19pub(crate) const FNV_PRIME: u64 = 0x100000001b3;
20
21#[cfg(feature = "serde")]
22use serde::{Deserialize, Serialize};
23
24#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
25pub struct HyperLogLog {
26    p: u32,
27    m: u32,
28    pub(crate) registers: Vec<u8>,
29    alpha: f64,
30}
31
32impl HyperLogLog {
33    /// New empty HLL at the given precision. `precision` is clamped to
34    /// `[4, 18]`; 14 gives ~16k registers / ~16 KB / ~1% std error.
35    pub fn new(precision: u32) -> Self {
36        let p = precision.clamp(4, 18);
37        let m = 1u32 << p;
38        let alpha = alpha_m(m);
39        Self {
40            p,
41            m,
42            registers: vec![0u8; m as usize],
43            alpha,
44        }
45    }
46
47    pub fn precision(&self) -> u32 {
48        self.p
49    }
50    pub fn register_count(&self) -> u32 {
51        self.m
52    }
53
54    /// Record a key.
55    pub fn add(&mut self, key: &str) {
56        let h = fnv1a64(key.as_bytes());
57        let idx = (h >> (64 - self.p)) as usize;
58        // Use the remaining 64-p bits for the leading-zero count. Place a
59        // sentinel 1 at the bottom so leading_zeros never exceeds (64-p).
60        let w = (h << self.p) | (1u64 << (self.p - 1));
61        let r = (w.leading_zeros() + 1) as u8;
62        if r > self.registers[idx] {
63            self.registers[idx] = r;
64        }
65    }
66
67    /// Estimate distinct count.
68    pub fn estimate(&self) -> f64 {
69        let m = self.m as f64;
70        // Sum 2^-r_i, harmonic-style.
71        let sum: f64 = self.registers.iter().map(|&r| 2f64.powi(-(r as i32))).sum();
72        let raw = self.alpha * m * m / sum;
73
74        // Linear counting at low cardinality. Threshold per Flajolet et al.
75        let zeros = self.registers.iter().filter(|&&r| r == 0).count();
76        if zeros > 0 && raw <= 2.5 * m {
77            -m * (zeros as f64 / m).ln()
78        } else {
79            raw
80        }
81    }
82
83    /// Merge another HLL of the same precision. Element-wise max over registers.
84    pub fn merge(&mut self, other: &Self) -> Result<(), &'static str> {
85        if self.p != other.p {
86            return Err("precision mismatch");
87        }
88        for (a, b) in self.registers.iter_mut().zip(other.registers.iter()) {
89            if *b > *a {
90                *a = *b;
91            }
92        }
93        Ok(())
94    }
95}
96
97impl HyperLogLog {
98    /// Access the underlying register array. Used by the feature
99    /// modules (sparse promotion, union/intersect) without making the
100    /// field itself public.
101    #[inline]
102    #[allow(dead_code)] // used only by the feature modules (off under default features)
103    pub(crate) fn registers(&self) -> &[u8] {
104        &self.registers
105    }
106}
107
108pub(crate) fn alpha_m(m: u32) -> f64 {
109    match m {
110        16 => 0.673,
111        32 => 0.697,
112        64 => 0.709,
113        _ => 0.7213 / (1.0 + 1.079 / m as f64),
114    }
115}
116
117pub(crate) fn fnv1a64(bytes: &[u8]) -> u64 {
118    let mut h = FNV_OFFSET;
119    for &b in bytes {
120        h ^= b as u64;
121        h = h.wrapping_mul(FNV_PRIME);
122    }
123    // FNV-1a's bit distribution is poor for short sequential keys; pipe
124    // through a SplitMix64 finalizer so HLL's bucket index and leading-zero
125    // extraction see a well-mixed value.
126    h ^= h >> 30;
127    h = h.wrapping_mul(0xbf58476d1ce4e5b9);
128    h ^= h >> 27;
129    h = h.wrapping_mul(0x94d049bb133111eb);
130    h ^= h >> 31;
131    h
132}
133
134#[cfg(feature = "harness")]
135pub mod recipe;
136
137// Opt-in feature modules. Base HLL is zero-dep + std-only; each opt-in
138// adds a focused capability under its own Cargo feature.
139#[cfg(any(feature = "sparse", feature = "union-intersect"))]
140pub mod features;
141
142#[cfg(feature = "sparse")]
143pub use features::sparse::SparseHyperLogLog;
144#[cfg(feature = "union-intersect")]
145pub use features::union_intersect::{estimate_intersect, estimate_union};