subms_hyperloglog/lib.rs
1//! HyperLogLog cardinality estimator.
2//!
3//! Precision `p` in [4, 18]. The register array is `m = 2^p` bytes. Hashes
4//! split into a `p`-bit register index (top bits) and a leading-zero count on
5//! the remaining bits. Each register stores the max observed count + 1.
6//! Estimate is the harmonic mean of `2^r` over the registers, scaled by an
7//! `alpha_m` correction. Linear-counting kicks in at low cardinality where
8//! the raw estimator is biased.
9//!
10//! ```
11//! use subms_hyperloglog::HyperLogLog;
12//! let mut hll = HyperLogLog::new(14);
13//! for i in 0..10_000 { hll.add(&format!("key{i}")); }
14//! let est = hll.estimate();
15//! assert!(est > 9_000.0 && est < 11_000.0, "10k distinct within 10%, got {est}");
16//! ```
17
18const FNV_OFFSET: u64 = 0xcbf29ce484222325;
19const FNV_PRIME: u64 = 0x100000001b3;
20
21pub struct HyperLogLog {
22 p: u32,
23 m: u32,
24 registers: Vec<u8>,
25 alpha: f64,
26}
27
28impl HyperLogLog {
29 /// New empty HLL at the given precision. `precision` is clamped to
30 /// `[4, 18]`; 14 gives ~16k registers / ~16 KB / ~1% std error.
31 pub fn new(precision: u32) -> Self {
32 let p = precision.clamp(4, 18);
33 let m = 1u32 << p;
34 let alpha = alpha_m(m);
35 Self {
36 p,
37 m,
38 registers: vec![0u8; m as usize],
39 alpha,
40 }
41 }
42
43 pub fn precision(&self) -> u32 {
44 self.p
45 }
46 pub fn register_count(&self) -> u32 {
47 self.m
48 }
49
50 /// Record a key.
51 pub fn add(&mut self, key: &str) {
52 let h = fnv1a64(key.as_bytes());
53 let idx = (h >> (64 - self.p)) as usize;
54 // Use the remaining 64-p bits for the leading-zero count. Place a
55 // sentinel 1 at the bottom so leading_zeros never exceeds (64-p).
56 let w = (h << self.p) | (1u64 << (self.p - 1));
57 let r = (w.leading_zeros() + 1) as u8;
58 if r > self.registers[idx] {
59 self.registers[idx] = r;
60 }
61 }
62
63 /// Estimate distinct count.
64 pub fn estimate(&self) -> f64 {
65 let m = self.m as f64;
66 // Sum 2^-r_i, harmonic-style.
67 let sum: f64 = self.registers.iter().map(|&r| 2f64.powi(-(r as i32))).sum();
68 let raw = self.alpha * m * m / sum;
69
70 // Linear counting at low cardinality. Threshold per Flajolet et al.
71 let zeros = self.registers.iter().filter(|&&r| r == 0).count();
72 if zeros > 0 && raw <= 2.5 * m {
73 -m * (zeros as f64 / m).ln()
74 } else {
75 raw
76 }
77 }
78
79 /// Merge another HLL of the same precision. Element-wise max over registers.
80 pub fn merge(&mut self, other: &Self) -> Result<(), &'static str> {
81 if self.p != other.p {
82 return Err("precision mismatch");
83 }
84 for (a, b) in self.registers.iter_mut().zip(other.registers.iter()) {
85 if *b > *a {
86 *a = *b;
87 }
88 }
89 Ok(())
90 }
91}
92
93fn alpha_m(m: u32) -> f64 {
94 match m {
95 16 => 0.673,
96 32 => 0.697,
97 64 => 0.709,
98 _ => 0.7213 / (1.0 + 1.079 / m as f64),
99 }
100}
101
102fn fnv1a64(bytes: &[u8]) -> u64 {
103 let mut h = FNV_OFFSET;
104 for &b in bytes {
105 h ^= b as u64;
106 h = h.wrapping_mul(FNV_PRIME);
107 }
108 // FNV-1a's bit distribution is poor for short sequential keys; pipe
109 // through a SplitMix64 finalizer so HLL's bucket index and leading-zero
110 // extraction see a well-mixed value.
111 h ^= h >> 30;
112 h = h.wrapping_mul(0xbf58476d1ce4e5b9);
113 h ^= h >> 27;
114 h = h.wrapping_mul(0x94d049bb133111eb);
115 h ^= h >> 31;
116 h
117}
118
119#[cfg(feature = "harness")]
120pub mod recipe;