bloom_lib/hyperloglog.rs
1//! A HyperLogLog estimator for set cardinality.
2
3use core::{hash::BuildHasher, marker::PhantomData};
4
5use alloc::{vec, vec::Vec};
6
7use crate::{hash::DefaultHashBuilder, Error};
8
9/// Smallest supported precision (16 registers).
10const MIN_PRECISION: u8 = 4;
11
12/// Largest supported precision (262,144 registers, ~256 KiB).
13const MAX_PRECISION: u8 = 18;
14
15/// Estimates the number of *distinct* items in a stream in fixed, tiny memory.
16///
17/// HyperLogLog counts unique items without storing them. It hashes each item,
18/// uses the leading bits to pick one of `2^p` registers, and records the longest
19/// run of leading zeros seen in the remaining bits — a quantity that grows
20/// logarithmically with the number of distinct items hitting that register.
21/// Combining the registers with a bias-corrected harmonic mean yields a
22/// cardinality estimate with a standard error of about `1.04 / sqrt(2^p)`.
23///
24/// Memory is `2^p` bytes regardless of how many items are inserted, so a
25/// precision-14 estimator counts billions of distinct items in 16 KiB with a
26/// typical error under 1%.
27///
28/// The estimator is generic over the item type `T` and a
29/// [`BuildHasher`](core::hash::BuildHasher) `S`, defaulting to the deterministic
30/// [`DefaultHashBuilder`](crate::hash::DefaultHashBuilder).
31///
32/// # Examples
33///
34/// ```
35/// use bloom_lib::HyperLogLog;
36///
37/// let mut hll = HyperLogLog::new(14).unwrap();
38/// for i in 0..100_000u32 {
39/// hll.insert(&i);
40/// }
41///
42/// // Within a couple of percent of the true cardinality of 100,000.
43/// let estimate = hll.count();
44/// assert!((97_000..=103_000).contains(&estimate));
45/// ```
46#[derive(Debug, Clone)]
47#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
48pub struct HyperLogLog<T: ?Sized, S = DefaultHashBuilder> {
49 registers: Vec<u8>,
50 precision: u8,
51 #[cfg_attr(feature = "serde", serde(skip))]
52 hasher: S,
53 #[cfg_attr(feature = "serde", serde(skip))]
54 _marker: PhantomData<fn(&T)>,
55}
56
57impl<T: ?Sized> HyperLogLog<T, DefaultHashBuilder> {
58 /// Creates an estimator with the given `precision`, using the default
59 /// hasher.
60 ///
61 /// The estimator uses `2^precision` one-byte registers and has a standard
62 /// error of roughly `1.04 / sqrt(2^precision)`. Precision 14 (16 KiB, ~0.8%
63 /// error) is a common production choice.
64 ///
65 /// # Parameters
66 ///
67 /// - `precision`: the log2 of the register count. Must be in `4..=18`.
68 ///
69 /// # Errors
70 ///
71 /// Returns [`Error::InvalidParameter`] if `precision` is outside `4..=18`.
72 ///
73 /// # Examples
74 ///
75 /// ```
76 /// use bloom_lib::HyperLogLog;
77 ///
78 /// let hll = HyperLogLog::<&str>::new(12).unwrap();
79 /// assert_eq!(hll.precision(), 12);
80 /// assert!(hll.is_empty());
81 /// ```
82 pub fn new(precision: u8) -> Result<Self, Error> {
83 Self::with_hasher(precision, DefaultHashBuilder)
84 }
85
86 /// Creates an estimator sized for a target relative `error`, using the
87 /// default hasher.
88 ///
89 /// The precision is chosen as the smallest value whose standard error does
90 /// not exceed `error`, then clamped to the supported `4..=18` range.
91 ///
92 /// # Parameters
93 ///
94 /// - `error`: the desired standard error, in `(0.0, 1.0)`. For example
95 /// `0.01` targets ~1% error.
96 ///
97 /// # Errors
98 ///
99 /// Returns [`Error::InvalidParameter`] if `error` is not a finite value in
100 /// `(0.0, 1.0)`.
101 ///
102 /// # Examples
103 ///
104 /// ```
105 /// use bloom_lib::HyperLogLog;
106 ///
107 /// let hll = HyperLogLog::<&str>::with_error_rate(0.01).unwrap();
108 /// // ~1% error needs about 2^14 registers.
109 /// assert_eq!(hll.precision(), 14);
110 /// ```
111 pub fn with_error_rate(error: f64) -> Result<Self, Error> {
112 if !(error.is_finite() && error > 0.0 && error < 1.0) {
113 return Err(Error::InvalidParameter {
114 param: "error",
115 reason: "must be a finite value in the open interval (0.0, 1.0)",
116 });
117 }
118 let registers = libm::pow(1.04 / error, 2.0);
119 let raw = libm::ceil(libm::log2(registers)) as i64;
120 let precision = raw.clamp(i64::from(MIN_PRECISION), i64::from(MAX_PRECISION)) as u8;
121 Self::new(precision)
122 }
123}
124
125impl<T: ?Sized, S: BuildHasher> HyperLogLog<T, S> {
126 /// Creates an estimator with the given `precision` and a caller-supplied
127 /// hasher.
128 ///
129 /// # Errors
130 ///
131 /// Returns [`Error::InvalidParameter`] if `precision` is outside `4..=18`.
132 ///
133 /// # Examples
134 ///
135 /// ```
136 /// # #[cfg(feature = "std")] {
137 /// use std::collections::hash_map::RandomState;
138 /// use bloom_lib::HyperLogLog;
139 ///
140 /// let hll: HyperLogLog<&str, RandomState> =
141 /// HyperLogLog::with_hasher(14, RandomState::new()).unwrap();
142 /// # }
143 /// ```
144 pub fn with_hasher(precision: u8, hasher: S) -> Result<Self, Error> {
145 if !(MIN_PRECISION..=MAX_PRECISION).contains(&precision) {
146 return Err(Error::InvalidParameter {
147 param: "precision",
148 reason: "must be in the range 4..=18",
149 });
150 }
151 let num_registers = 1usize << precision;
152 Ok(Self {
153 registers: vec![0u8; num_registers],
154 precision,
155 hasher,
156 _marker: PhantomData,
157 })
158 }
159
160 /// Adds `item` to the estimator.
161 ///
162 /// Inserting an item already counted has no effect on the estimate, so the
163 /// operation is idempotent with respect to distinct values.
164 ///
165 /// # Examples
166 ///
167 /// ```
168 /// use bloom_lib::HyperLogLog;
169 ///
170 /// let mut hll = HyperLogLog::new(14).unwrap();
171 /// hll.insert("first");
172 /// hll.insert("first"); // no additional effect
173 /// hll.insert("second");
174 /// assert_eq!(hll.count(), 2);
175 /// ```
176 pub fn insert(&mut self, item: &T)
177 where
178 T: core::hash::Hash,
179 {
180 let hash = self.hasher.hash_one(item);
181 let p = u32::from(self.precision);
182 // Top `p` bits select the register.
183 let index = (hash >> (64 - p)) as usize;
184 // Rank = 1 + leading zeros of the remaining bits. Setting the low `p`
185 // bits ensures an all-zero remainder yields the maximum rank `64-p+1`
186 // rather than counting the shifted-in zeros.
187 let remainder = (hash << p) | ((1u64 << p) - 1);
188 let rank = (remainder.leading_zeros() + 1) as u8;
189 if rank > self.registers[index] {
190 self.registers[index] = rank;
191 }
192 }
193
194 /// Returns the estimated number of distinct items inserted.
195 ///
196 /// Uses the bias-corrected HyperLogLog estimator, falling back to linear
197 /// counting at low cardinalities where the raw estimate is unreliable. The
198 /// result is approximate, with the standard error implied by the precision.
199 ///
200 /// # Examples
201 ///
202 /// ```
203 /// use bloom_lib::HyperLogLog;
204 ///
205 /// let mut hll = HyperLogLog::new(14).unwrap();
206 /// assert_eq!(hll.count(), 0);
207 /// for i in 0..1_000u32 {
208 /// hll.insert(&i);
209 /// }
210 /// let estimate = hll.count();
211 /// assert!((960..=1_040).contains(&estimate));
212 /// ```
213 #[must_use]
214 pub fn count(&self) -> u64 {
215 let m = self.registers.len() as f64;
216 let mut sum = 0.0f64;
217 let mut zeros = 0u64;
218 for ®ister in &self.registers {
219 sum += libm::exp2(-f64::from(register));
220 if register == 0 {
221 zeros += 1;
222 }
223 }
224
225 let raw = alpha(self.registers.len()) * m * m / sum;
226
227 // Linear counting is more accurate than the raw estimate when the table
228 // is sparsely populated.
229 if raw <= 2.5 * m && zeros > 0 {
230 let linear = m * libm::log(m / zeros as f64);
231 return libm::round(linear) as u64;
232 }
233
234 libm::round(raw) as u64
235 }
236
237 /// Returns `true` if no items have been inserted.
238 #[inline]
239 #[must_use]
240 pub fn is_empty(&self) -> bool {
241 self.registers.iter().all(|®ister| register == 0)
242 }
243
244 /// The configured precision (log2 of the register count).
245 #[inline]
246 #[must_use]
247 pub fn precision(&self) -> u8 {
248 self.precision
249 }
250
251 /// Resets the estimator to empty, retaining the allocation.
252 ///
253 /// # Examples
254 ///
255 /// ```
256 /// use bloom_lib::HyperLogLog;
257 ///
258 /// let mut hll = HyperLogLog::new(14).unwrap();
259 /// hll.insert("x");
260 /// hll.clear();
261 /// assert!(hll.is_empty());
262 /// assert_eq!(hll.count(), 0);
263 /// ```
264 pub fn clear(&mut self) {
265 self.registers.iter_mut().for_each(|register| *register = 0);
266 }
267
268 /// Merges `other` into `self` by taking the register-wise maximum.
269 ///
270 /// The result estimates the cardinality of the *union* of the two streams.
271 /// Both estimators must share the same precision.
272 ///
273 /// # Errors
274 ///
275 /// Returns [`Error::IncompatibleParameters`] if the precisions differ.
276 ///
277 /// # Examples
278 ///
279 /// ```
280 /// use bloom_lib::HyperLogLog;
281 ///
282 /// let mut a = HyperLogLog::new(14).unwrap();
283 /// let mut b = HyperLogLog::new(14).unwrap();
284 /// for i in 0..1_000u32 {
285 /// a.insert(&i);
286 /// }
287 /// for i in 500..1_500u32 {
288 /// b.insert(&i);
289 /// }
290 ///
291 /// a.merge(&b).unwrap();
292 /// // Union of [0,1000) and [500,1500) is [0,1500): ~1,500 distinct.
293 /// let estimate = a.count();
294 /// assert!((1_400..=1_600).contains(&estimate));
295 /// ```
296 pub fn merge(&mut self, other: &Self) -> Result<(), Error> {
297 if self.precision != other.precision {
298 return Err(Error::IncompatibleParameters);
299 }
300 for (dst, src) in self.registers.iter_mut().zip(other.registers.iter()) {
301 *dst = (*dst).max(*src);
302 }
303 Ok(())
304 }
305}
306
307/// The HyperLogLog bias-correction constant `alpha_m` for a register count `m`.
308fn alpha(m: usize) -> f64 {
309 match m {
310 16 => 0.673,
311 32 => 0.697,
312 64 => 0.709,
313 _ => 0.7213 / (1.0 + 1.079 / m as f64),
314 }
315}
316
317#[cfg(test)]
318mod tests {
319 #![allow(clippy::unwrap_used)]
320
321 use super::*;
322
323 #[test]
324 fn test_new_rejects_out_of_range_precision() {
325 assert!(matches!(
326 HyperLogLog::<&str>::new(3),
327 Err(Error::InvalidParameter { .. })
328 ));
329 assert!(matches!(
330 HyperLogLog::<&str>::new(19),
331 Err(Error::InvalidParameter { .. })
332 ));
333 }
334
335 #[test]
336 fn test_with_error_rate_picks_precision() {
337 let hll = HyperLogLog::<&str>::with_error_rate(0.01).unwrap();
338 assert_eq!(hll.precision(), 14);
339 // A tiny target is clamped to the maximum precision.
340 let tight = HyperLogLog::<&str>::with_error_rate(0.0001).unwrap();
341 assert_eq!(tight.precision(), MAX_PRECISION);
342 }
343
344 #[test]
345 fn test_empty_counts_zero() {
346 let hll = HyperLogLog::<u32>::new(14).unwrap();
347 assert!(hll.is_empty());
348 assert_eq!(hll.count(), 0);
349 }
350
351 #[test]
352 fn test_small_cardinality_is_exact_ish() {
353 let mut hll = HyperLogLog::new(14).unwrap();
354 for i in 0..10u32 {
355 hll.insert(&i);
356 }
357 // Linear counting is very accurate at tiny cardinalities.
358 let estimate = hll.count();
359 assert!(
360 (9..=11).contains(&estimate),
361 "estimate {estimate} off for n=10"
362 );
363 }
364
365 #[test]
366 fn test_large_cardinality_within_error() {
367 let mut hll = HyperLogLog::new(14).unwrap();
368 let n = 100_000u32;
369 for i in 0..n {
370 hll.insert(&i);
371 }
372 let estimate = hll.count();
373 let error = (estimate as f64 - f64::from(n)).abs() / f64::from(n);
374 // Precision 14 has ~0.8% standard error; allow a 3% envelope.
375 assert!(
376 error < 0.03,
377 "relative error {error} too high (est {estimate})"
378 );
379 }
380
381 #[test]
382 fn test_idempotent_inserts() {
383 let mut hll = HyperLogLog::new(14).unwrap();
384 for _ in 0..1_000 {
385 hll.insert("same");
386 }
387 assert_eq!(hll.count(), 1);
388 }
389
390 #[test]
391 fn test_clear() {
392 let mut hll = HyperLogLog::new(14).unwrap();
393 for i in 0..100u32 {
394 hll.insert(&i);
395 }
396 hll.clear();
397 assert!(hll.is_empty());
398 assert_eq!(hll.count(), 0);
399 }
400
401 #[test]
402 fn test_merge_estimates_union() {
403 let mut a = HyperLogLog::new(14).unwrap();
404 let mut b = HyperLogLog::new(14).unwrap();
405 for i in 0..1_000u32 {
406 a.insert(&i);
407 }
408 for i in 500..1_500u32 {
409 b.insert(&i);
410 }
411 a.merge(&b).unwrap();
412 let estimate = a.count();
413 assert!(
414 (1_400..=1_600).contains(&estimate),
415 "union estimate {estimate}"
416 );
417 }
418
419 #[test]
420 fn test_merge_rejects_incompatible() {
421 let mut a = HyperLogLog::<u32>::new(14).unwrap();
422 let b = HyperLogLog::<u32>::new(12).unwrap();
423 assert_eq!(a.merge(&b), Err(Error::IncompatibleParameters));
424 }
425}