Skip to main content

bloom_lib/
hyperloglog.rs

1//! A HyperLogLog estimator for set cardinality.
2
3use core::{hash::BuildHasher, marker::PhantomData};
4
5use alloc::{vec, vec::Vec};
6
7use crate::{hash::DefaultHashBuilder, Error};
8
9/// Smallest supported precision (16 registers).
10const MIN_PRECISION: u8 = 4;
11
12/// Largest supported precision (262,144 registers, ~256 KiB).
13const MAX_PRECISION: u8 = 18;
14
15/// Estimates the number of *distinct* items in a stream in fixed, tiny memory.
16///
17/// HyperLogLog counts unique items without storing them. It hashes each item,
18/// uses the leading bits to pick one of `2^p` registers, and records the longest
19/// run of leading zeros seen in the remaining bits — a quantity that grows
20/// logarithmically with the number of distinct items hitting that register.
21/// Combining the registers with a bias-corrected harmonic mean yields a
22/// cardinality estimate with a standard error of about `1.04 / sqrt(2^p)`.
23///
24/// Memory is `2^p` bytes regardless of how many items are inserted, so a
25/// precision-14 estimator counts billions of distinct items in 16 KiB with a
26/// typical error under 1%.
27///
28/// The estimator is generic over the item type `T` and a
29/// [`BuildHasher`](core::hash::BuildHasher) `S`, defaulting to the deterministic
30/// [`DefaultHashBuilder`](crate::hash::DefaultHashBuilder).
31///
32/// # Examples
33///
34/// ```
35/// use bloom_lib::HyperLogLog;
36///
37/// let mut hll = HyperLogLog::new(14).unwrap();
38/// for i in 0..100_000u32 {
39///     hll.insert(&i);
40/// }
41///
42/// // Within a couple of percent of the true cardinality of 100,000.
43/// let estimate = hll.count();
44/// assert!((97_000..=103_000).contains(&estimate));
45/// ```
46#[derive(Debug, Clone)]
47#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
48pub struct HyperLogLog<T: ?Sized, S = DefaultHashBuilder> {
49    registers: Vec<u8>,
50    precision: u8,
51    #[cfg_attr(feature = "serde", serde(skip))]
52    hasher: S,
53    #[cfg_attr(feature = "serde", serde(skip))]
54    _marker: PhantomData<fn(&T)>,
55}
56
57impl<T: ?Sized> HyperLogLog<T, DefaultHashBuilder> {
58    /// Creates an estimator with the given `precision`, using the default
59    /// hasher.
60    ///
61    /// The estimator uses `2^precision` one-byte registers and has a standard
62    /// error of roughly `1.04 / sqrt(2^precision)`. Precision 14 (16 KiB, ~0.8%
63    /// error) is a common production choice.
64    ///
65    /// # Parameters
66    ///
67    /// - `precision`: the log2 of the register count. Must be in `4..=18`.
68    ///
69    /// # Errors
70    ///
71    /// Returns [`Error::InvalidParameter`] if `precision` is outside `4..=18`.
72    ///
73    /// # Examples
74    ///
75    /// ```
76    /// use bloom_lib::HyperLogLog;
77    ///
78    /// let hll = HyperLogLog::<&str>::new(12).unwrap();
79    /// assert_eq!(hll.precision(), 12);
80    /// assert!(hll.is_empty());
81    /// ```
82    pub fn new(precision: u8) -> Result<Self, Error> {
83        Self::with_hasher(precision, DefaultHashBuilder)
84    }
85
86    /// Creates an estimator sized for a target relative `error`, using the
87    /// default hasher.
88    ///
89    /// The precision is chosen as the smallest value whose standard error does
90    /// not exceed `error`, then clamped to the supported `4..=18` range.
91    ///
92    /// # Parameters
93    ///
94    /// - `error`: the desired standard error, in `(0.0, 1.0)`. For example
95    ///   `0.01` targets ~1% error.
96    ///
97    /// # Errors
98    ///
99    /// Returns [`Error::InvalidParameter`] if `error` is not a finite value in
100    /// `(0.0, 1.0)`.
101    ///
102    /// # Examples
103    ///
104    /// ```
105    /// use bloom_lib::HyperLogLog;
106    ///
107    /// let hll = HyperLogLog::<&str>::with_error_rate(0.01).unwrap();
108    /// // ~1% error needs about 2^14 registers.
109    /// assert_eq!(hll.precision(), 14);
110    /// ```
111    pub fn with_error_rate(error: f64) -> Result<Self, Error> {
112        if !(error.is_finite() && error > 0.0 && error < 1.0) {
113            return Err(Error::InvalidParameter {
114                param: "error",
115                reason: "must be a finite value in the open interval (0.0, 1.0)",
116            });
117        }
118        let registers = libm::pow(1.04 / error, 2.0);
119        let raw = libm::ceil(libm::log2(registers)) as i64;
120        let precision = raw.clamp(i64::from(MIN_PRECISION), i64::from(MAX_PRECISION)) as u8;
121        Self::new(precision)
122    }
123}
124
125impl<T: ?Sized, S: BuildHasher> HyperLogLog<T, S> {
126    /// Creates an estimator with the given `precision` and a caller-supplied
127    /// hasher.
128    ///
129    /// # Errors
130    ///
131    /// Returns [`Error::InvalidParameter`] if `precision` is outside `4..=18`.
132    ///
133    /// # Examples
134    ///
135    /// ```
136    /// # #[cfg(feature = "std")] {
137    /// use std::collections::hash_map::RandomState;
138    /// use bloom_lib::HyperLogLog;
139    ///
140    /// let hll: HyperLogLog<&str, RandomState> =
141    ///     HyperLogLog::with_hasher(14, RandomState::new()).unwrap();
142    /// # }
143    /// ```
144    pub fn with_hasher(precision: u8, hasher: S) -> Result<Self, Error> {
145        if !(MIN_PRECISION..=MAX_PRECISION).contains(&precision) {
146            return Err(Error::InvalidParameter {
147                param: "precision",
148                reason: "must be in the range 4..=18",
149            });
150        }
151        let num_registers = 1usize << precision;
152        Ok(Self {
153            registers: vec![0u8; num_registers],
154            precision,
155            hasher,
156            _marker: PhantomData,
157        })
158    }
159
160    /// Adds `item` to the estimator.
161    ///
162    /// Inserting an item already counted has no effect on the estimate, so the
163    /// operation is idempotent with respect to distinct values.
164    ///
165    /// # Examples
166    ///
167    /// ```
168    /// use bloom_lib::HyperLogLog;
169    ///
170    /// let mut hll = HyperLogLog::new(14).unwrap();
171    /// hll.insert("first");
172    /// hll.insert("first"); // no additional effect
173    /// hll.insert("second");
174    /// assert_eq!(hll.count(), 2);
175    /// ```
176    pub fn insert(&mut self, item: &T)
177    where
178        T: core::hash::Hash,
179    {
180        let hash = self.hasher.hash_one(item);
181        let p = u32::from(self.precision);
182        // Top `p` bits select the register.
183        let index = (hash >> (64 - p)) as usize;
184        // Rank = 1 + leading zeros of the remaining bits. Setting the low `p`
185        // bits ensures an all-zero remainder yields the maximum rank `64-p+1`
186        // rather than counting the shifted-in zeros.
187        let remainder = (hash << p) | ((1u64 << p) - 1);
188        let rank = (remainder.leading_zeros() + 1) as u8;
189        if rank > self.registers[index] {
190            self.registers[index] = rank;
191        }
192    }
193
194    /// Returns the estimated number of distinct items inserted.
195    ///
196    /// Uses the bias-corrected HyperLogLog estimator, falling back to linear
197    /// counting at low cardinalities where the raw estimate is unreliable. The
198    /// result is approximate, with the standard error implied by the precision.
199    ///
200    /// # Examples
201    ///
202    /// ```
203    /// use bloom_lib::HyperLogLog;
204    ///
205    /// let mut hll = HyperLogLog::new(14).unwrap();
206    /// assert_eq!(hll.count(), 0);
207    /// for i in 0..1_000u32 {
208    ///     hll.insert(&i);
209    /// }
210    /// let estimate = hll.count();
211    /// assert!((960..=1_040).contains(&estimate));
212    /// ```
213    #[must_use]
214    pub fn count(&self) -> u64 {
215        let m = self.registers.len() as f64;
216        let mut sum = 0.0f64;
217        let mut zeros = 0u64;
218        for &register in &self.registers {
219            sum += libm::exp2(-f64::from(register));
220            if register == 0 {
221                zeros += 1;
222            }
223        }
224
225        let raw = alpha(self.registers.len()) * m * m / sum;
226
227        // Linear counting is more accurate than the raw estimate when the table
228        // is sparsely populated.
229        if raw <= 2.5 * m && zeros > 0 {
230            let linear = m * libm::log(m / zeros as f64);
231            return libm::round(linear) as u64;
232        }
233
234        libm::round(raw) as u64
235    }
236
237    /// Returns `true` if no items have been inserted.
238    #[inline]
239    #[must_use]
240    pub fn is_empty(&self) -> bool {
241        self.registers.iter().all(|&register| register == 0)
242    }
243
244    /// The configured precision (log2 of the register count).
245    #[inline]
246    #[must_use]
247    pub fn precision(&self) -> u8 {
248        self.precision
249    }
250
251    /// Resets the estimator to empty, retaining the allocation.
252    ///
253    /// # Examples
254    ///
255    /// ```
256    /// use bloom_lib::HyperLogLog;
257    ///
258    /// let mut hll = HyperLogLog::new(14).unwrap();
259    /// hll.insert("x");
260    /// hll.clear();
261    /// assert!(hll.is_empty());
262    /// assert_eq!(hll.count(), 0);
263    /// ```
264    pub fn clear(&mut self) {
265        self.registers.iter_mut().for_each(|register| *register = 0);
266    }
267
268    /// Merges `other` into `self` by taking the register-wise maximum.
269    ///
270    /// The result estimates the cardinality of the *union* of the two streams.
271    /// Both estimators must share the same precision.
272    ///
273    /// # Errors
274    ///
275    /// Returns [`Error::IncompatibleParameters`] if the precisions differ.
276    ///
277    /// # Examples
278    ///
279    /// ```
280    /// use bloom_lib::HyperLogLog;
281    ///
282    /// let mut a = HyperLogLog::new(14).unwrap();
283    /// let mut b = HyperLogLog::new(14).unwrap();
284    /// for i in 0..1_000u32 {
285    ///     a.insert(&i);
286    /// }
287    /// for i in 500..1_500u32 {
288    ///     b.insert(&i);
289    /// }
290    ///
291    /// a.merge(&b).unwrap();
292    /// // Union of [0,1000) and [500,1500) is [0,1500): ~1,500 distinct.
293    /// let estimate = a.count();
294    /// assert!((1_400..=1_600).contains(&estimate));
295    /// ```
296    pub fn merge(&mut self, other: &Self) -> Result<(), Error> {
297        if self.precision != other.precision {
298            return Err(Error::IncompatibleParameters);
299        }
300        for (dst, src) in self.registers.iter_mut().zip(other.registers.iter()) {
301            *dst = (*dst).max(*src);
302        }
303        Ok(())
304    }
305}
306
307/// The HyperLogLog bias-correction constant `alpha_m` for a register count `m`.
308fn alpha(m: usize) -> f64 {
309    match m {
310        16 => 0.673,
311        32 => 0.697,
312        64 => 0.709,
313        _ => 0.7213 / (1.0 + 1.079 / m as f64),
314    }
315}
316
317#[cfg(test)]
318mod tests {
319    #![allow(clippy::unwrap_used)]
320
321    use super::*;
322
323    #[test]
324    fn test_new_rejects_out_of_range_precision() {
325        assert!(matches!(
326            HyperLogLog::<&str>::new(3),
327            Err(Error::InvalidParameter { .. })
328        ));
329        assert!(matches!(
330            HyperLogLog::<&str>::new(19),
331            Err(Error::InvalidParameter { .. })
332        ));
333    }
334
335    #[test]
336    fn test_with_error_rate_picks_precision() {
337        let hll = HyperLogLog::<&str>::with_error_rate(0.01).unwrap();
338        assert_eq!(hll.precision(), 14);
339        // A tiny target is clamped to the maximum precision.
340        let tight = HyperLogLog::<&str>::with_error_rate(0.0001).unwrap();
341        assert_eq!(tight.precision(), MAX_PRECISION);
342    }
343
344    #[test]
345    fn test_empty_counts_zero() {
346        let hll = HyperLogLog::<u32>::new(14).unwrap();
347        assert!(hll.is_empty());
348        assert_eq!(hll.count(), 0);
349    }
350
351    #[test]
352    fn test_small_cardinality_is_exact_ish() {
353        let mut hll = HyperLogLog::new(14).unwrap();
354        for i in 0..10u32 {
355            hll.insert(&i);
356        }
357        // Linear counting is very accurate at tiny cardinalities.
358        let estimate = hll.count();
359        assert!(
360            (9..=11).contains(&estimate),
361            "estimate {estimate} off for n=10"
362        );
363    }
364
365    #[test]
366    fn test_large_cardinality_within_error() {
367        let mut hll = HyperLogLog::new(14).unwrap();
368        let n = 100_000u32;
369        for i in 0..n {
370            hll.insert(&i);
371        }
372        let estimate = hll.count();
373        let error = (estimate as f64 - f64::from(n)).abs() / f64::from(n);
374        // Precision 14 has ~0.8% standard error; allow a 3% envelope.
375        assert!(
376            error < 0.03,
377            "relative error {error} too high (est {estimate})"
378        );
379    }
380
381    #[test]
382    fn test_idempotent_inserts() {
383        let mut hll = HyperLogLog::new(14).unwrap();
384        for _ in 0..1_000 {
385            hll.insert("same");
386        }
387        assert_eq!(hll.count(), 1);
388    }
389
390    #[test]
391    fn test_clear() {
392        let mut hll = HyperLogLog::new(14).unwrap();
393        for i in 0..100u32 {
394            hll.insert(&i);
395        }
396        hll.clear();
397        assert!(hll.is_empty());
398        assert_eq!(hll.count(), 0);
399    }
400
401    #[test]
402    fn test_merge_estimates_union() {
403        let mut a = HyperLogLog::new(14).unwrap();
404        let mut b = HyperLogLog::new(14).unwrap();
405        for i in 0..1_000u32 {
406            a.insert(&i);
407        }
408        for i in 500..1_500u32 {
409            b.insert(&i);
410        }
411        a.merge(&b).unwrap();
412        let estimate = a.count();
413        assert!(
414            (1_400..=1_600).contains(&estimate),
415            "union estimate {estimate}"
416        );
417    }
418
419    #[test]
420    fn test_merge_rejects_incompatible() {
421        let mut a = HyperLogLog::<u32>::new(14).unwrap();
422        let b = HyperLogLog::<u32>::new(12).unwrap();
423        assert_eq!(a.merge(&b), Err(Error::IncompatibleParameters));
424    }
425}