Skip to main content

probabilistic_collections/
hyperloglog.rs

1//! Space-efficient probabilistic data structure for estimating the number of distinct items in a
2//! multiset.
3
4use crate::SipHasherBuilder;
5#[cfg(feature = "serde")]
6use serde_crate::{Deserialize, Serialize};
7use std::borrow::Borrow;
8use std::cmp;
9use std::f64;
10use std::fmt::Debug;
11use std::hash::BuildHasher;
12use std::hash::{Hash, Hasher};
13use std::marker::PhantomData;
14
15/// A space-efficient probabilitic data structure to count the number of distinct items in a
16/// multiset.
17///
18/// A `HyperLogLog<T>` uses the observation that the cardinality of a multiset of uniformly
19/// distributed items can be estimated by calculating the maximum number of leading zeros in the
20/// hash of each item in the multiset. It also buckets each item in a register and takes the
21/// harmonic mean of the count in order to reduce the variance. Finally, it uses linear counting
22/// for small cardinalities and small correction for large cardinalities.
23///
24/// # Examples
25///
26/// ```
27/// # use std::f64::EPSILON;
28/// use probabilistic_collections::hyperloglog::HyperLogLog;
29/// use probabilistic_collections::SipHasherBuilder;
30///
31/// let mut hhl = HyperLogLog::<u32>::with_hasher(0.1, SipHasherBuilder::from_seed(0, 0));
32///
33/// assert!(hhl.is_empty());
34///
35/// for key in &[0, 1, 2, 0, 1, 2] {
36///     hhl.insert(&key);
37/// }
38///
39/// assert!((hhl.len().round() - 3.0).abs() < EPSILON);
40/// ```
41#[cfg_attr(
42    feature = "serde",
43    derive(Deserialize, Serialize),
44    serde(crate = "serde_crate")
45)]
46pub struct HyperLogLog<T, B = SipHasherBuilder> {
47    alpha: f64,
48    p: usize,
49    registers: Vec<u8>,
50    hash_builder: B,
51    _marker: PhantomData<T>,
52}
53
54impl<T> HyperLogLog<T> {
55    /// Constructs a new, empty `HyperLogLog<T>` with a given error probability.
56    ///
57    /// # Panics
58    ///
59    /// Panics if `error_probability` is not in (0, 1).
60    ///
61    /// # Examples
62    ///
63    /// ```
64    /// use probabilistic_collections::hyperloglog::HyperLogLog;
65    ///
66    /// let hhl = HyperLogLog::<u32>::new(0.1);
67    /// ```
68    pub fn new(error_probability: f64) -> Self {
69        Self::with_hasher(error_probability, SipHasherBuilder::from_entropy())
70    }
71}
72
73impl<T, B> HyperLogLog<T, B>
74where
75    B: BuildHasher,
76{
77    fn get_alpha(p: usize) -> f64 {
78        assert!(4 <= p && p <= 16);
79        match p {
80            4 => 0.673,
81            5 => 0.697,
82            6 => 0.709,
83            p => 0.7213 / (1.0 + 1.079 / f64::from(1 << p)),
84        }
85    }
86
87    /// Constructs a new, empty `HyperLogLog<T>` with a given error probability and hasher builder.
88    ///
89    /// # Panics
90    ///
91    /// Panics if `error_probability` is not in (0, 1).
92    ///
93    /// # Examples
94    ///
95    /// ```
96    /// use probabilistic_collections::hyperloglog::HyperLogLog;
97    /// use probabilistic_collections::SipHasherBuilder;
98    ///
99    /// let hhl = HyperLogLog::<u32, _>::with_hasher(0.1, SipHasherBuilder::from_entropy());
100    /// ```
101    pub fn with_hasher(error_probability: f64, hash_builder: B) -> Self {
102        assert!(0.0 < error_probability && error_probability < 1.0);
103        let p = (1.04 / error_probability).powi(2).ln().ceil() as usize;
104        let alpha = Self::get_alpha(p);
105        let registers_len = 1 << p;
106        HyperLogLog {
107            alpha,
108            p,
109            registers: vec![0; registers_len],
110            hash_builder,
111            _marker: PhantomData,
112        }
113    }
114
115    /// Inserts an item into the `HyperLogLog<T>`.
116    ///
117    /// # Examples
118    ///
119    /// ```
120    /// use probabilistic_collections::hyperloglog::HyperLogLog;
121    ///
122    /// let mut hhl = HyperLogLog::<u32>::new(0.1);
123    ///
124    /// hhl.insert(&0);
125    /// ```
126    pub fn insert<U>(&mut self, item: &U)
127    where
128        T: Borrow<U>,
129        U: Hash + ?Sized,
130    {
131        let mut hasher = self.hash_builder.build_hasher();
132        item.hash(&mut hasher);
133        let hash = hasher.finish();
134        let register_index = hash as usize & (self.registers.len() - 1);
135        let value = (!hash >> self.p).trailing_zeros() as u8;
136        self.registers[register_index] = cmp::max(self.registers[register_index], value + 1);
137    }
138
139    /// Merges `self` with `other`.
140    ///
141    /// # Panics
142    ///
143    /// Panics if the error probability of `self` is not equal to the error probability of `other`
144    /// or if the hash builder of `self` is not equal to the hash builder of `other`.
145    ///
146    /// # Examples
147    ///
148    /// ```
149    /// # use std::f64::EPSILON;
150    /// use probabilistic_collections::hyperloglog::HyperLogLog;
151    /// use probabilistic_collections::SipHasherBuilder;
152    ///
153    /// let mut hhl1 = HyperLogLog::<u32>::with_hasher(0.1, SipHasherBuilder::from_seed(0, 0));
154    /// hhl1.insert(&0);
155    /// hhl1.insert(&1);
156    ///
157    /// let mut hhl2 = HyperLogLog::<u32>::with_hasher(0.1, *hhl1.hasher());
158    /// hhl2.insert(&0);
159    /// hhl2.insert(&2);
160    ///
161    /// hhl1.merge(&hhl2);
162    ///
163    /// assert!((hhl1.len().round() - 3.0).abs() < EPSILON);
164    /// ```
165    pub fn merge(&mut self, other: &HyperLogLog<T, B>)
166    where
167        B: Debug + PartialEq,
168    {
169        assert_eq!(self.p, other.p);
170        assert_eq!(self.hash_builder, other.hash_builder);
171
172        for (index, value) in self.registers.iter_mut().enumerate() {
173            *value = cmp::max(*value, other.registers[index]);
174        }
175    }
176
177    fn get_estimate(&self) -> f64 {
178        let len = self.registers.len() as f64;
179        1.0 / (self.alpha
180            * len
181            * len
182            * self
183                .registers
184                .iter()
185                .map(|value| 1.0 / 2.0f64.powi(i32::from(*value)))
186                .sum::<f64>())
187    }
188
189    /// Returns the estimated number of distinct items in the `HyperLogLog<T>`.
190    ///
191    /// # Examples
192    ///
193    /// ```
194    /// # use std::f64::EPSILON;
195    /// use probabilistic_collections::hyperloglog::HyperLogLog;
196    ///
197    /// let mut hhl = HyperLogLog::<u32>::new(0.1);
198    /// assert!((hhl.len().round() - 0.0).abs() < EPSILON);
199    ///
200    /// hhl.insert(&1);
201    /// assert!((hhl.len().round() - 1.0).abs() < EPSILON);
202    /// ```
203    pub fn len(&self) -> f64 {
204        let len = self.registers.len() as f64;
205        match self.get_estimate() {
206            x if x <= 2.5 * len => {
207                let zeros = self
208                    .registers
209                    .iter()
210                    .map(|value| if *value == 0 { 1 } else { 0 })
211                    .sum::<u64>();
212                len * (len / zeros as f64).ln()
213            }
214            x if x <= 1.0 / 3.0 * 2.0f64.powi(32) => x,
215            x => -(2.0f64.powi(32)) * (1.0 - x / 2.0f64.powi(32)).ln(),
216        }
217    }
218
219    /// Returns `true` is the `HyperLogLog<T>` is empty.
220    ///
221    /// # Examples
222    ///
223    /// ```
224    /// # use std::f64::EPSILON;
225    /// use probabilistic_collections::hyperloglog::HyperLogLog;
226    ///
227    /// let mut hhl = HyperLogLog::<u32>::new(0.1);
228    /// assert!(hhl.is_empty());
229    ///
230    /// hhl.insert(&1);
231    /// assert!(!hhl.is_empty());
232    /// ```
233    pub fn is_empty(&self) -> bool {
234        self.len() < f64::EPSILON
235    }
236
237    /// Clears the `HyperLogLog<T>`, removing all items.
238    ///
239    /// # Examples
240    ///
241    /// ```
242    /// # use std::f64::EPSILON;
243    /// use probabilistic_collections::hyperloglog::HyperLogLog;
244    ///
245    /// let mut hhl = HyperLogLog::<u32>::new(0.1);
246    /// assert!(hhl.is_empty());
247    ///
248    /// hhl.insert(&1);
249    /// assert!(!hhl.is_empty());
250    ///
251    /// hhl.clear();
252    /// assert!(hhl.is_empty());
253    /// ```
254    pub fn clear(&mut self) {
255        for value in &mut self.registers {
256            *value = 0;
257        }
258    }
259
260    /// Returns a reference to the HyperLogLog's hasher builder.
261    ///
262    /// # Examples
263    ///
264    /// ```
265    /// use probabilistic_collections::hyperloglog::HyperLogLog;
266    ///
267    /// let hhl = HyperLogLog::<String>::new(0.1);
268    /// let hasher = hhl.hasher();
269    /// ```
270    pub fn hasher(&self) -> &B {
271        &self.hash_builder
272    }
273}
274
275#[cfg(test)]
276mod tests {
277    use super::HyperLogLog;
278    use crate::util::tests::hash_builder_1;
279    use std::f64::EPSILON;
280
281    #[test]
282    #[should_panic]
283    fn test_panic_new_invalid_error_probability() {
284        let _hhl = HyperLogLog::<u32>::new(0.0);
285    }
286
287    #[test]
288    #[should_panic]
289    fn test_panic_new_mismatch_error_iprobability() {
290        let mut hhl1 = HyperLogLog::<u32>::new(0.1);
291        let hhl2 = HyperLogLog::<u32>::new(0.2);
292        hhl1.merge(&hhl2);
293    }
294
295    #[test]
296    fn test_simple() {
297        let mut hhl = HyperLogLog::<u32>::with_hasher(0.01, hash_builder_1());
298        assert!(hhl.is_empty());
299        assert!(hhl.len() < EPSILON);
300
301        for key in &[0, 1, 2, 0, 1, 2] {
302            hhl.insert(&key);
303        }
304
305        assert!(!hhl.is_empty());
306        assert!((hhl.len().round() - 3.0).abs() < EPSILON);
307
308        hhl.clear();
309        assert!(hhl.is_empty());
310    }
311
312    #[test]
313    fn test_merge() {
314        let mut hhl1 = HyperLogLog::<u32>::with_hasher(0.01, hash_builder_1());
315
316        for key in &[0, 1, 2, 0, 1, 2] {
317            hhl1.insert(&key);
318        }
319
320        let mut hhl2 = HyperLogLog::<u32>::with_hasher(0.01, *hhl1.hasher());
321
322        for key in &[0, 1, 3, 0, 1, 3] {
323            hhl2.insert(&key);
324        }
325
326        assert!((hhl1.len().round() - 3.0).abs() < EPSILON);
327        assert!((hhl2.len().round() - 3.0).abs() < EPSILON);
328
329        hhl1.merge(&hhl2);
330        assert!((hhl1.len().round() - 4.0).abs() < EPSILON);
331    }
332
333    #[cfg(feature = "serde")]
334    #[test]
335    fn test_ser_de() {
336        let mut hhl = HyperLogLog::<u32>::new(0.01);
337        for key in &[0, 1, 2, 0, 1, 2] {
338            hhl.insert(&key);
339        }
340
341        let serialized_hhl = bincode::serialize(&hhl).unwrap();
342        let de_hhl: HyperLogLog<u32> = bincode::deserialize(&serialized_hhl).unwrap();
343
344        assert!((hhl.len() - de_hhl.len()).abs() < EPSILON);
345        assert!((hhl.alpha - de_hhl.alpha).abs() < EPSILON);
346        assert_eq!(hhl.p, de_hhl.p);
347        assert_eq!(hhl.registers, de_hhl.registers);
348        assert_eq!(hhl.hasher(), de_hhl.hasher());
349    }
350}