probabilistic_collections/hyperloglog.rs
1//! Space-efficient probabilistic data structure for estimating the number of distinct items in a
2//! multiset.
3
4use crate::SipHasherBuilder;
5#[cfg(feature = "serde")]
6use serde_crate::{Deserialize, Serialize};
7use std::borrow::Borrow;
8use std::cmp;
9use std::f64;
10use std::fmt::Debug;
11use std::hash::BuildHasher;
12use std::hash::{Hash, Hasher};
13use std::marker::PhantomData;
14
15/// A space-efficient probabilitic data structure to count the number of distinct items in a
16/// multiset.
17///
18/// A `HyperLogLog<T>` uses the observation that the cardinality of a multiset of uniformly
19/// distributed items can be estimated by calculating the maximum number of leading zeros in the
20/// hash of each item in the multiset. It also buckets each item in a register and takes the
21/// harmonic mean of the count in order to reduce the variance. Finally, it uses linear counting
22/// for small cardinalities and small correction for large cardinalities.
23///
24/// # Examples
25///
26/// ```
27/// # use std::f64::EPSILON;
28/// use probabilistic_collections::hyperloglog::HyperLogLog;
29/// use probabilistic_collections::SipHasherBuilder;
30///
31/// let mut hhl = HyperLogLog::<u32>::with_hasher(0.1, SipHasherBuilder::from_seed(0, 0));
32///
33/// assert!(hhl.is_empty());
34///
35/// for key in &[0, 1, 2, 0, 1, 2] {
36/// hhl.insert(&key);
37/// }
38///
39/// assert!((hhl.len().round() - 3.0).abs() < EPSILON);
40/// ```
41#[cfg_attr(
42 feature = "serde",
43 derive(Deserialize, Serialize),
44 serde(crate = "serde_crate")
45)]
46pub struct HyperLogLog<T, B = SipHasherBuilder> {
47 alpha: f64,
48 p: usize,
49 registers: Vec<u8>,
50 hash_builder: B,
51 _marker: PhantomData<T>,
52}
53
54impl<T> HyperLogLog<T> {
55 /// Constructs a new, empty `HyperLogLog<T>` with a given error probability.
56 ///
57 /// # Panics
58 ///
59 /// Panics if `error_probability` is not in (0, 1).
60 ///
61 /// # Examples
62 ///
63 /// ```
64 /// use probabilistic_collections::hyperloglog::HyperLogLog;
65 ///
66 /// let hhl = HyperLogLog::<u32>::new(0.1);
67 /// ```
68 pub fn new(error_probability: f64) -> Self {
69 Self::with_hasher(error_probability, SipHasherBuilder::from_entropy())
70 }
71}
72
73impl<T, B> HyperLogLog<T, B>
74where
75 B: BuildHasher,
76{
77 fn get_alpha(p: usize) -> f64 {
78 assert!(4 <= p && p <= 16);
79 match p {
80 4 => 0.673,
81 5 => 0.697,
82 6 => 0.709,
83 p => 0.7213 / (1.0 + 1.079 / f64::from(1 << p)),
84 }
85 }
86
87 /// Constructs a new, empty `HyperLogLog<T>` with a given error probability and hasher builder.
88 ///
89 /// # Panics
90 ///
91 /// Panics if `error_probability` is not in (0, 1).
92 ///
93 /// # Examples
94 ///
95 /// ```
96 /// use probabilistic_collections::hyperloglog::HyperLogLog;
97 /// use probabilistic_collections::SipHasherBuilder;
98 ///
99 /// let hhl = HyperLogLog::<u32, _>::with_hasher(0.1, SipHasherBuilder::from_entropy());
100 /// ```
101 pub fn with_hasher(error_probability: f64, hash_builder: B) -> Self {
102 assert!(0.0 < error_probability && error_probability < 1.0);
103 let p = (1.04 / error_probability).powi(2).ln().ceil() as usize;
104 let alpha = Self::get_alpha(p);
105 let registers_len = 1 << p;
106 HyperLogLog {
107 alpha,
108 p,
109 registers: vec![0; registers_len],
110 hash_builder,
111 _marker: PhantomData,
112 }
113 }
114
115 /// Inserts an item into the `HyperLogLog<T>`.
116 ///
117 /// # Examples
118 ///
119 /// ```
120 /// use probabilistic_collections::hyperloglog::HyperLogLog;
121 ///
122 /// let mut hhl = HyperLogLog::<u32>::new(0.1);
123 ///
124 /// hhl.insert(&0);
125 /// ```
126 pub fn insert<U>(&mut self, item: &U)
127 where
128 T: Borrow<U>,
129 U: Hash + ?Sized,
130 {
131 let mut hasher = self.hash_builder.build_hasher();
132 item.hash(&mut hasher);
133 let hash = hasher.finish();
134 let register_index = hash as usize & (self.registers.len() - 1);
135 let value = (!hash >> self.p).trailing_zeros() as u8;
136 self.registers[register_index] = cmp::max(self.registers[register_index], value + 1);
137 }
138
139 /// Merges `self` with `other`.
140 ///
141 /// # Panics
142 ///
143 /// Panics if the error probability of `self` is not equal to the error probability of `other`
144 /// or if the hash builder of `self` is not equal to the hash builder of `other`.
145 ///
146 /// # Examples
147 ///
148 /// ```
149 /// # use std::f64::EPSILON;
150 /// use probabilistic_collections::hyperloglog::HyperLogLog;
151 /// use probabilistic_collections::SipHasherBuilder;
152 ///
153 /// let mut hhl1 = HyperLogLog::<u32>::with_hasher(0.1, SipHasherBuilder::from_seed(0, 0));
154 /// hhl1.insert(&0);
155 /// hhl1.insert(&1);
156 ///
157 /// let mut hhl2 = HyperLogLog::<u32>::with_hasher(0.1, *hhl1.hasher());
158 /// hhl2.insert(&0);
159 /// hhl2.insert(&2);
160 ///
161 /// hhl1.merge(&hhl2);
162 ///
163 /// assert!((hhl1.len().round() - 3.0).abs() < EPSILON);
164 /// ```
165 pub fn merge(&mut self, other: &HyperLogLog<T, B>)
166 where
167 B: Debug + PartialEq,
168 {
169 assert_eq!(self.p, other.p);
170 assert_eq!(self.hash_builder, other.hash_builder);
171
172 for (index, value) in self.registers.iter_mut().enumerate() {
173 *value = cmp::max(*value, other.registers[index]);
174 }
175 }
176
177 fn get_estimate(&self) -> f64 {
178 let len = self.registers.len() as f64;
179 1.0 / (self.alpha
180 * len
181 * len
182 * self
183 .registers
184 .iter()
185 .map(|value| 1.0 / 2.0f64.powi(i32::from(*value)))
186 .sum::<f64>())
187 }
188
189 /// Returns the estimated number of distinct items in the `HyperLogLog<T>`.
190 ///
191 /// # Examples
192 ///
193 /// ```
194 /// # use std::f64::EPSILON;
195 /// use probabilistic_collections::hyperloglog::HyperLogLog;
196 ///
197 /// let mut hhl = HyperLogLog::<u32>::new(0.1);
198 /// assert!((hhl.len().round() - 0.0).abs() < EPSILON);
199 ///
200 /// hhl.insert(&1);
201 /// assert!((hhl.len().round() - 1.0).abs() < EPSILON);
202 /// ```
203 pub fn len(&self) -> f64 {
204 let len = self.registers.len() as f64;
205 match self.get_estimate() {
206 x if x <= 2.5 * len => {
207 let zeros = self
208 .registers
209 .iter()
210 .map(|value| if *value == 0 { 1 } else { 0 })
211 .sum::<u64>();
212 len * (len / zeros as f64).ln()
213 }
214 x if x <= 1.0 / 3.0 * 2.0f64.powi(32) => x,
215 x => -(2.0f64.powi(32)) * (1.0 - x / 2.0f64.powi(32)).ln(),
216 }
217 }
218
219 /// Returns `true` is the `HyperLogLog<T>` is empty.
220 ///
221 /// # Examples
222 ///
223 /// ```
224 /// # use std::f64::EPSILON;
225 /// use probabilistic_collections::hyperloglog::HyperLogLog;
226 ///
227 /// let mut hhl = HyperLogLog::<u32>::new(0.1);
228 /// assert!(hhl.is_empty());
229 ///
230 /// hhl.insert(&1);
231 /// assert!(!hhl.is_empty());
232 /// ```
233 pub fn is_empty(&self) -> bool {
234 self.len() < f64::EPSILON
235 }
236
237 /// Clears the `HyperLogLog<T>`, removing all items.
238 ///
239 /// # Examples
240 ///
241 /// ```
242 /// # use std::f64::EPSILON;
243 /// use probabilistic_collections::hyperloglog::HyperLogLog;
244 ///
245 /// let mut hhl = HyperLogLog::<u32>::new(0.1);
246 /// assert!(hhl.is_empty());
247 ///
248 /// hhl.insert(&1);
249 /// assert!(!hhl.is_empty());
250 ///
251 /// hhl.clear();
252 /// assert!(hhl.is_empty());
253 /// ```
254 pub fn clear(&mut self) {
255 for value in &mut self.registers {
256 *value = 0;
257 }
258 }
259
260 /// Returns a reference to the HyperLogLog's hasher builder.
261 ///
262 /// # Examples
263 ///
264 /// ```
265 /// use probabilistic_collections::hyperloglog::HyperLogLog;
266 ///
267 /// let hhl = HyperLogLog::<String>::new(0.1);
268 /// let hasher = hhl.hasher();
269 /// ```
270 pub fn hasher(&self) -> &B {
271 &self.hash_builder
272 }
273}
274
275#[cfg(test)]
276mod tests {
277 use super::HyperLogLog;
278 use crate::util::tests::hash_builder_1;
279 use std::f64::EPSILON;
280
281 #[test]
282 #[should_panic]
283 fn test_panic_new_invalid_error_probability() {
284 let _hhl = HyperLogLog::<u32>::new(0.0);
285 }
286
287 #[test]
288 #[should_panic]
289 fn test_panic_new_mismatch_error_iprobability() {
290 let mut hhl1 = HyperLogLog::<u32>::new(0.1);
291 let hhl2 = HyperLogLog::<u32>::new(0.2);
292 hhl1.merge(&hhl2);
293 }
294
295 #[test]
296 fn test_simple() {
297 let mut hhl = HyperLogLog::<u32>::with_hasher(0.01, hash_builder_1());
298 assert!(hhl.is_empty());
299 assert!(hhl.len() < EPSILON);
300
301 for key in &[0, 1, 2, 0, 1, 2] {
302 hhl.insert(&key);
303 }
304
305 assert!(!hhl.is_empty());
306 assert!((hhl.len().round() - 3.0).abs() < EPSILON);
307
308 hhl.clear();
309 assert!(hhl.is_empty());
310 }
311
312 #[test]
313 fn test_merge() {
314 let mut hhl1 = HyperLogLog::<u32>::with_hasher(0.01, hash_builder_1());
315
316 for key in &[0, 1, 2, 0, 1, 2] {
317 hhl1.insert(&key);
318 }
319
320 let mut hhl2 = HyperLogLog::<u32>::with_hasher(0.01, *hhl1.hasher());
321
322 for key in &[0, 1, 3, 0, 1, 3] {
323 hhl2.insert(&key);
324 }
325
326 assert!((hhl1.len().round() - 3.0).abs() < EPSILON);
327 assert!((hhl2.len().round() - 3.0).abs() < EPSILON);
328
329 hhl1.merge(&hhl2);
330 assert!((hhl1.len().round() - 4.0).abs() < EPSILON);
331 }
332
333 #[cfg(feature = "serde")]
334 #[test]
335 fn test_ser_de() {
336 let mut hhl = HyperLogLog::<u32>::new(0.01);
337 for key in &[0, 1, 2, 0, 1, 2] {
338 hhl.insert(&key);
339 }
340
341 let serialized_hhl = bincode::serialize(&hhl).unwrap();
342 let de_hhl: HyperLogLog<u32> = bincode::deserialize(&serialized_hhl).unwrap();
343
344 assert!((hhl.len() - de_hhl.len()).abs() < EPSILON);
345 assert!((hhl.alpha - de_hhl.alpha).abs() < EPSILON);
346 assert_eq!(hhl.p, de_hhl.p);
347 assert_eq!(hhl.registers, de_hhl.registers);
348 assert_eq!(hhl.hasher(), de_hhl.hasher());
349 }
350}