hyperloglogplus/
hyperloglog.rs

1use core::borrow::Borrow;
2use core::hash::{BuildHasher, Hash, Hasher};
3use core::marker::PhantomData;
4
5use serde::{Deserialize, Serialize};
6
7use crate::common::*;
8use crate::HyperLogLog;
9use crate::HyperLogLogError;
10
11/// Implements the original HyperLogLog algorithm for cardinality estimation.
12///
13/// This implementation is based on the original paper of P. Flajolet et al:
14///
15/// *HyperLogLog: the analysis of a near-optimal cardinality estimation
16/// algorithm.*
17///
18/// - Uses 5-bit registers, packed in a 32-bit unsigned integer. Thus, every
19///   six registers 2 bits are not used.
20/// - Supports serialization/deserialization through `serde`.
21/// - Compiles in a `no_std` environment using a global allocator.
22///
23/// # Examples
24///
25/// ```
26/// use std::collections::hash_map::RandomState;
27/// use hyperloglogplus::{HyperLogLog, HyperLogLogPF};
28///
29/// let mut hll: HyperLogLogPF<u32, _> =
30///     HyperLogLogPF::new(16, RandomState::new()).unwrap();
31///
32/// hll.insert(&12345);
33/// hll.insert(&23456);
34///
35/// assert_eq!(hll.count().trunc() as u32, 2);
36/// ```
37///
38/// # References
39///
40/// - ["HyperLogLog: the analysis of a near-optimal cardinality estimation
41///   algorithm", Philippe Flajolet, Éric Fusy, Olivier Gandouet and Frédéric
42///   Meunier.](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf)
43///
44#[derive(Clone, Debug, Serialize, Deserialize)]
45pub struct HyperLogLogPF<H, B>
46where
47    H: Hash + ?Sized,
48    B: BuildHasher,
49{
50    builder:   B,
51    count:     usize,
52    precision: u8,
53    registers: Registers,
54    phantom:   PhantomData<H>,
55}
56
57impl<H, B> HyperLogLogPF<H, B>
58where
59    H: Hash + ?Sized,
60    B: BuildHasher,
61{
62    // Minimum precision allowed.
63    const MIN_PRECISION: u8 = 4;
64    // Maximum precision allowed.
65    const MAX_PRECISION: u8 = 16;
66
67    /// Creates a new HyperLogLogPF instance.
68    pub fn new(precision: u8, builder: B) -> Result<Self, HyperLogLogError> {
69        // Ensure the specified precision is within bounds.
70        if precision < Self::MIN_PRECISION || precision > Self::MAX_PRECISION {
71            return Err(HyperLogLogError::InvalidPrecision);
72        }
73
74        // Calculate register count based on given precision.
75        let count = Self::register_count(precision);
76
77        Ok(HyperLogLogPF {
78            builder:   builder,
79            count:     count,
80            precision: precision,
81            registers: Registers::with_count(count),
82            phantom:   PhantomData,
83        })
84    }
85
86    /// Merges the `other` HyperLogLogPF instance into `self`.
87    ///
88    /// Both sketches must have the same precision.
89    pub fn merge<S, T>(
90        &mut self,
91        other: &HyperLogLogPF<S, T>,
92    ) -> Result<(), HyperLogLogError>
93    where
94        S: Hash + ?Sized,
95        T: BuildHasher,
96    {
97        if self.precision != other.precision() {
98            return Err(HyperLogLogError::IncompatiblePrecision);
99        }
100
101        for (i, val) in other.registers_iter().enumerate() {
102            self.registers.set_greater(i, val);
103        }
104
105        Ok(())
106    }
107
108    /// Inserts a new value, of any type, to the multiset.
109    pub fn insert_any<R>(&mut self, value: &R)
110    where
111        R: Hash + ?Sized,
112    {
113        self.insert_impl(value);
114    }
115
116    #[inline(always)] // Inserts a new value to the multiset.
117    fn insert_impl<R>(&mut self, value: &R)
118    where
119        R: Hash + ?Sized,
120    {
121        // Create a new hasher.
122        let mut hasher = self.builder.build_hasher();
123        // Calculate the hash.
124        value.hash(&mut hasher);
125        // Drops the higher 32 bits.
126        let mut hash: u32 = hasher.finish() as u32;
127
128        // Calculate the register's index.
129        let index: usize = (hash >> (32 - self.precision)) as usize;
130
131        // Shift left the bits of the index.
132        hash = (hash << self.precision) | (1 << (self.precision - 1));
133
134        // Count leading zeros.
135        let zeros: u32 = 1 + hash.leading_zeros();
136
137        // Update the register with the max leading zeros counts.
138        self.registers.set_greater(index, zeros);
139    }
140
141    #[inline] // Returns the precision of the HyperLogLogPF instance.
142    fn precision(&self) -> u8 {
143        self.precision
144    }
145
146    #[inline] // Returns an iterator to the Registers' values.
147    fn registers_iter(&self) -> impl Iterator<Item = u32> + '_ {
148        self.registers.iter()
149    }
150}
151
152impl<H, B> HyperLogLogCommon for HyperLogLogPF<H, B>
153where
154    H: Hash + ?Sized,
155    B: BuildHasher,
156{
157}
158
159impl<H, B> HyperLogLog<H> for HyperLogLogPF<H, B>
160where
161    H: Hash + ?Sized,
162    B: BuildHasher,
163{
164    /// Adds a new value to the multiset.
165    fn add(&mut self, value: &H) {
166        self.insert_impl(value);
167    }
168
169    /// Inserts a new value to the multiset.
170    fn insert<Q>(&mut self, value: &Q)
171    where
172        H: Borrow<Q>,
173        Q: Hash + ?Sized,
174    {
175        self.insert_impl(value);
176    }
177
178    /// Estimates the cardinality of the multiset.
179    fn count(&mut self) -> f64 {
180        // Calculate the raw estimate.
181        let (mut raw, zeros) = (
182            Self::estimate_raw_plus(self.registers.iter(), self.count),
183            self.registers.zeros(),
184        );
185
186        let two32 = (1u64 << 32) as f64;
187
188        if raw <= 2.5 * self.count as f64 && zeros != 0 {
189            // Apply small range correction.
190            raw = Self::linear_count(self.count, zeros);
191        } else if raw > two32 / 30.0 {
192            // Apply large range correction.
193            raw = -1.0 * two32 * ln(1.0 - raw / two32);
194        }
195
196        raw
197    }
198}
199
200#[cfg(test)]
201mod tests {
202    use super::*;
203
204    use core::hash::{BuildHasher, Hasher};
205    use siphasher::sip::SipHasher13;
206
207    struct PassThroughHasher(u64);
208
209    impl Hasher for PassThroughHasher {
210        #[inline]
211        fn finish(&self) -> u64 {
212            self.0
213        }
214
215        #[inline]
216        fn write(&mut self, _: &[u8]) {}
217
218        #[inline]
219        fn write_u64(&mut self, i: u64) {
220            self.0 = i;
221        }
222    }
223
224    #[derive(Serialize, Deserialize)]
225    struct PassThroughHasherBuilder;
226
227    impl BuildHasher for PassThroughHasherBuilder {
228        type Hasher = PassThroughHasher;
229
230        fn build_hasher(&self) -> Self::Hasher {
231            PassThroughHasher(0)
232        }
233    }
234
235    #[derive(Serialize, Deserialize)]
236    struct DefaultBuildHasher;
237
238    impl BuildHasher for DefaultBuildHasher {
239        type Hasher = SipHasher13;
240
241        fn build_hasher(&self) -> Self::Hasher {
242            SipHasher13::new()
243        }
244    }
245
246    #[test]
247    fn test_insert() {
248        let builder = PassThroughHasherBuilder {};
249
250        let mut hll: HyperLogLogPF<u64, PassThroughHasherBuilder> =
251            HyperLogLogPF::new(16, builder).unwrap();
252
253        hll.insert(&0x0000000000010fff);
254
255        assert_eq!(hll.registers.get(1), 5);
256
257        hll.insert(&0x000000000002ffff);
258
259        assert_eq!(hll.registers.get(2), 1);
260
261        hll.insert(&0x0000000000030000);
262
263        assert_eq!(hll.registers.get(3), 17);
264
265        hll.insert(&0x0000000000030001);
266
267        assert_eq!(hll.registers.get(3), 17);
268
269        hll.insert(&0x00000000ff037000);
270
271        assert_eq!(hll.registers.get(0xff03), 2);
272
273        hll.insert(&0x00000000ff030800);
274
275        assert_eq!(hll.registers.get(0xff03), 5);
276
277        let builder = PassThroughHasherBuilder {};
278
279        let mut hll: HyperLogLogPF<u64, PassThroughHasherBuilder> =
280            HyperLogLogPF::new(4, builder).unwrap();
281
282        hll.insert(&0x000000001fffffff);
283        assert_eq!(hll.registers.get(1), 1);
284
285        hll.insert(&0x00000000ffffffff);
286        assert_eq!(hll.registers.get(0xf), 1);
287
288        hll.insert(&0x0000000000ffffff);
289        assert_eq!(hll.registers.get(0), 5);
290    }
291
292    #[test]
293    fn test_count() {
294        let builder = PassThroughHasherBuilder {};
295
296        let mut hll: HyperLogLogPF<u64, PassThroughHasherBuilder> =
297            HyperLogLogPF::new(16, builder).unwrap();
298
299        assert_eq!(hll.count(), 0.0);
300
301        hll.insert(&0x0000000000010fff);
302        hll.insert(&0x0000000000020fff);
303        hll.insert(&0x0000000000030fff);
304        hll.insert(&0x0000000000040fff);
305        hll.insert(&0x0000000000050fff);
306        hll.insert(&0x0000000000050fff);
307
308        assert_eq!(hll.count().trunc() as u64, 5);
309    }
310
311    #[test]
312    fn test_merge() {
313        let builder = PassThroughHasherBuilder {};
314
315        let mut hll: HyperLogLogPF<u64, PassThroughHasherBuilder> =
316            HyperLogLogPF::new(16, builder).unwrap();
317
318        hll.insert(&0x00000000000101ff);
319        hll.insert(&0x00000000000202ff);
320        hll.insert(&0x00000000000304ff);
321        hll.insert(&0x0000000000040fff);
322        hll.insert(&0x0000000000050fff);
323        hll.insert(&0x0000000000060fff);
324
325        assert_eq!(hll.registers.get(1), 8);
326        assert_eq!(hll.registers.get(2), 7);
327        assert_eq!(hll.registers.get(3), 6);
328        assert_eq!(hll.registers.get(4), 5);
329        assert_eq!(hll.registers.get(5), 5);
330        assert_eq!(hll.registers.get(6), 5);
331
332        let builder = PassThroughHasherBuilder {};
333
334        let err: HyperLogLogPF<u64, PassThroughHasherBuilder> =
335            HyperLogLogPF::new(9, builder).unwrap();
336
337        assert_eq!(
338            hll.merge(&err),
339            Err(HyperLogLogError::IncompatiblePrecision)
340        );
341
342        let builder = PassThroughHasherBuilder {};
343
344        let mut other: HyperLogLogPF<u64, PassThroughHasherBuilder> =
345            HyperLogLogPF::new(16, builder).unwrap();
346
347        hll.insert(&0x00000000000404ff);
348        hll.insert(&0x00000000000502ff);
349        hll.insert(&0x00000000000601ff);
350
351        assert_eq!(other.merge(&hll), Ok(()));
352
353        assert_eq!(other.count().trunc() as u64, 6);
354
355        assert_eq!(hll.registers.get(1), 8);
356        assert_eq!(hll.registers.get(2), 7);
357        assert_eq!(hll.registers.get(3), 6);
358        assert_eq!(hll.registers.get(4), 6);
359        assert_eq!(hll.registers.get(5), 7);
360        assert_eq!(hll.registers.get(6), 8);
361    }
362
363    #[test]
364    fn test_serialization() {
365        let builder = PassThroughHasherBuilder {};
366
367        let mut hll: HyperLogLogPF<u64, PassThroughHasherBuilder> =
368            HyperLogLogPF::new(16, builder).unwrap();
369
370        hll.insert(&0x0000000000010fff);
371        hll.insert(&0x0000000000020fff);
372        hll.insert(&0x0000000000030fff);
373        hll.insert(&0x0000000000040fff);
374        hll.insert(&0x0000000000050fff);
375        hll.insert(&0x0000000000050fff);
376
377        assert_eq!(hll.count().trunc() as usize, 5);
378
379        let serialized = serde_json::to_string(&hll).unwrap();
380
381        let mut deserialized: HyperLogLogPF<u64, PassThroughHasherBuilder> =
382            serde_json::from_str(&serialized).unwrap();
383
384        assert_eq!(deserialized.count().trunc() as usize, 5);
385
386        deserialized.insert(&0x0000000000060fff);
387
388        assert_eq!(deserialized.count().trunc() as usize, 6);
389
390        let mut hll: HyperLogLogPF<u64, DefaultBuildHasher> =
391            HyperLogLogPF::new(16, DefaultBuildHasher {}).unwrap();
392
393        hll.insert(&0x0000000000010fff);
394        hll.insert(&0x0000000000020fff);
395        hll.insert(&0x0000000000030fff);
396        hll.insert(&0x0000000000040fff);
397        hll.insert(&0x0000000000050fff);
398        hll.insert(&0x0000000000050fff);
399
400        assert_eq!(hll.count().trunc() as usize, 5);
401
402        let serialized = serde_json::to_string(&hll).unwrap();
403
404        let mut deserialized: HyperLogLogPF<u64, PassThroughHasherBuilder> =
405            serde_json::from_str(&serialized).unwrap();
406
407        assert_eq!(deserialized.count().trunc() as usize, 5);
408
409        deserialized.insert(&0x0000000000060fff);
410
411        assert_eq!(deserialized.count().trunc() as usize, 6);
412    }
413
414    #[cfg(feature = "bench-units")]
415    mod benches {
416        extern crate test;
417
418        use super::*;
419        use rand::prelude::*;
420        use test::{black_box, Bencher};
421
422        #[bench]
423        fn bench_insert(b: &mut Bencher) {
424            let builder = PassThroughHasherBuilder {};
425
426            let mut hll: HyperLogLogPF<u64, PassThroughHasherBuilder> =
427                HyperLogLogPF::new(16, builder).unwrap();
428
429            b.iter(|| {
430                for i in 0u64..1000 {
431                    hll.insert(&(u64::max_value() - i));
432                }
433            })
434        }
435
436        #[bench]
437        fn bench_insert_with_hash(b: &mut Bencher) {
438            let mut rng = rand::thread_rng();
439
440            let workload: Vec<String> = (0..2000)
441                .map(|_| {
442                    format!("- {} - {} -", rng.gen::<u64>(), rng.gen::<u64>())
443                })
444                .collect();
445
446            b.iter(|| {
447                let mut hll: HyperLogLogPF<&String, DefaultBuildHasher> =
448                    HyperLogLogPF::new(16, DefaultBuildHasher {}).unwrap();
449
450                for val in &workload {
451                    hll.insert(&val);
452                }
453
454                let val = hll.count();
455
456                black_box(val);
457            })
458        }
459
460        #[bench]
461        fn bench_count(b: &mut Bencher) {
462            let builder = PassThroughHasherBuilder {};
463
464            let mut hll: HyperLogLogPF<u64, PassThroughHasherBuilder> =
465                HyperLogLogPF::new(16, builder).unwrap();
466
467            for i in 0u64..10000 {
468                hll.insert(&(u64::max_value() - i));
469            }
470
471            b.iter(|| {
472                let count = hll.count();
473                black_box(count);
474            })
475        }
476    }
477}