hyperloglog_rs/
hyperloglog.rs

1use crate::array_default::{ArrayDefault, ArrayIter};
2use crate::precisions::{Precision, WordType};
3use crate::prelude::HyperLogLogTrait;
4use crate::primitive::Primitive;
5use core::hash::Hash;
6
7#[derive(Clone, Debug, Copy)]
8/// A probabilistic algorithm for estimating the number of distinct elements in a set.
9///
10/// HyperLogLog is a probabilistic algorithm designed to estimate the number
11/// of distinct elements in a set. It does so by taking advantage of the fact
12/// that the representation of an element can be transformed into a uniform
13/// distribution in a space with a fixed range.
14///
15/// HyperLogLog works by maintaining a fixed-sized register array,
16/// where each register holds a counter. The algorithm splits the input set into subsets,
17/// applies a hash function to each element in the subset, and then updates
18/// the corresponding counter in the register array.
19///
20/// HyperLogLog uses a trick called "probabilistic counting" to estimate
21/// the number of distinct elements in the set. Each register counter is converted
22/// to a binary string, and the algorithm counts the number of leading zeros in
23/// each binary string. The maximum number of leading zeros over all counters
24/// is used to estimate the number of distinct elements in the set.
25///
26/// HyperLogLog has a tunable parameter called precision that determines
27/// the accuracy of the algorithm. Higher precision leads to better accuracy,
28/// but requires more memory. The error rate of the algorithm is guaranteed
29/// to be within a certain bound, depending on the chosen precision.
30///
31/// # Examples
32///
33/// ```
34/// use hyperloglog_rs::prelude::*;
35///
36/// let mut hll = HyperLogLog::<Precision12, 6>::default();
37/// hll.insert(&"apple");
38/// hll.insert(&"banana");
39/// hll.insert(&"cherry");
40///
41/// let estimated_cardinality = hll.estimate_cardinality();
42/// assert!(estimated_cardinality >= 3.0_f32 * 0.9 &&
43///         estimated_cardinality <= 3.0_f32 * 1.1);
44/// ```
45///
46/// # Citations
47///
48/// This implementation is based on the following papers:
49///
50/// * Flajolet, Philippe, et al. "HyperLogLog: the analysis of a near-optimal cardinality estimation algorithm." DMTCS Proceedings 1 (2007): 127-146.
51/// * Heule, Stefan, Marc Nunkesser, and Alexander Hall. "HyperLogLog in practice: algorithmic engineering of a state of the art cardinality estimation algorithm." Proceedings of the 16th International Conference on Extending Database Technology. 2013.
52///
53pub struct HyperLogLog<P: Precision + WordType<BITS>, const BITS: usize> {
54    pub(crate) words: P::Words,
55    pub(crate) number_of_zero_registers: P::NumberOfZeros,
56}
57
58impl<P: Precision + WordType<BITS>, const BITS: usize> HyperLogLog<P, BITS> {
59    /// Create a new HyperLogLog counter.
60    fn new() -> Self {
61        Self {
62            words: P::Words::default_array(),
63            number_of_zero_registers: P::NumberOfZeros::reverse(P::NUMBER_OF_REGISTERS),
64        }
65    }
66
67    /// Create a new HyperLogLog counter from an array of words.
68    ///
69    /// # Arguments
70    /// * `words` - An array of u64 words to use for the HyperLogLog counter.
71    ///
72    /// # Returns
73    /// A new HyperLogLog counter initialized with the given words.
74    ///
75    /// # Examples
76    ///
77    /// ```rust
78    /// use hyperloglog_rs::prelude::*;
79    ///                                                                                                                                                                                                                                                                    
80    /// let words = [0_u32; 4];
81    /// let hll = HyperLogLog::<Precision4, 6>::from_words(&words);
82    /// assert_eq!(hll.len(), 16);
83    /// ```
84    pub fn from_words(words: &P::Words) -> Self {
85        let number_of_zero_registers =
86            P::NumberOfZeros::reverse(words.iter_elements().fold(
87                0,
88                |number_of_zero_registers, word| {
89                    // We check that in all words the PADDING_BITS_MASK
90                    // is all zeros.
91                    debug_assert!(
92                        word & Self::PADDING_BITS_MASK == 0,
93                        concat!(
94                            "The padding bits of the word {} must be all zeros. ",
95                            "We have obtained {} instead."
96                        ),
97                        word,
98                        word & Self::PADDING_BITS_MASK
99                    );
100
101                    (0..Self::NUMBER_OF_REGISTERS_IN_WORD).fold(
102                        number_of_zero_registers,
103                        |number_of_zero_registers, i| {
104                            let register = (word >> (i * BITS)) & Self::LOWER_REGISTER_MASK;
105                            number_of_zero_registers + (register == 0) as usize
106                        },
107                    )
108                },
109            )) - P::NumberOfZeros::reverse(Self::get_number_of_padding_registers());
110
111        // We check that the values in the last word are masked
112        // according to the LAST_WORD_PADDING_BITS_MASK.
113        debug_assert!(
114            words.last().unwrap() & Self::LAST_WORD_PADDING_BITS_MASK == 0,
115            concat!(
116                "The padding bits of the last word {} must be all zeros. ",
117                "We have obtained {} instead. The last word padding bits mask is, ",
118                "when represented in binary, {:#034b}."
119            ),
120            words.last().unwrap(),
121            words.last().unwrap() & Self::LAST_WORD_PADDING_BITS_MASK,
122            Self::LAST_WORD_PADDING_BITS_MASK
123        );
124
125        Self {
126            words: *words,
127            number_of_zero_registers,
128        }
129    }
130
131    /// Create a new HyperLogLog counter from an array of registers.
132    ///
133    /// # Arguments
134    ///
135    /// * `registers` - An array of u32 registers to use for the HyperLogLog counter.
136    ///
137    /// # Returns
138    ///
139    /// A new HyperLogLog counter initialized with the given registers.
140    ///
141    /// # Examples
142    ///
143    /// ```
144    /// use hyperloglog_rs::prelude::*;
145    ///
146    /// let registers = [0_u32; 1 << 4];
147    /// let hll = HyperLogLog::<Precision4, 6>::from_registers(&registers);
148    /// assert_eq!(hll.len(), 1 << 4);
149    /// ```
150    pub fn from_registers(registers: &[u32]) -> Self {
151        debug_assert!(
152            registers.len() == P::NUMBER_OF_REGISTERS,
153            "We expect {} registers, but got {}",
154            P::NUMBER_OF_REGISTERS,
155            registers.len()
156        );
157        let mut words = P::Words::default_array();
158        let number_of_zero_registers = P::NumberOfZeros::reverse(
159            words
160                .iter_elements_mut()
161                .zip(registers.chunks(Self::NUMBER_OF_REGISTERS_IN_WORD))
162                .fold(0, |number_of_zero_registers, (word, word_registers)| {
163                    word_registers.iter().copied().enumerate().fold(
164                        number_of_zero_registers,
165                        |number_of_zero_registers, (i, register)| {
166                            debug_assert!(
167                                register <= Self::LOWER_REGISTER_MASK,
168                                "Register value {} is too large for the given number of bits {}",
169                                register,
170                                BITS
171                            );
172                            *word |= register << (i * BITS);
173                            number_of_zero_registers + (register == 0) as usize
174                        },
175                    )
176                }),
177        );
178
179        Self {
180            words,
181            number_of_zero_registers,
182        }
183    }
184
185    #[inline(always)]
186    /// Adds an element to the HyperLogLog counter, and returns whether the counter has changed.
187    ///
188    /// # Arguments
189    /// * `rhs` - The element to add.
190    ///
191    /// # Examples
192    ///
193    /// ```
194    /// use hyperloglog_rs::prelude::*;
195    ///
196    /// let mut hll = HyperLogLog::<Precision10, 6>::default();
197    ///
198    /// hll.insert("Hello");
199    /// hll.insert("World");
200    ///
201    /// assert!(hll.estimate_cardinality() >= 2.0);
202    /// ```
203    ///
204    /// # Performance
205    ///
206    /// The performance of this function depends on the size of the HyperLogLog counter (`N`), the number
207    /// of distinct elements in the input, and the hash function used to hash elements. For a given value of `N`,
208    /// the function has an average time complexity of O(1) and a worst-case time complexity of O(log N).
209    /// However, the actual time complexity may vary depending on the distribution of the hashed elements.
210    ///
211    /// # Errors
212    ///
213    /// This function does not return any errors.
214    pub fn insert<T: Hash>(&mut self, rhs: T) -> bool {
215        let (mut hash, index) = self.get_hash_and_index::<T>(&rhs);
216
217        // We need to add ones to the hash to make sure that the
218        // the number of zeros we obtain afterwards is never higher
219        // than the maximal value that may be represented in a register
220        // with BITS bits.
221        if BITS < 6 {
222            // In the case of BITS = 5, we have registers of
223            // 5 bits each. The maximal value that can be represented
224            // in a register is 31. As such, we need to add 1 << (65 - 32)
225            // to the hash. Similarly, in the case of BITS = 4, we have
226            // registers of 4 bits each. The maximal value that can be
227            // represented in a register is 15.
228            hash |= 1 << (65 - (1 << BITS))
229        } // else {
230          //     // Otherwise, with registers from 6 bits upwards we can
231          //     // represent a value of 64 or larger. Since we are using
232          //     // an hash function of 64 bits, the maximal number of
233          //     //
234          //     // 1 << (P::EXPONENT - 1)
235          // };
236
237        // Count leading zeros.
238        let number_of_zeros: u32 = 1 + hash.leading_zeros();
239
240        // We add a debug assertion to make sure that the number of zeros
241        // we have obtained is not larger than the maximal value that can
242        // be represented in a register with BITS bits.
243        debug_assert!(
244            number_of_zeros < 1 << BITS,
245            concat!("The number of zeros {} must be less than or equal to {}. ",),
246            number_of_zeros,
247            1 << BITS
248        );
249
250        // Calculate the position of the register in the internal buffer array.
251        let word_position = index / Self::NUMBER_OF_REGISTERS_IN_WORD;
252        let register_position = index - word_position * Self::NUMBER_OF_REGISTERS_IN_WORD;
253
254        debug_assert!(
255            word_position < self.words.len(),
256            concat!(
257                "The word_position {} must be less than the number of words {}. ",
258                "You have obtained this values starting from the index {} and the number of registers in word {}. ",
259                "We currently have {} registers. Currently using precision {} and number of bits {}."
260            ),
261            word_position,
262            self.words.len(),
263            index,
264            Self::NUMBER_OF_REGISTERS_IN_WORD,
265            P::NUMBER_OF_REGISTERS,
266            P::EXPONENT,
267            BITS
268        );
269
270        // Extract the current value of the register at `index`.
271        let register_value: u32 =
272            (self.words[word_position] >> (register_position * BITS)) & Self::LOWER_REGISTER_MASK;
273
274        // Otherwise, update the register using a bit mask.
275        if number_of_zeros > register_value {
276            self.words[word_position] &= !(Self::LOWER_REGISTER_MASK << (register_position * BITS));
277            self.words[word_position] |= number_of_zeros << (register_position * BITS);
278            self.number_of_zero_registers -=
279                P::NumberOfZeros::reverse((register_value == 0) as usize);
280
281            // We check that the value we have written to the register is correct.
282            debug_assert!(
283                self.words[word_position] >> (register_position * BITS) & Self::LOWER_REGISTER_MASK
284                    == number_of_zeros,
285                concat!(
286                    "The value of the register at position {} must be {}. ",
287                    "We have obtained {} instead. ",
288                    "The current value of the word is {}."
289                ),
290                index,
291                number_of_zeros,
292                self.words[word_position] >> (register_position * BITS) & Self::LOWER_REGISTER_MASK,
293                self.words[word_position]
294            );
295
296            // We check that the word we have edited maintains that the padding bits are all zeros
297            // and have not been manipulated in any way. If these bits were manipulated, it would mean
298            // that we have a bug in the code.
299            debug_assert!(
300                self.words[word_position] & Self::PADDING_BITS_MASK == 0,
301                concat!(
302                    "The padding bits of the word {} must be all zeros. ",
303                    "We have obtained {} instead."
304                ),
305                self.words[word_position],
306                self.words[word_position] & Self::PADDING_BITS_MASK
307            );
308
309            // We also check that if the word we have edites is the last word, then the padding bits
310            // of the word must be all zeros.
311            debug_assert!(
312                index != P::NUMBER_OF_REGISTERS - 1
313                    || self.words[word_position] & Self::LAST_WORD_PADDING_BITS_MASK == 0,
314                concat!(
315                    "The padding bits of the last word {} must be all zeros. ",
316                    "We have obtained {} instead. The last word padding bits mask is, ",
317                    "when represented in binary, {:#034b}.\n ",
318                    "The word in binary is {:#034b}. ",
319                    "The current case is using precision {} and bits {}. As such, ",
320                    "we expect to have {} padding registers in the last word."
321                ),
322                self.words[word_position],
323                self.words[word_position] & Self::LAST_WORD_PADDING_BITS_MASK,
324                Self::LAST_WORD_PADDING_BITS_MASK,
325                self.words[word_position],
326                P::EXPONENT,
327                BITS,
328                Self::get_number_of_padding_registers()
329            );
330
331            true
332        } else {
333            false
334        }
335    }
336}
337
338impl<P: Precision + WordType<BITS>, const BITS: usize> Eq for HyperLogLog<P, BITS> {
339    fn assert_receiver_is_total_eq(&self) {
340        // This is a no-op because we know that `Self` is `Eq`.
341    }
342}
343
344/// Implements PartialEq for HyperLogLog.
345///
346/// # Implementative details
347/// Two HyperLogLog counters are considered equal if they have the same words.
348///
349/// # Examples
350///
351/// ```
352/// # use hyperloglog_rs::prelude::*;
353///
354/// let mut hll1 = HyperLogLog::<Precision14, 5>::default();
355/// hll1.insert(&2);
356///
357/// let mut hll2 = HyperLogLog::<Precision14, 5>::default();
358/// hll2.insert(&2);
359/// hll2.insert(&3);
360///
361/// assert_ne!(hll1, hll2);
362///
363/// hll1 |= hll2;
364///
365/// assert_eq!(hll1, hll2);
366/// ```
367impl<P: Precision + WordType<BITS>, const BITS: usize> PartialEq for HyperLogLog<P, BITS> {
368    /// Returns whether the two HyperLogLog counters are equal.
369    fn eq(&self, other: &Self) -> bool {
370        self.words == other.words
371    }
372}
373
374/// Implements the Default trait for HyperLogLog.
375///
376/// HyperLogLog is a probabilistic cardinality estimator that uses a fixed
377/// amount of memory to estimate the number of distinct elements in a set.
378///
379/// # Examples
380///
381/// ```rust
382/// # use hyperloglog_rs::prelude::*;
383///
384/// let hll: HyperLogLog<Precision10, 6> = Default::default();
385/// assert_eq!(hll.len(), 1024);
386/// ```
387impl<P: Precision + WordType<BITS>, const BITS: usize> Default for HyperLogLog<P, BITS> {
388    /// Returns a new HyperLogLog instance with default configuration settings.
389    fn default() -> Self {
390        Self::new()
391    }
392}
393
394impl<P: Precision + WordType<BITS>, const BITS: usize, T: Hash> From<T> for HyperLogLog<P, BITS> {
395    /// Create a new HyperLogLog counter from a value.
396    ///
397    /// This method creates a new empty HyperLogLog counter and inserts the hash
398    /// of the given value into it. The value can be any type that implements
399    /// the `Hash` trait.
400    ///
401    /// # Examples
402    ///
403    /// ```
404    /// # use hyperloglog_rs::prelude::*;
405    ///
406    /// let hll = HyperLogLog::<Precision14, 5>::from("test");
407    ///
408    /// assert!(hll.estimate_cardinality() >=  1.0_f32);
409    /// assert!(!hll.is_empty());
410    /// assert!(hll.may_contain(&"test"));
411    /// ```
412    fn from(value: T) -> Self {
413        let mut hll = Self::new();
414        hll.insert(value);
415        hll
416    }
417}
418
419impl<P: Precision + WordType<BITS>, const BITS: usize> HyperLogLogTrait<P, BITS>
420    for HyperLogLog<P, BITS>
421{
422    #[inline(always)]
423    /// Returns the number of registers with zero values. This value is used for computing a small
424    /// correction when estimating the cardinality of a small set.
425    ///
426    /// # Examples
427    ///
428    /// ```
429    /// # use hyperloglog_rs::prelude::*;
430    ///
431    /// // Create a new HyperLogLog counter with precision 14 and 5 bits per register.
432    /// let mut hll = HyperLogLog::<Precision14, 5>::default();
433    ///
434    /// // Add some elements to the counter.
435    /// hll.insert(&1);
436    /// hll.insert(&2);
437    /// hll.insert(&3);
438    ///
439    /// // Get the number of zero registers.
440    /// let number_of_zero_registers = hll.get_number_of_zero_registers();
441    ///
442    /// assert_eq!(number_of_zero_registers, 16381);
443    /// ```
444    fn get_number_of_zero_registers(&self) -> usize {
445        self.number_of_zero_registers.convert()
446    }
447
448    #[inline(always)]
449    /// Returns the array of words of the HyperLogLog counter.
450    fn get_words(&self) -> &P::Words {
451        &self.words
452    }
453}
454
455impl<P: Precision + WordType<BITS>, const BITS: usize, A: Hash> core::iter::FromIterator<A>
456    for HyperLogLog<P, BITS>
457{
458    #[inline(always)]
459    /// Creates a new HyperLogLog counter and adds all elements from an iterator to it.
460    ///
461    /// # Examples
462    ///
463    /// ```
464    /// use hyperloglog_rs::prelude::*;
465    ///
466    /// let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9];
467    /// let hll: HyperLogLog<Precision12, 5> = data.iter().collect();
468    /// assert!(
469    ///     hll.estimate_cardinality() > 0.9 * data.len() as f32,
470    ///     concat!(
471    ///         "The estimate is too low, expected ",
472    ///         "at least {}, got {}",
473    ///     ),
474    ///     0.9 * data.len() as f32,
475    ///     hll.estimate_cardinality()
476    /// );
477    /// assert!(
478    ///     hll.estimate_cardinality() < 1.1 * data.len() as f32,
479    ///     concat!(
480    ///     "The estimate is too high, expected ",
481    ///     "at most {}, got {}",
482    ///    ),
483    ///     1.1 * data.len() as f32,
484    ///     hll.estimate_cardinality()
485    /// );
486    /// ```
487    fn from_iter<T: IntoIterator<Item = A>>(iter: T) -> Self {
488        let mut hll = Self::new();
489        for item in iter {
490            hll.insert(item);
491        }
492        hll
493    }
494}