hyperloglog_rs/
hyperloglog_multiplicities.rs

1use crate::array_default::{ArrayDefault, ArrayIter};
2use crate::precisions::{Precision, WordType};
3use crate::prelude::*;
4use core::hash::Hash;
5
6/// A HyperLogLog counter with multiplicities.
7///
8/// # Implementation details
9/// This struct differs from the traditional HyperLogLog counter in that it stores the multiplicities
10/// of the registers. This allows us to speed up significantly the computation of the cardinality of
11/// the counter, as we do not need to compute the harmonic mean of the registers but we can instead
12/// use the multiplities instead, reducing by a large amount the sums we need to compute.
13///
14/// For instance, for a counter with 2^14 registers, we need to compute the harmonic mean of 2^14
15/// registers, i.e. 16384 registers. With the multiplicities, we only need to compute the sum of the
16/// multiplicities, which is much smaller, and at most equal to 52 when you use 6 bits per register.
17///
18/// That being said, when memory is an extreme concern, you may want to use the traditional HyperLogLog
19/// as this struct contains the multiplicities vector, which in the example case we considered above
20/// would be adding u16 * 52 = 104 bytes to the size of the counter.
21///
22/// Additionally, note that while one may expect to obtain better accuracy by executing less sums,
23/// we do not observe any statistically significant difference in the accuracy of the counter when
24/// using the multiplicities instead of the registers in our tests.
25///
26/// Note that this struct DOES NOT provide any other faster operation other than the estimation of the
27/// cardinality of the counter. All other operations, such as the union of two counters, are fast as
28/// they are implemented using the traditional HyperLogLog counter.
29///
30pub struct HyperLogLogWithMultiplicities<P: Precision + WordType<BITS>, const BITS: usize> {
31    pub(crate) words: P::Words,
32    pub(crate) multiplicities: P::RegisterMultiplicities,
33}
34
35impl<P: Precision + WordType<BITS>, const BITS: usize> HyperLogLogWithMultiplicities<P, BITS> {
36    fn new() -> Self {
37        let mut multiplicities = P::RegisterMultiplicities::default_array();
38
39        multiplicities[0] = P::NumberOfZeros::reverse(P::NUMBER_OF_REGISTERS);
40
41        Self {
42            words: P::Words::default_array(),
43            multiplicities,
44        }
45    }
46
47    /// Create a new HyperLogLog counter from an array of words.
48    ///
49    /// # Arguments
50    /// * `words` - An array of u64 words to use for the HyperLogLog counter.
51    ///
52    /// # Returns
53    /// A new HyperLogLog counter initialized with the given words.
54    ///
55    /// # Examples
56    ///
57    /// ```rust
58    /// use hyperloglog_rs::prelude::*;
59    ///
60    /// let words = [0_u32; 4];
61    /// let hll = HyperLogLogWithMultiplicities::<Precision4, 6>::from_words(&words);
62    /// assert_eq!(hll.len(), 16);
63    /// ```
64    pub fn from_words(words: &P::Words) -> Self {
65        let mut multiplicities = P::RegisterMultiplicities::default_array();
66
67        words.iter_elements().for_each(|word| {
68            (0..Self::NUMBER_OF_REGISTERS_IN_WORD).for_each(|i| {
69                let register = (word >> (i * BITS)) & Self::LOWER_REGISTER_MASK;
70                multiplicities[register as usize] += P::NumberOfZeros::ONE;
71            });
72        });
73
74        multiplicities[0] -= P::NumberOfZeros::reverse(Self::get_number_of_padding_registers());
75
76        Self {
77            words: *words,
78            multiplicities,
79        }
80    }
81
82    /// Create a new HyperLogLog counter from an array of registers.
83    ///
84    /// # Arguments
85    ///
86    /// * `registers` - An array of u32 registers to use for the HyperLogLog counter.
87    ///
88    /// # Returns
89    ///
90    /// A new HyperLogLog counter initialized with the given registers.
91    ///
92    /// # Examples
93    ///
94    /// ```
95    /// use hyperloglog_rs::prelude::*;
96    ///
97    /// let registers = [0_u32; 1 << 4];
98    /// let hll = HyperLogLogWithMultiplicities::<Precision4, 6>::from_registers(&registers);
99    /// assert_eq!(hll.len(), 1 << 4);
100    /// ```
101    pub fn from_registers(registers: &[u32]) -> Self {
102        debug_assert!(
103            registers.len() == P::NUMBER_OF_REGISTERS,
104            "We expect {} registers, but got {}",
105            P::NUMBER_OF_REGISTERS,
106            registers.len()
107        );
108        let mut words = P::Words::default_array();
109        let mut multiplicities = P::RegisterMultiplicities::default_array();
110        words
111            .iter_elements_mut()
112            .zip(registers.chunks(Self::NUMBER_OF_REGISTERS_IN_WORD))
113            .for_each(|(word, word_registers)| {
114                for (i, register) in word_registers.iter().copied().enumerate() {
115                    debug_assert!(
116                        register <= Self::LOWER_REGISTER_MASK,
117                        "Register value {} is too large for the given number of bits {}",
118                        register,
119                        BITS
120                    );
121                    multiplicities[register as usize] += P::NumberOfZeros::ONE;
122                    *word |= register << (i * BITS);
123                }
124            });
125
126        Self {
127            words,
128            multiplicities,
129        }
130    }
131
132    #[inline(always)]
133    /// Adds an element to the HyperLogLog counter , and returns whether the counter has changed.
134    ///
135    /// # Arguments
136    /// * `rhs` - The element to add.
137    ///
138    /// # Examples
139    ///
140    /// ```
141    /// use hyperloglog_rs::prelude::*;
142    ///
143    /// let mut hll = HyperLogLogWithMultiplicities::<Precision10, 6>::default();
144    ///
145    /// hll.insert("Hello");
146    /// hll.insert("World");
147    ///
148    /// assert!(hll.estimate_cardinality() >= 2.0);
149    /// ```
150    ///
151    /// # Performance
152    ///
153    /// The performance of this function depends on the size of the HyperLogLog counter (`N`), the number
154    /// of distinct elements in the input, and the hash function used to hash elements. For a given value of `N`,
155    /// the function has an average time complexity of O(1) and a worst-case time complexity of O(log N).
156    /// However, the actual time complexity may vary depending on the distribution of the hashed elements.
157    ///
158    /// # Errors
159    ///
160    /// This function does not return any errors.
161    pub fn insert<T: Hash>(&mut self, rhs: T) -> bool {
162        let (mut hash, index) = self.get_hash_and_index::<T>(&rhs);
163
164        // We need to add ones to the hash to make sure that the
165        // the number of zeros we obtain afterwards is never higher
166        // than the maximal value that may be represented in a register
167        // with BITS bits.
168        if BITS < 6 {
169            hash |= 1 << (64 - ((1 << BITS) - 1));
170        } else {
171            hash |= 1 << (P::EXPONENT - 1);
172        }
173
174        // Count leading zeros.
175        let number_of_zeros: u32 = 1 + hash.leading_zeros();
176
177        debug_assert!(
178            number_of_zeros < (1 << BITS),
179            concat!(
180                "The number of leading zeros {} must be less than the number of bits {}. ",
181                "You have obtained this values starting from the hash {:064b} and the precision {}."
182            ),
183            number_of_zeros,
184            1 << BITS,
185            hash,
186            P::EXPONENT
187        );
188
189        // Calculate the position of the register in the internal buffer array.
190        let word_position = index / Self::NUMBER_OF_REGISTERS_IN_WORD;
191        let register_position_in_u32 = index - word_position * Self::NUMBER_OF_REGISTERS_IN_WORD;
192
193        debug_assert!(
194            word_position < self.words.len(),
195            concat!(
196                "The word_position {} must be less than the number of words {}. ",
197                "You have obtained this values starting from the index {} and the number of registers in word {}. ",
198                "We currently have {} registers. Currently using precision {} and number of bits {}."
199            ),
200            word_position,
201            self.words.len(),
202            index,
203            Self::NUMBER_OF_REGISTERS_IN_WORD,
204            P::NUMBER_OF_REGISTERS,
205            P::EXPONENT,
206            BITS
207        );
208
209        // Extract the current value of the register at `index`.
210        let register_value: u32 = (self.words[word_position] >> (register_position_in_u32 * BITS))
211            & Self::LOWER_REGISTER_MASK;
212
213        // Otherwise, update the register using a bit mask.
214        if number_of_zeros > register_value {
215            debug_assert!(self.multiplicities[register_value as usize] > P::NumberOfZeros::ZERO,);
216
217            self.multiplicities[register_value as usize] -= P::NumberOfZeros::ONE;
218            self.multiplicities[number_of_zeros as usize] += P::NumberOfZeros::ONE;
219
220            self.words[word_position] &=
221                !(Self::LOWER_REGISTER_MASK << (register_position_in_u32 * BITS));
222            self.words[word_position] |= number_of_zeros << (register_position_in_u32 * BITS);
223
224            // We check that the word we have edited maintains that the padding bits are all zeros
225            // and have not been manipulated in any way. If these bits were manipulated, it would mean
226            // that we have a bug in the code.
227            debug_assert!(
228                self.words[word_position] & Self::PADDING_BITS_MASK == 0,
229                concat!(
230                    "The padding bits of the word {} must be all zeros. ",
231                    "We have obtained {} instead."
232                ),
233                self.words[word_position],
234                self.words[word_position] & Self::PADDING_BITS_MASK
235            );
236
237            // We also check that if the word we have edites is the last word, then the padding bits
238            // of the word must be all zeros.
239            debug_assert!(
240                index != P::NUMBER_OF_REGISTERS - 1
241                    || self.words[word_position] & Self::LAST_WORD_PADDING_BITS_MASK == 0,
242                concat!(
243                    "The padding bits of the last word {} must be all zeros. ",
244                    "We have obtained {} instead. The last word padding bits mask is, ",
245                    "when represented in binary, {:#034b}.\n ",
246                    "The word in binary is {:#034b}. ",
247                    "The current case is using precision {} and bits {}. As such, ",
248                    "we expect to have {} padding registers in the last word."
249                ),
250                self.words[word_position],
251                self.words[word_position] & Self::LAST_WORD_PADDING_BITS_MASK,
252                Self::LAST_WORD_PADDING_BITS_MASK,
253                self.words[word_position],
254                P::EXPONENT,
255                BITS,
256                Self::get_number_of_padding_registers()
257            );
258
259            true
260        } else {
261            false
262        }
263    }
264}
265
266impl<P: Precision + WordType<BITS>, const BITS: usize> Default
267    for HyperLogLogWithMultiplicities<P, BITS>
268{
269    fn default() -> Self {
270        Self::new()
271    }
272}
273
274impl<P: Precision + WordType<BITS>, const BITS: usize> From<HyperLogLogWithMultiplicities<P, BITS>>
275    for HyperLogLog<P, BITS>
276{
277    fn from(hll: HyperLogLogWithMultiplicities<P, BITS>) -> Self {
278        Self::from_words(hll.get_words())
279    }
280}
281
282impl<P: Precision + WordType<BITS>, const BITS: usize> From<HyperLogLog<P, BITS>>
283    for HyperLogLogWithMultiplicities<P, BITS>
284{
285    fn from(hll: HyperLogLog<P, BITS>) -> Self {
286        Self::from_words(hll.get_words())
287    }
288}
289
290impl<P: Precision + WordType<BITS>, const BITS: usize> HyperLogLogTrait<P, BITS>
291    for HyperLogLogWithMultiplicities<P, BITS>
292{
293    #[inline(always)]
294    /// Returns the number of registers in the counter.
295    ///
296    /// # Implementation details
297    /// This function is overriding the estimate_cardinality function of the HyperLogLogTrait trait
298    /// as we can compute the cardinality of the counter using the multiplicities instead of the
299    /// registers. This is much faster as we do not need to compute the harmonic mean of the registers.
300    fn estimate_cardinality(&self) -> f32 {
301        Self::estimate_cardinality_from_multiplicities(&self.multiplicities)
302    }
303
304    /// Returns a reference to the words vector.
305    fn get_words(&self) -> &P::Words {
306        &self.words
307    }
308
309    #[inline(always)]
310    /// Returns the number of registers with zero values. This value is used for computing a small
311    /// correction when estimating the cardinality of a small set.
312    ///
313    /// # Examples
314    ///
315    /// ```
316    /// # use hyperloglog_rs::prelude::*;
317    ///
318    /// // Create a new HyperLogLog counter with precision 14 and 5 bits per register.
319    /// let mut hll = HyperLogLogWithMultiplicities::<Precision14, 5>::default();
320    ///
321    /// // Add some elements to the counter.
322    /// hll.insert(&1);
323    /// hll.insert(&2);
324    /// hll.insert(&3);
325    ///
326    /// // Get the number of zero registers.
327    /// let number_of_zero_registers = hll.get_number_of_zero_registers();
328    ///
329    /// assert_eq!(number_of_zero_registers, 16381);
330    /// ```
331    fn get_number_of_zero_registers(&self) -> usize {
332        self.multiplicities[0].convert()
333    }
334}
335
336impl<P: Precision + WordType<BITS>, const BITS: usize, A: Hash> core::iter::FromIterator<A>
337    for HyperLogLogWithMultiplicities<P, BITS>
338{
339    #[inline(always)]
340    /// Creates a new HyperLogLogWithMultiplicities counter and adds all elements from an iterator to it.
341    ///
342    /// # Examples
343    ///
344    /// ```
345    /// use hyperloglog_rs::prelude::*;
346    ///
347    /// let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9];
348    /// let hll: HyperLogLogWithMultiplicities<Precision12, 5> = data.iter().collect();
349    /// assert!(
350    ///     hll.estimate_cardinality() > 0.9 * data.len() as f32,
351    ///     concat!(
352    ///         "The estimate is too low, expected ",
353    ///         "at least {}, got {}",
354    ///     ),
355    ///     0.9 * data.len() as f32,
356    ///     hll.estimate_cardinality()
357    /// );
358    /// assert!(
359    ///     hll.estimate_cardinality() < 1.1 * data.len() as f32,
360    ///     concat!(
361    ///     "The estimate is too high, expected ",
362    ///     "at most {}, got {}",
363    ///    ),
364    ///     1.1 * data.len() as f32,
365    ///     hll.estimate_cardinality()
366    /// );
367    /// ```
368    fn from_iter<T: IntoIterator<Item = A>>(iter: T) -> Self {
369        let mut hll = Self::default();
370        for item in iter {
371            hll.insert(item);
372        }
373        hll
374    }
375}