hyperloglog_rs/hyperloglog_multiplicities.rs
1use crate::array_default::{ArrayDefault, ArrayIter};
2use crate::precisions::{Precision, WordType};
3use crate::prelude::*;
4use core::hash::Hash;
5
6/// A HyperLogLog counter with multiplicities.
7///
8/// # Implementation details
9/// This struct differs from the traditional HyperLogLog counter in that it stores the multiplicities
10/// of the registers. This allows us to speed up significantly the computation of the cardinality of
11/// the counter, as we do not need to compute the harmonic mean of the registers but we can instead
12/// use the multiplities instead, reducing by a large amount the sums we need to compute.
13///
14/// For instance, for a counter with 2^14 registers, we need to compute the harmonic mean of 2^14
15/// registers, i.e. 16384 registers. With the multiplicities, we only need to compute the sum of the
16/// multiplicities, which is much smaller, and at most equal to 52 when you use 6 bits per register.
17///
18/// That being said, when memory is an extreme concern, you may want to use the traditional HyperLogLog
19/// as this struct contains the multiplicities vector, which in the example case we considered above
20/// would be adding u16 * 52 = 104 bytes to the size of the counter.
21///
22/// Additionally, note that while one may expect to obtain better accuracy by executing less sums,
23/// we do not observe any statistically significant difference in the accuracy of the counter when
24/// using the multiplicities instead of the registers in our tests.
25///
26/// Note that this struct DOES NOT provide any other faster operation other than the estimation of the
27/// cardinality of the counter. All other operations, such as the union of two counters, are fast as
28/// they are implemented using the traditional HyperLogLog counter.
29///
30pub struct HyperLogLogWithMultiplicities<P: Precision + WordType<BITS>, const BITS: usize> {
31 pub(crate) words: P::Words,
32 pub(crate) multiplicities: P::RegisterMultiplicities,
33}
34
35impl<P: Precision + WordType<BITS>, const BITS: usize> HyperLogLogWithMultiplicities<P, BITS> {
36 fn new() -> Self {
37 let mut multiplicities = P::RegisterMultiplicities::default_array();
38
39 multiplicities[0] = P::NumberOfZeros::reverse(P::NUMBER_OF_REGISTERS);
40
41 Self {
42 words: P::Words::default_array(),
43 multiplicities,
44 }
45 }
46
47 /// Create a new HyperLogLog counter from an array of words.
48 ///
49 /// # Arguments
50 /// * `words` - An array of u64 words to use for the HyperLogLog counter.
51 ///
52 /// # Returns
53 /// A new HyperLogLog counter initialized with the given words.
54 ///
55 /// # Examples
56 ///
57 /// ```rust
58 /// use hyperloglog_rs::prelude::*;
59 ///
60 /// let words = [0_u32; 4];
61 /// let hll = HyperLogLogWithMultiplicities::<Precision4, 6>::from_words(&words);
62 /// assert_eq!(hll.len(), 16);
63 /// ```
64 pub fn from_words(words: &P::Words) -> Self {
65 let mut multiplicities = P::RegisterMultiplicities::default_array();
66
67 words.iter_elements().for_each(|word| {
68 (0..Self::NUMBER_OF_REGISTERS_IN_WORD).for_each(|i| {
69 let register = (word >> (i * BITS)) & Self::LOWER_REGISTER_MASK;
70 multiplicities[register as usize] += P::NumberOfZeros::ONE;
71 });
72 });
73
74 multiplicities[0] -= P::NumberOfZeros::reverse(Self::get_number_of_padding_registers());
75
76 Self {
77 words: *words,
78 multiplicities,
79 }
80 }
81
82 /// Create a new HyperLogLog counter from an array of registers.
83 ///
84 /// # Arguments
85 ///
86 /// * `registers` - An array of u32 registers to use for the HyperLogLog counter.
87 ///
88 /// # Returns
89 ///
90 /// A new HyperLogLog counter initialized with the given registers.
91 ///
92 /// # Examples
93 ///
94 /// ```
95 /// use hyperloglog_rs::prelude::*;
96 ///
97 /// let registers = [0_u32; 1 << 4];
98 /// let hll = HyperLogLogWithMultiplicities::<Precision4, 6>::from_registers(®isters);
99 /// assert_eq!(hll.len(), 1 << 4);
100 /// ```
101 pub fn from_registers(registers: &[u32]) -> Self {
102 debug_assert!(
103 registers.len() == P::NUMBER_OF_REGISTERS,
104 "We expect {} registers, but got {}",
105 P::NUMBER_OF_REGISTERS,
106 registers.len()
107 );
108 let mut words = P::Words::default_array();
109 let mut multiplicities = P::RegisterMultiplicities::default_array();
110 words
111 .iter_elements_mut()
112 .zip(registers.chunks(Self::NUMBER_OF_REGISTERS_IN_WORD))
113 .for_each(|(word, word_registers)| {
114 for (i, register) in word_registers.iter().copied().enumerate() {
115 debug_assert!(
116 register <= Self::LOWER_REGISTER_MASK,
117 "Register value {} is too large for the given number of bits {}",
118 register,
119 BITS
120 );
121 multiplicities[register as usize] += P::NumberOfZeros::ONE;
122 *word |= register << (i * BITS);
123 }
124 });
125
126 Self {
127 words,
128 multiplicities,
129 }
130 }
131
132 #[inline(always)]
133 /// Adds an element to the HyperLogLog counter , and returns whether the counter has changed.
134 ///
135 /// # Arguments
136 /// * `rhs` - The element to add.
137 ///
138 /// # Examples
139 ///
140 /// ```
141 /// use hyperloglog_rs::prelude::*;
142 ///
143 /// let mut hll = HyperLogLogWithMultiplicities::<Precision10, 6>::default();
144 ///
145 /// hll.insert("Hello");
146 /// hll.insert("World");
147 ///
148 /// assert!(hll.estimate_cardinality() >= 2.0);
149 /// ```
150 ///
151 /// # Performance
152 ///
153 /// The performance of this function depends on the size of the HyperLogLog counter (`N`), the number
154 /// of distinct elements in the input, and the hash function used to hash elements. For a given value of `N`,
155 /// the function has an average time complexity of O(1) and a worst-case time complexity of O(log N).
156 /// However, the actual time complexity may vary depending on the distribution of the hashed elements.
157 ///
158 /// # Errors
159 ///
160 /// This function does not return any errors.
161 pub fn insert<T: Hash>(&mut self, rhs: T) -> bool {
162 let (mut hash, index) = self.get_hash_and_index::<T>(&rhs);
163
164 // We need to add ones to the hash to make sure that the
165 // the number of zeros we obtain afterwards is never higher
166 // than the maximal value that may be represented in a register
167 // with BITS bits.
168 if BITS < 6 {
169 hash |= 1 << (64 - ((1 << BITS) - 1));
170 } else {
171 hash |= 1 << (P::EXPONENT - 1);
172 }
173
174 // Count leading zeros.
175 let number_of_zeros: u32 = 1 + hash.leading_zeros();
176
177 debug_assert!(
178 number_of_zeros < (1 << BITS),
179 concat!(
180 "The number of leading zeros {} must be less than the number of bits {}. ",
181 "You have obtained this values starting from the hash {:064b} and the precision {}."
182 ),
183 number_of_zeros,
184 1 << BITS,
185 hash,
186 P::EXPONENT
187 );
188
189 // Calculate the position of the register in the internal buffer array.
190 let word_position = index / Self::NUMBER_OF_REGISTERS_IN_WORD;
191 let register_position_in_u32 = index - word_position * Self::NUMBER_OF_REGISTERS_IN_WORD;
192
193 debug_assert!(
194 word_position < self.words.len(),
195 concat!(
196 "The word_position {} must be less than the number of words {}. ",
197 "You have obtained this values starting from the index {} and the number of registers in word {}. ",
198 "We currently have {} registers. Currently using precision {} and number of bits {}."
199 ),
200 word_position,
201 self.words.len(),
202 index,
203 Self::NUMBER_OF_REGISTERS_IN_WORD,
204 P::NUMBER_OF_REGISTERS,
205 P::EXPONENT,
206 BITS
207 );
208
209 // Extract the current value of the register at `index`.
210 let register_value: u32 = (self.words[word_position] >> (register_position_in_u32 * BITS))
211 & Self::LOWER_REGISTER_MASK;
212
213 // Otherwise, update the register using a bit mask.
214 if number_of_zeros > register_value {
215 debug_assert!(self.multiplicities[register_value as usize] > P::NumberOfZeros::ZERO,);
216
217 self.multiplicities[register_value as usize] -= P::NumberOfZeros::ONE;
218 self.multiplicities[number_of_zeros as usize] += P::NumberOfZeros::ONE;
219
220 self.words[word_position] &=
221 !(Self::LOWER_REGISTER_MASK << (register_position_in_u32 * BITS));
222 self.words[word_position] |= number_of_zeros << (register_position_in_u32 * BITS);
223
224 // We check that the word we have edited maintains that the padding bits are all zeros
225 // and have not been manipulated in any way. If these bits were manipulated, it would mean
226 // that we have a bug in the code.
227 debug_assert!(
228 self.words[word_position] & Self::PADDING_BITS_MASK == 0,
229 concat!(
230 "The padding bits of the word {} must be all zeros. ",
231 "We have obtained {} instead."
232 ),
233 self.words[word_position],
234 self.words[word_position] & Self::PADDING_BITS_MASK
235 );
236
237 // We also check that if the word we have edites is the last word, then the padding bits
238 // of the word must be all zeros.
239 debug_assert!(
240 index != P::NUMBER_OF_REGISTERS - 1
241 || self.words[word_position] & Self::LAST_WORD_PADDING_BITS_MASK == 0,
242 concat!(
243 "The padding bits of the last word {} must be all zeros. ",
244 "We have obtained {} instead. The last word padding bits mask is, ",
245 "when represented in binary, {:#034b}.\n ",
246 "The word in binary is {:#034b}. ",
247 "The current case is using precision {} and bits {}. As such, ",
248 "we expect to have {} padding registers in the last word."
249 ),
250 self.words[word_position],
251 self.words[word_position] & Self::LAST_WORD_PADDING_BITS_MASK,
252 Self::LAST_WORD_PADDING_BITS_MASK,
253 self.words[word_position],
254 P::EXPONENT,
255 BITS,
256 Self::get_number_of_padding_registers()
257 );
258
259 true
260 } else {
261 false
262 }
263 }
264}
265
266impl<P: Precision + WordType<BITS>, const BITS: usize> Default
267 for HyperLogLogWithMultiplicities<P, BITS>
268{
269 fn default() -> Self {
270 Self::new()
271 }
272}
273
274impl<P: Precision + WordType<BITS>, const BITS: usize> From<HyperLogLogWithMultiplicities<P, BITS>>
275 for HyperLogLog<P, BITS>
276{
277 fn from(hll: HyperLogLogWithMultiplicities<P, BITS>) -> Self {
278 Self::from_words(hll.get_words())
279 }
280}
281
282impl<P: Precision + WordType<BITS>, const BITS: usize> From<HyperLogLog<P, BITS>>
283 for HyperLogLogWithMultiplicities<P, BITS>
284{
285 fn from(hll: HyperLogLog<P, BITS>) -> Self {
286 Self::from_words(hll.get_words())
287 }
288}
289
290impl<P: Precision + WordType<BITS>, const BITS: usize> HyperLogLogTrait<P, BITS>
291 for HyperLogLogWithMultiplicities<P, BITS>
292{
293 #[inline(always)]
294 /// Returns the number of registers in the counter.
295 ///
296 /// # Implementation details
297 /// This function is overriding the estimate_cardinality function of the HyperLogLogTrait trait
298 /// as we can compute the cardinality of the counter using the multiplicities instead of the
299 /// registers. This is much faster as we do not need to compute the harmonic mean of the registers.
300 fn estimate_cardinality(&self) -> f32 {
301 Self::estimate_cardinality_from_multiplicities(&self.multiplicities)
302 }
303
304 /// Returns a reference to the words vector.
305 fn get_words(&self) -> &P::Words {
306 &self.words
307 }
308
309 #[inline(always)]
310 /// Returns the number of registers with zero values. This value is used for computing a small
311 /// correction when estimating the cardinality of a small set.
312 ///
313 /// # Examples
314 ///
315 /// ```
316 /// # use hyperloglog_rs::prelude::*;
317 ///
318 /// // Create a new HyperLogLog counter with precision 14 and 5 bits per register.
319 /// let mut hll = HyperLogLogWithMultiplicities::<Precision14, 5>::default();
320 ///
321 /// // Add some elements to the counter.
322 /// hll.insert(&1);
323 /// hll.insert(&2);
324 /// hll.insert(&3);
325 ///
326 /// // Get the number of zero registers.
327 /// let number_of_zero_registers = hll.get_number_of_zero_registers();
328 ///
329 /// assert_eq!(number_of_zero_registers, 16381);
330 /// ```
331 fn get_number_of_zero_registers(&self) -> usize {
332 self.multiplicities[0].convert()
333 }
334}
335
336impl<P: Precision + WordType<BITS>, const BITS: usize, A: Hash> core::iter::FromIterator<A>
337 for HyperLogLogWithMultiplicities<P, BITS>
338{
339 #[inline(always)]
340 /// Creates a new HyperLogLogWithMultiplicities counter and adds all elements from an iterator to it.
341 ///
342 /// # Examples
343 ///
344 /// ```
345 /// use hyperloglog_rs::prelude::*;
346 ///
347 /// let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9];
348 /// let hll: HyperLogLogWithMultiplicities<Precision12, 5> = data.iter().collect();
349 /// assert!(
350 /// hll.estimate_cardinality() > 0.9 * data.len() as f32,
351 /// concat!(
352 /// "The estimate is too low, expected ",
353 /// "at least {}, got {}",
354 /// ),
355 /// 0.9 * data.len() as f32,
356 /// hll.estimate_cardinality()
357 /// );
358 /// assert!(
359 /// hll.estimate_cardinality() < 1.1 * data.len() as f32,
360 /// concat!(
361 /// "The estimate is too high, expected ",
362 /// "at most {}, got {}",
363 /// ),
364 /// 1.1 * data.len() as f32,
365 /// hll.estimate_cardinality()
366 /// );
367 /// ```
368 fn from_iter<T: IntoIterator<Item = A>>(iter: T) -> Self {
369 let mut hll = Self::default();
370 for item in iter {
371 hll.insert(item);
372 }
373 hll
374 }
375}