hyperloglog_rs/hyperloglog.rs
1use crate::array_default::{ArrayDefault, ArrayIter};
2use crate::precisions::{Precision, WordType};
3use crate::prelude::HyperLogLogTrait;
4use crate::primitive::Primitive;
5use core::hash::Hash;
6
7#[derive(Clone, Debug, Copy)]
8/// A probabilistic algorithm for estimating the number of distinct elements in a set.
9///
10/// HyperLogLog is a probabilistic algorithm designed to estimate the number
11/// of distinct elements in a set. It does so by taking advantage of the fact
12/// that the representation of an element can be transformed into a uniform
13/// distribution in a space with a fixed range.
14///
15/// HyperLogLog works by maintaining a fixed-sized register array,
16/// where each register holds a counter. The algorithm splits the input set into subsets,
17/// applies a hash function to each element in the subset, and then updates
18/// the corresponding counter in the register array.
19///
20/// HyperLogLog uses a trick called "probabilistic counting" to estimate
21/// the number of distinct elements in the set. Each register counter is converted
22/// to a binary string, and the algorithm counts the number of leading zeros in
23/// each binary string. The maximum number of leading zeros over all counters
24/// is used to estimate the number of distinct elements in the set.
25///
26/// HyperLogLog has a tunable parameter called precision that determines
27/// the accuracy of the algorithm. Higher precision leads to better accuracy,
28/// but requires more memory. The error rate of the algorithm is guaranteed
29/// to be within a certain bound, depending on the chosen precision.
30///
31/// # Examples
32///
33/// ```
34/// use hyperloglog_rs::prelude::*;
35///
36/// let mut hll = HyperLogLog::<Precision12, 6>::default();
37/// hll.insert(&"apple");
38/// hll.insert(&"banana");
39/// hll.insert(&"cherry");
40///
41/// let estimated_cardinality = hll.estimate_cardinality();
42/// assert!(estimated_cardinality >= 3.0_f32 * 0.9 &&
43/// estimated_cardinality <= 3.0_f32 * 1.1);
44/// ```
45///
46/// # Citations
47///
48/// This implementation is based on the following papers:
49///
50/// * Flajolet, Philippe, et al. "HyperLogLog: the analysis of a near-optimal cardinality estimation algorithm." DMTCS Proceedings 1 (2007): 127-146.
51/// * Heule, Stefan, Marc Nunkesser, and Alexander Hall. "HyperLogLog in practice: algorithmic engineering of a state of the art cardinality estimation algorithm." Proceedings of the 16th International Conference on Extending Database Technology. 2013.
52///
53pub struct HyperLogLog<P: Precision + WordType<BITS>, const BITS: usize> {
54 pub(crate) words: P::Words,
55 pub(crate) number_of_zero_registers: P::NumberOfZeros,
56}
57
58impl<P: Precision + WordType<BITS>, const BITS: usize> HyperLogLog<P, BITS> {
59 /// Create a new HyperLogLog counter.
60 fn new() -> Self {
61 Self {
62 words: P::Words::default_array(),
63 number_of_zero_registers: P::NumberOfZeros::reverse(P::NUMBER_OF_REGISTERS),
64 }
65 }
66
67 /// Create a new HyperLogLog counter from an array of words.
68 ///
69 /// # Arguments
70 /// * `words` - An array of u64 words to use for the HyperLogLog counter.
71 ///
72 /// # Returns
73 /// A new HyperLogLog counter initialized with the given words.
74 ///
75 /// # Examples
76 ///
77 /// ```rust
78 /// use hyperloglog_rs::prelude::*;
79 ///
80 /// let words = [0_u32; 4];
81 /// let hll = HyperLogLog::<Precision4, 6>::from_words(&words);
82 /// assert_eq!(hll.len(), 16);
83 /// ```
84 pub fn from_words(words: &P::Words) -> Self {
85 let number_of_zero_registers =
86 P::NumberOfZeros::reverse(words.iter_elements().fold(
87 0,
88 |number_of_zero_registers, word| {
89 // We check that in all words the PADDING_BITS_MASK
90 // is all zeros.
91 debug_assert!(
92 word & Self::PADDING_BITS_MASK == 0,
93 concat!(
94 "The padding bits of the word {} must be all zeros. ",
95 "We have obtained {} instead."
96 ),
97 word,
98 word & Self::PADDING_BITS_MASK
99 );
100
101 (0..Self::NUMBER_OF_REGISTERS_IN_WORD).fold(
102 number_of_zero_registers,
103 |number_of_zero_registers, i| {
104 let register = (word >> (i * BITS)) & Self::LOWER_REGISTER_MASK;
105 number_of_zero_registers + (register == 0) as usize
106 },
107 )
108 },
109 )) - P::NumberOfZeros::reverse(Self::get_number_of_padding_registers());
110
111 // We check that the values in the last word are masked
112 // according to the LAST_WORD_PADDING_BITS_MASK.
113 debug_assert!(
114 words.last().unwrap() & Self::LAST_WORD_PADDING_BITS_MASK == 0,
115 concat!(
116 "The padding bits of the last word {} must be all zeros. ",
117 "We have obtained {} instead. The last word padding bits mask is, ",
118 "when represented in binary, {:#034b}."
119 ),
120 words.last().unwrap(),
121 words.last().unwrap() & Self::LAST_WORD_PADDING_BITS_MASK,
122 Self::LAST_WORD_PADDING_BITS_MASK
123 );
124
125 Self {
126 words: *words,
127 number_of_zero_registers,
128 }
129 }
130
131 /// Create a new HyperLogLog counter from an array of registers.
132 ///
133 /// # Arguments
134 ///
135 /// * `registers` - An array of u32 registers to use for the HyperLogLog counter.
136 ///
137 /// # Returns
138 ///
139 /// A new HyperLogLog counter initialized with the given registers.
140 ///
141 /// # Examples
142 ///
143 /// ```
144 /// use hyperloglog_rs::prelude::*;
145 ///
146 /// let registers = [0_u32; 1 << 4];
147 /// let hll = HyperLogLog::<Precision4, 6>::from_registers(®isters);
148 /// assert_eq!(hll.len(), 1 << 4);
149 /// ```
150 pub fn from_registers(registers: &[u32]) -> Self {
151 debug_assert!(
152 registers.len() == P::NUMBER_OF_REGISTERS,
153 "We expect {} registers, but got {}",
154 P::NUMBER_OF_REGISTERS,
155 registers.len()
156 );
157 let mut words = P::Words::default_array();
158 let number_of_zero_registers = P::NumberOfZeros::reverse(
159 words
160 .iter_elements_mut()
161 .zip(registers.chunks(Self::NUMBER_OF_REGISTERS_IN_WORD))
162 .fold(0, |number_of_zero_registers, (word, word_registers)| {
163 word_registers.iter().copied().enumerate().fold(
164 number_of_zero_registers,
165 |number_of_zero_registers, (i, register)| {
166 debug_assert!(
167 register <= Self::LOWER_REGISTER_MASK,
168 "Register value {} is too large for the given number of bits {}",
169 register,
170 BITS
171 );
172 *word |= register << (i * BITS);
173 number_of_zero_registers + (register == 0) as usize
174 },
175 )
176 }),
177 );
178
179 Self {
180 words,
181 number_of_zero_registers,
182 }
183 }
184
185 #[inline(always)]
186 /// Adds an element to the HyperLogLog counter, and returns whether the counter has changed.
187 ///
188 /// # Arguments
189 /// * `rhs` - The element to add.
190 ///
191 /// # Examples
192 ///
193 /// ```
194 /// use hyperloglog_rs::prelude::*;
195 ///
196 /// let mut hll = HyperLogLog::<Precision10, 6>::default();
197 ///
198 /// hll.insert("Hello");
199 /// hll.insert("World");
200 ///
201 /// assert!(hll.estimate_cardinality() >= 2.0);
202 /// ```
203 ///
204 /// # Performance
205 ///
206 /// The performance of this function depends on the size of the HyperLogLog counter (`N`), the number
207 /// of distinct elements in the input, and the hash function used to hash elements. For a given value of `N`,
208 /// the function has an average time complexity of O(1) and a worst-case time complexity of O(log N).
209 /// However, the actual time complexity may vary depending on the distribution of the hashed elements.
210 ///
211 /// # Errors
212 ///
213 /// This function does not return any errors.
214 pub fn insert<T: Hash>(&mut self, rhs: T) -> bool {
215 let (mut hash, index) = self.get_hash_and_index::<T>(&rhs);
216
217 // We need to add ones to the hash to make sure that the
218 // the number of zeros we obtain afterwards is never higher
219 // than the maximal value that may be represented in a register
220 // with BITS bits.
221 if BITS < 6 {
222 // In the case of BITS = 5, we have registers of
223 // 5 bits each. The maximal value that can be represented
224 // in a register is 31. As such, we need to add 1 << (65 - 32)
225 // to the hash. Similarly, in the case of BITS = 4, we have
226 // registers of 4 bits each. The maximal value that can be
227 // represented in a register is 15.
228 hash |= 1 << (65 - (1 << BITS))
229 } // else {
230 // // Otherwise, with registers from 6 bits upwards we can
231 // // represent a value of 64 or larger. Since we are using
232 // // an hash function of 64 bits, the maximal number of
233 // //
234 // // 1 << (P::EXPONENT - 1)
235 // };
236
237 // Count leading zeros.
238 let number_of_zeros: u32 = 1 + hash.leading_zeros();
239
240 // We add a debug assertion to make sure that the number of zeros
241 // we have obtained is not larger than the maximal value that can
242 // be represented in a register with BITS bits.
243 debug_assert!(
244 number_of_zeros < 1 << BITS,
245 concat!("The number of zeros {} must be less than or equal to {}. ",),
246 number_of_zeros,
247 1 << BITS
248 );
249
250 // Calculate the position of the register in the internal buffer array.
251 let word_position = index / Self::NUMBER_OF_REGISTERS_IN_WORD;
252 let register_position = index - word_position * Self::NUMBER_OF_REGISTERS_IN_WORD;
253
254 debug_assert!(
255 word_position < self.words.len(),
256 concat!(
257 "The word_position {} must be less than the number of words {}. ",
258 "You have obtained this values starting from the index {} and the number of registers in word {}. ",
259 "We currently have {} registers. Currently using precision {} and number of bits {}."
260 ),
261 word_position,
262 self.words.len(),
263 index,
264 Self::NUMBER_OF_REGISTERS_IN_WORD,
265 P::NUMBER_OF_REGISTERS,
266 P::EXPONENT,
267 BITS
268 );
269
270 // Extract the current value of the register at `index`.
271 let register_value: u32 =
272 (self.words[word_position] >> (register_position * BITS)) & Self::LOWER_REGISTER_MASK;
273
274 // Otherwise, update the register using a bit mask.
275 if number_of_zeros > register_value {
276 self.words[word_position] &= !(Self::LOWER_REGISTER_MASK << (register_position * BITS));
277 self.words[word_position] |= number_of_zeros << (register_position * BITS);
278 self.number_of_zero_registers -=
279 P::NumberOfZeros::reverse((register_value == 0) as usize);
280
281 // We check that the value we have written to the register is correct.
282 debug_assert!(
283 self.words[word_position] >> (register_position * BITS) & Self::LOWER_REGISTER_MASK
284 == number_of_zeros,
285 concat!(
286 "The value of the register at position {} must be {}. ",
287 "We have obtained {} instead. ",
288 "The current value of the word is {}."
289 ),
290 index,
291 number_of_zeros,
292 self.words[word_position] >> (register_position * BITS) & Self::LOWER_REGISTER_MASK,
293 self.words[word_position]
294 );
295
296 // We check that the word we have edited maintains that the padding bits are all zeros
297 // and have not been manipulated in any way. If these bits were manipulated, it would mean
298 // that we have a bug in the code.
299 debug_assert!(
300 self.words[word_position] & Self::PADDING_BITS_MASK == 0,
301 concat!(
302 "The padding bits of the word {} must be all zeros. ",
303 "We have obtained {} instead."
304 ),
305 self.words[word_position],
306 self.words[word_position] & Self::PADDING_BITS_MASK
307 );
308
309 // We also check that if the word we have edites is the last word, then the padding bits
310 // of the word must be all zeros.
311 debug_assert!(
312 index != P::NUMBER_OF_REGISTERS - 1
313 || self.words[word_position] & Self::LAST_WORD_PADDING_BITS_MASK == 0,
314 concat!(
315 "The padding bits of the last word {} must be all zeros. ",
316 "We have obtained {} instead. The last word padding bits mask is, ",
317 "when represented in binary, {:#034b}.\n ",
318 "The word in binary is {:#034b}. ",
319 "The current case is using precision {} and bits {}. As such, ",
320 "we expect to have {} padding registers in the last word."
321 ),
322 self.words[word_position],
323 self.words[word_position] & Self::LAST_WORD_PADDING_BITS_MASK,
324 Self::LAST_WORD_PADDING_BITS_MASK,
325 self.words[word_position],
326 P::EXPONENT,
327 BITS,
328 Self::get_number_of_padding_registers()
329 );
330
331 true
332 } else {
333 false
334 }
335 }
336}
337
338impl<P: Precision + WordType<BITS>, const BITS: usize> Eq for HyperLogLog<P, BITS> {
339 fn assert_receiver_is_total_eq(&self) {
340 // This is a no-op because we know that `Self` is `Eq`.
341 }
342}
343
344/// Implements PartialEq for HyperLogLog.
345///
346/// # Implementative details
347/// Two HyperLogLog counters are considered equal if they have the same words.
348///
349/// # Examples
350///
351/// ```
352/// # use hyperloglog_rs::prelude::*;
353///
354/// let mut hll1 = HyperLogLog::<Precision14, 5>::default();
355/// hll1.insert(&2);
356///
357/// let mut hll2 = HyperLogLog::<Precision14, 5>::default();
358/// hll2.insert(&2);
359/// hll2.insert(&3);
360///
361/// assert_ne!(hll1, hll2);
362///
363/// hll1 |= hll2;
364///
365/// assert_eq!(hll1, hll2);
366/// ```
367impl<P: Precision + WordType<BITS>, const BITS: usize> PartialEq for HyperLogLog<P, BITS> {
368 /// Returns whether the two HyperLogLog counters are equal.
369 fn eq(&self, other: &Self) -> bool {
370 self.words == other.words
371 }
372}
373
374/// Implements the Default trait for HyperLogLog.
375///
376/// HyperLogLog is a probabilistic cardinality estimator that uses a fixed
377/// amount of memory to estimate the number of distinct elements in a set.
378///
379/// # Examples
380///
381/// ```rust
382/// # use hyperloglog_rs::prelude::*;
383///
384/// let hll: HyperLogLog<Precision10, 6> = Default::default();
385/// assert_eq!(hll.len(), 1024);
386/// ```
387impl<P: Precision + WordType<BITS>, const BITS: usize> Default for HyperLogLog<P, BITS> {
388 /// Returns a new HyperLogLog instance with default configuration settings.
389 fn default() -> Self {
390 Self::new()
391 }
392}
393
394impl<P: Precision + WordType<BITS>, const BITS: usize, T: Hash> From<T> for HyperLogLog<P, BITS> {
395 /// Create a new HyperLogLog counter from a value.
396 ///
397 /// This method creates a new empty HyperLogLog counter and inserts the hash
398 /// of the given value into it. The value can be any type that implements
399 /// the `Hash` trait.
400 ///
401 /// # Examples
402 ///
403 /// ```
404 /// # use hyperloglog_rs::prelude::*;
405 ///
406 /// let hll = HyperLogLog::<Precision14, 5>::from("test");
407 ///
408 /// assert!(hll.estimate_cardinality() >= 1.0_f32);
409 /// assert!(!hll.is_empty());
410 /// assert!(hll.may_contain(&"test"));
411 /// ```
412 fn from(value: T) -> Self {
413 let mut hll = Self::new();
414 hll.insert(value);
415 hll
416 }
417}
418
419impl<P: Precision + WordType<BITS>, const BITS: usize> HyperLogLogTrait<P, BITS>
420 for HyperLogLog<P, BITS>
421{
422 #[inline(always)]
423 /// Returns the number of registers with zero values. This value is used for computing a small
424 /// correction when estimating the cardinality of a small set.
425 ///
426 /// # Examples
427 ///
428 /// ```
429 /// # use hyperloglog_rs::prelude::*;
430 ///
431 /// // Create a new HyperLogLog counter with precision 14 and 5 bits per register.
432 /// let mut hll = HyperLogLog::<Precision14, 5>::default();
433 ///
434 /// // Add some elements to the counter.
435 /// hll.insert(&1);
436 /// hll.insert(&2);
437 /// hll.insert(&3);
438 ///
439 /// // Get the number of zero registers.
440 /// let number_of_zero_registers = hll.get_number_of_zero_registers();
441 ///
442 /// assert_eq!(number_of_zero_registers, 16381);
443 /// ```
444 fn get_number_of_zero_registers(&self) -> usize {
445 self.number_of_zero_registers.convert()
446 }
447
448 #[inline(always)]
449 /// Returns the array of words of the HyperLogLog counter.
450 fn get_words(&self) -> &P::Words {
451 &self.words
452 }
453}
454
455impl<P: Precision + WordType<BITS>, const BITS: usize, A: Hash> core::iter::FromIterator<A>
456 for HyperLogLog<P, BITS>
457{
458 #[inline(always)]
459 /// Creates a new HyperLogLog counter and adds all elements from an iterator to it.
460 ///
461 /// # Examples
462 ///
463 /// ```
464 /// use hyperloglog_rs::prelude::*;
465 ///
466 /// let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9];
467 /// let hll: HyperLogLog<Precision12, 5> = data.iter().collect();
468 /// assert!(
469 /// hll.estimate_cardinality() > 0.9 * data.len() as f32,
470 /// concat!(
471 /// "The estimate is too low, expected ",
472 /// "at least {}, got {}",
473 /// ),
474 /// 0.9 * data.len() as f32,
475 /// hll.estimate_cardinality()
476 /// );
477 /// assert!(
478 /// hll.estimate_cardinality() < 1.1 * data.len() as f32,
479 /// concat!(
480 /// "The estimate is too high, expected ",
481 /// "at most {}, got {}",
482 /// ),
483 /// 1.1 * data.len() as f32,
484 /// hll.estimate_cardinality()
485 /// );
486 /// ```
487 fn from_iter<T: IntoIterator<Item = A>>(iter: T) -> Self {
488 let mut hll = Self::new();
489 for item in iter {
490 hll.insert(item);
491 }
492 hll
493 }
494}