hyperloglog_rs/
hyper_spheres_sketch.rs

1//! Exact sketching algorithms.
2//!
3//! This submodule contains the implementation of the exact sketching algorithms
4//! as part of a trait.
5//!
6//! A sketch is a representation of the similarity between two list of sets.
7//!
8//! It is used in cases such as in graphs for representing the similarity between
9//! two nodes, de facto providing features that characterize a candidate edge
10//! between two nodes.
11//!
12//! While in the HyperLogLog case we provide the approximated version of this algorithm,
13//! sometimes it is necessary, such as in test cases, to have the exact version of the
14//! algorithm. The approximated version is faster and uses less memory, but it is not,
15//! of course, guaranteed to be exact. Learn more about the approximated version in the
16//! [HyperLogLog.estimated_overlap_and_differences_cardinality_matrices] method.
17//!
18use crate::prelude::*;
19use core::fmt::Debug;
20use core::ops::AddAssign;
21
22pub trait SetLike<I> {
23    /// Returns the estimated intersection and left and right difference cardinality between two sets.
24    fn get_estimated_union_cardinality(
25        &self,
26        self_cardinality: I,
27        other: &Self,
28        other_cardinality: I,
29    ) -> EstimatedUnionCardinalities<I>;
30
31    /// Returns the cardinality of the set.
32    fn get_cardinality(&self) -> I;
33}
34
35impl<F: Primitive<f32>, P: Precision + WordType<BITS>, const BITS: usize> SetLike<F>
36    for HyperLogLog<P, BITS>
37{
38    fn get_estimated_union_cardinality(
39        &self,
40        self_cardinality: F,
41        other: &Self,
42        other_cardinality: F,
43    ) -> EstimatedUnionCardinalities<F> {
44        let mut raw_union_estimate = 0.0;
45
46        let mut union_zeros = 0;
47        for (left_word, right_word) in self
48            .get_words()
49            .iter_elements()
50            .copied()
51            .zip(other.get_words().iter_elements().copied())
52        {
53            let mut union_partial: f32 = 0.0;
54            for i in 0..Self::NUMBER_OF_REGISTERS_IN_WORD {
55                let left_register = (left_word >> (i * BITS)) & Self::LOWER_REGISTER_MASK;
56                let right_register = (right_word >> (i * BITS)) & Self::LOWER_REGISTER_MASK;
57                let maximal_register = (left_register).max(right_register);
58                union_partial += f32::from_le_bytes(((127 - maximal_register) << 23).to_le_bytes());
59                union_zeros += (maximal_register == 0) as usize;
60            }
61            raw_union_estimate += union_partial;
62        }
63
64        union_zeros -= Self::get_number_of_padding_registers();
65
66        // We need to subtract the padding registers from the raw estimates
67        // as for each such register we are adding a one.
68        raw_union_estimate -= Self::get_number_of_padding_registers() as f32;
69
70        let union_estimate = F::reverse(Self::adjust_estimate_with_zeros(
71            raw_union_estimate,
72            union_zeros,
73        ));
74
75        // union_estimate = union_estimate.get_min(self_cardinality + other_cardinality);
76
77        EstimatedUnionCardinalities::from((self_cardinality, other_cardinality, union_estimate))
78    }
79
80    fn get_cardinality(&self) -> F {
81        F::reverse(self.estimate_cardinality())
82    }
83}
84
85pub trait HyperSpheresSketch<I>: Sized + SetLike<I>
86where
87    I: Copy
88        + Default
89        + core::ops::Add<Output = I>
90        + core::ops::Sub<Output = I>
91        + core::ops::Div<Output = I>
92        + core::ops::Mul<Output = I>
93        + AddAssign
94        + Debug
95        + One
96        + MaxMin,
97{
98    #[inline(always)]
99    /// Returns the overlap and differences cardinality matrices of two lists of sets.
100    ///
101    /// # Arguments
102    /// * `left` - The first list of sets.
103    /// * `right` - The second list of sets.
104    ///
105    /// # Returns
106    /// * `overlap_cardinality_matrix` - Matrix of estimated overlapping cardinalities between the elements of the left and right arrays.
107    /// * `left_difference_cardinality_vector` - Vector of estimated difference cardinalities between the elements of the left array and the last element of the right array.
108    /// * `right_difference_cardinality_vector` - Vector of estimated difference cardinalities between the elements of the right array and the last element of the left array.
109    ///
110    /// # Implementative details
111    /// We expect the elements of the left and right arrays to be increasingly contained in the next one.
112    ///
113    /// # Examples
114    /// In the following illustration, we show that for two vectors left and right of three elements,
115    /// we expect to compute the exclusively overlap matrix $A_{ij}$ and the exclusively differences vectors $B_i$.    
116    ///
117    /// ![Illustration of overlaps](https://github.com/LucaCappelletti94/hyperloglog-rs/blob/main/triple_overlap.png?raw=true)
118    ///
119    /// Very similarly, for the case of vectors of two elements:
120    ///
121    /// ![Illustration of overlaps](https://github.com/LucaCappelletti94/hyperloglog-rs/blob/main/tuple_overlap.png?raw=true)
122    ///
123    fn overlap_and_differences_cardinality_matrices<const L: usize, const R: usize>(
124        left: &[Self; L],
125        right: &[Self; R],
126    ) -> ([[I; R]; L], [I; L], [I; R]) {
127        // Initialize overlap and differences cardinality matrices/vectors.
128        let mut last_row = [I::default(); R];
129        let mut differential_overlap_cardinality_matrix = [[I::default(); R]; L];
130        let mut left_difference_cardinality_vector = [I::default(); L];
131        let mut right_cardinalities = [I::default(); R];
132
133        right.iter().zip(right_cardinalities.iter_mut()).for_each(
134            |(right_hll, right_cardinality)| {
135                *right_cardinality = right_hll.get_cardinality();
136            },
137        );
138
139        let mut right_difference_cardinality_vector = [I::default(); R];
140        let mut euc: EstimatedUnionCardinalities<I> = EstimatedUnionCardinalities::default();
141        let mut last_left_difference: I = I::default();
142
143        // Populate the overlap cardinality matrix.
144        for (i, left_hll) in left.iter().enumerate() {
145            let mut last_right_difference: I = I::default();
146            let left_cardinality = left_hll.get_cardinality();
147            let mut comulative_row = I::default();
148            for (j, (right_hll, right_cardinality)) in
149                right.iter().zip(right_cardinalities).enumerate()
150            {
151                euc = left_hll.get_estimated_union_cardinality(
152                    left_cardinality,
153                    right_hll,
154                    right_cardinality,
155                );
156                let delta = last_row[j] + comulative_row;
157                differential_overlap_cardinality_matrix[i][j] =
158                    (euc.get_intersection_cardinality() - delta).get_max(I::default());
159                last_row[j] = euc.get_intersection_cardinality().get_max(delta);
160                comulative_row += differential_overlap_cardinality_matrix[i][j];
161
162                // We always set the value of the right difference so that the
163                // last time we write this will necessarily be with the last
164                // and largest left set.
165                right_difference_cardinality_vector[j] = (euc.get_right_difference_cardinality()
166                    - last_right_difference)
167                    .get_max(I::default());
168
169                last_right_difference = euc.get_right_difference_cardinality();
170            }
171            left_difference_cardinality_vector[i] = (euc.get_left_difference_cardinality()
172                - last_left_difference)
173                .get_max(I::default());
174            last_left_difference = euc.get_left_difference_cardinality();
175        }
176
177        (
178            differential_overlap_cardinality_matrix,
179            left_difference_cardinality_vector,
180            right_difference_cardinality_vector,
181        )
182    }
183
184    #[cfg(feature = "std")]
185    #[inline(always)]
186    /// Returns the overlap and differences cardinality matrices of two lists of sets.
187    ///
188    /// # Arguments
189    /// * `left` - The first list of sets.
190    /// * `right` - The second list of sets.
191    ///
192    /// # Returns
193    /// * `overlap_cardinality_matrix` - Matrix of estimated overlapping cardinalities between the elements of the left and right arrays.
194    /// * `left_difference_cardinality_vector` - Vector of estimated difference cardinalities between the elements of the left array and the last element of the right array.
195    /// * `right_difference_cardinality_vector` - Vector of estimated difference cardinalities between the elements of the right array and the last element of the left array.
196    ///
197    /// # Implementative details
198    /// We expect the elements of the left and right arrays to be increasingly contained in the next one.
199    ///
200    /// # Examples
201    /// In the following illustration, we show that for two vectors left and right of three elements,
202    /// we expect to compute the exclusively overlap matrix $A_{ij}$ and the exclusively differences vectors $B_i$.    
203    ///
204    /// ![Illustration of overlaps](https://github.com/LucaCappelletti94/hyperloglog-rs/blob/main/triple_overlap.png?raw=true)
205    ///
206    /// Very similarly, for the case of vectors of two elements:
207    ///
208    /// ![Illustration of overlaps](https://github.com/LucaCappelletti94/hyperloglog-rs/blob/main/tuple_overlap.png?raw=true)
209    ///
210    fn overlap_and_differences_cardinality_matrices_vec(
211        left: &[Self],
212        right: &[Self],
213    ) -> (Vec<Vec<I>>, Vec<I>, Vec<I>) {
214        // Initialize overlap and differences cardinality matrices/vectors.
215        let mut last_row = vec![I::default(); right.len()];
216        let mut differential_overlap_cardinality_matrix =
217            vec![vec![I::default(); right.len()]; left.len()];
218        let mut left_difference_cardinality_vector = vec![I::default(); left.len()];
219        let mut right_cardinalities = vec![I::default(); right.len()];
220
221        right.iter().zip(right_cardinalities.iter_mut()).for_each(
222            |(right_hll, right_cardinality)| {
223                *right_cardinality = right_hll.get_cardinality();
224            },
225        );
226
227        let mut right_difference_cardinality_vector = vec![I::default(); right.len()];
228        let mut euc: EstimatedUnionCardinalities<I> = EstimatedUnionCardinalities::default();
229        let mut last_left_difference: I = I::default();
230
231        // Populate the overlap cardinality matrix.
232        for (i, left_hll) in left.iter().enumerate() {
233            let mut last_right_difference: I = I::default();
234            let left_cardinality = left_hll.get_cardinality();
235            let mut comulative_row = I::default();
236            for (j, (right_hll, right_cardinality)) in right
237                .iter()
238                .zip(right_cardinalities.iter().copied())
239                .enumerate()
240            {
241                euc = left_hll.get_estimated_union_cardinality(
242                    left_cardinality,
243                    right_hll,
244                    right_cardinality,
245                );
246                let delta = last_row[j] + comulative_row;
247                differential_overlap_cardinality_matrix[i][j] =
248                    (euc.get_intersection_cardinality() - delta).get_max(I::default());
249                last_row[j] = euc.get_intersection_cardinality().get_max(delta);
250                comulative_row += differential_overlap_cardinality_matrix[i][j];
251
252                // We always set the value of the right difference so that the
253                // last time we write this will necessarily be with the last
254                // and largest left set.
255                right_difference_cardinality_vector[j] = (euc.get_right_difference_cardinality()
256                    - last_right_difference)
257                    .get_max(I::default());
258
259                last_right_difference = euc.get_right_difference_cardinality();
260            }
261            left_difference_cardinality_vector[i] = (euc.get_left_difference_cardinality()
262                - last_left_difference)
263                .get_max(I::default());
264            last_left_difference = euc.get_left_difference_cardinality();
265        }
266
267        (
268            differential_overlap_cardinality_matrix,
269            left_difference_cardinality_vector,
270            right_difference_cardinality_vector,
271        )
272    }
273
274    #[inline(always)]
275    /// Returns the normalized overlap and differences cardinality matrices of two lists of sets.
276    ///
277    /// # Arguments
278    /// * `left` - The first list of sets.
279    /// * `right` - The second list of sets.
280    ///
281    /// # Returns
282    /// * `overlap_cardinality_matrix` - Matrix of normalized estimated overlapping cardinalities between the elements of the left and right arrays.
283    /// * `left_difference_cardinality_vector` - Vector of normalized estimated difference cardinalities between the elements of the left array and the last element of the right array.
284    /// * `right_difference_cardinality_vector` - Vector of normalized estimated difference cardinalities between the elements of the right array and the last element of the left array.
285    ///
286    fn normalized_overlap_and_differences_cardinality_matrices<const L: usize, const R: usize>(
287        left: &[Self; L],
288        right: &[Self; R],
289    ) -> ([[I; R]; L], [I; L], [I; R]) {
290        // Initialize overlap and differences cardinality matrices/vectors.
291        let mut last_row = [I::default(); R];
292        let mut differential_overlap_cardinality_matrix = [[I::default(); R]; L];
293        let mut left_difference_cardinality_vector = [I::default(); L];
294        let mut right_cardinalities = [I::default(); R];
295
296        right.iter().zip(right_cardinalities.iter_mut()).for_each(
297            |(right_hll, right_cardinality)| {
298                *right_cardinality = right_hll.get_cardinality();
299            },
300        );
301
302        // We run a debug assert where we check that each right cardinality is
303        // larger than the previous one.
304        debug_assert!(right_cardinalities
305            .iter()
306            .zip(right_cardinalities.iter().skip(1))
307            .all(|(left, right)| left <= right));
308
309        let mut right_difference_cardinality_vector = [I::default(); R];
310        let mut euc: EstimatedUnionCardinalities<I> = EstimatedUnionCardinalities::default();
311        let mut last_left_difference: I = I::default();
312        let mut last_left_cardinality: I = I::default();
313
314        // Populate the overlap cardinality matrix.
315        for (i, left_hll) in left.iter().enumerate() {
316            let mut last_right_difference: I = I::default();
317            let left_cardinality = left_hll.get_cardinality();
318            let mut comulative_row = I::default();
319            let mut last_right_cardinality = I::default();
320            for (j, (right_hll, right_cardinality)) in
321                right.iter().zip(right_cardinalities).enumerate()
322            {
323                euc = left_hll.get_estimated_union_cardinality(
324                    left_cardinality,
325                    right_hll,
326                    right_cardinality,
327                );
328                let delta = last_row[j] + comulative_row;
329                let differential_intersection =
330                    (euc.get_intersection_cardinality() - delta).get_max(I::default());
331
332                debug_assert!(
333                    differential_intersection >= I::default(),
334                    concat!(
335                        "Expected differential_intersection to be larger than zero, but it is not. ",
336                        "Got: differential_intersection: {:?}, delta: {:?}",
337                    ),
338                    differential_intersection,
339                    delta,
340                );
341
342                let maximal_differential_intersection_cardinality =
343                    (euc.get_left_difference_cardinality() - last_left_difference
344                        + right_cardinality
345                        - last_right_cardinality)
346                        .get_max(I::non_zero_positive_min_value());
347
348                differential_overlap_cardinality_matrix[i][j] = (differential_intersection
349                    / maximal_differential_intersection_cardinality)
350                    .get_min(I::ONE);
351                last_row[j] = euc.get_intersection_cardinality().get_max(delta);
352                comulative_row += differential_intersection;
353
354                // We always set the value of the right difference so that the
355                // last time we write this will necessarily be with the last
356                // and largest left set.
357
358                let differential_right_difference = (euc.get_right_difference_cardinality()
359                    - last_right_difference)
360                    .get_max(I::default());
361                let maximal_differential_right_difference = (right_cardinality
362                    - last_right_cardinality)
363                    .get_max(I::non_zero_positive_min_value());
364
365                right_difference_cardinality_vector[j] = (differential_right_difference
366                    / maximal_differential_right_difference)
367                    .get_min(I::ONE);
368                last_right_difference = euc.get_right_difference_cardinality();
369                last_right_cardinality = right_cardinality;
370            }
371            left_difference_cardinality_vector[i] = ((euc.get_left_difference_cardinality()
372                - last_left_difference)
373                .get_max(I::default())
374                / (left_cardinality - last_left_cardinality)
375                    .get_max(I::non_zero_positive_min_value()))
376            .get_min(I::ONE);
377            last_left_cardinality = left_cardinality;
378            last_left_difference = euc.get_left_difference_cardinality();
379        }
380
381        (
382            differential_overlap_cardinality_matrix,
383            left_difference_cardinality_vector,
384            right_difference_cardinality_vector,
385        )
386    }
387
388    #[cfg(feature = "std")]
389    #[inline(always)]
390    /// Returns the normalized overlap and differences cardinality matrices of two vectors of sets.
391    ///
392    /// # Arguments
393    /// * `left` - The first list of sets.
394    /// * `right` - The second list of sets.
395    ///
396    /// # Returns
397    /// * `overlap_cardinality_matrix` - Matrix of normalized estimated overlapping cardinalities between the elements of the left and right arrays.
398    /// * `left_difference_cardinality_vector` - Vector of normalized estimated difference cardinalities between the elements of the left array and the last element of the right array.
399    /// * `right_difference_cardinality_vector` - Vector of normalized estimated difference cardinalities between the elements of the right array and the last element of the left array.
400    ///
401    fn normalized_overlap_and_differences_cardinality_matrices_vec(
402        left: &[Self],
403        right: &[Self],
404    ) -> (Vec<Vec<I>>, Vec<I>, Vec<I>) {
405        // Initialize overlap and differences cardinality matrices/vectors.
406        let mut last_row = vec![I::default(); right.len()];
407        let mut differential_overlap_cardinality_matrix =
408            vec![vec![I::default(); right.len()]; left.len()];
409        let mut left_difference_cardinality_vector = vec![I::default(); left.len()];
410        let mut right_cardinalities = vec![I::default(); right.len()];
411
412        right.iter().zip(right_cardinalities.iter_mut()).for_each(
413            |(right_hll, right_cardinality)| {
414                *right_cardinality = right_hll.get_cardinality();
415            },
416        );
417
418        // We run a debug assert where we check that each right cardinality is
419        // larger than the previous one.
420        debug_assert!(right_cardinalities
421            .iter()
422            .zip(right_cardinalities.iter().skip(1))
423            .all(|(left, right)| left <= right));
424
425        let mut right_difference_cardinality_vector = vec![I::default(); right.len()];
426        let mut euc: EstimatedUnionCardinalities<I> = EstimatedUnionCardinalities::default();
427        let mut last_left_difference: I = I::default();
428        let mut last_left_cardinality: I = I::default();
429
430        // Populate the overlap cardinality matrix.
431        for (i, left_hll) in left.iter().enumerate() {
432            let mut last_right_difference: I = I::default();
433            let left_cardinality = left_hll.get_cardinality();
434            let mut comulative_row = I::default();
435            let mut last_right_cardinality = I::default();
436            for (j, (right_hll, right_cardinality)) in right
437                .iter()
438                .zip(right_cardinalities.iter().copied())
439                .enumerate()
440            {
441                euc = left_hll.get_estimated_union_cardinality(
442                    left_cardinality,
443                    right_hll,
444                    right_cardinality,
445                );
446                let delta = last_row[j] + comulative_row;
447                let differential_intersection =
448                    (euc.get_intersection_cardinality() - delta).get_max(I::default());
449
450                debug_assert!(
451                    differential_intersection >= I::default(),
452                    concat!(
453                        "Expected differential_intersection to be larger than zero, but it is not. ",
454                        "Got: differential_intersection: {:?}, delta: {:?}",
455                    ),
456                    differential_intersection,
457                    delta,
458                );
459
460                let maximal_differential_intersection_cardinality =
461                    (euc.get_left_difference_cardinality() - last_left_difference
462                        + right_cardinality
463                        - last_right_cardinality)
464                        .get_max(I::non_zero_positive_min_value());
465
466                differential_overlap_cardinality_matrix[i][j] = (differential_intersection
467                    / maximal_differential_intersection_cardinality)
468                    .get_min(I::ONE);
469                last_row[j] = euc.get_intersection_cardinality().get_max(delta);
470                comulative_row += differential_intersection;
471
472                // We always set the value of the right difference so that the
473                // last time we write this will necessarily be with the last
474                // and largest left set.
475
476                let differential_right_difference = (euc.get_right_difference_cardinality()
477                    - last_right_difference)
478                    .get_max(I::default());
479                let maximal_differential_right_difference = (right_cardinality
480                    - last_right_cardinality)
481                    .get_max(I::non_zero_positive_min_value());
482
483                right_difference_cardinality_vector[j] = (differential_right_difference
484                    / maximal_differential_right_difference)
485                    .get_min(I::ONE);
486                last_right_difference = euc.get_right_difference_cardinality();
487                last_right_cardinality = right_cardinality;
488            }
489            left_difference_cardinality_vector[i] = ((euc.get_left_difference_cardinality()
490                - last_left_difference)
491                .get_max(I::default())
492                / (left_cardinality - last_left_cardinality)
493                    .get_max(I::non_zero_positive_min_value()))
494            .get_min(I::ONE);
495            last_left_cardinality = left_cardinality;
496            last_left_difference = euc.get_left_difference_cardinality();
497        }
498
499        (
500            differential_overlap_cardinality_matrix,
501            left_difference_cardinality_vector,
502            right_difference_cardinality_vector,
503        )
504    }
505}
506
507impl<P: Precision + WordType<BITS>, const BITS: usize, I: Primitive<f32>> HyperSpheresSketch<I>
508    for HyperLogLog<P, BITS>
509{
510}