1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
//! Exact sketching algorithms.
//!
//! This submodule contains the implementation of the exact sketching algorithms
//! as part of a trait.
//!
//! A sketch is a representation of the similarity between two list of sets.
//!
//! It is used in cases such as in graphs for representing the similarity between
//! two nodes, de facto providing features that characterize a candidate edge
//! between two nodes.
//!
//! While in the HyperLogLog case we provide the approximated version of this algorithm,
//! sometimes it is necessary, such as in test cases, to have the exact version of the
//! algorithm. The approximated version is faster and uses less memory, but it is not,
//! of course, guaranteed to be exact. Learn more about the approximated version in the
//! [HyperLogLog.estimated_overlap_and_differences_cardinality_matrices] method.
//!
use crate::prelude::*;
use core::iter::Sum;
use core::ops::AddAssign;
use core::ops::{DivAssign, MulAssign, SubAssign};
pub trait SetLike<I> {
/// Returns the estimated intersection and left and right difference cardinality between two sets.
fn get_estimated_union_cardinality(
&self,
self_cardinality: I,
other: &Self,
other_cardinality: I,
) -> EstimatedUnionCardinalities<I>;
/// Returns the cardinality of the set.
fn get_cardinality(&self) -> I;
}
impl<
F: Primitive<f32>,
PRECISION: Precision + WordType<BITS>,
const BITS: usize,
> SetLike<F> for HyperLogLog<PRECISION, BITS>
{
fn get_estimated_union_cardinality(
&self,
self_cardinality: F,
other: &Self,
other_cardinality: F,
) -> EstimatedUnionCardinalities<F> {
let mut raw_union_estimate = 0.0;
let mut union_zeros = 0;
for (left_word, right_word) in self
.get_words()
.iter_elements()
.copied()
.zip(other.get_words().iter_elements().copied())
{
let mut union_partial: f32 = 0.0;
for i in 0..Self::NUMBER_OF_REGISTERS_IN_WORD {
let left_register = (left_word >> (i * BITS)) & Self::LOWER_REGISTER_MASK;
let right_register = (right_word >> (i * BITS)) & Self::LOWER_REGISTER_MASK;
let maximal_register = (left_register).max(right_register);
union_partial += f32::from_le_bytes(((127 - maximal_register) << 23).to_le_bytes());
union_zeros += (maximal_register == 0) as usize;
}
raw_union_estimate += union_partial;
}
union_zeros -= Self::get_number_of_padding_registers();
// We need to subtract the padding registers from the raw estimates
// as for each such register we are adding a one.
raw_union_estimate -= Self::get_number_of_padding_registers() as f32;
let union_estimate = F::reverse(Self::adjust_estimate_with_zeros(
raw_union_estimate,
union_zeros,
));
EstimatedUnionCardinalities::from((self_cardinality, other_cardinality, union_estimate))
}
fn get_cardinality(&self) -> F {
F::reverse(self.estimate_cardinality())
}
}
pub trait HyperSpheresSketch: Sized {
/// Returns the overlap and differences cardinality matrices of two lists of sets.
///
/// # Arguments
/// * `left` - The first list of sets.
/// * `right` - The second list of sets.
///
/// # Returns
/// * `overlap_cardinality_matrix` - Matrix of estimated overlapping cardinalities between the elements of the left and right arrays.
/// * `left_difference_cardinality_vector` - Vector of estimated difference cardinalities between the elements of the left array and the last element of the right array.
/// * `right_difference_cardinality_vector` - Vector of estimated difference cardinalities between the elements of the right array and the last element of the left array.
///
/// # Implementative details
/// We expect the elements of the left and right arrays to be increasingly contained in the next one.
///
/// # Examples
/// In the following illustration, we show that for two vectors left and right of three elements,
/// we expect to compute the exclusively overlap matrix $A_{ij}$ and the exclusively differences vectors $B_i$.
///
/// 
///
/// Very similarly, for the case of vectors of two elements:
///
/// 
///
fn overlap_and_differences_cardinality_matrices<
I: Copy
+ Default
+ Primitive<f32>
+ core::ops::Add<Output = I>
+ core::ops::Sub<Output = I>
+ core::ops::Div<Output = I>
+ core::ops::Mul<Output = I>
+ Sum
+ Send
+ Sync
+ AddAssign
+ SubAssign
+ MulAssign
+ DivAssign
+ MaxMin,
const L: usize,
const R: usize,
>(
left: &[Self; L],
right: &[Self; R],
) -> ([[I; R]; L], [I; L], [I; R])
where
Self: SetLike<I>,
{
// Initialize overlap and differences cardinality matrices/vectors.
let mut last_row = [I::default(); R];
let mut differential_overlap_cardinality_matrix = [[I::default(); R]; L];
let mut left_difference_cardinality_vector = [I::default(); L];
let mut right_cardinalities = [I::default(); R];
right.iter().zip(right_cardinalities.iter_mut()).for_each(
|(right_hll, right_cardinality)| {
*right_cardinality = right_hll.get_cardinality();
},
);
let mut right_difference_cardinality_vector = [I::default(); R];
let mut euc: EstimatedUnionCardinalities<I> = EstimatedUnionCardinalities::default();
let mut last_left_difference: I = I::default();
// Populate the overlap cardinality matrix.
for (i, left_hll) in left.iter().enumerate() {
let mut last_right_difference: I = I::default();
let left_cardinality = left_hll.get_cardinality();
let mut comulative_row = I::default();
for (j, (right_hll, right_cardinality)) in
right.iter().zip(right_cardinalities).enumerate()
{
euc = left_hll.get_estimated_union_cardinality(
left_cardinality,
right_hll,
right_cardinality,
);
let delta = last_row[j] + comulative_row;
differential_overlap_cardinality_matrix[i][j] =
(euc.get_intersection_cardinality() - delta).get_max(I::default());
last_row[j] = euc.get_intersection_cardinality().get_max(delta);
comulative_row += differential_overlap_cardinality_matrix[i][j];
// We always set the value of the right difference so that the
// last time we write this will necessarily be with the last
// and largest left set.
let this_difference = euc.get_right_difference_cardinality();
right_difference_cardinality_vector[j] = this_difference - last_right_difference;
last_right_difference = this_difference;
}
let this_difference = euc.get_left_difference_cardinality();
left_difference_cardinality_vector[i] = this_difference - last_left_difference;
last_left_difference = this_difference;
}
(
differential_overlap_cardinality_matrix,
left_difference_cardinality_vector,
right_difference_cardinality_vector,
)
}
}
impl<PRECISION: Precision + WordType<BITS>, const BITS: usize> HyperSpheresSketch
for HyperLogLog<PRECISION, BITS>
{
}