use std::{hash, io};
const EMPTY_HASH: u128 = 0x99aa06d3014798d86001c324468d497f;
const P: u32 = 14;
const M: u32 = 1 << P;
const MAX: u32 = 64 - P;
const MAXX: u64 = u64::MAX >> MAX;
const ALPHA: f64 = 0.7213 / (1f64 + 1.079 / (M as f64));
const Q: u8 = 6;
const R: u8 = 10;
const TQ: u32 = 1 << Q;
const TR: u32 = 1 << R;
const C: f64 = 0.169_919_487_159_739_1;
type Regs = [u16; M as usize];
static L: [f64; 64] = {
let mut l = [0.0f64; 64];
let mut i = 0u64;
while i < 64 {
l[i as usize] = f64::from_bits((1023 - i) << 52);
i += 1;
}
l
};
fn beta(ez: u16) -> f64 {
let zl = (f64::from(ez) + 1.0).ln();
-0.370_393_911 * f64::from(ez)
+ 0.070_471_823 * zl
+ 0.173_936_86 * zl.powi(2)
+ 0.163_398_39 * zl.powi(3)
+ -0.092_377_45 * zl.powi(4)
+ 0.037_380_27 * zl.powi(5)
+ -0.005_384_159 * zl.powi(6)
+ 0.000_424_19 * zl.powi(7)
}
static TBL: std::sync::LazyLock<EcTable> = std::sync::LazyLock::new(EcTable::new);
struct EcTable {
ln1p_neg_b1: Box<[[f64; TR as usize]; TQ as usize]>,
ln1p_neg_b2: Box<[[f64; TR as usize]; TQ as usize]>,
}
impl EcTable {
fn new() -> Self {
let mut ln1p_neg_b1 = Box::new([[0.0; TR as usize]; TQ as usize]);
let mut ln1p_neg_b2 = Box::new([[0.0; TR as usize]; TQ as usize]);
for i in 1..=TQ {
let row = (i as usize) - 1;
if i != TQ {
let den = 2f64.powf(f64::from(P) + f64::from(R) + f64::from(i));
for j1 in 1..=TR {
let col = (j1 as usize) - 1;
let j = f64::from(j1);
let b1 = (f64::from(TR) + j) / den;
let b2 = (f64::from(TR) + j + 1.0) / den;
ln1p_neg_b1[row][col] = f64::ln_1p(-b1);
ln1p_neg_b2[row][col] = f64::ln_1p(-b2);
}
} else {
let den = 2f64.powf(f64::from(P) + f64::from(R) + f64::from(i) - 1.0);
for j1 in 1..=TR {
let col = (j1 as usize) - 1;
let j = f64::from(j1);
let b1 = j / den;
let b2 = (j + 1.0) / den;
ln1p_neg_b1[row][col] = f64::ln_1p(-b1);
ln1p_neg_b2[row][col] = f64::ln_1p(-b2);
}
}
}
EcTable {
ln1p_neg_b1,
ln1p_neg_b2,
}
}
}
#[derive(Clone, Default)]
pub struct Entry {
hasher: xxhash_rust::xxh3::Xxh3Default,
}
impl Entry {
pub fn new() -> Self {
Default::default()
}
pub fn add(&mut self, v: impl hash::Hash) {
v.hash(&mut self.hasher);
}
pub fn add_bytes(&mut self, v: &[u8]) {
self.hasher.update(v);
}
pub fn add_reader(&mut self, mut r: impl io::Read) -> io::Result<u64> {
io::copy(&mut r, &mut self.hasher)
}
pub fn is_empty(&self) -> bool {
self.hasher.digest128() == EMPTY_HASH
}
#[doc(hidden)]
pub fn digest(&self) -> u128 {
self.hasher.digest128()
}
}
impl PartialEq for Entry {
fn eq(&self, other: &Self) -> bool {
self.hasher.digest128() == other.hasher.digest128()
}
}
impl std::fmt::Debug for Entry {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Entry")
.field("digest", &self.hasher.digest128())
.finish()
}
}
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct Sketch {
regs: Regs,
}
impl std::fmt::Debug for Sketch {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
f.debug_struct("Sketch")
.field("cardinality", &self.cardinality())
.finish()
}
}
impl Default for Sketch {
fn default() -> Self {
Self {
regs: [0; M as usize],
}
}
}
impl<T: hash::Hash> std::iter::FromIterator<T> for Sketch {
fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Self {
let mut sk = Self::default();
iter.into_iter().for_each(|v| sk.add(v));
sk
}
}
impl PartialOrd for Sketch {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
self.cardinality().partial_cmp(&other.cardinality())
}
}
impl Sketch {
#[must_use]
pub fn new() -> Self {
Self::default()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.regs.iter().all(|r| *r == 0)
}
fn new_reg(lz: u8, sig: u16) -> u16 {
(u16::from(lz) << R) | sig
}
fn lz(reg: u16) -> u8 {
(reg >> (16 - Q)) as u8
}
fn add_hash(&mut self, h: u128) {
let x: u64 = h as u64;
let y: u64 = (h >> 64) as u64;
let k = x >> MAX;
let lz = ((x << P) ^ MAXX).leading_zeros() as u8 + 1;
let sig = (y << (64 - R) >> (64 - R)) as u16;
let reg = Self::new_reg(lz, sig);
if self.regs[k as usize] < reg {
self.regs[k as usize] = reg;
}
}
pub fn add(&mut self, v: impl hash::Hash) {
let mut hasher = xxhash_rust::xxh3::Xxh3Default::new();
v.hash(&mut hasher);
self.add_hash(hasher.digest128());
}
pub fn add_reader(&mut self, mut r: impl io::Read) -> io::Result<u64> {
let mut hasher = xxhash_rust::xxh3::Xxh3Default::new();
let read = io::copy(&mut r, &mut hasher)?;
self.add_hash(hasher.digest128());
Ok(read)
}
pub fn add_bytes(&mut self, v: &[u8]) {
self.add_hash(xxhash_rust::xxh3::xxh3_128(v));
}
pub fn add_entry(&mut self, entry: &Entry) {
self.add_hash(entry.hasher.digest128())
}
pub fn add_with_seed(&mut self, v: impl hash::Hash, seed: u64) {
let mut hasher = xxhash_rust::xxh3::Xxh3::with_seed(seed);
v.hash(&mut hasher);
self.add_hash(hasher.digest128());
}
pub fn add_bytes_with_seed(&mut self, v: &[u8], seed: u64) {
self.add_hash(xxhash_rust::xxh3::xxh3_128_with_seed(v, seed));
}
pub fn add_reader_with_seed(&mut self, mut r: impl io::Read, seed: u64) -> io::Result<u64> {
let mut hasher = xxhash_rust::xxh3::Xxh3::with_seed(seed);
let read = io::copy(&mut r, &mut hasher)?;
self.add_hash(hasher.digest128());
Ok(read)
}
fn sum_and_zeros(&self) -> (f64, u16) {
const LANES: usize = 8;
let mut sums = [0.0f64; LANES];
let mut ezs = [0u16; LANES];
for chunk in self.regs.chunks_exact(LANES) {
for i in 0..LANES {
let lz = Self::lz(chunk[i]);
sums[i] += L[lz as usize];
ezs[i] += u16::from(lz == 0);
}
}
let sum: f64 = sums.iter().sum();
let ez: u16 = ezs.iter().sum();
(sum, ez)
}
#[must_use]
pub fn cardinality(&self) -> f64 {
let (sum, ez) = self.sum_and_zeros();
ALPHA * (f64::from(M)) * ((f64::from(M)) - f64::from(ez)) / (beta(ez) + sum)
}
pub fn union<'a>(&'a mut self, other: &Self) -> &'a Self {
for (r, rr) in self.regs.iter_mut().zip(other.regs.iter()) {
if *r < *rr {
*r = *rr;
}
}
self
}
fn cardinality_from_parts(sum: f64, ez: u16) -> f64 {
ALPHA * (f64::from(M)) * ((f64::from(M)) - f64::from(ez)) / (beta(ez) + sum)
}
fn overlap_counts(&self, other: &Self) -> (u32, u32) {
const LANES: usize = 16;
let mut cc_lanes = [0u16; LANES];
let mut cn_lanes = [0u16; LANES];
for (lc, rc) in self
.regs
.chunks_exact(LANES)
.zip(other.regs.chunks_exact(LANES))
{
for i in 0..LANES {
let l = lc[i];
let r = rc[i];
cn_lanes[i] += u16::from((l | r) != 0);
cc_lanes[i] += u16::from((l == r) & (l != 0));
}
}
let cc: u32 = cc_lanes.iter().map(|&x| u32::from(x)).sum();
let cn: u32 = cn_lanes.iter().map(|&x| u32::from(x)).sum();
(cc, cn)
}
fn comparison_stats(&self, other: &Self) -> ComparisonStats {
const LANES: usize = 8;
let mut left_sums = [0.0f64; LANES];
let mut right_sums = [0.0f64; LANES];
let mut union_sums = [0.0f64; LANES];
let mut left_ez_lanes = [0u16; LANES];
let mut right_ez_lanes = [0u16; LANES];
let mut union_ez_lanes = [0u16; LANES];
let mut cc_lanes = [0u16; LANES];
let mut cn_lanes = [0u16; LANES];
for (lc, rc) in self
.regs
.chunks_exact(LANES)
.zip(other.regs.chunks_exact(LANES))
{
let mut left_lz = [0u8; LANES];
let mut right_lz = [0u8; LANES];
let mut union_lz = [0u8; LANES];
for i in 0..LANES {
let l = lc[i];
let r = rc[i];
let u = l.max(r);
left_lz[i] = Self::lz(l);
right_lz[i] = Self::lz(r);
union_lz[i] = Self::lz(u);
left_ez_lanes[i] += u16::from(l < (1u16 << R));
right_ez_lanes[i] += u16::from(r < (1u16 << R));
union_ez_lanes[i] += u16::from(u < (1u16 << R));
cn_lanes[i] += u16::from((l | r) != 0);
cc_lanes[i] += u16::from((l == r) & (l != 0));
}
for i in 0..LANES {
left_sums[i] += L[left_lz[i] as usize];
right_sums[i] += L[right_lz[i] as usize];
union_sums[i] += L[union_lz[i] as usize];
}
}
ComparisonStats {
left_sum: left_sums.iter().sum(),
right_sum: right_sums.iter().sum(),
union_sum: union_sums.iter().sum(),
left_ez: left_ez_lanes.iter().sum(),
right_ez: right_ez_lanes.iter().sum(),
union_ez: union_ez_lanes.iter().sum(),
cc: cc_lanes.iter().map(|&x| u32::from(x)).sum(),
cn: cn_lanes.iter().map(|&x| u32::from(x)).sum(),
}
}
fn approximate_expected_collisions(n: f64, m: f64) -> f64 {
let (n, m) = (n.max(m), n.min(m));
if n > 2f64.powf(2f64.powf(f64::from(Q)) + f64::from(R)) {
f64::INFINITY
} else if n > 2f64.powf(f64::from(P) + 5.0) {
let d = (4.0 * n / m) / ((1.0 + n) / m).powi(2);
C * 2f64.powf(f64::from(P) - f64::from(R)) * d
} else {
Self::expected_collisions(n, m) / f64::from(P)
}
}
fn expected_collisions(n: f64, m: f64) -> f64 {
let tbl = &*TBL;
let mut x = 0.0;
for i in 1..=TQ {
let row = (i as usize) - 1;
let l1 = &tbl.ln1p_neg_b1[row];
let l2 = &tbl.ln1p_neg_b2[row];
for j1 in 1..=TR {
let col = (j1 as usize) - 1;
let a1 = (n * l2[col]).exp();
let a0 = (n * l1[col]).exp();
let b1 = (m * l2[col]).exp();
let b0 = (m * l1[col]).exp();
x += (a1 - a0) * (b1 - b0);
}
}
x * f64::from(P)
}
fn precompute_a_diff(n: f64) -> Box<ADiff> {
let tbl = &*TBL;
let mut out: Box<ADiff> = Box::new([[0.0; TR as usize]; TQ as usize]);
for i in 1..=TQ {
let row = (i as usize) - 1;
let l1 = &tbl.ln1p_neg_b1[row];
let l2 = &tbl.ln1p_neg_b2[row];
let o = &mut out[row];
for j1 in 1..=TR {
let col = (j1 as usize) - 1;
let a1 = (n * l2[col]).exp();
let a0 = (n * l1[col]).exp();
o[col] = a1 - a0;
}
}
out
}
fn expected_collisions_with_a_diff(a_diff: &ADiff, m: f64) -> f64 {
let tbl = &*TBL;
let mut x = 0.0;
for i in 1..=TQ {
let row = (i as usize) - 1;
let l1 = &tbl.ln1p_neg_b1[row];
let l2 = &tbl.ln1p_neg_b2[row];
let ad = &a_diff[row];
for j1 in 1..=TR {
let col = (j1 as usize) - 1;
let b1 = (m * l2[col]).exp();
let b0 = (m * l1[col]).exp();
x += ad[col] * (b1 - b0);
}
}
x * f64::from(P)
}
fn approximate_expected_collisions_with_a_diff(
a_diff: &ADiff,
n_self: f64,
m_other: f64,
) -> f64 {
let (big, small) = (n_self.max(m_other), n_self.min(m_other));
if big > 2f64.powf(2f64.powf(f64::from(Q)) + f64::from(R)) {
f64::INFINITY
} else if big > 2f64.powf(f64::from(P) + 5.0) {
let d = (4.0 * big / small) / ((1.0 + big) / small).powi(2);
C * 2f64.powf(f64::from(P) - f64::from(R)) * d
} else {
Self::expected_collisions_with_a_diff(a_diff, m_other) / f64::from(P)
}
}
#[inline(always)]
fn right_and_union_stats(&self, other: &Self) -> PartialStats {
const LANES: usize = 8;
let mut right_sums = [0.0f64; LANES];
let mut union_sums = [0.0f64; LANES];
let mut right_ez_lanes = [0u16; LANES];
let mut union_ez_lanes = [0u16; LANES];
let mut cc_lanes = [0u16; LANES];
let mut cn_lanes = [0u16; LANES];
for (lc, rc) in self
.regs
.chunks_exact(LANES)
.zip(other.regs.chunks_exact(LANES))
{
let mut right_lz = [0u8; LANES];
let mut union_lz = [0u8; LANES];
for i in 0..LANES {
let l = lc[i];
let r = rc[i];
let u = l.max(r);
right_lz[i] = Self::lz(r);
union_lz[i] = Self::lz(u);
right_ez_lanes[i] += u16::from(r < (1u16 << R));
union_ez_lanes[i] += u16::from(u < (1u16 << R));
cn_lanes[i] += u16::from((l | r) != 0);
cc_lanes[i] += u16::from((l == r) & (l != 0));
}
for i in 0..LANES {
right_sums[i] += L[right_lz[i] as usize];
union_sums[i] += L[union_lz[i] as usize];
}
}
PartialStats {
right_sum: right_sums.iter().sum(),
right_ez: right_ez_lanes.iter().sum(),
union_sum: union_sums.iter().sum(),
union_ez: union_ez_lanes.iter().sum(),
cc: cc_lanes.iter().map(|&x| u32::from(x)).sum(),
cn: cn_lanes.iter().map(|&x| u32::from(x)).sum(),
}
}
fn similarity_impl(&self, other: &Self, high_precision: bool) -> f64 {
let (cc, cn, ec) = if high_precision {
let stats = self.comparison_stats(other);
let n = Self::cardinality_from_parts(stats.left_sum, stats.left_ez);
let m = Self::cardinality_from_parts(stats.right_sum, stats.right_ez);
(
stats.cc,
stats.cn,
Self::approximate_expected_collisions(n, m),
)
} else {
let (cc, cn) = self.overlap_counts(other);
(cc, cn, 0.0)
};
if cc == 0 {
return 0.0;
}
if (cc as f64) < ec {
return 0.0;
}
(cc as f64 - ec) / cn as f64
}
#[must_use]
pub fn similarity(&self, other: &Self) -> f64 {
if self == other {
return 1.0;
}
self.similarity_impl(other, true)
}
#[must_use]
pub fn similarity_fast(&self, other: &Self) -> f64 {
if self == other {
return 1.0;
}
self.similarity_impl(other, false)
}
#[must_use]
pub fn intersection(&self, other: &Self) -> f64 {
if self == other {
return self.cardinality();
}
let stats = self.comparison_stats(other);
if stats.cc == 0 {
return 0.0;
}
let n = Self::cardinality_from_parts(stats.left_sum, stats.left_ez);
let m = Self::cardinality_from_parts(stats.right_sum, stats.right_ez);
let ec = Self::approximate_expected_collisions(n, m);
if (stats.cc as f64) < ec {
return 0.0;
}
let similarity = (stats.cc as f64 - ec) / stats.cn as f64;
let union_card = Self::cardinality_from_parts(stats.union_sum, stats.union_ez);
similarity * union_card
}
#[must_use]
pub fn intersection_fast(&self, other: &Self) -> f64 {
if self == other {
return self.cardinality();
}
let stats = self.comparison_stats(other);
if stats.cc == 0 {
return 0.0;
}
let similarity = stats.cc as f64 / stats.cn as f64;
let union_card = Self::cardinality_from_parts(stats.union_sum, stats.union_ez);
similarity * union_card
}
pub fn similarity_many<'a, I>(&'a self, others: I) -> BatchIter<'a, I::IntoIter>
where
I: IntoIterator<Item = &'a Sketch>,
I::IntoIter: 'a,
{
BatchIter::new(self, others.into_iter(), BatchKind::Similarity, true)
}
pub fn similarity_many_fast<'a, I>(&'a self, others: I) -> BatchIter<'a, I::IntoIter>
where
I: IntoIterator<Item = &'a Sketch>,
I::IntoIter: 'a,
{
BatchIter::new(self, others.into_iter(), BatchKind::Similarity, false)
}
pub fn intersection_many<'a, I>(&'a self, others: I) -> BatchIter<'a, I::IntoIter>
where
I: IntoIterator<Item = &'a Sketch>,
I::IntoIter: 'a,
{
BatchIter::new(self, others.into_iter(), BatchKind::Intersection, true)
}
pub fn intersection_many_fast<'a, I>(&'a self, others: I) -> BatchIter<'a, I::IntoIter>
where
I: IntoIterator<Item = &'a Sketch>,
I::IntoIter: 'a,
{
BatchIter::new(self, others.into_iter(), BatchKind::Intersection, false)
}
pub fn save(&self, mut writer: impl std::io::Write) -> std::io::Result<()> {
for r in self.regs {
writer.write_all(&r.to_le_bytes())?;
}
Ok(())
}
pub fn load(mut reader: impl std::io::Read) -> std::io::Result<Self> {
let mut regs = [0; M as usize];
let mut buf = [0u8; 2];
for r in &mut regs {
reader.read_exact(&mut buf)?;
*r = u16::from_le_bytes(buf);
}
Ok(Self { regs })
}
}
type ADiff = [[f64; TR as usize]; TQ as usize];
struct PartialStats {
cc: u32,
cn: u32,
right_sum: f64,
right_ez: u16,
union_sum: f64,
union_ez: u16,
}
#[derive(Copy, Clone)]
enum BatchKind {
Similarity,
Intersection,
}
struct LeftCache {
sum: f64,
ez: u16,
n: f64,
}
pub struct BatchIter<'a, I: Iterator<Item = &'a Sketch>> {
left: &'a Sketch,
others: I,
kind: BatchKind,
high_precision: bool,
left_cache: Option<LeftCache>,
a_diff: Option<Box<ADiff>>,
}
impl<'a, I: Iterator<Item = &'a Sketch>> BatchIter<'a, I> {
fn new(left: &'a Sketch, others: I, kind: BatchKind, high_precision: bool) -> Self {
Self {
left,
others,
kind,
high_precision,
left_cache: None,
a_diff: None,
}
}
}
impl<'a, I: Iterator<Item = &'a Sketch>> Iterator for BatchIter<'a, I> {
type Item = f64;
fn next(&mut self) -> Option<f64> {
let other = self.others.next()?;
if !self.high_precision {
return Some(match self.kind {
BatchKind::Similarity => self.left.similarity_fast(other),
BatchKind::Intersection => self.left.intersection_fast(other),
});
}
let cache = self.left_cache.get_or_insert_with(|| {
let (sum, ez) = self.left.sum_and_zeros();
let n = Sketch::cardinality_from_parts(sum, ez);
LeftCache { sum, ez, n }
});
if self.left == other {
return Some(match self.kind {
BatchKind::Similarity => 1.0,
BatchKind::Intersection => Sketch::cardinality_from_parts(cache.sum, cache.ez),
});
}
let slow_thresh = 2f64.powf(f64::from(P) + 5.0);
if cache.n > slow_thresh {
return Some(match self.kind {
BatchKind::Similarity => self.left.similarity(other),
BatchKind::Intersection => self.left.intersection(other),
});
}
let stats = self.left.right_and_union_stats(other);
if stats.cc == 0 {
return Some(0.0);
}
let ec = if self.high_precision {
let n = cache.n;
let m = Sketch::cardinality_from_parts(stats.right_sum, stats.right_ez);
let big = n.max(m);
let cap = 2f64.powf(2f64.powf(f64::from(Q)) + f64::from(R));
let slow_thresh = 2f64.powf(f64::from(P) + 5.0);
if big <= cap && big <= slow_thresh {
let a_diff = self
.a_diff
.get_or_insert_with(|| Sketch::precompute_a_diff(n));
Sketch::approximate_expected_collisions_with_a_diff(a_diff, n, m)
} else {
Sketch::approximate_expected_collisions(n, m)
}
} else {
0.0
};
if (stats.cc as f64) < ec {
return Some(0.0);
}
let similarity = (stats.cc as f64 - ec) / stats.cn as f64;
Some(match self.kind {
BatchKind::Similarity => similarity,
BatchKind::Intersection => {
let union_card = Sketch::cardinality_from_parts(stats.union_sum, stats.union_ez);
similarity * union_card
}
})
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.others.size_hint()
}
}
#[derive(Default)]
struct ComparisonStats {
cc: u32,
cn: u32,
left_sum: f64,
left_ez: u16,
right_sum: f64,
right_ez: u16,
union_sum: f64,
union_ez: u16,
}
#[cfg(test)]
mod tests {
use super::*;
const HASH_A: u128 = 0xa96faf705af16834e6c632b61e964e1f;
const HASH_AA: u128 = 0xb9fe94d346d39b20369242a646a19333;
#[test]
fn empty() {
let mut sk = Sketch::new();
assert!(sk.is_empty());
assert_eq!(sk.cardinality(), 0.0);
sk.add(0);
assert!(!sk.is_empty());
assert_ne!(sk.cardinality(), 0.0);
}
#[test]
fn eq() {
let mut sk1 = Sketch::new();
let mut sk2 = Sketch::new();
assert_eq!(sk1, sk2);
sk1.add("foo");
assert_ne!(sk1, sk2);
assert!(sk1 > sk2);
assert!(sk2 <= sk1);
sk2.add("foo");
assert_eq!(sk1, sk2);
assert!(sk1 >= sk2);
assert!(sk1 <= sk2);
}
#[test]
fn approx_count() {
let sk1: Sketch = (0..50_000).collect();
let exp1: f64 = 50_000.0;
let sk2: Sketch = (5_000..75_000).collect();
let exp2: f64 = 70_000.0;
assert!((sk1.cardinality() - exp1).abs() / exp1 < 0.01);
assert!((sk2.cardinality() - exp2).abs() / exp2 < 0.01);
let exp_is = 45_000.0;
assert!((sk1.intersection(&sk2) - exp_is).abs() / exp_is < 0.01);
}
#[test]
fn intersection_of_disjoint_sketches_is_zero() {
let sk1: Sketch = (0..1_000).collect();
let sk2: Sketch = (1_000..2_000).collect();
assert_eq!(sk1.similarity(&sk2), 0.0);
assert_eq!(sk1.intersection(&sk2), 0.0);
}
#[test]
fn similarity_of_identical_single_register_sketches_is_one() {
let mut serialized = vec![0u8; (M as usize) * std::mem::size_of::<u16>()];
serialized[..2].copy_from_slice(&Sketch::new_reg(1, 0).to_le_bytes());
let sk1 = Sketch::load(&serialized[..]).unwrap();
let sk2 = Sketch::load(&serialized[..]).unwrap();
assert!((sk1.cardinality() - 1.0).abs() < 1e-3);
assert!((sk1.similarity(&sk2) - 1.0).abs() < 1e-9);
}
#[test]
fn test_similarity_partially_overlapping() {
let vamax = 300000;
let va = (0..vamax).collect();
let vbmin = 290000;
let vbmax = 2 * vamax;
let vb = (vbmin..vbmax).collect();
let inter = vamax - vbmin;
let jexact = inter as f64 / vbmax as f64;
let sketch1: Sketch = va;
let sketch2: Sketch = vb;
let actual_similarity = sketch1.similarity(&sketch2);
let sigma = (actual_similarity - jexact).abs() / jexact;
println!(
" jaccard estimate : {:.5e} exact value : {:.5e} , error : {:.5e}",
actual_similarity, jexact, sigma
);
assert!((actual_similarity - jexact).abs() / jexact < 0.1);
}
#[test]
fn seeded_hash() {
let mut s = Sketch::default();
s.add_with_seed("foo", 0);
assert!(s.cardinality() >= 0.0);
s.add_with_seed("foo", 1);
assert!(s.cardinality() >= 1.0);
let mut s0 = Sketch::default();
s0.add_with_seed("foo", 0);
let mut s1 = Sketch::default();
s1.add_with_seed("foo", 1);
assert_ne!(s0, s1, "different seeds should yield different registers");
let c0 = s0.cardinality();
let c1 = s1.cardinality();
let rel = (c0 - c1).abs() / c0.max(c1).max(1.0);
assert!(
rel < 1e-3,
"cardinality should be nearly identical with different seeds: c0={c0}, c1={c1}, rel={rel}"
);
}
#[test]
fn add_io() {
let elemens = [&[b'a'; 640][..], &[b'b'; 30], &[b'c'; 1]];
let mut s1 = Sketch::default();
let mut s2 = Sketch::default();
for e in elemens {
s1.add_reader(e).unwrap();
s2.add_bytes(e);
}
assert_eq!(s1, s2);
let c = s1.cardinality();
assert!((2.9..=3.1).contains(&c));
}
#[test]
fn entry_consistency() {
let mut e = Entry::new();
assert_eq!(e.hasher.digest128(), EMPTY_HASH);
e.add_bytes(b"a");
assert_eq!(e.hasher.digest128(), HASH_A);
e.add_bytes(b"a");
assert_eq!(e.hasher.digest128(), HASH_AA);
let mut e2 = Entry::new();
e2.add_bytes(b"aa");
assert_eq!(e2.hasher.digest128(), HASH_AA);
assert_eq!(e, e2);
}
}