#![allow(clippy::cast_possible_truncation)] #![allow(clippy::cast_precision_loss)]
use std::collections::HashSet;
#[derive(Clone, Copy, Default, Debug, PartialEq, Eq)]
pub struct TrigramFingerprint {
bits: [u64; 4],
}
#[inline]
fn trigram_hash(trigram: [u8; 3]) -> (usize, usize) {
let v = u32::from(trigram[0]) | (u32::from(trigram[1]) << 8) | (u32::from(trigram[2]) << 16);
let h1 = v.wrapping_mul(0x9E37_79B9) as usize % 256;
let h2 = v.wrapping_mul(0x517C_C1B7).wrapping_add(0x6A09_E667) as usize % 256;
(h1, h2)
}
#[inline]
fn set_bit(bits: &mut [u64; 4], pos: usize) {
let word = pos / 64;
let bit = pos % 64;
bits[word] |= 1u64 << bit;
}
impl TrigramFingerprint {
#[must_use]
pub fn from_trigram_set(trigrams: &HashSet<[u8; 3]>) -> Self {
let mut fp = Self::default();
for trigram in trigrams {
fp.insert(trigram);
}
fp
}
#[must_use]
pub fn from_text(text: &str) -> Self {
let trigrams = super::extract_trigrams(text);
Self::from_trigram_set(&trigrams)
}
#[inline]
pub fn insert(&mut self, trigram: &[u8; 3]) {
let (h1, h2) = trigram_hash(*trigram);
set_bit(&mut self.bits, h1);
set_bit(&mut self.bits, h2);
}
#[must_use]
#[inline]
pub fn approx_intersection_count(&self, other: &Self) -> u32 {
(self.bits[0] & other.bits[0]).count_ones()
+ (self.bits[1] & other.bits[1]).count_ones()
+ (self.bits[2] & other.bits[2]).count_ones()
+ (self.bits[3] & other.bits[3]).count_ones()
}
#[must_use]
pub fn approx_jaccard(&self, other: &Self, _self_count: usize, _other_count: usize) -> f32 {
let and_pop = self.approx_intersection_count(other);
let or_pop = self.union_popcount(other);
if or_pop == 0 {
return 0.0;
}
and_pop as f32 / or_pop as f32
}
#[inline]
fn union_popcount(&self, other: &Self) -> u32 {
(self.bits[0] | other.bits[0]).count_ones()
+ (self.bits[1] | other.bits[1]).count_ones()
+ (self.bits[2] | other.bits[2]).count_ones()
+ (self.bits[3] | other.bits[3]).count_ones()
}
#[must_use]
#[inline]
pub fn is_empty(&self) -> bool {
self.bits[0] == 0 && self.bits[1] == 0 && self.bits[2] == 0 && self.bits[3] == 0
}
}