use alloc::string::String;
use crate::canonical::Canonicalizer;
use crate::classical::Fingerprinter;
use crate::classical::hash::{HashFamily, hash128};
use crate::error::{Error, Result};
use crate::tokenize::Tokenizer;
use super::sig::MinHashSig;
/// Default seed mixed into every token hash when none is supplied via the
/// builder's `seed` or `with_seed`.
pub const DEFAULT_SEED: u64 = 0x00C0_FFEE_5EED;
/// Builder for [`MinHashFingerprinter`]: configures the seed and hash family
/// before the canonicalizer/tokenizer are supplied to `build`.
#[derive(Clone, Debug)]
pub struct MinHashFingerprinterBuilder {
    // Seed mixed into every token hash; defaults to `DEFAULT_SEED`.
    seed: u64,
    // Hash family used to produce the per-token 128-bit hash.
    hasher: HashFamily,
}
impl Default for MinHashFingerprinterBuilder {
fn default() -> Self {
Self {
seed: DEFAULT_SEED,
hasher: HashFamily::Xxh3_64,
}
}
}
impl MinHashFingerprinterBuilder {
    /// Overrides the seed mixed into every token hash.
    #[must_use]
    pub fn seed(self, seed: u64) -> Self {
        Self { seed, ..self }
    }

    /// Selects the hash family used for sketching.
    #[must_use]
    pub fn hasher(self, hasher: HashFamily) -> Self {
        Self { hasher, ..self }
    }

    /// Consumes the builder, producing a fingerprinter with `H` hash slots
    /// over the given canonicalizer and tokenizer.
    #[must_use]
    pub fn build<T: Tokenizer, const H: usize>(
        self,
        canonicalizer: Canonicalizer,
        tokenizer: T,
    ) -> MinHashFingerprinter<T, H> {
        let Self { seed, hasher } = self;
        MinHashFingerprinter {
            canonicalizer,
            tokenizer,
            seed,
            hasher,
        }
    }
}
/// MinHash fingerprinter over `H` hash slots.
///
/// Canonicalizes input, tokenizes it, and keeps the per-slot minimum of `H`
/// derived hash values per token (see `sketch_canonical`).
#[derive(Clone, Debug)]
pub struct MinHashFingerprinter<T: Tokenizer, const H: usize> {
    // Normalizes raw input text before tokenization.
    canonicalizer: Canonicalizer,
    // Splits canonical text into the token stream that is sketched.
    tokenizer: T,
    // Seed mixed into every token hash; defaults to `DEFAULT_SEED`.
    seed: u64,
    // Hash family used to produce the per-token 128-bit hash.
    hasher: HashFamily,
}
impl<T: Tokenizer, const H: usize> MinHashFingerprinter<T, H> {
pub fn new(canonicalizer: Canonicalizer, tokenizer: T) -> Self {
Self {
canonicalizer,
tokenizer,
seed: DEFAULT_SEED,
hasher: HashFamily::Xxh3_64,
}
}
#[must_use]
pub fn with_seed(mut self, seed: u64) -> Self {
self.seed = seed;
self
}
#[must_use]
pub fn with_hasher(mut self, hasher: HashFamily) -> Self {
self.hasher = hasher;
self
}
pub fn canonicalizer(&self) -> &Canonicalizer {
&self.canonicalizer
}
pub fn tokenizer(&self) -> &T {
&self.tokenizer
}
pub fn seed(&self) -> u64 {
self.seed
}
pub fn hasher(&self) -> HashFamily {
self.hasher
}
pub(super) fn sketch_canonical(&self, canonical: &str) -> Result<MinHashSig<H>> {
let mut sig = MinHashSig::<H>::empty();
let mut any = false;
let hasher = self.hasher;
let seed = self.seed;
let hashes = &mut sig.hashes;
self.tokenizer.for_each_token(canonical, &mut |tok| {
any = true;
let (lo, hi) = hash128(hasher, tok.as_bytes(), seed);
for (i, slot) in hashes.iter_mut().enumerate() {
let h = lo.wrapping_add((i as u64).wrapping_mul(hi));
if h < *slot {
*slot = h;
}
}
});
if !any {
return Err(Error::InvalidInput("empty document".into()));
}
Ok(sig)
}
}
impl<T: Tokenizer, const H: usize> Fingerprinter for MinHashFingerprinter<T, H> {
type Output = MinHashSig<H>;
fn fingerprint(&self, input: &str) -> Result<Self::Output> {
if input.is_empty() {
return Err(Error::InvalidInput("empty document".into()));
}
let canonical: String = self.canonicalizer.canonicalize(input);
self.sketch_canonical(&canonical)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::canonical::Canonicalizer;
    use crate::classical::minhash::jaccard::jaccard;
    use crate::tokenize::{ShingleTokenizer, WordTokenizer};

    /// Standard fixture: default canonicalizer, 3-word shingles, 128 slots.
    fn fp() -> MinHashFingerprinter<ShingleTokenizer<WordTokenizer>, 128> {
        MinHashFingerprinter::new(
            Canonicalizer::default(),
            ShingleTokenizer {
                k: 3,
                inner: WordTokenizer,
            },
        )
    }

    // Empty input is rejected up front with `InvalidInput`.
    #[test]
    fn empty_input_errors() {
        let f = fp();
        let r = f.fingerprint("");
        assert!(matches!(r, Err(Error::InvalidInput(_))));
    }

    // Whitespace-only input yields no tokens and is also rejected.
    #[test]
    fn whitespace_only_errors() {
        let f = fp();
        let r = f.fingerprint(" \n\n\n");
        assert!(matches!(r, Err(Error::InvalidInput(_))));
    }

    // Fingerprinting the same document twice gives identical signatures.
    #[test]
    fn deterministic() {
        let f = fp();
        let a = f
            .fingerprint("the quick brown fox jumps over the lazy dog")
            .unwrap();
        let b = f
            .fingerprint("the quick brown fox jumps over the lazy dog")
            .unwrap();
        assert_eq!(a, b);
    }

    // A one-word change should still leave substantial estimated overlap.
    #[test]
    fn similar_docs_have_high_jaccard() {
        let f = fp();
        let a = f
            .fingerprint("the quick brown fox jumps over the lazy dog")
            .unwrap();
        let b = f
            .fingerprint("the quick brown fox leaps over the lazy dog")
            .unwrap();
        let j = jaccard(&a, &b);
        assert!(j > 0.30, "expected j > 0.30, got {j}");
    }

    // Unrelated documents should estimate low similarity.
    #[test]
    fn different_docs_have_low_jaccard() {
        let f = fp();
        let a = f
            .fingerprint("the quick brown fox jumps over the lazy dog")
            .unwrap();
        let b = f
            .fingerprint("a completely unrelated sentence about astronomy")
            .unwrap();
        let j = jaccard(&a, &b);
        assert!(j < 0.3, "expected j < 0.3, got {j}");
    }

    // With single-word tokens (no shingles), word order cannot matter:
    // MinHash sketches the token *set*.
    #[test]
    fn permutation_invariance() {
        let f = MinHashFingerprinter::<_, 64>::new(Canonicalizer::default(), WordTokenizer);
        let a = f.fingerprint("alpha beta gamma delta").unwrap();
        let b = f.fingerprint("delta gamma beta alpha").unwrap();
        assert_eq!(a, b);
    }

    // Repeating every token leaves the set — and thus the sketch — unchanged.
    #[test]
    fn duplicate_insensitivity() {
        let f = MinHashFingerprinter::<_, 64>::new(Canonicalizer::default(), WordTokenizer);
        let a = f.fingerprint("alpha beta gamma delta").unwrap();
        let b = f
            .fingerprint("alpha beta gamma delta alpha beta gamma delta")
            .unwrap();
        assert_eq!(a, b);
    }

    // Produced signatures carry the current schema version.
    #[test]
    fn schema_field_set() {
        let f = fp();
        let s = f.fingerprint("hello world hello world").unwrap();
        assert_eq!(s.schema, super::super::sig::SCHEMA_VERSION);
    }

    // Different seeds must not collide on the same document.
    #[test]
    fn seed_change_changes_signature() {
        let f1 = fp();
        let f2 = fp().with_seed(42);
        let a = f1.fingerprint("the quick brown fox jumps").unwrap();
        let b = f2.fingerprint("the quick brown fox jumps").unwrap();
        assert_ne!(a, b);
    }

    // Selecting XXH3-64 explicitly works and fills at least one slot
    // (slots start at u64::MAX in an empty signature).
    #[test]
    fn xxh3_hasher_is_selectable() {
        let f = fp().with_hasher(HashFamily::Xxh3_64);
        let a = f.fingerprint("the quick brown fox jumps").unwrap();
        assert!(a.hashes.iter().any(|h| *h != u64::MAX));
    }

    // The builder's defaults must agree with `MinHashFingerprinter::new`.
    #[test]
    fn builder_default_matches_constructor() {
        let canon = Canonicalizer::default();
        let tok = ShingleTokenizer {
            k: 3,
            inner: WordTokenizer,
        };
        let a = MinHashFingerprinterBuilder::default().build::<_, 128>(canon.clone(), tok.clone());
        let b: MinHashFingerprinter<_, 128> = MinHashFingerprinter::new(canon, tok);
        let s_a = a
            .fingerprint("the quick brown fox jumps over the lazy dog")
            .unwrap();
        let s_b = b
            .fingerprint("the quick brown fox jumps over the lazy dog")
            .unwrap();
        assert_eq!(s_a, s_b);
    }
}