#![cfg_attr(coverage_nightly, coverage(off))]
use blake3::Hasher;
use xxhash_rust::xxh64::xxh64;
use super::types::{MinHashSignature, Token};
/// Generator of MinHash signatures built from a fixed list of hash seeds.
pub struct MinHashGenerator {
/// Number of values in every signature produced by `compute_signature`.
pub(super) num_hashes: usize,
/// Seeds passed to `xxh64`; the seed at index `i` drives signature position `i`.
/// `new` fills this with `0..num_hashes`.
pub(super) seeds: Vec<u64>,
}
impl MinHashGenerator {
    /// Creates a generator whose signatures contain `num_hashes` values.
    ///
    /// The seeds are the integers `0..num_hashes`, so two generators built
    /// with the same `num_hashes` use the same seeds.
    #[must_use]
    pub fn new(num_hashes: usize) -> Self {
        let seeds = (0..num_hashes).map(|i| i as u64).collect();
        Self { num_hashes, seeds }
    }

    /// Computes the MinHash signature of a collection of shingle hashes.
    ///
    /// For every seed, each shingle's little-endian bytes are hashed with
    /// `xxh64` and the minimum hash is kept in the matching signature slot.
    ///
    /// The returned signature always has `num_hashes` values. A slot that
    /// never receives a hash (no shingles, or fewer seeds than `num_hashes`)
    /// stays at `u64::MAX`.
    #[must_use]
    pub fn compute_signature(&self, shingles: &[u64]) -> MinHashSignature {
        let mut signature = vec![u64::MAX; self.num_hashes];
        for &shingle in shingles {
            // The byte form does not depend on the seed, so build it once per shingle.
            let bytes = shingle.to_le_bytes();
            // Zipping (rather than indexing by seed position) cannot go out of
            // bounds if `seeds` and `num_hashes` ever disagree in length.
            for (slot, &seed) in signature.iter_mut().zip(&self.seeds) {
                *slot = (*slot).min(xxh64(&bytes, seed));
            }
        }
        MinHashSignature { values: signature }
    }

    /// Hashes every window of `k` consecutive tokens into a `u64` shingle.
    ///
    /// Each window's token texts are fed to a BLAKE3 hasher in order, and the
    /// first eight digest bytes are read as a little-endian `u64`. The result
    /// holds one shingle per window, in input order.
    ///
    /// Returns an empty vector when `k` is zero or when there are fewer than
    /// `k` tokens.
    #[must_use]
    pub fn generate_shingles(&self, tokens: &[Token], k: usize) -> Vec<u64> {
        // `slice::windows` panics on a zero window size, so reject it up front
        // together with the "not enough tokens" case.
        if k == 0 || tokens.len() < k {
            return Vec::new();
        }
        // Exactly `len - k + 1` windows exist; the guard above makes this safe.
        let mut shingles = Vec::with_capacity(tokens.len() - k + 1);
        let mut hasher = Hasher::new();
        for window in tokens.windows(k) {
            hasher.reset();
            // NOTE(review): texts are hashed back-to-back with no separator, so
            // windows whose concatenated text is equal (e.g. "ab","c" vs "a","bc")
            // produce the same shingle. Adding a delimiter would change every
            // shingle value, so it is left as-is here.
            for token in window {
                hasher.update(token.text.as_bytes());
            }
            let hash = hasher.finalize();
            shingles.push(u64::from_le_bytes(
                // An eight-byte slice always converts to `[u8; 8]`.
                hash.as_bytes()[0..8].try_into().expect("internal error"),
            ));
        }
        shingles
    }
}