use std::cmp;
use std::fmt::{Debug, Formatter};
use std::hash::{Hash, Hasher};
use std::marker::PhantomData;
pub use crate::hasher::{Murmur3Hasher, SeedHasher};
fn shift<T: Copy, const L: usize>(arr: &mut [T; L]) {
for i in 0..L - 1 {
let next = arr[i + 1];
arr[i] = next;
}
}
pub struct MinShingleHash<H, const N: usize, const L: usize>
where
H: SeedHasher,
{
hash: [u32; N],
seed_hasher: PhantomData<H>,
}
impl<H, const N: usize, const L: usize> MinShingleHash<H, N, L>
where
H: SeedHasher,
{
pub fn new<I>(input: I) -> Self
where
I: IntoIterator,
I::Item: Hash + Copy + Default,
{
let mut hash = [u32::MAX; N];
let mut input_iter = input.into_iter();
let mut buf = [I::Item::default(); L];
for i in 0..L - 1 {
if let Some(item) = input_iter.next() {
buf[i] = item;
}
}
for item in input_iter {
buf[L - 1] = item;
for i in 0..N {
let mut hasher = H::with_seed(i as u32);
for item in buf.iter() {
item.hash(&mut hasher);
}
let shingle_hash = hasher.finish() as u32;
hash[i] = cmp::min(hash[i], shingle_hash);
}
shift(&mut buf);
}
return MinShingleHash {
hash,
seed_hasher: PhantomData,
};
}
pub fn iter(&self) -> std::slice::Iter<u32> {
self.hash.iter()
}
pub fn compare(&self, other: &Self) -> f32 {
let matches = self.iter().zip(other.iter()).filter(|pair| pair.0 == pair.1).count();
return matches as f32 / N as f32;
}
}
impl<H, const N: usize, const L: usize> IntoIterator for MinShingleHash<H, N, L>
where
H: SeedHasher,
{
type Item = u32;
type IntoIter = <[Self::Item; N] as IntoIterator>::IntoIter;
fn into_iter(self) -> Self::IntoIter {
self.hash.into_iter()
}
}
impl<H, const N: usize, const L: usize> PartialEq for MinShingleHash<H, N, L>
where
H: SeedHasher,
{
fn eq(&self, other: &Self) -> bool {
self.hash.eq(&other.hash)
}
}
impl<H, const N: usize, const L: usize> Eq for MinShingleHash<H, N, L> where H: SeedHasher {}
impl<H, const N: usize, const L: usize> Debug for MinShingleHash<H, N, L>
where
H: SeedHasher,
{
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
self.hash.fmt(f)
}
}
#[cfg(test)]
mod tests {
use super::{MinShingleHash, Murmur3Hasher};
#[test]
fn test_shingle_hash() {
let data = "hello world";
let hash = MinShingleHash::<Murmur3Hasher, 10, 4>::new(data.chars());
let actual_hash = Vec::from_iter(hash);
let expected_hash = vec![
507627377, 327820559, 366909560, 1875448240, 273434197, 278282553, 784375550, 314569335, 938527530, 4954098,
];
assert_eq!(actual_hash, expected_hash);
}
}