mod min_hash;
mod sim_hash;
pub use self::min_hash::MinHash;
pub use self::sim_hash::SimHash;
use std::collections::HashSet;
use std::hash::Hash;
use std::iter::FromIterator;
/// Iterator over the contiguous, overlapping windows ("shingles") of a token
/// sequence.
///
/// Each item is a `Vec` holding `token_count` consecutive token references,
/// starting one position further along than the previous item.
pub struct ShingleIterator<'a, T>
where
    T: ?Sized,
{
    /// Number of tokens in every shingle.
    token_count: usize,
    /// Start position of the next shingle to yield.
    index: usize,
    /// The full token sequence being windowed.
    tokens: Vec<&'a T>,
}

impl<'a, T> ShingleIterator<'a, T>
where
    T: ?Sized,
{
    /// Creates an iterator yielding every run of `token_count` consecutive
    /// entries of `tokens`, in order.
    ///
    /// If `token_count` exceeds `tokens.len()` the iterator is empty. A
    /// `token_count` of zero yields `tokens.len() + 1` empty shingles.
    pub fn new(token_count: usize, tokens: Vec<&'a T>) -> Self {
        ShingleIterator {
            token_count,
            index: 0,
            tokens,
        }
    }
}

impl<'a, T> Iterator for ShingleIterator<'a, T>
where
    T: ?Sized,
{
    type Item = Vec<&'a T>;

    /// Returns the next shingle, or `None` once no full window remains.
    fn next(&mut self) -> Option<Self::Item> {
        // Checked arithmetic plus `get` keeps this panic-free when the window
        // is longer than the token list (including an empty token list), where
        // `len - token_count` would underflow.
        let end = self.index.checked_add(self.token_count)?;
        let shingle = self.tokens.get(self.index..end)?.to_vec();
        // Only advance after a successful read so that an exhausted iterator
        // keeps returning `None`.
        self.index += 1;
        Some(shingle)
    }

    /// Reports the exact number of shingles still to be yielded.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // Total windows = len + 1 - token_count (zero when the window is too
        // long); `index` of them have already been produced.
        let remaining = (self.tokens.len() + 1)
            .saturating_sub(self.token_count)
            .saturating_sub(self.index);
        (remaining, Some(remaining))
    }
}
/// Computes the Jaccard similarity of the items produced by two iterators.
///
/// Both iterators are collected into sets (duplicates are ignored) and the
/// result is `|A ∩ B| / |A ∪ B|`, a value in `[0.0, 1.0]`.
///
/// When both iterators are empty the two sets are identical, so `1.0` is
/// returned instead of the `NaN` that `0.0 / 0.0` would produce.
pub fn get_jaccard_similarity<T, U>(iter_1: T, iter_2: T) -> f64
where
    T: Iterator<Item = U>,
    U: Hash + Eq,
{
    let h1 = HashSet::<U>::from_iter(iter_1);
    let h2 = HashSet::<U>::from_iter(iter_2);

    let shared = h1.intersection(&h2).count();
    // Inclusion–exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|, which avoids a
    // second pass over both sets.
    let combined = h1.len() + h2.len() - shared;

    if combined == 0 {
        // Both sets are empty; avoid dividing zero by zero.
        return 1.0;
    }
    shared as f64 / combined as f64
}
#[cfg(test)]
pub mod tests {
    use super::{get_jaccard_similarity, ShingleIterator};
    use std::f64;

    // Shared sample sentences; public so they can be reused outside this module.
    pub const S1: &str = "the cat sat on a mat";
    pub const S2: &str = "the cat sat on the mat";
    pub const S3: &str = "we all scream for ice cream";

    /// Splits `text` on single spaces and iterates over its two-word shingles.
    fn bigrams(text: &str) -> ShingleIterator<'_, str> {
        let words: Vec<&str> = text.split(' ').collect();
        ShingleIterator::new(2, words)
    }

    #[test]
    fn test_jaccard_similarity() {
        // S1 and S2 share three of their seven distinct bigrams.
        let near_duplicate = get_jaccard_similarity(bigrams(S1), bigrams(S2));
        assert!((near_duplicate - 3.0 / 7.0).abs() < f64::EPSILON);

        // S1 and S3 have no bigram in common.
        let unrelated = get_jaccard_similarity(bigrams(S1), bigrams(S3));
        assert!((unrelated - 0.0).abs() < f64::EPSILON);
    }
}