1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
//! Module for measuring similarities between sets.

mod min_hash;
mod sim_hash;

pub use self::min_hash::MinHash;
pub use self::sim_hash::SimHash;

use std::collections::HashSet;
use std::hash::Hash;
use std::iter::FromIterator;

/// A w-shingle iterator for a list of items.
///
/// Yields every contiguous run of `token_count` tokens, in order, advancing one token at a time.
/// If there are fewer than `token_count` tokens, the iterator yields nothing. A `token_count` of
/// zero yields one empty shingle for each position, including the position past the last token.
///
/// # Examples
/// ```
/// use probabilistic_collections::similarity::ShingleIterator;
///
/// let mut shingle_iter = ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect());
///
/// assert_eq!(shingle_iter.next(), Some(vec!["the", "cat"]));
/// assert_eq!(shingle_iter.next(), Some(vec!["cat", "sat"]));
/// assert_eq!(shingle_iter.next(), Some(vec!["sat", "on"]));
/// assert_eq!(shingle_iter.next(), Some(vec!["on", "a"]));
/// assert_eq!(shingle_iter.next(), Some(vec!["a", "mat"]));
/// assert_eq!(shingle_iter.next(), None);
/// ```
pub struct ShingleIterator<'a, T>
where
    T: 'a + ?Sized,
{
    /// Number of tokens in each shingle.
    token_count: usize,
    /// Position in `tokens` at which the next shingle starts.
    index: usize,
    /// The borrowed tokens the shingles are drawn from.
    tokens: Vec<&'a T>,
}

impl<'a, T> ShingleIterator<'a, T>
where
    T: ?Sized,
{
    /// Constructs a new `ShingleIterator` that contains shingles of `token_count` tokens from
    /// `tokens`.
    ///
    /// # Examples
    /// ```
    /// use probabilistic_collections::similarity::ShingleIterator;
    ///
    /// let mut shingle_iter = ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect());
    /// ```
    pub fn new(token_count: usize, tokens: Vec<&'a T>) -> Self {
        ShingleIterator {
            token_count,
            index: 0,
            tokens,
        }
    }
}

impl<'a, T> Iterator for ShingleIterator<'a, T>
where
    T: ?Sized,
{
    type Item = Vec<&'a T>;

    /// Returns the next shingle, or `None` once no full window of `token_count` tokens remains.
    fn next(&mut self) -> Option<Self::Item> {
        // Work out the exclusive end of the window with a checked addition and compare it with
        // the length. Subtracting `token_count` from the length instead would underflow whenever
        // there are fewer tokens than a single shingle needs.
        let end = match self.index.checked_add(self.token_count) {
            Some(end) if end <= self.tokens.len() => end,
            _ => return None,
        };

        let shingle = self.tokens[self.index..end].to_vec();
        self.index += 1;
        Some(shingle)
    }
}

/// Computes the Jaccard Similarity between the items produced by two iterators.
///
/// Each iterator is collected into a set, so repeated items count once. The result is the number
/// of items present in both sets divided by the number of distinct items across the two sets.
/// When both iterators are empty the quotient is `0.0 / 0.0`, which is `NaN`.
///
/// # Examples
/// ```
/// use probabilistic_collections::similarity::{get_jaccard_similarity, ShingleIterator};
///
/// assert_eq!(
///     get_jaccard_similarity(
///         ShingleIterator::new(2, "the cat sat on a mat".split(' ').collect()),
///         ShingleIterator::new(2, "the cat sat on the mat".split(' ').collect()),
///     ),
///     3.0 / 7.0,
/// );
/// ```
pub fn get_jaccard_similarity<T, U>(iter_1: T, iter_2: T) -> f64
where
    T: Iterator<Item = U>,
    U: Hash + Eq,
{
    let first: HashSet<U> = FromIterator::from_iter(iter_1);
    let second: HashSet<U> = FromIterator::from_iter(iter_2);

    // |A ∪ B| = |A| + |B| - |A ∩ B|, so the union size follows from the intersection count.
    let shared = first.intersection(&second).count();
    let distinct = first.len() + second.len() - shared;

    shared as f64 / distinct as f64
}

#[cfg(test)]
mod tests {
    use super::{get_jaccard_similarity, ShingleIterator};

    // Fixture sentences: the first two differ by a single word, the third shares no words with
    // the first.
    const CAT_ON_A_MAT: &str = "the cat sat on a mat";
    const CAT_ON_THE_MAT: &str = "the cat sat on the mat";
    const ICE_CREAM: &str = "we all scream for ice cream";

    /// Splits `text` on single spaces and iterates over its two-word shingles.
    fn bigrams(text: &'static str) -> ShingleIterator<'static, str> {
        ShingleIterator::new(2, text.split(' ').collect())
    }

    #[test]
    fn test_jaccard_similarity() {
        // Three of the seven distinct bigrams appear in both sentences.
        let near_duplicate = get_jaccard_similarity(bigrams(CAT_ON_A_MAT), bigrams(CAT_ON_THE_MAT));
        assert_eq!(near_duplicate, 3.0 / 7.0);

        // No bigram is shared, so the similarity is zero.
        let unrelated = get_jaccard_similarity(bigrams(CAT_ON_A_MAT), bigrams(ICE_CREAM));
        assert_eq!(unrelated, 0.0 / 7.0);
    }
}