1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
mod min_hash;
mod sim_hash;
pub use self::min_hash::MinHash;
pub use self::sim_hash::SimHash;
use std::collections::HashSet;
use std::hash::Hash;
use std::iter::FromIterator;
/// Iterator over the shingles (contiguous runs of `token_count` tokens) of a
/// token sequence.
///
/// Each item is a freshly allocated `Vec` holding `token_count` consecutive
/// token references. Successive items start one token further along, so a
/// sequence of `n` tokens yields `n - token_count + 1` shingles when
/// `token_count <= n`.
#[derive(Debug)]
pub struct ShingleIterator<'a, T>
where
    T: 'a + ?Sized,
{
    /// Number of tokens in every shingle.
    token_count: usize,
    /// Start position (into `tokens`) of the next shingle to yield.
    index: usize,
    /// The borrowed tokens being shingled.
    tokens: Vec<&'a T>,
}

impl<'a, T> ShingleIterator<'a, T>
where
    T: ?Sized,
{
    /// Creates an iterator yielding every run of `token_count` consecutive
    /// entries of `tokens`, starting from the first token.
    pub fn new(token_count: usize, tokens: Vec<&'a T>) -> Self {
        ShingleIterator {
            token_count,
            index: 0,
            tokens,
        }
    }
}

impl<'a, T> Iterator for ShingleIterator<'a, T>
where
    T: ?Sized,
{
    type Item = Vec<&'a T>;

    /// Returns the next shingle, or `None` once fewer than `token_count`
    /// tokens remain.
    ///
    /// If there are fewer tokens than `token_count` (including no tokens at
    /// all) the iterator is simply empty. A `token_count` of zero yields
    /// `tokens.len() + 1` empty shingles.
    fn next(&mut self) -> Option<Self::Item> {
        let start = self.index;
        // Checked addition and `get` replace `len() - token_count`, which
        // underflowed (and panicked) whenever the shingle was wider than the
        // token list.
        let end = start.checked_add(self.token_count)?;
        let shingle = self.tokens.get(start..end)?.to_vec();
        // Only advance after a successful read so the iterator keeps
        // returning `None` once it is exhausted.
        self.index += 1;
        Some(shingle)
    }
}
/// Computes the Jaccard similarity of the items produced by two iterators.
///
/// Both iterators are collected into hash sets (so duplicate items count
/// once), and the result is the size of the sets' intersection divided by
/// the size of their union.
///
/// The result is `NaN` when both iterators are empty, because the division
/// is then `0.0 / 0.0`.
pub fn get_jaccard_similarity<T, U>(iter_1: T, iter_2: T) -> f64
where
    T: Iterator<Item = U>,
    U: Hash + Eq,
{
    let left: HashSet<U> = FromIterator::from_iter(iter_1);
    let right: HashSet<U> = FromIterator::from_iter(iter_2);

    let shared = left.intersection(&right).count();
    let combined = left.union(&right).count();

    shared as f64 / combined as f64
}
#[cfg(test)]
mod tests {
    use super::{get_jaccard_similarity, ShingleIterator};

    // Fixture sentences: S1 and S2 differ by a single word, S3 shares no
    // words with either of them.
    static S1: &'static str = "the cat sat on a mat";
    static S2: &'static str = "the cat sat on the mat";
    static S3: &'static str = "we all scream for ice cream";

    /// Splits `text` on single spaces and returns its two-word shingles.
    fn bigrams(text: &'static str) -> ShingleIterator<'static, str> {
        ShingleIterator::new(2, text.split(' ').collect())
    }

    #[test]
    fn test_jaccard_similarity() {
        // S1 and S2 share 3 of their 7 distinct two-word shingles.
        let near_duplicate = get_jaccard_similarity(bigrams(S1), bigrams(S2));
        assert_eq!(near_duplicate, 3.0 / 7.0);

        // S1 and S3 have no two-word shingle in common.
        let unrelated = get_jaccard_similarity(bigrams(S1), bigrams(S3));
        assert_eq!(unrelated, 0.0 / 7.0);
    }
}