1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
use crate::error::StrSimError;
use super::{ExpectTokenizerType, JaroWinkler, StrSim, TokenizerType};
use anyhow::Result;
use derive_more::Display;
use lsap::get_assigned_cost;
/// Hybrid Jaccard similarity between two sets of pre-tokenized strings.
///
/// Individual tokens are compared with the inner measure `strsim`; the
/// per-token scores are then combined into a single set-level score by
/// [`HybridJaccard::similarity`].
#[derive(Display)]
#[display(fmt = "HybridJaccard")]
pub struct HybridJaccard<S>
where
    S: StrSim<Vec<char>> + ExpectTokenizerType,
{
    /// Token-pair scores strictly below this value are treated as `0.0`.
    pub threshold: f64,
    /// When greater than `0.0`, a set-level similarity below this value is
    /// reported as `0.0`; a value of `0.0` disables that cut-off.
    pub lower_bound: f64,
    /// Measure used to score a single pair of tokens.
    pub strsim: S,
}
impl HybridJaccard<JaroWinkler> {
    /// Creates the stock configuration: a default `JaroWinkler` token measure,
    /// a token threshold of `0.5`, and the lower-bound cut-off disabled (`0.0`).
    pub fn default() -> Self {
        let strsim = JaroWinkler::default();
        Self {
            strsim,
            threshold: 0.5,
            lower_bound: 0.0,
        }
    }
}
impl<S: StrSim<Vec<char>> + ExpectTokenizerType> HybridJaccard<S> {
    /// Builds a hybrid Jaccard measure around the token-level measure `strsim`.
    ///
    /// `threshold` falls back to `0.5` and `lower_bound` to `0.0` when `None`
    /// is supplied.
    pub fn new(strsim: S, threshold: Option<f64>, lower_bound: Option<f64>) -> Self {
        HybridJaccard {
            threshold: threshold.unwrap_or(0.5),
            lower_bound: lower_bound.unwrap_or(0.0),
            strsim,
        }
    }

    /// Computes the hybrid Jaccard similarity of two token sets.
    ///
    /// Every token of the smaller set is scored against every token of the
    /// larger set with the inner measure; scores below `self.threshold` are
    /// zeroed. The score matrix is handed to `get_assigned_cost`, and the
    /// resulting sum is divided by `|set1| + |set2| - |smaller set|`, i.e. the
    /// size of the larger set.
    ///
    /// Two empty sets yield `1.0`. When `self.lower_bound > 0.0`, any result
    /// below it is reported as `0.0`, and the scan stops early as soon as the
    /// best still-reachable similarity drops below the bound.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the inner measure or by
    /// `get_assigned_cost`.
    pub fn similarity<'t>(
        &self,
        mut set1: &'t Vec<Vec<char>>,
        mut set2: &'t Vec<Vec<char>>,
    ) -> Result<f64, StrSimError> {
        // Keep the smaller set as the rows of the score matrix.
        if set1.len() > set2.len() {
            (set1, set2) = (set2, set1);
        }
        let n_rows = set1.len();
        let n_cols = set2.len();

        // Each row is counted as one match, so the Jaccard denominator
        // |set1| + |set2| - |matches| equals the size of the larger set.
        let matched = n_rows;
        let union_size = n_rows + n_cols - matched;
        if union_size == 0 {
            // Both sets are empty: answer directly instead of building an
            // empty matrix and calling the solver on a 0x0 problem.
            return Ok(1.0);
        }
        let denominator = union_size as f64;
        let prune = self.lower_bound > 0.0;

        // Row-major `n_rows x n_cols` matrix; every cell is overwritten below
        // before the solver sees it.
        let mut matching_score = vec![0.0; n_rows * n_cols];
        // Running sum of the best score of each row processed so far.
        let mut row_max_sum = 0.0_f64;

        // `n_cols > 0` is guaranteed by the early return above, which
        // `chunks_exact_mut` requires.
        for (i, (s1, row)) in set1
            .iter()
            .zip(matching_score.chunks_exact_mut(n_cols))
            .enumerate()
        {
            let mut row_best = 0.0_f64;
            for (cell, s2) in row.iter_mut().zip(set2.iter()) {
                let raw = self.strsim.similarity_pre_tok2(s1, s2)?;
                let score = if raw < self.threshold { 0.0 } else { raw };
                row_best = row_best.max(score);
                *cell = score;
            }
            row_max_sum += row_best;

            if prune {
                // Upper bound on the final sum: best score of every processed
                // row plus 1.0 for each remaining row.
                // NOTE(review): this assumes the inner measure never exceeds
                // 1.0 — confirm for custom `StrSim` implementations.
                let remaining_rows = (n_rows - i - 1) as f64;
                let max_possible = (row_max_sum + remaining_rows) / denominator;
                if max_possible < self.lower_bound {
                    return Ok(0.0);
                }
            }
        }

        // The trailing `true` presumably asks the solver for the maximum-score
        // assignment — confirm against the `lsap` documentation.
        let score_sum = get_assigned_cost(n_rows, n_cols, &matching_score, true)?;
        let sim = score_sum / denominator;
        if prune && sim < self.lower_bound {
            Ok(0.0)
        } else {
            Ok(sim)
        }
    }
}
impl<S> StrSim<Vec<Vec<char>>> for HybridJaccard<S>
where
    S: StrSim<Vec<char>> + ExpectTokenizerType,
{
    /// Scores two already-tokenized inputs by delegating to the inherent
    /// `similarity` method of `HybridJaccard`.
    fn similarity_pre_tok2(
        &self,
        set1: &Vec<Vec<char>>,
        set2: &Vec<Vec<char>>,
    ) -> Result<f64, StrSimError> {
        self.similarity(set1, set2)
    }
}
impl<S> ExpectTokenizerType for HybridJaccard<S>
where
    S: StrSim<Vec<char>> + ExpectTokenizerType,
{
    /// Returns `TokenizerType::Set` wrapping the tokenizer type expected by
    /// the inner token measure.
    fn get_expected_tokenizer_type(&self) -> TokenizerType {
        let inner = self.strsim.get_expected_tokenizer_type();
        TokenizerType::Set(Box::new(Some(inner)))
    }
}