1use crate::error::StrSimError;
2
3use super::{ExpectTokenizerType, JaroWinkler, StrSim, TokenizerType};
4use derive_more::Display;
5
6#[derive(Display)]
7#[display(fmt = "MongeElkan")]
8pub struct MongeElkan<S: StrSim<Vec<char>>> {
9 pub strsim: S,
10 pub lower_bound: f64,
13}
14#[derive(Display)]
15#[display(fmt = "SymmetricMongeElkan")]
16pub struct SymmetricMongeElkan<S: StrSim<Vec<char>>>(pub MongeElkan<S>);
17
18impl MongeElkan<JaroWinkler> {
19 pub fn default() -> Self {
20 MongeElkan {
21 strsim: JaroWinkler::default(),
22 lower_bound: 0.0,
23 }
24 }
25}
26
27impl<S: StrSim<Vec<char>>> MongeElkan<S> {
28 pub fn new(strsim: S, lower_bound: Option<f64>) -> Self {
29 MongeElkan {
30 strsim,
31 lower_bound: lower_bound.unwrap_or(0.0),
32 }
33 }
34
35 pub fn similarity(
36 &self,
37 bag1: &Vec<Vec<char>>,
38 bag2: &Vec<Vec<char>>,
39 ) -> Result<f64, StrSimError> {
40 if bag1.len() == 0 || bag2.len() == 0 {
41 if bag1.len() == 0 && bag2.len() == 0 {
42 return Ok(1.0);
43 } else {
44 return Ok(0.0);
45 }
46 }
47
48 let mut score_sum = 0.0;
49 for (idx, ele1) in bag1.iter().enumerate() {
50 let mut max_score = self.strsim.similarity_pre_tok2(ele1, &bag2[0])?;
51 for ele2 in &bag2[1..] {
52 let score = self.strsim.similarity_pre_tok2(ele1, ele2)?;
53 if score > max_score {
54 max_score = score;
55 }
56 }
57 score_sum += max_score;
58
59 if self.lower_bound > 0.0 {
61 let rest_max = (bag1.len() - 1 - idx) as f64; if (score_sum + rest_max / bag1.len() as f64) < self.lower_bound {
63 return Ok(0.0);
64 }
65 }
66 }
67
68 let sim = score_sum / bag1.len() as f64;
69 if self.lower_bound > 0.0 && sim < self.lower_bound {
70 return Ok(0.0);
71 }
72 return Ok(sim);
73 }
74
75 pub fn symmetric_similarity(
76 &self,
77 bag1: &Vec<Vec<char>>,
78 bag2: &Vec<Vec<char>>,
79 ) -> Result<f64, StrSimError> {
80 let sim1 = self.similarity(bag1, bag2)?;
81 if self.lower_bound > 0.0 && sim1 == 0.0 {
82 return Ok(0.0);
83 }
84 let sim2 = self.similarity(bag2, bag1)?;
85 if self.lower_bound > 0.0 && sim2 == 0.0 {
86 return Ok(0.0);
87 }
88
89 return Ok((sim1 + sim2) / 2.0);
90 }
91}
92
93impl<S: StrSim<Vec<char>>> StrSim<Vec<Vec<char>>> for MongeElkan<S> {
94 fn similarity_pre_tok2(
95 &self,
96 bag1: &Vec<Vec<char>>,
97 bag2: &Vec<Vec<char>>,
98 ) -> Result<f64, StrSimError> {
99 self.similarity(bag1, bag2)
100 }
101}
102
103impl<S: StrSim<Vec<char>>> StrSim<Vec<Vec<char>>> for SymmetricMongeElkan<S> {
104 fn similarity_pre_tok2(
105 &self,
106 bag1: &Vec<Vec<char>>,
107 bag2: &Vec<Vec<char>>,
108 ) -> Result<f64, StrSimError> {
109 self.0.symmetric_similarity(bag1, bag2)
110 }
111}
112
113impl<S: StrSim<Vec<char>> + ExpectTokenizerType> ExpectTokenizerType for MongeElkan<S> {
114 fn get_expected_tokenizer_type(&self) -> TokenizerType {
115 TokenizerType::Seq(Box::new(Some(self.strsim.get_expected_tokenizer_type())))
116 }
117}
118
119impl<S: StrSim<Vec<char>> + ExpectTokenizerType> ExpectTokenizerType for SymmetricMongeElkan<S> {
120 fn get_expected_tokenizer_type(&self) -> TokenizerType {
121 TokenizerType::Seq(Box::new(Some(self.0.strsim.get_expected_tokenizer_type())))
122 }
123}