yass/
monge_elkan.rs

1use crate::error::StrSimError;
2
3use super::{ExpectTokenizerType, JaroWinkler, StrSim, TokenizerType};
4use derive_more::Display;
5
6#[derive(Display)]
7#[display(fmt = "MongeElkan")]
8pub struct MongeElkan<S: StrSim<Vec<char>>> {
9    pub strsim: S,
10    // This is for early exit. If the similarity is not possible to satisfy this value,
11    // the function returns immediately with the return value 0.0. Defaults to None.
12    pub lower_bound: f64,
13}
14#[derive(Display)]
15#[display(fmt = "SymmetricMongeElkan")]
16pub struct SymmetricMongeElkan<S: StrSim<Vec<char>>>(pub MongeElkan<S>);
17
18impl MongeElkan<JaroWinkler> {
19    pub fn default() -> Self {
20        MongeElkan {
21            strsim: JaroWinkler::default(),
22            lower_bound: 0.0,
23        }
24    }
25}
26
27impl<S: StrSim<Vec<char>>> MongeElkan<S> {
28    pub fn new(strsim: S, lower_bound: Option<f64>) -> Self {
29        MongeElkan {
30            strsim,
31            lower_bound: lower_bound.unwrap_or(0.0),
32        }
33    }
34
35    pub fn similarity(
36        &self,
37        bag1: &Vec<Vec<char>>,
38        bag2: &Vec<Vec<char>>,
39    ) -> Result<f64, StrSimError> {
40        if bag1.len() == 0 || bag2.len() == 0 {
41            if bag1.len() == 0 && bag2.len() == 0 {
42                return Ok(1.0);
43            } else {
44                return Ok(0.0);
45            }
46        }
47
48        let mut score_sum = 0.0;
49        for (idx, ele1) in bag1.iter().enumerate() {
50            let mut max_score = self.strsim.similarity_pre_tok2(ele1, &bag2[0])?;
51            for ele2 in &bag2[1..] {
52                let score = self.strsim.similarity_pre_tok2(ele1, ele2)?;
53                if score > max_score {
54                    max_score = score;
55                }
56            }
57            score_sum += max_score;
58
59            // if it satisfies early exit condition
60            if self.lower_bound > 0.0 {
61                let rest_max = (bag1.len() - 1 - idx) as f64; // assume the rest scores are all 1
62                if (score_sum + rest_max / bag1.len() as f64) < self.lower_bound {
63                    return Ok(0.0);
64                }
65            }
66        }
67
68        let sim = score_sum / bag1.len() as f64;
69        if self.lower_bound > 0.0 && sim < self.lower_bound {
70            return Ok(0.0);
71        }
72        return Ok(sim);
73    }
74
75    pub fn symmetric_similarity(
76        &self,
77        bag1: &Vec<Vec<char>>,
78        bag2: &Vec<Vec<char>>,
79    ) -> Result<f64, StrSimError> {
80        let sim1 = self.similarity(bag1, bag2)?;
81        if self.lower_bound > 0.0 && sim1 == 0.0 {
82            return Ok(0.0);
83        }
84        let sim2 = self.similarity(bag2, bag1)?;
85        if self.lower_bound > 0.0 && sim2 == 0.0 {
86            return Ok(0.0);
87        }
88
89        return Ok((sim1 + sim2) / 2.0);
90    }
91}
92
93impl<S: StrSim<Vec<char>>> StrSim<Vec<Vec<char>>> for MongeElkan<S> {
94    fn similarity_pre_tok2(
95        &self,
96        bag1: &Vec<Vec<char>>,
97        bag2: &Vec<Vec<char>>,
98    ) -> Result<f64, StrSimError> {
99        self.similarity(bag1, bag2)
100    }
101}
102
103impl<S: StrSim<Vec<char>>> StrSim<Vec<Vec<char>>> for SymmetricMongeElkan<S> {
104    fn similarity_pre_tok2(
105        &self,
106        bag1: &Vec<Vec<char>>,
107        bag2: &Vec<Vec<char>>,
108    ) -> Result<f64, StrSimError> {
109        self.0.symmetric_similarity(bag1, bag2)
110    }
111}
112
113impl<S: StrSim<Vec<char>> + ExpectTokenizerType> ExpectTokenizerType for MongeElkan<S> {
114    fn get_expected_tokenizer_type(&self) -> TokenizerType {
115        TokenizerType::Seq(Box::new(Some(self.strsim.get_expected_tokenizer_type())))
116    }
117}
118
119impl<S: StrSim<Vec<char>> + ExpectTokenizerType> ExpectTokenizerType for SymmetricMongeElkan<S> {
120    fn get_expected_tokenizer_type(&self) -> TokenizerType {
121        TokenizerType::Seq(Box::new(Some(self.0.strsim.get_expected_tokenizer_type())))
122    }
123}