yass/
jaro_winkler.rs

1use crate::error::StrSimError;
2
3use super::{ExpectTokenizerType, Jaro, StrSim, TokenizerType};
4
5use anyhow::Result;
6use derive_more::Display;
7
8#[derive(Display)]
9#[display(fmt = "JaroWinkler")]
10pub struct JaroWinkler {
11    // Boost threshold, prefix bonus is only added when compared strings have a Jaro Distance above it. Defaults to 0.7.
12    pub threshold: f64,
13    // Scaling factor for how much the score is adjusted upwards for having common prefixes. Defaults to 0.1.
14    pub scaling_factor: f64,
15    pub prefix_len: usize,
16}
17
18impl JaroWinkler {
19    pub fn default() -> Self {
20        JaroWinkler {
21            threshold: 0.7,
22            scaling_factor: 0.1,
23            prefix_len: 4,
24        }
25    }
26
27    pub fn new(
28        threshold: Option<f64>,
29        scaling_factor: Option<f64>,
30        prefix_len: Option<usize>,
31    ) -> Self {
32        JaroWinkler {
33            threshold: threshold.unwrap_or(0.7),
34            scaling_factor: scaling_factor.unwrap_or(0.1),
35            prefix_len: prefix_len.unwrap_or(4),
36        }
37    }
38
39    fn similarity(&self, s1: &[char], s2: &[char]) -> f64 {
40        let mut jw_score = Jaro::similarity(s1, s2);
41        if jw_score > self.threshold {
42            // common prefix len
43            let mut common_prefix_len = 0;
44
45            let max_common_prefix_len = s1.len().min(s2.len()).min(self.prefix_len);
46            while common_prefix_len < max_common_prefix_len
47                && s1[common_prefix_len] == s2[common_prefix_len]
48            {
49                common_prefix_len += 1;
50            }
51            if common_prefix_len != 0 {
52                jw_score += self.scaling_factor * (common_prefix_len as f64) * (1.0 - jw_score);
53            }
54        }
55
56        jw_score
57    }
58}
59
60impl StrSim<Vec<char>> for JaroWinkler {
61    fn similarity_pre_tok2(
62        &self,
63        tokenized_key: &Vec<char>,
64        tokenized_query: &Vec<char>,
65    ) -> Result<f64, StrSimError> {
66        Ok(self.similarity(tokenized_key, tokenized_query))
67    }
68}
69
70impl ExpectTokenizerType for JaroWinkler {
71    fn get_expected_tokenizer_type(&self) -> TokenizerType {
72        TokenizerType::Seq(Box::new(None))
73    }
74}