yass/
jaro.rs

1use crate::error::StrSimError;
2
3use super::{ExpectTokenizerType, StrSim, TokenizerType};
4
5use anyhow::Result;
6use derive_more::Display;
7
8#[derive(Display)]
9#[display(fmt = "Jaro")]
10pub struct Jaro;
11
12impl Jaro {
13    pub fn similarity(s1: &[char], s2: &[char]) -> f64 {
14        let max_len = s1.len().max(s2.len());
15
16        if max_len == 0 {
17            return 1.0;
18        } else if s1.len() == 0 || s2.len() == 0 {
19            return 0.0;
20        }
21
22        let search_range = ((max_len / 2) - 1).max(0); // equal floor(max_len as f64 / 2) - 1;
23
24        let mut flags_s1 = vec![false; s1.len()];
25        let mut flags_s2 = vec![false; s2.len()];
26
27        // find number of matching characters (common_chars)
28        let mut common_chars = 0;
29
30        for i in 0..s1.len() {
31            let low = if i > search_range {
32                i - search_range
33            } else {
34                0
35            };
36            let high = (i + search_range).min(s2.len() - 1);
37            for j in low..=high {
38                if flags_s2[j] == false && s2[j] == s1[i] {
39                    flags_s1[i] = true;
40                    flags_s2[j] = true;
41                    common_chars += 1;
42                    break;
43                }
44            }
45        }
46
47        if common_chars == 0 {
48            return 0.0;
49        }
50
51        // find the number of transpositions and jaro distance
52        let mut trans_count = 0;
53        let mut k = 0;
54
55        for i in 0..s1.len() {
56            if flags_s1[i] == true {
57                let mut pivot = k;
58                for j in k..s2.len() {
59                    if flags_s2[j] == true {
60                        k = j + 1;
61                        pivot = j;
62                        break;
63                    }
64                }
65                if s1[i] != s2[pivot] {
66                    trans_count += 1;
67                }
68            }
69        }
70
71        trans_count /= 2;
72        return ((common_chars as f64) / (s1.len() as f64)
73            + (common_chars as f64) / (s2.len() as f64)
74            + ((common_chars - trans_count) as f64) / (common_chars as f64))
75            / 3.0;
76    }
77}
78
79impl StrSim<Vec<char>> for Jaro {
80    fn similarity_pre_tok2(
81        &self,
82        tokenized_key: &Vec<char>,
83        tokenized_query: &Vec<char>,
84    ) -> Result<f64, StrSimError> {
85        Ok(Jaro::similarity(tokenized_key, tokenized_query))
86    }
87}
88
89impl ExpectTokenizerType for Jaro {
90    fn get_expected_tokenizer_type(&self) -> TokenizerType {
91        TokenizerType::Seq(Box::new(None))
92    }
93}