1use crate::error::StrSimError;
2
3use super::{ExpectTokenizerType, Jaro, StrSim, TokenizerType};
4
5use anyhow::Result;
6use derive_more::Display;
7
8#[derive(Display)]
9#[display(fmt = "JaroWinkler")]
10pub struct JaroWinkler {
11 pub threshold: f64,
13 pub scaling_factor: f64,
15 pub prefix_len: usize,
16}
17
18impl JaroWinkler {
19 pub fn default() -> Self {
20 JaroWinkler {
21 threshold: 0.7,
22 scaling_factor: 0.1,
23 prefix_len: 4,
24 }
25 }
26
27 pub fn new(
28 threshold: Option<f64>,
29 scaling_factor: Option<f64>,
30 prefix_len: Option<usize>,
31 ) -> Self {
32 JaroWinkler {
33 threshold: threshold.unwrap_or(0.7),
34 scaling_factor: scaling_factor.unwrap_or(0.1),
35 prefix_len: prefix_len.unwrap_or(4),
36 }
37 }
38
39 fn similarity(&self, s1: &[char], s2: &[char]) -> f64 {
40 let mut jw_score = Jaro::similarity(s1, s2);
41 if jw_score > self.threshold {
42 let mut common_prefix_len = 0;
44
45 let max_common_prefix_len = s1.len().min(s2.len()).min(self.prefix_len);
46 while common_prefix_len < max_common_prefix_len
47 && s1[common_prefix_len] == s2[common_prefix_len]
48 {
49 common_prefix_len += 1;
50 }
51 if common_prefix_len != 0 {
52 jw_score += self.scaling_factor * (common_prefix_len as f64) * (1.0 - jw_score);
53 }
54 }
55
56 jw_score
57 }
58}
59
60impl StrSim<Vec<char>> for JaroWinkler {
61 fn similarity_pre_tok2(
62 &self,
63 tokenized_key: &Vec<char>,
64 tokenized_query: &Vec<char>,
65 ) -> Result<f64, StrSimError> {
66 Ok(self.similarity(tokenized_key, tokenized_query))
67 }
68}
69
70impl ExpectTokenizerType for JaroWinkler {
71 fn get_expected_tokenizer_type(&self) -> TokenizerType {
72 TokenizerType::Seq(Box::new(None))
73 }
74}