use crate::error::StrSimError;
use super::{ExpectTokenizerType, Jaro, StrSim, TokenizerType};
use anyhow::Result;
use derive_more::Display;
#[derive(Display)]
#[display(fmt = "JaroWinkler")]
pub struct JaroWinkler {
pub threshold: f64,
pub scaling_factor: f64,
pub prefix_len: usize,
}
impl JaroWinkler {
pub fn default() -> Self {
JaroWinkler {
threshold: 0.7,
scaling_factor: 0.1,
prefix_len: 4,
}
}
pub fn new(
threshold: Option<f64>,
scaling_factor: Option<f64>,
prefix_len: Option<usize>,
) -> Self {
JaroWinkler {
threshold: threshold.unwrap_or(0.7),
scaling_factor: scaling_factor.unwrap_or(0.1),
prefix_len: prefix_len.unwrap_or(4),
}
}
fn similarity(&self, s1: &[char], s2: &[char]) -> f64 {
let mut jw_score = Jaro::similarity(s1, s2);
if jw_score > self.threshold {
let mut common_prefix_len = 0;
let max_common_prefix_len = s1.len().min(s2.len()).min(self.prefix_len);
while common_prefix_len < max_common_prefix_len
&& s1[common_prefix_len] == s2[common_prefix_len]
{
common_prefix_len += 1;
}
if common_prefix_len != 0 {
jw_score += self.scaling_factor * (common_prefix_len as f64) * (1.0 - jw_score);
}
}
jw_score
}
}
impl StrSim<Vec<char>> for JaroWinkler {
fn similarity_pre_tok2(
&self,
tokenized_key: &Vec<char>,
tokenized_query: &Vec<char>,
) -> Result<f64, StrSimError> {
Ok(self.similarity(tokenized_key, tokenized_query))
}
}
impl ExpectTokenizerType for JaroWinkler {
fn get_expected_tokenizer_type(&self) -> TokenizerType {
TokenizerType::Seq(Box::new(None))
}
}